├── .flake8 ├── .github └── workflows │ ├── adorable_unicorns_deploy.yaml │ └── ghcr_flytelab.yml ├── .gitignore ├── LICENSE ├── README.md ├── projects ├── bravemusic │ ├── .dockerignore │ ├── .flyte │ │ ├── remote-config.yaml │ │ ├── remote.config │ │ ├── sandbox-config.yaml │ │ └── sandbox.config │ ├── .gitpod.yml │ ├── Dockerfile │ ├── README.md │ ├── bravemusic │ │ ├── __init__.py │ │ ├── datasource.py │ │ ├── preprocess.py │ │ ├── train.py │ │ └── workflows.py │ ├── dashboard │ │ ├── Option_B │ │ │ ├── app.py │ │ │ └── serve.py │ │ ├── app.py │ │ ├── remote.config │ │ └── sandbox.config │ ├── deploy.py │ ├── procfile │ ├── requirements-dev.txt │ ├── requirements.txt │ └── setup.sh ├── destinations_similarity │ ├── .dockerignore │ ├── .flyte │ │ ├── remote-config.yaml │ │ ├── remote.config │ │ ├── sandbox-config.yaml │ │ └── sandbox.config │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── conf.py │ ├── dashboard │ │ ├── app.py │ │ ├── beach_kinder.jpeg │ │ ├── kinder.jpeg │ │ └── requirements.txt │ ├── deploy.py │ ├── destinations_similarity │ │ ├── __init__.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── feature_engineering.py │ │ │ └── text_preprocessing.py │ │ ├── scraper │ │ │ ├── __init__.py │ │ │ ├── brazilian_cities.py │ │ │ └── extractor.py │ │ ├── tasks.py │ │ └── workflows.py │ ├── docs │ │ ├── Dockerfile │ │ ├── dashboard │ │ │ ├── dashboard.rst │ │ │ └── streamlit.rst │ │ ├── guides │ │ │ ├── deploy.rst │ │ │ ├── deploy_code.rst │ │ │ ├── docs.rst │ │ │ └── guide.rst │ │ ├── images │ │ │ ├── SolutionDiagram.png │ │ │ ├── kinzinhoApresentando.jpg │ │ │ ├── kinzinhoBagunceiro.jpg │ │ │ ├── kinzinhoBigDog.png │ │ │ ├── kinzinhoCachu.jpg │ │ │ ├── kinzinhoGalante.jpg │ │ │ ├── kinzinhoPensativo.jpg │ │ │ ├── sphinx_server.png │ │ │ └── vamoDalheLogo.jpeg │ │ ├── model │ │ │ ├── feature_engineering.rst │ │ │ ├── model.rst │ │ │ └── text_preprocessing.rst │ │ └── scraper │ │ │ ├── extractor.rst │ │ │ ├── scraper.rst │ │ │ └── source.rst │ ├── index.rst │ ├── requirements-dev.txt │ ├── requirements.txt │ └── scripts │ │ ├── open_docs.sh │ │ └── rebuild_docs.sh ├── weather_forecasting │ ├── .flyte │ │ ├── remote-config.yaml │ │ ├── remote.config │ │ ├── sandbox-config.yaml │ │ └── sandbox.config │ ├── .gitignore │ ├── DEPLOYMENT.md │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── app │ │ ├── __init__.py │ │ └── workflow.py │ ├── dashboard │ │ ├── flyte.config │ │ ├── requirements.txt │ │ └── weather_forecasting.py │ ├── in_container.mk │ ├── requirements.txt │ └── scripts │ │ ├── activate-launch-plans.sh │ │ ├── archive-launch-plans.sh │ │ └── launch-plan-status.sh └── whats_cooking_good_looking │ ├── .flyte │ ├── remote-config.yaml │ ├── remote.config │ ├── sandbox-config.yaml │ └── sandbox.config │ ├── Dockerfile │ ├── README.md │ ├── dashboard │ ├── app.py │ ├── remote.config │ └── sandbox.config │ ├── deploy.py │ ├── docs │ ├── actual_pipeline.png │ ├── apply_pipeline.png │ ├── label_studio.png │ ├── target_pipeline.png │ └── train_pipeline.png │ ├── requirements-dev.txt │ ├── requirements.txt │ └── whats_cooking_good_looking │ ├── __init__.py │ ├── apply_ner_workflow.py │ ├── config.json │ ├── keywords.txt │ ├── train_ner_workflow.py │ └── utils.py └── templates ├── _common └── deploy.py ├── basic ├── README.md ├── cookiecutter.json ├── hooks │ └── pre_gen_project.py └── {{cookiecutter.project_name}} │ ├── .dockerignore │ ├── .flyte │ ├── remote-config.yaml │ ├── remote.config │ ├── sandbox-config.yaml │ └── sandbox.config │ ├── Dockerfile │ ├── README.md │ ├── 
dashboard │ ├── app.py │ ├── remote.config │ └── sandbox.config │ ├── deploy.py │ ├── requirements-dev.txt │ ├── requirements.txt │ └── {{cookiecutter.project_name}} │ ├── __init__.py │ └── workflows.py └── pytorch-gpu ├── README.md ├── cookiecutter.json ├── hooks └── pre_gen_project.py └── {{cookiecutter.project_name}} ├── .dockerignore ├── .flyte ├── remote-config.yaml ├── remote.config ├── sandbox-config.yaml └── sandbox.config ├── Dockerfile ├── README.md ├── dashboard ├── app.py ├── remote.config └── sandbox.config ├── deploy.py ├── requirements-dev.txt ├── requirements.txt └── {{cookiecutter.project_name}} ├── __init__.py └── workflows.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = 4 | projects/whats_cooking_good_looking/dashboard/ 5 | ignore = E265, E999 6 | -------------------------------------------------------------------------------- /.github/workflows/adorable_unicorns_deploy.yaml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | push: 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | env: 13 | REGISTERY: eu.gcr.io/flyte-sandbox-342013/flytelab 14 | PROJECT_NAME: adorable-unicorns-23 15 | PROJECT_PATH : projects/whats_cooking_good_looking 16 | 17 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 18 | jobs: 19 | # This workflow contains a single job called "build" 20 | build-and-deploy: 21 | # The type of runner that the job will run on 22 | runs-on: ubuntu-latest 23 | 24 | # Steps represent a sequence of tasks that will be executed as part of the job 25 | steps: 26 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 27 | - name: Get tags 28 | id: tags 29 | run: | 30 | echo "::set-output name=date::$(date +'%Y-%m-%d-%H-%M-%S')" 31 | foo=${{ github.ref }} 32 | branch_name=${foo#refs/heads/} 33 | branch_slug=${branch_name/\//-} 34 | echo "::set-output name=tag::$branch_slug" 35 | 36 | - name: is preprod 37 | id: is-preprod 38 | env: 39 | main: main 40 | run: | 41 | if [ "${{ steps.tags.outputs.tag }}" = "main" ];then 42 | echo "::set-output name=domain::staging" 43 | echo "::set-output name=tag::${{ steps.tags.outputs.date }}" 44 | else 45 | echo "::set-output name=domain::development" 46 | echo "::set-output name=tag::${{steps.tags.outputs.tag}}-${{steps.tags.outputs.date}}" 47 | fi 48 | 49 | - uses: actions/checkout@v2 50 | - name: Build the Docker image 51 | run: docker build ${{ env.PROJECT_PATH }}/. 
--file ${{ env.PROJECT_PATH }}/Dockerfile --build-arg config=.flyte/remote-config.yaml --build-arg image=${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}} --build-arg creds=${{secrets.RUNNER_KEY}} --tag ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}} 52 | 53 | - uses: google-github-actions/setup-gcloud@v0 54 | with: 55 | service_account_key: ${{ secrets.SERVICE_ACCOUNT_KEY }} 56 | project_id: flyte-sandbox-342013 57 | export_default_credentials: true 58 | 59 | - run: gcloud auth configure-docker -q 60 | 61 | - name: Push the Docker image 62 | run: docker push ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}} 63 | 64 | - name: serialize 65 | uses: louisRDSC/FlyteSerializeAction@v1.2 66 | with: 67 | config: ${{ env.PROJECT_PATH }}/.flyte/remote.config 68 | tag: ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}} 69 | requirements: ${{ env.PROJECT_PATH }}/requirements.txt 70 | pkgs : whats_cooking_good_looking 71 | source: ${{ env.PROJECT_PATH }}/ 72 | 73 | 74 | - name: Register 75 | uses: louisRDSC/FlyteRegisterAction@v1.3 76 | with: 77 | project: ${{ env.PROJECT_NAME }} 78 | config: ${{ env.PROJECT_PATH }}/.flyte/remote-config.yaml 79 | domain: ${{ steps.is-preprod.outputs.domain }} 80 | package: ${{ env.PROJECT_PATH }}/flyte-package.tgz 81 | version: ${{ steps.is-preprod.outputs.tag }} 82 | clientId: ${{ secrets.CLIENT_ID }} 83 | clientSecret: ${{ secrets.CLIENT_SECRET }} 84 | 85 | -------------------------------------------------------------------------------- /.github/workflows/ghcr_flytelab.yml: -------------------------------------------------------------------------------- 1 | name: Build & Push Flytelab Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | create: 9 | tags: 10 | - v* 11 | 12 | jobs: 13 | push-github: 14 | if: ${{ github.repository_owner }} == 'flyteorg' 15 | name: Push to GHCR 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | working-directory: projects/weather_forecasting 20 | steps: 21 | - uses: actions/checkout@v2 22 | with: 23 | fetch-depth: "0" 24 | - name: Push Flytelab Weather Forecasting Docker Image to Github Registry 25 | uses: whoan/docker-build-with-cache-action@v5 26 | with: 27 | # https://docs.github.com/en/packages/learn-github-packages/publishing-a-package 28 | username: "${{ secrets.FLYTE_BOT_USERNAME }}" 29 | password: "${{ secrets.FLYTE_BOT_PAT }}" 30 | image_name: ${{ github.repository_owner }}/flytelab 31 | image_tag: weather-forecasting-latest,weather-forecasting-${{ github.sha }} 32 | push_git_tag: ${{ github.event_name != 'pull_request' }} 33 | push_image_and_stages: ${{ github.event_name != 'pull_request' }} 34 | registry: ghcr.io 35 | build_extra_args: "--compress=true --build-arg=tag=ghcr.io/${{ github.repository_owner }}/flytelab:weather-forecasting-${{ github.sha }}" 36 | context: ./projects/weather_forecasting 37 | dockerfile: Dockerfile 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | secrets/ 2 | credentials/ 3 | local_explo/ 4 | 5 | # Repo-specific 6 | projects/**/.config 7 | projects/**/.kube 8 | .env/**/* 9 | env.txt 10 | .cache/* 11 | .vscode 12 | **/_pb_output/* 13 | bin 14 | flyte-package.tgz 15 | .DS_Store 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | 
eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | */gtzan/* 148 | projects/bravemusic/bravemusic/gtzan/ -------------------------------------------------------------------------------- /projects/bravemusic/.dockerignore: -------------------------------------------------------------------------------- 1 | !.flyte -------------------------------------------------------------------------------- /projects/bravemusic/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: Pkce 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | insecure: false 7 | storage: 8 | type: stow 9 | stow: 10 | kind: s3 11 | config: 12 | auth_type: iam 13 | region: us-east-2 14 | logger: 15 | # Logger settings to control logger output. Useful to debug logger: 16 | show-source: true 17 | level: 1 18 | -------------------------------------------------------------------------------- /projects/bravemusic/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. 
These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=bravemusic 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://open-compute-playground 7 | -------------------------------------------------------------------------------- /projects/bravemusic/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true 20 | -------------------------------------------------------------------------------- /projects/bravemusic/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=bravemusic 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 7 | -------------------------------------------------------------------------------- /projects/bravemusic/.gitpod.yml: -------------------------------------------------------------------------------- 1 | tasks: 2 | - init: | 3 | python -m venv ~/venvs/brave 4 | source ~/venvs/brave/bin/activate 5 | pip install -r requirements.txt -r requirements-dev.txt 6 | command: python3 projects/bravemusic/bravemusic/workflows.py 7 | 8 | -------------------------------------------------------------------------------- /projects/bravemusic/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | WORKDIR /root 4 | ENV VENV /opt/venv 5 | ENV LANG C.UTF-8 6 | ENV LC_ALL C.UTF-8 7 | ENV PYTHONPATH /root 8 | 9 | # e.g. flyte.config or sandbox.config 10 | ARG config 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | libsm6 \ 15 | libxext6 \ 16 | libxrender-dev \ 17 | ffmpeg \ 18 | build-essential 19 | 20 | # Install the AWS cli separately to prevent issues with boto being written over 21 | RUN pip3 install awscli 22 | 23 | ENV VENV /opt/venv 24 | 25 | # Virtual environment 26 | RUN python3 -m venv ${VENV} 27 | ENV PATH="${VENV}/bin:$PATH" 28 | 29 | # Install Python dependencies 30 | COPY requirements.txt /root 31 | RUN pip install -r /root/requirements.txt 32 | 33 | COPY bravemusic /root/bravemusic 34 | COPY $config /root/flyte.config 35 | 36 | # This image is supplied by the build script and will be used to determine the version 37 | # when registering tasks, workflows, and launch plans 38 | ARG image 39 | ENV FLYTE_INTERNAL_IMAGE $image 40 | -------------------------------------------------------------------------------- /projects/bravemusic/README.md: -------------------------------------------------------------------------------- 1 | # Design Doc: Brave-Hyenas-2 2 | ## MLOps Community: Engineering labs 3 | 4 | | Team name | brave-hyenas-2 | 5 | |---------------------|:----------------------------------------:| 6 | |Project name | brave-hyenas-2 | 7 | | Project description | Hackathon - brave-hyenas-2 team | 8 | |Using GPUs? 
(Yes/No) | No | 9 | 10 | 11 | 12 | ### Problem Statement 13 | What problem are you solving? 14 | It’s usually hard to identify correctly what kind of music genre is playing thus our team embraced in tackling to classify music genre using deep learning. 15 | 16 | 17 | ### ...... 18 | 19 | 20 | 21 | 22 | 23 | ![new](https://user-images.githubusercontent.com/85021780/161294904-a4158856-0558-424f-9f07-85aef8f4b423.jpg) 24 | 25 | 26 | 27 | 28 | ### Solution (working progress) .... 29 | -------------------------------------------------------------------------------- /projects/bravemusic/bravemusic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/bravemusic/bravemusic/__init__.py -------------------------------------------------------------------------------- /projects/bravemusic/bravemusic/datasource.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | import git 4 | 5 | GIT_URL = "https://huggingface.co/datasets/marsyas/gtzan" 6 | GTZAN_PATH = "./gtzan" 7 | GTZAN_ZIP_FILE_PATH = "./gtzan/data" 8 | GTZAN_ZIP_FILE_NAME = "genres.tar.gz" 9 | 10 | 11 | class Progress(git.remote.RemoteProgress): 12 | def update(self, op_code, cur_count, max_count=None, message=""): 13 | print(self._cur_line) 14 | 15 | 16 | def download_gtzan_repo(): 17 | if not os.path.isdir(GTZAN_PATH) or not any(os.scandir(GTZAN_PATH)): 18 | git.Repo.clone_from(url=GIT_URL, to_path=GTZAN_PATH, progress=Progress()) 19 | extract_gtzan_repo_tarball() 20 | else: 21 | print("dataset already exists") 22 | 23 | 24 | def extract_gtzan_repo_tarball(): 25 | # open file 26 | file = tarfile.open(f"{GTZAN_ZIP_FILE_PATH}/{GTZAN_ZIP_FILE_NAME}") 27 | # extracting file 28 | file.extractall(GTZAN_ZIP_FILE_PATH) 29 | file.close() 30 | 31 | 32 | if __name__ == "__main__": 33 | download_gtzan_repo() 34 | -------------------------------------------------------------------------------- /projects/bravemusic/bravemusic/preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import librosa 4 | from datasource import download_gtzan_repo, GTZAN_ZIP_FILE_PATH 5 | 6 | SAMPLE_RATE = 22050 7 | TRACK_DURATION = 30 # measured in seconds 8 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION 9 | BAD_FORMATS = ["jazz.00054.wav"] 10 | 11 | 12 | def clean_dataset(): 13 | for (dir_path, dir_names, filenames) in os.walk(f"{GTZAN_ZIP_FILE_PATH}/genres/"): 14 | print(dir_path) 15 | [ 16 | os.remove(f"{dir_path}{filename}") 17 | for filename in filenames 18 | if not filename.endswith(".wav") 19 | ] 20 | [ 21 | os.renames( 22 | old=f"{dir_path}/{filename}", 23 | new=f"{dir_path}/{filename}".replace("._", ""), 24 | ) 25 | for filename in filenames 26 | if f"{dir_path}/{filename}".startswith("._") 27 | ] 28 | [ 29 | os.remove(f"{dir_path}/{filename}") 30 | for filename in filenames 31 | if filename.startswith("._") 32 | ] 33 | 34 | 35 | def preprocess( 36 | dataset_path: str, 37 | num_mfcc: int = 13, 38 | n_fft: int = 2048, 39 | hop_length: int = 512, 40 | num_segments: int = 10, 41 | ) -> dict: 42 | data = {"mapping": [], "labels": [], "mfcc": []} 43 | 44 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments) 45 | num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length) 46 | 47 | # loop through all genre sub-folder 48 | for i, (dir_path, dir_names, filenames) in 
enumerate( 49 | os.walk(f"{GTZAN_ZIP_FILE_PATH}/genres/") 50 | ): 51 | # ensure we're processing a genre sub-folder level 52 | if dir_path is not dataset_path: 53 | # save genre label (i.e., sub-folder name) in the mapping 54 | semantic_label = dir_path.split("/")[-1] 55 | print(semantic_label) 56 | data["mapping"].append(semantic_label) 57 | print("Processing: {}".format(semantic_label)) 58 | 59 | # process all audio files in genre sub-dir 60 | for f in filenames: 61 | if f not in BAD_FORMATS: 62 | # load audio file 63 | file_path = os.path.join(dir_path, f) 64 | signal, sample_rate = librosa.load(path=file_path, sr=SAMPLE_RATE) 65 | 66 | # process all segments of audio file 67 | for d in range(num_segments): 68 | 69 | # calculate start and finish sample for current segment 70 | start = samples_per_segment * d 71 | finish = start + samples_per_segment 72 | 73 | # extract mfcc 74 | mfcc = librosa.feature.mfcc( 75 | y=signal[start:finish], 76 | sr=sample_rate, 77 | n_mfcc=num_mfcc, 78 | n_fft=n_fft, 79 | hop_length=hop_length, 80 | ) 81 | mfcc = mfcc.T 82 | 83 | # store only mfcc feature with expected number of vectors 84 | if len(mfcc) == num_mfcc_vectors_per_segment: 85 | data["mfcc"].append(mfcc.tolist()) 86 | data["labels"].append(i - 1) 87 | # print("{}, segment:{}".format(file_path, d + 1)) 88 | return data 89 | 90 | 91 | if __name__ == "__main__": 92 | download_gtzan_repo() 93 | clean_dataset() 94 | data = preprocess(dataset_path=GTZAN_ZIP_FILE_PATH) 95 | print(data) 96 | -------------------------------------------------------------------------------- /projects/bravemusic/bravemusic/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import typing 3 | import warnings 4 | import numpy as np 5 | from tensorflow import keras 6 | from dataclasses import dataclass 7 | from preprocess import preprocess 8 | from datasource import GTZAN_ZIP_FILE_PATH 9 | from dataclasses_json import dataclass_json 10 | from flytekit.types.directory import FlyteDirectory 11 | from sklearn.model_selection import train_test_split 12 | 13 | 14 | warnings.filterwarnings("ignore") 15 | MODELSAVE = [typing.TypeVar("str")] 16 | model_file = typing.NamedTuple("Model", model=FlyteDirectory[MODELSAVE]) 17 | 18 | 19 | @dataclass_json 20 | @dataclass 21 | class Hyperparameters(object): 22 | batch_size: int = 32 23 | metrics: str = "accuracy" 24 | loss = ("sparse_categorical_crossentropy",) 25 | epochs: int = 30 26 | learning_rate: float = 0.0001 27 | 28 | 29 | def train( 30 | data: dict, 31 | hp: Hyperparameters 32 | ) -> model_file: 33 | # with open("data.json", "r") as fp: 34 | # data = json.load(fp) 35 | 36 | # convert lists to numpy arrays 37 | X = np.array(data["mfcc"]) 38 | y = np.array(data["labels"]) 39 | 40 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 41 | 42 | model = keras.Sequential( 43 | [ 44 | keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])), 45 | keras.layers.Dense(512, activation="relu"), 46 | keras.layers.Dense(256, activation="relu"), 47 | keras.layers.Dense(64, activation="relu"), 48 | keras.layers.Dense(10, activation="softmax"), 49 | ] 50 | ) 51 | optimiser = keras.optimizers.Adam(learning_rate=hp.learning_rate) 52 | model.compile( 53 | optimizer=optimiser, 54 | loss=hp.loss, 55 | metrics=[hp.metrics], 56 | ) 57 | # train model 58 | model.fit( 59 | X_train, 60 | y_train, 61 | validation_data=(X_test, y_test), 62 | batch_size=hp.batch_size, 63 | epochs=hp.epochs, 64 | ) 65 | 66 | Dir = "model" 67 | 
model.save(Dir) 68 | return model 69 | 70 | 71 | if __name__ == '__main__': 72 | data = preprocess(dataset_path=GTZAN_ZIP_FILE_PATH) 73 | model = train( 74 | data=data, 75 | hp=Hyperparameters(epochs=1) 76 | ) 77 | -------------------------------------------------------------------------------- /projects/bravemusic/bravemusic/workflows.py: -------------------------------------------------------------------------------- 1 | import json 2 | import typing 3 | import warnings 4 | from train import Hyperparameters, train 5 | from flytekit import Resources, task, workflow 6 | from preprocess import clean_dataset, preprocess 7 | from flytekit.types.directory import FlyteDirectory 8 | from datasource import download_gtzan_repo, GTZAN_ZIP_FILE_PATH 9 | 10 | 11 | SAMPLE_RATE = 22050 12 | TRACK_DURATION = 30 # measured in seconds 13 | warnings.filterwarnings("ignore") 14 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION 15 | 16 | MODELSAVE = [typing.TypeVar("str")] 17 | model_file = typing.NamedTuple("Model", model=FlyteDirectory[MODELSAVE]) 18 | workflow_outputs = typing.NamedTuple("WorkflowOutputs", model=FlyteDirectory[MODELSAVE]) 19 | 20 | 21 | @task 22 | def download_gtzan_dataset(): 23 | download_gtzan_repo() 24 | 25 | 26 | @task 27 | def clean_gtzan_dataset(): 28 | clean_dataset() 29 | 30 | 31 | @task(cache_version="1.0", cache=True, limits=Resources(mem="2000Mi")) 32 | def preprocess_gtzan_dataset( 33 | dataset_path: str 34 | ) -> dict: 35 | processed_data = preprocess(dataset_path=dataset_path) 36 | return processed_data 37 | 38 | 39 | @task(cache_version="1.0", cache=True, limits=Resources(mem="2000Mi")) 40 | def train_gtzan_dataset( 41 | data: dict, 42 | hp: Hyperparameters, 43 | )-> model_file: 44 | model = train(data=data, hp=hp) 45 | Dir = "model" 46 | model.save(Dir) 47 | return (Dir,) 48 | 49 | 50 | @workflow 51 | def flyteworkflow( 52 | dataset_path: str = GTZAN_ZIP_FILE_PATH 53 | )-> workflow_outputs: 54 | download_gtzan_dataset() 55 | clean_gtzan_dataset() 56 | processed_data = preprocess_gtzan_dataset( 57 | dataset_path=dataset_path, 58 | ) 59 | model = train_gtzan_dataset( 60 | data=processed_data, 61 | hp=Hyperparameters(epochs=10) 62 | ) 63 | 64 | return (model.model,) 65 | 66 | 67 | if __name__ == "__main__": 68 | print(f"Running {__file__} main...") 69 | print(flyteworkflow()) 70 | -------------------------------------------------------------------------------- /projects/bravemusic/dashboard/Option_B/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import File 2 | import streamlit as st 3 | from streamlit_option_menu import option_menu 4 | import requests 5 | from pydub import AudioSegment 6 | 7 | with st.sidebar: 8 | selected = option_menu( 9 | menu_title="Main Menu", # required 10 | options=["Home", "Project Design", "Meet The Team"], # required 11 | icons=["house", "diagram-2", "people"], # optional 12 | menu_icon="cast", # optional 13 | default_index=0, # optional 14 | ) 15 | 16 | 17 | if selected == "Home": 18 | st.markdown( 19 | """

Music Genre Classification

""", 20 | unsafe_allow_html=True, 21 | ) 22 | 23 | UploadAudio = st.file_uploader("Upload Music To Classify", type=["wav", "mp3"]) 24 | st.markdown("""

Play:

""", unsafe_allow_html=True) 25 | st.audio(UploadAudio) 26 | 27 | if st.button("Predict"): 28 | if UploadAudio is not None: 29 | if UploadAudio.type == "audio/mpeg":  # convert mp3 uploads to wav first 30 | AudioSegment.from_mp3(UploadAudio).export("file.wav", format="wav") 31 | UploadAudio = open("file.wav", "rb") 32 | response = requests.post("http://127.0.0.1:8000/predict", files={"file": UploadAudio}) 33 | prediction = response.json()["Genre"] 34 | st.success(f"You're Listening to: {prediction}") 35 | 36 | 37 | if selected == "Project Design": 38 | st.markdown( 39 | """

Our Project Holistic View

""", 40 | unsafe_allow_html=True, 41 | ) 42 | if selected == "Meet The Team": 43 | st.markdown( 44 | """

Meet Our Amazing Team

""", 45 | unsafe_allow_html=True, 46 | ) 47 | -------------------------------------------------------------------------------- /projects/bravemusic/dashboard/Option_B/serve.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import uvicorn 3 | import numpy as np 4 | import pandas as pd 5 | from pydantic import BaseModel 6 | import mlflow 7 | from fastapi import FastAPI, File, UploadFile 8 | from pip import main 9 | import tensorflow as tf 10 | import librosa 11 | import math 12 | 13 | # TODO 14 | # To accept Audio file sending from streamlit 15 | 16 | 17 | # Initiate app instance 18 | app = FastAPI(title="Brave Hyena", version="1.0", description="Trying Locally") 19 | 20 | # in deployment we use remote.fetch_workflow_execution to get the model 21 | model = tf.keras.models.load_model("< Copy paste the flyteworkflow output url>") 22 | genre = { 23 | 0: "blues", 24 | 1: "classical", 25 | 2: "country", 26 | 3: "disco", 27 | 4: "hiphop", 28 | 5: "jazz", 29 | 6: "metal", 30 | 7: "pop", 31 | 8: "reggae", 32 | 9: "rock", 33 | } 34 | 35 | 36 | # Api root or home endpoint 37 | @app.get("/") 38 | @app.get("/home") 39 | def read_home(): 40 | """ 41 | Home endpoint which can be used to test the availability of the application. 42 | """ 43 | return {"message": "Looks Good"} 44 | 45 | 46 | data = {"mfcc": []} 47 | 48 | 49 | @app.post("/predict") 50 | async def predict(file: UploadFile = File(...)): 51 | # Extract data in correct order 52 | hop_length = 512 53 | num_segments = 10 54 | SAMPLE_RATE = 22050 55 | TRACK_DURATION = 30 # measured in seconds 56 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION 57 | 58 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments) 59 | num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length) 60 | audio, sample_rate = librosa.load(file.file, 22050) 61 | for d in range(num_segments): 62 | start = samples_per_segment * d 63 | finish = start + samples_per_segment 64 | mfcc = librosa.feature.mfcc( 65 | audio[start:finish], sample_rate, n_mfcc=13, n_fft=2048, hop_length=512 66 | ) 67 | mfcc = mfcc.T 68 | break 69 | 70 | data["mfcc"].append(mfcc.tolist()) if len( 71 | mfcc 72 | ) == num_mfcc_vectors_per_segment else print( 73 | "It's not the same as the Trained data" 74 | ) 75 | test = np.array(data["mfcc"]) 76 | predict_x = model.predict(test) 77 | prediction = np.argmax(predict_x, axis=1) 78 | 79 | return {"Genre": genre[round(prediction.mean())]} 80 | return item 81 | 82 | 83 | @app.get("/") 84 | async def root(): 85 | def main(): 86 | uvicorn.run(app, host="0.0.0.0", port=8000, reload=True) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /projects/bravemusic/dashboard/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | 5 | import streamlit as st 6 | 7 | from flytekit.remote import FlyteRemote 8 | from flytekit.models import filters 9 | from flytekit.models.admin.common import Sort 10 | 11 | from sklearn.datasets import load_digits 12 | from sqlite3 import DatabaseError 13 | from itsdangerous import json 14 | import streamlit as st 15 | from streamlit_option_menu import option_menu 16 | import numpy as np 17 | from pydub import AudioSegment 18 | import librosa 19 | import math 20 | import tensorflow as tf 21 | 22 | 23 | PROJECT_NAME = "flytelab-final".replace("_", "-") 24 | 
WORKFLOW_NAME = "final.workflows.main" 24 | 25 | 26 | 27 | parser = ArgumentParser() 28 | parser.add_argument("--remote", action="store_true") 29 | args = parser.parse_args() 30 | 31 | backend = os.getenv("FLYTE_BACKEND", "remote" if args.remote else "sandbox") 32 | 33 | # configuration for accessing a Flyte cluster backend 34 | remote = FlyteRemote.from_config( 35 | default_project=PROJECT_NAME, 36 | default_domain="development", 37 | config_file_path=Path(__file__).parent / f"{backend}.config", 38 | ) 39 | 40 | # get the latest workflow execution 41 | [latest_execution, *_], _ = remote.client.list_executions_paginated( 42 | PROJECT_NAME, 43 | "development", 44 | limit=1, 45 | filters=[ 46 | filters.Equal("launch_plan.name", WORKFLOW_NAME), 47 | filters.Equal("phase", "SUCCEEDED"), 48 | ], 49 | sort_by=Sort.from_python_std("desc(execution_created_at)"), 50 | ) 51 | 52 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name) 53 | remote.sync(wf_execution, sync_nodes=False) 54 | modelurl = wf_execution.outputs["o0"] 55 | print(modelurl) 56 | 57 | 58 | ############ 59 | # App Code # 60 | ############ 61 | 62 | 63 | with st.sidebar: 64 | selected = option_menu( 65 | menu_title="Main Menu", # required 66 | options=["Home", "Project Design", "Meet The Team"], # required 67 | icons=["house", "diagram-2", "people"], # optional 68 | menu_icon="cast", # optional 69 | default_index=0, # optional 70 | ) 71 | 72 | 73 | if selected == "Home": 74 | st.markdown( 75 | """

Music Genre Classification

""", 76 | unsafe_allow_html=True, 77 | ) 78 | # in deployment we use remote.fetch_workflow_execution to get the model url 79 | model = tf.keras.models.load_model(modelurl) 80 | genre = { 81 | 0: "Blues", 82 | 1: "Classical", 83 | 2: "Country", 84 | 3: "Disco", 85 | 4: "Hiphop", 86 | 5: "Jazz", 87 | 6: "Metal", 88 | 7: "Pop", 89 | 8: "Reggae", 90 | 9: "Rock", 91 | } 92 | 93 | global type 94 | UploadAudio = st.file_uploader("Upload Music To Classify", type=["wav", "mp3"]) 95 | st.markdown("""

Play:

""", unsafe_allow_html=True) 96 | st.audio(UploadAudio) 97 | hop_length = 512 98 | num_segments = 10 99 | SAMPLE_RATE = 22050 100 | TRACK_DURATION = 30 # measured in seconds 101 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION 102 | 103 | data = {"mfcc": []} 104 | 105 | if st.button("Predict"): 106 | if UploadAudio is not None: 107 | type = UploadAudio.type 108 | if type == "audio/mpeg": 109 | UploadAudio = AudioSegment.from_mp3(UploadAudio) 110 | UploadAudio.export("file.wav", format="wav") 111 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments) 112 | num_mfcc_vectors_per_segment = math.ceil( 113 | samples_per_segment / hop_length 114 | ) 115 | audio, sample_rate = librosa.load("file.wav", sr=22050)  # load the exported wav, not the AudioSegment 116 | for d in range(num_segments): 117 | start = samples_per_segment * d 118 | finish = start + samples_per_segment 119 | mfcc = librosa.feature.mfcc( 120 | y=audio[start:finish], 121 | sr=sample_rate, 122 | n_mfcc=13, 123 | n_fft=2048, 124 | hop_length=512, 125 | ) 126 | mfcc = mfcc.T 127 | break 128 | 129 | data["mfcc"].append(mfcc.tolist()) if len( 130 | mfcc 131 | ) == num_mfcc_vectors_per_segment else print( 132 | "It's not the same as the Trained data" 133 | ) 134 | 135 | test = np.array(data["mfcc"]) 136 | predict_x = model.predict(test) 137 | predictions = np.argmax(predict_x, axis=1) 138 | prediction = genre[round(predictions.mean())] 139 | 140 | st.markdown( 141 | f"""

You're Listening to : {prediction}

""", 142 | unsafe_allow_html=True, 143 | ) 144 | 145 | else: 146 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments) 147 | num_mfcc_vectors_per_segment = math.ceil( 148 | samples_per_segment / hop_length 149 | ) 150 | audio, sample_rate = librosa.load(UploadAudio, sr=22050) 151 | for d in range(num_segments): 152 | start = samples_per_segment * d 153 | finish = start + samples_per_segment 154 | mfcc = librosa.feature.mfcc( 155 | y=audio[start:finish], 156 | sr=sample_rate, 157 | n_mfcc=13, 158 | n_fft=2048, 159 | hop_length=512, 160 | ) 161 | mfcc = mfcc.T 162 | break 163 | 164 | data["mfcc"].append(mfcc.tolist()) if len( 165 | mfcc 166 | ) == num_mfcc_vectors_per_segment else print( 167 | "It's not the same as the Trained data" 168 | ) 169 | 170 | test = np.array(data["mfcc"]) 171 | predict_x = model.predict(test) 172 | predictions = np.argmax(predict_x, axis=1) 173 | prediction = genre[round(predictions.mean())] 174 | 175 | st.markdown( 176 | f"""

You're Listening to : {prediction}

""", 177 | unsafe_allow_html=True, 178 | ) 179 | # st.success(f"You're Listening to: {genre[round(prediction.mean())]}") 180 | 181 | 182 | if selected == "Project Design": 183 | st.markdown( 184 | """

Our Project Holistic View

""", 185 | unsafe_allow_html=True, 186 | ) 187 | if selected == "Meet The Team": 188 | st.markdown( 189 | """

Meet Our Amazing Team

""", 190 | unsafe_allow_html=True, 191 | ) 192 | -------------------------------------------------------------------------------- /projects/bravemusic/dashboard/remote.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=playground.hosted.unionai.cloud 3 | insecure=False 4 | 5 | [credentials] 6 | client_id=flytepropeller 7 | auth_mode=basic 8 | authorization_metadata-key=flyte-authorization 9 | oauth_scopes=all 10 | -------------------------------------------------------------------------------- /projects/bravemusic/dashboard/sandbox.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=localhost:30081 3 | insecure=True 4 | 5 | [aws] 6 | access_key_id=minio 7 | secret_access_key=miniostorage 8 | endpoint=http://localhost:30084 9 | -------------------------------------------------------------------------------- /projects/bravemusic/deploy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import uuid 4 | from pathlib import Path 5 | 6 | import docker 7 | import git 8 | import typer 9 | 10 | 11 | app = typer.Typer() 12 | 13 | docker_client = docker.from_env() 14 | 15 | 16 | IMAGE_NAME = "flytelab" 17 | REGISTRY = "ghcr.io/Abdullahi-Ahmed".lower() 18 | PROJECT_NAME = "flytelab-bravemusic".replace("_", "-").lower() 19 | DESCRIPTION = "Hackathon brave-hyenas-2 team project" 20 | 21 | 22 | def create_project(remote: bool): 23 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 24 | output = subprocess.run( 25 | [ 26 | "flytectl", 27 | "get", 28 | "project", 29 | PROJECT_NAME, 30 | "--config", 31 | config, 32 | ], 33 | capture_output=True, 34 | check=True, 35 | ) 36 | if output.stdout.decode().strip(): 37 | return 38 | 39 | typer.echo(f"Creating project {PROJECT_NAME}") 40 | subprocess.run( 41 | [ 42 | "flytectl", 43 | "create", 44 | "project", 45 | "--project", 46 | PROJECT_NAME, 47 | "--name", 48 | PROJECT_NAME, 49 | "--id", 50 | PROJECT_NAME, 51 | "--description", 52 | DESCRIPTION, 53 | "--config", 54 | config, 55 | ], 56 | check=True, 57 | ) 58 | 59 | 60 | def get_version(fast: bool): 61 | repo = git.Repo(".", search_parent_directories=True) 62 | if not fast and repo.is_dirty(): 63 | typer.echo( 64 | "Please commit git changes before building. 
If you haven't updated any system/python dependencies " 65 | "but want to deploy task/workflow code changes, use the --fast flag to do fast registration.", 66 | err=True, 67 | ) 68 | raise typer.Exit(code=1) 69 | commit = repo.rev_parse("HEAD") 70 | return commit.hexsha 71 | 72 | 73 | def get_tag(version, registry=None): 74 | return f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:{PROJECT_NAME}-{version}" 75 | 76 | 77 | def sandbox_docker_build(tag): 78 | typer.echo("Building image in Flyte sandbox") 79 | subprocess.run( 80 | [ 81 | "flytectl", 82 | "sandbox", 83 | "exec", 84 | "--", 85 | "docker", 86 | "build", 87 | ".", 88 | "--tag", 89 | tag, 90 | ], 91 | check=True, 92 | ) 93 | 94 | 95 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image: 96 | client = docker.from_env() 97 | 98 | # TODO: image build, push, flytectl serialization and registration 99 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 100 | 101 | typer.echo(f"Building image: {tag}") 102 | image, build_logs = client.images.build( 103 | path=".", 104 | dockerfile="Dockerfile", 105 | tag=tag, 106 | buildargs={ 107 | "image": tag, 108 | "config": str(config), 109 | }, 110 | ) 111 | for line in build_logs: 112 | typer.echo(line) 113 | return image 114 | 115 | 116 | def docker_push(image: docker.models.images.Image): 117 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True): 118 | typer.echo(line) 119 | 120 | 121 | def serialize(tag: str, remote: bool, fast: bool): 122 | typer.echo("Serializing Flyte workflows") 123 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 124 | package = Path(".") / "flyte-package.tgz" 125 | if package.exists(): 126 | os.remove(package) 127 | subprocess.run( 128 | [ 129 | "pyflyte", 130 | "-c", 131 | str(config), 132 | "--pkgs", 133 | "bravemusic", 134 | "package", 135 | "--force", 136 | "--image", 137 | tag, 138 | *(["--fast"] if fast else ["--in-container-source-path", "/root"]), 139 | ], 140 | check=True, 141 | # inject the FLYTE_SANDBOX environment variable to the serialization runtime 142 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ}, 143 | ) 144 | 145 | 146 | def register(version: str, remote: bool, fast: bool, domain: str): 147 | typer.echo("Registering Flyte workflows") 148 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 149 | if fast: 150 | version = f"{version}-fast{uuid.uuid4().hex[:7]}" 151 | subprocess.run( 152 | [ 153 | "flytectl", 154 | "-c", 155 | config, 156 | "register", 157 | "files", 158 | "--project", 159 | PROJECT_NAME, 160 | "--domain", 161 | domain, 162 | "--archive", 163 | "flyte-package.tgz", 164 | "--force", 165 | "--version", 166 | version, 167 | ], 168 | check=True, 169 | ) 170 | typer.echo(f"Successfully registered version {version}") 171 | 172 | 173 | @app.command() 174 | def main( 175 | remote: bool = False, 176 | fast: bool = False, 177 | domain: str = "staging", 178 | registry: str = None, 179 | ): 180 | if remote and fast: 181 | typer.echo( 182 | "Fast registration is not enabled when deploying to remote. 
" 183 | "Please deploy your workflows without the --fast flag.", 184 | err=True, 185 | ) 186 | create_project(remote) 187 | version = get_version(fast) 188 | tag = get_tag(version, registry) 189 | if not fast: 190 | if remote: 191 | docker_push(docker_build(tag, remote)) 192 | else: 193 | sandbox_docker_build(tag) 194 | serialize(tag, remote, fast) 195 | register(version, remote, fast, domain) 196 | 197 | 198 | if __name__ == "__main__": 199 | app() 200 | -------------------------------------------------------------------------------- /projects/bravemusic/procfile: -------------------------------------------------------------------------------- 1 | web: sh setup.sh && streamlit run ./dashboard/app.py -------------------------------------------------------------------------------- /projects/bravemusic/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | docker==5.0.3 2 | GitPython==3.1.27 3 | streamlit==1.8.1 4 | typer==0.4.1 5 | click==8.0.4 6 | streamlit_option_menu==0.3.2 7 | pydub==0.25.1 8 | itsdangerous==2.1.2 9 | -------------------------------------------------------------------------------- /projects/bravemusic/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | pandas==1.4.1 3 | s3fs==2022.2.0 4 | scikit-learn==1.0.2 5 | librosa==0.9.1 6 | tensorflow==2.8.0 7 | numpy==1.21 8 | joblib==1.1.0 9 | requests==2.27.1 10 | dataclasses_json==0.5.7 11 | black==22.1.0 -------------------------------------------------------------------------------- /projects/bravemusic/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ~/.streamlit/ 2 | 3 | echo "\ 4 | [general]\n\ 5 | email = \"your-email@domain.com\"\n\ 6 | " > ~/.streamlit/credentials.toml 7 | 8 | echo "\ 9 | [server]\n\ 10 | headless = true\n\ 11 | enableCORS=false\n\ 12 | port = $PORT\n\ 13 | " > ~/.streamlit/config.toml -------------------------------------------------------------------------------- /projects/destinations_similarity/.dockerignore: -------------------------------------------------------------------------------- 1 | !.flyte -------------------------------------------------------------------------------- /projects/destinations_similarity/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: Pkce 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | insecure: false 7 | storage: 8 | type: stow 9 | stow: 10 | kind: s3 11 | config: 12 | auth_type: iam 13 | region: us-east-2 14 | logger: 15 | # Logger settings to control logger output. Useful to debug logger: 16 | show-source: true 17 | level: 1 18 | -------------------------------------------------------------------------------- /projects/destinations_similarity/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. 
These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=destinations_similarity 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://open-compute-playground 7 | -------------------------------------------------------------------------------- /projects/destinations_similarity/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true 20 | -------------------------------------------------------------------------------- /projects/destinations_similarity/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=destinations_similarity 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 7 | -------------------------------------------------------------------------------- /projects/destinations_similarity/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | # Dockerfile that sets up the Flyte image 4 | 5 | ARG image 6 | ARG config 7 | 8 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime 9 | 10 | # Defining environment variables 11 | ENV APPUSER="flyte" 12 | ENV LANG="C.UTF-8" 13 | ENV LC_ALL="C.UTF-8" 14 | ENV PATH="/home/${APPUSER}/.local/bin:${PATH}" 15 | ENV PYTHONPATH="/home/${APPUSER}:${PYTHONPATH}" 16 | 17 | # Updating and cleaning system 18 | RUN apt-get update && \ 19 | apt-get upgrade -y && \ 20 | apt-get install -y build-essential git && \ 21 | apt-get autoremove -yqq --purge && \ 22 | apt-get clean 23 | 24 | # Changing the user so that the container is non-root 25 | RUN useradd -u 1024 -m "${APPUSER}" 26 | USER "${APPUSER}" 27 | WORKDIR "/home/${APPUSER}" 28 | 29 | # Setup virtual environment 30 | ENV VENV="/home/${APPUSER}/venv" 31 | RUN python -m venv ${VENV} 32 | ENV PATH="${VENV}/bin:${PATH}" 33 | 34 | # Copy requirements 35 | COPY requirements.txt "/home/${APPUSER}/requirements.txt" 36 | 37 | # Upgrade pip, install dependencies and awscli 38 | RUN python -m pip install -U pip && \ 39 | pip install -r requirements.txt awscli 40 | 41 | # Copy the code and configuration 42 | COPY --chown="${APPUSER}:${APPUSER}" \ 43 | destinations_similarity/ "/home/${APPUSER}/destinations_similarity" 44 | COPY $config "/home/${APPUSER}/flyte.config" 45 | 46 | # Download nltk files 47 | RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')" 48 | 49 | # Tag the image 50 | ENV FLYTE_INTERNAL_IMAGE="$image" 51 | -------------------------------------------------------------------------------- /projects/destinations_similarity/Makefile: -------------------------------------------------------------------------------- 1 | open-docs: 2 | @sh scripts/open_docs.sh 3 | 4 | rebuild-docs: 5 | @sh scripts/rebuild_docs.sh 6 | 7 | stop-docs-server: 8 | @docker 
stop sphinx-nginx -------------------------------------------------------------------------------- /projects/destinations_similarity/README.md: -------------------------------------------------------------------------------- 1 | # Destinations Similarity 2 | 3 | ### Short description 4 | 5 | Similar destination search. 6 | 7 | ### Problem statement 8 | 9 | Kinzinho is an adventurous dog who wants to know all about the destinations he could go to. To do so, Kinzinho decided to extract public data from Wikipedia and Wikivoyager to get to know them all! But now he realized it's too much and wants some recommendations based on where he's traveled before. Can we help him? 10 | 11 | Tip: of course! Everything is 'paw-sible' when you are a dog! :D 12 | 13 | ![kinzinho Big Dog](docs/images/kinzinhoBigDog.png) 14 | 15 | 16 | ### Solution implementation 17 | 18 | The solution to the problem was to extract the public database of Brazilian cities from Wikidata and model the relevant characteristics of cities to build a unique representation of each city. From there, we were able to determine the similarities by calculating the distances between the 19 | vector representations. 20 | 21 | The system's workflow was implemented in Flyte and is shown below: 22 | 23 | ![Solution Workflow](docs/images/SolutionDiagram.png) 24 | 25 | ### Detailed solution 26 | 27 | **Objective**: To help Kinzinho define his next travel destination, we seek to find other cities similar to the last travel destination he liked the most. 28 | 29 | **Strategy Solution**: To make a good evaluation between the cities, we chose to make a vector representation of each city in Brazil, encoding general information about each city such as its history, its geography, its climate and its tourist attractions. We chose this strategy because with a vector representation of the city we were able to apply similarity calculation operations between cities, considering various information about them. 30 | 31 | **Input data**: For our solution we use the following data from Wikipedia PT: summary, history, geography, climate; from Wikivoyage EN: summary, "See" section, "Do" section. 32 | 33 | **Preprocessing**: To process the data and extract only important information, we apply a series of pre-processing to clean unnecessary information and homogenize the texts. 34 | 35 | **Model**: To generate the best vector representation of each city's features, we used a pre-trained state-of-the-art model based on Transformers (BERTimbau). As a vector representation of each feature of the city, we use the output of the last layer of the BERTimbau language model. The vector representation of each city is generated from the average of the vectors of its features. 36 | 37 | **Similarity**: To calculate the similarity between the vector representations of each city, we use [Faiss](https://github.com/facebookresearch/faiss), an highly optimized similarity search library, to calculate the Euclidean distance between an input vector query (vector of the last city visited by Kinzinho) and all the other vectors of the cities available in our portfolio. 38 | 39 | 40 | ### Streamlit interface 41 | 42 | > ###### WARNING 43 | > It is important to note that our app on Streamlit does not have all the cities available in the original database for design reasons. The original database has about 5 thousand cities while the base used in the app will only present about 400 cities, the ones with pages on Wikivoyage. 
However, with some adjustments and a properly developed environment, it is possible to extend this analysis to all other cities. 44 | 45 | The user interaction interface was built using the Streamlit tool. After local testing, the stable version of the tool was posted on Streamlit's public server. You can access the interface through the link below. 46 | 47 | 48 | ### Sphinx docs 49 | 50 | > ###### WARNING 51 | > This documentation was built with Unix operating systems (or executions using WSL) in mind. 52 | 53 | It is possible to generate a detailed HTML documentation of the project through automation made in Sphinx and NGIX server to host the static HTMLs. 54 | 55 | ![Sphinx](docs/images/sphinx_server.png) 56 | 57 | There's not much mystery about building the documentation in HTML. We've already automated some things to make it easier. Generally speaking, Sphinx is responsible for creating a static page of an HTML documentation using manually typed information or other information inserted into the developed code. These generated static pages are moved into a folder in a container running an NGINX image which hosts the documentation page. 58 | 59 | To build the Docker image responsible for the documentation and start hosting the server, just run the command 60 | 61 | make open-docs 62 | 63 | > ###### WARNING 64 | > For this command to work it is necessary that in your system it is possible to run Makefile files and ensure that the working directory is inside `projects/destinations_similarity`. 65 | 66 | Once the command has been successfully executed, you can check with the command below if the container is running normally on your machine. 67 | 68 | 69 | docker ps 70 | 71 | the result should be 72 | 73 | 74 | $ docker ps 75 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 76 | ... nginx "/docker-entrypoint.…" 36 seconds ago Up 35 seconds 0.0.0.0:8080->80/tcp, :::8080->80/tcp sphinx-nginx 77 | 78 | 79 | ### Team responsible for the project 80 | 81 | If you want to get in touch with the team members, use the communication channels below. 82 | 83 | | | E-mail | Linkedin | 84 | |---------------- |------------------------- |------------------------------------------------------------ | 85 | | Sergio Junior | sergio.junior@hurb.com | https://www.linkedin.com/in/sergio-barreto-jr/ | 86 | | Renata Gotler | renata.gotler@hurb.com | https://www.linkedin.com/in/renata-gotler/ | 87 | | Matheus Moreno | matheus.moreno@hurb.com | https://www.linkedin.com/in/matheusfmoreno/ | 88 | | Patrick Braz | patrick.braz@hurb.com | https://www.linkedin.com/in/patrick-franco-braz-752948163/ | 89 | 90 | ### Acknowledgments 91 | 92 | Kinzinho and his humans would like to thank everyone involved in making this project possible. They would also like to thank [Hurb](https://us.hurb.com/?pos=us)'s support in allowing and influencing participation in the Hackathon as training and recognition of the team's potential. And finally, thank Kinzinho himself for making the days of his humans around better. 93 | -------------------------------------------------------------------------------- /projects/destinations_similarity/conf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Configuration file for the Sphinx documentation builder. 3 | # 4 | # This file only contains a selection of the most common options. 
For a full 5 | # list see the documentation: 6 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 7 | 8 | # -- Path setup -------------------------------------------------------------- 9 | 10 | # If extensions (or modules to document with autodoc) are in another directory, 11 | # add these directories to sys.path here. If the directory is relative to the 12 | # documentation root, use os.path.abspath to make it absolute, like shown here. 13 | 14 | import os 15 | import sys 16 | #sys.path.insert(0, os.path.abspath(".")) 17 | sys.path.insert(0, "/home") 18 | 19 | # -- Project information ----------------------------------------------------- 20 | #from nightswatch.version import __version__ 21 | 22 | # The master toctree document. 23 | master_doc = "index" 24 | 25 | project = "Destination Similarity" 26 | copyright = "2022, hurb.com" 27 | author = "data.science@hurb.com" 28 | release = "0.0.1" 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx_rtd_theme", 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.viewcode", 38 | "sphinx.ext.napoleon", 39 | "sphinx.ext.autosummary" 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "env", ".flyte", "scripts"] 49 | 50 | # The suffix(es) of source filenames. 51 | # You can specify multiple suffix as a list of string: 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = [".rst"] 54 | 55 | # If true, the current module name will be prepended to all description 56 | # unit titles (such as .. function::). 57 | add_module_names = True 58 | 59 | # A boolean that decides whether codeauthor and sectionauthor directives produce any output in the 60 | # built files. 61 | show_authors = True 62 | 63 | suppress_warnings = [ 64 | ] 65 | 66 | autodoc_mock_imports = ["flytekit", "faiss", "torch", "requests", "BeautifulSoup", "pandas", "streamlit", "numpy", "bs4", 67 | "docker", "git", "typer", "sklearn", "streamlit", "argparse", "transformers", "swifter", "nltk", 68 | "unidecode", "deep_translator"] 69 | 70 | # -- Options for HTML output ------------------------------------------------- 71 | 72 | # The theme to use for HTML and HTML Help pages. See the documentation for 73 | # a list of builtin themes. 74 | # 75 | html_theme = "sphinx_rtd_theme" 76 | 77 | html_theme_options = { 78 | "analytics_anonymize_ip": False, 79 | "logo_only": False, 80 | "display_version": True, 81 | "prev_next_buttons_location": "bottom", 82 | "style_external_links": True, 83 | "vcs_pageview_mode": "", 84 | "style_nav_header_background": "gray", 85 | "collapse_navigation": False, 86 | "sticky_navigation": True, 87 | "navigation_depth": 4, 88 | "includehidden": True, 89 | "titles_only": False 90 | } 91 | 92 | html_logo = "docs/images/vamoDalheLogo.jpeg" 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 
97 | html_static_path = ["_static"] 98 | 99 | # --------------------------------------------------- 100 | -------------------------------------------------------------------------------- /projects/destinations_similarity/dashboard/app.py: -------------------------------------------------------------------------------- 1 | """Module for the Streamlit app.""" 2 | 3 | # pylint: disable=no-value-for-parameter 4 | 5 | import os 6 | import sys 7 | import logging 8 | from argparse import ArgumentParser 9 | 10 | from typing import List 11 | import faiss 12 | import streamlit as st 13 | import pandas as pd 14 | import numpy as np 15 | from PIL import Image 16 | 17 | GCS_BUCKET_PATH = "https://storage.googleapis.com/dsc-public-info/datasets/" 18 | EMBEDDINGS_FILENAME = "flytelab_embeddings.parquet" 19 | DATASET_FILENAME = "flytelab_dataset.parquet" 20 | 21 | CURRENT_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) 22 | 23 | # Logging config 24 | LOGGER = logging.getLogger(__name__) 25 | 26 | logging.basicConfig( 27 | stream=sys.stdout, 28 | level=logging.INFO, 29 | format="[%(asctime)s] %(name)s: %(levelname)s | %(message)s" 30 | ) 31 | 32 | 33 | def retrieve_dataframe_from_remote(dataset_name: str) -> pd.DataFrame: 34 | """Retrieve a dataset saved as Parquet from remote.""" 35 | return pd.read_parquet(GCS_BUCKET_PATH + dataset_name) 36 | 37 | 38 | def get_k_nearest_neighbors( 39 | embeddings: pd.DataFrame, k_neighbors: int, city_name: str, state_name: str 40 | ) -> pd.DataFrame: 41 | """Retrieve the k-nearest neighbors of a city. 42 | 43 | Args: 44 | embeddings (pd.DataFrame): city vectors 45 | k_neighbors (int): number os similar cities to present 46 | city_name (str): last city visited 47 | state_name (str): last state visited 48 | Returns: 49 | pd.DataFrame: the cities most similar to city_name 50 | """ 51 | # Retrieve vectors to search 52 | vec_name = embeddings[~( 53 | (embeddings['city'] == city_name) & (embeddings['state'] == state_name) 54 | )].reset_index(drop=True) 55 | vec = vec_name.drop(['city', 'state'], axis=1) 56 | 57 | # Initialize faiss 58 | index = faiss.IndexFlatL2(vec.shape[1]) 59 | index.add(np.ascontiguousarray(np.float32(vec.values))) 60 | 61 | # Build query 62 | query = embeddings[( 63 | (embeddings['city'] == city_name) & (embeddings['state'] == state_name) 64 | )].drop(['city', 'state'], axis=1).values 65 | query = np.float32(query) 66 | 67 | # Retrieve k-nearest neighbors 68 | _, indexes = index.search(query, k_neighbors) 69 | nearest = vec_name[['city', 'state']].iloc[indexes[0]] 70 | 71 | return nearest 72 | 73 | 74 | def build_output( 75 | dataset: pd.DataFrame, nearest_cities: pd.DataFrame, 76 | columns_to_retrieve: List[str] 77 | ) -> pd.DataFrame: 78 | """Build the output text of inference. 79 | 80 | Args: 81 | dataset (pd.DataFrame): dataset scraper from wikipedia and wikivoyage 82 | nearest_cities (pd.DataFrame): output model of the nearest cities 83 | columns_to_retrieve (List[str]): list of columns to add to output 84 | 85 | Returns: 86 | str: Markdown-formatted text 87 | """ 88 | output = "" 89 | default_desc = ( 90 | "\nOops... Unfortunately we don't have records for this city. 
" 91 | "\U0001F615\n" 92 | ) 93 | 94 | for _, row in nearest_cities.iterrows(): 95 | output += f"\n## {row.city}, {row.state}\n" 96 | 97 | pois_suggestion = dataset[ 98 | (dataset['city'] == row.city) & (dataset['state'] == row.state) 99 | ][columns_to_retrieve].iloc[0] 100 | 101 | for column in columns_to_retrieve: 102 | section = ' '.join(column.split('_')[:-2]).capitalize() 103 | output += ( 104 | f"\n### {section}" 105 | f"\n{pois_suggestion[column] or default_desc}" 106 | ) 107 | 108 | return output 109 | 110 | 111 | if __name__ == '__main__': 112 | # Retrieve arguments 113 | parser = ArgumentParser() 114 | parser.add_argument("--remote", action="store_true") 115 | args = parser.parse_args() 116 | backend = os.getenv( 117 | "FLYTE_BACKEND", 'remote' if args.remote else 'sandbox') 118 | 119 | # Retrieve datasets from remote 120 | embs_df = retrieve_dataframe_from_remote(EMBEDDINGS_FILENAME) 121 | wiki_df = retrieve_dataframe_from_remote(DATASET_FILENAME) 122 | 123 | # App definition 124 | st.write( 125 | "# Flytelab: Destinations Similarity\n" 126 | "Kinder is an adventurous dog who loves to travel! He enjoys " 127 | "specially nature places: beaches, waterfalls, trails and more, " 128 | "which Brazil surely is abundant of. He wants experiences in other " 129 | "cities but he doesn't know where to go.\n" 130 | "## So he is now asking, **where should I go next**?" 131 | ) 132 | 133 | beach_kinder = Image.open( 134 | os.path.join(CURRENT_DIRECTORY, 'beach_kinder.jpeg')) 135 | st.image(beach_kinder, caption='Kinder in love with the beach') 136 | 137 | st.write( 138 | "Help Kinder by selecting a city you like in Brazil below so we can " 139 | "recommend similar places that he will most certainly enjoy!" 140 | ) 141 | 142 | # Select city, state, and n of recommendations 143 | desired_state = st.selectbox( 144 | 'From state...', 145 | embs_df['state'].unique().tolist() 146 | ) 147 | desired_city = st.selectbox( 148 | 'I like the city:', 149 | embs_df[embs_df['state'] == desired_state]['city'].unique().tolist() 150 | ) 151 | 152 | n_cities = st.slider('How many recommendations do you want?', 1, 30, 5) 153 | 154 | # Get recommendations 155 | cities_recommended = get_k_nearest_neighbors( 156 | embeddings=embs_df, k_neighbors=n_cities, 157 | city_name=desired_city, state_name=desired_state 158 | ) 159 | 160 | st.write("## So, where next?") 161 | st.write(build_output( 162 | dataset=wiki_df, nearest_cities=cities_recommended, 163 | columns_to_retrieve=[ 164 | 'summary_wikivoyage_en' 165 | ] 166 | )) 167 | 168 | kinder = Image.open(os.path.join(CURRENT_DIRECTORY, 'kinder.jpeg')) 169 | st.image(kinder, caption='The marvelous Kinder') 170 | 171 | st.write( 172 | "We hope you enjoy the recommendations! See you on your next trip." 
173 | ) 174 | -------------------------------------------------------------------------------- /projects/destinations_similarity/dashboard/beach_kinder.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/dashboard/beach_kinder.jpeg -------------------------------------------------------------------------------- /projects/destinations_similarity/dashboard/kinder.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/dashboard/kinder.jpeg -------------------------------------------------------------------------------- /projects/destinations_similarity/dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | pandas>=1.3.5 3 | requests~=2.27.1 4 | faiss-cpu~=1.7.2 5 | numpy>=1.21.5 6 | deep-translator~=1.8.3 7 | streamlit -------------------------------------------------------------------------------- /projects/destinations_similarity/deploy.py: -------------------------------------------------------------------------------- 1 | """Deployment script for Flyte projects.""" 2 | 3 | import os 4 | import uuid 5 | import subprocess 6 | from pathlib import Path 7 | 8 | import docker 9 | import git 10 | import typer 11 | 12 | 13 | IMAGE_NAME = "flytelab" 14 | REGISTRY = "ghcr.io/patrickfbraz".lower() 15 | PROJECT_NAME = "vamos-dalhe" 16 | DESCRIPTION = "Hurb project to the Flyte Hackathon" 17 | 18 | 19 | app = typer.Typer() 20 | 21 | docker_client = docker.from_env() 22 | 23 | 24 | def create_project(remote: bool): 25 | """Create project on Flyte cluster.""" 26 | config_type = 'remote' if remote else 'sandbox' 27 | config = Path(".flyte") / f"{config_type}-config.yaml" 28 | 29 | output = subprocess.run( 30 | [ 31 | "flytectl", "get", "project", PROJECT_NAME, 32 | "--config", config, 33 | ], 34 | capture_output=True, 35 | check=True, 36 | ) 37 | 38 | if not output.stdout.decode().strip(): 39 | typer.echo(f"Creating project {PROJECT_NAME}...") 40 | subprocess.run( 41 | [ 42 | "flytectl", "create", "project", 43 | "--project", PROJECT_NAME, 44 | "--name", PROJECT_NAME, 45 | "--id", PROJECT_NAME, 46 | "--description", DESCRIPTION, 47 | "--config", config, 48 | ], 49 | check=True, 50 | ) 51 | 52 | 53 | def get_version(fast: bool): 54 | """Get git version of code.""" 55 | repo = git.Repo(".", search_parent_directories=True) 56 | 57 | if not fast and repo.is_dirty(): 58 | typer.echo( 59 | "Please commit git changes before building. 
If you haven't updated" 60 | " any system/python dependencies but want to deploy task/workflow " 61 | "code changes, use the --fast flag to do fast registration.", 62 | err=True 63 | ) 64 | raise typer.Exit(code=1) 65 | 66 | commit = repo.rev_parse("HEAD") 67 | return commit.hexsha 68 | 69 | 70 | def get_tag(version, registry=None): 71 | """Get the tag of the project's image.""" 72 | return ( 73 | f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:" 74 | f"{PROJECT_NAME}-{version}" 75 | ) 76 | 77 | 78 | def sandbox_docker_build(tag): 79 | """Build image on the sandbox cluster.""" 80 | typer.echo("Building image in Flyte sandbox...") 81 | subprocess.run( 82 | [ 83 | "flytectl", "sandbox", "exec", "--", 84 | "docker", "build", ".", "--tag", tag, 85 | ], 86 | check=True, 87 | ) 88 | 89 | 90 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image: 91 | """Build the image locally.""" 92 | client = docker.from_env() 93 | 94 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 95 | 96 | typer.echo(f"Building image: {tag}...") 97 | image, build_logs = client.images.build( 98 | path=".", dockerfile="Dockerfile", tag=tag, 99 | buildargs={"image": tag, "config": str(config)}, rm=True, 100 | ) 101 | 102 | for line in build_logs: 103 | typer.echo(line) 104 | 105 | return image 106 | 107 | 108 | def docker_push(image: docker.models.images.Image): 109 | """Push the image to the remote registry.""" 110 | for line in docker_client.api.push( 111 | image.tags[0], stream=True, decode=True 112 | ): 113 | typer.echo(line) 114 | 115 | 116 | def serialize(tag: str, remote: bool, fast: bool): 117 | """Perform serialization of source code.""" 118 | typer.echo("Serializing Flyte workflows...") 119 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 120 | 121 | package = Path(".") / "flyte-package.tgz" 122 | if package.exists(): 123 | os.remove(package) 124 | 125 | subprocess.run( 126 | [ 127 | "pyflyte", "-c", str(config), 128 | "--pkgs", "destinations_similarity", 129 | "package", "--force", "--image", tag, 130 | *( 131 | ["--fast"] if fast 132 | else ["--in-container-source-path", "/home/flyte"] 133 | ), 134 | ], 135 | check=True, 136 | # Inject the FLYTE_SANDBOX env variable to the serialization runtime 137 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ}, 138 | ) 139 | 140 | 141 | def register(version: str, remote: bool, fast: bool, domain: str): 142 | """Register workflows to cluster.""" 143 | typer.echo("Registering Flyte workflows...") 144 | config_type = 'remote' if remote else 'sandbox' 145 | config = Path(".flyte") / f"{config_type}-config.yaml" 146 | 147 | if fast: 148 | version = f"{version}-fast{uuid.uuid4().hex[:7]}" 149 | 150 | subprocess.run( 151 | [ 152 | "flytectl", "-c", config, "register", "files", 153 | "--project", PROJECT_NAME, 154 | "--domain", domain, 155 | "--archive", "flyte-package.tgz", 156 | "--force", 157 | "--version", version 158 | ], 159 | check=True, 160 | ) 161 | typer.echo(f"Successfully registered version {version}.") 162 | 163 | 164 | @app.command() 165 | def main( 166 | remote: bool = False, fast: bool = False, domain: str = "development", 167 | registry: str = None 168 | ) -> None: 169 | """Deploy Flyte workflows locally or remotely.""" 170 | if remote and fast: 171 | typer.echo( 172 | "Fast registration is not enabled when deploying to remote. 
" 173 | "Please deploy your workflows without the --fast flag.", 174 | err=True 175 | ) 176 | 177 | create_project(remote) 178 | version = get_version(fast) 179 | tag = get_tag(version, registry) 180 | if not fast: 181 | if remote: 182 | docker_push(docker_build(tag, remote)) 183 | else: 184 | sandbox_docker_build(tag) 185 | serialize(tag, remote, fast) 186 | register(version, remote, fast, domain) 187 | 188 | 189 | if __name__ == "__main__": 190 | app() 191 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/__init__.py: -------------------------------------------------------------------------------- 1 | """Package with the source code for the destinations_similarity project.""" 2 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """Processing submodule for the project.""" 2 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/processing/feature_engineering.py: -------------------------------------------------------------------------------- 1 | """Feature engineering for the data.""" 2 | 3 | import torch 4 | import pandas as pd 5 | from transformers import AutoTokenizer, AutoModel 6 | 7 | 8 | BASE_MODEL = 'neuralmind/bert-base-portuguese-cased' 9 | 10 | 11 | class TextVectorizer(): 12 | """Class used to vectorize text.""" 13 | 14 | def __init__(self, model: str = BASE_MODEL) -> None: 15 | """Initialize class. 16 | 17 | Args: 18 | model (str): huggingface path 19 | """ 20 | self.tokenizer = AutoTokenizer.from_pretrained( 21 | model, do_lower_case=False) 22 | self.model = AutoModel.from_pretrained(model) 23 | 24 | def encode_inputs(self, series_text: pd.Series) -> torch.Tensor: 25 | """Encode inputs. 26 | 27 | Args: 28 | series_text (pd.Series): text to be vectorized 29 | 30 | Returns: 31 | torch.tensor: tokens ids 32 | """ 33 | input_ids = self.tokenizer( 34 | list(series_text), padding=True, truncation=True, 35 | max_length=256, return_tensors="pt", add_special_tokens=True 36 | ) 37 | return input_ids 38 | 39 | def get_df_embedding(self, input_ids: pd.Series) -> pd.DataFrame: 40 | """Generate DataFrame with all text vector representations. 41 | 42 | Args: 43 | input_ids (torch.tensor): tokens ids 44 | 45 | Returns: 46 | pd.DataFrame: input id vectors 47 | """ 48 | with torch.no_grad(): 49 | outs = self.model( 50 | input_ids['input_ids'] 51 | )[0][:, 1:-1, :].mean(axis=1).cpu().numpy() 52 | return pd.DataFrame(outs) 53 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/processing/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | """Text preprocessing tools.""" 2 | 3 | # pylama: ignore=W0611 4 | # pylint: disable=unused-import,broad-except 5 | 6 | import re 7 | from typing import List 8 | from string import punctuation 9 | 10 | import swifter 11 | import pandas as pd 12 | from nltk.tokenize import word_tokenize 13 | from nltk.corpus import stopwords 14 | from unidecode import unidecode 15 | from deep_translator import GoogleTranslator 16 | 17 | 18 | def lower_text(text: str) -> str: 19 | """Lower a text. 
20 | 21 | Args: 22 | text (str): text to be lowered 23 | 24 | Returns: 25 | str: lower text 26 | """ 27 | return text.lower() 28 | 29 | 30 | def clean_text(texts: str) -> str: 31 | """Remove unnecessary parts of the text. 32 | 33 | Args: 34 | text (str): text to be cleaned 35 | 36 | Returns: 37 | str: cleaned text 38 | """ 39 | # Remove empty lines 40 | clean_empt_msg = re.compile(r'\n\s*\n') 41 | text = re.sub(clean_empt_msg, " ", texts) 42 | 43 | # Transliterate into ASCII 44 | text = unidecode(text) 45 | 46 | # Remove API mensage 47 | clean_msg = re.compile(r'\(.*?\)') 48 | text = re.sub(clean_msg, ' ', text) 49 | 50 | # Remove HTML characteres 51 | cleanr = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') 52 | text = re.sub(cleanr, ' ', text) 53 | 54 | # Remove punctuations and numbers 55 | clean_pontuation = re.compile(r'[^a-zA-Z]') 56 | text = re.sub(clean_pontuation, ' ', text) 57 | 58 | # Single character removal 59 | clean_char = re.compile(r"\s+[a-zA-Z]\s+") 60 | text = re.sub(clean_char, ' ', text) 61 | 62 | # Removing multiple spaces 63 | clean_space = re.compile(r'\s+') 64 | text = re.sub(clean_space, ' ', text) 65 | 66 | return text 67 | 68 | 69 | def remove_stopwords( 70 | list_tokens: List[str], 71 | stopword_list: List[str] = stopwords.words('portuguese') 72 | ) -> str: 73 | """Remove stopwords of the text. 74 | 75 | Args: 76 | list_tokens (List[str]): list of sentence tokens 77 | stopword_list (List[str], optional): list of stopwords. Defaults to 78 | nltk's portuguese stopwords. 79 | 80 | Returns: 81 | List[str]: text without stopwords 82 | """ 83 | stopword = ( 84 | stopword_list + 85 | list(punctuation) + 86 | ["\n", 'municipio', 'clima'] 87 | ) 88 | 89 | txt_wo_stopwords = filter(lambda item: item not in stopword, list_tokens) 90 | return " ".join(txt_wo_stopwords) 91 | 92 | 93 | def tokenizer(text: str) -> List[str]: 94 | """Tokenize the text. 95 | 96 | Args: 97 | text (str): text to be tokenized 98 | 99 | Returns: 100 | List[str]: list of sentence tokens 101 | """ 102 | return word_tokenize(text) 103 | 104 | 105 | def preprocess_text(dataframe: pd.DataFrame, column_name: str) -> pd.Series: 106 | """Execute all of the preprocess methods. 107 | 108 | Args: 109 | dataframe (pd.DataFrame): dataframe with column to be processed 110 | column_name (str): column name to be processed 111 | 112 | Returns: 113 | pd.Series: column processed 114 | """ 115 | aux = dataframe[column_name].str.lower() 116 | aux = aux.swifter.apply(lambda x: clean_text(str(x))) 117 | aux = aux.swifter.apply( 118 | lambda x: remove_stopwords(list_tokens=tokenizer(x))) 119 | return aux 120 | 121 | 122 | def translate_description_series( 123 | dataframe: pd.DataFrame, column_name: str, target_lang: str = 'pt' 124 | ) -> pd.Series: 125 | """Translate columns to another language. 126 | 127 | Args: 128 | dataframe (pd.DataFrame): dataframe with column to be translated 129 | column_name (str): column name to be translated 130 | target_lang (str): taget language 131 | 132 | Returns: 133 | pd.Series: column translated 134 | """ 135 | dataframe[column_name] = dataframe[column_name].fillna("") 136 | dataframe[column_name] = dataframe[column_name].swifter.apply( 137 | lambda x: translate_description(x, target_lang) 138 | if isinstance(x, str) else x 139 | ) 140 | return dataframe[column_name] 141 | 142 | 143 | def translate_description(text: str, target_lang: str = 'pt') -> str: 144 | """Translate non-portuguese text. 
145 | 146 | Args: 147 | text (str): column name to be translated 148 | target_lang (str): taget language 149 | 150 | Returns: 151 | str: text translated 152 | """ 153 | try: 154 | return GoogleTranslator( 155 | source='auto', target=target_lang).translate(text) 156 | except Exception: 157 | return text 158 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | """Scraper submodule for the project.""" 2 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/scraper/brazilian_cities.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Module used to extract the base data from the wikis.""" 3 | 4 | import json 5 | from typing import Any 6 | 7 | import requests 8 | import pandas as pd 9 | 10 | 11 | WIKIDATA_ENDPOINT = 'https://query.wikidata.org/sparql' 12 | 13 | WIKIDATA_QUERY = """ 14 | PREFIX schema: 15 | 16 | SELECT ?cityLabel ?stateLabel ?wikivoyageLabel ?wikipediaLabel WHERE { 17 | ?city wdt:P31 wd:Q3184121; 18 | wdt:P131 ?state. 19 | OPTIONAL { 20 | ?wikipedia schema:about ?city. 21 | ?wikipedia schema:isPartOf ; 22 | schema:name ?wikipediaLabel. 23 | } 24 | OPTIONAL { 25 | ?wikivoyage schema:about ?city. 26 | ?wikivoyage schema:isPartOf ; 27 | schema:name ?wikivoyageLabel. 28 | } 29 | SERVICE wikibase:label { 30 | bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 31 | } 32 | } 33 | """ 34 | 35 | 36 | def get_dataframe(df_object: object, **kwargs) -> pd.DataFrame: 37 | """Generate a pandas DataFrame from a DataFrame-like object. 38 | 39 | Args: 40 | df_object (object): A DataFrame-like object (dict, list, etc). 41 | 42 | Returns: 43 | pd.DataFrame: The DataFrame. 44 | """ 45 | dataframe = pd.DataFrame(df_object) 46 | 47 | if kwargs.get('generate_city_id'): 48 | dataframe['city_id'] = [(row + 1) for row in range(dataframe.shape[0])] 49 | 50 | return dataframe 51 | 52 | 53 | def get_brazilian_cities_data(save_data: callable, *args, **kwargs) -> Any: 54 | """Get data from brazilian cities from Wikimedia pages. 55 | 56 | Args: 57 | save_data (callable): Function to process the retrieved data. 58 | 59 | Returns: 60 | Any: Type returned by save_data. 
61 | """ 62 | request = requests.get( 63 | WIKIDATA_ENDPOINT, 64 | params={ 65 | 'query': WIKIDATA_QUERY, 66 | 'format': 'json', 67 | }, 68 | allow_redirects=True, 69 | stream=True, 70 | ) 71 | 72 | response = json.loads(request.text) 73 | cities_raw = response['results']['bindings'] 74 | 75 | cities = sorted([{ 76 | 'city': elem.get('cityLabel', {}).get('value'), 77 | 'state': elem.get('stateLabel', {}).get('value'), 78 | 'title_wikipedia_pt': elem.get('wikipediaLabel', {}).get('value'), 79 | 'title_wikivoyage_en': elem.get('wikivoyageLabel', {}).get('value'), 80 | } for elem in cities_raw], key=lambda x: x['city']) 81 | 82 | return save_data(cities, *args, **kwargs) 83 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/scraper/extractor.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Base driver to scrape data from Wikimedia websites.""" 3 | 4 | import re 5 | import json 6 | from typing import Dict, List 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | 12 | APPLICATION_HEADERS = { 13 | 'User-Agent': 'destinations_similarity/0.1' 14 | } 15 | 16 | 17 | class WikiExtractor(object): 18 | """Class for extracting content from Wikimedia.""" 19 | 20 | def __init__(self, wiki: str, lang: str): 21 | """Initialize driver.""" 22 | self.wiki = wiki 23 | self.lang = lang 24 | self.rest_url = f"https://{lang}.{wiki}.org/api/rest_v1" 25 | 26 | # Create Session object for faster retrieval 27 | self.session = requests.Session() 28 | self.session.headers.update(APPLICATION_HEADERS) 29 | 30 | @classmethod 31 | def clean_content(cls, text: str, tags: List[str] = None) -> List[str]: 32 | """Remove HTML tags and citations from text. 33 | 34 | Args: 35 | text (str): The text to be cleaned. 36 | tags (List[str], optional): List of tags to be extracted. 37 | Defaults to ['p', 'li']. 38 | 39 | Returns: 40 | List[str]: A list with each piece of text extracted from the 41 | specified tags. 42 | """ 43 | tags = tags or ['p', 'li'] 44 | soup = BeautifulSoup(text, "html.parser") 45 | return [ 46 | re.sub(r'\[.*?\]|<.*?>', '', str(x)).strip() 47 | for x in soup.find_all(tags) 48 | ] 49 | 50 | def extract_images(self, page: str) -> List[str]: 51 | """Retrieve images (as links) for a specified page. 52 | 53 | Args: 54 | page (str): The name of the page. 55 | 56 | Returns: 57 | List[str]: A list with the URLs of the images. 58 | """ 59 | request = self.session.get(f"{self.rest_url}/page/media-list/{page}") 60 | response = json.loads(request.text) 61 | items = response.get('items', []) 62 | 63 | images_links = [] 64 | 65 | for item in items: 66 | if item['type'] == 'image' and 'srcset' in item: 67 | images_links += [f"https:{item['srcset'][0]['src']}"] 68 | 69 | return images_links 70 | 71 | def extract_content_raw( 72 | self, page: str, summary: bool, sections: List[str] = None 73 | ) -> Dict[str, str]: 74 | """Retrieve the HTML-formatted sections from a page. 75 | 76 | Args: 77 | page (str): The name of the page. 78 | summary (bool): Boolean that specifies if the summary for the page 79 | must be retrieved. 80 | sections (List[str], optional): A list of sections to be retrieved. 81 | Defaults to None. 82 | 83 | Returns: 84 | Dict[str, str]: A dictionary with the sections, where each key is 85 | the section name. 
86 | """ 87 | sections = sections or [] 88 | 89 | request = self.session.get( 90 | f"{self.rest_url}/page/mobile-sections/{page}") 91 | response = json.loads(request.text) 92 | 93 | sections_data = {} 94 | 95 | # Retrieve summary 96 | if summary and 'lead' in response: 97 | sections_data['summary'] = response['lead']['sections'][0]['text'] 98 | 99 | # Retrieve sections and subsections (with HTML tags) 100 | if sections and 'remaining' in response: 101 | page_sections = response['remaining']['sections'] 102 | 103 | # Get index of sections found 104 | idx_sections_found = [ 105 | i for i, section in enumerate(page_sections) 106 | if section.get('line') in sections 107 | ] 108 | 109 | # Get level of each section, to identify subsections 110 | levels = [section.get('toclevel', -2) for section in page_sections] 111 | 112 | for start in idx_sections_found: 113 | try: 114 | # Get index next section at same toclevel 115 | end = next( 116 | i + (start + 1) 117 | for i, level in enumerate(levels[start + 1:]) 118 | if level <= levels[start] 119 | ) 120 | except StopIteration: # End of page reached 121 | end = len(page_sections) 122 | 123 | # Update dictionary 124 | sections_data[page_sections[start]['line']] = '\n'.join([ 125 | subsection.get('text', '') 126 | for subsection in page_sections[start:end] 127 | ]) 128 | 129 | return sections_data 130 | 131 | def extract_content( 132 | self, page: str, summary: bool, sections: List[str] = None, 133 | sections_tags: Dict[str, List[str]] = None, 134 | section_types: Dict[str, str] = None 135 | ) -> Dict[str, str]: 136 | """Retrieve formatted (clean) text from Wikipedia.""" 137 | sections = sections or [] 138 | results = self.extract_content_raw(page, summary, sections) 139 | 140 | # Get tags to keep and types to convert 141 | sections_tags = { 142 | section: sections_tags.get(section, ['p', 'li']) 143 | for section in ['summary'] + sections 144 | } 145 | section_types = { 146 | section: section_types.get(section, 'str') 147 | for section in ['summary'] + sections 148 | } 149 | 150 | # Clean the sections 151 | for section in results: 152 | if section_types[section] == 'list': 153 | results[section] = self.clean_content( 154 | results[section], tags=sections_tags[section]) 155 | elif section_types[section] == 'str': 156 | results[section] = '\n'.join(self.clean_content( 157 | results[section], tags=sections_tags[section])) 158 | else: 159 | raise NotImplementedError( 160 | f"No implementation for this type of output: " 161 | f"{section_types[section]}" 162 | ) 163 | 164 | return results 165 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/tasks.py: -------------------------------------------------------------------------------- 1 | """Tasks for the destinations_similarity Flyte project.""" 2 | 3 | import sys 4 | import logging 5 | from typing import List, Dict, Tuple 6 | 7 | import torch 8 | import pandas as pd 9 | import numpy as np 10 | from unidecode import unidecode 11 | from flytekit import task, Resources 12 | from flytekit.types.file import FlyteFile 13 | 14 | from destinations_similarity.scraper.extractor import WikiExtractor 15 | from destinations_similarity.scraper.brazilian_cities import ( 16 | get_brazilian_cities_data, get_dataframe) 17 | from destinations_similarity.processing.text_preprocessing import ( 18 | translate_description_series, preprocess_text) 19 | from destinations_similarity.processing.feature_engineering import ( 20 | TextVectorizer) 21 | 
22 | 23 | # Logging config 24 | LOGGER = logging.getLogger(__name__) 25 | 26 | logging.basicConfig( 27 | stream=sys.stdout, 28 | level=logging.INFO, 29 | format="[%(asctime)s] %(name)s: %(levelname)s | %(message)s" 30 | ) 31 | 32 | # Flyte configuration 33 | LIGHT_RESOURCES = Resources(cpu="0.5", mem="1Gi") 34 | BASE_RESOURCES = Resources(cpu="1", mem="2Gi") 35 | INTENSIVE_RESOURCES = Resources(cpu="2", mem="16Gi") 36 | 37 | 38 | @task(retries=3, requests=LIGHT_RESOURCES) 39 | def get_base_data(generate_city_id: bool) -> pd.DataFrame: 40 | """Retrieve base data for the dataset. 41 | 42 | Args: 43 | generate_city_id (bool): Informs if an ID must be generated for each 44 | row of the dataset. 45 | 46 | Returns: 47 | pd.DataFrame: Base dataset. 48 | """ 49 | return get_brazilian_cities_data( 50 | get_dataframe, generate_city_id=generate_city_id) 51 | 52 | 53 | @task(retries=3, requests=LIGHT_RESOURCES) 54 | def scrap_wiki( 55 | base_data: pd.DataFrame, wiki: str, lang: str, summary: bool, 56 | sections: List[str], sections_tags: Dict[str, List[str]], 57 | sections_types: Dict[str, str] 58 | ) -> pd.DataFrame: 59 | """Scrap a Wikimedia page for info. 60 | 61 | Args: 62 | base_data (pd.DataFrame): Base dataset. 63 | wiki (str): Type of wiki ('wikipedia', 'wikivoyage'). 64 | lang (str): Language of wiki. 65 | summary (bool): If the summary must be retrieved. 66 | sections (List[str]): Which sections must be retrieved. 67 | sections_tags (Dict[str, List[str]]): Which HTML tags must be preserved 68 | for a given section. 69 | sections_types (Dict[str, str]): How each section will be 70 | saved on the dataset, 'str' or 'list'. 71 | 72 | Returns: 73 | pd.DataFrame: The updated dataset. 74 | """ 75 | # Initialize scraper 76 | extractor = WikiExtractor(wiki=wiki, lang=lang) 77 | 78 | # Setup fields for the sections 79 | sections_fields = { 80 | section: f"{section}_{wiki}_{lang}".lower().replace(' ', '_') 81 | for section in ['summary'] + sections 82 | } 83 | 84 | # Initialize dataset 85 | dataset = base_data.copy() 86 | dataset[f"images_{wiki}_{lang}"] = [[] for _ in range(len(dataset))] 87 | for section, field in sections_fields.items(): 88 | dataset[field] = ( 89 | [[] for _ in range(len(dataset))] 90 | if sections_types.get(section) == 'list' else "" 91 | ) 92 | 93 | # Retrieve data for each city 94 | for i, row in dataset.iterrows(): 95 | page_name = row[f"title_{wiki}_{lang}"] 96 | 97 | # Set content 98 | page_content = extractor.extract_content( 99 | page_name, summary=summary, sections=sections, 100 | sections_tags=sections_tags, section_types=sections_types 101 | ) 102 | for section, text in page_content.items(): 103 | dataset.at[i, sections_fields[section]] = text 104 | 105 | # Set images links 106 | page_images = extractor.extract_images(page_name) 107 | dataset.at[i, f"images_{wiki}_{lang}"] = page_images 108 | 109 | return dataset 110 | 111 | 112 | @task(cache=True, cache_version='1.0', requests=LIGHT_RESOURCES) 113 | def merge_dataframes( 114 | df_x: pd.DataFrame, df_y: pd.DataFrame, join: str 115 | ) -> pd.DataFrame: 116 | """Merge two DataFrames together. 117 | 118 | Args: 119 | df_x (pd.DataFrame): First DataFrame. 120 | df_y (pd.DataFrame): Second DataFrame. 121 | join (str): The type of merge, 'inner' or 'outer'. 122 | 123 | Returns: 124 | pd.DataFrame: The concatenation of the DataFrames. 
125 | """ 126 | df_y_columns = df_y.columns.difference(df_x.columns) 127 | return pd.concat([df_x, df_y[df_y_columns]], axis=1, join=join) 128 | 129 | 130 | @task(cache=True, cache_version='1.0') 131 | def check_if_remote(uri: str) -> Tuple[bool, FlyteFile]: 132 | """Check if a URI points to a remote file.""" 133 | if uri: 134 | return True, uri 135 | return False, uri 136 | 137 | 138 | @task(retries=3, requests=LIGHT_RESOURCES) 139 | def retrieve_dataset_from_remote(uri: FlyteFile) -> pd.DataFrame: 140 | """Retrieve a dataset from a remote URL. 141 | 142 | Args: 143 | url (FlyteFile): Remote address of the dataset. Must be a Parquet file. 144 | 145 | Returns: 146 | pd.DataFrame: DataFrame with the dataset. 147 | """ 148 | # Download file if it has a remote source 149 | if uri.remote_source is not None: 150 | uri.download() 151 | 152 | dataset_df = pd.read_parquet(uri.path) 153 | dataset_df.columns = dataset_df.columns.astype(str) 154 | LOGGER.info("Retrieved dataset from '%s'.", uri.remote_source or uri.path) 155 | return dataset_df 156 | 157 | 158 | @task(cache=True, cache_version='1.0', requests=BASE_RESOURCES) 159 | def preprocess_input_data( 160 | dataframe: pd.DataFrame, columns_to_translate: List[str], 161 | columns_to_process: List[str], wikivoyage_summary: str 162 | ) -> pd.DataFrame: 163 | """Preprocess the scraped data. 164 | 165 | Args: 166 | dataframe (pd.DataFrame): remote dataframe with cities features 167 | columns_to_translate (List[str]): city features to be translated 168 | columns_to_process (List[str]): city features to be processed 169 | wikivoyage_summary (str): summary wikivoyage column name 170 | 171 | Returns: 172 | pd.DataFrame: remote dataframe pre-processed 173 | """ 174 | LOGGER.info("Preprocessing input data.") 175 | 176 | if wikivoyage_summary: 177 | dataframe = dataframe[ 178 | dataframe[wikivoyage_summary].notna() 179 | ].copy().reset_index(drop=True) 180 | LOGGER.info("Using %s rows of data.", dataframe.shape[0]) 181 | 182 | # Translate columns 183 | for col in columns_to_translate: 184 | dataframe[col] = translate_description_series(dataframe, col) 185 | 186 | LOGGER.info("Columns %s translated.", columns_to_translate) 187 | 188 | # Process specified columns 189 | for col in columns_to_process: 190 | dataframe[col] = dataframe[col].fillna("").swifter.apply( 191 | lambda x: unidecode(x) if isinstance(x, str) else x).str.lower() 192 | dataframe[col] = preprocess_text(dataframe, col) 193 | 194 | LOGGER.info("Columns %s processed.", columns_to_process) 195 | dataframe.columns = dataframe.columns.astype(str) 196 | return dataframe 197 | 198 | 199 | @task(cache=True, cache_version='1.0', requests=INTENSIVE_RESOURCES) 200 | def vectorize_columns( 201 | dataframe: pd.DataFrame, columns_to_vec: List[str], 202 | city_column: str, state_column: str 203 | ) -> List[pd.DataFrame]: 204 | """Generate embeddings with the cities' infos. 
205 | 206 | Args: 207 | dataframe (pd.DataFrame): remote dataset pre-processed 208 | columns_to_vec (List[str]): city features to be vectorized 209 | city_column (str): city column name 210 | state_column (str): state column name 211 | 212 | Returns: 213 | List[pd.DataFrame]: list of dataframes with city feature vectors 214 | """ 215 | model = TextVectorizer() 216 | model.model.to('cuda') 217 | column_embeddings = [] 218 | 219 | LOGGER.info("Generating embeddings for columns.") 220 | 221 | # Generate embeddings for each column 222 | for col in columns_to_vec: 223 | inputs_ids = model.encode_inputs(dataframe[col]).to('cuda') 224 | embeddings = model.get_df_embedding(inputs_ids) 225 | city_embeddings = pd.concat( 226 | [dataframe[[city_column, state_column]], embeddings], axis=1) 227 | city_embeddings.columns = city_embeddings.columns.astype(str) 228 | column_embeddings.append(city_embeddings) 229 | 230 | LOGGER.info("Embeddings generated.") 231 | return column_embeddings 232 | 233 | 234 | @task(cache=True, cache_version='1.0', requests=INTENSIVE_RESOURCES) 235 | def build_mean_embedding( 236 | list_dataframes: List[pd.DataFrame] 237 | ) -> pd.DataFrame: 238 | """Build mean embeddings for cities. 239 | 240 | Args: 241 | list_dataframes (List[pd.DataFrame]): list of dataframes with 242 | city feature vectors 243 | 244 | Returns: 245 | pd.DataFrame: city vectors 246 | """ 247 | LOGGER.info("Building mean embeddings.") 248 | 249 | # Retrieve embeddings 250 | column_embeddings = [data.iloc[:, 2:].values for data in list_dataframes] 251 | 252 | # Compute mean embeddings 253 | aux = torch.Tensor(np.array(column_embeddings)) 254 | aux_mean = aux.mean(axis=0) 255 | aux_mean = pd.DataFrame(aux_mean).astype("float") 256 | aux_mean = aux_mean.fillna(0) 257 | aux_mean = pd.concat( 258 | [list_dataframes[0][['city', 'state']], aux_mean], axis=1) 259 | aux_mean.columns = aux_mean.columns.astype(str) 260 | 261 | LOGGER.info("Mean embeddings calculated.") 262 | return aux_mean 263 | -------------------------------------------------------------------------------- /projects/destinations_similarity/destinations_similarity/workflows.py: -------------------------------------------------------------------------------- 1 | """Workflows for the destinations_similarity Flyte project.""" 2 | 3 | from datetime import timedelta 4 | from typing import List 5 | 6 | import pandas as pd 7 | from flytekit import workflow, conditional, LaunchPlan, FixedRate 8 | 9 | from destinations_similarity import tasks 10 | 11 | 12 | @workflow 13 | def generate_dataset() -> pd.DataFrame: 14 | """Generate the dataset to be used for training. 15 | 16 | Returns: 17 | pd.DataFrame: The generated dataset. 
18 | """ 19 | base_data = tasks.get_base_data(generate_city_id=False) 20 | 21 | # Retrieve data from pt.wikipedia.org 22 | data_wikipedia_pt = tasks.scrap_wiki( 23 | base_data=base_data, wiki='wikipedia', lang='pt', summary=True, 24 | sections=['Clima', 'Economia', 'História', 'Geografia'], 25 | sections_tags={}, sections_types={} 26 | ) 27 | 28 | # Retrieve data from en.wikivoyage.org 29 | data_wikivoyage_en = tasks.scrap_wiki( 30 | base_data=base_data, wiki='wikivoyage', lang='en', summary=True, 31 | sections=['Do', 'See', 'Go next'], 32 | sections_tags={'Go next': ['a', 'b']}, 33 | sections_types={'Go next': 'list'} 34 | ) 35 | 36 | # Merge data 37 | dataset = tasks.merge_dataframes( 38 | df_x=data_wikipedia_pt, df_y=data_wikivoyage_en, join='outer') 39 | 40 | return dataset 41 | 42 | 43 | @workflow 44 | def build_knowledge_base( 45 | columns_to_translate: List[str], columns_to_process: List[str], 46 | summary_wikivoyage_column_name: str, remote_dataset: str = "" 47 | ) -> pd.DataFrame: 48 | """Generate knowledge database. 49 | 50 | Args: 51 | columns_to_translate (List[str]): city features to be translated 52 | columns_to_process (List[str]): city features to be processed 53 | summary_wikivoyage_column_name (str): summary wikivoyage column name 54 | remote_dataset (str, optional): Remote dataset's URL. Generates 55 | dataset if no path is specified. 56 | 57 | Returns: 58 | pd.DataFrame: The generated dataset. 59 | """ 60 | remote, flyte_file = tasks.check_if_remote(uri=remote_dataset) 61 | 62 | dataframe = ( 63 | conditional("remote_dataset") 64 | .if_(remote.is_true()) # pylint: disable=no-member 65 | .then(tasks.retrieve_dataset_from_remote(uri=flyte_file)) 66 | .else_() 67 | .then(generate_dataset()) 68 | ) 69 | 70 | dataframe_processed = tasks.preprocess_input_data( 71 | dataframe=dataframe, 72 | columns_to_translate=columns_to_translate, 73 | columns_to_process=columns_to_process, 74 | wikivoyage_summary=summary_wikivoyage_column_name 75 | ) 76 | 77 | list_dataframes = tasks.vectorize_columns( 78 | dataframe=dataframe_processed, 79 | columns_to_vec=columns_to_process, 80 | city_column='city', 81 | state_column='state' 82 | ) 83 | 84 | city_vectors = tasks.build_mean_embedding(list_dataframes=list_dataframes) 85 | 86 | return city_vectors 87 | 88 | 89 | # Launch plans 90 | build_knowledge_base_lp = LaunchPlan.get_or_create( 91 | name='build_knowledge_base_default_lp', 92 | workflow=build_knowledge_base, 93 | default_inputs={ 94 | 'columns_to_translate': [ 95 | "see_wikivoyage_en", 96 | "do_wikivoyage_en", 97 | "summary_wikivoyage_en" 98 | ], 99 | 'columns_to_process': [ 100 | "summary_wikipedia_pt", 101 | "história_wikipedia_pt", 102 | "geografia_wikipedia_pt", 103 | "clima_wikipedia_pt", 104 | "see_wikivoyage_en", 105 | "do_wikivoyage_en", 106 | "summary_wikivoyage_en" 107 | ], 108 | 'summary_wikivoyage_column_name': "summary_wikivoyage_en", 109 | 'remote_dataset': 110 | "https://storage.googleapis.com" 111 | "/dsc-public-info/datasets/flytelab_dataset.parquet", 112 | }, 113 | schedule=FixedRate(duration=timedelta(weeks=4)) 114 | ) 115 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.12-slim-buster 2 | 3 | RUN apt-get update && \ 4 | pip install --no-cache-dir --upgrade pip && \ 5 | pip install --no-cache-dir sphinx==4.2.0 sphinx_rtd_theme==1.0.0 6 | 7 | WORKDIR /home 8 | 9 | CMD ["sh","-c","rm -rf 
_build/ && sphinx-build -b html . _build/"] -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/dashboard/dashboard.rst: -------------------------------------------------------------------------------- 1 | Streamlit Dashboard 2 | --------------------------- 3 | 4 | The user interaction interface was built using the streamlit tool. After local testing, the stable version of the tool was posted on Streamlit's public server. You can access the interface through the link below. 5 | 6 | 7 | 8 | To access the app code, just access the link below: 9 | 10 | .. toctree:: 11 | streamlit -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/dashboard/streamlit.rst: -------------------------------------------------------------------------------- 1 | .. _streamlit: 2 | 3 | Streamlit app 4 | --------------------------- 5 | 6 | .. automodule:: dashboard.app 7 | :members: 8 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/guides/deploy.rst: -------------------------------------------------------------------------------- 1 | Deploy project 2 | ---------------------------------- 3 | 4 | To deploy the project, just follow the steps recommended in the flyte README. Below are the functions performed to deploy the project. Just run the following command: 5 | 6 | .. code-block:: 7 | 8 | python3 deploy 9 | 10 | 11 | .. toctree:: 12 | deploy_code 13 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/guides/deploy_code.rst: -------------------------------------------------------------------------------- 1 | .. _deploy_code: 2 | 3 | Deploy code 4 | ---------------------------------- 5 | 6 | .. automodule:: deploy 7 | :members: 8 | 9 | 10 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/guides/docs.rst: -------------------------------------------------------------------------------- 1 | .. _docs: 2 | 3 | Building Sphinx docs 4 | --------------------------- 5 | 6 | .. warning:: 7 | This documentation was built thinking about linux/mac operating systems or executions using WSL. 8 | 9 | 10 | In addition to the local README, we decided to detail a few more things about the project and maybe in a more playful way to facilitate understanding. 11 | 12 | There's not much mystery about building the documentation in HTML. We've already automated some things to make it easier. Generally speaking, Sphinx is responsible for creating a static page of HTML documentation using manually typed information or other information inserted into the developed code. These generated static pages are moved into a folder in a container running an NGINX image which hosts the documentation page. 13 | 14 | 15 | To build the Docker image responsible for the documentation and start hosting the server, just run the command 16 | 17 | .. warning:: 18 | For this command to work it is necessary that in your location it is possible to run Makefile files and ensure that the working directory is inside projects/destinations_similarity 19 | 20 | .. code-block:: 21 | 22 | make open-docs 23 | 24 | Once the command has been successfully executed, you can check with the command below if the container is running normally on your machine. 25 | 26 | .. 
code-block:: 27 | 28 | docker ps 29 | 30 | the result should be 31 | 32 | .. code-block:: 33 | 34 | $ docker ps 35 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 36 | 84cb390d977f nginx "/docker-entrypoint.…" 36 seconds ago Up 35 seconds 0.0.0.0:8080->80/tcp, :::8080->80/tcp sphinx-nginx 37 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/guides/guide.rst: -------------------------------------------------------------------------------- 1 | Developer Guide 2 | --------------------------- 3 | 4 | Here is some information that might be useful and maybe even cause some doubts about the project. Feel free to ask any of the contributors if something is not clear. 5 | 6 | May the dogs be with you! 7 | 8 | .. toctree:: 9 | docs 10 | deploy 11 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/SolutionDiagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/SolutionDiagram.png -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoApresentando.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoApresentando.jpg -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoBagunceiro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoBagunceiro.jpg -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoBigDog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoBigDog.png -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoCachu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoCachu.jpg -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoGalante.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoGalante.jpg -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/kinzinhoPensativo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoPensativo.jpg 
-------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/sphinx_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/sphinx_server.png -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/images/vamoDalheLogo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/vamoDalheLogo.jpeg -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/model/feature_engineering.rst: -------------------------------------------------------------------------------- 1 | .. _feature_engineering: 2 | 3 | Feature engineering 4 | ---------------------- 5 | 6 | .. automodule:: destinations_similarity.processing.feature_engineering 7 | :members: 8 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/model/model.rst: -------------------------------------------------------------------------------- 1 | .. _model: 2 | 3 | Machine Learning Model 4 | --------------------------- 5 | 6 | **Objective**: To help Kinzinho define his next travel destination, we seek to find other cities similar to the last travel destination he liked the most. 7 | 8 | **Strategy Solution**: To make a good evaluation between the cities, we chose to make a vector representation of each city in Brazil, encoding general information about each city such as its history, its geography, its climate and its tourist attractions. We chose this strategy because with a vector representation of the city we were able to apply similarity calculation operations between cities, considering various information about them. 9 | 10 | **Input data**: For our solution we use the following data from wikipedia and wikivoyage: "summary_wikipedia_en", "history_wikipedia_pt", "geografia_wikipedia_pt", "clima_wikipedia_pt", "see_wikivoyage_en", "do_wikivoyage_en", "summary_wikivoyage_en" 11 | 12 | **Preprocessing**: To process the data and extract only important information, we apply a series of pre-processing to clean unnecessary information and homogenize the texts. 13 | 14 | **Model**: To generate the best vector representation of each city's features, we used a pre-trained state-of-the-art model based on Transformers (BERTimbau). As a vector representation of each feature of the city, we use the output of the last layer of the BERTimbau language model. The vector representation of each city is generated from the average of the vectors of its features. 15 | 16 | **Similarity**: To calculate the similarity between the vector representations of each city, we are using an high optimized library and calculate the Euclidean distance between an input vector query (vector of the last city visited by Kinzinho) and all vectors of the cities available in our portfolio. 17 | 18 | 19 | To see the dataset processing codes, access the links below. 20 | 21 | .. 
toctree:: 22 | text_preprocessing 23 | feature_engineering -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/model/text_preprocessing.rst: -------------------------------------------------------------------------------- 1 | .. _text_preprocessing: 2 | 3 | Text preprocessing 4 | ---------------------- 5 | 6 | .. automodule:: destinations_similarity.processing.text_preprocessing 7 | :members: 8 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/scraper/extractor.rst: -------------------------------------------------------------------------------- 1 | .. _extractor: 2 | 3 | Extractor 4 | ---------------------- 5 | 6 | .. automodule:: destinations_similarity.scraper.extractor 7 | :members: 8 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/scraper/scraper.rst: -------------------------------------------------------------------------------- 1 | .. _source: 2 | 3 | Generate source dataset 4 | ---------------------------------- 5 | 6 | Kinzinho's humans built this module to meet all the information extraction needs of Brazilian cities. This information is further enriched with data extracted from Wikipedia and Wikivoyager. 7 | 8 | .. toctree:: 9 | source 10 | 11 | 12 | Wikipedia and Wikivoyager scraping 13 | ---------------------------------- 14 | 15 | As with the module above, humans were quite clever in exploring the Wikipedia and Wikivoyager APIs using information extracted from Brazilian city data sources. The API, unlike the raw HTML page, returns the information well separated and makes the search for the information of interest much easier. 16 | 17 | .. toctree:: 18 | extractor 19 | 20 | -------------------------------------------------------------------------------- /projects/destinations_similarity/docs/scraper/source.rst: -------------------------------------------------------------------------------- 1 | .. _scraper: 2 | 3 | Source Dataset 4 | ---------------------------------- 5 | 6 | .. automodule:: destinations_similarity.scraper.brazilian_cities 7 | :members: 8 | 9 | 10 | -------------------------------------------------------------------------------- /projects/destinations_similarity/index.rst: -------------------------------------------------------------------------------- 1 | Destination Similarity - Vamo Dalhe 2 | ============================ 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | 8 | Hi, I hope you are well! 9 | 10 | This documentation page is entirely for Destination Similarity design. 11 | 12 | First of all, we would like to comment on the team's mascot, Kinzinho. Kinzinho is an adventurous puppy who has a great interest in traveling and seeing different places. To help his cause, it would be ideal for him to know about all the possible places and, in addition, to seek places similar to others he loves so much. 13 | 14 | .. image:: ./docs/images/kinzinhoGalante.jpg 15 | :width: 400 16 | :align: center 17 | 18 | In the following sections, the project will be presented and how the humans of Kinzinho worked to try to help it. 19 | 20 | --------------------------- 21 | 22 | Getting the data 23 | --------------------------- 24 | 25 | Kinzinho is an adventurous little dog who wanted to know about all the destinations he could go to. Kinzinho had the trouble of having to search for public databases which could be used. 
It is important to point out that his main focus was Brazil (not least because Kinzinho is passionate about the nature here). So his humans immediately came up with the idea of using information from Wikipedia and Wikivoyage. 26 | 27 | .. image:: ./docs/images/kinzinhoCachu.jpg 28 | :width: 400 29 | :align: center 30 | 31 | 32 | Through the links below you can access the code written by Kinzinho's humans to search for all possible destinations in Brazil. 33 | 34 | .. toctree:: 35 | :maxdepth: 2 36 | 37 | docs/scraper/scraper 38 | ... 39 | 40 | Machine Learning Model 41 | --------------------------- 42 | 43 | Kinzinho has so many adventurous options that he had to extract public data from Wikipedia and Wikivoyage to get to know all of them! But he now realizes that it is too much and wants some recommendations based on where people have traveled before. Can we help him? 44 | 45 | Of course! Everything is 'paw-sible' when you have a dog :D !! 46 | 47 | .. toctree:: 48 | :maxdepth: 2 49 | 50 | docs/model/model 51 | ... 52 | 53 | Creating a user interface (Streamlit) 54 | -------------------------------------- 55 | 56 | Once the similarity inference model was ready, Kinzinho's humans built an interface to help not only him, but everyone interested in being as adventurous as Kinzinho is. 57 | 58 | .. image:: ./docs/images/kinzinhoBagunceiro.jpg 59 | :width: 400 60 | :align: center 61 | 62 | 63 | Shall we start the mess? I mean... Travel? 64 | 65 | 66 | .. toctree:: 67 | docs/dashboard/dashboard 68 | 69 | 70 | Developer guides (How to) 71 | --------------------------- 72 | 73 | Kinzinho is not a programming professional. But he asked his humans to leave a minimally detailed description so that evaluators and other Hackathon participants could understand a few things about the project. Access the link below for more details. 74 | 75 | .. toctree:: 76 | docs/guides/guide 77 | 78 | 79 | Project references 80 | --------------------------- 81 | 82 | * [Wikipedia API] - https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples 83 | * [Calculate vector similarity] - https://github.com/facebookresearch/faiss 84 | * [Translate model] - https://deep-translator.readthedocs.io/en/latest/ 85 | * [NLP Model] - https://huggingface.co/neuralmind/bert-base-portuguese-cased 86 | * [Streamlit docs] - https://docs.streamlit.io/ 87 | * [Flyte docs] - https://docs.flyte.org/en/latest/ 88 | * [Sphinx docs] - https://www.sphinx-doc.org/en/master/ 89 | 90 | Acknowledgments 91 | --------------------------- 92 | 93 | Kinzinho and his humans would like to thank everyone involved who made this project possible. They would also like to thank `Hurb's <https://us.hurb.com/?pos=us>`_ support in allowing and encouraging participation in the hackathon as training and as recognition of the team's potential. And finally, they thank Kinzinho himself for making the days of the humans around him better.
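Similarity query example
---------------------------

The Machine Learning Model section above describes representing each city as the mean of its BERTimbau feature vectors and then using faiss to find the closest cities by Euclidean distance. The snippet below is only a minimal sketch of that kind of query, not the project's exact code: it assumes a hypothetical ``embeddings`` DataFrame laid out like the one used by the Streamlit app (``city`` and ``state`` columns followed by the vector dimensions), and the helper name ``most_similar`` is illustrative only.

.. code-block:: python

    import faiss
    import numpy as np
    import pandas as pd


    def most_similar(embeddings: pd.DataFrame, city: str, state: str, k: int = 5) -> pd.DataFrame:
        """Return the k cities whose vectors are closest (L2) to the query city."""
        is_query = (embeddings["city"] == city) & (embeddings["state"] == state)
        candidates = embeddings[~is_query].reset_index(drop=True)
        vectors = np.ascontiguousarray(
            candidates.drop(columns=["city", "state"]).values.astype("float32"))

        # Flat (exhaustive) L2 index over every candidate city vector.
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)

        # Query with the vector of the city Kinzinho liked the most.
        query = embeddings[is_query].drop(
            columns=["city", "state"]).values.astype("float32")
        _, neighbors = index.search(query, k)
        return candidates[["city", "state"]].iloc[neighbors[0]]

This mirrors the behaviour of ``get_k_nearest_neighbors`` in ``dashboard/app.py``; an exhaustive ``IndexFlatL2`` is sufficient here because the number of Brazilian cities is small enough for a brute-force search.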
-------------------------------------------------------------------------------- /projects/destinations_similarity/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | docker 2 | gitpython 3 | streamlit 4 | typer 5 | -------------------------------------------------------------------------------- /projects/destinations_similarity/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | pandas>=1.3.5 3 | beautifulsoup4~=4.10.0 4 | requests~=2.27.1 5 | transformers[torch]~=4.17.0 6 | numpy>=1.21.5 7 | unidecode~=1.3.4 8 | torch~=1.11.0 9 | swifter~=1.1.2 10 | nltk~=3.7 11 | deep-translator~=1.8.3 12 | -------------------------------------------------------------------------------- /projects/destinations_similarity/scripts/open_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Building sphinx HTML from $(pwd)" 3 | docker build --tag sphinx-server ./docs/ 4 | docker run --rm --name build-docs -it -v $(pwd):/home/ sphinx-server 5 | docker run --rm --name sphinx-nginx -v $(pwd)/_build/:/usr/share/nginx/html:ro -d -p 8080:80 nginx 6 | echo "Sphinx docs is hosted at: http://localhost:8080/" -------------------------------------------------------------------------------- /projects/destinations_similarity/scripts/rebuild_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker stop sphinx-nginx 3 | docker run --rm --name build-docs -it -v $(pwd):/home/ sphinx-server 4 | docker run --rm --name sphinx-nginx -v $(pwd)/_build/:/usr/share/nginx/html:ro -d -p 8080:80 nginx 5 | echo "Sphinx docs is hosted at: http://localhost:8080/" -------------------------------------------------------------------------------- /projects/weather_forecasting/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: Pkce 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | insecure: false 7 | storage: 8 | type: stow 9 | stow: 10 | kind: s3 11 | config: 12 | auth_type: iam 13 | region: us-east-2 14 | logger: 15 | # Logger settings to control logger output. Useful to debug logger: 16 | show-source: true 17 | level: 1 18 | -------------------------------------------------------------------------------- /projects/weather_forecasting/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=app 4 | 5 | [auth] 6 | # Uncomment if you want to use a service account for all your tasks and workflow executions. This service account should be created by you and available on the k8s cluster and it will be used to read and write data from the backend store like S3/GCS, or to connect to any services that you use in your tasks. 7 | # to the blobstore (e.g. s3) used to write task execution outputs. 8 | # kubernetes_service_account=demo 9 | # You can set this prefix to specify where task output schema and blobs should be written to. 
10 | raw_output_data_prefix=s3://open-compute-playground 11 | -------------------------------------------------------------------------------- /projects/weather_forecasting/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true -------------------------------------------------------------------------------- /projects/weather_forecasting/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=app 4 | 5 | [auth] 6 | # Uncomment if you want to use a service account for all your tasks and workflow executions. This service account should be created by you and available on the k8s cluster and it will be used to read and write data from the backend store like S3/GCS, or to connect to any services that you use in your tasks. 7 | # to the blobstore (e.g. s3) used to write task execution outputs. 8 | # kubernetes_service_account=demo 9 | # You can set this prefix to specify where task output schema and blobs should be written to. 10 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 11 | -------------------------------------------------------------------------------- /projects/weather_forecasting/.gitignore: -------------------------------------------------------------------------------- 1 | .kube -------------------------------------------------------------------------------- /projects/weather_forecasting/DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions 2 | 3 | This page contains notes regarding how to use Flyte in an end-to-end ML 4 | system. 
5 | 6 | ## Project Setup 7 | 8 | ### Sandbox 9 | 10 | Create project: 11 | 12 | ```bash 13 | flytectl create project \ 14 | --name flytelab \ 15 | --id flytelab \ 16 | --description "flytelab: ml projects in flyte" \ 17 | --config .flyte/sandbox-config.yaml \ 18 | --project flytelab 19 | ``` 20 | 21 | Update cluster resource attributes: 22 | 23 | ```bash 24 | flyte-cli -i \ 25 | -h localhost:30081 \ 26 | -p flytelab \ 27 | -d development update-cluster-resource-attributes \ 28 | --attributes projectQuotaCpu 16 \ 29 | --attributes projectQuotaMemory 30Gi 30 | ``` 31 | 32 | ### Remote 33 | 34 | Create project: 35 | 36 | ```bash 37 | flytectl create project \ 38 | --name flytelab \ 39 | --id flytelab \ 40 | --description "flytelab: ml projects in flyte" \ 41 | --config .flyte/remote-config.yaml \ 42 | --project flytelab 43 | ``` 44 | 45 | 46 | Make sure the NOAA api key is available in the shell session: 47 | ``` 48 | eval $(sed 's/^/export /g' env.txt) 49 | ``` 50 | 51 | ## Workflow Registration 52 | 53 | ## Sandbox 54 | 55 | ### Register Workflows 56 | 57 | ```bash 58 | FLYTECTL_CONFIG=.flyte/sandbox-config.yaml REGISTRY=ghcr.io/flyteorg make register 59 | ``` 60 | 61 | ### Fast Registering New Code 62 | 63 | In case you've only changed user code and not system-level dependencies: 64 | 65 | ```bash 66 | FLYTECTL_CONFIG=.flyte/sandbox-config.yaml REGISTRY=ghcr.io/flyteorg make fast_register 67 | ``` 68 | 69 | ## Production [playground.hosted.unionai.cloud](https://playground.hosted.unionai.cloud/console) 70 | 71 | ### Register Workflows 72 | 73 | ```bash 74 | FLYTECTL_CONFIG=.flyte/remote-config.yaml FLYTE_CONFIG=.flyte/remote.config REGISTRY=ghcr.io/flyteorg make register 75 | ``` 76 | 77 | ### Fast Registering New Code 78 | 79 | ```bash 80 | FLYTECTL_CONFIG=.flyte/remote-config.yaml REGISTRY=ghcr.io/flyteorg make fast_register 81 | ``` 82 | 83 | ### Activating Launch Plans 84 | 85 | List launch plan versions 86 | 87 | ```bash 88 | ./scripts/launch-plan-status.sh 89 | ``` 90 | 91 | To activate launch plans 92 | 93 | ```bash 94 | ./scripts/activate-launch-plans.sh # [VERSION] argument is optional to activate a specific version 95 | ``` 96 | 97 | To deactivate: 98 | 99 | ```bash 100 | ./scripts/archive-launch-plans.sh # [VERSION] argument is optional to activate a specific version 101 | ``` 102 | -------------------------------------------------------------------------------- /projects/weather_forecasting/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | LABEL org.opencontainers.image.source https://github.com/flyteorg/flytelab 3 | 4 | WORKDIR /root 5 | ENV VENV /opt/venv 6 | ENV LANG C.UTF-8 7 | ENV LC_ALL C.UTF-8 8 | ENV PYTHONPATH /root 9 | 10 | # e.g. 
flyte.config or sandbox.config 11 | ARG config 12 | 13 | # This is necessary for opencv to work 14 | RUN apt-get update && \ 15 | apt-get install -y \ 16 | libsm6 \ 17 | libxext6 \ 18 | libxrender-dev \ 19 | ffmpeg \ 20 | build-essential 21 | 22 | # Install the AWS cli separately to prevent issues with boto being written over 23 | RUN pip3 install awscli 24 | 25 | ENV VENV /opt/venv 26 | # Virtual environment 27 | RUN python3 -m venv ${VENV} 28 | ENV PATH="${VENV}/bin:$PATH" 29 | 30 | # Install Python dependencies 31 | COPY requirements.txt /root 32 | RUN pip install -r /root/requirements.txt 33 | 34 | COPY app /root/app 35 | COPY $config /root/flyte.config 36 | 37 | # This tag is supplied by the build script and will be used to determine the version 38 | # when registering tasks, workflows, and launch plans 39 | ARG tag 40 | ENV FLYTE_INTERNAL_IMAGE $tag 41 | ARG noaa_api_key 42 | ENV NOAA_API_KEY $noaa_api_key 43 | -------------------------------------------------------------------------------- /projects/weather_forecasting/Makefile: -------------------------------------------------------------------------------- 1 | export REPOSITORY=flytelab 2 | 3 | VERSION=$(shell git rev-parse HEAD) 4 | IMAGE_NAME=flytelab 5 | 6 | ifeq ($(INSECURE), true) 7 | INSECURE=-i 8 | endif 9 | 10 | ifeq ($(NOPUSH), true) 11 | NOPUSH=1 12 | endif 13 | 14 | ifndef FLYTECTL_CONFIG 15 | FLYTECTL_CONFIG=~/.flyte/config.yaml 16 | endif 17 | 18 | 19 | # If the REGISTRY environment variable has been set, that means the image name will not just be tagged as 20 | # flytecookbook: but rather, 21 | # docker.io/lyft/flytecookbook: or whatever your REGISTRY is. 22 | ifneq ($(origin REGISTRY), undefined) 23 | FULL_IMAGE_NAME = ${REGISTRY}/${IMAGE_NAME} 24 | else 25 | FULL_IMAGE_NAME = ${IMAGE_NAME} 26 | endif 27 | 28 | export FLYTE_HOST ?= localhost:30081 29 | export PREFIX ?= weather-forecasting 30 | export FLYTE_CONFIG ?= .flyte/sandbox.config 31 | 32 | # The Flyte project and domain that we want to register under 33 | export PROJECT ?= flytelab 34 | export DOMAIN ?= development 35 | export DESCRIPTION ?= 'ML projects using Flyte' 36 | 37 | # This specifies where fast-registered code is uploaded to during registration. 38 | # If you're not using the standard minio deployment on flyte sandbox: update this path to something that 39 | # - you have write access to 40 | # - flytepropeller can read (depending on the role it uses) 41 | export ADDL_DISTRIBUTION_DIR ?= s3://my-s3-bucket/flyte-fast-distributions 42 | 43 | FLYTE_INTERNAL_IMAGE=${FULL_IMAGE_NAME}:${PREFIX}-${VERSION} 44 | FLYTE_INTERNAL_LATEST=${FULL_IMAGE_NAME}:${PREFIX}-latest 45 | 46 | # targets for local development 47 | venv: 48 | @virtualenv ./.venv/weather-forecasting 49 | 50 | deps: 51 | @pip install -r requirements.txt 52 | 53 | env.txt: 54 | @echo "NOAA_API_KEY=''" > env.txt 55 | 56 | .PHONY: env-export 57 | env-export: 58 | @eval $(sed 's/^/export /g' env.txt) 59 | 60 | # flyte-related targets 61 | .PHONY: create-project 62 | create-project: 63 | flyte-cli register-project -h ${FLYTE_HOST} ${INSECURE} -p ${PROJECT} -n ${PROJECT} -d ${DESCRIPTION} 64 | 65 | .PHONY: _requires-commit 66 | _requires-commit: 67 | @if [ -n "$(shell git status --porcelain)" ]; then \ 68 | echo "Please commit git changes before building"; \ 69 | exit 1; \ 70 | fi; 71 | 72 | .PHONY: docker-build 73 | docker-build: _requires-commit 74 | ifndef NOAA_API_KEY 75 | $(error NOAA_API_KEY must be defined) 76 | endif 77 | @echo "Building: ${FLYTE_INTERNAL_IMAGE}" 78 | docker build . 
\ 79 | --build-arg tag="${FLYTE_INTERNAL_IMAGE}" \ 80 | --build-arg config="${FLYTE_CONFIG}" \ 81 | --build-arg noaa_api_key="${NOAA_API_KEY}" \ 82 | -t "${FLYTE_INTERNAL_IMAGE}" \ 83 | -t "${FLYTE_INTERNAL_LATEST}" \ 84 | -f ./Dockerfile 85 | 86 | .PHONY: docker-push 87 | docker-push: docker-build 88 | @echo "Pushing: ${FLYTE_INTERNAL_IMAGE}" 89 | docker push "${FLYTE_INTERNAL_IMAGE}" 90 | docker push "${FLYTE_INTERNAL_LATEST}" 91 | 92 | .PHONY: serialize 93 | serialize: 94 | echo ${CURDIR} 95 | pyflyte -c flyte.config --pkgs app package \ 96 | --force \ 97 | --in-container-source-path /root \ 98 | --image ${FULL_IMAGE_NAME}:${PREFIX}-${VERSION} 99 | 100 | .PHONY: register 101 | register: docker-push serialize 102 | flytectl -c ${FLYTECTL_CONFIG} \ 103 | register files \ 104 | --project flytelab \ 105 | --domain development \ 106 | --archive flyte-package.tgz \ 107 | --force \ 108 | --version ${VERSION} 109 | 110 | .PHONY: fast_serialize 111 | fast_serialize: 112 | echo ${CURDIR} 113 | pyflyte -c flyte.config --pkgs app package \ 114 | --force \ 115 | --in-container-source-path /root \ 116 | --fast \ 117 | --image ${FLYTE_INTERNAL_LATEST} 118 | 119 | .PHONY: fast_register 120 | fast_register: fast_serialize 121 | flytectl -c ${FLYTECTL_CONFIG} \ 122 | register files \ 123 | --project flytelab \ 124 | --domain development \ 125 | --archive flyte-package.tgz \ 126 | --version fast${VERSION} 127 | -------------------------------------------------------------------------------- /projects/weather_forecasting/README.md: -------------------------------------------------------------------------------- 1 | # Weather Forecasting 2 | 3 | [![streamlit](http://img.shields.io/badge/streamlit-app-blue.svg?style=flat)](https://share.streamlit.io/flyteorg/flytelab/main/projects/weather_forecasting/dashboard/weather_forecasting.py) 4 | 5 | The purpose of this project is to train a model to perform weather forecasting 6 | using [noaa.gov](https://www.ncei.noaa.gov/) data. 7 | 8 | **Note:** _For the best reading experience on github, we recommend installing the_ 9 | _[markdown diagrams browser extension](https://github.com/marcozaccari/markdown-diagrams-browser-extension)_ 10 | _to render all of the diagrams_ and _[mathjax (chrome)](https://github.com/orsharir/github-mathjax)_ 11 | for math rendering. 12 | 13 | ## Prototype 14 | 15 | Since the term "weather forecasting" is quite expansive, we'll scope the prototype 16 | to the problem of next-day mean temperature prediction (MTP) at a set of specified 17 | locations. At a high level, our trained model function should look like: 18 | 19 | ```python 20 | locations = ["Seattle, WA, USA", "Atlanta, GA, USA", "Hyderabad, India"] 21 | predictions: List[float] = predict_mean_temperature(locations) 22 | ``` 23 | 24 | Where `locations` might be more precisely defined by country, city, zipcode, etc. 25 | 26 | ### Training Data 27 | 28 | For training data, we'll use the [integrated surface database (ISD)](https://www.ncdc.noaa.gov/isd) 29 | to obtain global hourly weather data.
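Purely as an illustration of what pulling raw observations can look like, here is a hedged sketch against NOAA's Climate Data Online (CDO) v2 web services API, using the `NOAA_API_KEY` described in the Setup section. The reference links below document the actual data access options, and the project's real ingestion lives in `app/workflow.py`; the dataset, datatype, and station IDs in this sketch are placeholders, not values taken from this repo.

```python
# Hypothetical sketch: query NOAA's CDO v2 API for daily average temperatures.
# Not the project's ingestion code; IDs below are placeholders.
import os
from typing import Dict, List

import requests

CDO_API = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"


def get_daily_temps(station_id: str, start: str, end: str) -> List[Dict]:
    """Fetch daily average temperatures for one station between two dates."""
    response = requests.get(
        CDO_API,
        headers={"token": os.environ["NOAA_API_KEY"]},  # key from env.txt (see Setup)
        params={
            "datasetid": "GHCND",     # daily summaries dataset
            "datatypeid": "TAVG",     # average temperature
            "stationid": station_id,  # a GHCND station id (placeholder)
            "startdate": start,       # "YYYY-MM-DD"
            "enddate": end,
            "units": "metric",
            "limit": 1000,
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.json().get("results", [])
```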
30 | 31 | - [ISD Home page](https://www.ncdc.noaa.gov/isd) 32 | - [Web Browser GUI](https://www.ncei.noaa.gov/access/search/data-search/global-hourly) 33 | - [Web Services API](https://www.ncdc.noaa.gov/cdo-web/webservices/ncdcwebservices) 34 | - [Full ISD Documentation](https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf) 35 | 36 | 37 | ### Training Pipeline 38 | 39 | There are several options for training the MTP model: 40 | 41 | #### Online Training 42 | 43 | An online model that updates its parameters daily based on a fixed set of historical 44 | temperature data (and potentially other related data). 45 | 46 | The model would estimate the function $x_t = f(x_{t - 1}, ..., x_{t - n}, x_{t - 365 \times 1})$, where $x_t$ is the mean temperature for a particular day $t$. 47 | 48 | ```mermaid 49 | graph TB 50 | 51 | subgraph "look-back" 52 | x1["x (t - 1)"] 53 | x2["x (t - 2)"] 54 | xdot["..."] 55 | x14["x (t - 14)"] 56 | end 57 | 58 | x["x (t)"] 59 | x1 --> x 60 | x2 --> x 61 | xdot --> x 62 | x14 --> x 63 | ``` 64 | 65 | #### Offline Training 66 | 67 | Offline model trained once on historical data going back `n` years. The data would be 68 | processed in the same form as for the online model; it would just have more training instances. 69 | 70 | #### Hybrid Training 71 | 72 | Combined offline-online model. An offline model can be trained on historical data 73 | and can be updated on a daily basis as new data points are obtained. 74 | 75 | ### Pipeline Tasks 76 | 77 | At a high level, the pipeline should look something like: 78 | 79 | ```python 80 | for today in days: 81 | # update the model 82 | training_instance = get_data_point(today) 83 | current_model = get_current_model() 84 | updated_model = update_model(current_model, training_instance) 85 | write_model(updated_model) 86 | 87 | # get prediction for tomorrow 88 | tomorrow = today + 1 89 | prediction_instance = get_data_point(tomorrow) 90 | mean_temp_tomorrow = updated_model(prediction_instance) 91 | write_predictions(mean_temp_tomorrow, tomorrow) 92 | 93 | # evaluate trailing performance of the model 94 | prediction_today = get_prediction(today) # yesterday's prediction for today 95 | trailing_performance = evaluate_model(prediction_today, training_instance["mean_temp"]) 96 | write_performance(trailing_performance, today) 97 | ``` 98 | 99 | ### Model Architecture 100 | 101 | For the prototype, we'll start with an SGD regression model using `sklearn`, which is 102 | able to express a confidence interval of its predictions. 103 | 104 | ## Extensions 105 | 106 | After a prototype is up and running, here are some extensions to make a more sophisticated model: 107 | 108 | - perform hourly predictions 109 | - predict other data points such as precipitation 110 | - experiment with other model architectures, like ensembles, to improve performance 111 | - for locations that don't have weather data, interpolate predictions for neighboring areas 112 | 113 | ## Repo Structure 114 | 115 | - `Dockerfile`: dockerfile for building the image associated with the weather forecasting app 116 | - `.flyte`: directory containing flyte .ini config files 117 | - `app`: this directory contains the flyte workflows 118 | - `v1`: unmaintained version of the weather forecasting app 119 | - `v2`: latest version of the weather forecasting app 120 | - `dashboard`: source for the streamlit app 121 | - `flytelab`: python package for weather forecasting-specific functions.
**NOTE:** Currently the source code 122 | in this package is only used by the unmaintained `v1` weather forecasting app. In the future functionality from 123 | the `v2/workflow.py` script might be refactored into a the `flytelab.weather_forecasting` package. 124 | - `scripts`: contains bash utility scripts for activating/archiving launch plans 125 | 126 | ## Setup 127 | 128 | ``` 129 | $ make venv 130 | $ source ./.venv/weather-forecasting/bin/activate 131 | $ make deps 132 | $ make env.txt 133 | ``` 134 | 135 | Replace `` with an [official API key](https://www.ncdc.noaa.gov/cdo-web/token). 136 | 137 | 138 | ## Usage 139 | 140 | Export environment variables 141 | 142 | ``` 143 | $ eval $(sed 's/^/export /g' env.txt) 144 | ``` 145 | 146 | Run the workflow locally 147 | ``` 148 | python app/workflow.py 149 | ``` 150 | 151 | 152 | ## Deployment 153 | 154 | [DEPLOYMENT.md](DEPLOYMENT.md) contains instructions for how to deploy the weather forecasting workflow 155 | to a local or remote sandbox. 156 | 157 | ## Streamlit App 158 | 159 | To run the app locally, but connecting to https://demo.nuclyde.io/console as the backend: 160 | 161 | ``` 162 | pip install streamlit 163 | export FLYTE_CREDENTIALS_CLIENT_SECRET= # replace with client secret 164 | export FLYTE_CREDENTIALS_CLIENT_ID=flytepropeller 165 | export FLYTE_CREDENTIALS_AUTH_MODE=basic 166 | export FLYTE_CREDENTIALS_AUTHORIZATION_METADATA_KEY=flyte-authorization 167 | export FLYTE_CREDENTIALS_OAUTH_SCOPES=all 168 | streamlit run dashboard/weather_forecasting.py 169 | ``` 170 | 171 | [Live Demo](https://share.streamlit.io/flyteorg/flytelab/main/projects/weather_forecasting/dashboard/weather_forecasting.py) 172 | 173 | ## Resources 174 | 175 | Here are some additional resources related to this project: 176 | 177 | - [awesome online machine learning](https://github.com/MaxHalford/awesome-online-machine-learning) 178 | - [The correct way to evaluate online machine learning models](https://maxhalford.github.io/blog/online-learning-evaluation/) 179 | - [Time Series Modeling using Scikit, Pandas, and Numpy](https://towardsdatascience.com/time-series-modeling-using-scikit-pandas-and-numpy-682e3b8db8d1) 180 | -------------------------------------------------------------------------------- /projects/weather_forecasting/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/weather_forecasting/app/__init__.py -------------------------------------------------------------------------------- /projects/weather_forecasting/dashboard/flyte.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=demo.nuclyde.io 3 | insecure=False 4 | 5 | [credentials] 6 | client_id=flytepropeller 7 | auth_mode=basic 8 | authorization_metadata-key=flyte-authorization 9 | oauth_scopes=all 10 | -------------------------------------------------------------------------------- /projects/weather_forecasting/dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit==0.30.0 2 | flyteidl 3 | geopy 4 | joblib 5 | pandas 6 | pandera 7 | requests 8 | sklearn 9 | timezonefinder 10 | pygments>=2.7.4 # not directly required, pinned by Snyk to avoid a vulnerability 11 | -------------------------------------------------------------------------------- /projects/weather_forecasting/dashboard/weather_forecasting.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | from typing import Optional, List 5 | 6 | import pandas as pd 7 | import streamlit as st 8 | from dataclasses_json import dataclass_json 9 | 10 | from flytekit.remote import FlyteRemote 11 | from flytekit.models import filters 12 | from flytekit.models.admin.common import Sort 13 | 14 | 15 | @dataclass_json 16 | @dataclass 17 | class Scores: 18 | # keep track of mean absolute error 19 | train_exp_mae: float = 0.0 20 | valid_exp_mae: float = 0.0 21 | 22 | 23 | @dataclass_json 24 | @dataclass 25 | class Prediction: 26 | air_temp: Optional[float] 27 | dew_temp: Optional[float] 28 | date: datetime 29 | error: Optional[str] = None 30 | imputed: bool = False 31 | 32 | 33 | @dataclass_json 34 | @dataclass 35 | class Forecast: 36 | created_at: datetime 37 | model_id: str 38 | predictions: List[Prediction] 39 | 40 | 41 | LOGO = "https://docs.flyte.org/en/latest/_static/flyte_circle_gradient_1_4x4.png" 42 | 43 | LAUNCH_PLAN_MAP = { 44 | "seattle": "seattle_weather_forecast_v2", 45 | "atlanta": "atlanta_weather_forecast_v2", 46 | "hyderabad": "hyderabad_weather_forecast_v2", 47 | "mumbai": "mumbai_weather_forecast_v2", 48 | "taipei": "taipei_weather_forecast_v2", 49 | "appleton": "appleton_weather_forecast_v2", 50 | "dharamshala": "dharamshala_weather_forecast_v2", 51 | "fremont": "fremont_weather_forecast_v2", 52 | } 53 | 54 | 55 | CITY_LABEL_MAP = { 56 | "atlanta": "Atlanta, GA USA", 57 | "seattle": "Seattle, WA USA", 58 | "hyderabad": "Hyderabad, Telangana India", 59 | "mumbai": "Mumbai, MH India", 60 | "taipei": "Taipei, Taiwan", 61 | "appleton": "Green Bay, WI USA", 62 | "dharamshala": "Dharamsala, HP India", 63 | "fremont": "Fremont, CA USA", 64 | } 65 | 66 | 67 | remote = FlyteRemote.from_config( 68 | default_project="flytelab", 69 | default_domain="development", 70 | config_file_path=os.path.join( 71 | os.path.abspath(os.path.dirname(__file__)), "flyte.config" 72 | ) 73 | ) 74 | 75 | st.set_page_config( 76 | page_title="flytelab - weather forecasts", 77 | page_icon=LOGO, 78 | ) 79 | 80 | _, _, col, *_ = st.columns(5) 81 | with col: 82 | st.image(LOGO, width=100) 83 | st.title("Flytelab: Weather Forecasts ⛈☀️☔️") 84 | 85 | """ 86 | This app displays the weather forecasts produced by a model 87 | that was trained using [flyte](https://flyte.org/). For more information 88 | see the [flytelab weather forecasting project](https://github.com/flyteorg/flytelab/tree/main/projects/weather_forecasting). 
89 | """ 90 | 91 | selected_city = st.selectbox( 92 | "Select a City", 93 | options=[ 94 | "atlanta", 95 | "seattle", 96 | "hyderabad", 97 | "mumbai", 98 | "taipei", 99 | "appleton", 100 | "dharamshala", 101 | "fremont", 102 | ], 103 | format_func=lambda x: CITY_LABEL_MAP[x] 104 | ) 105 | 106 | [latest_execution, *_], _ = remote.client.list_executions_paginated( 107 | "flytelab", 108 | "development", 109 | limit=1, 110 | filters=[ 111 | filters.Equal("launch_plan.name", LAUNCH_PLAN_MAP[selected_city]), 112 | filters.Equal("phase", "SUCCEEDED"), 113 | ], 114 | sort_by=Sort.from_python_std("desc(execution_created_at)"), 115 | ) 116 | 117 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name) 118 | remote.sync(wf_execution, sync_nodes=False) 119 | forecast = Forecast.from_dict(wf_execution.outputs["forecast"]) 120 | scores = wf_execution.outputs["scores"] 121 | 122 | with st.expander("Model Metadata"): 123 | st.markdown(f""" 124 | ``` 125 | model_id: {forecast.model_id} 126 | created_at: {forecast.created_at} 127 | training exp-weighted-mae: {scores.train_exp_mae} 128 | validation exp-weighted-mae: {scores.valid_exp_mae} 129 | ``` 130 | """) 131 | 132 | st.markdown(f""" 133 | ## {CITY_LABEL_MAP[selected_city]} 134 | 135 | Air Temperature and Dew Temperature Forecast (°C) 136 | """) 137 | 138 | air_temp = [] 139 | dew_temp = [] 140 | datetime_index = [] 141 | for p in forecast.predictions: 142 | date = p.date.replace(tzinfo=None) 143 | if date < pd.Timestamp.now().floor("D").to_pydatetime(): 144 | continue 145 | air_temp.append(p.air_temp) 146 | dew_temp.append(p.dew_temp) 147 | datetime_index.append(date) 148 | 149 | data = pd.DataFrame( 150 | {"air_temp": air_temp, "dew_temp": dew_temp}, 151 | index=datetime_index 152 | ) 153 | 154 | st.line_chart(data) 155 | 156 | st.markdown(f""" 157 | Predictions powered by [flyte](https://flyte.org/) 158 | """) 159 | -------------------------------------------------------------------------------- /projects/weather_forecasting/in_container.mk: -------------------------------------------------------------------------------- 1 | SERIALIZED_PB_OUTPUT_DIR := /tmp/output 2 | 3 | .PHONY: clean 4 | clean: 5 | rm -rf $(SERIALIZED_PB_OUTPUT_DIR)/* 6 | 7 | $(SERIALIZED_PB_OUTPUT_DIR): clean 8 | mkdir -p $(SERIALIZED_PB_OUTPUT_DIR) 9 | 10 | .PHONY: serialize 11 | serialize: $(SERIALIZED_PB_OUTPUT_DIR) 12 | pyflyte --config /root/flyte.config serialize workflows -f $(SERIALIZED_PB_OUTPUT_DIR) 13 | 14 | .PHONY: register 15 | register: serialize 16 | flyte-cli register-files -h ${FLYTE_HOST} ${INSECURE_FLAG} -p ${PROJECT} -d development -v ${VERSION} --kubernetes-service-account ${SERVICE_ACCOUNT} --output-location-prefix ${OUTPUT_DATA_PREFIX} $(SERIALIZED_PB_OUTPUT_DIR)/* 17 | 18 | .PHONY: fast_serialize 19 | fast_serialize: $(SERIALIZED_PB_OUTPUT_DIR) 20 | pyflyte --config /root/flyte.config serialize fast workflows -f $(SERIALIZED_PB_OUTPUT_DIR) 21 | 22 | .PHONY: fast_register 23 | fast_register: fast_serialize 24 | flyte-cli fast-register-files -h ${FLYTE_HOST} ${INSECURE_FLAG} -p ${PROJECT} -d development --kubernetes-service-account ${SERVICE_ACCOUNT} --output-location-prefix ${OUTPUT_DATA_PREFIX} --additional-distribution-dir ${ADDL_DISTRIBUTION_DIR} $(SERIALIZED_PB_OUTPUT_DIR)/* 25 | -------------------------------------------------------------------------------- /projects/weather_forecasting/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit==0.30.0 2 | 
flytekitplugins-pandera==0.30.0 3 | flyteidl 4 | geopy 5 | joblib 6 | pandas 7 | pandera 8 | requests 9 | sklearn 10 | timezonefinder 11 | -------------------------------------------------------------------------------- /projects/weather_forecasting/scripts/activate-launch-plans.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | if [ -z "$1" ] 3 | then 4 | version=$(git rev-parse HEAD) 5 | else 6 | version=$1 7 | fi 8 | 9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont" 10 | 11 | 12 | activate () { 13 | flytectl -c .flyte/remote-config.yaml \ 14 | update launchplan \ 15 | -p flytelab \ 16 | -d development \ 17 | "$1_weather_forecast_v2" \ 18 | --version $version \ 19 | --activate 20 | } 21 | 22 | for location in $locations 23 | do 24 | echo activating launch plan version $version for $location 25 | activate $location 26 | echo 27 | done 28 | -------------------------------------------------------------------------------- /projects/weather_forecasting/scripts/archive-launch-plans.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | if [ -z "$1" ] 3 | then 4 | version=$(git rev-parse HEAD) 5 | else 6 | version=$1 7 | fi 8 | 9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont" 10 | 11 | 12 | archive () { 13 | flytectl -c .flyte/remote-config.yaml \ 14 | update launchplan \ 15 | -p flytelab \ 16 | -d development \ 17 | "$1_weather_forecast_v2" \ 18 | --version $version \ 19 | --archive 20 | } 21 | 22 | for location in $locations 23 | do 24 | echo archiving launch plan version $version for $location 25 | archive $location 26 | done 27 | -------------------------------------------------------------------------------- /projects/weather_forecasting/scripts/launch-plan-status.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | if [ -z "$1" ] 3 | then 4 | version=$(git rev-parse HEAD) 5 | else 6 | version=$1 7 | fi 8 | 9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont" 10 | 11 | 12 | get-status () { 13 | flytectl -c .flyte/remote-config.yaml \ 14 | get launchplan \ 15 | -p flytelab \ 16 | -d development \ 17 | -o yaml \ 18 | --latest \ 19 | "$1_weather_forecast_v2" 20 | } 21 | 22 | for location in $locations 23 | do 24 | echo "launch plan status for $location, version: $version" 25 | get-status $location | grep state 26 | echo 27 | done 28 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: ClientSecret 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | storage: 7 | type: stow 8 | stow: 9 | kind: s3 10 | config: 11 | auth_type: iam 12 | region: us-east-2 13 | logger: 14 | # Logger settings to control logger output. 
Useful to debug logger: 15 | show-source: true 16 | level: 1 17 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=.projects.whats_cooking_good_looking.whats_cooking_good_looking 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://open-compute-playground 7 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true 20 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages=whats_cooking_good_looking 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 7 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | WORKDIR /root 4 | ENV VENV /opt/venv 5 | ENV LANG C.UTF-8 6 | ENV LC_ALL C.UTF-8 7 | ENV PYTHONPATH /root 8 | 9 | # e.g. 
flyte.config or sandbox.config 10 | ARG config 11 | 12 | 13 | RUN apt-get update && \ 14 | apt-get install -y \ 15 | libsm6 \ 16 | libxext6 \ 17 | libxrender-dev \ 18 | ffmpeg \ 19 | build-essential 20 | 21 | # Install the AWS cli separately to prevent issues with boto being written over 22 | RUN pip3 install awscli 23 | 24 | ENV VENV /opt/venv 25 | 26 | # Virtual environment 27 | RUN python3 -m venv ${VENV} 28 | ENV PATH="${VENV}/bin:$PATH" 29 | 30 | # Install Python dependencies 31 | COPY requirements.txt /root 32 | RUN pip install -r /root/requirements.txt 33 | 34 | COPY whats_cooking_good_looking /root/whats_cooking_good_looking 35 | COPY $config /root/flyte.config 36 | 37 | 38 | ARG creds 39 | RUN echo $creds | base64 -d > /root/google_creds.json 40 | ENV GOOGLE_APPLICATION_CREDENTIALS "/root/google_creds.json" 41 | 42 | # This image is supplied by the build script and will be used to determine the version 43 | # when registering tasks, workflows, and launch plans 44 | ARG image 45 | ENV FLYTE_INTERNAL_IMAGE $image 46 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/README.md: -------------------------------------------------------------------------------- 1 | # Whats_cooking_good_looking 2 | 3 | ## Problem statement 4 | 5 | The world of beauty is in constant evolution. New molecules, new brands, new discovered benefits. Innovating is time consuming, especially when you are a large corporation. So a possible strategy is to find "indie brands" that innovate, and that the public likes, and buy them out. 6 | Social networks are a perfect place to talk about beauty, and therefore a great place to discover such brands. 7 | The problem then lies in, how to detect the new brands and trendy products or benefits, to help the merger division finding the right buy? 8 | 9 | ## Target Solution implementation 10 | 11 |

12 | ![Target solution pipeline](docs/target_pipeline.png) 13 |

14 | 15 | To extract brands from a tweet, we implemented a NER (named entity recognition) pipeline that: 16 | 1. Retrieves tweets related to beauty based on keywords 17 | 2. Applies a pretrained NER model to those posts 18 | 3. Sends the model results to a labelling interface and waits for manual annotation to check those results 19 | 4. Computes evaluation metrics (so far only accuracy, but it would be interesting to compute precision and recall as well) 20 | 5. 1. If the metrics are good enough (defined by a business standard), the pipeline ends 21 | 5. 2. If the metrics are not good enough, it sends those labelled posts to a training task and loops back through steps 2, 3 and 4 22 | 23 | 24 | ## Actual Solution implementation 25 | 26 |

27 | ![Actual solution pipeline](docs/actual_pipeline.png) 28 |

29 | 30 | The project is split into 3 steps: 31 | 32 | ### 1. NER application pipeline 33 | 1. Retrieves tweets related to beauty based on keywords 34 | 2. Applies a NER model to those posts 35 | 3. Sends the model results to a GCS bucket in a format that Label Studio can load (a minimal sketch of this pre-annotation format follows the pipeline diagram below) 36 | 37 | To run this pipeline locally, please run 38 | ```python whats_cooking_good_looking/apply_ner_workflow.py``` 39 | 40 |

41 | ![NER application pipeline](docs/apply_pipeline.png) 42 |
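For readers unfamiliar with Label Studio pre-annotations, here is a stand-alone, hypothetical sketch of how spaCy entities can be mapped into Label Studio's "predictions" task format. The repo's own helper for this is `whats_cooking_good_looking.utils.doc_to_spans` (used by `apply_ner_workflow.py`); the field names below follow Label Studio's documented format and are assumptions, not a copy of the project code.

```python
# Illustrative sketch: map spaCy entities to Label Studio pre-annotation tasks.
import spacy

nlp = spacy.load("en_core_web_sm")


def tweet_to_labelstudio_task(text: str, model_version: str = "en_core_web_sm") -> dict:
    """Return one Label Studio task with NER predictions for a single tweet."""
    doc = nlp(text)
    results = [
        {
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "value": {
                "start": ent.start_char,
                "end": ent.end_char,
                "text": ent.text,
                "labels": [ent.label_],
            },
        }
        for ent in doc.ents
    ]
    return {
        "data": {"text": text},
        "predictions": [{"model_version": model_version, "result": results}],
    }


print(tweet_to_labelstudio_task("Loving the new serum I found for sensitive skin!"))
```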

43 | 44 | ### 2. Manual labelling part in Label Studio 45 | 46 |

47 | ![Label Studio labelling interface](docs/label_studio.png) 48 |
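The bridge between the manual labelling step above and the training pipeline below is Label Studio's JSON export. As a rough illustration (the project's actual parsing lives in `whats_cooking_good_looking/train_ner_workflow.py` and its utils), here is a hypothetical sketch of converting such an export into `(text, annotations)` pairs for spaCy; the field names assume Label Studio's standard export format.

```python
# Hypothetical sketch: convert a Label Studio JSON export into spaCy-style training pairs.
import json
from typing import List, Tuple


def labelstudio_export_to_spacy(path: str) -> List[Tuple[str, dict]]:
    """Read a Label Studio export and return (text, {"entities": [...]}) pairs."""
    with open(path) as f:
        tasks = json.load(f)
    training_data = []
    for task in tasks:
        text = task["data"]["text"]
        entities = []
        for annotation in task.get("annotations", []):
            for region in annotation.get("result", []):
                value = region["value"]
                for label in value.get("labels", []):
                    entities.append((value["start"], value["end"], label))
        training_data.append((text, {"entities": entities}))
    return training_data


# e.g. labelstudio_export_to_spacy("annotations.json")  # blob name used in config.json
```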

49 | 50 | 51 | ### 3. NER training pipeline 52 | 1. Retrieves labelled tasks (Label Studio output) 53 | 2. Computes model accuracy based on those labelled observations 54 | 3. 1. If the metrics are good enough, the pipeline stops 55 | 3. 2. If the metrics are not good enough, the labelled tasks are used as input to train a new NER model 56 | 57 | The goal was to create a feedback loop that makes it possible to iterate by training NER models on new manual annotations. We chose to split the process into two pipelines to avoid the networking constraints that we would otherwise have to handle and that were out of scope for this hackathon. (A minimal sketch of such a training loop follows the diagram below.) 58 | 59 | To run this pipeline locally, please run 60 | ```python whats_cooking_good_looking/train_ner_workflow.py``` 61 | 62 |

63 | ![NER training pipeline](docs/train_pipeline.png) 64 |
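For context, here is a minimal sketch of the kind of spaCy v3 NER update loop this training pipeline performs, using the same helpers `train_ner_workflow.py` imports (`Example`, `minibatch`, `compounding`). It is an illustration, not the project's actual training task; the label names and hyperparameters are assumptions.

```python
# Minimal sketch of a spaCy v3 NER fine-tuning loop over labelled examples.
# Not the project's training task; labels and hyperparameters are assumptions.
import random

import spacy
from spacy.training import Example
from spacy.util import compounding, minibatch


def train_ner(train_data, n_iter: int = 10):
    """train_data entries look like ("some tweet text", {"entities": [(start, end, "BRAND")]})."""
    nlp = spacy.load("en_core_web_sm")
    ner = nlp.get_pipe("ner")
    for _, annotations in train_data:
        for _, _, label in annotations["entities"]:
            ner.add_label(label)

    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in train_data
    ]
    optimizer = nlp.resume_training()
    frozen = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.select_pipes(disable=frozen):  # only update the NER component
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, size=compounding(4.0, 32.0, 1.001)):
                nlp.update(batch, drop=0.35, sgd=optimizer, losses=losses)
            print(losses)
    return nlp
```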

65 | ## Pipeline deployment 66 | 67 | The goal of this pipeline is to deploy workflows automatically to the playground on pushed commits. 68 | 69 | ### Steps and flow 70 | The steps are: 71 | 1. Build the Dockerfile of our project 72 | 2. Push the built Docker image to the container registry of our GCP project 73 | 3. Serialize our workflows using the pyflyte CLI 74 | 4. Register the packages to our playground project 75 | 76 | Steps 3 and 4 use external Docker-based GitHub Actions that we made ourselves: [serialize](https://github.com/louisRDSC/FlyteSerializeAction), [register](https://github.com/louisRDSC/flyteRegisterAction) 77 | 78 | For all branches except main, the workflows are registered in the development environment. For main, the workflows are registered in the staging environment. Future iterations could register them in the production environment when a tag is created. 79 | 80 | In development each version of a workflow is named as follows: \-\. 81 | This allows us to easily recognize our work while working simultaneously on different branches. 82 | 83 | In staging each version of a workflow is named as follows: \ 84 | 85 | ### How to make it work 86 | 87 | Four secrets are required for the pipeline to work: 88 | - ClIENT_ID : The ID used to authenticate with the playground 89 | - ClIENT_SECRET : The secret used to authenticate with the playground 90 | - RUNNER_KEY : The JSON key, encoded in Base64, of a GCP service account with read and write rights on the GCP bucket where the data is pulled from and pushed to. 91 | - SERVICE_ACCOUNT_KEY : The JSON key of a service account with write rights on the container registry where we push our images 92 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/dashboard/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | 5 | import streamlit as st 6 | from flytekit.models import filters 7 | from flytekit.models.admin.common import Sort 8 | from flytekit.remote import FlyteRemote 9 | from sklearn.datasets import load_digits 10 | 11 | PROJECT_NAME = "flytelab-whats_cooking_good_looking".replace("_", "-") 12 | WORKFLOW_NAME = "whats_cooking_good_looking.workflows.main" 13 | 14 | 15 | parser = ArgumentParser() 16 | parser.add_argument("--remote", action="store_true") 17 | args = parser.parse_args() 18 | 19 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox') 20 | 21 | # configuration for accessing a Flyte cluster backend 22 | remote = FlyteRemote.from_config( 23 | default_project=PROJECT_NAME, 24 | default_domain="development", 25 | config_file_path=Path(__file__).parent / f"{backend}.config", 26 | ) 27 | 28 | # get the latest workflow execution 29 | [latest_execution, *_], _ = remote.client.list_executions_paginated( 30 | PROJECT_NAME, 31 | "development", 32 | limit=1, 33 | filters=[ 34 | filters.Equal("launch_plan.name", WORKFLOW_NAME), 35 | filters.Equal("phase", "SUCCEEDED"), 36 | ], 37 | sort_by=Sort.from_python_std("desc(execution_created_at)"), 38 | ) 39 | 40 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name) 41 | remote.sync(wf_execution, sync_nodes=False) 42 | model = wf_execution.outputs["o0"] 43 | print(model) 44 | 45 | 46 | ############ 47 | # App Code # 48 | ############ 49 | 50 | data = load_digits(as_frame=True) 51 | 52 | st.write("# Flytelab: whats_cooking_good_looking") 53 |
st.write("### TBD") 54 | st.write(f"Model: `{model}`") 55 | 56 | st.write("Use the slider below to select a sample for prediction") 57 | 58 | sample_index = st.slider( 59 | "Sample Number", 60 | min_value=0, 61 | max_value=data.frame.shape[0] - 1, 62 | value=0, 63 | step=1, 64 | ) 65 | 66 | st.image(data.images[sample_index], clamp=True, width=300) 67 | st.write(f"Ground Truth: {data.target[sample_index]}") 68 | st.write(f"Prediction: {model.predict(data.frame[data.feature_names].loc[[sample_index]])[0]}") 69 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/dashboard/remote.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=playground.hosted.unionai.cloud 3 | insecure=False 4 | 5 | [credentials] 6 | client_id=flytepropeller 7 | auth_mode=basic 8 | authorization_metadata-key=flyte-authorization 9 | oauth_scopes=all 10 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/dashboard/sandbox.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=localhost:30081 3 | insecure=True 4 | 5 | [aws] 6 | access_key_id=minio 7 | secret_access_key=miniostorage 8 | endpoint=http://localhost:30084 9 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/deploy.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | 4 | import docker 5 | import git 6 | import typer 7 | 8 | app = typer.Typer() 9 | 10 | docker_client = docker.from_env() 11 | 12 | 13 | IMAGE_NAME = "flytelab" 14 | REGISTRY = "eu.gcr.io/flyte-sandbox-342013" 15 | PROJECT_NAME = "adorable-unicorns-23" 16 | DESCRIPTION = "TBD" 17 | 18 | 19 | def create_project(remote: bool): 20 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 21 | output = subprocess.run( 22 | [ 23 | "flytectl", 24 | "get", 25 | "project", 26 | PROJECT_NAME, 27 | "--config", config, 28 | ], 29 | capture_output=True, 30 | ) 31 | if output.stdout.decode().strip(): 32 | return 33 | 34 | typer.echo(f"Creating project {PROJECT_NAME}") 35 | subprocess.run( 36 | [ 37 | "flytectl", 38 | "create", 39 | "project", 40 | "--project", PROJECT_NAME, 41 | "--name", PROJECT_NAME, 42 | "--id", PROJECT_NAME, 43 | "--description", DESCRIPTION, 44 | "--config", config, 45 | ] 46 | ) 47 | 48 | 49 | def get_version(): 50 | repo = git.Repo(".", search_parent_directories=True) 51 | if repo.is_dirty(): 52 | typer.echo("Please commit git changes before building", err=True) 53 | raise typer.Exit(code=1) 54 | commit = repo.rev_parse("HEAD") 55 | return commit.hexsha 56 | 57 | 58 | def get_tag(version): 59 | return f"{REGISTRY}/{IMAGE_NAME}:{PROJECT_NAME}-{version}" 60 | 61 | 62 | def sandbox_docker_build(tag): 63 | typer.echo("Building image in Flyte sandbox") 64 | subprocess.run([ 65 | "flytectl", 66 | "sandbox", 67 | "exec", 68 | "--", 69 | "docker", 70 | "build", 71 | ".", 72 | "--tag", 73 | tag, 74 | ]) 75 | 76 | 77 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image: 78 | client = docker.from_env() 79 | 80 | # TODO: image build, push, flytectl serialization and registration 81 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 82 | 83 | typer.echo(f"Building image: {tag}") 84 | image, build_logs = client.images.build( 85 | path=".", 
86 | dockerfile="Dockerfile", 87 | tag=tag, 88 | buildargs={ 89 | "image": tag, 90 | "config": str(config), 91 | } 92 | ) 93 | for line in build_logs: 94 | typer.echo(line) 95 | return image 96 | 97 | 98 | def docker_push(image: docker.models.images.Image): 99 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True): 100 | typer.echo(line) 101 | 102 | 103 | def serialize(tag: str): 104 | typer.echo("Serializing Flyte workflows") 105 | subprocess.run([ 106 | "pyflyte", 107 | "-c", ".flyte/remote.config", 108 | #"--pkgs", "whats_cooking_good_looking", 109 | "package", 110 | "--force", 111 | "--in-container-source-path", "/root", 112 | "--image", tag 113 | ]) 114 | 115 | 116 | def register(version: str, remote: bool, domain: str): 117 | typer.echo("Registering Flyte workflows") 118 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 119 | subprocess.run([ 120 | "flytectl", 121 | "-c", config, 122 | "register", 123 | "files", 124 | "--project", "adorable-unicorns-23", 125 | "--domain", domain, 126 | "--archive", "flyte-package.tgz", 127 | "--force", 128 | "--version", version 129 | ]) 130 | 131 | 132 | @app.command() 133 | def main(remote: bool = False, domain: str = "development"): 134 | create_project(remote) 135 | version = get_version() 136 | tag = get_tag(version) 137 | if remote: 138 | docker_push(docker_build(tag, remote)) 139 | else: 140 | sandbox_docker_build(tag) 141 | serialize(tag) 142 | register(version, remote, domain) 143 | 144 | 145 | if __name__ == "__main__": 146 | app() 147 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/docs/actual_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/actual_pipeline.png -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/docs/apply_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/apply_pipeline.png -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/docs/label_studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/label_studio.png -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/docs/target_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/target_pipeline.png -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/docs/train_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/train_pipeline.png -------------------------------------------------------------------------------- 
/projects/whats_cooking_good_looking/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | docker 2 | gitpython 3 | streamlit 4 | typer 5 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | s3fs>=2022.2.0 3 | 4 | snscrape==0.4.3.20220106 5 | spacy==3.2.3 6 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl 7 | google-cloud-storage==2.2.1 8 | gcsfs==2022.2.0 9 | click==7.1.2 10 | pandas==1.3.5 11 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/whats_cooking_good_looking/__init__.py -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/apply_ner_workflow.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import List 4 | 5 | import spacy 6 | from flytekit import Resources, task, workflow 7 | from snscrape.modules.twitter import TwitterSearchScraper 8 | 9 | from whats_cooking_good_looking.utils import (doc_to_spans, download_from_gcs, 10 | load_config, upload_to_gcs) 11 | 12 | SPACY_MODEL = {"en": "en_core_web_sm"} 13 | 14 | CACHE_VERSION = "2.2" 15 | request_resources = Resources(cpu="1", mem="500Mi", storage="500Mi") 16 | limit_resources = Resources(cpu="2", mem="1000Mi", storage="1000Mi") 17 | 18 | 19 | @task 20 | def get_tweets_list( 21 | keyword_list: List[str], lang: str = "en", max_results: int = 1000 22 | ) -> str: 23 | """Collects `max_results` tweets mentioning any of the words in `keywords_list` written in language `lang`. 24 | 25 | Args: 26 | keyword_list (List[str]): List of keywords that tweets must mention at least one of. 27 | lang (str, optional): Language in which tweets must be written(iso-code). Defaults to "en". 28 | max_results (int, optional): Number of maximum tweets to retrieve. Defaults to 1000. 29 | 30 | Returns: 31 | str: json dumped results with following shape 32 | [ 33 | { 34 | "date": "2022-03-25 16:23:01+00:00, 35 | "tweet_id": "XXXXXXX", 36 | "text": "some tweet", 37 | "username": "some user" 38 | }, 39 | ] 40 | """ 41 | keywords_query = " OR ".join(keyword_list) 42 | query = f"({keywords_query}) lang:{lang}" 43 | tweets_list = [] 44 | for tweet_idx, tweet_post in enumerate(TwitterSearchScraper(query).get_items()): 45 | if tweet_idx == max_results: 46 | break 47 | tweets_list.append( 48 | { 49 | "date": str(tweet_post.date), 50 | "tweet_id": str(tweet_post.id), 51 | "text": str(tweet_post.content), 52 | "username": str(tweet_post.username), 53 | } 54 | ) 55 | return json.dumps(tweets_list) 56 | 57 | 58 | @task 59 | def load_model( 60 | lang: str, 61 | from_gcs: bool, 62 | gcs_bucket: str, 63 | gcs_source_blob_name: str, 64 | ) -> spacy.Language: 65 | """Loads spacy model either from gcs if specified or given the source language. 66 | 67 | Args: 68 | lang (str): Language in which tweets must be written(iso-code). 
69 | from_gcs (bool): True if needs to download custom spacy model from gcs. 70 | gcs_bucket (str): bucket name where to retrieve spacy model if from_gcs. 71 | gcs_source_blob_name (str): blob name where to retrieve spacy model if from_gcs. 72 | 73 | Returns: 74 | Language: spacy model. 75 | """ 76 | if from_gcs: 77 | Path("tmp").mkdir(parents=True, exist_ok=True) 78 | output_filename = download_from_gcs( 79 | gcs_bucket, gcs_source_blob_name, "tmp", explicit_filepath=True 80 | )[0] 81 | nlp = spacy.load(output_filename) 82 | else: 83 | model_name = SPACY_MODEL[lang] 84 | nlp = spacy.load(model_name) 85 | return nlp 86 | 87 | 88 | @task 89 | def apply_model( 90 | nlp: bytes, tweets_list: str, bucket_name: str, source_blob_name: str 91 | ) -> str: 92 | """Applies spacy model to each tweet to extract entities from and convert them into 93 | Label studio task format. 94 | 95 | Args: 96 | nlp (Language): Spacy model to use for inference. 97 | tweets_list (str): json dumped list of tweets. 98 | bucket_name (str): Name of the GCS bucket to upload to. 99 | source_blob_name (str): File name of GCS uploaded file. 100 | 101 | Returns: 102 | str: json dumped results with following shape 103 | [ 104 | { 105 | "date": "2022-03-25 16:23:01+00:00, 106 | "tweet_id": "XXXXXXX", 107 | "text": "some tweet", 108 | "username": "some user" 109 | "entities": [ 110 | { 111 | "label": "some label", 112 | "start_char": "index beginning char entity", 113 | "end_char": "index end char entity" 114 | }, 115 | ] 116 | 117 | } 118 | ] 119 | """ 120 | entities = set() 121 | labelstudio_tasks = [] 122 | model_name = SPACY_MODEL["en"] 123 | for tweet in json.loads(tweets_list): 124 | predictions = [] 125 | text = tweet["text"] 126 | doc = nlp(text) 127 | spans, ents = doc_to_spans(doc) 128 | entities |= ents 129 | predictions.append({"model_version": model_name, "result": spans}) 130 | labelstudio_tasks.append({"data": {"text": text}, "predictions": predictions}) 131 | with open("tasks.json", mode="w") as f: 132 | json.dump(labelstudio_tasks, f, indent=2) 133 | json_labelstudio_tasks = json.dumps(labelstudio_tasks) 134 | upload_to_gcs( 135 | bucket_name, source_blob_name, json_labelstudio_tasks, content_type=None 136 | ) 137 | return json_labelstudio_tasks 138 | 139 | 140 | @workflow 141 | def main() -> str: 142 | """Main workflow searching for entities in beauty related tweets. 
143 | 144 | Returns: 145 | str: json dumped results with following shape 146 | [ 147 | { 148 | "date": "2022-03-25 16:23:01+00:00, 149 | "tweet_id": "XXXXXXX", 150 | "text": "some tweet", 151 | "username": "some user" 152 | "entities": [ 153 | { 154 | "label": "some label", 155 | "start_char": "index beginning char entity", 156 | "end_char": "index end char entity" 157 | }, 158 | ] 159 | 160 | } 161 | ] 162 | """ 163 | config = load_config("apply") 164 | tweets_list = get_tweets_list( 165 | keyword_list=config["keyword_list"], 166 | lang=config["lang"], 167 | max_results=config["max_results"], 168 | ) 169 | nlp = load_model( 170 | lang=config["lang"], 171 | from_gcs=config["from_gcs"], 172 | gcs_bucket=config["bucket_name"], 173 | gcs_source_blob_name=config["gcs_spacy_model_blob_name"], 174 | ) 175 | return apply_model( 176 | nlp=nlp, 177 | tweets_list=tweets_list, 178 | bucket_name=config["bucket_name"], 179 | source_blob_name=config["applied_model_output_blob_name"], 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | print(f"Applied model: {main()}") 185 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/config.json: -------------------------------------------------------------------------------- 1 | {"apply": 2 | { 3 | "lang": "en", 4 | "keyword_list": ["beauty", "skin", "hair"], 5 | "max_results": 1000, 6 | "from_gcs": false, 7 | "bucket_name": "wcgl_data", 8 | "gcs_spacy_model_blob_name": "", 9 | "applied_model_output_blob_name": "label_in/tasks_out.json" 10 | }, 11 | "train": 12 | { 13 | "bucket_name": "wcgl_data", 14 | "training_iterations": 10, 15 | "bucket_label_out_name": "wcgl_label_out", 16 | "model_name": "dummy", 17 | "label_studio_output_blob_name": "annotations.json", 18 | "model_output_blob_name": "spacy_model/models/dummy.pkl" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/keywords.txt: -------------------------------------------------------------------------------- 1 | açai 2 | acid serum 3 | acne prone 4 | active charcoal 5 | adaptogens 6 | advanced hydrators 7 | african black soap 8 | agentnateur 9 | ahas 10 | algae 11 | almond milk 12 | aloe soothing 13 | aloe vera 14 | alpha hydroxy acids 15 | amino 16 | ampoule 17 | amrezy highlighter 18 | anti pollution skincare 19 | antioxidant lip 20 | aox 21 | ardellbeauty 22 | ascorbic acid 23 | ash blonde 24 | babylights 25 | baking setting 26 | bakuchiol 27 | balayage tutorials 28 | balm 29 | bar soap 30 | beauty blender 31 | beauty oil 32 | beauty sponge 33 | beauty wellness 34 | benzoate 35 | bergamot 36 | beta carotene 37 | beta glucan 38 | biba eyeshadow 39 | big hair 40 | blaze 41 | blonde lightener 42 | blue light 43 | blueberry 44 | blush duo 45 | blush stix 46 | body butter 47 | body shimmer 48 | bottom lashes 49 | bounce 50 | box dye 51 | brassy 52 | bridal hair 53 | bright blonde 54 | brightening essence 55 | brightening serum 56 | brightens 57 | bronde 58 | bronzer blush 59 | bronzer brush 60 | bronzer highlighter 61 | brow blade 62 | brow duo 63 | brow pen 64 | brow routine 65 | brow styler 66 | browbone 67 | brown brow 68 | buckthorn 69 | burdock root 70 | bushy brow 71 | butyrospermum parkii 72 | calendula 73 | calm serum 74 | camu camu 75 | carotene 76 | carrageenan 77 | cbd 78 | cellulose mask 79 | cica 80 | cilantro 81 | cleansing balm 82 | collagen 83 | cucumber water 84 | damp hair 85 | 
deep hydration 86 | deeper lip 87 | dermaplane 88 | detox 89 | dew wet 90 | dewy glow 91 | dipbrow gel 92 | diy lip scrub 93 | double cleansing 94 | dragon fruit 95 | dream palette 96 | drmigy 97 | dry oil 98 | dryness 99 | elastin 100 | elastin 101 | elemis 102 | encapsulated 103 | endorphins 104 | enlight halo 105 | enlight powder 106 | environmental aggressors 107 | enzyme 108 | enzyme cleansing 109 | eraser 110 | essence oil 111 | essential oils 112 | exfoliator 113 | eye brush 114 | eye creams 115 | eye masks 116 | eyeliner easy 117 | eyeliner tutorial 118 | face massaging 119 | face mist 120 | face roller 121 | face serum 122 | facial hydrating 123 | falsies 124 | faux mink 125 | ferment 126 | fine mist 127 | finish foundation 128 | flaking 129 | flare ups 130 | flash palette 131 | flyliner 132 | fornax 133 | fouurthraybeauty 134 | fragrance free 135 | full on glam 136 | full reveal 137 | full tutorial 138 | fuller 139 | fuse 140 | gel like texture 141 | gel liners 142 | gel lips 143 | gemstone 144 | gentle exfoliation 145 | gentle peel 146 | ginseng 147 | glam eyeshadow 148 | glam palette 149 | glassy 150 | gloss set 151 | glossy lip 152 | glow gloss 153 | gluconolactone 154 | golden blonde 155 | grapeseed 156 | green level 157 | green tea 158 | grey blending 159 | growth factors 160 | gua sha 161 | hada labo 162 | hair oil 163 | halo powder 164 | harmful ingredients 165 | hazel 166 | healthy Glow 167 | heat damage 168 | helichrysum 169 | hemp 170 | hooded eyes 171 | hormonal 172 | hormonal acne 173 | hormones 174 | hyaluronic acid 175 | hydra mist 176 | hydrating foundation 177 | hydrating gel 178 | hydrating serum 179 | hydrocolloid 180 | hydrogel 181 | hydrojelly mask 182 | hydroquinone 183 | injectables 184 | injections 185 | iunik 186 | jeju 187 | jelly cleanser 188 | jojoba 189 | jojoba seed 190 | kakadu plum 191 | klairs 192 | kombucha 193 | konjac sponge 194 | korean skin care 195 | kose 196 | krave 197 | kush brow 198 | kush fiber 199 | kvd 200 | lactobacillus 201 | lanolin 202 | laser treatment 203 | lash glue 204 | lash line 205 | lashes super long 206 | lemon 207 | lifts 208 | light blonde 209 | light hydrating 210 | light makeup 211 | light to dark 212 | lip glosses 213 | lip kit 214 | lip liners 215 | lip lines 216 | lip mask 217 | lip oil 218 | lip topper 219 | lip treatment 220 | lipid 221 | lipoic acid 222 | liquid glass 223 | liquid peel 224 | liquid powder 225 | lob 226 | long lashes 227 | long lasting 228 | long wearing 229 | longwear foundation shade 230 | lotus 231 | low ph 232 | lower lash 233 | lowlights 234 | luminous foundation shade 235 | lunarbeauty 236 | magenta 237 | magnesium 238 | makeup geek 239 | makeup hair 240 | mandelic acid 241 | matcha 242 | matifying 243 | matte eyeshadows 244 | matte metal 245 | matte shadows 246 | matte velvet 247 | mehronmakeup 248 | melasma 249 | melted matte 250 | microdermabrasion 251 | microneedling 252 | milk honey 253 | milky 254 | mineral spf 255 | mini palette 256 | mink lashes 257 | mist 258 | misty mauve 259 | mochi 260 | moringa 261 | mothership palette 262 | mud 263 | mugwort 264 | multi purpose 265 | multi tonal 266 | murad 267 | nabla 268 | nanoparticles 269 | natural finish 270 | natural hyaluronic 271 | natural oils 272 | neck cream 273 | neon pink and neon yellow 274 | neroli 275 | niacinamide booster 276 | night cream 277 | night repair 278 | night routine 279 | nude lip 280 | nude shades 281 | oats 282 | oil Infused 283 | oil soluble 284 | oil vision 285 | olive 286 | omega 287 | overnight mask 288 | 
packaging free 289 | pad 290 | pale yellow 291 | palette dark 292 | palette fornax 293 | palette shades 294 | palmitate 295 | palmitoyl 296 | papaya 297 | paraben 298 | passionfruit 299 | patch testing 300 | peanut 301 | peel off mask 302 | peptides antioxidants 303 | perma 304 | permagel 305 | peroxide 306 | phloretin 307 | physical exfoliant 308 | pigment palette 309 | pink salt 310 | plouise makeup 311 | pollution protection 312 | porous 313 | powder palette 314 | priming 315 | priming moisturizer 316 | prism 317 | pro pigment 318 | probiotic skincare 319 | propanediol 320 | propolis 321 | protection cream 322 | purifying mask 323 | purito 324 | purple hair 325 | purple raven 326 | purples 327 | rainbow hair 328 | red light therapy 329 | refillable 330 | regrowth 331 | reiki 332 | reishi 333 | rejuvenating 334 | rejuvenation 335 | remover 336 | renewal mask 337 | resurfacing mask 338 | retinoid 339 | retinol 340 | rice toner 341 | root melt 342 | root shadow 343 | rootagé 344 | rose elixir 345 | rose hip oil 346 | rose toner 347 | rose water 348 | rosehip 349 | rosewater facial 350 | routine styling 351 | safari 352 | safflower 353 | sake 354 | sakura 355 | salicylic acid 356 | sativa 357 | seaweed 358 | semi opaque 359 | semi sheer 360 | serum retinol 361 | set powder 362 | shades in matte 363 | shea 364 | sheet masks 365 | shimmer shades 366 | shine spray 367 | silver hair 368 | silver smoke 369 | single layer 370 | sleeping masks 371 | sls 372 | smashbox studio 373 | smoky eye 374 | smoky quartz 375 | smudge brush 376 | smudge proof 377 | snail 378 | snowflake 379 | spoolie 380 | squalene 381 | stearate 382 | stem cell 383 | stick masks 384 | stix 385 | strengthening 386 | sugarpill 387 | sulfur 388 | sunflower 389 | super hydrating 390 | super pigmented 391 | super sensitive 392 | supercharged complex 393 | superfood 394 | sweet almond 395 | tattoo liner 396 | tea seed 397 | tea tree 398 | textured hair 399 | tinted mineral 400 | tony moly 401 | topper 402 | toxins 403 | treatment at home 404 | tube 405 | tuberose 406 | tyrosinase 407 | ultra matte 408 | upf 409 | urea 410 | vegan skincare 411 | velour lashes 412 | velvetine 413 | visible redness 414 | vitamin c serum 415 | volume and length 416 | walnuts 417 | water gel 418 | water loss 419 | waterline 420 | watermelon 421 | watery 422 | wet balayage 423 | wet balm 424 | wet hair 425 | white liquid 426 | white lotus 427 | willow bark 428 | witch hazel 429 | wiz taupe 430 | yellow shampoo 431 | zulu -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/train_ner_workflow.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import random 4 | from collections import defaultdict 5 | 6 | import spacy 7 | from flytekit import Resources, dynamic, task, workflow 8 | from spacy.language import Language 9 | from spacy.training import Example 10 | from spacy.util import compounding, minibatch 11 | 12 | from whats_cooking_good_looking.apply_ner_workflow import load_model 13 | from whats_cooking_good_looking.utils import (download_bytes_from_gcs, 14 | load_config, upload_to_gcs) 15 | 16 | SPACY_MODEL = {"en": "en_core_web_sm"} 17 | 18 | CACHE_VERSION = "2.2" 19 | request_resources = Resources(cpu="1", mem="500Mi", storage="500Mi") 20 | limit_resources = Resources(cpu="2", mem="1000Mi", storage="1000Mi") 21 | 22 | THRESHOLD_ACCURACY = 0.7 23 | 24 | 25 | @task 26 | def 
evaluate_ner(labelstudio_tasks: bytes) -> dict: 27 | """Computes the accuracy of each NER model from the Label Studio annotation output. 28 | 29 | Args: 30 | labelstudio_tasks (bytes): JSON-encoded list of Label Studio annotation dicts with the following format 31 | [ 32 | { 33 | "result": [ 34 | { 35 | "value": {"start": 10, "end": 17, "text": "Chennai", "labels": ["LOC"]}, 36 | "from_name": "label", 37 | "to_name": "text", 38 | "type": "labels", 39 | "origin": "manual", 40 | } 41 | ], 42 | "predictions": [ 43 | { 44 | "result": {"start": 10, "end": 17, "text": "Chennai", "labels": ["LOC"]}, 45 | "model_version": "en_core_web_sm", 46 | } 47 | ], 48 | } 49 | ] 50 | 51 | Returns: 52 | dict: mapping {model_name: accuracy} 53 | 54 | """ 55 | model_acc = dict() 56 | model_hits = defaultdict(int) 57 | for ls_task in json.loads(labelstudio_tasks): 58 | annotation_result = ls_task["result"][0]["value"] 59 | # drop the annotation id (if any) before comparing against predictions 60 | annotation_result.pop("id", None) 61 | for prediction in ls_task["predictions"]: 62 | model_version = prediction["model_version"] 63 | model_hits[model_version] += int(prediction["result"] == annotation_result) 64 | 65 | num_task = len(json.loads(labelstudio_tasks)) 66 | for model_name, num_hits in model_hits.items(): 67 | acc = num_hits / num_task 68 | model_acc[model_name] = acc 69 | print(f"Accuracy for {model_name}: {acc:.2%}") 70 | return model_acc 71 | 72 | 73 | @task 74 | def load_tasks(bucket_name: str, source_blob_name: str) -> bytes: 75 | """Loads Label Studio annotations. 76 | 77 | Args: 78 | bucket_name (str): GCS bucket name where tasks are stored. 79 | source_blob_name (str): GCS blob name where tasks are stored. 80 | 81 | Returns: 82 | bytes: json dumped tasks 83 | """ 84 | labelstudio_tasks = download_bytes_from_gcs( 85 | bucket_name=bucket_name, source_blob_name=source_blob_name 86 | ) 87 | return labelstudio_tasks 88 | 89 | 90 | @task 91 | def format_tasks_for_train(labelstudio_tasks: bytes) -> str: 92 | """Formats Label Studio output into training data for the custom spaCy model. 93 | 94 | Args: 95 | labelstudio_tasks (bytes): json dumped Label Studio tasks 96 | 97 | Returns: 98 | str: json dumped formatted train data 99 | """ 100 | train_data = [] 101 | for ls_task in json.loads(labelstudio_tasks): 102 | entities = [ 103 | (ent["value"]["start"], ent["value"]["end"], label) 104 | for ent in ls_task["result"] 105 | for label in ent["value"]["labels"] 106 | ] 107 | if entities: 108 | train_data.append((ls_task["task"]["data"]["text"], {"entities": entities})) 109 | return json.dumps(train_data) 110 | 111 | 112 | @task 113 | def train_model( 114 | train_data: str, 115 | nlp: Language, 116 | training_iterations: int, 117 | bucket_out: str, 118 | source_blob_name: str, 119 | ) -> Language: 120 | """Uses newly labelled data to improve the spaCy NER model and uploads the trained model to GCS. 121 | 122 | Args: 123 | train_data (str): json dumped training data to train the model on. After being loaded, the format \ 124 | should be the following: 125 | train_data = [ 126 | ("Text to detect Entities in.", {"entities": [(15, 23, "PRODUCT")]}), 127 | ("Flyte is another example of organisation.", {"entities": [(0, 6, "ORG")]}), 128 | ] 129 | nlp (Language): spaCy base model to train on. 130 | training_iterations (int): Number of training iterations to run.
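bucket_out (str): Name of the GCS bucket the trained model is uploaded to. source_blob_name (str): Blob name under which the trained model is stored in GCS.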
131 | 132 | Returns: 133 | Language: Trained spacy model 134 | """ 135 | train_data = json.loads(train_data) 136 | ner = nlp.get_pipe("ner") 137 | for _, annotations in train_data: 138 | for ent in annotations.get("entities"): 139 | ner.add_label(ent[2]) 140 | pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] 141 | unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] 142 | print("Starting model training") 143 | with nlp.disable_pipes(*unaffected_pipes): 144 | optimizer = spacy.blank("en").initialize() 145 | for iteration in range(training_iterations): 146 | random.shuffle(train_data) 147 | losses = {} 148 | batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) 149 | for batch in batches: 150 | for text, annotations in batch: 151 | doc = nlp.make_doc(text) 152 | example = Example.from_dict(doc, annotations) 153 | nlp.update([example], drop=0.35, losses=losses, sgd=optimizer) 154 | print("Iteration n°", iteration) 155 | print("Losses", losses) 156 | print("Model training completed!") 157 | upload_to_gcs(bucket_out, source_blob_name, pickle.dumps(nlp)) 158 | print("Model upload to GCS completed!") 159 | return nlp 160 | 161 | 162 | @dynamic( 163 | cache=False, 164 | requests=request_resources, 165 | limits=limit_resources, 166 | ) 167 | def train_model_if_necessary( 168 | labelstudio_tasks: bytes, 169 | metrics_dict: dict, 170 | model_name: str, 171 | training_iterations: int, 172 | bucket_out: str, 173 | model_output_blob_name: str, 174 | ): 175 | """Checks the model accuracy. If it is high enough, the pipeline stops, else it trains a new model \ 176 | and uploads it to GCS. 177 | 178 | Args: 179 | labelstudio_tasks (bytes): Label Studio annotations 180 | metrics_dict (dict): mapping between model name and accuracy 181 | model_name (str): model name from which we get accuracy 182 | training_iterations (int): number of training iterations for the spacy NER model 183 | """ 184 | if metrics_dict[model_name] >= THRESHOLD_ACCURACY: 185 | print(f"No need to train. Accuracy of {metrics_dict[model_name]} is above threshold {THRESHOLD_ACCURACY}") 186 | else: 187 | train_data = format_tasks_for_train(labelstudio_tasks=labelstudio_tasks) 188 | nlp = load_model( 189 | lang="en", 190 | from_gcs=False, 191 | gcs_bucket=bucket_out, 192 | gcs_source_blob_name=model_output_blob_name, 193 | ) 194 | nlp = train_model( 195 | train_data=train_data, 196 | nlp=nlp, 197 | training_iterations=training_iterations, 198 | bucket_out=bucket_out, 199 | source_blob_name=model_output_blob_name, 200 | ) 201 | 202 | 203 | @workflow 204 | def main(): 205 | """Main training workflow evaluating model based on labelled observations.
206 | * If accuracy is high enough, the pipeline ends 207 | * If accuracy is below threshold, the pipeline trains a new model based on those 208 | observations and dumps it to GCS 209 | """ 210 | config = load_config("train") 211 | labelstudio_tasks = load_tasks( 212 | bucket_name=config["bucket_label_out_name"], 213 | source_blob_name=config["label_studio_output_blob_name"], 214 | ) 215 | metrics_dict = evaluate_ner(labelstudio_tasks=labelstudio_tasks) 216 | train_model_if_necessary( 217 | labelstudio_tasks=labelstudio_tasks, 218 | metrics_dict=metrics_dict, 219 | training_iterations=config["training_iterations"], 220 | model_name=config["model_name"], 221 | bucket_out=config["bucket_name"], 222 | model_output_blob_name=config["model_output_blob_name"], 223 | ) 224 | 225 | 226 | if __name__ == "__main__": 227 | main() 228 | -------------------------------------------------------------------------------- /projects/whats_cooking_good_looking/whats_cooking_good_looking/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import groupby 4 | from pathlib import Path 5 | from typing import List, Union 6 | 7 | from google.cloud import storage 8 | 9 | 10 | def load_config(train_or_apply: str) -> dict: 11 | """Load the requested section ("train" or "apply") of config.json.""" 12 | config_file_path = Path(__file__).parent.resolve() / "config.json" 13 | with open(config_file_path, "r") as f: 14 | config = json.load(f) 15 | return config[train_or_apply] 16 | 17 | 18 | def doc_to_spans(doc): 19 | """This function converts spaCy docs to the list of named entity spans in Label Studio compatible JSON format""" 20 | tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc] 21 | results = [] 22 | entities = set() 23 | for entity, group in groupby(tokens, key=lambda t: t[-1]): 24 | if not entity: 25 | continue 26 | group = list(group) 27 | _, start, _ = group[0] 28 | word, last, _ = group[-1] 29 | text = " ".join(item[0] for item in group) 30 | end = last + len(word) 31 | results.append( 32 | { 33 | "from_name": "label", 34 | "to_name": "text", 35 | "type": "labels", 36 | "value": {"start": start, "end": end, "text": text, "labels": [entity]}, 37 | } 38 | ) 39 | entities.add(entity) 40 | 41 | return results, entities 42 | 43 | 44 | def load_train_data(train_data_files: List[str]) -> List: 45 | """Load jsonl train data as a list, ready to be ingested by the spaCy model. 46 | 47 | Args: 48 | train_data_files (List[str]): Paths of the files to load. 49 | 50 | Returns: 51 | List: (text, entities dict) tuples to be used for training. 52 | """ 53 | train_data = [] 54 | for data_file in train_data_files: 55 | with open(data_file, "r") as f: 56 | for json_str in list(f): 57 | train_data_dict = json.loads(json_str) 58 | train_text = train_data_dict["text"] 59 | train_entities = { 60 | "entities": [ 61 | tuple(entity_elt) for entity_elt in train_data_dict["entities"] 62 | ] 63 | } 64 | formatted_train_line = (train_text, train_entities) 65 | train_data.append(formatted_train_line) 66 | return train_data 67 | 68 | 69 | def download_from_gcs( 70 | bucket_name: str, 71 | source_blob_name: str, 72 | destination_folder: str, 73 | explicit_filepath: bool = False, 74 | ) -> Union[str, List[str]]: 75 | """Download GCS data locally. 76 | 77 | Args: 78 | bucket_name (str): Name of the GCS bucket. 79 | source_blob_name (str): GCS path to data in the bucket. 80 | destination_folder (str): Folder to download GCS data to.
81 | explicit_filepath (bool, optional): Decides whether to return the explicit list of filepaths instead \ 82 | of the destination folder only. Defaults to False. 83 | 84 | Returns: 85 | Union[str, List[str]]: Local destination folder, or the list of downloaded filepaths if explicit_filepath is True. 86 | """ 87 | storage_client = storage.Client() 88 | bucket = storage_client.bucket(bucket_name) 89 | blobs = bucket.list_blobs(prefix=source_blob_name) 90 | filepath_list = [] 91 | for blob in blobs: 92 | if not blob.name.endswith("/"): 93 | filename = blob.name.replace("/", "_") 94 | local_path = os.path.join(destination_folder, filename) 95 | blob.download_to_filename(local_path) 96 | filepath_list.append(local_path) 97 | print(f"Downloaded at {destination_folder}") 98 | if explicit_filepath: 99 | return filepath_list 100 | return destination_folder 101 | 102 | 103 | def download_bytes_from_gcs(bucket_name, source_blob_name): 104 | storage_client = storage.Client() 105 | bucket = storage_client.bucket(bucket_name) 106 | blob = bucket.blob(source_blob_name) 107 | return blob.download_as_string() 108 | 109 | 110 | def upload_to_gcs(bucket_name, source_blob_name, data, content_type=None): 111 | storage_client = storage.Client() 112 | bucket = storage_client.bucket(bucket_name) 113 | blob = bucket.blob(source_blob_name) 114 | blob.upload_from_string(data, content_type=content_type) 115 | -------------------------------------------------------------------------------- /templates/_common/deploy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import uuid 4 | from pathlib import Path 5 | 6 | import docker 7 | import git 8 | import typer 9 | 10 | 11 | app = typer.Typer() 12 | 13 | docker_client = docker.from_env() 14 | 15 | 16 | IMAGE_NAME = "flytelab" 17 | REGISTRY = "ghcr.io/{{cookiecutter.github_username}}".lower() 18 | PROJECT_NAME = "{{cookiecutter.flyte_project}}" 19 | DESCRIPTION = "{{cookiecutter.description}}" 20 | 21 | 22 | def create_project(remote: bool): 23 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 24 | output = subprocess.run( 25 | [ 26 | "flytectl", 27 | "get", 28 | "project", 29 | PROJECT_NAME, 30 | "--config", config, 31 | ], 32 | capture_output=True, 33 | check=True, 34 | ) 35 | if output.stdout.decode().strip(): 36 | return 37 | 38 | typer.echo(f"Creating project {PROJECT_NAME}") 39 | subprocess.run( 40 | [ 41 | "flytectl", 42 | "create", 43 | "project", 44 | "--project", PROJECT_NAME, 45 | "--name", PROJECT_NAME, 46 | "--id", PROJECT_NAME, 47 | "--description", DESCRIPTION, 48 | "--config", config, 49 | ], 50 | check=True, 51 | ) 52 | 53 | 54 | def get_version(fast: bool): 55 | repo = git.Repo(".", search_parent_directories=True) 56 | if not fast and repo.is_dirty(): 57 | typer.echo( 58 | "Please commit git changes before building.
If you haven't updated any system/python dependencies " 59 | "but want to deploy task/workflow code changes, use the --fast flag to do fast registration.", 60 | err=True 61 | ) 62 | raise typer.Exit(code=1) 63 | commit = repo.rev_parse("HEAD") 64 | return commit.hexsha 65 | 66 | 67 | def get_tag(version, registry=None): 68 | return f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:{PROJECT_NAME}-{version}" 69 | 70 | 71 | def sandbox_docker_build(tag): 72 | typer.echo("Building image in Flyte sandbox") 73 | subprocess.run( 74 | [ 75 | "flytectl", 76 | "sandbox", 77 | "exec", 78 | "--", 79 | "docker", 80 | "build", 81 | ".", 82 | "--tag", 83 | tag, 84 | ], 85 | check=True, 86 | ) 87 | 88 | 89 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image: 90 | client = docker.from_env() 91 | 92 | # TODO: image build, push, flytectl serialization and registration 93 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 94 | 95 | typer.echo(f"Building image: {tag}") 96 | image, build_logs = client.images.build( 97 | path=".", 98 | dockerfile="Dockerfile", 99 | tag=tag, 100 | buildargs={ 101 | "image": tag, 102 | "config": str(config), 103 | } 104 | ) 105 | for line in build_logs: 106 | typer.echo(line) 107 | return image 108 | 109 | 110 | def docker_push(image: docker.models.images.Image): 111 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True): 112 | typer.echo(line) 113 | 114 | 115 | def serialize(tag: str, remote: bool, fast: bool): 116 | typer.echo("Serializing Flyte workflows") 117 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config" 118 | package = Path(".") / "flyte-package.tgz" 119 | if package.exists(): 120 | os.remove(package) 121 | subprocess.run( 122 | [ 123 | "pyflyte", 124 | "-c", str(config), 125 | "--pkgs", "{{cookiecutter.project_name}}", 126 | "package", 127 | "--force", 128 | "--image", tag, 129 | *( 130 | ["--fast"] 131 | if fast 132 | else ["--in-container-source-path", "/root"] 133 | ), 134 | ], 135 | check=True, 136 | # inject the FLYTE_SANDBOX environment variable to the serialization runtime 137 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ}, 138 | ) 139 | 140 | 141 | def register(version: str, remote: bool, fast: bool, domain: str): 142 | typer.echo("Registering Flyte workflows") 143 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml" 144 | if fast: 145 | version = f"{version}-fast{uuid.uuid4().hex[:7]}" 146 | subprocess.run( 147 | [ 148 | "flytectl", 149 | "-c", config, 150 | "register", 151 | "files", 152 | "--project", PROJECT_NAME, 153 | "--domain", domain, 154 | "--archive", "flyte-package.tgz", 155 | "--force", 156 | "--version", version 157 | ], 158 | check=True, 159 | ) 160 | typer.echo(f"Successfully registered version {version}") 161 | 162 | 163 | @app.command() 164 | def main(remote: bool = False, fast: bool = False, domain: str = "development", registry: str = None): 165 | if remote and fast: 166 | typer.echo( 167 | "Fast registration is not enabled when deploying to remote. 
" 168 | "Please deploy your workflows without the --fast flag.", 169 | err=True 170 | ) 171 | create_project(remote) 172 | version = get_version(fast) 173 | tag = get_tag(version, registry) 174 | if not fast: 175 | if remote: 176 | docker_push(docker_build(tag, remote)) 177 | else: 178 | sandbox_docker_build(tag) 179 | serialize(tag, remote, fast) 180 | register(version, remote, fast, domain) 181 | 182 | 183 | if __name__ == "__main__": 184 | app() 185 | -------------------------------------------------------------------------------- /templates/basic/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/basic/README.md -------------------------------------------------------------------------------- /templates/basic/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": null, 3 | "project_author": null, 4 | "github_username": null, 5 | "flyte_project": "{{ cookiecutter.project_author|lower|replace(' ', '-')|replace('_', '-') }}", 6 | "description": "A flytelab project" 7 | } 8 | -------------------------------------------------------------------------------- /templates/basic/hooks/pre_gen_project.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | PROJECT_NAME_REGEX = r'^[_a-zA-Z][_a-zA-Z0-9]+$' 5 | PROJECT_AUTHOR_REGEX = r'^[-_a-zA-Z0-9 ]+$' 6 | FLYTE_PROJECT_REGEX = r'^[-a-z0-9]+$' 7 | 8 | project_name = '{{ cookiecutter.project_name }}' 9 | project_author = '{{ cookiecutter.project_author }}' 10 | flyte_project = '{{ cookiecutter.flyte_project }}' or '{{ cookiecutter.project_author }}'.lower().replace("_", "-").replace(" ", "-") 11 | 12 | if not re.match(PROJECT_NAME_REGEX, project_name): 13 | print(f"ERROR: project_name '{project_name}' is invalid. Must match the expression {PROJECT_NAME_REGEX}") 14 | sys.exit(1) 15 | 16 | if not re.match(PROJECT_AUTHOR_REGEX, project_author): 17 | print(f"ERROR: project_author '{project_author}' is invalid. Must match the expression {PROJECT_AUTHOR_REGEX}") 18 | sys.exit(1) 19 | 20 | if not re.match(FLYTE_PROJECT_REGEX, flyte_project): 21 | print(f"ERROR: flyte_project '{flyte_project}' is invalid. Must match the expression {FLYTE_PROJECT_REGEX}") 22 | sys.exit(1) 23 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/.dockerignore: -------------------------------------------------------------------------------- 1 | !.flyte -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: Pkce 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | insecure: false 7 | storage: 8 | type: stow 9 | stow: 10 | kind: s3 11 | config: 12 | auth_type: iam 13 | region: us-east-2 14 | logger: 15 | # Logger settings to control logger output. 
Useful to debug logger: 16 | show-source: true 17 | level: 1 18 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages={{cookiecutter.project_name}} 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://open-compute-playground 7 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true 20 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages={{cookiecutter.project_name}} 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 7 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | WORKDIR /root 4 | ENV VENV /opt/venv 5 | ENV LANG C.UTF-8 6 | ENV LC_ALL C.UTF-8 7 | ENV PYTHONPATH /root 8 | 9 | # e.g. 
flyte.config or sandbox.config 10 | ARG config 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | libsm6 \ 15 | libxext6 \ 16 | libxrender-dev \ 17 | ffmpeg \ 18 | build-essential 19 | 20 | # Install the AWS cli separately to prevent issues with boto being written over 21 | RUN pip3 install awscli 22 | 23 | ENV VENV /opt/venv 24 | 25 | # Virtual environment 26 | RUN python3 -m venv ${VENV} 27 | ENV PATH="${VENV}/bin:$PATH" 28 | 29 | # Install Python dependencies 30 | COPY requirements.txt /root 31 | RUN pip install -r /root/requirements.txt 32 | 33 | COPY {{cookiecutter.project_name}} /root/{{cookiecutter.project_name}} 34 | COPY $config /root/flyte.config 35 | 36 | # This image is supplied by the build script and will be used to determine the version 37 | # when registering tasks, workflows, and launch plans 38 | ARG image 39 | ENV FLYTE_INTERNAL_IMAGE $image 40 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/README.md: -------------------------------------------------------------------------------- 1 | # {{cookiecutter.project_name}} 2 | 3 | {{cookiecutter.description}} 4 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/dashboard/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from pathlib import Path 4 | 5 | import streamlit as st 6 | 7 | from flytekit.remote import FlyteRemote 8 | from flytekit.models import filters 9 | from flytekit.models.admin.common import Sort 10 | 11 | from sklearn.datasets import load_digits 12 | 13 | 14 | PROJECT_NAME = "{{cookiecutter.flyte_project}}" 15 | WORKFLOW_NAME = "{{cookiecutter.project_name}}.workflows.main" 16 | 17 | 18 | parser = ArgumentParser() 19 | parser.add_argument("--remote", action="store_true") 20 | args = parser.parse_args() 21 | 22 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox') 23 | 24 | # configuration for accessing a Flyte cluster backend 25 | remote = FlyteRemote.from_config( 26 | default_project=PROJECT_NAME, 27 | default_domain="development", 28 | config_file_path=Path(__file__).parent / f"{backend}.config", 29 | ) 30 | 31 | # get the latest workflow execution 32 | [latest_execution, *_], _ = remote.client.list_executions_paginated( 33 | PROJECT_NAME, 34 | "development", 35 | limit=1, 36 | filters=[ 37 | filters.Equal("launch_plan.name", WORKFLOW_NAME), 38 | filters.Equal("phase", "SUCCEEDED"), 39 | ], 40 | sort_by=Sort.from_python_std("desc(execution_created_at)"), 41 | ) 42 | 43 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name) 44 | remote.sync(wf_execution, sync_nodes=False) 45 | model = wf_execution.outputs["o0"] 46 | print(model) 47 | 48 | 49 | ############ 50 | # App Code # 51 | ############ 52 | 53 | data = load_digits(as_frame=True) 54 | 55 | st.write("# Flytelab: {{cookiecutter.project_name}}") 56 | st.write("### {{cookiecutter.description}}") 57 | st.write(f"Model: `{model}`") 58 | 59 | st.write("Use the slider below to select a sample for prediction") 60 | 61 | sample_index = st.slider( 62 | "Sample Number", 63 | min_value=0, 64 | max_value=data.frame.shape[0] - 1, 65 | value=0, 66 | step=1, 67 | ) 68 | 69 | st.image(data.images[sample_index], clamp=True, width=300) 70 | st.write(f"Ground Truth: {data.target[sample_index]}") 71 | st.write(f"Prediction: 
{model.predict(data.frame[data.feature_names].loc[[sample_index]])[0]}") 72 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/dashboard/remote.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=playground.hosted.unionai.cloud 3 | insecure=False 4 | 5 | [credentials] 6 | client_id=flytepropeller 7 | auth_mode=basic 8 | authorization_metadata-key=flyte-authorization 9 | oauth_scopes=all 10 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/dashboard/sandbox.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=localhost:30081 3 | insecure=True 4 | 5 | [aws] 6 | access_key_id=minio 7 | secret_access_key=miniostorage 8 | endpoint=http://localhost:30084 9 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/deploy.py: -------------------------------------------------------------------------------- 1 | ../../_common/deploy.py -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | docker 2 | gitpython 3 | streamlit 4 | typer 5 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | pandas 3 | s3fs 4 | sklearn 5 | -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py -------------------------------------------------------------------------------- /templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/workflows.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import load_digits 3 | from sklearn.linear_model import LogisticRegression 4 | 5 | from flytekit import task, workflow 6 | 7 | 8 | @task 9 | def get_dataset() -> pd.DataFrame: 10 | return load_digits(as_frame=True).frame 11 | 12 | 13 | @task 14 | def train_model(dataset: pd.DataFrame) -> LogisticRegression: 15 | model = LogisticRegression() 16 | features, target = dataset[[x for x in dataset if x != "target"]], dataset["target"] 17 | return model.fit(features, target) 18 | 19 | 20 | @workflow 21 | def main() -> LogisticRegression: 22 | return train_model(dataset=get_dataset()) 23 | 24 | 25 | if __name__ == "__main__": 26 | print(f"trained model: {main()}") 27 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/pytorch-gpu/README.md 
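The basic template's workflows.py above trains a scikit-learn LogisticRegression on the digits dataset, and the accompanying dashboard fetches the latest registered model from a Flyte backend. Since Flyte workflows are plain Python callables when executed locally, the template can be sanity-checked without any cluster. A minimal sketch, assuming the rendered template's workflows.py is importable as workflows (the actual import path depends on the generated project name):

from sklearn.datasets import load_digits

# Hypothetical import path; it depends on the name chosen when rendering the template.
from workflows import main

# Calling the @workflow function directly executes its tasks locally.
model = main()

digits = load_digits(as_frame=True)
sample = digits.frame[digits.feature_names].loc[[0]]  # one digit as a single-row DataFrame
print("prediction:", model.predict(sample)[0])
print("ground truth:", digits.target[0])

Registering the same workflow against a sandbox or remote cluster goes through the shared deploy script shown earlier (templates/_common/deploy.py).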
-------------------------------------------------------------------------------- /templates/pytorch-gpu/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": null, 3 | "project_author": null, 4 | "github_username": null, 5 | "description": "A flytelab project" 6 | } 7 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/hooks/pre_gen_project.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | PROJECT_NAME_REGEX = r'^[_a-zA-Z][_a-zA-Z0-9]+$' 5 | PROJECT_AUTHOR_REGEX = r'^[-_a-zA-Z0-9 ]+$' 6 | FLYTE_PROJECT_REGEX = r'^[-a-z0-9]+$' 7 | 8 | project_name = '{{ cookiecutter.project_name }}' 9 | project_author = '{{ cookiecutter.project_author }}' 10 | flyte_project = '{{ cookiecutter.flyte_project }}' or '{{ cookiecutter.project_author }}'.lower().replace("_", "-").replace(" ", "-") 11 | 12 | if not re.match(PROJECT_NAME_REGEX, project_name): 13 | print(f"ERROR: project_name '{project_name}' is invalid. Must match the expression {PROJECT_NAME_REGEX}") 14 | sys.exit(1) 15 | 16 | if not re.match(PROJECT_AUTHOR_REGEX, project_author): 17 | print(f"ERROR: project_author '{project_author}' is invalid. Must match the expression {PROJECT_AUTHOR_REGEX}") 18 | sys.exit(1) 19 | 20 | if not re.match(FLYTE_PROJECT_REGEX, flyte_project): 21 | print(f"ERROR: flyte_project '{flyte_project}' is invalid. Must match the expression {FLYTE_PROJECT_REGEX}") 22 | sys.exit(1) 23 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/.dockerignore: -------------------------------------------------------------------------------- 1 | !.flyte -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/remote-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///playground.hosted.unionai.cloud 4 | authType: Pkce 5 | # Change insecure flag to ensure that you use the right setting for your environment 6 | insecure: false 7 | storage: 8 | type: stow 9 | stow: 10 | kind: s3 11 | config: 12 | auth_type: iam 13 | region: us-east-2 14 | logger: 15 | # Logger settings to control logger output. Useful to debug logger: 16 | show-source: true 17 | level: 1 18 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/remote.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. 
These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages={{cookiecutter.project_name}} 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://open-compute-playground 7 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/sandbox-config.yaml: -------------------------------------------------------------------------------- 1 | admin: 2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com 3 | endpoint: dns:///localhost:30081 4 | authType: Pkce 5 | insecure: true 6 | logger: 7 | show-source: true 8 | level: 0 9 | storage: 10 | connection: 11 | access-key: minio 12 | auth-type: accesskey 13 | disable-ssl: true 14 | endpoint: http://localhost:30084 15 | region: us-east-1 16 | secret-key: miniostorage 17 | type: minio 18 | container: "my-s3-bucket" 19 | enable-multicontainer: true 20 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/sandbox.config: -------------------------------------------------------------------------------- 1 | [sdk] 2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands 3 | workflow_packages={{cookiecutter.project_name}} 4 | 5 | [auth] 6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab 7 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime 2 | 3 | WORKDIR /root 4 | ENV VENV /opt/venv 5 | ENV LANG C.UTF-8 6 | ENV LC_ALL C.UTF-8 7 | ENV PYTHONPATH /root 8 | 9 | # e.g. 
flyte.config or sandbox.config 10 | ARG config 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | libsm6 \ 15 | libxext6 \ 16 | libxrender-dev \ 17 | ffmpeg \ 18 | build-essential 19 | 20 | # Install the AWS cli separately to prevent issues with boto being written over 21 | RUN pip3 install awscli 22 | 23 | ENV VENV /opt/venv 24 | 25 | # Virtual environment 26 | RUN python3 -m venv ${VENV} 27 | ENV PATH="${VENV}/bin:$PATH" 28 | 29 | # Install Python dependencies 30 | COPY requirements.txt /root 31 | RUN pip install -r /root/requirements.txt 32 | 33 | COPY {{cookiecutter.project_name}} /root/{{cookiecutter.project_name}} 34 | COPY $config /root/flyte.config 35 | 36 | # This image is supplied by the build script and will be used to determine the version 37 | # when registering tasks, workflows, and launch plans 38 | ARG image 39 | ENV FLYTE_INTERNAL_IMAGE $image 40 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/README.md: -------------------------------------------------------------------------------- 1 | # {{cookiecutter.project_name}} 2 | 3 | {{cookiecutter.description}} 4 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/app.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import sys 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | 7 | import streamlit as st 8 | import torch 9 | 10 | from flytekit.remote import FlyteRemote 11 | from flytekit.models import filters 12 | from flytekit.models.admin.common import Sort 13 | from sklearn.datasets import load_digits 14 | 15 | 16 | # import flytelab project source 17 | sys.path.append(str(Path(__file__).parent.parent)) 18 | importlib.import_module("{{cookiecutter.project_name}}") 19 | 20 | 21 | PROJECT_NAME = "{{cookiecutter.flyte_project}}" 22 | WORKFLOW_NAME = "{{cookiecutter.project_name}}.workflows.main" 23 | 24 | 25 | parser = ArgumentParser() 26 | parser.add_argument("--remote", action="store_true") 27 | args = parser.parse_args() 28 | 29 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox') 30 | 31 | # configuration for accessing a Flyte cluster backend 32 | remote = FlyteRemote.from_config( 33 | default_project=PROJECT_NAME, 34 | default_domain="development", 35 | config_file_path=Path(__file__).parent / f"{backend}.config", 36 | ) 37 | 38 | # get the latest workflow execution 39 | [latest_execution, *_], _ = remote.client.list_executions_paginated( 40 | PROJECT_NAME, 41 | "development", 42 | limit=1, 43 | filters=[ 44 | filters.Equal("launch_plan.name", WORKFLOW_NAME), 45 | filters.Equal("phase", "SUCCEEDED"), 46 | ], 47 | sort_by=Sort.from_python_std("desc(execution_created_at)"), 48 | ) 49 | 50 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name) 51 | remote.sync(wf_execution, sync_nodes=False) 52 | model = wf_execution.outputs["o0"] 53 | print(model) 54 | 55 | 56 | ############ 57 | # App Code # 58 | ############ 59 | 60 | data = load_digits(as_frame=True) 61 | 62 | st.write("# Flytelab: {{cookiecutter.project_name}}") 63 | st.write("### {{cookiecutter.description}}") 64 | st.write(f"Model: `{model}`") 65 | 66 | st.write("Use the slider below to select a sample for prediction") 67 | 68 | sample_index = st.slider( 69 | "Sample Number", 70 | min_value=0, 71 | max_value=data.frame.shape[0] - 1, 
72 | value=0, 73 | step=1, 74 | ) 75 | 76 | st.image(data.images[sample_index], clamp=True, width=300) 77 | st.write(f"Ground Truth: {data.target[sample_index]}") 78 | 79 | data = torch.from_numpy(data.frame[data.feature_names].loc[[sample_index]].values).float() 80 | prediction = model(data).argmax().item() 81 | st.write(f"Prediction: {prediction}") 82 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/remote.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=playground.hosted.unionai.cloud 3 | insecure=False 4 | 5 | [credentials] 6 | client_id=flytepropeller 7 | auth_mode=basic 8 | authorization_metadata-key=flyte-authorization 9 | oauth_scopes=all 10 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/sandbox.config: -------------------------------------------------------------------------------- 1 | [platform] 2 | url=localhost:30081 3 | insecure=True 4 | 5 | [aws] 6 | access_key_id=minio 7 | secret_access_key=miniostorage 8 | endpoint=http://localhost:30084 9 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/deploy.py: -------------------------------------------------------------------------------- 1 | ../../_common/deploy.py -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | docker 2 | gitpython 3 | streamlit 4 | typer 5 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/requirements.txt: -------------------------------------------------------------------------------- 1 | flytekit>=0.30.3 2 | pandas 3 | s3fs 4 | sklearn 5 | torch 6 | -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py -------------------------------------------------------------------------------- /templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/workflows.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from sklearn.datasets import load_digits 10 | 11 | from flytekit import task, workflow, Resources 12 | 13 | 14 | dataset_resources = Resources(cpu="1", mem="1Gi", storage="1Gi") 15 | 16 | # This conditional is used at deployment time to determine whether the 17 | # task uses CPUs or GPUs. 
The "FLYTE_SANDBOX" environment variable is 18 | # automatically set by the `deploy.py` script when serializing tasks/workflows 19 | training_resources = ( 20 | Resources(cpu="1", mem="1Gi", storage="1Gi") 21 | if int(os.getenv("FLYTE_SANDBOX", "0")) 22 | else Resources(gpu="1", mem="4Gi", storage="4Gi") 23 | ) 24 | 25 | 26 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | 28 | 29 | class Model(nn.Module): 30 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int): 31 | super(Model, self).__init__() 32 | self.layer1 = nn.Linear(input_dim, hidden_dim) 33 | self.layer2 = nn.Linear(hidden_dim, hidden_dim) 34 | self.layer3 = nn.Linear(hidden_dim, output_dim) 35 | 36 | def forward(self, input): 37 | x = F.relu(self.layer1(input)) 38 | x = F.relu(self.layer2(x)) 39 | return F.log_softmax(self.layer3(x)) 40 | 41 | 42 | @task(requests=dataset_resources, limits=dataset_resources) 43 | def get_dataset() -> pd.DataFrame: 44 | return load_digits(as_frame=True).frame 45 | 46 | 47 | def dataset_iterator(features, target, n_batches: int): 48 | for X, y in zip(np.array_split(features, n_batches), np.array_split(target, n_batches)): 49 | yield ( 50 | torch.from_numpy(X.values).float().to(DEVICE), 51 | torch.from_numpy(y.values).long().to(DEVICE) 52 | ) 53 | 54 | 55 | @task(requests=training_resources, limits=training_resources) 56 | def train_model( 57 | dataset: pd.DataFrame, 58 | hidden_dim: int, 59 | n_epochs: int, 60 | batch_size: int, 61 | learning_rate: float, 62 | ) -> Model: 63 | features, target = dataset[[x for x in dataset if x != "target"]], dataset["target"] 64 | 65 | # define the model 66 | n_classes = target.nunique() 67 | model = Model(features.shape[1], hidden_dim, n_classes).to(DEVICE) 68 | opt = optim.SGD(model.parameters(), lr=learning_rate) 69 | 70 | # iterate through n_epochs and n_batches of the training data 71 | n_batches = int(features.shape[0] / batch_size) 72 | for epoch in range(1, n_epochs + 1): 73 | for batch, (X, y) in enumerate(dataset_iterator(features, target, n_batches), 1): 74 | 75 | opt.zero_grad() 76 | y_hat = model(X) 77 | loss = F.nll_loss(y_hat, y) 78 | loss.backward() 79 | opt.step() 80 | 81 | accuracy = (y_hat.argmax(1) == y).float().mean() 82 | 83 | print( 84 | f"epoch={epoch:02d}: " 85 | f"batch {batch:02d}/{n_batches} - " 86 | f"loss={loss.item():0.04f}; " 87 | f"accuracy={accuracy:0.04f}" 88 | ) 89 | 90 | return model.to("cpu") 91 | 92 | 93 | @workflow 94 | def main( 95 | hidden_dim: int = 300, 96 | n_epochs: int = 30, 97 | batch_size: int = 64, 98 | learning_rate: float = 0.001, 99 | ) -> Model: 100 | return train_model( 101 | dataset=get_dataset(), 102 | hidden_dim=hidden_dim, 103 | n_epochs=n_epochs, 104 | batch_size=batch_size, 105 | learning_rate=learning_rate, 106 | ) 107 | 108 | 109 | if __name__ == "__main__": 110 | print(f"trained model: {main()}") 111 | --------------------------------------------------------------------------------