├── .gitattributes ├── .github ├── labeler.yml ├── pull_request_template.md └── workflows │ ├── cd.yml │ ├── ci.yml │ ├── force-docs-build.yml │ ├── pr.yml │ └── remote-ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── codecov.yml ├── datasets ├── credit.txt ├── image-image_example.csv ├── text-image_example.csv └── text-text_example.csv ├── docs ├── Makefile ├── _static │ ├── JCloud-dark.svg │ ├── JCloud-light.svg │ ├── Powered-by-Jina-Large-Basic.svg │ ├── banner.png │ ├── cas-dark.svg │ ├── cas-light.svg │ ├── docarray-dark.svg │ ├── docarray-light.svg │ ├── docbot.css │ ├── favicon.ico │ ├── favicon.png │ ├── finetuner+_dark.svg │ ├── finetuner+_light.svg │ ├── finetuner-client-journey.svg │ ├── finetuner-dark.svg │ ├── finetuner-light.svg │ ├── finetuner-logo-ani.svg │ ├── hub-dark.svg │ ├── hub-light.svg │ ├── logo-dark.svg │ ├── logo-light.svg │ ├── main.css │ ├── now-dark.svg │ ├── now-light.svg │ ├── search-dark.svg │ └── search-light.svg ├── _templates │ ├── page.html │ ├── sidebar │ │ ├── brand.html │ │ └── navigation.html │ └── template_ft_in_action.md ├── advanced-topics │ ├── advanced-losses-optimizers-and-poolers.md │ ├── budget.md │ ├── finetuner-executor.md │ ├── linear-probe.md │ ├── negative-mining.md │ └── using-callbacks.md ├── api-rst.rst ├── conf.py ├── get-started │ ├── how-it-works.md │ ├── installation.md │ └── pretrained.md ├── html_extra │ └── robots.txt ├── imgs │ ├── DocumentArray_plot_image_sprites.png │ ├── DocumentArray_summary.png │ ├── Document_display.png │ ├── Document_summary.png │ ├── SphereFace-training.png │ ├── batch-sampling.png │ ├── distributions-loss.png │ ├── metric-train.png │ ├── mining.png │ └── tailor.svg ├── index.md ├── make.bat ├── makedoc.sh ├── notebooks │ ├── data_synthesis.ipynb │ ├── data_synthesis.md │ ├── image_to_image.ipynb │ ├── image_to_image.md │ ├── image_to_image_arcface.ipynb │ ├── 
image_to_image_arcface.md │ ├── mesh_to_mesh.ipynb │ ├── mesh_to_mesh.md │ ├── multilingual_text_to_image.ipynb │ ├── multilingual_text_to_image.md │ ├── text_to_image.ipynb │ ├── text_to_image.md │ ├── text_to_text.ipynb │ └── text_to_text.md ├── requirements.txt └── walkthrough │ ├── basic-concepts.md │ ├── choose-backbone.md │ ├── create-training-data.md │ ├── index.md │ ├── inference.md │ ├── login.md │ ├── run-job.md │ └── save-model.md ├── finetuner ├── __init__.py ├── callback.py ├── client │ ├── __init__.py │ ├── base.py │ ├── client.py │ └── session.py ├── console.py ├── constants.py ├── data.py ├── excepts.py ├── experiment.py ├── finetuner.py ├── hubble.py ├── model.py ├── names.py └── run.py ├── pyproject.toml ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── constants.py ├── helper.py ├── integration ├── __init__.py ├── conftest.py ├── test_data.py ├── test_experiments.py ├── test_hf_models.py └── test_runs.py └── unit ├── __init__.py ├── conftest.py ├── mocks.py ├── resources ├── cube.off ├── dummy.csv └── lena.png ├── test___init__.py ├── test_client.py ├── test_data.py ├── test_experiment.py ├── test_finetuner.py ├── test_hubble.py └── test_run.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # ignore ipynb line counts 2 | *.ipynb linguist-documentation -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | area/docs: 2 | - docs/**/* 3 | 4 | area/testing: 5 | - tests/**/* 6 | 7 | area/setup: 8 | - setup.py 9 | - requirements* 10 | - MANIFEST.in 11 | 12 | area/housekeeping: 13 | - .github/**/* 14 | - ./.gitignore 15 | - ./*.yaml 16 | - ./*.yml 17 | 18 | area/cicd: 19 | - .github/workflows/**/* 20 | 21 | area/docker: 22 | - Dockerfiles/**/* 23 | - ./.dockerignore 24 | 25 | area/core: 26 | - finetuner/**/* 27 | 28 | area/entrypoint: 29 | - 
finetuner/__init__.py 30 | 31 | area/client: 32 | - finetuner/client/**/* -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --- 5 | 6 | - [ ] This PR references an open issue 7 | - [ ] I have added a line about this change to CHANGELOG -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | env: 9 | HOST: ${{ secrets.FINETUNER_HOST }} 10 | JINA_AUTH_TOKEN: ${{ secrets.JINA_AUTH_TOKEN }} 11 | 12 | jobs: 13 | 14 | update-docs: 15 | name: Update docs 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: benc-uk/workflow-dispatch@v1 19 | with: 20 | workflow: Manual Docs Build 21 | token: ${{ secrets.JINA_DEV_BOT }} 22 | inputs: '{ "release_token": "${{ env.release_token }}", "triggered_by": "TAG"}' 23 | env: 24 | release_token: ${{ secrets.FINETUNER_RELEASE_TOKEN }} 25 | 26 | release: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Set up Python 31 | uses: actions/setup-python@v2 32 | with: 33 | python-version: 3.8 34 | - name: Release to PyPI 35 | env: 36 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 37 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 38 | JINA_SLACK_WEBHOOK: ${{ secrets.JINA_SLACK_WEBHOOK }} 39 | run: | 40 | pip install twine wheel 41 | python setup.py sdist 42 | twine upload dist/* 43 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push] 4 | 5 | env: 6 | HOST: ${{ secrets.FINETUNER_HOST }} 7 | JINA_AUTH_TOKEN: ${{ secrets.JINA_AUTH_TOKEN }} 8 | HUGGING_FACE_HUB_TOKEN: ${{ 
secrets.HF_HUB_ACCESS_TOKEN }} 9 | 10 | jobs: 11 | 12 | check-codestyle: 13 | name: Check codestyle 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | - name: Install requirements 22 | run: make init 23 | - name: Lint with flake8, black and isort 24 | run: make style 25 | 26 | run-tests: 27 | name: Run tests 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v2 31 | - name: Set up Python 3.8 32 | uses: actions/setup-python@v2 33 | with: 34 | python-version: 3.8 35 | - name: Install requirements 36 | run: make init 37 | - name: Run tests 38 | run: make test 39 | 40 | check-versions: 41 | name: Check Python Versions 42 | runs-on: ubuntu-latest 43 | strategy: 44 | matrix: 45 | version: [3.8, 3.9, '3.10'] 46 | steps: 47 | - uses: actions/checkout@v2 48 | - name: Set up Python version 49 | uses: actions/setup-python@v2 50 | with: 51 | python-version: ${{ matrix.version }} 52 | - name: Test install 53 | run: make install 54 | 55 | # just for blocking the merge until all parallel core tests are successful 56 | success-all-test: 57 | needs: run-tests 58 | if: always() 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: technote-space/workflow-conclusion-action@v2 62 | - name: Check Failure 63 | if: env.WORKFLOW_CONCLUSION == 'failure' 64 | run: exit 1 65 | - name: Success 66 | if: ${{ success() }} 67 | run: echo "All Done" 68 | -------------------------------------------------------------------------------- /.github/workflows/force-docs-build.yml: -------------------------------------------------------------------------------- 1 | name: Manual Docs Build 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release_token: 7 | description: 'Your release token' 8 | required: true 9 | triggered_by: 10 | description: 'CD | TAG | MANUAL' 11 | required: false 12 | default: MANUAL 13 | 14 | jobs: 15 | token-check: 16 | runs-on:
ubuntu-latest 17 | steps: 18 | - run: echo "success!" 19 | # NOTE: the comparison must be the whole expression; quoting interpolated values as a string makes the condition always truthy 20 | if: github.event.inputs.release_token == env.release_token 21 | env: 22 | release_token: ${{ secrets.FINETUNER_RELEASE_TOKEN }} 23 | 24 | release-docs: 25 | needs: token-check 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | with: 30 | fetch-depth: 0 31 | - uses: actions/setup-python@v2 32 | with: 33 | python-version: 3.8 34 | - name: Build doc and push to gh-pages 35 | run: | 36 | git config --local user.email "dev-bot@jina.ai" 37 | git config --local user.name "Jina Dev Bot" 38 | pip install . 39 | mkdir gen-html 40 | cd docs 41 | pip install -r requirements.txt 42 | pip install -U furo 43 | export NUM_RELEASES=5 44 | bash makedoc.sh local-only 45 | make notebook 46 | cd ./build/dirhtml/ 47 | cp -r ./ ../../../gen-html 48 | cd - # back to ./docs 49 | cd .. 50 | git checkout -f gh-pages 51 | git rm -rf ./docs 52 | mkdir -p docs 53 | cd gen-html 54 | cp -r ./ ../docs 55 | cd ../docs 56 | ls -la 57 | touch .nojekyll 58 | cp 404/index.html 404.html 59 | sed -i 's/href="\.\./href="/' 404.html # fix asset urls that needs to be updated in 404.html 60 | echo finetuner.jina.ai > CNAME 61 | cd ..
61 | git status 62 | git add docs && git commit -m "chore(docs): update docs due to ${{github.event_name}} on ${{github.repository}}" 63 | git push --force origin gh-pages -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: PR 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | 8 | assign-label-to-pr: 9 | name: Assign label to PR 10 | runs-on: ubuntu-latest 11 | if: ${{ !github.event.pull_request.head.repo.fork }} 12 | steps: 13 | - uses: codelytv/pr-size-labeler@v1 14 | with: 15 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 16 | xs_max_size: '10' 17 | s_max_size: '100' 18 | m_max_size: '500' 19 | l_max_size: '1000' 20 | fail_if_xl: 'false' 21 | - uses: actions/labeler@v3 22 | with: 23 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 24 | - id: docs_updated 25 | if: contains( github.event.pull_request.labels.*.name, 'area/docs') 26 | run: echo '::set-output name=docs::true' 27 | outputs: 28 | docs: ${{ steps.docs_updated.outputs.docs }} 29 | 30 | deploy-to-netlify: 31 | name: Deploy docs to netlify 32 | runs-on: ubuntu-latest 33 | needs: assign-label-to-pr 34 | if: ${{ needs.assign-label-to-pr.outputs.docs == 'true' }} 35 | steps: 36 | - run: | 37 | echo "BRANCH_NAME=${{ github.head_ref }}" >> $GITHUB_ENV 38 | - uses: actions/checkout@v2 39 | with: 40 | repository: jina-ai/finetuner 41 | ref: ${{ env.BRANCH_NAME }} 42 | - uses: actions/setup-python@v2 43 | with: 44 | python-version: 3.8 45 | - uses: actions/setup-node@v2 46 | with: 47 | node-version: '14' 48 | - name: Build and Deploy 49 | run: | 50 | npm i -g netlify-cli 51 | python -m pip install --upgrade pip 52 | pip install -r requirements.txt 53 | git fetch origin 54 | export NUM_RELEASES=2 # only 2 last tags to save build time 55 | bash makedoc.sh development 56 | netlify deploy --dir=_build/dirhtml --alias="ft-${{ env.BRANCH_NAME }}" --message="Deploying docs to ${{ 
env.BRANCH_NAME }} branch" 57 | env: 58 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 59 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 60 | working-directory: docs 61 | - name: Find the prev comment if exists 62 | uses: peter-evans/find-comment@v1 63 | id: fc 64 | with: 65 | issue-number: ${{ github.event.pull_request.number }} 66 | comment-author: 'github-actions[bot]' 67 | body-includes: 'Docs are deployed' 68 | - name: Delete comment if exists 69 | if: ${{ steps.fc.outputs.comment-id != 0 && !github.event.pull_request.head.repo.fork }} 70 | uses: actions/github-script@v3 71 | with: 72 | github-token: ${{ secrets.GITHUB_TOKEN }} 73 | script: | 74 | github.issues.deleteComment({ 75 | owner: context.repo.owner, 76 | repo: context.repo.repo, 77 | comment_id: ${{ steps.fc.outputs.comment-id }}, 78 | }) 79 | - name: Add or update comment 80 | uses: peter-evans/create-or-update-comment@v1 81 | with: 82 | issue-number: ${{ github.event.pull_request.number }} 83 | body: | 84 | :memo: Docs are deployed on https://ft-${{ env.BRANCH_NAME }}--jina-docs.netlify.app :tada: -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | docs/api/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | docs/.python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | .idea/ 118 | /toy*.py 119 | .DS_Store 120 | post/ 121 | /toy*.ipynb 122 | data/ 123 | *.c 124 | .nes_cache 125 | /toy*.yml 126 | *.tmp 127 | 128 | shell/jina-wizard.sh 129 | /junit/ 130 | /tests/junit/ 131 | /docs/chapters/proto/docs.md 132 | 133 | # IntelliJ IDEA 134 | *.iml 135 | .idea 136 | 137 | # VSCode 138 | .vscode 139 | 140 | # test with config in resources 141 | tests/integration/crud/simple/simple_indexer/ 142 | 143 | # latency tracking 144 | latency 145 | MyIndexer/ 146 | MyMemMap/ 147 | original/ 148 | output/ 149 | 150 | # Logging 151 | /wandb 152 | 153 | # env 154 | .env 155 | test.py -------------------------------------------------------------------------------- 
/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: "23.3.0" 4 | hooks: 5 | - id: black 6 | types: [python] 7 | - repo: https://github.com/pycqa/flake8 8 | rev: "6.0.0" 9 | hooks: 10 | - id: flake8 11 | - repo: https://github.com/pycqa/isort 12 | rev: "5.12.0" 13 | hooks: 14 | - id: isort 15 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | 4 | ## Setup 5 | 6 | ### Install dev requirements 7 | 8 | ```bash 9 | make install-dev 10 | ``` 11 | 12 | ### Install finetuner 13 | 14 | ```bash 15 | make install 16 | ``` 17 | 18 | ### Enable precommit hook 19 | 20 | To automatically ensure formatting with `black`, import sorting with `isort` and linting 21 | with `flake8`, you can install the pre-commit hooks 22 | 23 | ```bash 24 | make pre-commit 25 | ``` 26 | 27 | 28 | ## Making a PR 29 | 30 | ### Open an issue 31 | 32 | Each PR should reference an open issue, and this issue should be linked to your PR. 33 | 34 | ### Running tests locally 35 | 36 | To run tests locally, all you need to do is 37 | 38 | ```bash 39 | make test 40 | ``` 41 | 42 | ### Adding an entry to the changelog 43 | 44 | Make an entry in [CHANGELOG.md](https://github.com/jina-ai/finetuner/blob/main/CHANGELOG.md), 45 | adding it to the `Unreleased` section (and the appropriate subsection), which should contain a 46 | short description of what you have done in the PR, as well as the PR's number, e.g. 47 | 48 | ``` 49 | - Add `NTXentLoss` loss class for supervised learning ([#24](https://github.com/jina-ai/finetuner.fit/pull/24)) 50 | ``` 51 | 52 | To avoid merge conflicts when multiple people are simultaneously working on new features, make sure there 53 | is **an empty line above and below the entry**. 
54 | 55 | ## Update notebooks 56 | 57 | We have three Google Colab notebooks embedded inside the documentation: 58 | 59 | - [text-to-text with bert](https://colab.research.google.com/drive/1Ui3Gw3ZL785I7AuzlHv3I0-jTvFFxJ4_?usp=sharing) 60 | - [image-to-image with resnet](https://colab.research.google.com/drive/1QuUTy3iVR-kTPljkwplKYaJ-NTCgPEc_?usp=sharing) 61 | - [text-to-image with clip](https://colab.research.google.com/drive/1yKnmy2Qotrh3OhgwWRsMWPFwOSAecBxg?usp=sharing) 62 | 63 | To update code in colab: 64 | 65 | 1. Update code in the Google Colab. 66 | 2. Download into `docs/notebooks/` folder. 67 | 3. cd into `docs` folder, run `make notebook` and run `make dirhtml` to see output locally. 68 | 69 | Only members of the team have permission to modify the notebooks. 70 | 71 | ## Releases 72 | 73 | To make a release, follow these steps, in order. 74 | 75 | ### Update CHANGELOG.md 76 | 77 | In `CHANGELOG.md`, rename the top `Unreleased` entry with the version number (`X.Y.Z`), and enter the current date. 78 | 79 | Then, add a new empty `Unreleased` section on top of it - this is where the changes for the next version will accumulate. 80 | 81 | ### Tag the commit on `main` branch 82 | 83 | In your repository, check out the `main` branch, and tag it with the appropriate version - it should match the one in `finetuner/__init__.py`! 84 | If it does not, change it there first. 85 | 86 | To tag the head commit in `main` branch, and then push this to remote, do the following steps 87 | (you can also do this automatically by creating a release on GitHub) 88 | 89 | ```bash 90 | git checkout main 91 | git tag vX.Y.Z 92 | git push --tags 93 | ``` 94 | 95 | At this point the new version is officially released. Any automated actions connected 96 | to the release will have been run.
97 | 98 | ### Change version in `finetuner/__init__.py` 99 | 100 | Since now the `main` branch corresponds to the new development version, we need to change the version 101 | in `finetuner/__init__.py` to reflect that. So you should increment the version in that file. 102 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include LICENSE 3 | prune tests/ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Run: 2 | # make help 3 | # 4 | # for a description of the available targets 5 | 6 | 7 | # ------------------------------------------------------------------------- Help target 8 | 9 | TARGET_MAX_CHAR_NUM=20 10 | GREEN := $(shell tput -Txterm setaf 2) 11 | YELLOW := $(shell tput -Txterm setaf 3) 12 | WHITE := $(shell tput -Txterm setaf 7) 13 | RESET := $(shell tput -Txterm sgr0) 14 | 15 | ## Show this help message 16 | help: 17 | @echo '' 18 | @echo 'Usage:' 19 | @echo ' ${YELLOW}make${RESET} ${GREEN}${RESET}' 20 | @echo '' 21 | @echo 'Targets:' 22 | @awk '/^[a-zA-Z\-\_0-9]+:/ { \ 23 | helpMessage = match(lastLine, /^## (.*)/); \ 24 | if (helpMessage) { \ 25 | helpCommand = substr($$1, 0, index($$1, ":")-1); \ 26 | helpMessage = substr(lastLine, RSTART + 3, RLENGTH); \ 27 | printf " ${YELLOW}%-$(TARGET_MAX_CHAR_NUM)s${RESET} ${GREEN}%s${RESET}\n", helpCommand, helpMessage; \ 28 | } \ 29 | } \ 30 | { lastLine = $$0 }' $(MAKEFILE_LIST) 31 | 32 | 33 | # ------------------------------------------------------------------------ Clean target 34 | 35 | ## Delete temp operational stuff like artifacts, test outputs etc 36 | clean: 37 | rm -rf .mypy_cache/ .pytest_cache/ 38 | rm -f .coverage .coverage.* 39 | rm -rf *.egg-info/ build/ docs/_build/ htmlcov/ 40 | 41 | 42 | # 
--------------------------------------------------------- Environment related targets 43 | 44 | ## Create a virtual environment 45 | env: 46 | python3.8 -m venv .venv 47 | source .venv/bin/activate 48 | pip install -U pip 49 | 50 | ## Install pre-commit hooks 51 | pre-commit: 52 | pip install pre-commit 53 | pre-commit install 54 | 55 | ## Install package requirements 56 | install: 57 | pip install --no-cache-dir -e ".[full]" 58 | rm -rf *.egg-info/ build/ 59 | 60 | ## Install dev requirements 61 | install-dev: 62 | pip install --no-cache-dir -e ".[test]" 63 | rm -rf *.egg-info/ build/ 64 | 65 | ## Install docs requirements 66 | install-docs: 67 | pip install --no-cache-dir -r docs/requirements.txt 68 | 69 | ## Bootstrap dev environment 70 | init: pre-commit install install-dev install-docs 71 | 72 | 73 | # ----------------------------------------------------------------------- Build targets 74 | 75 | ## Build wheel 76 | build: 77 | python setup.py bdist_wheel 78 | rm -rf .eggs/ build/ *egg-info 79 | 80 | ## Build source dist 81 | build-sdist: 82 | python setup.py sdist 83 | rm -rf .eggs/ build/ *egg-info 84 | 85 | 86 | # ---------------------------------------------------------------- Test related targets 87 | 88 | PYTEST_ARGS = --show-capture no --verbose --cov finetuner/ --cov-report term-missing --cov-report html 89 | 90 | ## Run tests 91 | test: 92 | pytest $(PYTEST_ARGS) $(TESTS_PATH) 93 | 94 | 95 | # ---------------------------------------------------------------- Docs related targets 96 | 97 | ## Build docs 98 | build-docs: 99 | cd docs/ && bash makedoc.sh development 100 | 101 | 102 | # ---------------------------------------------------------- Code style related targets 103 | 104 | SRC_CODE = finetuner/ tests/ 105 | 106 | ## Run the flake linter 107 | flake: 108 | flake8 $(SRC_CODE) 109 | 110 | ## Run the black formatter 111 | black: 112 | black $(SRC_CODE) 113 | 114 | ## Dry run the black formatter 115 | black-check: 116 | black --check $(SRC_CODE) 117 | 
118 | ## Run the isort import formatter 119 | isort: 120 | isort $(SRC_CODE) 121 | 122 | ## Dry run the isort import formatter 123 | isort-check: 124 | isort --check $(SRC_CODE) 125 | 126 | ## Run the mypy static type checker 127 | mypy: 128 | mypy $(SRC_CODE) 129 | 130 | ## Format source code 131 | format: black isort 132 | 133 | ## Check code style 134 | style: flake black-check isort-check # mypy 135 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | # https://docs.codecov.io/docs/comparing-commits 3 | allow_coverage_offsets: true 4 | coverage: 5 | status: 6 | project: 7 | default: 8 | informational: true 9 | target: auto # auto compares coverage to the previous base commit 10 | comment: 11 | layout: "reach, diff, flags, files" 12 | behavior: default 13 | require_changes: false # if true: only post the comment if coverage changes 14 | branches: # branch names that can post comment 15 | - "main" 16 | -------------------------------------------------------------------------------- /datasets/credit.txt: -------------------------------------------------------------------------------- 1 | Data for `image-image_example.csv` and `text-image_example.csv` is sourced from the Cross-Market Recommendation dataset. 2 | https://xmrec.github.io/data/de/ 3 | 4 | Data for `text-text_example.csv` is sourced from the Quora Duplicate Questions dataset. 5 | https://www.sbert.net/examples/training/quora_duplicate_questions/README.html?highlight=quora#dataset 6 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | notebook: 23 | jupytext --to markdown notebooks/*.ipynb 24 | -------------------------------------------------------------------------------- /docs/_static/JCloud-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | JCloud-dark 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_static/JCloud-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | JCloud-light 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_static/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/_static/banner.png -------------------------------------------------------------------------------- /docs/_static/cas-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | cas-dark2 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_static/cas-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | cas-dark2 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /docs/_static/docarray-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | docarray-dark 2 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/_static/docarray-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | docarray-light 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/_static/docbot.css: -------------------------------------------------------------------------------- 1 | qa-bot[theme="follow"] { 2 | --qabot-color-shadow: var(--sd-color-shadow); 3 | --qabot-color-background: var(--color-background-primary); 4 | --qabot-color-padding: var(--sd-color-card-header); 5 | 6 | --qabot-color-primary: var(--sd-color-card-text); 7 | --qabot-color-action: var(--color-brand-primary); 8 | --qabot-color-action-contrast: var(--color-code-background); 9 | --qabot-color-dimmed: var(--color-background-border); 10 | --qabot-color-muted: var(--color-foreground-muted); 11 | } 12 | qa-bot:not(:defined) { 13 | display: none; 14 | } -------------------------------------------------------------------------------- /docs/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/_static/favicon.ico -------------------------------------------------------------------------------- /docs/_static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/_static/favicon.png -------------------------------------------------------------------------------- /docs/_static/finetuner+_dark.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | finetuner+_light备份 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/_static/finetuner+_light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | finetuner+_light 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/_static/finetuner-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/_static/finetuner-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/_static/hub-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docs/_static/hub-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docs/_static/main.css: -------------------------------------------------------------------------------- 1 | .sidebar-logo { 2 | max-width: 50%; 3 | } 4 | 5 | 6 | table.docutils { 7 | border: thin; 8 | } 9 | 10 | table.docutils td, table.docutils th { 11 | padding: 1rem 1rem; 12 | } 13 | 14 | .highlight { 15 | background: #f5f5f5; 16 | } 17 | 18 | h1, h2, h3 { 19 | margin-top: 3rem; 20 | } 21 | 22 | .highlight-console .highlight { 23 | background: #00232b !important; 24 | color: whitesmoke; 25 | } 26 | 27 | .highlight-text .highlight { 28 | background: #00232b !important; 29 | color: whitesmoke; 30 | } 31 | 32 | .highlight-json .highlight { 
33 | background: #00232b !important; 34 | color: whitesmoke; 35 | } 36 | 37 | .highlight-shell .highlight { 38 | background: #00232b !important; 39 | color: whitesmoke; 40 | } 41 | 42 | .highlight-bash .highlight { 43 | background: #00232b !important; 44 | color: whitesmoke; 45 | } 46 | 47 | .tab-set > input:checked + label { 48 | border-color: var(--tabs--label-text--active); 49 | } 50 | 51 | .tab-set > input:checked + label:hover { 52 | border-color: var(--tabs--label-text--active); 53 | } 54 | 55 | 56 | table code { 57 | background: var(--color-inline-code-background); 58 | border: 1px solid var(--color-background-border); 59 | border-radius: .2em; 60 | font-size: var(--font-size--small--2); 61 | padding: .1em .2em; 62 | } 63 | 64 | .related-information { 65 | justify-content: space-between; 66 | } 67 | 68 | .social-btn { 69 | margin: 0 .3em; 70 | } 71 | 72 | .social-btn:hover { 73 | opacity: .5; 74 | } 75 | 76 | .social-btns { 77 | display: inline-block; 78 | } 79 | 80 | .announcement { 81 | background-color: var(--color-brand-primary); 82 | color: var(--color-background-primary) !important; 83 | } 84 | 85 | .announcement a { 86 | color: inherit; 87 | text-decoration: none; 88 | } 89 | 90 | .announcement a:hover { 91 | color: inherit; 92 | text-decoration: underline; 93 | } 94 | 95 | .usage-card { 96 | display: none; 97 | } 98 | 99 | .sidebar-ecosys-logo { 100 | width: 1.2em; 101 | margin-right: .5em; 102 | vertical-align: middle 103 | } 104 | 105 | 106 | body[data-theme="dark"] .only-dark-line { 107 | display: inline-block !important; 108 | } 109 | 110 | body[data-theme="dark"] .only-light-line { 111 | display: none !important; 112 | } 113 | 114 | body[data-theme="light"] .only-light-line { 115 | display: inline-block !important; 116 | } 117 | 118 | body[data-theme="light"] .only-dark-line { 119 | display: none !important; 120 | } 121 | 122 | body[data-theme="auto"] .only-light-line { 123 | display: inline-block !important; 124 | } 125 | 126 | 
body[data-theme="auto"] .only-dark-line { 127 | display: none !important; 128 | } 129 | 130 | .version-select { 131 | font-size: .7em; 132 | border-radius: 5px; 133 | cursor: pointer; 134 | background-color: #fff; 135 | background-image: linear-gradient(to top, #f9f9f9, #fff 33%); 136 | border-color: var(--color-background-border); 137 | height: 1.8em; 138 | line-height: 1.8em; 139 | outline: none; 140 | text-align: center; 141 | max-width: 7em; 142 | color: var(--color-foreground-muted); 143 | } -------------------------------------------------------------------------------- /docs/_static/now-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | Now_Yellow 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_static/now-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | Now_Light_PureColor 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_static/search-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/_static/search-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/_templates/sidebar/brand.html: -------------------------------------------------------------------------------- 1 | 19 |
20 | Star 21 | {% if versions %} 22 | 40 | {% endif %} 41 |
-------------------------------------------------------------------------------- /docs/_templates/sidebar/navigation.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_templates/template_ft_in_action.md: -------------------------------------------------------------------------------- 1 | # Finetuner in Action Template 2 | This is a template for the documentation guides of Finetuner in action, with the general structure and layout to be used for demonstrating how Finetuner can be applied to solve different tasks. 3 | 4 | ```{admonition} See Also: Jina Contribution Guidelines 5 | :class: seealso 6 | For more info on best practices for documentation, see Jina's [contribution guidelines](https://github.com/jina-ai/jina/blob/master/CONTRIBUTING.md#-contributing-documentation) 7 | ``` 8 | 9 | ## Task overview 10 | Describe the task which this guide accomplishes, including which model will be fine-tuned and which dataset you will use. 11 | 12 | Also provide a brief description of what the task entails, what the dataset looks like and a high-level description of how the dataset is processed. 13 | 14 | 15 | ## Preparing data 16 | Outline where the data can be found, artifact names in Jina AI Cloud or if relevant, how a user might load their own custom data. 17 | Add a link to supplementary dataset info, for example as a `See Also` {admonition}. 18 | If you are outlining how to preprocess a dataset from scratch, use {dropdown} to hide long code snippets. 19 | 20 | 21 | ## Choosing the model 22 | Mention which model will be used in your fine-tuning task. Feel free to add a `See Also` {admonition} for supplementary info on the model, perhaps a relevant paper or site. 23 | 24 | You can also add a `Tip` {admonition} for how the user can view all available models, also referring to the `Choose backbone model` documentation. 
25 | 26 | 27 | ## Creating a fine-tuning run 28 | Show the user how to create a fine-tuning run, then explain why your example run has particular parameters and what they do. Also mention which parameters are optional or required. 29 | Provide a more detailed explanation of parameters that are important for your particular experiment. 30 | 31 | Example: 32 | 33 | ```python 34 | run = finetuner.fit( 35 | ... 36 | ) 37 | ``` 38 | "Let's understand what this piece of code does ..." 39 | 40 | 41 | ## Monitoring your runs 42 | 43 | Also show the user how they can monitor their run, and reconnect to it if they were disconnected. 44 | 45 | Example: 46 | 47 | "Now that we've created a run, let's see its status. You can monitor the run by checking the status - `run.status()` or the logs - `run.logs()`. " 48 | ```python 49 | print(run.status()) 50 | ``` 51 | 52 | ```bash 53 | {'status': 'CREATED', 'details': 'Run submitted and awaits execution'} 54 | ``` 55 | 56 | "Since some runs might take up to several hours/days, you can reconnect to your run very easily to monitor its status and logs." 57 | ```python 58 | import finetuner 59 | finetuner.login() 60 | run = finetuner.get_run('my_run') 61 | ``` 62 | 63 | ## Saving your model 64 | Show the user how to save their model when fine-tuning has completed. 65 | 66 | Example: 67 | 68 | "If your run has finished successfully, you can save fine-tuned models in the following way:" 69 | ```python 70 | run.save_artifact('my_model') 71 | ``` 72 | 73 | ## Evaluating your model 74 | Explain to the user how they can track the performance of the model(s) they have fine-tuned in their runs. If this is not implemented yet, show the user an example log and how they might deduce model performance from this log. 
-------------------------------------------------------------------------------- /docs/advanced-topics/budget.md: -------------------------------------------------------------------------------- 1 | (budget)= 2 | # {octicon}`database` How much data? 3 | 4 | ```{admonition} Read full blog 5 | :class: hint 6 | Please checkout [Fine-tuning with Low Budget and High Expectations](https://jina.ai/news/fine-tuning-with-low-budget-and-high-expectations/) 7 | to read the full tech blog. 8 | ``` 9 | 10 | Fine-tuning takes a pre-trained model, 11 | trained on a related task, and then further trains it for a new task. 12 | Alternately, it can mean taking a model pre-trained for an open domain task, and further training it for a domain-specific one. 13 | Compared to training from scratch, fine-tuning is a much more cost-efficient solution whenever it is feasible. But: 14 | 15 | + Exactly how much **data** do you need to get a good result? 16 | + Exactly how much **time** do you need to get good results? 17 | 18 | ## Experiments 19 | 20 | We designed two experiments to quantitatively study how labeled data and training time affect fine-tuning performance. 21 | For each experiment, we constructed three search tasks by fine-tuning three models. 22 | We chose seven datasets, two of which are non-domain-specific public datasets, to ensure the generality of our experiment. 23 | 24 | We measured the performance of the fine-tuned models by evaluating their ability to perform search tasks, as measured by Mean Reciprocal Rank (mRR), Recall, and Mean Average Precision (mAP). 25 | These metrics are calculated using the top 20 results of each search in the validation subset held out from each dataset. 26 | 27 | ### How much labeled data is needed? 28 | 29 | We gradually increase the amount of labeled data fed to Finetuner from 100 items to 100,000 and see how this affects performance on the metrics described in the previous section. 
30 | 31 | In the figures below, the X-axis represents the amount of labeled data, and the Y-axis represents the relative improvement over the pre-trained model. The higher, the better. 32 | 33 | ... | ... 34 | :-------------------------:|:-------------------------: 35 | ![text-text-quora](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-QuoraQA--3-.svg) | ![text-text-clinc](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-Clinc150--3-.svg) 36 | ![image-image-tll](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Totally-looks-like.svg) | ![image-image-celeba](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Celeba--4-.svg) 37 | ![image-image-flickr30k](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-Flickr30K--5-.svg) | ![image-image-coco](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-CoCoCaptions--4-.svg) 38 | 39 | These results are promising but not particularly surprising. 40 | Performance improves with more labeled data on nearly all tasks and all datasets, more for some tasks and datasets than for others. 41 | However, the only conclusion we can draw from these figures is that the Finetuner works as advertised. So far so good. 42 | 43 | We further calculate the return on investment (ROI), 44 | by dividing the relative improvement (a proxy for net profit) by the amount of labeled data (a proxy for investment cost). 45 | **This is useful because it indicates the point at which adding more data is producing diminishing returns.** 46 | 47 | In the figures below, the X-axis represents the amount of labeled data, and the Y-axis represents the ROI per labeled data item. The higher, the better. 48 | In particular, `ROI=0` means adding new labeled data at that point no longer contributes to any improvement. 49 | 50 | ... | ... 
51 | :-------------------------:|:-------------------------: 52 | ![text-text-quora](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-QuoraQA--7-.svg) | ![text-text-clinc](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-Clinc150--7-.svg) 53 | ![image-image-tll](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Totally-looks-like--1-.svg) | ![image-image-celeba](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Celeba--5-.svg) 54 | ![image-image-flickr30k](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-Flickr30K--6-.svg) | ![image-image-coco](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-CoCoCaptions--5-.svg) 55 | 56 | Surprisingly, we can see that the ROI per unit of new labeled data starts to drop almost immediately. We expected that it would eventually decrease, but this is an unexpected result. 57 | 58 | ### How much time is needed? 59 | 60 | To measure the value of added training time, we fixed the amount of new labeled data to 1000 items, and then we gradually increased the number of training epochs from 1 to 10. 61 | At each increase, we measure improvement over the pre-trained model and calculate the ROI. 62 | For these experiments, the ROI is calculated by dividing the relative improvement by the elapsed time in seconds. 63 | This means that when `ROI=0`, adding training time no longer improves performance. 64 | 65 | ... | ... 
66 | :-------------------------:|:-------------------------: 67 | ![text-text-quora](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-QuoraQA--4-.svg) | ![text-text-clinc](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-Clinc150--4-.svg) 68 | ![image-image-tll](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Totally-look-like--2-.svg) | ![image-image-celeba](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Celeba--2-.svg) 69 | ![image-image-flickr30k](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-Flickr30K--3-.svg) | ![image-image-coco](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-CocoCaptions--2-.svg) 70 | 71 | We knew in advance that adding more time does not guarantee any improvement at all. 72 | It can, in fact, reduce performance due to the overfitting problem. 73 | Some models (e.g. CLIP) are more prone to overfitting than others. 74 | In principle, if we keep training with the same 1000 data points over and over, we are guaranteed to overfit on the data and the overall performance will drop. 75 | 76 | Let's look at the ROI curves. 77 | 78 | ... | ... 
79 | :-------------------------:|:-------------------------: 80 | ![text-text-quora](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-QuoraQA--5-.svg) | ![text-text-clinc](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-text-search-on-Clinc150--9-.svg) 81 | ![image-image-tll](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Totally-look-like--3-.svg) | ![image-image-celeba](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Image-to-image-search-on-Celeba--3-.svg) 82 | ![image-image-flickr30k](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-Flickr30K--4-.svg) | ![image-image-coco](https://jina-ai-gmbh.ghost.io/content/images/2022/12/Text-to-image-search-on-CocoCaptions--3-.svg) 83 | 84 | The ROI drops immediately after the first epoch of fine-tuning. 85 | Unlike in the last experiment, where ROI approached zero but stayed positive when increasing the number of epochs, here, the ROI on added time can go negative due to the overfitting problem! 86 | 87 | ## Summary 88 | 89 | What does this mean for users looking to maximize gains and minimize costs? 90 | 91 | + Many state-of-the-art deep neural networks are capable of few-shot learning. They are quick learners and can make large improvements with only a few hundred items of labeled data and only a few minutes of training time. You might have thought that deep neural network training requires millions of data items and a week of runtime, but we have shown in these examples how that stereotype does not hold up to reality. 92 | + Because they can learn so much, so fast, from so little data, ROI drops quickly as you put more time and data into fine-tuning. In the experiments above, ROI shrinks by 70% from its highest value after 500 labeled data items or 600 added seconds of GPU training time. 
Further investment beyond a few hundred items of training data and very minimal training time may not pay off as well as you would like. -------------------------------------------------------------------------------- /docs/advanced-topics/finetuner-executor.md: -------------------------------------------------------------------------------- 1 | (finetuner-executor)= 2 | # {octicon}`gear` Use FinetunerExecutor inside a Jina Flow 3 | 4 | Finetuner, being part of the Jina AI Cloud, provides a convenient way to use tuned models via [Jina Executors](https://docs.jina.ai/fundamentals/executor/). 5 | 6 | We've created the [`FinetunerExecutor`](https://cloud.jina.ai/executor/13dzxycc) which can be added in a [Jina Flow](https://docs.jina.ai/fundamentals/flow/) and load any tuned model. 7 | More specifically, the executor exposes an `/encode` endpoint that embeds [Documents](https://finetuner.jina.ai/walkthrough/create-training-data/#preparing-a-documentarray) using the fine-tuned model. 8 | 9 | Loading a tuned model is simple! You just need to provide a few parameters under the `uses_with` argument when adding the `FinetunerExecutor` to the [Flow](https://docs.jina.ai/fundamentals/flow/). 10 | You have three options: 11 | 12 | ````{tab} Artifact id and token 13 | ```python 14 | import finetuner 15 | from jina import Flow 16 | 17 | finetuner.login() 18 | 19 | token = finetuner.get_token() 20 | run = finetuner.get_run( 21 | experiment_name='YOUR-EXPERIMENT', 22 | run_name='YOUR-RUN' 23 | ) 24 | 25 | f = Flow().add( 26 | uses='jinahub+docker://FinetunerExecutor/latest', # use latest-gpu for gpu executor. 27 | uses_with={'artifact': run.artifact_id, 'token': token}, 28 | ) 29 | ``` 30 | ```` 31 | ````{tab} Locally saved artifact 32 | ```python 33 | from jina import Flow 34 | 35 | f = Flow().add( 36 | uses='jinahub+docker://FinetunerExecutor/latest', # use latest-gpu for gpu executor. 
37 | uses_with={'artifact': '/mnt/YOUR-MODEL.zip'}, 38 | volumes=['/your/local/path/:/mnt'] # mount your model path to docker. 39 | ) 40 | ``` 41 | ```` 42 | ````{tab} YAML 43 | ```yaml 44 | jtype: Flow 45 | with: 46 | port: 51000 47 | protocol: grpc 48 | executors: 49 | uses: jinahub+docker://FinetunerExecutor/latest 50 | with: 51 | artifact: 'COPY-YOUR-ARTIFACT-ID-HERE' 52 | token: 'COPY-YOUR-TOKEN-HERE' # or better set as env 53 | ``` 54 | ```` 55 | 56 | As you can see, it's super easy! 57 | If you did not call {func}`~finetuner.run.Run.save_artifact`, 58 | you need to provide the `artifact_id` and `token`. 59 | `FinetunerExecutor` will automatically pull your model from the Jina AI Cloud to the container. 60 | 61 | On the other hand, 62 | if you have saved artifact locally, 63 | please mount the zipped artifact to the docker container. 64 | `FinetunerExecutor` will unzip the artifact and load models. 65 | 66 | You can start your flow with: 67 | 68 | ```python 69 | with f: 70 | # in this example, we fine-tuned a BERT model and embed a Document.. 71 | returned_docs = f.post( 72 | on='/encode', 73 | inputs=DocumentArray( 74 | [ 75 | Document( 76 | text='some text to encode' 77 | ) 78 | ] 79 | ) 80 | ) 81 | 82 | for doc in returned_docs: 83 | print(f'Text of the returned document: {doc.text}') 84 | print(f'Shape of the embedding: {doc.embedding.shape}') 85 | ``` 86 | 87 | ```console 88 | Text of the returned document: some text to encode 89 | Shape of the embedding: (768,) 90 | ``` 91 | 92 | In order to see what other options you can specify when initializing the executor, please go to the [`FinetunerExecutor`](https://cloud.jina.ai/executor/13dzxycc) page and click on `Arguments` on the top-right side. 93 | 94 | ```{admonition} FinetunerExecutor parameters 95 | :class: tip 96 | The only required argument is `artifact`. We provide default values for others. 
97 | ``` 98 | 99 | ## Special case: Artifacts with CLIP models 100 | If your fine-tuning job was executed on a CLIP model, your artifact contains two 101 | models: `clip-vision` and `clip-text`. 102 | The vision model allows you to embed images and the text model can encode text passages 103 | into the same vector space. 104 | To use those models, you have to provide the name of the model via an additional 105 | `select_model` parameter to the {func}`~finetuner.get_model` function. 106 | 107 | If you want to host the CLIP models, you also have to provide the name of the model via the 108 | `select_model` parameter inside the `uses_with` attribute: 109 | 110 | ```python 111 | import finetuner 112 | from jina import Flow 113 | 114 | finetuner.login() 115 | 116 | token = finetuner.get_token() 117 | run = finetuner.get_run( 118 | experiment_name='YOUR-EXPERIMENT', 119 | run_name='YOUR-RUN' 120 | ) 121 | 122 | f = Flow().add( 123 | uses='jinahub+docker://FinetunerExecutor/latest', # use latest-gpu for gpu executor. 124 | uses_with={ 125 | 'artifact': run.artifact_id, 'token': token, 'select_model': 'clip-vision' 126 | }, 127 | ) 128 | 129 | ``` 130 | 131 | -------------------------------------------------------------------------------- /docs/advanced-topics/linear-probe.md: -------------------------------------------------------------------------------- 1 | (projection-head)= 2 | # {octicon}`pin` Projection Head 3 | 4 | ## Why freezing? 5 | 6 | Depending on your task and the amount of training data, 7 | it is not always necessary to tune the entire model. 8 | In some cases, 9 | freezing some of the weights of the pre-trained model and just fine-tuning specific layers produces comparable or better results. 10 | Furthermore, freezing weights can reduce the training time dramatically. 11 | 12 | Finetuner allows you to fine-tune a Linear Projection Head easily. 13 | 14 | ```{warning} 15 | Currently, we only allow you to freeze layers for image-to-image search tasks. 
16 | These models are built on top of Convolutional Neural Networks (CNNs). 17 | 18 | For transformer architectures, 19 | we can only fine-tune the entire neural network. 20 | If you need to freeze weights for transformers, consider submitting a feature request in our [Github Issues page](https://github.com/jina-ai/finetuner/issues) 21 | ``` 22 | 23 | ```{admonition} Dimensionality reduction 24 | :class: hint 25 | Use a smaller `output_dim` to get compact embeddings. 26 | ``` 27 | 28 | ## How? 29 | 30 | Finetuner has a built-in module called Tailor. 31 | Given a general model written in PyTorch, 32 | Tailor performs the micro-operations on the model architecture required for fine-tuning and outputs an embedding model. 33 | 34 | Given a general model with weights, Tailor performs some or all of the following steps: 35 | 36 | + Iterating over all layers to find dense layers. 37 | + Chopping off all layers after a certain dense layer. 38 | + Freezing weights on specific layers. 39 | + Adding new layers on top of the model. 40 | 41 | ![tailor](../imgs/tailor.svg) 42 | 43 | For example, just using the arguments `freeze=True` and `output_dim=X` with the `fit` function, as shown below: 44 | 45 | ```diff 46 | run = finetuner.fit( 47 | model='resnet50', 48 | ..., 49 | + freeze=True, 50 | + output_dim=1024, # default output_dim of ResNet50 is 2048. 51 | ..., 52 | ) 53 | ``` 54 | 55 | Finetuner will: 56 | 57 | 1. Remove the classification head of a `ResNet` model, and convert it into an embedding model. 58 | 2. Freeze all layers of the embedding model. 59 | 3. Attach a trainable 3-layer Linear Projection Head on top of the embedding model with an `output_dim=1024`. 60 | 61 | ```{warning} 62 | Keep in mind that whenever you use `freeze=True`, always set `output_dim`. 63 | Otherwise, nothing can be tuned since all layers are frozen. 
64 | ``` 65 | 66 | ## Summary 67 | 68 | If you want to achieve efficient fine-tuning without retraining the entire model, 69 | tuning a Linear Projection Head could be a good solution. -------------------------------------------------------------------------------- /docs/advanced-topics/negative-mining.md: -------------------------------------------------------------------------------- 1 | (negative-mining)= 2 | # {octicon}`telescope` Negative Mining 3 | 4 | Negative Mining is an advanced machine learning technique, which optimizes the way data is sampled from your training dataset. 5 | Usually, it aims at making the metric learning tasks for the model harder during the training. 6 | In this way, it can lead to better fine-tuning results. 7 | 8 | ## Context: Deep Metric Learning 9 | 10 | First, let's take a look at how we construct the training data for metric learning tasks. 11 | 12 | Metric Learning algorithms attempt to teach neural network models to tell 13 | which objects are semantically/visually similar and which ones are not. 14 | 15 | For uni-modal fine-tuning tasks such as text-to-text, image-to-image, or mesh-to-mesh, 16 | Finetuner constructs training data in the following way: 17 | 18 | ![batch-sample](../imgs/batch-sampling.png) 19 | 20 | Assume we have a list of Documents belonging to four classes: `1`, `2`, `3`, and `4`, 21 | Finetuner will evenly sample *X* items per class to make a batch *B* which is encoded by the model into a set of embeddings. 22 | 23 | Afterward, the loss is calculated based on the relations between the embeddings. 24 | Many of Finetuner's loss functions contrast the embeddings of three items, or a __Triplet__. 
25 | Finetuner creates all possible triplets *(anchor, pos, neg)* from this batch which satisfy the following conditions: 26 | For each triplet, the first is the __anchor__, the second is an embedding that ought to be closer to the embedding of the anchor (has the same label), and the third is one that should be further from the anchor (has a different label). 27 | The objective is to pull the embeddings of items that belong to the same class closer together in the embedding space, 28 | while pushing the embeddings of items which belong to different classes farther away from each other. 29 | 30 | ![training](../imgs/metric-train.png) 31 | 32 | 33 | ## The Triplet Margin Miner 34 | 35 | For some triplets, the pre-trained model already performs well, i.e. 36 | 37 | the distance between the `anchor` embedding and `pos` is much smaller than 38 | the distance between `anchor` and `neg`. 39 | These triplets do not contribute to improving the model, since they are already in the desired relation to each other in the embedding space. 40 | A more effective way is to use only a subset of all triplets for model training. We call this subset the **hard** or **semi-hard negative samples**. 41 | 42 | ![mining](../imgs/mining.png) 43 | 44 | Let's say `1₀` is an `anchor`, `1₁` is the `pos` while `2₄` is the `neg`, and `D(x,y)` is the distance between the embeddings of `x` and `y`. 45 | 46 | If: 47 | 48 | + `D(anchor, neg) < D(anchor, pos) `, then `neg` can be considered as a "hard negative" (`2₄ - H`). 49 | + `D(anchor, pos) < D(anchor, neg) < D(anchor, pos) + margin`, where `neg` is a little further from the `pos`, but within the margin, then `neg` can be considered as a "semi-hard negative" (`2₄ - S`). 50 | + `D(anchor, neg) > D(anchor, pos) + margin`, then `neg` can be considered as "easy negative" (`2₄ - E`). 51 | 52 | Training is more effective when using only **hard** and **semi-hard** negatives, given a reasonable margin value to distinguish them from **easy** triplets. 
53 | 54 | ## Doing Negative Mining in Finetuner 55 | 56 | Finetuner is compatible with the miners provided by the [PyTorch Metric Learning](https://kevinmusgrave.github.io/pytorch-metric-learning) framework. 57 | To select a specific miner, pass its name to the `fit` function, e.g., `AngularMiner`, `TripletMarginMiner`, ... 58 | 59 | Please note that the miner has to be compatible with the loss function you selected. 60 | For instance, if you choose to train a model with the `TripletMarginLoss`, you can use the `TripletMarginMiner`. 61 | While without this miner, all possible triples with an anchor, a positive, and a negative candidate are used to calculate the loss, the miner reduces this set of triples. 62 | By default, the miner only selects triples with hard negatives where the distance between the positive and the negative example is inside a margin of `0.2`. 63 | To pass additional parameters to configure the miner, use the `miner_options` parameter of the fit function. 64 | For example, add the following to use only hard-negative triplets and set the margin to `0.3`: 65 | 66 | ```diff 67 | run = finetuner.fit( 68 | ..., 69 | loss='TripletMarginLoss', 70 | + miner='TripletMarginMiner', 71 | + miner_options={'margin': 0.3, 'type_of_triplets': 'hard'} 72 | ) 73 | ``` 74 | 75 | Possible choices for `type_of_triplets` are: 76 | 77 | + `easy`: Use all easy triplets - all triplets that do not violate the margin. 78 | + `semihard`: Use semi-hard triplets, but not hard triplets, i.e. those where difference in distance is within the specified margin. 79 | + `hard`: Use only hard triplets - the negative is closer to the anchor than the positive. 80 | + `all`: Use `hard` and `semihard` triples - all but the `easy` triples. 81 | 82 | Finetuner takes `TripletMarginLoss` as its default loss function with no negative mining. 
83 | For a detailed description of the miners and their parameters, see the [PyTorch Metric Learning documentation](https://kevinmusgrave.github.io/pytorch-metric-learning/miners/). 84 | 85 | ## Summary 86 | 87 | Metric Learning and triplets are extremely useful for fine-tuning models for similarity search. 88 | Easy triplets have little impact on improving the model. 89 | Consider using semi-hard/hard triplets for model tuning. -------------------------------------------------------------------------------- /docs/api-rst.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | :fab:`python` Python API 3 | ====================== 4 | 5 | This section includes the API documentation from the `Finetuner` codebase, as extracted from the `docstrings `_ in the code. 6 | 7 | :mod:`finetuner.__init__` - Finetuner 8 | -------------------- 9 | 10 | .. currentmodule:: finetuner.__init__ 11 | 12 | .. autosummary:: 13 | :nosignatures: 14 | :template: class.rst 15 | 16 | finetuner.login 17 | finetuner.describe_models 18 | finetuner.fit 19 | finetuner.list_callbacks 20 | finetuner.get_run 21 | finetuner.get_experiment 22 | finetuner.get_token 23 | finetuner.build_model 24 | finetuner.get_model 25 | finetuner.encode 26 | finetuner.list_runs 27 | finetuner.delete_run 28 | finetuner.delete_runs 29 | finetuner.create_experiment 30 | finetuner.list_experiments 31 | finetuner.delete_experiment 32 | finetuner.delete_experiments 33 | 34 | :mod:`finetuner.run.Run` - Run 35 | -------------------- 36 | 37 | .. currentmodule:: finetuner.run.Run 38 | 39 | .. 
autosummary:: 40 | :nosignatures: 41 | :template: class.rst 42 | 43 | finetuner.run.Run.name 44 | finetuner.run.Run.config 45 | finetuner.run.Run.status 46 | finetuner.run.Run.logs 47 | finetuner.run.Run.stream_logs 48 | finetuner.run.Run.save_artifact 49 | finetuner.run.Run.artifact_id 50 | 51 | 52 | :mod:`finetuner.experiment.Experiment` - Experiment 53 | -------------------- 54 | 55 | .. currentmodule:: finetuner.experiment.Experiment 56 | 57 | .. autosummary:: 58 | :nosignatures: 59 | :template: class.rst 60 | 61 | finetuner.experiment.Experiment.name 62 | finetuner.experiment.Experiment.create_run 63 | finetuner.experiment.Experiment.get_run 64 | finetuner.experiment.Experiment.list_runs 65 | finetuner.experiment.Experiment.delete_run 66 | finetuner.experiment.Experiment.delete_runs 67 | 68 | -------------------------------------------------------------------------------- /docs/get-started/how-it-works.md: -------------------------------------------------------------------------------- 1 | # {octicon}`question` How Does it Work? 2 | 3 | Finetuner is a framework for using the contrastive learning approach to improve similarity matching with models that encode data into embeddings. 4 | This involves three steps: 5 | 6 | ## Step 1: Build an embedding model 7 | 8 | Finetuner takes an existing, pre-trained model, typically called the __backbone__, and analyzes its architecture. 9 | If this model does not already produce embeddings or the architecture is not suitable for training, Finetuner is able to remove the default *head* (the last layers of the network), add new projection layers, apply *pooling*, and freeze layers that do not need to be trained. 10 | 11 | For instance, Finetuner will turn an image classification model, e.g., for separating cats from dogs, into an *embedding model* 12 | by removing its last layer - the classification head (cat-dog classifier). 
13 | 14 | This embedding model does not make predictions or output a probability, 15 | but instead outputs a feature vector (an __embedding__) that represents its input. 16 | 17 | ## Step 2: Tuple/Triplet construction 18 | 19 | ````{tab} Uni-modal (with label) 20 | Finetuner works on labeled data. 21 | It expects either a CSV file or a {class}`~docarray.array.document.DocumentArray` consisting of {class}`~docarray.document.Document`s where each one contains `finetuner_label` corresponding to the class of a specific training example. After receiving a CSV file, its contents are parsed and a {class}`~docarray.array.document.DocumentArray` is constructed. 22 | 23 | During the fine-tuning, Finetuner creates Triplets `(anchor, positive, negative)` on-the-fly. 24 | For each anchor, 25 | which can be any training example, 26 | Finetuner looks for a `Document` with the same `finetuner_label` (positive), 27 | and a `Document` with a different `finetuner_label` (negative). 28 | The objective is to pull `Document`s which belong to the same class together, 29 | while pushing the `Document`s which belong to a different class away from each other. 30 | ```` 31 | ````{tab} Cross-modal (without label) 32 | Finetuner works on unlabeled text-image pairs. 33 | You can fine-tune a CLIP-like model for text to images search directly without any labels. 34 | It expects either a CSV file or a {class}`~docarray.array.document.DocumentArray` consisting a list of {class}`~docarray.array.document.Document` that contain two chunks: an image chunk and a text chunk. 35 | 36 | During fine-tuning, Finetuner leverages text-image pairs and jointly optimizes two models (`CLIPTextEncoder` and `CLIPImageEncoder`) with respect to two classification losses: (1) given a text, find the best matching 37 | image and (2) given an image, find the best matching text. Then it aggregates the two losses into the `CLIPLoss`. 
38 | At the end, the output embedding of your data from the `CLIPTextEncoder` is comparable to `CLIPImageEncoder`. 39 | ```` 40 | 41 | ## Step 3: Tuning in the cloud 42 | 43 | From an operational perspective, 44 | we have hidden all the complexity of machine learning algorithms and resource configuration (such as GPUs). 45 | All you need to do is decide on your backbone model and prepare your training data. 46 | 47 | Once you have logged in to the Jina Ecosystem with {meth}`~finetuner.login()`, 48 | Finetuner will push your training data into the *Jina AI Cloud* (only visible to you). 49 | At the same time, we will spin-up an isolated computational resource 50 | with proper memory, CPU, and a GPU dedicated to your fine-tuning job. 51 | 52 | Once fine-tuning is done, Finetuner will push your fine-tuned model to the *Jina AI Cloud* 53 | and make it available for you to download. 54 | That's it! 55 | 56 | On the other hand, 57 | if you have a certain level of machine learning knowledge, 58 | Finetuner gives you enough flexibility to adjust the training parameters. 59 | This will be explained in a later section. 60 | -------------------------------------------------------------------------------- /docs/get-started/installation.md: -------------------------------------------------------------------------------- 1 | (install-finetuner)= 2 | # {octicon}`desktop-download` Installation 3 | 4 | ![PyPI](https://img.shields.io/pypi/v/finetuner?color=%23ffffff&label=%20) is the latest version. 
5 | 6 | Make sure you have `Python 3.8+` installed on Linux/Mac/Windows: 7 | 8 | ```bash 9 | pip install -U finetuner 10 | ``` 11 | 12 | If you want to submit a fine-tuning job on the cloud, please use: 13 | 14 | ```bash 15 | pip install "finetuner[full]" 16 | ``` 17 | 18 | To check your installation run: 19 | ```bash 20 | pip show finetuner 21 | ``` -------------------------------------------------------------------------------- /docs/get-started/pretrained.md: -------------------------------------------------------------------------------- 1 | (pretrained-models)= 2 | # {octicon}`rocket` Jina Embeddings 3 | 4 | Starting with Finetuner 0.8.0, 5 | we have introduced a suite of pre-trained text embedding models licensed under Apache 2.0. 6 | These models have a variety of use cases, including information retrieval, semantic textual similarity, text reranking, and more. 7 | The suite consists of the following models: 8 | 9 | - `jina-embedding-t-en-v1` [**[Huggingface](https://huggingface.co/jinaai/jina-embedding-t-en-v1)**]: The fastest embedding model in the world with 14 million parameters. 10 | - `jina-embedding-s-en-v1` [**[Huggingface](https://huggingface.co/jinaai/jina-embedding-s-en-v1)**]: This is a compact model with just 35 million parameters, that performs lightning-fast inference while delivering impressive performance. 11 | - `jina-embedding-b-en-v1` [**[Huggingface](https://huggingface.co/jinaai/jina-embedding-b-en-v1)**]: This model has a size of 110 million parameters, performs fast inference and delivers better performance than our smaller model. 12 | - `jina-embedding-l-en-v1` [**[Huggingface](https://huggingface.co/jinaai/jina-embedding-l-en-v1)**]: This is a relatively large model with a size of 330 million parameters, that performs single-gpu inference and delivers better performance than the other models. 
13 | 14 | ## Usage 15 | 16 | ```python 17 | import finetuner 18 | 19 | model = finetuner.build_model('jinaai/jina-embedding-s-en-v1') 20 | embeddings = finetuner.encode( 21 | model=model, 22 | data=['how is the weather today', 'What is the current weather like today?'] 23 | ) 24 | print(finetuner.cos_sim(embeddings[0], embeddings[1])) 25 | ``` 26 | 27 | ## Training Data 28 | 29 | Jina Embeddings is a suite of language models that have been trained using Jina AI's Linnaeus-Clean dataset. 30 | This dataset consists of 380 million query-document pairs of sentences. 31 | These pairs were obtained from various domains and were carefully selected through a thorough cleaning process. 32 | The Linnaeus-Full dataset, from which the Linnaeus-Clean dataset is derived, originally contained 1.6 billion sentence pairs. 33 | 34 | ## Characteristics 35 | 36 | Each Jina embedding model can encode up to 512 tokens, 37 | with any further tokens being truncated. 38 | The models have different output dimensionalities, as shown in the table below: 39 | 40 | | Name | param |context| Dimension | 41 | |------------------------|-------|------|-----------| 42 | | jina-embedding-t-en-v1 | 14m |512| 312 | 43 | | jina-embedding-s-en-v1 | 35m |512| 512 | 44 | | jina-embedding-b-en-v1 | 110m |512| 768 | 45 | | jina-embedding-l-en-v1 | 330m |512| 1024 | 46 | 47 | ## Performance 48 | 49 | Please refer to the [Huggingface](https://huggingface.co/jinaai/jina-embedding-s-en-v1) page. 
50 | 51 | ## Citations 52 | 53 | If you find Jina Embeddings useful in your research, please cite the following paper: 54 | 55 | ```text 56 | @misc{günther2023jina, 57 | title={Jina Embeddings: A Novel Set of High-Performance Sentence Embedding Models}, 58 | author={Michael Günther and Louis Milliken and Jonathan Geuter and Georgios Mastrapas and Bo Wang and Han Xiao}, 59 | year={2023}, 60 | eprint={2307.11224}, 61 | archivePrefix={arXiv}, 62 | primaryClass={cs.CL} 63 | } 64 | 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/html_extra/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | sitemap: https://finetuner.jina.ai/sitemap.xml -------------------------------------------------------------------------------- /docs/imgs/DocumentArray_plot_image_sprites.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/DocumentArray_plot_image_sprites.png -------------------------------------------------------------------------------- /docs/imgs/DocumentArray_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/DocumentArray_summary.png -------------------------------------------------------------------------------- /docs/imgs/Document_display.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/Document_display.png -------------------------------------------------------------------------------- /docs/imgs/Document_summary.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/Document_summary.png -------------------------------------------------------------------------------- /docs/imgs/SphereFace-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/SphereFace-training.png -------------------------------------------------------------------------------- /docs/imgs/batch-sampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/batch-sampling.png -------------------------------------------------------------------------------- /docs/imgs/distributions-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/distributions-loss.png -------------------------------------------------------------------------------- /docs/imgs/metric-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/metric-train.png -------------------------------------------------------------------------------- /docs/imgs/mining.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/docs/imgs/mining.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to Finetuner! 
2 | 3 | ```{include} ../README.md 4 | :start-after: 5 | :end-before: 6 | ``` 7 | 8 | ```{include} ../README.md 9 | :start-after: 10 | :end-before: 11 | ``` 12 | 13 | ```{include} ../README.md 14 | :start-after: 15 | :end-before: 16 | ``` 17 | 18 | ```{include} ../README.md 19 | :start-after: 20 | :end-before: 21 | ``` 22 | 23 | ```{toctree} 24 | :caption: Get Started 25 | :hidden: 26 | 27 | get-started/how-it-works 28 | get-started/installation 29 | get-started/pretrained 30 | walkthrough/index 31 | ``` 32 | 33 | ```{toctree} 34 | :caption: Advanced Topics 35 | :hidden: 36 | 37 | advanced-topics/budget 38 | advanced-topics/negative-mining 39 | advanced-topics/using-callbacks 40 | advanced-topics/linear-probe 41 | advanced-topics/advanced-losses-optimizers-and-poolers 42 | advanced-topics/finetuner-executor 43 | ``` 44 | 45 | 46 | 47 | ```{toctree} 48 | :caption: Finetuning Tasks 49 | :hidden: 50 | 51 | notebooks/text_to_text 52 | notebooks/image_to_image 53 | notebooks/image_to_image_arcface 54 | notebooks/text_to_image 55 | notebooks/multilingual_text_to_image 56 | notebooks/mesh_to_mesh 57 | notebooks/data_synthesis 58 | ``` 59 | 60 | ```{toctree} 61 | :caption: Developer Reference 62 | :hidden: 63 | :maxdepth: 1 64 | 65 | api-rst 66 | ``` 67 | 68 | --- 69 | {ref}`genindex` | {ref}`modindex` 70 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/makedoc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | if [[ $1 == "local-only" ]]; then 6 | rm -rf api && make clean 7 | make dirhtml 8 | else 9 | export NUM_RELEASES=${NUM_RELEASES:-5} 10 | export DEFAULT_BRANCH='main' 11 | export BUILD_DIR=_build/dirhtml 12 | 13 | declare -a ARR_SMV_TAG_WHITELIST=() 14 | declare -a ARR_SMV_BRANCH_WHITELIST=() 15 | 16 | rm -rf api && rm -rf ${BUILD_DIR} 17 | 18 | # Might error out with "API Limit exceeds" on local (would need api token), but on CI shouldn't face issues. 
19 | declare -a LAST_N_TAGS=( $(curl -s -H "Accept: application/vnd.github.v3+json" \ 20 | "https://api.github.com/repos/jina-ai/finetuner/releases?per_page=${NUM_RELEASES}" \ 21 | | jq -r '.[].tag_name') ) 22 | 23 | export LATEST_FINETUNER_VERSION="${LAST_N_TAGS[0]}" 24 | 25 | if [[ $1 == "development" ]]; then 26 | current_branch=$(git branch --show-current) 27 | if [[ ${current_branch} != ${DEFAULT_BRANCH} ]]; then 28 | ARR_SMV_BRANCH_WHITELIST+=" ${current_branch}" 29 | fi 30 | fi 31 | 32 | ARR_SMV_BRANCH_WHITELIST+=" ${DEFAULT_BRANCH}" 33 | ARR_SMV_TAG_WHITELIST+=" ${LAST_N_TAGS[@]}" 34 | export SMV_BRANCH_WHITELIST="${ARR_SMV_BRANCH_WHITELIST}" 35 | export SMV_TAG_WHITELIST="${ARR_SMV_TAG_WHITELIST}" 36 | 37 | echo -e "Latest Finetuner Version: ${LATEST_FINETUNER_VERSION}" 38 | echo -e "Branches to whitelist: ${SMV_BRANCH_WHITELIST}" 39 | echo -e "Tags to whitelist: ${SMV_TAG_WHITELIST}" 40 | 41 | sphinx-multiversion . ${BUILD_DIR} -b dirhtml 42 | mv -v _build/dirhtml/${LATEST_FINETUNER_VERSION}/* _build/dirhtml 43 | fi -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo 2 | gitpython==3.1.13 3 | git+https://github.com/Holzhaus/sphinx-multiversion.git 4 | jupytext==1.14.1 5 | sphinx 6 | myst-parser==0.15.1 7 | nbsphinx==0.8.9 8 | sphinx-argparse==0.3.1 9 | sphinx-design 10 | sphinx-inline-tabs 11 | sphinx-autodoc-typehints==1.12.0 12 | sphinxext-opengraph 13 | sphinx-notfound-page==0.7.1 14 | sphinx-sitemap==2.2.0 15 | sphinx_copybutton==0.4.0 16 | sphinx_markdown_tables==0.0.16 17 | sphinxcontrib-apidoc==0.3.0 18 | -------------------------------------------------------------------------------- /docs/walkthrough/basic-concepts.md: -------------------------------------------------------------------------------- 1 | (experiment-and-runs)= 2 | # Basic Concepts 3 | 4 | Finetuner organizes your training based on two 
concepts: 5 | {class}`~finetuner.experiment.Experiment` and {class}`~finetuner.run.Run`. 6 | 7 | An Experiment defines the machine learning task you're fine-tuning for. 8 | A Run refers to a single execution of the Experiment with a specific configuration. 9 | An Experiment contains a list of Runs, each with different configurations. 10 | For example: 11 | 12 | + Experiment: Fine-tune a transformer on the QuoraQA dataset. 13 | - Run1: Use bert-based model. 14 | - Run2: Use sentence-transformer model. 15 | + Experiment: Fine-tune ResNet on WILD dataset. 16 | - Run1: Use ResNet18 with learning rate 0.01 and SGD optimizer. 17 | - Run2: Use ResNet50 with learning rate 0.01 and SGD optimizer. 18 | - Run3: Use ResNet50 with learning rate 0.0001 and Adam optimizer. 19 | 20 | All information and data produced while using Finetuner is linked to those two concepts. 21 | Each Experiment and each Run has a name. 22 | The name of the Experiment should be unique and the name of the Run is also required 23 | to be unique for each Experiment. 24 | Thus, if you want to retrieve the logs of a run or download the fine-tuned model later 25 | on, you can do this with the respective experiment and run names, as explained in section 26 | {doc}`/walkthrough/save-model`. 27 | 28 | When you start the fine-tuning job, you can declare the `experiment_name` and `run_name` like this: 29 | 30 | ```python 31 | import finetuner 32 | 33 | finetuner.fit( 34 | ..., 35 | experiment_name='quora-qa-finetune', 36 | run_name='quora-qa-finetune-bert', 37 | ) 38 | ``` 39 | 40 | Please note that these two arguments are optional. 41 | If not supplied, 42 | Finetuner will use the current working directory as a default `experiment_name`, 43 | and generate a random `run_name` for you, e.g., "infallible-colden". 
-------------------------------------------------------------------------------- /docs/walkthrough/choose-backbone.md: -------------------------------------------------------------------------------- 1 | (choose-backbone)= 2 | # Backbone Model 3 | 4 | Finetuner provides several widely used backbone models, 5 | including `resnet`, `efficientnet`, `clip` and `bert`. 6 | Thereby, for most of them, Finetuner provides multiple variants, e.g., the common `resnet50 ` and the more complex `resnet152` model. 7 | 8 | Finetuner will convert these backbone models to embedding models by removing 9 | the *head* or applying *pooling*, 10 | performing fine-tuning and producing the final embedding model. 11 | The embedding model can be fine-tuned for text-to-text, image-to-image or text-to-image 12 | search tasks. 13 | 14 | You can call: 15 | ````{tab} text-to-text 16 | ```python 17 | import finetuner 18 | 19 | finetuner.describe_models(task='text-to-text') 20 | ``` 21 | ```` 22 | ````{tab} image-to-image 23 | ```python 24 | import finetuner 25 | 26 | finetuner.describe_models(task='image-to-image') 27 | ``` 28 | ```` 29 | ````{tab} text-to-image 30 | ```python 31 | import finetuner 32 | 33 | finetuner.describe_models(task='text-to-image') 34 | ``` 35 | ```` 36 | ````{tab} mesh-to-mesh 37 | ```python 38 | import finetuner 39 | 40 | finetuner.describe_models(task='mesh-to-mesh') 41 | ``` 42 | ```` 43 | 44 | to get a list of supported models: 45 | 46 | ````{tab} text-to-text 47 | ```bash 48 | Finetuner backbones: text-to-text 49 | ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ 50 | ┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃ 51 | ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ 52 | │ jina-embedding-t-en-v1 │ text-to-text │ 312 │ transformer │ Text embedding model trained using 
Linnaeus-Clean dataset by Jina AI │ 53 | │ jina-embedding-s-en-v1 │ text-to-text │ 512 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │ 54 | │ jina-embedding-b-en-v1 │ text-to-text │ 768 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │ 55 | │ jina-embedding-l-en-v1 │ text-to-text │ 1024 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │ 56 | │ bert-base-en │ text-to-text │ 768 │ transformer │ BERT model pre-trained on BookCorpus and English Wikipedia │ 57 | │ bert-base-multi │ text-to-text │ 768 │ transformer │ BERT model pre-trained on multilingual Wikipedia │ 58 | │ distiluse-base-multi │ text-to-text │ 512 │ transformer │ Knowledge distilled version of the multilingual Sentence Encoder │ 59 | │ sbert-base-en │ text-to-text │ 768 │ transformer │ Pretrained BERT, fine-tuned on MS Marco │ 60 | └────────────────────────┴──────────────┴────────────┴──────────────┴─────────────────────────────────────────────────────────────────────────┘ 61 | ``` 62 | ```` 63 | ````{tab} image-to-image 64 | ```bash 65 | Finetuner backbones: image-to-image 66 | ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ 67 | ┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃ 68 | ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ 69 | │ efficientnet-base │ image-to-image │ 1792 │ cnn │ EfficientNet B4 pre-trained on ImageNet │ 70 | │ efficientnet-large │ image-to-image │ 2560 │ cnn │ EfficientNet B7 pre-trained on ImageNet │ 71 | │ resnet-large │ image-to-image │ 2048 │ cnn │ ResNet152 pre-trained on ImageNet │ 72 | │ resnet-base │ image-to-image │ 2048 │ cnn │ ResNet50 pre-trained on ImageNet │ 73 | └────────────────────┴────────────────┴────────────┴──────────────┴─────────────────────────────────────────┘ 74 | ``` 75 | ```` 76 | ````{tab} text-to-image 77 | 
```bash 78 | Finetuner backbones: text-to-image 79 | ┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ 80 | ┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃ 81 | ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ 82 | │ clip-base-en │ text-to-image │ 512 │ transformer │ CLIP base model │ 83 | │ clip-large-en │ text-to-image │ 1024 │ transformer │ CLIP large model with patch size 14 │ 84 | │ clip-base-multi │ text-to-image │ 512 │ transformer │ Open MCLIP │ 85 | │ │ │ │ │ "xlm-roberta-base-ViT-B-32::laion5b_s13b_b90k" model │ 86 | └─────────────────┴───────────────┴────────────┴──────────────┴───────────────────────────────────────────────────────┘ 87 | ``` 88 | ```` 89 | ````{tab} mesh-to-mesh 90 | ```bash 91 | Finetuner backbones: mesh-to-mesh 92 | ┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ 93 | ┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃ 94 | ┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ 95 | │ pointnet-base │ mesh-to-mesh │ 512 │ pointnet │ PointNet++ embedding model for 3D mesh point clouds │ 96 | └───────────────┴──────────────┴────────────┴──────────────┴─────────────────────────────────────────────────────┘ 97 | ``` 98 | ```` 99 | 100 | + ResNets are suitable for image-to-image search tasks with high performance requirements, where `resnet152` is bigger and requires higher computational resources than `resnet50`. 101 | + EfficientNets are suitable for image-to-image search tasks with low training and inference times. The model is more light-weighted than ResNet. Here, `efficientnet_b4` is the bigger and more complex model. 102 | + CLIP is the one for text-to-image search, where the images do not need to have any text descriptors. 
103 | + BERT is generally suitable for text-to-text search tasks. 104 | + Msmarco-distilbert-base-v3 is designed for matching web search queries to short text passages and is a suitable backbone for similar text-to-text search tasks. 105 | + PointNet++ is an embedding model, which we derived from the popular [PointNet++ model](https://proceedings.neurips.cc/paper/2017/file/d8bf84be3800d12f74d8b05e9b89836f-Paper.pdf). 106 | The original model is designed for classifying 3D meshes. Our derived model can be used to encode meshes into vectors for search. 107 | 108 | It should be noted that: 109 | 110 | + ResNet/EfficientNet models are loaded from the [torchvision](https://pytorch.org/vision/stable/index.html) library. 111 | + Transformer-based models are loaded from the huggingface [transformers](https://github.com/huggingface/transformers) library. 112 | + `msmarco-distilbert-base-v3` has been fine-tuned once by [sentence-transformers](https://www.sbert.net/) on the [MS MARCO](https://microsoft.github.io/msmarco/) dataset on top of BERT. -------------------------------------------------------------------------------- /docs/walkthrough/index.md: -------------------------------------------------------------------------------- 1 | # {octicon}`list-ordered` Walkthrough 2 | 3 | Why do I need Finetuner? 4 | 5 | Because search quality matters. 6 | 7 | When you bring a pre-trained model to encode your data to embeddings, you are likely to get irrelevant search results. 8 | Pre-trained deep learning models are usually trained on large-scale datasets, that have a different *data distribution* over your own datasets or domains. 9 | This is referred to as a *distribution shift*. 10 | 11 | Finetuner provides a solution to this problem by leveraging a pre-trained model from a large dataset and fine-tuning the parameters of 12 | this model on your dataset. 13 | 14 | Once fine-tuning is done, you get a model adapted to your domain. 
This new model delivers better search performance on your task of interest. 15 | 16 | Fine-tuning a pre-trained model involves a certain complexity and requires Machine Learning plus domain knowledge (on NLP, Computer Vision, etc.). 17 | Thus, it is a non-trivial task for business owners and engineers who lack practical deep-learning knowledge. Finetuner attempts 18 | to address this by providing a simple interface, which can be as easy as: 19 | 20 | ```python 21 | import finetuner 22 | from finetuner import DocumentArray 23 | 24 | # Login to Jina AI Cloud 25 | finetuner.login() 26 | 27 | # Prepare training data 28 | train_data = DocumentArray(...) 29 | 30 | # Fine-tune in the cloud 31 | run = finetuner.fit( 32 | model='resnet50', train_data=train_data, epochs=5, batch_size=128, 33 | ) 34 | 35 | print(run.name) 36 | for log_entry in run.stream_logs(): 37 | print(log_entry) 38 | 39 | # When ready 40 | run.save_artifact(directory='experiment') 41 | ``` 42 | 43 | You should see this in your terminal: 44 | 45 | ```bash 46 | 🔐 Successfully logged in to Jina AI as [USER NAME]! 47 | Run name: vigilant-tereshkova 48 | Run logs: 49 | 50 | Training [2/2] ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50/50 0:00:00 0:01:08 • loss: 0.050 51 | [09:13:23] INFO [__main__] Done ✨ __main__.py:214 52 | INFO [__main__] Saving fine-tuned models ... __main__.py:217 53 | INFO [__main__] Saving model 'tuned_model' in __main__.py:228 54 | /usr/src/app/tuned-models/model ... 55 | INFO [__main__] Pushing saved model to Hubble ... __main__.py:232 56 | [09:13:54] INFO [__main__] Pushed model artifact ID: __main__.py:238 57 | '62972acb5de25a53fdbfcecc' 58 | INFO [__main__] Finished 🚀 __main__.py:240 59 | ``` 60 | 61 | Submitted fine-tuning jobs run efficiently on the Jina AI Cloud on either CPU or GPU enabled hardware. 62 | 63 | Finetuner fully owns the complexity of setting up and maintaining the model training infrastructure plus the complexity of delivering SOTA training methods to production use cases. 
64 | 65 | Please check out the following steps for more information: 66 | 67 | 68 | ```{toctree} 69 | basic-concepts 70 | login 71 | create-training-data 72 | choose-backbone 73 | run-job 74 | save-model 75 | inference 76 | ``` -------------------------------------------------------------------------------- /docs/walkthrough/inference.md: -------------------------------------------------------------------------------- 1 | # Inference 2 | 3 | Once fine-tuning is finished, it's time to actually use the model. 4 | You can use the fine-tuned models directly to encode DocumentArray objects or to set up an encoding service. 5 | When encoding, data can also be provided as a regular list. 6 | 7 | ```{admonition} Use FinetunerExecutor inside a Jina Flow 8 | :class: hint 9 | Finetuner offers the {class}`~finetuner.encode` interface to embed your data locally 10 | If you would like to use fine-tuned model inside a Jina Flow as an Executor, checkout 11 | {doc}`/advanced-topics/finetuner-executor`. 12 | ``` 13 | 14 | (integrate-with-list)= 15 | ## Encoding a List 16 | Data that is stored in a regular list can be embedded in the same way you would embed a DocumentArray. 17 | Since the modality of your input data can be inferred from the model being used, there is no need to provide any additional information besides the content you want to encode. 
18 | When providing data as a list, the `finetuner.encode` method will return a `np.ndarray` of embeddings, instead of a `docarray.DocumentArray`: 19 | 20 | ````{tab} Artifact id and token 21 | ```python 22 | import finetuner 23 | 24 | finetuner.login() 25 | 26 | token = finetuner.get_token() 27 | run = finetuner.get_run( 28 | experiment_name='YOUR-EXPERIMENT', 29 | run_name='YOUR-RUN' 30 | ) 31 | 32 | model = finetuner.get_model( 33 | run.artifact_id, 34 | token=token, 35 | ) 36 | 37 | texts = ['some text to encode'] 38 | 39 | embeddings = finetuner.encode(model=model, data=texts) 40 | 41 | for text, embedding in zip(texts, embeddings): 42 | print(f'Text of the returned document: {text}') 43 | print(f'Shape of the embedding: {embedding.shape}') 44 | ``` 45 | ```` 46 | ````{tab} Locally saved artifact 47 | ```python 48 | import finetuner 49 | 50 | model = finetuner.get_model('/path/to/YOUR-MODEL.zip') 51 | 52 | texts = ['some text to encode'] 53 | 54 | embeddings = finetuner.encode(model=model, data=texts) 55 | 56 | for text, embedding in zip(texts, embeddings): 57 | print(f'Text of the returned document: {text}') 58 | print(f'Shape of the embedding: {embedding.shape}') 59 | ``` 60 | ```` 61 | ````{tab} (Special case) CLIP inference 62 | ```python 63 | import finetuner 64 | 65 | finetuner.login() 66 | 67 | token = finetuner.get_token() 68 | run = finetuner.get_run( 69 | experiment_name='YOUR-EXPERIMENT', 70 | run_name='YOUR-RUN' 71 | ) 72 | 73 | model = finetuner.get_model( 74 | run.artifact_id, 75 | token=token, 76 | select_model='clip-text' # use `clip-vision` to encode image. 
77 | ) 78 | 79 | texts = ['some text to encode'] 80 | embeddings = finetuner.encode(model=model, data=texts) 81 | 82 | for text, embedding in zip(texts, embeddings): 83 | print(f'Text of the returned document: {text}') 84 | print(f'Shape of the embedding: {embedding.shape}') 85 | ``` 86 | ```` 87 | 88 | 89 | ```{admonition} Inference with ONNX 90 | :class: tip 91 | In case you set `to_onnx=True` when calling `finetuner.fit` function, 92 | please use `model = finetuner.get_model('/path/to/YOUR-MODEL.zip', is_onnx=True)`. 93 | ``` 94 | 95 | ```{admonition} Encoding other Modalities 96 | :class: tip 97 | Of course you can not only encode texts. 98 | For encoding a list of images, you can provide URIs, e.g., 99 | `embeddings = finetuner.encode(model=model, data=['path/to/apple.png'])`. 100 | ``` 101 | 102 | (integrate-with-docarray)= 103 | ## Encoding a DocumentArray 104 | 105 | To embed a DocumentArray with a fine-tuned model, you can get the model of your Run via the {func}`~finetuner.get_model` function and embed it via the {func}`finetuner.encode` function: 106 | 107 | ````{tab} Artifact id and token 108 | ```python 109 | from finetuner import DocumentArray, Document 110 | import finetuner 111 | 112 | finetuner.login() 113 | 114 | token = finetuner.get_token() 115 | run = finetuner.get_run( 116 | experiment_name='YOUR-EXPERIMENT', 117 | run_name='YOUR-RUN' 118 | ) 119 | 120 | model = finetuner.get_model( 121 | run.artifact_id, 122 | token=token, 123 | ) 124 | 125 | da = DocumentArray([Document(text='some text to encode')]) 126 | finetuner.encode(model=model, data=da) 127 | 128 | for doc in da: 129 | print(f'Text of the returned document: {doc.text}') 130 | print(f'Shape of the embedding: {doc.embedding.shape}') 131 | ``` 132 | ```` 133 | ````{tab} Locally saved artifact 134 | ```python 135 | from finetuner import DocumentArray, Document 136 | import finetuner 137 | 138 | model = finetuner.get_model('/path/to/YOUR-MODEL.zip') 139 | 140 | da = 
DocumentArray([Document(text='some text to encode')]) 141 | finetuner.encode(model=model, data=da) 142 | 143 | for doc in da: 144 | print(f'Text of the returned document: {doc.text}') 145 | print(f'Shape of the embedding: {doc.embedding.shape}') 146 | ``` 147 | ```` 148 | ````{tab} (Special case) CLIP inference 149 | ```python 150 | from finetuner import DocumentArray, Document 151 | import finetuner 152 | 153 | finetuner.login() 154 | 155 | token = finetuner.get_token() 156 | run = finetuner.get_run( 157 | experiment_name='YOUR-EXPERIMENT', 158 | run_name='YOUR-RUN' 159 | ) 160 | 161 | model = finetuner.get_model( 162 | run.artifact_id, 163 | token=token, 164 | select_model='clip-text' # use `clip-vision` to encode image. 165 | ) 166 | 167 | da = DocumentArray([Document(text='some text to encode')]) 168 | finetuner.encode(model=model, data=da) 169 | 170 | for doc in da: 171 | print(f'Text of the returned document: {doc.text}') 172 | print(f'Shape of the embedding: {doc.embedding.shape}') 173 | ``` 174 | ```` 175 | 176 | ```console 177 | Text of the returned document: some text to encode 178 | Shape of the embedding: (768,) 179 | ``` 180 | -------------------------------------------------------------------------------- /docs/walkthrough/login.md: -------------------------------------------------------------------------------- 1 | (login-to-jina-ecosystem)= 2 | # Login 3 | 4 | Since Finetuner leverages cloud resources for fine-tuning, 5 | you are required to {meth}`~finetuner.login()` and obtain a token from Jina before starting a fine-tuning job. 6 | It is as simple as: 7 | 8 | ```python 9 | import finetuner 10 | 11 | finetuner.login() 12 | ``` 13 | 14 | A browser window should pop up with different login options. 15 | After {meth}`~finetuner.login()` you will see the following message in your terminal: 16 | 17 | ```bash 18 | 🔐 Successfully logged in to Jina AI as [USER NAME]! 
19 | ``` 20 | 21 | Now, an authentication token is generated which can be read with the {func}`~finetuner.get_token` function. 22 | If you have been logged in before, the existing token will not be overwritten, however, if you want this to happen, you can set the `force` attribute in the login function to true. 23 | 24 | ``` 25 | finetuner.login(force=True) 26 | ``` 27 | 28 | ```{admonition} Why do I need to login? 29 | :class: hint 30 | Login is required since Finetuner needs to push your {class}`~docarray.array.document.DocumentArray` or CSV file into the Jina AI Cloud as training or evaluation data. 31 | Once you have successfully logged in, your training data will be linked to your personal user profile and will only be visible to you. 32 | 33 | Once fine-tuning is completed, the fine-tuned model will be visible only to you in the Jina AI Cloud. 34 | ``` -------------------------------------------------------------------------------- /docs/walkthrough/save-model.md: -------------------------------------------------------------------------------- 1 | (retrieve-tuned-model)= 2 | # Save Artifact 3 | 4 | Perfect! 5 | Now, you have started the fine-tuning job in the Jina AI Cloud. 6 | When the fine-tuning job is finished, the resulting model is automatically stored under your Jina account in the Jina AI Cloud. 7 | Next, we can get its artifact id and download the model. 8 | 9 | ```{admonition} Managing fine-tuned models 10 | :class: hint 11 | To use a fine-tuned model in a Jina service running on [JCloud](https://github.com/jina-ai/jcloud), you do not need to download it. 12 | Each model has a artifact id, which is sufficient to setup an encoding serivce as explained in the section {doc}`/walkthrough/integrate-with-jina`. 13 | Alternatively, you can also download the model using the artifact id, as explained below, e.g., to use it in a locally runnig Jina service. 14 | ``` 15 | 16 | Please note that fine-tuning takes time. 
It highly depends on the size of your training data, evaluation data, and other hyperparameters. 17 | Because of this, you might have to close the session and reconnect to it several times. 18 | 19 | In the example below, we show how to connect to an existing run and download a tuned model: 20 | 21 | ```python 22 | import finetuner 23 | 24 | finetuner.login() 25 | 26 | # connect to the run we created previously. 27 | run = finetuner.get_run( 28 | run_name='finetune-flickr-dataset-efficientnet-1', 29 | experiment_name='finetune-flickr-dataset', 30 | ) 31 | print(f'Run status: {run.status()}') 32 | print(f'Run artifact id: {run.artifact_id}') 33 | ``` 34 | 35 | You can monitor your run status in two ways: 36 | 37 | 1. Log streaming: Pull logs from Jina AI Cloud lively, suitable for small fine-tuning tasks. 38 | 2. Query logs: Pull up-to-date logs from Jina AI Cloud, suitable for long-running tasks. 39 | 40 | ````{tab} Stream logs 41 | ```python 42 | for entry in run.stream_logs(): 43 | print(entry) 44 | ``` 45 | ```` 46 | ````{tab} Query logs 47 | ```python 48 | print(run.status()) 49 | print(run.logs()) 50 | ``` 51 | ```` 52 | 53 | Once run status is `FINISHED`, you can save the artifact with: 54 | 55 | ```python 56 | run.save_artifact('tuned_model') 57 | ``` 58 | 59 | ```{admonition} Share artifact with others 60 | :class: hint 61 | Finetuner allows you to set your artifact as a public artifact. 62 | At training time, you need to set `public=True` when calling the `fit` function. 63 | If `public=True`, anyone who knows the artifact id can download your artifact with the above function. 64 | ``` 65 | 66 | If the fine-tuning is finished, you will see the following message in the terminal: 67 | 68 | ```bash 69 | 🔐 Successfully logged in to Jina AI as [USER NAME]! 
70 | Run status: FINISHED 71 | Run Artifact id: 62972acb5de25a53fdbfcecc 72 | Run logs: 73 | 74 | Training [2/2] ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50/50 0:00:00 0:01:08 • loss: 0.050 75 | [09:13:23] INFO [__main__] Done ✨ __main__.py:214 76 | INFO [__main__] Saving fine-tuned models ... __main__.py:217 77 | INFO [__main__] Saving model 'model' in __main__.py:228 78 | /usr/src/app/tuned-models/model ... 79 | INFO [__main__] Pushing saved model to Hubble ... __main__.py:232 80 | [09:13:54] INFO [__main__] Pushed model artifact ID: __main__.py:238 81 | '62972acb5de25a53fdbfcecc' 82 | INFO [__main__] Finished 🚀 __main__.py:240``` 83 | ``` 84 | -------------------------------------------------------------------------------- /finetuner/callback.py: -------------------------------------------------------------------------------- 1 | from _finetuner.runner.stubs.callback import * # noqa F401 2 | -------------------------------------------------------------------------------- /finetuner/client/__init__.py: -------------------------------------------------------------------------------- 1 | from finetuner.client.client import FinetunerV1Client # noqa: F401 2 | -------------------------------------------------------------------------------- /finetuner/client/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Union 3 | 4 | import requests 5 | 6 | import hubble 7 | from finetuner.client.session import _HeaderPreservingSession 8 | from finetuner.constants import ( 9 | AUTHORIZATION, 10 | CHARSET, 11 | DATA, 12 | HOST, 13 | HUBBLE_USER_ID, 14 | TEXT, 15 | TOKEN_PREFIX, 16 | UTF_8, 17 | ) 18 | from finetuner.excepts import FinetunerServerError 19 | 20 | 21 | class _BaseClient: 22 | """ 23 | Base Finetuner API client. 
24 | """ 25 | 26 | def __init__(self): 27 | self._base_url = os.environ.get(HOST) 28 | self._session = self._get_client_session() 29 | self.hubble_client = hubble.Client(max_retries=None, jsonify=True) 30 | self.hubble_user_id = self._get_hubble_user_id() 31 | 32 | def _get_hubble_user_id(self): 33 | user_info = self.hubble_client.get_user_info() 34 | if user_info['code'] >= 400: 35 | # will implement error-handling later 36 | pass 37 | hubble_user_id = user_info[DATA][HUBBLE_USER_ID] 38 | return hubble_user_id 39 | 40 | @staticmethod 41 | def _get_client_session() -> _HeaderPreservingSession: 42 | session = _HeaderPreservingSession(trusted_domains=[]) 43 | api_token = TOKEN_PREFIX + str(hubble.Auth.get_auth_token()) 44 | session.headers.update({CHARSET: UTF_8, AUTHORIZATION: api_token}) 45 | return session 46 | 47 | @staticmethod 48 | def _construct_url(*args) -> str: 49 | return '/'.join(args) 50 | 51 | def _handle_request( 52 | self, 53 | url: str, 54 | method: str, 55 | params: Optional[dict] = None, 56 | json_data: Optional[dict] = None, 57 | stream: bool = False, 58 | timeout: Optional[int] = None, 59 | ) -> Union[dict, List[dict], str, requests.Response]: 60 | """The base request handler. 61 | 62 | :param url: The url of the request. 63 | :param method: The request type (GET, POST or DELETE). 64 | :param params: Optional parameters for the request. 65 | :param json_data: Optional data payloads to be sent along with the request. 66 | :param stream: If the request is a streaming request set to True. 67 | :return: Response to the request. 
68 | """ 69 | response = self._session.request( 70 | url=url, 71 | method=method, 72 | json=json_data, 73 | params=params, 74 | allow_redirects=True, 75 | stream=stream, 76 | timeout=timeout, 77 | ) 78 | if not response.ok: 79 | raise FinetunerServerError( 80 | message=response.reason, 81 | code=response.status_code, 82 | details=response.json()['detail'], 83 | ) 84 | if stream: 85 | return response 86 | else: 87 | if TEXT in response.headers['content-type']: 88 | return response.text 89 | return response.json() 90 | -------------------------------------------------------------------------------- /finetuner/client/session.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from requests import Session 4 | from requests.utils import urlparse 5 | 6 | 7 | class _HeaderPreservingSession(Session): 8 | def __init__(self, trusted_domains: List[str]): 9 | super(_HeaderPreservingSession, self).__init__() 10 | self._trusted_domains = trusted_domains 11 | 12 | def rebuild_auth(self, prepared_request, response): 13 | """ 14 | Keep headers upon redirect as long as we are on any of the 15 | self._trusted_domains 16 | """ 17 | headers = prepared_request.headers 18 | url = prepared_request.url 19 | if 'Authorization' in headers: 20 | _original_parsed = urlparse(response.request.url) 21 | _redirect_parsed = urlparse(url) 22 | _original_domain = '.'.join(_original_parsed.hostname.split('.')[-2:]) 23 | _redirect_domain = '.'.join(_redirect_parsed.hostname.split('.')[-2:]) 24 | if ( 25 | _original_domain != _redirect_domain 26 | and _original_domain not in self._trusted_domains 27 | and _redirect_domain not in self._trusted_domains 28 | ): 29 | del headers['Authorization'] 30 | -------------------------------------------------------------------------------- /finetuner/console.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | 
def print_model_table(model, task: Optional[str] = None):
    """Prints a table of model descriptions.

    :param model: Module with model definitions; must expose ``get_header()``
        and ``get_row()`` (see ``finetuner.model``).
    :param task: Optional fine-tuning task used to filter the rows,
        e.g. ``text-to-text`` — TODO confirm the full set of task names.
    """
    title = 'Finetuner backbones'
    if task:
        title += f': {task}'
    table = Table(title=title)
    header = model.get_header()
    # Track names already rendered so each backbone appears only once even if
    # several stub classes share a display name.
    model_display_names = set()

    for column in header:
        table.add_column(column, justify='right', style='cyan', no_wrap=False)

    for _, _model_class in list_model_classes().items():
        if _model_class.display_name not in model_display_names:
            row = model.get_row(_model_class)
            # row[1] is the 'task' column (second entry of model.get_header()).
            if task and row[1] != task:
                continue
            table.add_row(*row)
            model_display_names.add(_model_class.display_name)

    console.print(table)
# String constants shared across the Finetuner client: HTTP verbs, API field
# names, environment-variable keys and run-status values.
DELETE = 'DELETE'
POST = 'POST'
GET = 'GET'
NAME = 'name'
HUBBLE_USER_ID = '_id'
ID = 'id'

# Environment variables / service endpoints.
HOST = 'JINA_FINETUNER_REGISTRY'
HUBBLE_REGISTRY = 'JINA_HUBBLE_REGISTRY'
DEFAULT_FINETUNER_HOST = 'https://api.compute.finetuner.fit'
DEFAULT_HUBBLE_REGISTRY = 'https://api.hubble.jina.ai'

CONFIG = 'config'
FINETUNER_VERSION = 'finetuner_version'
DEVICE = 'device'
CPUS = 'cpus'
GPUS = 'gpus'
NUM_WORKERS = 'num_workers'
RUNS = 'runs'
STATUS = 'status'
LOGS = 'logs'
LOGSTREAM = 'logstream'
METRICS = 'metrics'
EXAMPLES = 'examples'
EXPERIMENTS = 'experiments'
API_VERSION = 'api/v1'
AUTHORIZATION = 'Authorization'
CHARSET = 'Accept-Charset'
UTF_8 = 'utf-8'
TEXT = 'text'
TOKEN_PREFIX = 'token '
DATA = 'data'
TRAIN_DATA = 'train_data'
EVAL_DATA = 'eval_data'
VAL_SPLIT = 'val_split'
EVALUATE = 'evaluate'
ARTIFACTS_DIR = 'artifacts/'
MODEL = 'model'
MODEL_OPTIONS = 'model_options'
MODEL_ARTIFACT = 'model_artifact'
ARTIFACT = 'artifact'
ARTIFACT_ID = 'artifact_id'
DEFAULT_TAG_KEY = 'finetuner_label'
DEFAULT_TAG_SCORE_KEY = 'finetuner_score'
# Run status
CREATED = 'CREATED'
STARTED = 'STARTED'
FINISHED = 'FINISHED'
FAILED = 'FAILED'
DEFAULT_EXPERIMENT_NAME = 'default_experiment'
CREATED_AT = 'created_at'
DESCRIPTION = 'description'
FREEZE = 'freeze'
OUTPUT_DIM = 'output_dim'
MULTI_MODAL = 'multi_modal'
IMAGE_MODALITY = 'image_modality'
TEXT_MODALITY = 'text_modality'
HYPER_PARAMETERS = 'hyper_parameters'
LOSS = 'loss'
LOSS_OPTIONS = 'loss_options'
OPTIMIZER = 'optimizer'
LOSS_OPTIMIZER = 'loss_optimizer'
LOSS_OPTIMIZER_OPTIONS = 'loss_optimizer_options'
SAMPLER = 'sampler'
MINER = 'miner'
MINER_OPTIONS = 'miner_options'
BATCH_SIZE = 'batch_size'
LEARNING_RATE = 'learning_rate'
EPOCHS = 'epochs'
EXPERIMENT_NAME = 'experiment_name'
RUN_NAME = 'run_name'
OPTIMIZER_OPTIONS = 'optimizer_options'
SCHEDULER = 'scheduler'
SCHEDULER_OPTIONS = 'scheduler_options'
CALLBACKS = 'callbacks'
OPTIONS = 'options'
QUERY_DATA = 'query_data'
INDEX_DATA = 'index_data'
DA_PREFIX = 'finetuner-dastorage'
ONNX = 'to_onnx'
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
# NOTE: duplicate re-definitions of VAL_SPLIT and TRAIN_DATA (with identical
# values) were removed; both are defined once above.
TASK = 'task'
TRAINING_TASK = 'training'
SYNTHESIS_TASK = 'generation'
# Synthesis job
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
CROSS_ENCODER = 'cross_encoder'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
MAX_NUM_DOCS = 'max_num_docs'
HF_URL_PREFIX = 'https://huggingface.co/jinaai/'
HF_ORG_PREFIX = 'jinaai/'
def push_docarray(
    data: Union[None, str, DocumentArray],
    name: str,
    ids2names: Optional[Dict[int, str]] = None,
) -> Optional[str]:
    """Upload a DocumentArray to Jina AI Cloud and return its name."""
    if not isinstance(data, DocumentArray):
        # `data` is already a cloud name (str) or None — pass it through.
        return data
    ref = id(data)
    if ids2names is not None and ref in ids2names:
        # This exact object was pushed earlier in the same batch; reuse the
        # name it was uploaded under instead of pushing again.
        return ids2names[ref]
    print(f'Pushing a DocumentArray to Jina AI Cloud under the name {name} ...')
    data.push(name=name, show_progress=True, public=False)
    if ids2names is not None:
        ids2names[ref] = name
    return name
def push_synthesis_data(
    experiment_name: str,
    run_name: str,
    query_data: Union[str, DocumentArray],
    corpus_data: Union[str, DocumentArray],
) -> Tuple[Optional[str], Optional[str]]:
    """Upload data to Jina AI Cloud and returns their names.

    Uploads all data needed for data synthesis - query data and corpus data.

    Data is given either as a `DocumentArray` or
    a name of the `DocumentArray` that is already pushed to Jina AI Cloud.

    Checks not to upload same dataset twice.

    :param experiment_name: Name of the experiment.
    :param run_name: Name of the run.
    :param query_data: Query data.
    :param corpus_data: Corpus data.
    :return: Names of the uploaded query and corpus data.
    """
    # Shared cache so the same DocumentArray object is uploaded at most once.
    seen: Dict[int, str] = dict()
    prefix = f'{DA_PREFIX}-{experiment_name}-{run_name}'
    query_name = push_docarray(query_data, f'{prefix}-query', seen)
    corpus_name = push_docarray(corpus_data, f'{prefix}-corpus', seen)
    return query_name, corpus_name
@dataclass
class SynthesisModels:
    """Class specifying the models to be used in a data synthesis job.

    :param relation_miner: The name of the model or list of models to use for
        relation mining.
    :param cross_encoder: The name of the model to use as the cross encoder.
    """

    relation_miner: Union[str, List[str]]
    cross_encoder: str
51 | 'gifted', 52 | 'goofy', 53 | 'gracious', 54 | 'great', 55 | 'happy', 56 | 'hardcore', 57 | 'heuristic', 58 | 'hopeful', 59 | 'hungry', 60 | 'infallible', 61 | 'inspiring', 62 | 'interesting', 63 | 'intelligent', 64 | 'jolly', 65 | 'jovial', 66 | 'keen', 67 | 'kind', 68 | 'laughing', 69 | 'loving', 70 | 'lucid', 71 | 'magical', 72 | 'mystifying', 73 | 'modest', 74 | 'musing', 75 | 'naughty', 76 | 'nervous', 77 | 'nice', 78 | 'nifty', 79 | 'nostalgic', 80 | 'objective', 81 | 'optimistic', 82 | 'peaceful', 83 | 'pedantic', 84 | 'pensive', 85 | 'practical', 86 | 'priceless', 87 | 'quirky', 88 | 'quizzical', 89 | 'recursing', 90 | 'relaxed', 91 | 'reverent', 92 | 'romantic', 93 | 'sad', 94 | 'serene', 95 | 'sharp', 96 | 'silly', 97 | 'sleepy', 98 | 'stoic', 99 | 'strange', 100 | 'stupefied', 101 | 'suspicious', 102 | 'sweet', 103 | 'tender', 104 | 'thirsty', 105 | 'trusting', 106 | 'unruffled', 107 | 'upbeat', 108 | 'vibrant', 109 | 'vigilant', 110 | 'vigorous', 111 | 'wizardly', 112 | 'wonderful', 113 | 'xenodochial', 114 | 'youthful', 115 | 'zealous', 116 | 'zen', 117 | ] 118 | 119 | 120 | surnames = [ 121 | 'albattani', 122 | 'allen', 123 | 'almeida', 124 | 'antonelli', 125 | 'agnesi', 126 | 'archimedes', 127 | 'ardinghelli', 128 | 'aryabhata', 129 | 'austin', 130 | 'babbage', 131 | 'banach', 132 | 'banzai', 133 | 'bardeen', 134 | 'bartik', 135 | 'bassi', 136 | 'beaver', 137 | 'bell', 138 | 'benz', 139 | 'bhabha', 140 | 'bhaskara', 141 | 'black', 142 | 'blackburn', 143 | 'blackwell', 144 | 'bohr', 145 | 'booth', 146 | 'borg', 147 | 'bose', 148 | 'bouman', 149 | 'boyd', 150 | 'brahmagupta', 151 | 'brattain', 152 | 'brown', 153 | 'buck', 154 | 'burnell', 155 | 'cannon', 156 | 'carson', 157 | 'cartwright', 158 | 'carver', 159 | 'cerf', 160 | 'chandrasekhar', 161 | 'chaplygin', 162 | 'chatelet', 163 | 'chatterjee', 164 | 'chebyshev', 165 | 'cohen', 166 | 'chaum', 167 | 'clarke', 168 | 'colden', 169 | 'cori', 170 | 'cray', 171 | 'curran', 172 | 'curie', 173 | 
'darwin', 174 | 'davinci', 175 | 'dewdney', 176 | 'dhawan', 177 | 'diffie', 178 | 'dijkstra', 179 | 'dirac', 180 | 'driscoll', 181 | 'dubinsky', 182 | 'easley', 183 | 'edison', 184 | 'einstein', 185 | 'elbakyan', 186 | 'elgamal', 187 | 'elion', 188 | 'ellis', 189 | 'engelbart', 190 | 'euclid', 191 | 'euler', 192 | 'faraday', 193 | 'feistel', 194 | 'fermat', 195 | 'fermi', 196 | 'feynman', 197 | 'franklin', 198 | 'gagarin', 199 | 'galileo', 200 | 'galois', 201 | 'ganguly', 202 | 'gates', 203 | 'gauss', 204 | 'germain', 205 | 'goldberg', 206 | 'goldstine', 207 | 'goldwasser', 208 | 'golick', 209 | 'goodall', 210 | 'gould', 211 | 'greider', 212 | 'grothendieck', 213 | 'haibt', 214 | 'hamilton', 215 | 'haslett', 216 | 'hawking', 217 | 'hellman', 218 | 'heisenberg', 219 | 'hermann', 220 | 'herschel', 221 | 'hertz', 222 | 'heyrovsky', 223 | 'hodgkin', 224 | 'hofstadter', 225 | 'hoover', 226 | 'hopper', 227 | 'hugle', 228 | 'hypatia', 229 | 'ishizaka', 230 | 'jackson', 231 | 'jang', 232 | 'jemison', 233 | 'jennings', 234 | 'jepsen', 235 | 'johnson', 236 | 'joliot', 237 | 'jones', 238 | 'kalam', 239 | 'kapitsa', 240 | 'kare', 241 | 'keldysh', 242 | 'keller', 243 | 'kepler', 244 | 'khayyam', 245 | 'khorana', 246 | 'kilby', 247 | 'kirch', 248 | 'knuth', 249 | 'kowalevski', 250 | 'lalande', 251 | 'lamarr', 252 | 'lamport', 253 | 'leakey', 254 | 'leavitt', 255 | 'lederberg', 256 | 'lehmann', 257 | 'lewin', 258 | 'lichterman', 259 | 'liskov', 260 | 'lovelace', 261 | 'lumiere', 262 | 'mahavira', 263 | 'margulis', 264 | 'matsumoto', 265 | 'maxwell', 266 | 'mayer', 267 | 'mccarthy', 268 | 'mcclintock', 269 | 'mclaren', 270 | 'mclean', 271 | 'mcnulty', 272 | 'mendel', 273 | 'mendeleev', 274 | 'meitner', 275 | 'meninsky', 276 | 'merkle', 277 | 'mestorf', 278 | 'mirzakhani', 279 | 'moore', 280 | 'morse', 281 | 'murdock', 282 | 'moser', 283 | 'napier', 284 | 'nash', 285 | 'neumann', 286 | 'newton', 287 | 'nightingale', 288 | 'nobel', 289 | 'noether', 290 | 'northcutt', 291 | 'noyce', 
def get_random_name() -> str:
    """Return a random moby-style run name such as ``'keen-turing'``.

    Combines a random adjective with a random scientist/inventor surname,
    joined by a hyphen.
    """
    # random.choice is the idiomatic equivalent of indexing with randrange.
    return f'{random.choice(adjectives)}-{random.choice(surnames)}'
    def __init__(
        self,
        client: FinetunerV1Client,
        name: str,
        experiment_name: str,
        config: dict,
        created_at: str,
        description: str = '',
        task: str = TRAINING_TASK,
        train_data: Optional[str] = None,
    ):
        """Initialize the run wrapper and eagerly fetch its server-side record."""
        self._client = client
        self._name = name
        self._experiment_name = experiment_name
        self._config = config
        self._created_at = created_at
        self._description = description
        # Fetch the run's representation from the API up front; consumed later
        # by `save_artifact` and `artifact_id`.
        self._run = self._get_run()
        self.task = task
        # Cached name of the produced DocumentArray (synthesis runs only);
        # lazily resolved by the `train_data` property when not given here.
        self._train_data = train_data
69 | """ 70 | if self.task != SYNTHESIS_TASK: 71 | raise ValueError(f'{self.task} run does not produce data.') 72 | else: 73 | self._check_run_status_finished() 74 | if self._train_data: 75 | return self._train_data 76 | else: 77 | run = self._get_run() 78 | try: 79 | train_data = run[TRAIN_DATA] 80 | except KeyError: 81 | raise ValueError(f'run {self.name} has no train_data.') 82 | self._train_data = train_data 83 | return train_data 84 | 85 | def _get_run(self) -> dict: 86 | """Get Run object as dict.""" 87 | return self._client.get_run( 88 | experiment_name=self._experiment_name, run_name=self._name 89 | ) 90 | 91 | def status(self) -> dict: 92 | """Get :class:`Run` status. 93 | 94 | :returns: A dict representing the :class:`Run` status. 95 | """ 96 | return self._client.get_run_status( 97 | experiment_name=self._experiment_name, run_name=self._name 98 | ) 99 | 100 | def logs(self) -> str: 101 | """Check the :class:`Run` logs. 102 | 103 | :returns: A string dump of the run logs. 104 | """ 105 | self._check_run_status_started() 106 | return self._client.get_run_logs( 107 | experiment_name=self._experiment_name, run_name=self._name 108 | ) 109 | 110 | def stream_logs(self, interval: int = 5) -> Iterator[str]: 111 | """Stream the :class:`Run` logs lively. 112 | 113 | :param interval: The time interval to sync the status of finetuner `Run`. 114 | :yield: An iterators keep stream the logs from server. 115 | """ 116 | status = self.status()[STATUS] 117 | msg_template = ( 118 | 'Preparing to run, logs will be ready to pull when ' 119 | '`status` is `STARTED`. 
Current status is `%s`' 120 | ) 121 | with console.status(msg_template % status, spinner="dots") as rich_status: 122 | while status == CREATED: 123 | time.sleep(interval) 124 | status = self.status()[STATUS] 125 | rich_status.update(msg_template % status) 126 | 127 | return self._client.stream_run_logs( 128 | experiment_name=self._experiment_name, run_name=self._name 129 | ) 130 | 131 | def metrics(self) -> Dict[str, Dict[str, float]]: 132 | """Get the evaluation metrics of the :class:`Run`. 133 | 134 | :return: dictionary with evaluation metrics before and after fine-tuning. 135 | """ 136 | self._check_run_status_finished() 137 | return self._client.get_run_metrics( 138 | experiment_name=self._experiment_name, run_name=self._name 139 | ) 140 | 141 | def display_metrics(self): 142 | """ 143 | Prints a table of retrieval metrics before and after fine-tuning 144 | """ 145 | metrics = self.metrics() 146 | for stage in metrics: 147 | print_metrics(stage, metrics[stage]) 148 | 149 | def example_results(self) -> Dict[str, Any]: 150 | """Get the results of example queries from the evaluation data of the 151 | :class:`Run`. 152 | 153 | :return: dictionary with results before and after fine-tuning. 154 | """ 155 | self._check_run_status_finished() 156 | return self._client.get_run_examples( 157 | experiment_name=self._experiment_name, run_name=self._name 158 | ) 159 | 160 | def display_examples(self, k: int = 5): 161 | """ 162 | Prints a table of results of example queries before and after fine-tuning. 163 | 164 | :param k: maximal number of results per query to display 165 | """ 166 | example_results = self.example_results() 167 | for stage in example_results: 168 | print_examples(stage, example_results[stage], k=k) 169 | 170 | def _check_run_status_finished(self): 171 | status = self.status()[STATUS] 172 | if status in [CREATED, STARTED]: 173 | raise RunInProgressError( 174 | 'The run needs to be finished in order to save the artifact.' 
175 | ) 176 | if status == FAILED: 177 | raise RunFailedError( 178 | 'The run failed, please check the `logs` for detailed information.' 179 | ) 180 | 181 | def _check_run_status_started(self): 182 | status = self.status()[STATUS] 183 | if status == CREATED: 184 | raise RunPreparingError( 185 | 'Preparing to run, logs will be ready to pull when ' 186 | '`status` is `STARTED`.' 187 | ) 188 | 189 | def save_artifact(self, directory: str = ARTIFACTS_DIR) -> str: 190 | """Save artifact if the :class:`Run` is finished. 191 | 192 | :param directory: Directory where the artifact will be stored. 193 | :returns: A string object that indicates the download path. 194 | """ 195 | self._check_run_status_finished() 196 | return download_artifact( 197 | client=self._client, 198 | artifact_id=self._run[ARTIFACT_ID], 199 | run_name=self._name, 200 | directory=directory, 201 | ) 202 | 203 | @property 204 | def artifact_id(self): 205 | """Get artifact id of the :class:`Run`. 206 | 207 | An artifact in finetuner contains fine-tuned model and its metadata. 208 | Such as preprocessing function, collate function. This id could be useful 209 | if you want to directly pull the artifact from the cloud storage, such as 210 | using `FinetunerExecutor`. 211 | 212 | :return: Artifact id as string object. 
213 | """ 214 | self._check_run_status_finished() 215 | return self._run[ARTIFACT_ID] 216 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | skip-string-normalization = true 4 | extend-exclude = 'docs/conf.py' 5 | 6 | [tool.isort] 7 | profile = 'black' 8 | extend_skip = ['docs/conf.py'] 9 | skip_gitignore = true 10 | src_paths = ['finetuner'] 11 | 12 | [tool.mypy] 13 | python_version = 3.9 14 | ignore_missing_imports = true 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 0.8.1 3 | 4 | [flake8] 5 | # E501 is too long lines - ignore as black takes care of that 6 | # E203 is whitespace before ':' - which occurs in numpy slicing, e.g. in 7 | # dists[2 * i : 2 * i + 2, :] 8 | # W503 is line break before binary operator - happens when black splits up lines 9 | ignore = E203, W503, F405, F403 10 | exclude = .git,__pycache__,docs/conf.py,old,build,dist,scripts,target,setup.py 11 | max-line-length = 88 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # package name 4 | _name = 'finetuner' 5 | 6 | # package long description 7 | try: 8 | with open('README.md', encoding='utf8') as fp: 9 | _long_description = fp.read() 10 | except FileNotFoundError: 11 | _long_description = '' 12 | 13 | 14 | if __name__ == '__main__': 15 | setup( 16 | name=_name, 17 | packages=find_packages(exclude=['*.tests', '*.tests.*', 'tests.*', 'tests']), 18 | include_package_data=True, 19 | description='Task-oriented finetuning for better embeddings on neural search.', 20 | author='Jina AI', 21 | 
author_email='hello@jina.ai', 22 | url='https://github.com/jina-ai/finetuner/', 23 | license='Apache 2.0', 24 | download_url='https://github.com/jina-ai/finetuner/tags', 25 | long_description=_long_description, 26 | long_description_content_type='text/markdown', 27 | zip_safe=False, 28 | setup_requires=['setuptools>=18.0', 'wheel'], 29 | install_requires=[ 30 | 'docarray[common]<0.30.0', 31 | 'finetuner-stubs==0.13.10', 32 | 'finetuner-commons==0.13.10', 33 | ], 34 | extras_require={ 35 | 'full': [ 36 | 'jina-hubble-sdk==0.33.1', 37 | 'trimesh==3.16.4', 38 | ], 39 | 'test': [ 40 | 'black==23.3.0', 41 | 'flake8==6.0.0', 42 | 'isort==5.12.0', 43 | 'pytest==7.0.0', 44 | 'pytest-cov==3.0.0', 45 | 'pytest-mock==3.7.0', 46 | ], 47 | }, 48 | python_requires='>=3.8.0', 49 | classifiers=[ 50 | 'Development Status :: 5 - Production/Stable', 51 | 'Intended Audience :: Developers', 52 | 'Intended Audience :: Education', 53 | 'Intended Audience :: Science/Research', 54 | 'Programming Language :: Python :: 3.8', 55 | 'Programming Language :: Python :: 3.9', 56 | 'Programming Language :: Python :: 3.10', 57 | 'License :: OSI Approved :: Apache Software License', 58 | 'Environment :: Console', 59 | 'Operating System :: OS Independent', 60 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 61 | ], 62 | project_urls={ 63 | 'Documentation': 'https://finetuner.jina.ai', 64 | 'Source': 'https://github.com/jina-ai/finetuner/', 65 | 'Tracker': 'https://github.com/jina-ai/finetuner/issues', 66 | }, 67 | keywords=( 68 | 'jina neural-search neural-network deep-learning pretraining ' 69 | 'fine-tuning pretrained-models triplet-loss metric-learning ' 70 | 'siamese-network few-shot-learning' 71 | ), 72 | ) 73 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
def create_random_name(prefix='experiment', length=6):
    """Return ``<prefix>-<suffix>`` with a random uppercase/digit suffix.

    :param prefix: Leading label for the generated name.
    :param length: Number of random characters in the suffix.
    :return: The generated name, e.g. ``experiment-A1B2C3``.
    """
    alphabet = string.ascii_uppercase + string.digits
    suffix = ''.join(random.choices(alphabet, k=length))
    return f'{prefix}-{suffix}'
@pytest.fixture()
def get_image_data():
    """Yield (train, eval) DocumentArrays of random 28x28x3 tensors with class labels."""

    def _random_image_da(num_classes, images_per_class):
        # One labelled Document per (class, sample) pair.
        docs = [
            Document(
                tensor=np.random.rand(28, 28, 3),
                tags={FINETUNER_LABEL: str(class_id)},
            )
            for class_id in range(num_classes)
            for _ in range(images_per_class)
        ]
        return DocumentArray(docs)

    train_da = _random_image_da(num_classes=10, images_per_class=10)
    eval_da = _random_image_da(num_classes=10, images_per_class=2)
    return train_da, eval_da


@pytest.fixture()
def get_feature_data():
    """Yield (train, eval) DocumentArrays of random float32 feature vectors."""

    def _random_feature_da(num_classes, samples_per_class, dim):
        docs = [
            Document(
                tensor=np.random.rand(dim).astype(np.float32),
                tags={FINETUNER_LABEL: str(class_id)},
            )
            for class_id in range(num_classes)
            for _ in range(samples_per_class)
        ]
        return DocumentArray(docs)

    train_da = _random_feature_da(num_classes=10, samples_per_class=32, dim=128)
    eval_da = _random_feature_da(num_classes=10, samples_per_class=32, dim=128)
    return train_da, eval_da


@pytest.fixture()
def finetuner_mocker(mocker):
    """Patch hubble login/auth, log in, and yield the global finetuner session."""

    def hubble_login_mocker(force: bool = False, post_success=None, **kwargs):
        print('Successfully logged in to Hubble!')
        if post_success:
            post_success()

    def get_auth_token():
        # Integration tests need a real token from the environment.
        if not os.environ.get('JINA_AUTH_TOKEN'):
            raise ValueError('Please set `JINA_AUTH_TOKEN` as an environment variable.')
        return os.environ.get('JINA_AUTH_TOKEN')

    mocker.patch.object(hubble, 'login', hubble_login_mocker)
    mocker.patch.object(hubble.Auth, 'get_auth_token', get_auth_token)

    finetuner.login()

    yield finetuner.ft


@pytest.fixture()
def synthesis_query_data():
    """Artifact name of the query DocumentArray used by synthesis runs."""
    return 'finetuner/xmarket_queries_da_s'
# --- tests/integration/test_data.py ------------------------------------------
import pytest

from finetuner import build_model
from finetuner.data import build_encoding_dataset


@pytest.mark.parametrize(
    'data, model_name, modality',
    [
        (['text1', 'text2'], 'bert-base-en', 'text'),
        (['image1', 'image2', 'image3'], 'resnet-base', 'vision'),
        (['text1', 'text2'], 'clip-base-en', 'text'),
        (['image1', 'image2', 'image3'], 'clip-base-en', 'vision'),
    ],
)
def test_build_encoding_dataset_str(data, model_name, modality):
    """Raw strings are mapped onto `.text` (text) or `.uri` (vision) fields."""
    # FIX: only CLIP backbones expose sub-models to select; non-CLIP models
    # (bert-base-en, resnet-base) must be built with select_model=None, as in
    # the unit tests for `build_model`.
    select_model = f'clip-{modality}' if model_name.startswith('clip') else None
    model = build_model(name=model_name, select_model=select_model)
    da = build_encoding_dataset(model=model, data=data)
    for doc, expected in zip(da, data):
        if modality == 'text':
            assert doc.text == expected
        else:
            assert doc.uri == expected


# --- tests/integration/test_experiments.py ------------------------------------
from tests.helper import create_random_name


def test_experiments(finetuner_mocker):
    """Create, list and delete experiments end to end."""
    first_exp_name, second_exp_name = [create_random_name() for _ in range(2)]

    # create an experiment and retrieve it
    finetuner_mocker.create_experiment(name=first_exp_name)
    exp1 = finetuner_mocker.get_experiment(name=first_exp_name)
    assert exp1.name == first_exp_name
    assert exp1.status == 'ACTIVE'

    # create another experiment and list all experiments
    finetuner_mocker.create_experiment(second_exp_name)
    experiments = finetuner_mocker.list_experiments()
    experiment_names = [experiment.name for experiment in experiments]
    assert first_exp_name in experiment_names and second_exp_name in experiment_names

    for experiment in experiments:
        assert experiment.status == 'ACTIVE'

    # delete the first experiment
    finetuner_mocker.delete_experiment(first_exp_name)
    experiments = finetuner_mocker.list_experiments()
    assert second_exp_name in [experiment.name for experiment in experiments]

    # delete all experiments
    finetuner_mocker.delete_experiment(second_exp_name)
    experiments = finetuner_mocker.list_experiments()
    assert second_exp_name not in [experiment.name for experiment in experiments]
    # clear experiments
    finetuner_mocker.delete_experiments()


# --- tests/integration/test_hf_models.py --------------------------------------
@pytest.mark.parametrize(
    'model',
    [
        'jinaai/jina-embedding-s-en-v1',
    ],
)
def test_build_model(model):
    """Hugging Face hub models can be built by repo id."""
    model = build_model(name=model)
    assert model


# --- tests/integration/test_runs.py --------------------------------------------
import os

import numpy as np
from docarray import DocumentArray

import finetuner
from finetuner.constants import FAILED, FINISHED, STATUS
from finetuner.model import synthesis_model_en


def _wait_for_terminal_status(run, max_checks=6 * 20, interval=10):
    """Poll `run` until it reaches FAILED/FINISHED or ~20 minutes elapse.

    FIX: this loop (including the function-local `import time`) was duplicated
    verbatim in two tests; it now lives in one place.

    :param run: The finetuner run to poll.
    :param max_checks: Maximum number of status polls.
    :param interval: Seconds to sleep between polls.
    :return: The last observed status string.
    """
    import time

    status = run.status()[STATUS]
    for _ in range(max_checks):
        if status in [FAILED, FINISHED]:
            break
        time.sleep(interval)
        status = run.status()[STATUS]
    return status


def test_runs(finetuner_mocker, get_feature_data):
    """Create, list and delete training runs inside a fresh experiment."""
    experiment_name = create_random_name()

    # get preprocessed data
    train_data, eval_data = get_feature_data

    # create an experiment and retrieve it
    finetuner_mocker.create_experiment(experiment_name)
    experiment = finetuner_mocker.get_experiment(name=experiment_name)
    assert experiment.name == experiment_name
    assert experiment.status == 'ACTIVE'

    # Create Runs
    first_run, second_run = [create_random_name(prefix='run') for _ in range(2)]

    # create a first run
    finetuner_mocker.create_training_run(
        model='mlp',
        model_options={'input_size': 128, 'hidden_sizes': [32]},
        train_data=train_data,
        eval_data=eval_data,
        experiment_name=experiment_name,
        run_name=first_run,
        loss='TripletMarginLoss',
        optimizer='Adam',
        learning_rate=1e-3,
        batch_size=12,
        epochs=2,
        device='cpu',
    )

    # get the first run
    run = finetuner_mocker.get_run(experiment_name=experiment_name, run_name=first_run)
    assert run.name == first_run

    # create another run
    finetuner_mocker.create_training_run(
        model='mlp',
        model_options={'input_size': 128, 'hidden_sizes': [32]},
        train_data=train_data,
        eval_data=eval_data,
        experiment_name=experiment_name,
        run_name=second_run,
        loss='TripletMarginLoss',
        optimizer='Adam',
        learning_rate=1e-3,
        batch_size=12,
        epochs=1,
        device='cpu',
    )

    # list all runs
    runs = finetuner_mocker.list_runs(experiment_name=experiment_name)
    assert len(runs) == 2
    run_names = [run.name for run in runs]
    assert first_run in run_names and second_run in run_names

    # delete the first run
    finetuner_mocker.delete_run(experiment_name=experiment_name, run_name=first_run)
    runs = finetuner_mocker.list_runs(experiment_name=experiment_name)
    assert len(runs) == 1

    # delete all existing runs
    finetuner_mocker.delete_runs(experiment_name=experiment_name)
    runs = finetuner_mocker.list_runs(experiment_name=experiment_name)
    assert not runs

    # delete experiment
    finetuner_mocker.delete_experiment(experiment_name)
    experiments = finetuner_mocker.list_experiments()
    assert experiment_name not in [experiment.name for experiment in experiments]


@pytest.mark.parametrize('use_onnx', [True, False])
def test_create_training_run_and_save_model(
    finetuner_mocker, get_feature_data, tmp_path, use_onnx
):
    """Train a tiny MLP, wait for completion, download and use the artifact."""
    train_da, test_da = get_feature_data
    experiment_name = create_random_name()
    finetuner_mocker.create_experiment(name=experiment_name)
    run = finetuner_mocker.create_training_run(
        model='mlp',
        model_options={'input_size': 128, 'hidden_sizes': [32]},
        train_data=train_da,
        loss='TripletMarginLoss',
        optimizer='Adam',
        learning_rate=0.001,
        batch_size=12,
        epochs=2,
        experiment_name=experiment_name,
        to_onnx=use_onnx,
        device='cpu',
    )

    # wait for up to 20 minutes for the run to finish
    status = _wait_for_terminal_status(run)
    assert status == FINISHED

    artifact_id = run.artifact_id
    assert isinstance(artifact_id, str)
    # the artifact id is a 24 character hex string defined in mongo db.
    assert len(artifact_id) == 24

    artifact = run.save_artifact(directory=tmp_path / 'finetuned_model')
    assert os.path.exists(tmp_path / 'finetuned_model')

    # encode and check the embeddings
    model = finetuner.get_model(artifact=artifact, is_onnx=use_onnx)
    finetuner.encode(model=model, data=test_da)
    assert test_da.embeddings is not None
    assert isinstance(test_da.embeddings, np.ndarray)

    # delete created experiments (and runs)
    finetuner_mocker.delete_experiment(experiment_name)
    experiments = finetuner_mocker.list_experiments()
    assert experiment_name not in [experiment.name for experiment in experiments]


def test_create_synthesis_run_and_save_data(
    finetuner_mocker, synthesis_query_data, synthesis_corpus_data
):
    """Run data synthesis, wait for completion, and pull the produced data."""
    experiment_name = create_random_name()
    finetuner_mocker.create_experiment(name=experiment_name)
    run = finetuner_mocker.create_synthesis_run(
        query_data=synthesis_query_data,
        corpus_data=synthesis_corpus_data,
        models=synthesis_model_en,
        num_relations=3,
        experiment_name=experiment_name,
    )

    # wait for up to 20 minutes for the run to finish
    status = _wait_for_terminal_status(run)
    assert status == FINISHED

    train_data = run.train_data
    assert isinstance(train_data, str)
    train_data = DocumentArray.pull(train_data)

    for doc in train_data['@c']:
        assert doc.content is not None

    # delete created experiments (and runs)
    finetuner_mocker.delete_experiment(experiment_name)
    experiments = finetuner_mocker.list_experiments()
    assert experiment_name not in [experiment.name for experiment in experiments]
current_dir = os.path.dirname(os.path.abspath(__file__))


@pytest.fixture
def client_mocker(mocker):
    """A client whose request layer echoes back what would have been sent."""
    return create_request_mocker(mocker)


@pytest.fixture
def finetuner_mocker(mocker):
    """A Finetuner instance wired to a fully mocked API client."""
    mocked_client = create_client_mocker(mocker)
    ft = Finetuner()
    ft._client = mocked_client
    ft._default_experiment = ft._get_default_experiment()
    return ft
32 | mocker.patch.object(hubble, 'login', hubble_login_mocker) 33 | mocker.patch.object(hubble.Auth, 'get_auth_token', get_auth_token) 34 | mocker.patch.object(docarray.DocumentArray, 'push', _return_args) 35 | hubble.login() 36 | client = FinetunerV1Client() 37 | mocker.patch.object(client, 'hubble_user_id', HUBBLE_USER_TEST_ID) 38 | return client 39 | 40 | 41 | def _return_args(*_, **kwargs): 42 | return kwargs 43 | 44 | 45 | def create_request_mocker(mocker): 46 | base_mocker = _create_base_mocker(mocker) 47 | mocker.patch.object(base_mocker, '_handle_request', _return_args) 48 | return base_mocker 49 | 50 | 51 | def create_client_mocker(mocker): 52 | def return_experiment(**kwargs): 53 | name = kwargs.get(NAME) or 'experiment name' 54 | return { 55 | STATUS: 'ACTIVE', 56 | NAME: name, 57 | DESCRIPTION: 'description', 58 | CREATED_AT: 'some time', 59 | } 60 | 61 | def return_experiments(**_): 62 | names = ['first experiment', 'second experiment'] 63 | return { 64 | 'items': [return_experiment(name=name) for name in names], 65 | 'total': 0, 66 | 'page': 1, 67 | 'size': len(names), 68 | } 69 | 70 | def return_status(**_): 71 | return { 72 | 'status': random.choice([CREATED, STARTED, FINISHED, FAILED]), 73 | 'details': '', 74 | } 75 | 76 | def return_run(**kwargs): 77 | name = kwargs.get(RUN_NAME) or 'run name' 78 | config = kwargs.get('run_config') or {} 79 | return { 80 | NAME: name, 81 | CONFIG: config, 82 | DESCRIPTION: 'description', 83 | CREATED_AT: 'some time', 84 | } 85 | 86 | def return_runs(**_): 87 | names = ['first run', 'second run'] 88 | return { 89 | 'items': [return_run(run_name=name) for name in names], 90 | 'total': 0, 91 | 'page': 1, 92 | 'size': len(names), 93 | } 94 | 95 | base_mocker = _create_base_mocker(mocker) 96 | 97 | mocker.patch.object(base_mocker, 'create_experiment', return_experiment) 98 | mocker.patch.object(base_mocker, 'get_experiment', return_experiment) 99 | mocker.patch.object(base_mocker, 'delete_experiment', 
return_experiment) 100 | mocker.patch.object(base_mocker, 'list_experiments', return_experiments) 101 | mocker.patch.object(base_mocker, 'delete_experiments', return_experiments) 102 | mocker.patch.object(base_mocker, 'get_run_status', return_status) 103 | mocker.patch.object(base_mocker, 'get_run', return_run) 104 | mocker.patch.object(base_mocker, 'create_run', return_run) 105 | mocker.patch.object(base_mocker, 'delete_run', return_run) 106 | mocker.patch.object(base_mocker, 'list_runs', return_runs) 107 | mocker.patch.object(base_mocker, 'delete_runs', return_runs) 108 | 109 | return base_mocker 110 | -------------------------------------------------------------------------------- /tests/unit/resources/cube.off: -------------------------------------------------------------------------------- 1 | OFF 2 | 8 6 0 3 | 1.0 0.0 1.0 4 | 0.0 1.0 1.0 5 | -1.0 0.0 1.0 6 | 0.0 -1.0 1.0 7 | 1.0 0.0 0.0 8 | 0.0 1.0 0.0 9 | -1.0 0.0 0.0 10 | 0.0 -1.0 0.0 11 | 4 0 1 2 3 12 | 4 7 4 0 3 13 | 4 4 5 1 0 14 | 4 5 6 2 1 15 | 4 3 2 6 7 16 | 4 6 5 4 7 17 | 18 | -------------------------------------------------------------------------------- /tests/unit/resources/dummy.csv: -------------------------------------------------------------------------------- 1 | This is an English sentence,Das ist ein englischer Satz 2 | This is another English sentence,Dies ist ein weiterer englischer Satz -------------------------------------------------------------------------------- /tests/unit/resources/lena.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/finetuner/69ae77cb51c736e791126792de20015a70658b53/tests/unit/resources/lena.png -------------------------------------------------------------------------------- /tests/unit/test___init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from _finetuner.excepts import SelectModelRequired 4 | 
@pytest.mark.parametrize(
    'descriptor, select_model, is_onnx, expect_error',
    [
        ('bert-base-en', None, False, None),
        ('bert-base-en', None, True, None),
        ('clip-base-en', 'clip-text', False, None),
        ('clip-base-en', 'clip-vision', False, None),
        ('clip-base-en', None, False, SelectModelRequired),
        ('MADE UP MODEL', None, False, ValueError),
    ],
)
def test_build_model(descriptor, select_model, is_onnx, expect_error):
    """`build_model` yields the right engine type, or raises for bad input."""
    if expect_error:
        with pytest.raises(expect_error):
            finetuner.build_model(
                name=descriptor,
                select_model=select_model,
                is_onnx=is_onnx,
            )
        return

    model = finetuner.build_model(
        name=descriptor, select_model=select_model, is_onnx=is_onnx
    )
    expected_engine = (
        ONNXRuntimeInferenceEngine if is_onnx else TorchInferenceEngine
    )
    assert isinstance(model, expected_engine)


@pytest.mark.parametrize('is_onnx', [True, False])
def test_build_model_embedding(is_onnx):
    """Encoding a DocumentArray with a built model fills numpy embeddings."""
    model = finetuner.build_model(name='bert-base-cased', is_onnx=is_onnx)

    docs = DocumentArray(Document(text='TEST TEXT'))
    finetuner.encode(model=model, data=docs)
    assert docs.embeddings is not None
    assert isinstance(docs.embeddings, np.ndarray)


def test_embedding_with_list():
    """Encoding a plain list of strings matches encoding a DocumentArray."""
    model = finetuner.build_model(name='bert-base-cased')

    docs = DocumentArray(Document(text='TEST TEXT'))
    texts = ['TEST TEXT']
    da_embeddings = finetuner.encode(model=model, data=docs)
    lst_embeddings = finetuner.encode(model=model, data=texts)

    for expected, actual in zip(da_embeddings.embeddings, lst_embeddings):
        assert np.array_equal(expected, actual)
from finetuner.constants import (
    API_VERSION,
    CONFIG,
    DELETE,
    EXAMPLES,
    EXPERIMENTS,
    GET,
    LOGS,
    METRICS,
    NAME,
    POST,
    RUNS,
    STATUS,
    SYNTHESIS_TASK,
    TRAINING_TASK,
)
from finetuner.experiment import Experiment
from finetuner.model import synthesis_model_en


def _experiments_url(client, *segments):
    """URL the client is expected to hit under the experiments endpoint."""
    return client._construct_url(
        client._base_url, API_VERSION, EXPERIMENTS, *segments
    )


def test_create_experiment(client_mocker, name='name'):
    response = client_mocker.create_experiment(name)
    # Collection endpoints carry a trailing slash.
    assert response['url'] == _experiments_url(client_mocker) + '/'
    assert response['method'] == POST
    assert response['json_data'][NAME] == name


def test_get_experiment(client_mocker, name='name'):
    sent_request = client_mocker.get_experiment(name)
    assert sent_request['url'] == _experiments_url(client_mocker, name)
    assert sent_request['method'] == GET


def test_list_experiments(client_mocker):
    sent_request = client_mocker.list_experiments()
    assert sent_request['url'] == _experiments_url(client_mocker) + '/'
    assert sent_request['method'] == GET


def test_delete_experiment(client_mocker, name='name'):
    sent_request = client_mocker.delete_experiment(name)
    assert sent_request['url'] == _experiments_url(client_mocker, name)
    assert sent_request['method'] == DELETE


def test_delete_experiments(client_mocker):
    sent_request = client_mocker.delete_experiments()
    assert sent_request['url'] == _experiments_url(client_mocker) + '/'
    assert sent_request['method'] == DELETE


def test_create_training_run(client_mocker, experiment_name='exp', run_name='run'):
    config = Experiment._create_finetuning_config(
        model='resnet50',
        train_data='data name',
        experiment_name=experiment_name,
        run_name=run_name,
    )
    sent_request = client_mocker.create_run(
        experiment_name=experiment_name,
        run_name=run_name,
        run_config=config,
        task=TRAINING_TASK,
        device='cpu',
        cpus=1,
        gpus=1,
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS
    )
    assert sent_request['method'] == POST
    assert sent_request['json_data'][NAME] == run_name
    assert sent_request['json_data'][CONFIG] == config


def test_create_synthesis_run(client_mocker, experiment_name='exp', run_name='run'):
    config = Experiment._create_synthesis_config(
        query_data='query_data_name',
        corpus_data='corpus_data_name',
        models=synthesis_model_en,
        num_relations=3,
        experiment_name=experiment_name,
        run_name=run_name,
    )
    sent_request = client_mocker.create_run(
        experiment_name=experiment_name,
        run_name=run_name,
        run_config=config,
        task=SYNTHESIS_TASK,
        device='cpu',
        cpus=1,
        gpus=1,
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS
    )
    assert sent_request['method'] == POST
    assert sent_request['json_data'][NAME] == run_name
    assert sent_request['json_data'][CONFIG] == config


def test_get_run(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.get_run(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name
    )
    assert sent_request['method'] == GET


def test_delete_run(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.delete_run(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name
    )
    assert sent_request['method'] == DELETE


def test_delete_runs(client_mocker, experiment_name='exp'):
    sent_request = client_mocker.delete_runs(experiment_name=experiment_name)
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS
    )
    assert sent_request['method'] == DELETE


def test_get_run_status(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.get_run_status(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name, STATUS
    )
    assert sent_request['method'] == GET


def test_get_run_logs(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.get_run_logs(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name, LOGS
    )
    assert sent_request['method'] == GET


def test_get_run_metrics(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.get_run_metrics(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name, METRICS
    )
    assert sent_request['method'] == GET


def test_get_run_examples(client_mocker, experiment_name='exp', run_name='run1'):
    sent_request = client_mocker.get_run_examples(
        experiment_name=experiment_name, run_name=run_name
    )
    assert sent_request['url'] == _experiments_url(
        client_mocker, experiment_name, RUNS, run_name, EXAMPLES
    )
    assert sent_request['method'] == GET
Experiment 55 | from finetuner.model import synthesis_model_en 56 | 57 | 58 | @pytest.fixture 59 | def experiment(finetuner_mocker): 60 | experiment = Experiment( 61 | client=finetuner_mocker._client, 62 | name='experiment name', 63 | status='ACTIVE', 64 | created_at='some time', 65 | description='test description', 66 | ) 67 | return experiment 68 | 69 | 70 | def test_get_experiment_name(experiment): 71 | assert experiment.name == 'experiment name' 72 | 73 | 74 | def test_get_run(experiment): 75 | run = experiment.get_run(name='run name') 76 | assert run.name == 'run name' 77 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 78 | 79 | 80 | def test_list_runs(experiment): 81 | runs = experiment.list_runs() 82 | # depends on `return_runs` in `unit/conftest.py` 83 | assert len(runs) == 2 84 | for run, expected_name in zip(runs, ['first run', 'second run']): 85 | assert run.name == expected_name 86 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 87 | 88 | 89 | def test_create_training_run(experiment): 90 | data = docarray.DocumentArray().empty(1) 91 | run_name = 'run1' 92 | data_name = f'{DA_PREFIX}-{experiment.name}-{run_name}-train' 93 | run = experiment.create_training_run( 94 | model='resnet50', 95 | model_options={}, 96 | train_data=data, 97 | run_name=run_name, 98 | ) 99 | expected_config = Experiment._create_finetuning_config( 100 | model='resnet50', 101 | model_options={}, 102 | train_data=data_name, 103 | experiment_name=experiment.name, 104 | run_name=run_name, 105 | ) 106 | assert run.name == run_name 107 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 108 | assert run.config == expected_config 109 | 110 | 111 | def test_create_synthesis_run(experiment): 112 | query_data = docarray.DocumentArray().empty(1) 113 | corpus_data = docarray.DocumentArray().empty(2) 114 | run_name = 'run1' 115 | query_data_name = f'{DA_PREFIX}-{experiment.name}-{run_name}-query' 116 | corpus_data_name = 
f'{DA_PREFIX}-{experiment.name}-{run_name}-corpus' 117 | run = experiment.create_synthesis_run( 118 | query_data=query_data, 119 | corpus_data=corpus_data, 120 | models=synthesis_model_en, 121 | num_relations=3, 122 | run_name=run_name, 123 | ) 124 | expected_config = Experiment._create_synthesis_config( 125 | query_data=query_data_name, 126 | corpus_data=corpus_data_name, 127 | models=synthesis_model_en, 128 | num_relations=3, 129 | experiment_name=experiment.name, 130 | run_name=run_name, 131 | ) 132 | assert run.name == run_name 133 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 134 | assert run.config == expected_config 135 | 136 | 137 | def test_create_training_run_config(): 138 | expected_config = { 139 | MODEL: { 140 | NAME: 'resnet50', 141 | ARTIFACT: None, 142 | FREEZE: False, 143 | OUTPUT_DIM: None, 144 | OPTIONS: None, 145 | ONNX: False, 146 | }, 147 | DATA: { 148 | TRAIN_DATA: 'train_data', 149 | EVAL_DATA: 'eval_data', 150 | EVALUATE: False, 151 | NUM_WORKERS: 8, 152 | NUM_ITEMS_PER_CLASS: 4, 153 | VAL_SPLIT: 0.0, 154 | SAMPLER: 'auto', 155 | }, 156 | HYPER_PARAMETERS: { 157 | LOSS: 'TripletMarginLoss', 158 | LOSS_OPTIONS: None, 159 | OPTIMIZER: 'Adam', 160 | OPTIMIZER_OPTIONS: {'weight_decay': 0.01}, 161 | MINER: 'TripletMarginMiner', 162 | MINER_OPTIONS: {'margin': 0.3}, 163 | BATCH_SIZE: 8, 164 | LEARNING_RATE: 0.001, 165 | EPOCHS: 20, 166 | SCHEDULER: 'linear', 167 | SCHEDULER_OPTIONS: { 168 | 'num_training_steps': 'auto', 169 | 'num_warmup_steps': 2, 170 | 'scheduler_step': 'batch', 171 | }, 172 | LOSS_OPTIMIZER: None, 173 | LOSS_OPTIMIZER_OPTIONS: None, 174 | }, 175 | CALLBACKS: [ 176 | { 177 | NAME: 'TrainingCheckpoint', 178 | OPTIONS: { 179 | 'last_k_epochs': 2, 180 | }, 181 | } 182 | ], 183 | EXPERIMENT_NAME: 'exp name', 184 | PUBLIC: False, 185 | RUN_NAME: 'run name', 186 | } 187 | config = Experiment._create_finetuning_config( 188 | model='resnet50', 189 | train_data='train_data', 190 | experiment_name='exp name', 191 
| run_name='run name', 192 | eval_data='eval_data', 193 | description=None, 194 | loss='TripletMarginLoss', 195 | miner='TripletMarginMiner', 196 | miner_options={'margin': 0.3}, 197 | optimizer='Adam', 198 | optimizer_options={'weight_decay': 0.01}, 199 | learning_rate=0.001, 200 | epochs=20, 201 | batch_size=8, 202 | callbacks=[TrainingCheckpoint(last_k_epochs=2)], 203 | scheduler='linear', 204 | scheduler_options={ 205 | 'num_warmup_steps': 2, 206 | 'scheduler_step': 'batch', 207 | }, 208 | freeze=False, 209 | output_dim=None, 210 | multi_modal=False, 211 | device='cuda', 212 | ) 213 | assert config == expected_config 214 | 215 | 216 | def test_create_synthesis_run_config(): 217 | expected_config = { 218 | RAW_DATA_CONFIG: { 219 | QUERIES: 'query_data', 220 | CORPUS: 'corpus_data', 221 | }, 222 | RELATION_MINING: { 223 | MODELS: [synthesis_model_en.relation_miner], 224 | NUM_RELATIONS: 3, 225 | }, 226 | CROSS_ENCODER: synthesis_model_en.cross_encoder, 227 | MAX_NUM_DOCS: None, 228 | EXPERIMENT_NAME: 'exp name', 229 | PUBLIC: False, 230 | RUN_NAME: 'run name', 231 | } 232 | 233 | config = Experiment._create_synthesis_config( 234 | train_data='train_data', 235 | experiment_name='exp name', 236 | models=synthesis_model_en, 237 | run_name='run name', 238 | query_data='query_data', 239 | corpus_data='corpus_data', 240 | num_relations=3, 241 | ) 242 | 243 | assert config == expected_config 244 | -------------------------------------------------------------------------------- /tests/unit/test_finetuner.py: -------------------------------------------------------------------------------- 1 | import docarray 2 | import pytest 3 | 4 | from finetuner.constants import CREATED, FAILED, FINISHED, STARTED, STATUS 5 | from finetuner.model import synthesis_model_en 6 | 7 | 8 | @pytest.mark.parametrize( 9 | 'experiment_name', 10 | ['exp name', None], 11 | ) 12 | def test_create_experiment(finetuner_mocker, experiment_name): 13 | if experiment_name: 14 | experiment = 
finetuner_mocker.create_experiment(name=experiment_name) 15 | else: 16 | experiment = finetuner_mocker.create_experiment() 17 | expected_name = experiment_name or 'default' 18 | assert experiment.name == expected_name 19 | assert experiment._status == 'ACTIVE' 20 | 21 | 22 | def test_get_experiment(finetuner_mocker, experiment_name='exp_name'): 23 | experiment = finetuner_mocker.get_experiment(name=experiment_name) 24 | assert experiment.name == experiment_name 25 | 26 | 27 | def test_list_experiments(finetuner_mocker): 28 | experiments = finetuner_mocker.list_experiments() 29 | # depends on `return_experiments` in `unit/conftest.py` 30 | assert len(experiments) == 2 31 | assert experiments[0].name == 'first experiment' 32 | assert experiments[1].name == 'second experiment' 33 | 34 | 35 | @pytest.mark.parametrize( 36 | 'experiment_name', 37 | ['exp name', None], 38 | ) 39 | def test_create_training_run(finetuner_mocker, experiment_name): 40 | data = docarray.DocumentArray().empty(1) 41 | run_name = 'run1' 42 | exp_name = experiment_name or 'default' 43 | run = finetuner_mocker.create_training_run( 44 | model='resnet50', 45 | train_data=data, 46 | run_name=run_name, 47 | experiment_name=experiment_name, 48 | ) 49 | assert run.name == run_name 50 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 51 | assert run._experiment_name == exp_name 52 | 53 | 54 | @pytest.mark.parametrize( 55 | 'experiment_name', 56 | ['exp name', None], 57 | ) 58 | def test_create_synthesis_run(finetuner_mocker, experiment_name): 59 | data = docarray.DocumentArray().empty(1) 60 | run_name = 'run1' 61 | exp_name = experiment_name or 'default' 62 | run = finetuner_mocker.create_synthesis_run( 63 | query_data=data, 64 | corpus_data=data, 65 | models=synthesis_model_en, 66 | num_relations=3, 67 | run_name=run_name, 68 | experiment_name=experiment_name, 69 | ) 70 | assert run.name == run_name 71 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 72 | assert 
run._experiment_name == exp_name 73 | 74 | 75 | @pytest.mark.parametrize( 76 | 'experiment_name', 77 | ['exp name', None], 78 | ) 79 | def test_get_run(finetuner_mocker, experiment_name): 80 | run = finetuner_mocker.get_run(run_name='run_name', experiment_name=experiment_name) 81 | exp_name = experiment_name or 'default' 82 | assert run.name == 'run_name' 83 | assert run._experiment_name == exp_name 84 | -------------------------------------------------------------------------------- /tests/unit/test_hubble.py: -------------------------------------------------------------------------------- 1 | import docarray 2 | 3 | from finetuner.constants import DA_PREFIX 4 | from finetuner.hubble import push_training_data 5 | 6 | 7 | def test_push_training_data(client_mocker, experiment_name='exp', run_name='run'): 8 | train_data = docarray.DocumentArray.empty(10) 9 | eval_data = query_data = docarray.DocumentArray.empty(5) 10 | index_data = None 11 | 12 | train_name, eval_name, query_name, index_name = push_training_data( 13 | experiment_name=experiment_name, 14 | run_name=run_name, 15 | train_data=train_data, 16 | eval_data=eval_data, 17 | query_data=query_data, 18 | index_data=index_data, 19 | ) 20 | assert train_name == f'{DA_PREFIX}-{experiment_name}-{run_name}-train' 21 | assert eval_name == query_name == f'{DA_PREFIX}-{experiment_name}-{run_name}-eval' 22 | assert not index_name 23 | -------------------------------------------------------------------------------- /tests/unit/test_run.py: -------------------------------------------------------------------------------- 1 | from finetuner.constants import CREATED, FAILED, FINISHED, STARTED, STATUS 2 | from finetuner.run import Run 3 | 4 | 5 | def test_run_obj(finetuner_mocker): 6 | test_config = {'type': 'test'} 7 | run = Run( 8 | client=finetuner_mocker._client, 9 | name='run name', 10 | experiment_name='exp name', 11 | config=test_config, 12 | created_at='some time', 13 | description='description', 14 | ) 15 | 16 | 
assert run.name == 'run name' 17 | assert run.status()[STATUS] in [CREATED, STARTED, FINISHED, FAILED] 18 | assert run.config == test_config 19 | --------------------------------------------------------------------------------