├── .github ├── .README_images │ ├── 0a8863abb3fcee182e1fe8fe46c47b7a.gif │ ├── d4165abd.png │ └── ed2907cd11ac26a2a3a2555f16071d13.gif ├── ci.sh ├── dependabot.yml ├── perf-script.sh ├── performance.txt ├── util │ └── pull_dataset.py └── workflows │ ├── ci.yml │ └── copyright.yml ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── audio-to-audio-search ├── .github │ └── demo.png ├── .gitignore ├── README.md ├── app.py ├── demo.html ├── executors.py ├── helper.py ├── requirements.txt ├── tests │ ├── __init__.py │ ├── data │ │ └── mp3 │ │ │ └── index │ │ │ ├── index_-Bu7YaslRW0.mp3 │ │ │ ├── index_-D--GWwca0g.mp3 │ │ │ ├── index_-nlkWWphiaM.mp3 │ │ │ ├── index_0bRUkLsttto.mp3 │ │ │ └── index_0slyl34xWug.mp3 │ ├── requirements.txt │ └── test_audio_to_audio_search.py └── vggish │ ├── mel_features.py │ ├── vggish_input.py │ ├── vggish_params.py │ ├── vggish_postprocess.py │ └── vggish_slim.py ├── cross-modal-search ├── .dockerignore ├── README.md ├── __init__.py ├── app.py ├── dataset.py ├── flows │ ├── executors.py │ ├── flow-index.yml │ └── flow-query.yml ├── get_data.sh ├── get_data30k.sh ├── requirements.txt ├── setup_run.sh ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── requirements.txt │ └── test_cross_modal_search.py ├── toy-data │ ├── captions.txt │ └── images │ │ ├── 1000268201_693b08cb0e.jpg │ │ └── 1001773457_577c3a7d70.jpg └── visualizations │ ├── cross-modal-index-flow.png │ ├── cross-modal-query-flow.png │ ├── cross-modal-result.jpg │ ├── image_results.png │ └── text_results.png ├── example-guidelines.md ├── example_template.md ├── multires-lyrics-search ├── .github │ ├── demo.gif │ ├── index.jpg │ └── search.jpg ├── .gitignore ├── README.md ├── app.py ├── flows │ ├── index.yml │ └── query.yml ├── get_data.sh ├── helper.py ├── lyrics-data │ └── lyrics-toy-data1000.csv ├── requirements.txt ├── static │ ├── index.html │ ├── jina-logo.svg │ ├── main.css │ └── vue-bindings.js └── tests │ ├── __init__.py │ ├── conftest.py │ ├── requirements.txt │ └── test_flow_integration.py ├── pytest.ini ├── wikipedia-sentences-query-while-indexing ├── .github │ └── images │ │ ├── query.svg │ │ └── storage.svg ├── .gitignore ├── README.md ├── __init__.py ├── app.py ├── data │ └── toy.txt ├── flows │ ├── query.yml │ └── storage.yml ├── get_data.sh ├── manifest.yml ├── requirements.txt └── tests │ ├── __init__.py │ ├── requirements.txt │ └── test_query_while_indexing.py └── wikipedia-sentences ├── .dockerignore ├── .github └── flow.png ├── .gitignore ├── README.md ├── app.py ├── data └── toy-input.txt ├── flows └── flow.yml ├── get_data.sh ├── requirements.txt └── tests ├── __init__.py ├── conftest.py ├── requirements.txt ├── test_wikipediasearch.py └── toy-input.txt /.github/.README_images/0a8863abb3fcee182e1fe8fe46c47b7a.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/.github/.README_images/0a8863abb3fcee182e1fe8fe46c47b7a.gif -------------------------------------------------------------------------------- /.github/.README_images/d4165abd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/.github/.README_images/d4165abd.png -------------------------------------------------------------------------------- /.github/.README_images/ed2907cd11ac26a2a3a2555f16071d13.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/.github/.README_images/ed2907cd11ac26a2a3a2555f16071d13.gif -------------------------------------------------------------------------------- /.github/ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # find all the examples with changed code 3 | # run the tests in that directory 4 | changedExamples=() 5 | 6 | for changed_file in $CHANGED_FILES; do 7 | echo changed $changed_file 8 | file_base_dir=$(echo $changed_file | cut -d/ -f1) 9 | # if the example has not yet been added 10 | if [[ ! " ${changedExamples[@]} " =~ " ${file_base_dir} " ]]; then 11 | echo adding $file_base_dir 12 | changedExamples+=(${file_base_dir}) 13 | fi 14 | done 15 | 16 | echo will run tests on ${changedExamples[@]} 17 | 18 | EXIT_CODE=0 19 | 20 | root_dir=$(pwd) 21 | # install reqs and run the tests 22 | sudo apt-get -y update && sudo apt-get install libsndfile1 ffmpeg 23 | for example_dir in ${changedExamples[@]}; do 24 | cd $root_dir/$example_dir 25 | echo running tests in $example_dir 26 | pwd 27 | if test -f "tests/requirements.txt"; then 28 | if [[ -d "tests/" ]]; then 29 | python -m venv .venv 30 | source .venv/bin/activate 31 | pip install pytest pytest-mock 32 | pip install -r tests/requirements.txt 33 | if [[ $example_dir == "wikipedia-sentences-query-while-indexing" ]]; then 34 | docker run --add-host host.docker.internal:host-gateway \ 35 | -v /var/run/docker.sock:/var/run/docker.sock \ 36 | -v /tmp/jinad:/tmp/jinad \ 37 | -p 8000:8000 \ 38 | --name jinad \ 39 | -d jinaai/jina:2.1.0-daemon 40 | sleep 5 41 | fi 42 | pytest -s -v tests/ 43 | local_exit_code=$? 44 | deactivate 45 | if [[ ! $local_exit_code == 0 ]]; then 46 | EXIT_CODE=$local_exit_code 47 | echo this one failed. local_exit_code = $local_exit_code, exit = $EXIT_CODE 48 | fi 49 | else 50 | echo 'no tests/ folder here. skipping...' 51 | fi 52 | else 53 | echo 'this is not an example. skipping...' 
54 | fi 55 | done 56 | 57 | echo final exit code = $EXIT_CODE 58 | exit $EXIT_CODE 59 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | schedule: 5 | interval: daily 6 | ignore: 7 | - dependency-name: "jina" 8 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 9 | directory: "advanced-vector-search" 10 | allow: 11 | - dependency-name: "jina" 12 | dependency-type: "direct" 13 | open-pull-requests-limit: 1 14 | 15 | - package-ecosystem: "docker" 16 | schedule: 17 | interval: daily 18 | ignore: 19 | - dependency-name: "jina" 20 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 21 | directory: "advanced-vector-search" 22 | allow: 23 | - dependency-name: "jina" 24 | dependency-type: "direct" 25 | open-pull-requests-limit: 1 26 | 27 | - package-ecosystem: "pip" 28 | schedule: 29 | interval: daily 30 | ignore: 31 | - dependency-name: "jina" 32 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 33 | directory: "audio-search" 34 | allow: 35 | - dependency-name: "jina" 36 | dependency-type: "direct" 37 | open-pull-requests-limit: 1 38 | 39 | - package-ecosystem: "docker" 40 | schedule: 41 | interval: daily 42 | ignore: 43 | - dependency-name: "jina" 44 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 45 | directory: "audio-search" 46 | allow: 47 | - dependency-name: "jina" 48 | dependency-type: "direct" 49 | open-pull-requests-limit: 1 50 | 51 | - package-ecosystem: "pip" 52 | schedule: 53 | interval: daily 54 | ignore: 55 | - dependency-name: "jina" 56 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 57 | directory: "chinese-text-search" 58 | allow: 59 | - dependency-name: "jina" 60 | dependency-type: "direct" 61 | open-pull-requests-limit: 1 62 | 63 | - package-ecosystem: "pip" 64 | schedule: 65 | interval: daily 66 | ignore: 67 | - dependency-name: "jina" 68 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 69 | directory: "cross-modal-search" 70 | allow: 71 | - dependency-name: "jina" 72 | dependency-type: "direct" 73 | open-pull-requests-limit: 1 74 | 75 | - package-ecosystem: "docker" 76 | schedule: 77 | interval: daily 78 | ignore: 79 | - dependency-name: "jina" 80 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 81 | directory: "cross-modal-search" 82 | allow: 83 | - dependency-name: "jina" 84 | dependency-type: "direct" 85 | open-pull-requests-limit: 1 86 | 87 | - package-ecosystem: "pip" 88 | schedule: 89 | interval: daily 90 | ignore: 91 | - dependency-name: "jina" 92 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 93 | directory: "fashion-example-query" 94 | allow: 95 | - dependency-name: "jina" 96 | dependency-type: "direct" 97 | open-pull-requests-limit: 1 98 | 99 | - package-ecosystem: "docker" 100 | schedule: 101 | interval: daily 102 | ignore: 103 | - dependency-name: "jina" 104 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 105 | directory: "fashion-example-query" 106 | allow: 107 | - dependency-name: "jina" 108 | dependency-type: "direct" 109 | open-pull-requests-limit: 1 110 | 111 | - package-ecosystem: "pip" 112 | schedule: 113 | interval: daily 114 | ignore: 115 | - dependency-name: "jina" 116 | update-types: 
["version-update:semver-patch", "version-update:semver-major"] 117 | directory: "multimodal-search-tirg" 118 | allow: 119 | - dependency-name: "jina" 120 | dependency-type: "direct" 121 | open-pull-requests-limit: 1 122 | 123 | - package-ecosystem: "pip" 124 | schedule: 125 | interval: daily 126 | ignore: 127 | - dependency-name: "jina" 128 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 129 | directory: "multires-lyrics-search" 130 | allow: 131 | - dependency-name: "jina" 132 | dependency-type: "direct" 133 | open-pull-requests-limit: 1 134 | 135 | - package-ecosystem: "docker" 136 | schedule: 137 | interval: daily 138 | ignore: 139 | - dependency-name: "jina" 140 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 141 | directory: "multires-lyrics-search" 142 | allow: 143 | - dependency-name: "jina" 144 | dependency-type: "direct" 145 | open-pull-requests-limit: 1 146 | 147 | - package-ecosystem: "pip" 148 | schedule: 149 | interval: daily 150 | ignore: 151 | - dependency-name: "jina" 152 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 153 | directory: "object-search" 154 | allow: 155 | - dependency-name: "jina" 156 | dependency-type: "direct" 157 | open-pull-requests-limit: 1 158 | 159 | - package-ecosystem: "docker" 160 | schedule: 161 | interval: daily 162 | ignore: 163 | - dependency-name: "jina" 164 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 165 | directory: "object-search" 166 | allow: 167 | - dependency-name: "jina" 168 | dependency-type: "direct" 169 | open-pull-requests-limit: 1 170 | 171 | - package-ecosystem: "pip" 172 | schedule: 173 | interval: daily 174 | ignore: 175 | - dependency-name: "jina" 176 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 177 | directory: "pokedex-with-bit" 178 | allow: 179 | - dependency-name: "jina" 180 | dependency-type: "direct" 181 | open-pull-requests-limit: 1 182 | 183 | - package-ecosystem: "docker" 184 | schedule: 185 | interval: daily 186 | ignore: 187 | - dependency-name: "jina" 188 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 189 | directory: "pokedex-with-bit" 190 | allow: 191 | - dependency-name: "jina" 192 | dependency-type: "direct" 193 | open-pull-requests-limit: 1 194 | 195 | - package-ecosystem: "pip" 196 | schedule: 197 | interval: daily 198 | ignore: 199 | - dependency-name: "jina" 200 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 201 | directory: "tumblr-gif-search" 202 | allow: 203 | - dependency-name: "jina" 204 | dependency-type: "direct" 205 | open-pull-requests-limit: 1 206 | 207 | - package-ecosystem: "pip" 208 | schedule: 209 | interval: daily 210 | ignore: 211 | - dependency-name: "jina" 212 | update-types: ["version-update:semver-patch", "version-update:semver-major"] 213 | directory: "wikipedia-sentences" 214 | allow: 215 | - dependency-name: "jina" 216 | dependency-type: "direct" 217 | open-pull-requests-limit: 1 218 | -------------------------------------------------------------------------------- /.github/perf-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # required for downloading data from S3 3 | pip install -e git://github.com/jina-ai/cloud-helper.git@v0.0.2#egg=jinacld_tools 4 | 5 | cd .. 6 | reqs=`find . 
-name "requirements.txt"`
folders=()
for req in $reqs; do
    module=`dirname $req`
    if test -f "$module/setup_run.sh"; then
        echo "$module has 'setup_run.sh'. will be run"
        folders+=($module)
    fi
done

# expand all array elements; a bare $folders would expand to the first one only
for folder in "${folders[@]}"; do
    cd $folder &&
    pip install -r requirements.txt && \
    bash setup_run.sh && \
    cd ..
done

if test -f "performance.txt"; then
    rm performance.txt
fi

metrics=`find . -name "metrics.txt"`
for file_m in $metrics; do
    echo `dirname $file_m` >> performance.txt &&
    cat $file_m | grep "QPS: " | grep "takes" >> performance.txt
done
--------------------------------------------------------------------------------
/.github/performance.txt:
--------------------------------------------------------------------------------
./audio-search
Flow@45181[I]:QPS: indexing 100 takes 3 seconds (3.92s)
./advanced-vector-search
Flow@38382[I]:QPS: indexing 10000 takes 1 second (1.39s)
Flow@38487[I]:QPS: query with 100 takes 32 seconds (32.71s)
--------------------------------------------------------------------------------
/.github/util/pull_dataset.py:
--------------------------------------------------------------------------------
# This script is only used by the CI Pipeline
import logging
import os

import click
from jinacld_tools.aws.services.s3 import S3Bucket


BUCKET_NAME = "jina-examples-datasets"
log = logging.getLogger(__name__)


def _check_credentials_exist():
    assert os.environ.get('AWS_ACCESS_KEY_ID') is not None, \
        'AWS_ACCESS_KEY_ID is not present in the environment variables but required for this script.'
    assert len(os.environ['AWS_ACCESS_KEY_ID']) > 0, \
        'AWS_ACCESS_KEY_ID was set in the environment but has length zero.'
    assert os.environ.get('AWS_SECRET_ACCESS_KEY') is not None, \
        'AWS_SECRET_ACCESS_KEY is not present in the environment variables but required for this script.'
    assert len(os.environ['AWS_SECRET_ACCESS_KEY']) > 0, \
        'AWS_SECRET_ACCESS_KEY was set in the environment but has length zero.'


@click.command()
@click.option(
    "--data-set",
    "-d",
    type=str,
    required=True,
    help='Path to the data-set in the S3 bucket relative to the root.'
)
@click.option(
    "--pull-to-dir",
    "-p",
    type=click.Path(exists=False),
    required=True,
    help='Directory to download the data to. Must exist beforehand.'
38 | ) 39 | def main(data_set: str, pull_to_dir: str): 40 | _check_credentials_exist() 41 | assert os.path.isdir(pull_to_dir), "The pull dir parameter must be an existing directory" 42 | save_path = os.path.join(pull_to_dir, data_set) 43 | s3 = S3Bucket(BUCKET_NAME) 44 | try: 45 | s3.get(data_set, save_path) 46 | except Exception as e: 47 | log.error(e) 48 | 49 | 50 | if __name__ == '__main__': 51 | logging.basicConfig(level=logging.INFO) 52 | main() 53 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | example-tests: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ['3.7', '3.8'] 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - id: changed-files 19 | uses: jitterbit/get-changed-files@v1 20 | continue-on-error: true 21 | - name: ci 22 | run: ./.github/ci.sh 23 | timeout-minutes: 20 24 | env: 25 | CHANGED_FILES: ${{ steps.changed-files.outputs.all }} 26 | AWS_ACCESS_KEY_ID: AKIAWB5UZPEQHHYDKVUC 27 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_DEVBOT_AKIAWB5UZPEQHHYDKVUC }} 28 | -------------------------------------------------------------------------------- /.github/workflows/copyright.yml: -------------------------------------------------------------------------------- 1 | name: Copyright 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | update-copyright: 10 | if: "!startsWith(github.event.head_commit.message, 'chore')" 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | token: ${{ secrets.JINA_DEV_BOT }} 16 | - run: | 17 | git fetch --prune --unshallow 18 | git config --local user.email "dev-bot@jina.ai" 19 | git config --local user.name "Jina Dev Bot" 20 | - uses: VinnyBabuManjaly/copyright-action@v1.0.0 21 | with: 22 | CopyrightString: '__copyright__ = "Copyright (c) 2020-2021 Jina AI Limited. All rights reserved."\n__license__ = "Apache-2.0"\n\n' 23 | FileType: '.py' 24 | - run: | 25 | git add -u 26 | git commit -m "chore: update copyright header" 27 | git status 28 | git push 29 | continue-on-error: true 30 | 31 | update-toc: 32 | if: "!startsWith(github.event.head_commit.message, 'chore')" 33 | runs-on: ubuntu-latest 34 | steps: 35 | - uses: technote-space/toc-generator@v2 36 | with: 37 | MAX_HEADER_LEVEL: 2 38 | FOLDING: false 39 | GITHUB_TOKEN: ${{ secrets.JINA_DEV_BOT }} 40 | TOC_TITLE: '**Table of Contents**' 41 | TARGET_PATHS: '.' 
42 | COMMIT_MESSAGE: 'chore(docs): update TOC' 43 | COMMIT_NAME: Jina Dev Bot 44 | COMMIT_EMAIL: dev-bot@jina.ai 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | docs/api/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | docs/.python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | .idea/ 118 | toy*.py 119 | .DS_Store 120 | post/ 121 | toy*.ipynb 122 | data/ 123 | *.c 124 | .nes_cache 125 | toy*.yml 126 | *.tmp 127 | 128 | shell/jina-wizard.sh 129 | /junit/ 130 | /tests/junit/ 131 | jina-profile*.json 132 | 133 | .vscode/ 134 | */workspace 135 | 136 | models 137 | result.html # genreated in pokedex example 138 | /advanced-vector-search/siftsmall/ 139 | /image-search/pretrained 140 | **/metrics.txt 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # [PLEASE CHECK OUR DOC WEBSITE FOR MORE COMPREHENSIVE EXAMPLES.](https://docs.jina.ai) 3 | 4 | ![image](https://user-images.githubusercontent.com/2041322/139639975-bb140208-20ad-4d10-a6b2-b5aa85466ed0.png) 5 | 6 | 7 | 8 | 9 | 10 | 11 | # Examples for Jina 12 | 13 | 14 | 15 | **Table of Contents** 16 | 17 | - [Adding Tests for Examples](#adding-tests-for-examples) 18 | - [Community](#community) 19 | - [License](#license) 20 | 21 | 22 | 23 | These examples showcase Jina in action and provide sample code for you to work from. 
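All of these examples build from the same few moving parts: `Document`s flow through a `Flow` of `Executor`s for indexing and searching. As a rough, minimal sketch of that pattern (with a toy encoder standing in for the Jina Hub executors the real examples use):

```python
import numpy as np
from jina import Document, DocumentArray, Executor, Flow, requests


class ToyEncoder(Executor):
    """Embeds each Document with a trivial one-dimensional vector."""

    @requests
    def encode(self, docs: DocumentArray, **kwargs):
        for doc in docs:
            doc.embedding = np.array([len(doc.text)], dtype=np.float32)


f = Flow().add(uses=ToyEncoder)

with f:
    f.post('/index', inputs=DocumentArray([Document(text='hello, Jina')]))
```

Each example below swaps these toy pieces for real segmenters, encoders and indexers.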
We suggest you read the following to get an overview of what Jina is and how it works:

- [What is neural search?](https://github.com/jina-ai/jina/blob/master/.github/2.0/neural-search.md)
- [Get started](https://github.com/jina-ai/jina/#build-your-first-jina-app) - especially the cookbooks for Document, Executor and Flow

## 🐣 Simple Examples

| | Example | Description |
| --- | --- | --- |
| 📄 | Semantic Wikipedia Search with Transformers and DistilBERT | Brand new to neural search? See a simple text-search example to understand how Jina works |
| 📄 | Search Lyrics with Transformers and PyTorch | Get a better understanding of chunks by searching a lyrics database. Now with shiny front-end! |
| 📄 | Find Similar Audio Clips | A simple example to show how to find similar audio clips using Jina |

## 🚀 Advanced Examples
| | Example | Description |
| --- | --- | --- |
| 📄 | Querying While Indexing in the Wikipedia Search Example | Support both querying and indexing simultaneously in our Wikipedia Search Example |
| 🖼️📄 | Cross Modal: Search images from captions and vice-versa | Use one modality (text) to search another (images) |

## Community Examples

Want to add your own example? Please check our [guidelines](example-guidelines.md)!
| | Example | Description |
| --- | --- | --- |
| 🖼️📄 | Meme Search - Text search / Image search / Front end | Search memes by caption or similar image. |
| 📄 | App Store Search | Use Transformers to search through a rich app store dataset with a responsive front-end |
| 📄 | Star Wars Question Answering System | Generate answers based on Star Wars descriptions from Wookieepedia. |

#### Legacy examples
| | Example | Description |
| --- | --- | --- |
| 📄 | Financial Question Answering Search | Opinionated QA passage retrieval with BERT-based reranker |
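The "Adding Tests for Examples" section below explains how tests are wired into CI; as a concrete sketch of the layout it describes (the example name, data and asserts here are hypothetical), a `tests/test_my_example.py` could look like:

```python
# tests/test_my_example.py -- hypothetical test following the guidelines below
from jina import Document, DocumentArray, Flow


def test_index_and_search(tmpdir):
    # a few sentences of toy test data kept in the tests folder
    docs = DocumentArray([Document(text='a small test sentence')])

    # a real test would import and reuse the example's own Flow unmodified
    f = Flow().add()

    with f:
        f.post('/index', inputs=docs)
        responses = f.post('/search', inputs=docs, return_results=True)

    # meaningful asserts on the example's input and output
    assert responses
```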
## Adding Tests for Examples

You are highly encouraged to add a test for your example so that we will be alerted if it breaks in the future:

1. Put your test data in the `tests` folder. The test data can be a few text sentences, images or audio samples
2. Create `test_[your_example].py` in the `tests` folder. Add your test cases to it with meaningful asserts depending on the example's input and output
3. Run the test locally with [pytest](https://docs.pytest.org/en/stable/contents.html) to confirm it passes before pushing
4. Add your example folder name to the `path` variable in `matrix` of `.github/workflows/ci.yml`. This will trigger your example test on creating a pull request.


### Testing Tips

- For reference, check out the `tests` folder from any of the examples in this repo.
- Try using the original example functions by importing them into the test. Avoid any modifications to the original Flow or logic.
- Use the [pytest fixture](https://docs.pytest.org/en/stable/fixture.html) `tmpdir` for temporary directories

## Community

- [Slack channel](http://slack.jina.ai) - a communication platform for developers to discuss Jina
- [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities
- [![Twitter Follow](https://img.shields.io/twitter/follow/JinaAI_?label=Follow%20%40JinaAI_&style=social)](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`
- [Company](https://jina.ai) - learn more about our company; we are fully committed to open source!


## License

Copyright (c) 2021 Jina AI Limited. All rights reserved.

Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/jina/blob/master/LICENSE) for the full license text.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/__init__.py
--------------------------------------------------------------------------------
/audio-to-audio-search/.github/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/.github/demo.png
--------------------------------------------------------------------------------
/audio-to-audio-search/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
assets/
build/
develop-eggs/
dist/
downloads/
./data/
!/tests/data/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
*.swp
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # MAC OSX 136 | .DS_Store 137 | .idea/ 138 | 139 | models/ 140 | workspace/ 141 | -------------------------------------------------------------------------------- /audio-to-audio-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Find Similar Audio Clips 4 | 5 | This example checks if the query audio clip is part of the indexed audio tracks. 6 | 7 | ![Demo](.github/demo.png) 8 | 9 | ## Prerequisites 10 | 11 | To run this example, the user is required to `cd` into the directory containing 12 | `requirements.txt` to install: 13 | 14 | ``` 15 | sudo apt-get -y update && sudo apt-get install libsndfile1 ffmpeg 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | 20 | 21 | ## Basic Usages 22 | 23 | You can run `app.py` by doing the following: 24 | 25 | ```shell 26 | python app.py index 27 | ``` 28 | 29 | By default, audio tracks in the `data/mp3/index` will be indexed. You can specify custom data path by: 30 | 31 | ``` 32 | export JINA_DATA_FILE= 33 | ``` 34 | where audios to be indexed are stored in `/index`. 35 | 36 | 37 | Then, to search the documents, do: 38 | 39 | ```shell 40 | python app.py search 41 | ``` 42 | 43 | This will generate a set of query audio clips on the fly in `data/mp3/query` (or, if you are using 44 | custom data path, in `/query`) by extracting snippets from a set of randomly sampled 45 | of index audio clips. The program then matches each query doc with the most similar index docs. 46 | 47 | The `-s` option allows user to specify which segmenter to use. `vad` uses Jinahub's VADSpeechSegmenter, and 48 | `time` uses TimeSegmenter. 49 | 50 | The `-e` option allows user to specify which encoder to use. 
`vgg` uses Jinahub's VGGishEncoder, and 51 | `clip` uses AudioCLIPEncoder. 52 | 53 | The `-t` option allows user to specify what the match threshold is. If score of match is below threshold, 54 | then it is not considered a match. 55 | 56 | 57 | 58 | ## Results and Demo 59 | 60 | 61 | 62 | Results are as follows. 63 | 64 | With `segmenter=VADSpeechSegmenter` and `encoder=AudioCLIPEncoder`: 65 | 66 | ``` 67 | +-------------+-------------+------------+ 68 | | target | prediction | is_correct | 69 | +-------------+-------------+------------+ 70 | | -0jeONf82dE | -0jeONf82dE | True | 71 | | -CXICIHCb6Y | -CXICIHCb6Y | True | 72 | | -OMB-w3LPNY | -By6I234TSs | False | 73 | | -QX2Gv7J5gY | -QX2Gv7J5gY | True | 74 | | -UKH_6moRZc | -UKH_6moRZc | True | 75 | | -ZJqu_4zLMc | -ZJqu_4zLMc | True | 76 | | -gz_moHFwl4 | -gz_moHFwl4 | True | 77 | | -i9uQMysy_A | -mKtgDnG0oM | False | 78 | | -mpapCZXors | -mpapCZXors | True | 79 | | 0KKTw8pfNjg | 0KKTw8pfNjg | True | 80 | | 0LTmV_dOmmo | 0LTmV_dOmmo | True | 81 | | 0N17tEW_WEU | 0N17tEW_WEU | True | 82 | | 0YIWrXgCjiM | 0YIWrXgCjiM | True | 83 | | 0YsC6M4GFoc | 0YsC6M4GFoc | True | 84 | | 0_O6nVfnCH8 | -sevczF5etI | False | 85 | | 0cZQm65sZjc | 0cZQm65sZjc | True | 86 | | 0jnvb2H25_Q | 0jnvb2H25_Q | True | 87 | | 0kQjfwXjFuY | -D--GWwca0g | False | 88 | | 0rbUCEM20aw | 0rbUCEM20aw | True | 89 | | 0slyl34xWug | 0slyl34xWug | True | 90 | +-------------+-------------+------------+ 91 | accuracy: 0.8 92 | ``` 93 | 94 | With `segmenter=TimeSegmenter` and `encoder=AudioCLIPEncoder`: 95 | 96 | ``` 97 | +-------------+-------------+------------+ 98 | | target | prediction | is_correct | 99 | +-------------+-------------+------------+ 100 | | -Bu7YaslRW0 | -Bu7YaslRW0 | True | 101 | | -CXICIHCb6Y | -CXICIHCb6Y | True | 102 | | -D--GWwca0g | -D--GWwca0g | True | 103 | | -OMB-w3LPNY | -OMB-w3LPNY | True | 104 | | -Z8bjo6q6jc | -CXICIHCb6Y | False | 105 | | -ZJqu_4zLMc | -ZJqu_4zLMc | True | 106 | | -_HXiz8XnV0 | -_HXiz8XnV0 | True | 107 | | -fz6omiAhZ8 | -fz6omiAhZ8 | True | 108 | | -mpapCZXors | -jaY3LS3Dv0 | False | 109 | | 05JAmKFVy44 | 05JAmKFVy44 | True | 110 | | 0YIWrXgCjiM | 0YIWrXgCjiM | True | 111 | | 0YsC6M4GFoc | 0YsC6M4GFoc | True | 112 | | 0ZN2HKsFg4A | 0ZN2HKsFg4A | True | 113 | | 0_O6nVfnCH8 | 0_O6nVfnCH8 | True | 114 | | 0cZQm65sZjc | 0cZQm65sZjc | True | 115 | | 0izHOfrwPn4 | 0izHOfrwPn4 | True | 116 | | 0qZ54ovyEWQ | 0qZ54ovyEWQ | True | 117 | | 0sYXPO7lzco | 0sYXPO7lzco | True | 118 | | 0slyl34xWug | 0slyl34xWug | True | 119 | | 0vg9qxNKXOw | 0vg9qxNKXOw | True | 120 | +-------------+-------------+------------+ 121 | accuracy: 0.9 122 | ``` 123 | 124 | With `segmenter=TimeSegmenter` and `encoder=VGGishAudioEncoder`: 125 | 126 | ``` 127 | +-------------+-------------+------------+ 128 | | target | prediction | is_correct | 129 | +-------------+-------------+------------+ 130 | | -Bu7YaslRW0 | -Bu7YaslRW0 | True | 131 | | -IvJaK7HLtQ | -IvJaK7HLtQ | True | 132 | | -OMB-w3LPNY | -OMB-w3LPNY | True | 133 | | -QX2Gv7J5gY | -QX2Gv7J5gY | True | 134 | | -UKH_6moRZc | -UKH_6moRZc | True | 135 | | -ZJqu_4zLMc | -ZJqu_4zLMc | True | 136 | | -mKtgDnG0oM | -mKtgDnG0oM | True | 137 | | -mpapCZXors | -mpapCZXors | True | 138 | | -nlkWWphiaM | -nlkWWphiaM | True | 139 | | -pUfYFcsgG4 | -pUfYFcsgG4 | True | 140 | | -sevczF5etI | -sevczF5etI | True | 141 | | 0N17tEW_WEU | 0N17tEW_WEU | True | 142 | | 0XeH2s-LzZE | 0XeH2s-LzZE | True | 143 | | 0YIWrXgCjiM | 0YIWrXgCjiM | True | 144 | | 0YsC6M4GFoc | 0YsC6M4GFoc | True | 145 | | 0bRUkLsttto | 0bRUkLsttto | True | 
146 | | 0izHOfrwPn4 | 0izHOfrwPn4 | True | 147 | | 0jFQ21A6GRA | 0jFQ21A6GRA | True | 148 | | 0sYXPO7lzco | 0sYXPO7lzco | True | 149 | | 0vg9qxNKXOw | 0vg9qxNKXOw | True | 150 | +-------------+-------------+------------+ 151 | accuracy: 1.0 152 | ``` 153 | 154 | With `segmenter=VADSpeechSegmenter` and `encoder=VGGishAudioEncoder`: 155 | 156 | ``` 157 | +-------------+-------------+------------+ 158 | | target | prediction | is_correct | 159 | +-------------+-------------+------------+ 160 | | -0jeONf82dE | -0jeONf82dE | True | 161 | | -OMB-w3LPNY | 0sYXPO7lzco | False | 162 | | -QX2Gv7J5gY | 0LTmV_dOmmo | False | 163 | | -WKYdeVL3_k | -WKYdeVL3_k | True | 164 | | -Z8bjo6q6jc | -Z8bjo6q6jc | True | 165 | | -_HXiz8XnV0 | -_HXiz8XnV0 | True | 166 | | -e4wXAy1iVo | -e4wXAy1iVo | True | 167 | | -gz_moHFwl4 | -gz_moHFwl4 | True | 168 | | -i9uQMysy_A | -i9uQMysy_A | True | 169 | | -jaY3LS3Dv0 | -jaY3LS3Dv0 | True | 170 | | -sevczF5etI | -sevczF5etI | True | 171 | | 05JAmKFVy44 | 05JAmKFVy44 | True | 172 | | 0N17tEW_WEU | 0N17tEW_WEU | True | 173 | | 0OY8XXZ98rw | -0jeONf82dE | False | 174 | | 0YIWrXgCjiM | 0YIWrXgCjiM | True | 175 | | 0rbUCEM20aw | 0rbUCEM20aw | True | 176 | | 0vg9qxNKXOw | 0vg9qxNKXOw | True | 177 | +-------------+-------------+------------+ 178 | accuracy: 0.8235294117647058 179 | ``` 180 | 181 | After searching is completed, the program will open `demo.html` where user can click 182 | to listen to the query and matched docs. 183 | -------------------------------------------------------------------------------- /audio-to-audio-search/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | from functools import partial 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | import click 9 | from jina import DocumentArray, Document, Flow 10 | 11 | from executors import TimeSegmenter, Wav2MelCrafter, DebugExecutor 12 | from helper import report_results, write_html, create_query_audios, create_docs, logger 13 | 14 | 15 | def config(): 16 | os.environ.setdefault('JINA_WORKSPACE', str(Path(__file__).parent / 'workspace')) 17 | os.environ.setdefault('JINA_DATA_FILE', str(Path(__file__).parent / 'data' / 'mp3')) 18 | os.environ.setdefault( 19 | 'JINA_WORKSPACE_MOUNT', 20 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace' 21 | ) 22 | 23 | 24 | def index(workspace: Path, data_dir: Path, flow: Flow): 25 | if workspace.exists(): 26 | shutil.rmtree(workspace) 27 | with flow: 28 | flow.post( 29 | '/index', inputs=create_docs(os.path.join(data_dir, 'index', '*.mp3')) 30 | ) 31 | 32 | 33 | def search( 34 | workspace: Path, 35 | data_dir: Path, 36 | flow: Flow, 37 | threshold: Optional[float], 38 | top_k: int, 39 | num_queries: int, 40 | ): 41 | if not workspace.exists(): 42 | raise FileNotFoundError( 43 | f'The directory {workspace} does not exist. 
Please index first via `python app.py index`' 44 | ) 45 | 46 | with flow: 47 | create_query_audios(num_queries, data_dir) 48 | responses = flow.post( 49 | '/search', 50 | inputs=create_docs(os.path.join(data_dir, 'query', '*.mp3')), 51 | return_results=True, 52 | ) 53 | 54 | result_html, accuracy = report_results(responses, threshold, top_k) 55 | write_html(str(workspace / 'demo.html'), result_html, accuracy, top_k) 56 | 57 | 58 | def validate_threshold( 59 | ctx: click.core.Option, param: click.core.Context, threshold: Optional[float] 60 | ): 61 | if threshold is not None and not 0 <= threshold <= 1: 62 | raise click.BadParameter('threshold should be between 0 and 1') 63 | 64 | 65 | @click.command() 66 | @click.argument('operation', type=click.Choice(['index', 'search'])) 67 | @click.option( 68 | '--segmenter', 69 | '-s', 70 | default='vad', 71 | type=click.Choice(['time', 'vad']), 72 | help='Specify the segmenter to use (i.e. vad or time)', 73 | ) 74 | @click.option( 75 | '--encoder', 76 | '-e', 77 | default='vgg', 78 | type=click.Choice(['vgg', 'clip']), 79 | help='Specify the encoder to use (i.e. vgg or clip)', 80 | ) 81 | @click.option( 82 | '--threshold', 83 | '-t', 84 | default=None, 85 | type=float, 86 | callback=validate_threshold, 87 | help='Specify the distance threshold for matching (between 0 to 1)', 88 | ) 89 | @click.option('--top_k', '-k', default=5, type=int, help='Specify top k for matching') 90 | @click.option( 91 | '--num_queries', 92 | '-n', 93 | default=25, 94 | type=int, 95 | help='Specify the number of querys to match', 96 | ) 97 | def cli( 98 | operation: str, 99 | segmenter: str, 100 | encoder: str, 101 | threshold: Optional[float], 102 | top_k: int, 103 | num_queries: int, 104 | ): 105 | config() 106 | 107 | data_dir = Path(os.environ["JINA_DATA_FILE"]) 108 | workspace = Path(os.environ["JINA_WORKSPACE"]) 109 | logger.info(f'data directory path: {data_dir}') 110 | logger.info(f'workspace path: {workspace}') 111 | 112 | segmenter_uses_with = {'chunk_duration': 2.5} if segmenter == 'time' else {} 113 | segmenter = {'time': TimeSegmenter, 'vad': 'jinahub://VADSpeechSegmenter'}[ 114 | segmenter 115 | ] 116 | 117 | flow = ( 118 | Flow() 119 | .add( 120 | uses=segmenter, 121 | uses_metas={'workspace': str(workspace)}, 122 | uses_with=segmenter_uses_with, 123 | ) 124 | ) 125 | 126 | if encoder == 'vgg': 127 | flow = flow.add(uses=Wav2MelCrafter) 128 | 129 | encoder = { 130 | 'clip': 'jinahub+docker://AudioCLIPEncoder', 131 | 'vgg': 'jinahub+docker://VGGishAudioEncoder', 132 | }[encoder] 133 | 134 | flow = ( 135 | flow 136 | .add(uses=encoder, uses_with={'default_traversal_paths': ['c']}) 137 | # Since matched chunks may come from the same top level query doc, 138 | # we set default_top_k to top_k * 2 so that we have sufficient information to 139 | # determine the true top k matches as a quick workaround. 
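        # For example, with top_k = 2 the chunk-level matches might belong to
        # parent docs [A, A, B, A, C]; fetching top_k * 2 = 4 chunk matches
        # still leaves two distinct parents (A and B) for SimpleRanker to
        # aggregate into doc-level matches.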
        .add(
            uses='jinahub+docker://SimpleIndexer',
            volumes=os.environ['JINA_WORKSPACE_MOUNT'],
            uses_with={
                'index_file_name': 'simple_indexer',
                'default_traversal_paths': ['c'],
                'default_top_k': top_k * 2,
            },
        )
        .add(uses=DebugExecutor)
        .add(uses='jinahub+docker://SimpleRanker', uses_metas={'workspace': str(workspace)})
    )

    {
        'index': index,
        'search': partial(
            search, threshold=threshold, top_k=top_k, num_queries=num_queries
        ),
    }[operation](workspace, data_dir, flow)


if __name__ == '__main__':
    cli()
--------------------------------------------------------------------------------
/audio-to-audio-search/demo.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Jina Audio to Audio Search!</title>
</head>
<body>
    <h2>What just happened?</h2>
    <p>
        This is Jina's <b>audio-to-audio-search</b> example. It downloads AudioSet and
        indexes 1000 10-second audio clips via the Jina search framework. We then
        randomly sample 5-second query audio clips extracted from the indexed audio
        clips as queries and ask Jina to retrieve relevant results. Below are Jina's
        retrievals, where the left-most column is the query audio.
    </p>
    <p>
        Intrigued? Learn more about Jina and check out our GitHub!
    </p>

    <h3>Precision@{% TOP_K %}: {% PRECISION_EVALUATION %}</h3>

    <table>
        <tr>
            <th>Query</th>
            <th>Top-K Results</th>
        </tr>
        {% RESULT %}
    </table>
</body>
</html>
--------------------------------------------------------------------------------
/audio-to-audio-search/executors.py:
--------------------------------------------------------------------------------
import os
from typing import Tuple, Dict, Optional

import torch
import numpy as np
import librosa as lr
import torchaudio
from jina import Executor, DocumentArray, requests, Document
from jina_commons import get_logger

from vggish.vggish_input import waveform_to_examples
from vggish.vggish_params import SAMPLE_RATE


# Raised by TimeSegmenter below; defined locally because no import provides it.
class BadDocType(TypeError):
    """Raised when a Document does not carry usable audio data."""


class Wav2MelCrafter(Executor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger = get_logger(self)

    @requests
    def segment(self, docs: Optional[DocumentArray] = None, **kwargs):
        if not docs:
            return
        for doc in docs:
            result_chunk = []
            for chunk in doc.chunks:
                mel_data = waveform_to_examples(chunk.blob, chunk.tags['sample_rate'])
                if mel_data.ndim != 3:
                    self.logger.warning(
                        f'failed to convert from wave to mel, chunk.blob: {chunk.blob.shape}, sample_rate: {SAMPLE_RATE}'
                    )
                    continue
                if mel_data.shape[0] <= 0:
                    self.logger.warning(
                        f'chunk between {chunk.location} is skipped because its duration is too short'
                    )
                if mel_data.ndim == 2:
                    mel_data = np.atleast_3d(mel_data)
                    mel_data = mel_data.reshape(1, mel_data.shape[0], mel_data.shape[1])
                chunk.blob = mel_data
                if mel_data.size > 0:
                    result_chunk.append(chunk)
            doc.chunks = result_chunk


class TimeSegmenter(Executor):
    def __init__(self, chunk_duration: int = 10, chunk_strip: int = 1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.chunk_duration = chunk_duration  # seconds
        self.strip = chunk_strip

    @requests(on=['/search', '/index'])
    def segment(
        self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
    ):
        if not docs:
            return
        for idx, doc in enumerate(docs):
            doc.blob, sample_rate = self._load_raw_audio(doc)
            doc.tags['sample_rate'] = sample_rate
            chunk_size = int(self.chunk_duration * sample_rate)
            strip = parameters.get('chunk_strip', self.strip)
            strip_size = int(strip * sample_rate)
            num_chunks = max(1, int((doc.blob.shape[0] - chunk_size) / strip_size))
            for chunk_id in range(num_chunks):
                beg = chunk_id * strip_size
                end = beg + chunk_size
                if beg > doc.blob.shape[0]:
                    break
                doc.chunks.append(
                    Document(
                        blob=doc.blob[beg:end],
                        offset=idx,
                        location=[beg, end],
                        tags=doc.tags,
                    )
                )

    def _load_raw_audio(self, doc: Document) -> Tuple[np.ndarray, int]:
        if doc.blob is not None and doc.tags.get('sample_rate', None) is None:
            raise BadDocType('data is blob but sample rate is not provided')
        elif doc.blob is not None:
            return doc.blob, int(doc.tags['sample_rate'])
        elif doc.uri is not None and doc.uri.endswith('.mp3'):
            return self._read_mp3(doc.uri)
        elif doc.uri is not None and doc.uri.endswith('.wav'):
            return self._read_wav(doc.uri)
        else:
            raise BadDocType('doc needs to have either a blob or a wav/mp3 uri')

    def _read_wav(self, file_path: str) -> Tuple[np.ndarray, int]:
        data, sample_rate = torchaudio.load(file_path)
        data = np.mean(data.cpu().numpy(), axis=0)
        return data, sample_rate

    def _read_mp3(self, file_path: str) -> Tuple[np.ndarray, int]:
        return lr.load(file_path)


class DebugExecutor(Executor):
    @requests
    def debug(self, docs: Optional[DocumentArray] = None, **kwargs):
        logger = get_logger(self)
        if not docs:
            return
        for i, doc in enumerate(docs):
            for match in doc.matches:
                # log each query file alongside the file of its match
                logger.info(f"doc {doc.tags['file']} match: {match.tags['file']}")
--------------------------------------------------------------------------------
/audio-to-audio-search/helper.py:
--------------------------------------------------------------------------------
import glob
import logging
import os
from pathlib import Path
import re
import random
import shutil
import subprocess
from typing import List, Optional

from prettytable import PrettyTable
from jina import Document, DocumentArray
from jina.types.request import Response
import webbrowser


ID_LEN = 11


def get_logger():
    """
    Method to get the app logger.
    """
    logger = logging.getLogger('app')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(message)s')
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    sh.setLevel(logging.INFO)
    logger.addHandler(sh)
    return logger


logger = get_logger()


def create_docs(filefolder_path: str):
    """
    Method to create Jina documents.

    :param filefolder_path: glob pattern matching the audio files
    """
    docs = []
    import librosa as lr

    logger.info('Creating docs..')
    for file_path in sorted(glob.glob(filefolder_path)):
        id = os.path.basename(file_path).split('.')[0]
        blob, sample_rate = lr.load(file_path)
        docs.append(
            Document(
                id=id, blob=blob, tags={'file': file_path, 'sample_rate': sample_rate}
            )
        )
    logger.info('docs created')
    return DocumentArray(docs)


def create_query_audios(num_docs: int, data_folder: Path):
    """
    Method to create query audio clips.

    :param num_docs: number of query docs
    :param data_folder: path to data folder
    """
    input_docs_folder = data_folder / 'index'
    output_docs_folder = data_folder / 'query'
    if output_docs_folder.is_dir():
        shutil.rmtree(output_docs_folder)
    output_docs_folder.mkdir()
    input_docs_filenames = glob.glob(str(input_docs_folder / '*.mp3'))

    if len(input_docs_filenames) < num_docs:
        raise FileNotFoundError(
            'cannot find sufficient '
            f'index audios clips. Number of index audio clips found: {len(input_docs_filenames)}, '
            f'number of requested query docs: {num_docs}'
        )

    for input_file in random.sample(input_docs_filenames, k=num_docs):
        id = re.match(r'index_(.*).mp3', os.path.basename(input_file))[1][-ID_LEN:]
        output_file = f"query_{id}.mp3"
        startTime = random.random() * 5
        endTime = startTime + random.random() * 4 + 3
        cmd = [
            'ffmpeg',
            '-i',
            os.path.abspath(input_file),
            '-ss',
            str(startTime),
            '-to',
            str(endTime),
            '-async',
            '1',
            output_file,
        ]
        subprocess.call(cmd, cwd=str(output_docs_folder))


def report_results(responses: List[Response], threshold: Optional[float], top_k: int):
    """
    Method to report results.

    :param responses: returned responses with data
    :param threshold: threshold for search
    :param top_k: top k number
    """
    pred_list = []
    table = PrettyTable()
    table.field_names = ['target', 'prediction', 'is_correct']
    result_html = []
    for i, response in enumerate(responses):
        for j, doc in enumerate(response.docs):
            if not doc.matches:
                continue
            match = doc.matches[0]
            target_result = os.path.basename(doc.tags["file"]).split('.')[0][-ID_LEN:]
            pred_result = os.path.basename(match.tags["file"]).split('.')[0][-ID_LEN:]
            pred_result = (
                pred_result
                if threshold is None or 1 - match.scores['cosine'].value > threshold
                else 'None'
            )
            table.add_row([target_result, pred_result, target_result == pred_result])
            pred_list.append(target_result == pred_result)

            # NOTE: reconstructed markup (the original snippet was garbled):
            # a row opens with the query cell, match cells follow, and the row
            # is closed after the match loop, matching the table in demo.html.
            query_html = f"""
            <tr>
                <td>
                    <audio controls src="{doc.tags['file']}"></audio>
                </td>
            """
            seen = set()
            result_html.append(f'{query_html}')
            for k, match in enumerate(doc.matches):
                if len(seen) >= top_k:
                    break
                if match.tags['file'] in seen:
                    continue
                seen.add(match.tags['file'])
                match_html = f"""
                <td>
                    <audio controls src="{match.tags['file']}"></audio>
                </td>
                """
                result_html.append(match_html)
            result_html.append('</tr>\n')

    logger.info(table)

    if not pred_list:
        return [], float('nan')

    accuracy = sum(pred_list) / len(pred_list)
    logger.info(f'accuracy: {accuracy}')
    return result_html, accuracy


def write_html(html_path: str, result_html: str, accuracy: float, top_k: int):
    """
    Method to present results in browser.
164 | 165 | :param html_path: path of the written html 166 | :param result_html: content of html to be written 167 | :param accuracy: accuracy of search 168 | :param top_k: top k number 169 | """ 170 | with open( 171 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 'demo.html') 172 | ) as fp, open(html_path, 'w') as fw: 173 | t = fp.read() 174 | t = t.replace('{% RESULT %}', '\n'.join(result_html)) 175 | t = t.replace( 176 | '{% PRECISION_EVALUATION %}', 177 | '{:.2f}%'.format(accuracy * 100.0), 178 | ) 179 | t = t.replace('{% TOP_K %}', str(top_k)) 180 | fw.write(t) 181 | 182 | url_html_path = 'file://' + os.path.abspath(html_path) 183 | 184 | try: 185 | webbrowser.open(url_html_path, new=2) 186 | except: 187 | pass # intentional pass, browser support isn't cross-platform 188 | finally: 189 | logger.info( 190 | f'You should see a "demo.html" opened in your browser, ' 191 | f'if not you may open {url_html_path} manually' 192 | ) 193 | -------------------------------------------------------------------------------- /audio-to-audio-search/requirements.txt: -------------------------------------------------------------------------------- 1 | git+git://github.com/jina-ai/jina-commons@v0.0.3 2 | click 3 | jina~=2.0 4 | numpy==1.20.0 5 | soundfile==0.10.3.post1 6 | librosa==0.8.0 7 | visdom==0.1.8.9 8 | ffmpeg 9 | torchaudio 10 | prettytable 11 | -------------------------------------------------------------------------------- /audio-to-audio-search/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/__init__.py -------------------------------------------------------------------------------- /audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3 -------------------------------------------------------------------------------- /audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3 -------------------------------------------------------------------------------- /audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3 -------------------------------------------------------------------------------- /audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3 -------------------------------------------------------------------------------- /audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3 -------------------------------------------------------------------------------- /audio-to-audio-search/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | git+git://github.com/jina-ai/jina-commons@v0.0.3 2 | click 3 | pytest~=6.1.2 4 | jina~=2.0 5 | numpy==1.20.0 6 | soundfile==0.10.3.post1 7 | librosa==0.8.0 8 | visdom==0.1.8.9 9 | ffmpeg 10 | torchaudio 11 | prettytable 12 | -------------------------------------------------------------------------------- /audio-to-audio-search/tests/test_audio_to_audio_search.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | import pytest 6 | from click.testing import CliRunner 7 | from app import cli 8 | from pathlib import Path 9 | 10 | 11 | @pytest.mark.parametrize('segmenter', ['vad', 'time']) 12 | @pytest.mark.parametrize('encoder', ['vgg', 'clip']) 13 | def test_exec(tmp_path, segmenter, encoder): 14 | assert os.getcwd().endswith( 15 | 'audio-to-audio-search' 16 | ), "Please execute the tests from the root directory: >>> pytest tests/" 17 | os.environ['JINA_DATA_FILE'] = os.path.join('tests', 'data', 'mp3') 18 | workspace = os.environ['JINA_WORKSPACE'] = os.path.join(tmp_path, 'workspace') 19 | os.environ['JINA_WORKSPACE_MOUNT']= f'{workspace}:/workspace/workspace' 20 | runner = CliRunner() 21 | _test_index(runner, workspace, segmenter, encoder) 22 | _test_query(runner, segmenter, encoder) 23 | 24 | 25 | def _test_index(runner, workspace, segmenter, encoder): 26 | result = runner.invoke(cli, ['index', '-s', segmenter, '-e', encoder]) 27 | assert result.exception is None 28 | assert result.exit_code == 0 29 | assert Path(workspace).is_dir() 30 | assert ( 31 | len(set(glob.glob(os.path.join(workspace, '**', '*.bin'), recursive=True))) == 2 32 | ) 33 | 34 | 35 | def _test_query(runner, segmenter, encoder): 36 | # test error case: query more docs than indexed 37 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 10]) 38 | 39 | with pytest.raises( 40 | FileNotFoundError, 41 | match='cannot find sufficient index audios clips. ' 42 | 'Number of index audio clips found: 5, number of requested query docs: 10', 43 | ): 44 | assert result.exception is not None 45 | raise result.exception 46 | 47 | assert result.exit_code != 0 48 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 3]) 49 | assert result.exception is None 50 | assert result.exit_code == 0 51 | -------------------------------------------------------------------------------- /audio-to-audio-search/vggish/mel_features.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | 19 | """Defines routines to compute mel spectrogram features from audio waveform.""" 20 | 21 | import numpy as np 22 | 23 | 24 | def frame(data, window_length, hop_length): 25 | """Convert array into a sequence of successive possibly overlapping frames. 26 | 27 | An n-dimensional array of shape (num_samples, ...) is converted into an 28 | (n+1)-D array of shape (num_frames, window_length, ...), where each frame 29 | starts hop_length points after the preceding one. 30 | 31 | This is accomplished using stride_tricks, so the original data is not 32 | copied. However, there is no zero-padding, so any incomplete frames at the 33 | end are not included. 34 | 35 | Args: 36 | data: np.array of dimension N >= 1. 37 | window_length: Number of samples in each frame. 38 | hop_length: Advance (in samples) between each window. 39 | 40 | Returns: 41 | (N+1)-D np.array with as many rows as there are complete frames that can be 42 | extracted. 43 | """ 44 | num_samples = data.shape[0] 45 | num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) 46 | shape = (num_frames, window_length) + data.shape[1:] 47 | strides = (data.strides[0] * hop_length,) + data.strides 48 | return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) 49 | 50 | 51 | def periodic_hann(window_length): 52 | """Calculate a "periodic" Hann window. 53 | 54 | The classic Hann window is defined as a raised cosine that starts and 55 | ends on zero, and where every value appears twice, except the middle 56 | point for an odd-length window. Matlab calls this a "symmetric" window 57 | and np.hanning() returns it. However, for Fourier analysis, this 58 | actually represents just over one cycle of a period N-1 cosine, and 59 | thus is not compactly expressed on a length-N Fourier basis. Instead, 60 | it's better to use a raised cosine that ends just before the final 61 | zero value - i.e. a complete cycle of a period-N cosine. Matlab 62 | calls this a "periodic" window. This routine calculates it. 63 | 64 | Args: 65 | window_length: The number of points in the returned window. 66 | 67 | Returns: 68 | A 1D np.array containing the periodic hann window. 69 | """ 70 | return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * 71 | np.arange(window_length))) 72 | 73 | 74 | def stft_magnitude(signal, fft_length, 75 | hop_length=None, 76 | window_length=None): 77 | """Calculate the short-time Fourier transform magnitude. 78 | 79 | Args: 80 | signal: 1D np.array of the input time-domain signal. 81 | fft_length: Size of the FFT to apply. 82 | hop_length: Advance (in samples) between each frame passed to FFT. 83 | window_length: Length of each block of samples to pass to FFT. 84 | 85 | Returns: 86 | 2D np.array where each row contains the magnitudes of the fft_length/2+1 87 | unique values of the FFT for the corresponding frame of input samples. 88 | """ 89 | frames = frame(signal, window_length, hop_length) 90 | # Apply frame window to each frame. 
We use a periodic Hann (cosine of period 91 | # window_length) instead of the symmetric Hann of np.hanning (period 92 | # window_length-1). 93 | window = periodic_hann(window_length) 94 | windowed_frames = frames * window 95 | return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) 96 | 97 | 98 | # Mel spectrum constants and functions. 99 | _MEL_BREAK_FREQUENCY_HERTZ = 700.0 100 | _MEL_HIGH_FREQUENCY_Q = 1127.0 101 | 102 | 103 | def hertz_to_mel(frequencies_hertz): 104 | """Convert frequencies to mel scale using HTK formula. 105 | 106 | Args: 107 | frequencies_hertz: Scalar or np.array of frequencies in hertz. 108 | 109 | Returns: 110 | Object of same size as frequencies_hertz containing corresponding values 111 | on the mel scale. 112 | """ 113 | return _MEL_HIGH_FREQUENCY_Q * np.log( 114 | 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) 115 | 116 | 117 | def spectrogram_to_mel_matrix(num_mel_bins=20, 118 | num_spectrogram_bins=129, 119 | audio_sample_rate=8000, 120 | lower_edge_hertz=125.0, 121 | upper_edge_hertz=3800.0): 122 | """Return a matrix that can post-multiply spectrogram rows to make mel. 123 | 124 | Returns a np.array matrix A that can be used to post-multiply a matrix S of 125 | spectrogram values (STFT magnitudes) arranged as frames x bins to generate a 126 | "mel spectrogram" M of frames x num_mel_bins. M = S A. 127 | 128 | The classic HTK algorithm exploits the complementarity of adjacent mel bands 129 | to multiply each FFT bin by only one mel weight, then add it, with positive 130 | and negative signs, to the two adjacent mel bands to which that bin 131 | contributes. Here, by expressing this operation as a matrix multiply, we go 132 | from num_fft multiplies per frame (plus around 2*num_fft adds) to around 133 | num_fft^2 multiplies and adds. However, because these are all presumably 134 | accomplished in a single call to np.dot(), it's not clear which approach is 135 | faster in Python. The matrix multiplication has the attraction of being more 136 | general and flexible, and much easier to read. 137 | 138 | Args: 139 | num_mel_bins: How many bands in the resulting mel spectrum. This is 140 | the number of columns in the output matrix. 141 | num_spectrogram_bins: How many bins there are in the source spectrogram 142 | data, which is understood to be fft_size/2 + 1, i.e. the spectrogram 143 | only contains the nonredundant FFT bins. 144 | audio_sample_rate: Samples per second of the audio at the input to the 145 | spectrogram. We need this to figure out the actual frequencies for 146 | each spectrogram bin, which dictates how they are mapped into mel. 147 | lower_edge_hertz: Lower bound on the frequencies to be included in the mel 148 | spectrum. This corresponds to the lower edge of the lowest triangular 149 | band. 150 | upper_edge_hertz: The desired top edge of the highest frequency band. 151 | 152 | Returns: 153 | An np.array with shape (num_spectrogram_bins, num_mel_bins). 154 | 155 | Raises: 156 | ValueError: if frequency edges are incorrectly ordered or out of range. 157 | """ 158 | nyquist_hertz = audio_sample_rate / 2. 
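# For example, with the VGGish settings in vggish_params.py (SAMPLE_RATE=16000,
# MEL_MIN_HZ=125, MEL_MAX_HZ=7500), nyquist_hertz is 8000.0 and all three
# range checks below pass.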
159 | if lower_edge_hertz < 0.0: 160 | raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) 161 | if lower_edge_hertz >= upper_edge_hertz: 162 | raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % 163 | (lower_edge_hertz, upper_edge_hertz)) 164 | if upper_edge_hertz > nyquist_hertz: 165 | raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % 166 | (upper_edge_hertz, nyquist_hertz)) 167 | spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) 168 | spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) 169 | # The i'th mel band (starting from i=1) has center frequency 170 | # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge 171 | # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in 172 | # the band_edges_mel arrays. 173 | band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), 174 | hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) 175 | # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins 176 | # of spectrogram values. 177 | mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) 178 | for i in range(num_mel_bins): 179 | lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] 180 | # Calculate lower and upper slopes for every spectrogram bin. 181 | # Line segments are linear in the *mel* domain, not hertz. 182 | lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / 183 | (center_mel - lower_edge_mel)) 184 | upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / 185 | (upper_edge_mel - center_mel)) 186 | # .. then intersect them with each other and zero. 187 | mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, 188 | upper_slope)) 189 | # HTK excludes the spectrogram DC bin; make sure it always gets a zero 190 | # coefficient. 191 | mel_weights_matrix[0, :] = 0.0 192 | return mel_weights_matrix 193 | 194 | 195 | def log_mel_spectrogram(data, 196 | audio_sample_rate=8000, 197 | log_offset=0.0, 198 | window_length_secs=0.025, 199 | hop_length_secs=0.010, 200 | **kwargs): 201 | """Convert waveform to a log magnitude mel-frequency spectrogram. 202 | 203 | Args: 204 | data: 1D np.array of waveform data. 205 | audio_sample_rate: The sampling rate of data. 206 | log_offset: Add this to values when taking log to avoid -Infs. 207 | window_length_secs: Duration of each window to analyze. 208 | hop_length_secs: Advance between successive analysis windows. 209 | **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. 210 | 211 | Returns: 212 | 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank 213 | magnitudes for successive frames. 
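Example (with the VGGish settings in vggish_params.py): one second of mono
audio at 16 kHz gives a 400-sample window, a 160-sample hop, a 512-point
FFT and an output of shape (98, 64).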
214 | """ 215 | window_length_samples = int(round(audio_sample_rate * window_length_secs)) 216 | hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) 217 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) 218 | spectrogram = stft_magnitude( 219 | data, 220 | fft_length=fft_length, 221 | hop_length=hop_length_samples, 222 | window_length=window_length_samples) 223 | mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( 224 | num_spectrogram_bins=spectrogram.shape[1], 225 | audio_sample_rate=audio_sample_rate, **kwargs)) 226 | return np.log(mel_spectrogram + log_offset) 227 | -------------------------------------------------------------------------------- /audio-to-audio-search/vggish/vggish_input.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | 19 | """Compute input examples for VGGish from audio waveform.""" 20 | 21 | import resampy 22 | 23 | 24 | from vggish.mel_features import * 25 | from vggish.vggish_params import * 26 | import librosa 27 | 28 | try: 29 | import soundfile as sf 30 | 31 | 32 | def wav_read(wav_file): 33 | wav_data, sr = sf.read(wav_file, dtype='int16') 34 | return wav_data, sr 35 | 36 | except ImportError: 37 | 38 | def wav_read(wav_file): 39 | raise NotImplementedError('WAV file reading requires soundfile package.') 40 | 41 | 42 | def waveform_to_examples(data, sample_rate): 43 | """Converts audio waveform into an array of examples for VGGish. 44 | 45 | Args: 46 | data: np.array of either one dimension (mono) or two dimensions 47 | (multi-channel, with the outer dimension representing channels). 48 | Each sample is generally expected to lie in the range [-1.0, +1.0], 49 | although this is not required. 50 | sample_rate: Sample rate of data. 51 | 52 | Returns: 53 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 54 | a sequence of examples, each of which contains a patch of log mel 55 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 56 | bands, where the frame length is STFT_HOP_LENGTH_SECONDS. 57 | """ 58 | # Convert to mono. 59 | if len(data.shape) > 1: 60 | data = np.mean(data, axis=1) 61 | # Resample to the rate assumed by VGGish. 62 | if sample_rate != SAMPLE_RATE: 63 | data = resampy.resample(data, sample_rate, SAMPLE_RATE) 64 | 65 | # Compute log mel spectrogram features. 
66 | log_mel = log_mel_spectrogram( 67 | data, 68 | audio_sample_rate=SAMPLE_RATE, 69 | log_offset=LOG_OFFSET, 70 | window_length_secs=STFT_WINDOW_LENGTH_SECONDS, 71 | hop_length_secs=STFT_HOP_LENGTH_SECONDS, 72 | num_mel_bins=NUM_MEL_BINS, 73 | lower_edge_hertz=MEL_MIN_HZ, 74 | upper_edge_hertz=MEL_MAX_HZ) 75 | 76 | # Frame features into examples. 77 | features_sample_rate = 1.0 / STFT_HOP_LENGTH_SECONDS 78 | example_window_length = int(round( 79 | EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 80 | example_hop_length = int(round( 81 | EXAMPLE_HOP_SECONDS * features_sample_rate)) 82 | log_mel_examples = frame( 83 | log_mel, 84 | window_length=example_window_length, 85 | hop_length=example_hop_length) 86 | return log_mel_examples 87 | 88 | 89 | def wavfile_to_examples(wav_file): 90 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 91 | 92 | Args: 93 | wav_file: String path to a file, or a file-like object. The file 94 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 95 | 96 | Returns: 97 | See waveform_to_examples. 98 | """ 99 | wav_data, sr = wav_read(wav_file) 100 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 101 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 102 | return waveform_to_examples(samples, sr) 103 | 104 | 105 | def mp3_to_examples(mp3_file): 106 | """Convenience wrapper around waveform_to_examples() for a common mp3 format. 107 | 108 | Args: 109 | mp3_file: String path to a file, or a file-like object. The file 110 | is assumed to contain mp3 audio data. 111 | 112 | Returns: 113 | See waveform_to_examples. 114 | """ 115 | x_data, sr = librosa.load(mp3_file) 116 | #assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 117 | #samples = x_data / 32768.0 # Convert to [-1.0, +1.0] 118 | return waveform_to_examples(x_data, sr) 119 | 120 | 121 | -------------------------------------------------------------------------------- /audio-to-audio-search/vggish/vggish_params.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | 19 | """Global parameters for the VGGish model. 20 | 21 | See vggish_slim.py for more information. 22 | """ 23 | 24 | # Architectural constants. 25 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 26 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 27 | EMBEDDING_SIZE = 128 # Size of embedding layer. 28 | 29 | # Hyperparameters used in feature and example generation. 
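# Note: with the 10 ms STFT hop below, the 0.96 s example window spans exactly
# 96 spectrogram frames, matching NUM_FRAMES above, and EXAMPLE_HOP_SECONDS
# equal to the window length means consecutive examples do not overlap.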
30 | SAMPLE_RATE = 16000 31 | STFT_WINDOW_LENGTH_SECONDS = 0.025 32 | STFT_HOP_LENGTH_SECONDS = 0.010 33 | NUM_MEL_BINS = NUM_BANDS 34 | MEL_MIN_HZ = 125 35 | MEL_MAX_HZ = 7500 36 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 37 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 38 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 39 | 40 | # Parameters used for embedding postprocessing. 41 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 42 | PCA_MEANS_NAME = 'pca_means' 43 | QUANTIZE_MIN_VAL = -2.0 44 | QUANTIZE_MAX_VAL = +2.0 45 | 46 | # Hyperparameters used in training. 47 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 48 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 49 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 50 | 51 | # Names of ops, tensors, and features. 52 | INPUT_OP_NAME = 'vggish/input_features' 53 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 54 | OUTPUT_OP_NAME = 'vggish/embedding' 55 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 56 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 57 | -------------------------------------------------------------------------------- /audio-to-audio-search/vggish/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | 19 | """Post-process embeddings from VGGish.""" 20 | 21 | import numpy as np 22 | 23 | from vggish.vggish_params import * 24 | 25 | 26 | class Postprocessor(object): 27 | """Post-processes VGGish embeddings. 28 | 29 | The initial release of AudioSet included 128-D VGGish embeddings for each 30 | segment of AudioSet. These released embeddings were produced by applying 31 | a PCA transformation (technically, a whitening transform is included as well) 32 | and 8-bit quantization to the raw embedding output from VGGish, in order to 33 | stay compatible with the YouTube-8M project which provides visual embeddings 34 | in the same format for a large set of YouTube videos. This class implements 35 | the same PCA (with whitening) and quantization transformations. 36 | """ 37 | 38 | def __init__(self, pca_params_npz_path): 39 | """Constructs a postprocessor. 40 | 41 | Args: 42 | pca_params_npz_path: Path to a NumPy-format .npz file that 43 | contains the PCA parameters used in postprocessing. 44 | """ 45 | params = np.load(pca_params_npz_path) 46 | self._pca_matrix = params[PCA_EIGEN_VECTORS_NAME] 47 | # Load means into a column vector for easier broadcasting later. 
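# (The reshape gives _pca_means shape (EMBEDDING_SIZE, 1), so it can be
# subtracted column-wise from the transposed batch in postprocess() below.)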
48 | self._pca_means = params[PCA_MEANS_NAME].reshape(-1, 1) 49 | assert self._pca_matrix.shape == ( 50 | EMBEDDING_SIZE, EMBEDDING_SIZE), ( 51 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 52 | assert self._pca_means.shape == (EMBEDDING_SIZE, 1), ( 53 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 54 | 55 | def postprocess(self, embeddings_batch): 56 | """Applies postprocessing to a batch of embeddings. 57 | 58 | Args: 59 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 60 | containing output from the embedding layer of VGGish. 61 | 62 | Returns: 63 | An nparray of the same shape as the input but of type uint8, 64 | containing the PCA-transformed and quantized version of the input. 65 | """ 66 | assert len(embeddings_batch.shape) == 2, ( 67 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 68 | assert embeddings_batch.shape[1] == EMBEDDING_SIZE, ( 69 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 70 | 71 | # Apply PCA. 72 | # - Embeddings come in as [batch_size, embedding_size]. 73 | # - Transpose to [embedding_size, batch_size]. 74 | # - Subtract pca_means column vector from each column. 75 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 76 | # where both are equal to embedding_size in our case. 77 | # - Transpose result back to [batch_size, embedding_size]. 78 | pca_applied = np.dot(self._pca_matrix, 79 | (embeddings_batch.T - self._pca_means)).T 80 | 81 | # Quantize by: 82 | # - clipping to [min, max] range 83 | clipped_embeddings = np.clip( 84 | pca_applied, QUANTIZE_MIN_VAL, 85 | QUANTIZE_MAX_VAL) 86 | # - convert to 8-bit in range [0.0, 255.0] 87 | quantized_embeddings = ( 88 | (clipped_embeddings - QUANTIZE_MIN_VAL) * 89 | (255.0 / 90 | (QUANTIZE_MAX_VAL - QUANTIZE_MIN_VAL))) 91 | # - cast 8-bit float to uint8 92 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 93 | 94 | return quantized_embeddings 95 | -------------------------------------------------------------------------------- /audio-to-audio-search/vggish/vggish_slim.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ============================================================================== 18 | 19 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 20 | 21 | The public AudioSet release (https://research.google.com/audioset/download.html) 22 | includes 128-D features extracted from the embedding layer of a VGG-like model 23 | that was trained on a large Google-internal YouTube dataset. Here we provide 24 | a TF-Slim definition of the same model, without any dependencies on libraries 25 | internal to Google. We call it 'VGGish'.
26 | 27 | Note that we only define the model up to the embedding layer, which is the 28 | penultimate layer before the final classifier layer. We also provide various 29 | hyperparameter values (in vggish_params.py) that were used to train this model 30 | internally. 31 | 32 | For comparison, here is TF-Slim's VGG definition: 33 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 34 | """ 35 | 36 | import tensorflow.compat.v1 as tf 37 | import tf_slim as slim 38 | 39 | from vggish.vggish_params import * 40 | 41 | 42 | def define_vggish_slim(features_tensor=None, training=False): 43 | """Defines the VGGish TensorFlow model. 44 | 45 | All ops are created in the current default graph, under the scope 'vggish/'. 46 | 47 | The input is either a tensor passed in via the optional 'features_tensor' 48 | argument or a placeholder created below named 'vggish/input_features'. The 49 | input is expected to have dtype float32 and shape [batch_size, num_frames, 50 | num_bands] where batch_size is variable and num_frames and num_bands are 51 | constants, and [num_frames, num_bands] represents a log-mel-scale spectrogram 52 | patch covering num_bands frequency bands and num_frames time frames (where 53 | each frame step is usually 10ms). This is produced by computing the stabilized 54 | log(mel-spectrogram + LOG_OFFSET). The output is a tensor named 55 | 'vggish/embedding' which produces the pre-activation values of a 128-D 56 | embedding layer, which is usually the penultimate layer when used as part of a 57 | full model with a final classifier layer. 58 | 59 | Args: 60 | features_tensor: If not None, the tensor containing the input features. 61 | If None, a placeholder input is created. 62 | training: If true, all parameters are marked trainable. 63 | 64 | Returns: 65 | The op 'vggish/embeddings'. 66 | """ 67 | # Defaults: 68 | # - All weights are initialized to N(0, INIT_STDDEV). 69 | # - All biases are initialized to 0. 70 | # - All activations are ReLU. 71 | # - All convolutions are 3x3 with stride 1 and SAME padding. 72 | # - All max-pools are 2x2 with stride 2 and SAME padding. 73 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 74 | weights_initializer=tf.truncated_normal_initializer( 75 | stddev=INIT_STDDEV), 76 | biases_initializer=tf.zeros_initializer(), 77 | activation_fn=tf.nn.relu, 78 | trainable=training), \ 79 | slim.arg_scope([slim.conv2d], 80 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 81 | slim.arg_scope([slim.max_pool2d], 82 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 83 | tf.variable_scope('vggish'): 84 | # Input: a batch of 2-D log-mel-spectrogram patches. 85 | if features_tensor is None: 86 | features_tensor = tf.placeholder( 87 | tf.float32, shape=(None, NUM_FRAMES, NUM_BANDS), 88 | name='input_features') 89 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 90 | net = tf.reshape(features_tensor, 91 | [-1, NUM_FRAMES, NUM_BANDS, 1]) 92 | 93 | # The VGG stack of alternating convolutions and max-pools. 
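# Shape walk-through for a single (96, 64, 1) input patch: each 2x2 max-pool
# halves both spatial dimensions, giving (48, 32, 64) -> (24, 16, 128) ->
# (12, 8, 256) -> (6, 4, 512), i.e. 12288 values after flattening, ahead of
# the two 4096-wide fully-connected layers and the 128-D embedding layer.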
94 | net = slim.conv2d(net, 64, scope='conv1') 95 | net = slim.max_pool2d(net, scope='pool1') 96 | net = slim.conv2d(net, 128, scope='conv2') 97 | net = slim.max_pool2d(net, scope='pool2') 98 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 99 | net = slim.max_pool2d(net, scope='pool3') 100 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 101 | net = slim.max_pool2d(net, scope='pool4') 102 | 103 | # Flatten before entering fully-connected layers 104 | net = slim.flatten(net) 105 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 106 | # The embedding layer. 107 | net = slim.fully_connected(net, EMBEDDING_SIZE, scope='fc2', 108 | activation_fn=None) 109 | return tf.identity(net, name='embedding') 110 | 111 | 112 | def load_vggish_slim_checkpoint(session, checkpoint_path): 113 | """Loads a pre-trained VGGish-compatible checkpoint. 114 | 115 | This function can be used as an initialization function (referred to as 116 | init_fn in TensorFlow documentation) which is called in a Session after 117 | initializing all variables. When used as an init_fn, this will load 118 | a pre-trained checkpoint that is compatible with the VGGish model 119 | definition. Only variables defined by VGGish will be loaded. 120 | 121 | Args: 122 | session: an active TensorFlow session. 123 | checkpoint_path: path to a file containing a checkpoint that is 124 | compatible with the VGGish model definition. 125 | """ 126 | # Get the list of names of all VGGish variables that exist in 127 | # the checkpoint (i.e., all inference-mode VGGish variables). 128 | with tf.Graph().as_default(): 129 | define_vggish_slim(training=False) 130 | vggish_var_names = [v.name for v in tf.global_variables()] 131 | 132 | # Get the list of all currently existing variables that match 133 | # the list of variable names we just computed. 134 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 135 | 136 | # Use a Saver to restore just the variables selected above. 137 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 138 | write_version=1) 139 | saver.restore(session, checkpoint_path) 140 | -------------------------------------------------------------------------------- /cross-modal-search/.dockerignore: -------------------------------------------------------------------------------- 1 | workspace 2 | venv 3 | .venv 4 | -------------------------------------------------------------------------------- /cross-modal-search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/__init__.py -------------------------------------------------------------------------------- /cross-modal-search/app.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | import sys 6 | 7 | import click 8 | from jina import Flow, Document, DocumentArray 9 | import logging 10 | import matplotlib.pyplot as plt 11 | 12 | from dataset import input_index_data 13 | 14 | MAX_DOCS = int(os.environ.get("JINA_MAX_DOCS", 10000)) 15 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 16 | DEFAULT_QUERY_IMAGE = 'toy-data/images/1000268201_693b08cb0e.jpg' 17 | DEFAULT_QUERY_TEXT = 'a black dog and a spotted dog are fighting' 18 | 19 | 20 | def config(): 21 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace')) 22 | os.environ.setdefault( 23 | 'JINA_WORKSPACE_MOUNT', 24 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace') 25 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO') 26 | os.environ.setdefault('JINA_PORT', str(45678)) 27 | 28 | 29 | def index_restful(): 30 | flow = Flow().load_config('flows/flow-index.yml', override_with={'protocol': 'http'}) 31 | with flow: 32 | flow.block() 33 | 34 | 35 | def check_query_result(text_doc, image_doc, img_uri): 36 | # Image doc matches are text: 37 | print(f'Searching with image {img_uri}. Matches:') 38 | if image_doc.matches: 39 | for m in image_doc.matches: 40 | print( 41 | f'\t-- text: "{m.text}" ' 42 | f'score: {m.scores["cosine"].value:.4f},' 43 | ) 44 | 45 | # Text doc matches are images 46 | print(f'\nSearching with text "{text_doc.text}". Matches:') 47 | if text_doc.matches: 48 | f, axarr = plt.subplots(1, len(text_doc.matches)) 49 | 50 | for i, m in enumerate(text_doc.matches): 51 | axarr[i].title.set_text(f'score={m.scores["cosine"].value:.4f}') 52 | axarr[i].imshow(m.blob) 53 | axarr[i].axes.xaxis.set_visible(False) 54 | axarr[i].axes.yaxis.set_visible(False) 55 | plt.suptitle(f"Best matches for '{text_doc.text}'") 56 | plt.show() 57 | 58 | 59 | def index(data_set, num_docs, request_size): 60 | flow = Flow().load_config('flows/flow-index.yml') 61 | with flow: 62 | flow.post(on='/index', 63 | inputs=input_index_data(num_docs, request_size, data_set), 64 | request_size=request_size, 65 | show_progress=True) 66 | 67 | 68 | def query(query_image, query_text): 69 | flow = Flow().load_config('flows/flow-query.yml') 70 | with flow: 71 | img_uri = query_image 72 | text_doc = Document(text=query_text, 73 | modality='text') 74 | image_doc = Document(uri=img_uri, 75 | modality='image') 76 | import time 77 | start = time.time() 78 | result_text = flow.post(on='/search', inputs=text_doc, 79 | return_results=True) 80 | result_image = flow.post(on='/search', inputs=image_doc, 81 | return_results=True) 82 | print(f'Request duration: {time.time() - start}') 83 | check_query_result(result_text[0].docs[0], result_image[0].docs[0], img_uri) 84 | 85 | 86 | 87 | def query_restful(): 88 | flow = Flow(cors=True).load_config('flows/flow-query.yml') 89 | flow.rest_api = True 90 | flow.protocol = 'http' 91 | with flow: 92 | flow.block() 93 | 94 | 95 | @click.command() 96 | @click.option('--task', '-t', type=click.Choice(['index', 'index_restful', 'query_restful', 'query']), default='index') 97 | @click.option("--num_docs", "-n", default=MAX_DOCS) 98 | @click.option('--request_size', '-s', default=16) 99 | @click.option('--data_set', '-d', type=click.Choice(['f30k', 'f8k', 'toy-data'], case_sensitive=False), default='toy-data') 100 | @click.option('--query-image', '-i', type=str, default=DEFAULT_QUERY_IMAGE) 101 | @click.option('--query-text', '-i', type=str, default=DEFAULT_QUERY_TEXT) 102 | def main(task, num_docs, request_size, data_set, query_image, 
query_text): 103 | config() 104 | workspace = os.environ['JINA_WORKSPACE'] 105 | logger = logging.getLogger('cross-modal-search') 106 | if 'index' in task: 107 | if os.path.exists(workspace): 108 | logger.error( 109 | f'\n +------------------------------------------------------------------------------------+ \ 110 | \n | 🤖🤖🤖 | \ 111 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \ 112 | \n | 🤖🤖🤖 | \ 113 | \n +------------------------------------------------------------------------------------+' 114 | ) 115 | sys.exit(1) 116 | if 'query' in task: 117 | if not os.path.exists(workspace): 118 | logger.error(f'The directory {workspace} does not exist. Please index first via `python app.py -t index`') 119 | sys.exit(1) 120 | 121 | if task == 'index': 122 | index(data_set, num_docs, request_size) 123 | elif task == 'index_restful': 124 | index_restful() 125 | elif task == 'query': 126 | query(query_image, query_text) 127 | elif task == 'query_restful': 128 | query_restful() 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /cross-modal-search/dataset.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | 5 | import os 6 | import json as jsonmod 7 | import hashlib 8 | 9 | import torch 10 | import torch.utils.data as data 11 | from jina import Document 12 | 13 | 14 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | 17 | class Flickr30kDataset(data.Dataset): 18 | """ 19 | Dataset loader for Flickr30k full datasets. 20 | """ 21 | 22 | def __init__(self, images_root, json, split): 23 | self.images_root = images_root 24 | self.dataset = jsonmod.load(open(json, 'r'))['images'] 25 | self.ids = [] 26 | for i, d in enumerate(self.dataset): 27 | if d['split'] == split: 28 | self.ids += [(i, x) for x in range(len(d['sentences']))] 29 | 30 | def __getitem__(self, index): 31 | """This function returns a tuple that is further passed to collate_fn 32 | """ 33 | images_root = self.images_root 34 | ann_id = self.ids[index] 35 | img_id = ann_id[0] 36 | caption = self.dataset[img_id]['sentences'][ann_id[1]]['raw'] 37 | img_file_name = self.dataset[img_id]['filename'] 38 | 39 | image_file_path = os.path.join(images_root, img_file_name) 40 | with open(image_file_path, 'rb') as fp: 41 | image_buffer = fp.read() 42 | return image_buffer, str(caption).lower() 43 | 44 | def __len__(self): 45 | return len(self.ids) 46 | 47 | 48 | class FlickrDataset(data.Dataset): 49 | """ 50 | Dataset loader for Flickr8k full datasets. 
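Expects a captions file in the Flickr8k layout: a header row
("image,caption") followed by five caption rows per image, as in
toy-data/captions.txt; only the first caption of each image is used.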
51 | """ 52 | 53 | def __init__(self, images_root, captions_file_path): 54 | self.images_root = images_root 55 | self.captions_file_path = captions_file_path 56 | with open(self.captions_file_path, 'r') as cf: 57 | self.lines = cf.readlines()[1:] 58 | 59 | def __getitem__(self, index): 60 | """This function returns a tuple that is further passed to collate_fn 61 | """ 62 | image_file_name, caption = self.lines[index*5].split(',', 1) 63 | with open(os.path.join(self.images_root, image_file_name), 'rb') as fp: 64 | image_buffer = fp.read() 65 | return image_buffer, str(caption).lower().rstrip() 66 | 67 | def __len__(self): 68 | return int(len(self.lines)/5) 69 | 70 | 71 | def collate_fn(data): 72 | # Not sure this is actually needed 73 | images, captions = zip(*data) 74 | return images, captions 75 | 76 | 77 | def get_data_loader(split, root, captions, batch_size=8, dataset_type='f30k', shuffle=False, 78 | num_workers=1, collate_fn=collate_fn): 79 | """Returns torch.utils.data.DataLoader for custom coco dataset.""" 80 | 81 | if dataset_type == 'f30k': 82 | dataset = Flickr30kDataset(images_root=root, split=split, json=captions) 83 | elif dataset_type == 'f8k' or dataset_type == 'toy-data': 84 | dataset = FlickrDataset(images_root=root, captions_file_path=captions) 85 | else: 86 | raise NotImplementedError(f'Not valid dataset type {dataset_type}') 87 | # Data loader 88 | data_loader = torch.utils.data.DataLoader(dataset=dataset, 89 | batch_size=batch_size, 90 | shuffle=shuffle, 91 | pin_memory=True, 92 | num_workers=num_workers, 93 | collate_fn=collate_fn) 94 | 95 | return data_loader 96 | 97 | 98 | def input_index_data(num_docs=None, batch_size=8, dataset_type='f30k'): 99 | captions = 'dataset_flickr30k.json' if dataset_type == 'f30k' else 'captions.txt' 100 | if dataset_type == 'toy-data': 101 | base_folder = '.' 
102 | else: 103 | base_folder = 'data' 104 | data_loader = get_data_loader( 105 | root=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/images'), 106 | captions=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/{captions}'), 107 | split='test', 108 | batch_size=batch_size, 109 | dataset_type=dataset_type 110 | ) 111 | 112 | for i, (images, captions) in enumerate(data_loader): 113 | for image, caption in zip(images, captions): 114 | hashed = hashlib.sha1(image).hexdigest() 115 | document_img = Document() 116 | 117 | document_img.buffer = image 118 | document_img.modality = 'image' 119 | document_img.mime_type = 'image/jpeg' 120 | 121 | document_caption = Document(id=hashed) 122 | 123 | document_caption.text = caption 124 | document_caption.modality = 'text' 125 | document_caption.mime_type = 'text/plain' 126 | document_caption.tags['id'] = caption 127 | 128 | yield document_img 129 | yield document_caption 130 | 131 | if num_docs and (i + 1) * batch_size >= num_docs: 132 | break 133 | -------------------------------------------------------------------------------- /cross-modal-search/flows/executors.py: -------------------------------------------------------------------------------- 1 | """ Implementation of filters for images and texts""" 2 | 3 | import numpy as np 4 | from jina import Executor, DocumentArray, requests 5 | 6 | 7 | class ImageReader(Executor): 8 | @requests(on='/index') 9 | def index_read(self, docs: 'DocumentArray', **kwargs): 10 | array = DocumentArray(list(filter(lambda doc: doc.modality=='image', docs))) 11 | for doc in array: 12 | doc.convert_image_buffer_to_blob() 13 | doc.blob = np.array(doc.blob).astype(np.uint8) 14 | return array 15 | 16 | @requests(on='/search') 17 | def search_read(self, docs: 'DocumentArray', **kwargs): 18 | image_docs = DocumentArray(list(filter(lambda doc: doc.mime_type in ('image/jpeg', 'image/png'), docs))) 19 | if not image_docs: 20 | return DocumentArray([]) 21 | for doc in image_docs: 22 | doc.convert_uri_to_buffer() 23 | doc.convert_image_buffer_to_blob() 24 | doc.blob = doc.blob.astype(np.uint8) 25 | return image_docs 26 | 27 | 28 | class TextFilter(Executor): 29 | @requests 30 | def filter_text(self, docs: 'DocumentArray', **kwargs): 31 | docs = DocumentArray(list(filter(lambda doc: doc.mime_type == 'text/plain', docs))) 32 | return docs 33 | -------------------------------------------------------------------------------- /cross-modal-search/flows/flow-index.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # We configure the index flow here that is used for indexing images and captions 2 | version: '1' # yml version 3 | with: # Parameters for the flow are defined after with 4 | prefetch: 10 # Number of prefetched requests from the client 5 | port_expose: $JINA_PORT # Port defined in environment variable 6 | workspace: $JINA_WORKSPACE # Workspace folder 7 | pods: # Now, we define the pods that are used 8 | - name: image_loader # The first executor is an image loader that filters only image documents 9 | uses: ImageReader # Type of the executor 10 | py_modules: 'flows/executors.py' # The python file where the executor is implemented 11 | read_only: true # Executor does not modify files 12 | needs: gateway # Executor is after the gateway, this means at the start of the flow 13 | - name: image_encoder # After the images are read, compute their embedding in the encoder 14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1' # The type of the executor - here, we use a hub executor from 
the jinahub in the form of a docker container 15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount a volume into the executor 16 | timeout_ready: 600000 # Set a timeout for the executor 17 | read_only: true # Executor does not modify files 18 | needs: image_loader # This executor is located after the image loader in the flow 19 | - name: image_indexer # Executor that stores image embeddings 20 | uses: 'jinahub://SimpleIndexer/old' # Hub Executor - We use a SimpleIndexer here 21 | uses_with: # Define arguments for the SimpleIndexer 22 | index_file_name: 'image_index' # Folder path for this executor 23 | needs: image_encoder # This executor is after the image encoder in the flow 24 | - name: text_filter # Now, we define another path in the flow that is parallel in the execution 25 | uses: TextFilter # The first executor is a filter that filters all text documents and ignores images now 26 | py_modules: 'flows/executors.py' # File where the TextFilter is implemented 27 | needs: gateway # Start after the gateway, so at the beginning of the flow - this creates a second path in the flow 28 | - name: text_encoder # Create the next executor that computes embeddings for the text documents 29 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1' # Use a hub executor in docker 30 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount the models directory 31 | timeout_ready: 600000 # Set timeout 32 | read_only: true # Executor does not modify files 33 | needs: text_filter # Run this executor after the image filter 34 | - name: text_indexer # Finally, store the indexed text documents with embeddings on disk 35 | uses: 'jinahub://SimpleIndexer/old' # Use SimpleIndexer from hub in docker again 36 | uses_with: # Define parameters for the text indexer 37 | index_file_name: 'text_index' # Folder name in the workspace 38 | needs: text_encoder # Start after the text encoder executor is finished 39 | - name: join_all # This is the last executor - it waits until both paths in the flow are finished (image and text path) 40 | needs: [image_indexer, text_indexer] # Wait for these two executors to finish - only then we can continue 41 | -------------------------------------------------------------------------------- /cross-modal-search/flows/flow-query.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # This file defines the query flow which is used for searching in the indexed documents 2 | version: '1' # The query flow is very similar to the index flow - only the differences are explained here 3 | with: 4 | prefetch: 10 5 | port_expose: $JINA_PORT 6 | workspace: $JINA_WORKSPACE 7 | pods: 8 | - name: loader # Again, we start two paths in the flow - here we start the image path 9 | uses: ImageReader 10 | py_modules: 'flows/executors.py' 11 | read_only: true 12 | needs: [gateway] 13 | - name: image_encoder # Now, encode the images and compute the embeddings 14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1' 15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface 16 | timeout_ready: 600000 17 | read_only: true 18 | needs: loader 19 | - name: text_indexer # Now, we use the text indexer in the image path - This is how we achieve the cross-modality here 20 | uses: 'jinahub://SimpleIndexer/old' # The text indexer has indexed all text documents and stored them on disk. 
21 | uses_with: # Then we return the closest matches as results 22 | index_file_name: 'text_index' 23 | needs: image_encoder 24 | force: True 25 | read_only: true 26 | - name: text_filter # Here, the text path starts 27 | uses: TextFilter 28 | py_modules: 'flows/executors.py' 29 | needs: [gateway] 30 | - name: text_encoder # Compute the embedding of the search text 31 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1' 32 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface 33 | timeout_ready: 600000 34 | read_only: true 35 | needs: text_filter 36 | - name: image_indexer # Now, we use the image indexer in the text path - this is again how we get cross-modality 37 | uses: 'jinahub://SimpleIndexer/old' # The image indexer has indexed all images and their embeddings 38 | uses_with: 39 | index_file_name: 'image_index' 40 | force: True 41 | read_only: true 42 | needs: text_encoder 43 | - name: join_all # Wait for both paths to finish and join the results 44 | needs: [image_indexer, text_indexer] 45 | -------------------------------------------------------------------------------- /cross-modal-search/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | DATASET="adityajn105/flickr8k" 3 | DATA_DIR="data/f8k" 4 | 5 | if [ -d ${DATA_DIR} ]; then 6 | echo ${DATA_DIR}' exists, please remove it before running the script' 7 | exit 1 8 | fi 9 | 10 | mkdir -p ${DATA_DIR} && \ 11 | kaggle datasets download -d ${DATASET} && \ 12 | unzip -q flickr8k.zip && \ 13 | rm flickr8k.zip && \ 14 | mv Images data/f8k/images && \ 15 | mv captions.txt data/f8k/captions.txt -------------------------------------------------------------------------------- /cross-modal-search/get_data30k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | pip install kaggle 3 | kaggle datasets download hsankesara/flickr-image-dataset && \ 4 | unzip flickr-image-dataset.zip && \ 5 | rm flickr-image-dataset.zip && \ 6 | wget -q http://www.cs.toronto.edu/~faghri/vsepp/data.tar && \ 7 | tar -xvf data.tar && \ 8 | rm -rf data.tar && \ 9 | rm -rf data/coco* && \ 10 | rm -rf data/f8k* && \ 11 | rm -rf data/*precomp* && \ 12 | rm -rf data/f30k/images && \ 13 | mv flickr-image-dataset data/f30k/images 14 | -------------------------------------------------------------------------------- /cross-modal-search/requirements.txt: -------------------------------------------------------------------------------- 1 | jina[standard,rich]==2.0.18 2 | click==8.0.1 3 | kaggle==1.5.12 4 | git+git://github.com/jina-ai/jina-commons@v0.0.3 5 | matplotlib==3.4.3 6 | torch==1.9.0 -------------------------------------------------------------------------------- /cross-modal-search/setup_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TEST_DATA_DIR=data/ 4 | 5 | rm -rf ${TEST_DATA_DIR} && \ 6 | mkdir -p ${TEST_DATA_DIR}/f8k/images && \ 7 | python ../.github/util/pull_dataset.py -d cross-modal-search/f8k.zip -p ../ && \ 8 | unzip -o f8k.zip -d ${TEST_DATA_DIR} && \ 9 | rm f8k.zip && \ 10 | mv ${TEST_DATA_DIR}/Images/* ${TEST_DATA_DIR}/f8k/images && \ 11 | mv ${TEST_DATA_DIR}/captions.txt data/f8k/captions.txt && \ 12 | rm -rf workspace && \ 13 | python app.py -t index | tee metrics.txt && \ 14 | rm -rf ${TEST_DATA_DIR} -------------------------------------------------------------------------------- /cross-modal-search/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/tests/__init__.py -------------------------------------------------------------------------------- /cross-modal-search/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Patch random_port to avoid birthday-problem collisions on random ports""" 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope='function', autouse=True) 7 | def patched_random_port(mocker): 8 | used_ports = set() 9 | from jina.helper import random_port 10 | from jina.excepts import NoAvailablePortError 11 | 12 | def _random_port(): 13 | 14 | for i in range(10): 15 | _port = random_port() 16 | 17 | if _port is not None and _port not in used_ports: 18 | used_ports.add(_port) 19 | return _port 20 | raise NoAvailablePortError 21 | 22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port) -------------------------------------------------------------------------------- /cross-modal-search/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.4 2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard,rich] 3 | click==8.0.1 4 | git+git://github.com/jina-ai/jina-commons@v0.0.3 5 | kaggle==1.5.12 6 | matplotlib==3.4.3 7 | torch==1.9.0 -------------------------------------------------------------------------------- /cross-modal-search/tests/test_cross_modal_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('..') 4 | from app import main 5 | from click.testing import CliRunner 6 | 7 | 8 | def config(tmpdir): 9 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace') 10 | 11 | 12 | def test_cross_modal_search(tmpdir): 13 | config(tmpdir) 14 | runner = CliRunner() 15 | result = runner.invoke(main, ['-t', 'index']) 16 | assert 'done in' in result.stdout 17 | assert result.stderr_bytes is None 18 | result = runner.invoke(main, ['-t', 'query']) 19 | assert result.stderr_bytes is None 20 | -------------------------------------------------------------------------------- /cross-modal-search/toy-data/captions.txt: -------------------------------------------------------------------------------- 1 | image,caption 2 | 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way . 3 | 1000268201_693b08cb0e.jpg,A girl going into a wooden building . 4 | 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse . 5 | 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse . 6 | 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin . 7 | 1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting 8 | 1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road . 9 | 1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street . 10 | 1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road . 11 | 1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .
12 | -------------------------------------------------------------------------------- /cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg -------------------------------------------------------------------------------- /cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg -------------------------------------------------------------------------------- /cross-modal-search/visualizations/cross-modal-index-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-index-flow.png -------------------------------------------------------------------------------- /cross-modal-search/visualizations/cross-modal-query-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-query-flow.png -------------------------------------------------------------------------------- /cross-modal-search/visualizations/cross-modal-result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-result.jpg -------------------------------------------------------------------------------- /cross-modal-search/visualizations/image_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/image_results.png -------------------------------------------------------------------------------- /cross-modal-search/visualizations/text_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/text_results.png -------------------------------------------------------------------------------- /example-guidelines.md: -------------------------------------------------------------------------------- 1 | # Submit Your Community Example! 2 | 3 | Thanks for your interest in submitting your example! Here are some rules and guidelines: 4 | 5 | ## Rules 6 | 7 | ### `jina` in `requirements.txt` 8 | 9 | To be eligible for listing, you **must** have `jina==x.x.x` in your `requirements.txt`, where `x.x.x` refers to the semantic version number. 10 | 11 | Note: If you're building a front-end that just interfaces with Jina's API and doesn't rely on Jina core itself, there's no need to follow this requirement. 12 | 13 | ### `jina-` at start of name 14 | 15 | Your repo name should be `jina-xxxxxxx`. 
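For example, hypothetical repos named `jina-meme-search` or `jina-wikipedia-qa` would both qualify; a repo named `meme-search` would not.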
16 | 17 | ### Clear README 18 | 19 | - Explain what your example does and how to run it 20 | 21 | ### Use scripts to get external resources 22 | 23 | - **For datasets:** Use a script named `get_data.sh` 24 | - **For models**: If you use an externally-hosted model, call your script `get_model.sh` or similar 25 | - **For other assets:** Follow the `get_xxx.sh` pattern 26 | 27 | ### `.gitignore` and `.dockerignore` 28 | 29 | Have a `.gitignore` file and list any directories that should be ignored. The same goes for `.dockerignore` if you have a `Dockerfile`: 30 | 31 | - `data` directory 32 | - `workspace` directory 33 | - virtual environment directories 34 | - directories that store assets retrieved by [scripts](#use-scripts-to-get-external-resources) 35 | 36 | ### License 37 | 38 | You **must** use an open-source license, specified in `LICENSE` in the root of your repo. 39 | 40 | ## Guidelines 41 | 42 | We're more easy-going on these: 43 | 44 | ### One Example Per Repo 45 | 46 | To make code more maintainable and easier for end users, please include one example per repo. 47 | 48 | ### Tests 49 | 50 | Please include tests to ensure your app or Pod works correctly. 51 | 52 | ### File Structure 53 | 54 | - Please follow the file structure as created by `jina hub new --type app` 55 | - Store data in `data` and externally-downloaded models in `models` 56 | 57 | ### Dockerfile 58 | 59 | We highly encourage you to add a `Dockerfile`. 60 | 61 | ### Docker image 62 | 63 | For self-contained apps, we would love to host a Docker image on [Jina Hub](https://github.com/jina-ai/jina-hub). 64 | -------------------------------------------------------------------------------- /example_template.md: -------------------------------------------------------------------------------- 1 | # Run the EXAMPLE NAME 2 | *You can also include a gif with a full demo of the example* 3 | 4 | 5 | *ADD A TABLE OF CONTENTS HERE* 6 | 7 | - [Overview](#overview) 8 | - [🐍 Build the app with Python](#-build-the-app-with-python) 9 | - [📉 Understanding your results](#-understanding-your-results) 10 | - [🌀 Flow diagram](#-flow-diagram) 11 | - [🔮 Overview of the files](#-overview-of-the-files) 12 | - [🐋 Deploy with Docker](#-deploy-with-docker) 13 | - [👩‍👩‍👧‍👦 Community](#-community) 14 | - [🦄 License](#-license) 15 | 16 | 17 | ## Overview 18 | | About this example: | | 19 | | ------------- | ------------- | 20 | | Learnings | *Describe what the user will learn after running this example* | 21 | | Used for indexing | *What is the datatype of the indexing input* | 22 | | Used for querying | *What is the data type of the query input* | 23 | | Dataset used | *Link to the datasets* | 24 | | Model used | *Link to the model* | 25 | 26 | 27 | ## 🐍 Build the app with Python 28 | 29 | These instructions explain how to build the example yourself and deploy it with Python. If you want to skip the building steps and just run the example with Docker, check [the Docker deployment instructions at the end of this README](#-deploy-with-docker). 30 | 31 | 32 | ### 🗝️ Requirements 33 | 34 | *Here outline in bullet points anything the user is expected to have before diving in.* 35 | 36 | For example: 37 | 38 | 1. You have a working Python 3.8 environment. 39 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts. 40 | 3.
You have at least 2GB of free space on your hard drive. 41 | 42 | ### 👾 Step 1. Clone the repo and install Jina 43 | 44 | Begin by cloning the repo, so you can get the required files and datasets. (If you already have the examples repository on your machine, make sure to fetch the most recent version) 45 | 46 | ```sh 47 | git clone https://github.com/jina-ai/examples 48 | ``` 49 | 50 | And enter the correct folder: 51 | 52 | ```sh 53 | cd examples/example_to_use (replace as necessary) 54 | ``` 55 | 56 | In your terminal, you should now be located in the *enter example name* folder. Let's install Jina and the other required Python libraries. For further information on installing Jina, check out [our documentation](https://docs.jina.ai/chapters/core/setup/). 57 | 58 | ```sh 59 | pip install -r requirements.txt 60 | ``` 61 | 62 | ### 📥 Step 2. Download your data to search (Optional) 63 | 64 | There are two different options here. You can either use the toy data we provide in this repo, which is quick to index but gives very poor results, or you can download a larger dataset, which takes longer to index but gives better results. 65 | 66 | 1. **Toy dataset:** Skip to step 3. No action is needed here. 67 | 68 | 2. **Full dataset:** 69 | In order to get the full dataset, follow the instructions below: 70 | - Register for a free [Kaggle account](https://www.kaggle.com/account/login?phase=startRegisterTab&returnUrl=%2F) 71 | - Set up your API token (see [authentication section of their API docs](https://www.kaggle.com/docs/api)) 72 | - Run `pip install kaggle` 73 | - Run `sh get_data.sh` 74 | 75 | ### 🏃 Step 3. Index your data 76 | In this step, we will index our data. 77 | 78 | *Here describe the Index Flow. Be as specific as possible in describing how this Index Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.* 79 | 80 | ``` 81 | python app.py -t index (replace as necessary) 82 | ``` 83 | 84 | If you see the following output, it means your data has been correctly indexed. 85 | 86 | ``` 87 | Flow@5162[S]:flow is closed and all resources are released, current build level is 0 88 | ``` 89 | 90 | ### 🔎 Step 4. Query your data 91 | Next, we will deploy our query Flow. 92 | 93 | *Here describe the Query Flow. Be as specific as possible in describing how this Query Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.* 94 | 95 | Run the query Flow in your terminal like this: 96 | 97 | ``` 98 | python app.py -t query (replace as necessary) 99 | ``` 100 | ______ 101 | 102 | ## 📉 Understanding your results 103 | *Here include a short description of the results and how to interpret them if needed.* 104 | 105 | ## 🌀 Flow diagram 106 | This diagram provides a visual representation of the Flows in this example, showing which executors are used in which order. 107 | 108 | *Here show the Flow for this example.* 109 | 110 | ## 📖 Optional: Extra information useful for the user 111 | 112 | *Use this section to add extra information you think the user could benefit from.
113 | For example: QueryLanguage, Faiss, or Annoy.* 114 | 115 | ## 🔮 Overview of the files 116 | 117 | *Add a list with all folders/files in the example:* 118 | 119 | | | | 120 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- | 121 | | 📂 `flows/` | Folder to store Flow configuration | 122 | | --- 📃 `index.yml` | YAML file to configure indexing Flow | 123 | | --- 📃 `query.yml` | YAML file to configure querying Flow | 124 | | 📂 `pods/` | Folder to store Pod configuration | 125 | | --- 📃 `encoder.yml` | YAML file to configure encoder Pod | 126 | | 📂 `workspace/` | Folder to store indexed files (embeddings and documents). Automatically created after the first indexing | 127 | 128 | _____ 129 | 130 | ## 🐋 Deploy with Docker 131 | To make it easier for you, we have built and published the Docker image for this example. 132 | 133 | ### ☑️ Requirements: 134 | 135 | 1. You have Docker installed and working. 136 | 2. You have at least 8GB of free space on your hard drive. 137 | 138 | ### 🏃🏿‍♂️ Pull and run the image 139 | Running the following command will pull the Docker image and run it. 140 | 141 | *Replace below with the command to run the Docker image of this example* 142 | 143 | ```bash 144 | docker . 145 | ``` 146 | 147 | _______ 148 | 149 | ## ⏭️ Next steps 150 | 151 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app, check out the [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation. 152 | 153 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai). 154 | 155 | ## 👩‍👩‍👧‍👦 Community 156 | 157 | - [Slack channel](https://slack.jina.ai/) - a communication platform for developers to discuss Jina. 158 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities. 159 | - [![Twitter Follow](https://img.shields.io/twitter/follow/JinaAI_?label=Follow%20%40JinaAI_&style=social)](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`. 160 | - [Company](https://jina.ai) - know more about our company, we are fully committed to open-source! 161 | 162 | ## 🦄 License 163 | 164 | Copyright (c) 2021 Jina AI Limited. All rights reserved. 165 | 166 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text. 
167 | -------------------------------------------------------------------------------- /multires-lyrics-search/.github/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/demo.gif -------------------------------------------------------------------------------- /multires-lyrics-search/.github/index.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/index.jpg -------------------------------------------------------------------------------- /multires-lyrics-search/.github/search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/search.jpg -------------------------------------------------------------------------------- /multires-lyrics-search/.gitignore: -------------------------------------------------------------------------------- 1 | lyrics-data/lyrics-data.csv 2 | -------------------------------------------------------------------------------- /multires-lyrics-search/app.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | __version__ = '0.0.1' 5 | 6 | import os 7 | import sys 8 | import click 9 | 10 | from jina import Flow, Document 11 | from helper import input_generator 12 | from jina.logging.predefined import default_logger as logger 13 | 14 | 15 | def config(): 16 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 17 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace')) 18 | os.environ.setdefault('JINA_WORKSPACE_MOUNT', 19 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace') 20 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO') 21 | if os.path.exists('lyrics-data/lyrics-data.csv'): 22 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-data.csv') 23 | else: 24 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv') 25 | os.environ.setdefault('JINA_PORT', str(45678)) 26 | 27 | 28 | # for index 29 | def index(num_docs): 30 | flow = Flow.load_config('flows/index.yml') 31 | with flow: 32 | input_docs = input_generator(num_docs=num_docs) 33 | data_path = os.path.join(os.path.dirname(__file__), 34 | os.environ.get('JINA_DATA_FILE', None)) 35 | flow.logger.info(f'Indexing {data_path}') 36 | flow.post(on='/index', inputs=input_docs, request_size=10, 37 | show_progress=True) 38 | 39 | 40 | # for search 41 | def query(): 42 | flow = Flow.load_config('flows/query.yml') 43 | flow.rest_api = True 44 | flow.protocol = 'http' 45 | with flow: 46 | flow.block() 47 | 48 | 49 | def query_text(): 50 | def print_result(response): 51 | doc = response.docs[0] 52 | for index, parent in enumerate(doc.matches): 53 | print(f'Parent {index}: Song Name: {parent.tags["SName"]}\n{parent.text}') 54 | for index, chunk in enumerate(doc.chunks): 55 | print(f'Chunk {index}: {chunk.text}') 56 | for match in chunk.matches: 57 | print(f'\tMatch: {match.text}') 58 | 59 | f = Flow.load_config('flows/query.yml') 60 | with f: 61 | search_text = input('Please type a sentence: ') 62 | doc = Document(content=search_text, 
mime_type='text/plain') 63 | response = f.post('/search', inputs=doc, parameters={'lookup_type': 'parent'}, return_results=True) 64 | print_result(response[0].data) 65 | 66 | 67 | @click.command() 68 | @click.option('--task', '-t', 69 | type=click.Choice(['index', 'query', 'query_text'], case_sensitive=False)) 70 | @click.option('--num_docs', '-n', default=10000) 71 | def main(task, num_docs): 72 | config() 73 | workspace = os.environ["JINA_WORKSPACE"] 74 | if task == 'index': 75 | if os.path.exists(workspace): 76 | logger.error(f'\n +---------------------------------------------------------------------------------+ \ 77 | \n | 🤖🤖🤖 | \ 78 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \ 79 | \n | 🤖🤖🤖 | \ 80 | \n +---------------------------------------------------------------------------------+') 81 | sys.exit(1) 82 | index(num_docs) 83 | elif task == 'query': 84 | query() 85 | elif task == 'query_text': 86 | query_text() 87 | else: 88 | raise NotImplementedError( 89 | f'Unknown task: {task}.') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /multires-lyrics-search/flows/index.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # We define the flow used for indexing here 2 | version: '1' # yml version 3 | with: # Parameters for the flow 4 | workspace: $JINA_WORKSPACE # Workspace folder 5 | executors: # Now, define all the executors that are used 6 | - name: segmenter # The first executor splits the input text into sentences which are stored as chunks in the original documents 7 | uses: 'jinahub+docker://Sentencizer' # The type of the executor is Sentencizer, we download it from the hub as a docker container 8 | - name: encoder # Then, compute the embeddings of the sentences in this executor 9 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a TransformerTorchEncoder from the hub 10 | volumes: '~/.cache/huggingface:/root/.cache/huggingface' # Mount the huggingface cache into the docker container 11 | uses_with: # Override some parameters for the executor 12 | pooling_strategy: 'cls' # This is the pooling strategy that is used by the encoder 13 | pretrained_model_name_or_path: distilbert-base-cased # The ML model that is used 14 | max_length: 96 # Max length argument for the tokenizer 15 | device: 'cpu' # Run the executor on CPU - For GPU, we would have to use another container! 16 | default_traversal_paths: ['c'] # Compute the embeddings on the chunk level - the sentences created before 17 | - name: indexer # Now, index the sentences and store them to disk. 
18 | uses: 'jinahub://SimpleIndexer/old' # We use a simple indexer for that purpose (not in docker, but using the source code - there are some bugs with docker for this executor) 19 | uses_metas: # Set some meta arguments for this executor 20 | workspace: $JINA_WORKSPACE # Define the workspace folder for the executor 21 | uses_with: # Override parameters for the executor 22 | default_traversal_paths: ['c'] # Store the sentences on disk - this means on chunk level 23 | - name: root_indexer # Additionally to the sentences, we also need to store the original songs which are not split into sentences 24 | uses: 'jinahub+docker://LMDBStorage' # Therefore, we use an LMDBStorage indexer 25 | volumes: $JINA_WORKSPACE_MOUNT # Again, mount the workspace 26 | uses_with: # Override some parameters for the LMDBStorage 27 | default_traversal_paths: ['r'] # Now, we store the root documents, not the sentence chunks 28 | needs: [gateway] # We can start this at the beginning - in parallel to the sentence flow 29 | - name: wait_both # Now, we wait for both the root indexing and the sentence path to finish 30 | needs: [indexer, root_indexer] # Continue once these two executors are finished 31 | -------------------------------------------------------------------------------- /multires-lyrics-search/flows/query.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # Now, we define the search flow for this example 2 | version: '1' # It is quite similar to the index flow, only the differences are explained here 3 | with: 4 | port_expose: $JINA_PORT # Port to run the flow on 5 | cors: true # Add cross origin headers to the request responses 6 | executors: 7 | - name: segmenter # First, split the search text into sentences again 8 | uses: 'jinahub+docker://Sentencizer' 9 | - name: encoder # Encode the search sentences into embeddings 10 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' 11 | volumes: '~/.cache/huggingface:/root/.cache/huggingface' 12 | uses_with: 13 | pooling_strategy: 'cls' 14 | pretrained_model_name_or_path: distilbert-base-cased 15 | max_length: 96 16 | device: 'cpu' 17 | default_traversal_paths: ['c'] 18 | - name: indexer # Compare the search sentence embeddings to the stored sentence embeddings from the indexing 19 | uses: 'jinahub://SimpleIndexer/old' # Then, return the closest matches for every sentence 20 | uses_metas: 21 | workspace: $JINA_WORKSPACE 22 | uses_with: 23 | default_traversal_paths: ['c'] 24 | read_only: True 25 | - name: ranker # Now, we need to use a special ranker in the query flow 26 | uses: 'jinahub+docker://SimpleRanker' # This ranker collects all the matches from the sentences and adds them to the root document 27 | uses_with: # It also orders the matches according to their minimum distance (see the sketch after this file) 28 | metric: 'cosine' 29 | - name: root_indexer # Now, we can collect the stored metadata from the root documents to the matches collected by the SimpleRanker 30 | uses: 'jinahub+docker://LMDBStorage' 31 | volumes: $JINA_WORKSPACE_MOUNT 32 | uses_with: 33 | default_traversal_paths: ['m'] 34 | read_only: True 
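Because the min-distance aggregation performed by the ranker is the heart of the multires ranking, here is a rough sketch of the idea in plain Python (illustrative only; this is not the SimpleRanker source, and the function name is made up for this sketch):

```python
from collections import defaultdict

def rank_parents(chunk_matches):
    # chunk_matches: (parent_id, cosine_distance) pairs gathered from the
    # matches of every query sentence (chunk)
    best = defaultdict(lambda: float('inf'))
    for parent_id, dist in chunk_matches:
        # a song (parent) is scored by the best distance any of its sentences achieved
        best[parent_id] = min(best[parent_id], dist)
    # the closest (smallest-distance) songs come first
    return sorted(best.items(), key=lambda kv: kv[1])

# e.g. rank_parents([('song-a', 0.2), ('song-b', 0.4), ('song-a', 0.7)])
# -> [('song-a', 0.2), ('song-b', 0.4)]
```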
-------------------------------------------------------------------------------- /multires-lyrics-search/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | kaggle datasets download -d neisse/scrapped-lyrics-from-6-genres 3 | unzip scrapped-lyrics-from-6-genres.zip 4 | rm -rf scrapped-lyrics-from-6-genres.zip 5 | rm -rf artists-data.csv 6 | mv lyrics-data.csv lyrics-data/lyrics-data.csv 7 | -------------------------------------------------------------------------------- /multires-lyrics-search/helper.py: -------------------------------------------------------------------------------- 1 | """Helper functions for the multires example""" 2 | 3 | import csv 4 | import itertools as it 5 | import os 6 | import numpy as np 7 | 8 | from jina import Document 9 | 10 | 11 | def input_generator(num_docs: int): 12 | lyrics_file = os.environ.setdefault('JINA_DATA_FILE', 13 | 'lyrics-data/lyrics-toy-data1000.csv') 14 | with open(lyrics_file, newline='', encoding='utf-8') as f: 15 | reader = csv.reader(f) 16 | for row in it.islice(reader, num_docs): 17 | if row[-1] == 'ENGLISH': 18 | d = Document(text=row[3]) 19 | d.tags['ALink'] = row[0] 20 | d.tags['SName'] = row[1] 21 | d.tags['SLink'] = row[2] 22 | yield d 23 | 24 | 25 | def num_input_docs(): 26 | lyrics_file = os.environ.setdefault( 27 | 'JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv' 28 | ) 29 | with open(lyrics_file, newline='', encoding='utf-8') as f: 30 | reader = csv.reader(f) 31 | return len(list(reader)) 32 | 33 | def _ext_A(A):  # expand rows of A into [1, A, A**2] blocks 34 | nA, dim = A.shape 35 | A_ext = np.ones((nA, dim * 3)) 36 | A_ext[:, dim : 2 * dim] = A 37 | A_ext[:, 2 * dim :] = A ** 2 38 | return A_ext 39 | 40 | 41 | def _ext_B(B):  # expand columns of B into [(B**2).T, -2*B.T, 1] blocks, so that _ext_A(A).dot(_ext_B(B)) = ||a||^2 - 2ab + ||b||^2, i.e. all pairwise squared distances in one matrix product 42 | nB, dim = B.shape 43 | B_ext = np.ones((dim * 3, nB)) 44 | B_ext[:dim] = (B ** 2).T 45 | B_ext[dim : 2 * dim] = -2.0 * B.T 46 | del B 47 | return B_ext 48 | 49 | 50 | def _norm(A):  # L2-normalize each row 51 | return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True) 52 | 53 | 54 | def _euclidean(A_ext, B_ext):  # pairwise euclidean distances from the expanded matrices above 55 | sqdist = A_ext.dot(B_ext).clip(min=0) 56 | return np.sqrt(sqdist) -------------------------------------------------------------------------------- /multires-lyrics-search/requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.1 2 | jina[standard]==2.0.18 3 | kaggle==1.5.12 4 | docker 5 | git+git://github.com/jina-ai/jina-commons@v0.0.3 -------------------------------------------------------------------------------- /multires-lyrics-search/static/index.html: -------------------------------------------------------------------------------- [index.html was exported without its markup. The page is the demo's front end: a Query text box with a live search indicator, a Breakdown panel that highlights each query sentence (chunk) and its matches behind an adjustable distance-threshold slider, and a masonry grid of the Top-K matching songs with per-song relevance scores. The logic behind it lives in static/vue-bindings.js below.]
-------------------------------------------------------------------------------- /multires-lyrics-search/static/main.css: -------------------------------------------------------------------------------- 1 | .lyric-text { 2 | font-size: 10px; 3 | text-align: center; 4 | max-height: 30em; 5 | overflow: scroll; 6 | } 7 | 8 | .blockquote-footer { 9 | text-align: center; 10 | } 11 | 12 | .query-chunk { 13 | border-width: 1px; 14 | border-radius: 3px; 15 | border-style: solid; 16 | } 17 | 18 | .query-chunk-breakdown { 19 | margin: 5px; 20 | border-style: solid; 21 | border-width: 1px; 22 | border-radius: 5px; 23 | border-color: lightgray; 24 | padding: 5px; 25 | } 26 | 27 | .card { 28 | margin: 5px; 29 | } -------------------------------------------------------------------------------- /multires-lyrics-search/static/vue-bindings.js: -------------------------------------------------------------------------------- 1 | var VueMasonryPlugin = window["vue-masonry-plugin"].VueMasonryPlugin; 2 | Vue.use(VueMasonryPlugin); 3 | 4 | const vm = new Vue({ 5 | el: '#jina-ui', 6 | data: { 7 | serverUrl: 'http://localhost:45678/search', 8 | top_k: 50, 9 | topkDocs: [], 10 | topkDocsDict: {}, 11 | results: [], 12 | searchQuery: '', 13 | queryChunks: [], 14 | selectQueryChunks: [], 15 | queryItem: [], 16 | docItem: null, 17 | loadedItem: 0, 18 | loadedQuery: 0, 19 | searchQueryIsDirty: false, 20 | isCalculating: false, 21 | distThreshold: 999, 22 | sliderOptions: { 23 | dotSize: 14, 24 | width: 'auto', 25 | height: 4, 26 | contained: false, 27 | direction: 'ltr', 28 | data: null, 29 | min: 999, 30 | max: 0, 31 | interval: 0.01, 32 | disabled: false, 33 | clickable: true, 34 | duration: 0.5, 35 | adsorb: false, 36 | lazy: false, 37 | tooltip: 'active', 38 | tooltipPlacement: 'top', 39 | tooltipFormatter: void 0, 40 | useKeyboard: false, 41 | keydownHook: null, 42 | dragOnClick: false, 43 | enableCross: true, 44 | fixed: false, 45 | minRange: void 0, 46 | maxRange: void 0, 47 | order: true, 48 | marks: false, 49 | dotOptions: void 0, 50 | process: true, 51 | dotStyle: void 0, 52 | railStyle: void 0, 53 | processStyle: void 0, 54 | tooltipStyle: void 0, 55 | stepStyle: void 0, 56 | stepActiveStyle: void 0, 57 | labelStyle: void 0, 58 | labelActiveStyle: void 0, 59 | } 60 | }, 61 | mounted: function () { 62 | 63 | }, 64 | components: { 65 | 'vueSlider': window['vue-slider-component'], 66 | }, 67 | computed: { 68 | searchIndicator: function () { 69 | if (this.isCalculating) { 70 | return '⟳ Fetching new results...' 71 | } else if (this.searchQueryIsDirty) { 72 | return '... 
Typing' 73 | } else { 74 | 75 | return '✓ Done' 76 | } 77 | } 78 | }, 79 | watch: { 80 | searchQuery: function () { 81 | this.searchQueryIsDirty = true 82 | this.expensiveOperation() 83 | }, 84 | distThreshold: function () { 85 | this.refreshAllCards(); 86 | } 87 | }, 88 | methods: { 89 | clearAllSelect: function () { 90 | vm.queryChunks.forEach(function (item, i) { 91 | item['isSelect'] = !item['isSelect']; 92 | vm.refreshAllCards(); 93 | }); 94 | }, 95 | selectChunk: function (item) { 96 | item['isSelect'] = !item['isSelect']; 97 | vm.refreshAllCards(); 98 | }, 99 | refreshAllCards: function () { 100 | vm.topkDocsDict = new Map(vm.topkDocs.map(i => [i.id, { 101 | 'text': i.text, 102 | 'hlchunk': [], 103 | 'renderHTML': i.text 104 | }])); 105 | vm.queryChunks.forEach(function (item, i) { 106 | if (!('isSelect' in item)) { 107 | item['isSelect'] = true; 108 | } 109 | if (item['isSelect']) { 110 | item.matches.forEach(function (r) { 111 | if (vm.topkDocsDict.has(r.parentId)) { 112 | let dist = r.scores['cosine'].value 113 | if (dist < vm.distThreshold) { 114 | // console.log(item) 115 | vm.topkDocsDict.get(r.parentId)['hlchunk'].push({ 116 | 'range': r.location, 117 | 'idx': i, 118 | 'dist': dist, 119 | 'range_str': r.location[0] + ',' + r.location[1] 120 | }); 121 | } 122 | if (dist < vm.sliderOptions.min) { 123 | vm.sliderOptions.min = dist.toFixed(2) 124 | } 125 | if (dist > vm.sliderOptions.max) { 126 | vm.sliderOptions.max = dist.toFixed(2) 127 | } 128 | 129 | } else { 130 | console.error(r.id); 131 | } 132 | }); 133 | } 134 | }); 135 | vm.topkDocsDict.forEach(function (value, key, map) { 136 | vm.topkDocsDict.get(key)['hlchunk'].sort(function (a, b) { 137 | return b['range'][0] - a['range'][0] 138 | }) 139 | var replace_map = new Map(); 140 | value['hlchunk'].forEach(function (item) { 141 | if (!replace_map.has(item['range_str'])) { 142 | replace_map.set(item['range_str'], []) 143 | } 144 | replace_map.get(item['range_str']).push(item) 145 | 146 | }) 147 | 148 | replace_map.forEach(function (item, kk, mm) { 149 | value['renderHTML'] = replaceRange(value['renderHTML'], item[0]['range'][0], item[0]['range'][1], item) 150 | }) 151 | }) 152 | vm.$nextTick(function () { 153 | vm.$redrawVueMasonry('my-masonry'); 154 | }) 155 | }, 156 | // This is where the debounce actually belongs. 
157 | expensiveOperation: _.debounce(function () { 158 | this.isCalculating = true 159 | vm.selectQueryChunks.length = 0; 160 | $.ajax({ 161 | url: this.serverUrl, 162 | type: "POST", 163 | contentType: "application/json", 164 | cache: false, 165 | data: JSON.stringify({ 166 | "parameters": {"top_k": this.top_k}, 167 | "data": [this.searchQuery] 168 | }), 169 | error: function (jqXHR, textStatus, errorThrown) { 170 | console.log(jqXHR); 171 | console.log(textStatus); 172 | console.log(errorThrown); 173 | }, 174 | success: function (data) { 175 | vm.topkDocs = data.data.docs[0].matches; 176 | console.log('Number parents: ' + vm.topkDocs.length); 177 | vm.queryChunks = data.data.docs[0].chunks; 178 | console.log('Number chunks: ' + vm.queryChunks.length); 179 | vm.refreshAllCards(); 180 | console.log('Success'); 181 | }, 182 | complete: function () { 183 | vm.isCalculating = false 184 | vm.searchQueryIsDirty = false 185 | vm.$nextTick(function () { 186 | vm.$redrawVueMasonry('my-masonry'); 187 | }) 188 | } 189 | }); 190 | 191 | }, 500) 192 | } 193 | }); 194 | 195 | function replaceRange(s, start, end, chunks) { 196 | var content = s.substring(start, end) 197 | chunks.forEach(function (c) { 198 | // wrap the matched range in a highlight span; the tag markup was stripped in this export and is reconstructed here from the .query-chunk CSS class and the selectColor helper below 199 | content = '<span class="query-chunk" style="background-color:' + selectColor(c.idx, true) + '">' + content + '</span>' 200 | }) 201 | return s.substring(0, start) + content + s.substring(end); 202 | } 203 | 204 | function selectColor(number, colored) { 205 | if (!colored) { 206 | return `#fff`; 207 | } 208 | const hue = number * 137.508; // use golden angle approximation 209 | return `hsl(${hue},50%,75%)`; 210 | } -------------------------------------------------------------------------------- /multires-lyrics-search/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/tests/__init__.py -------------------------------------------------------------------------------- /multires-lyrics-search/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Patch random_port so parallel test runs don't collide on ports (the birthday problem)""" 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope='function', autouse=True) 7 | def patched_random_port(mocker): 8 | used_ports = set() 9 | from jina.helper import random_port 10 | from jina.excepts import NoAvailablePortError 11 | 12 | def _random_port(): 13 | 14 | for i in range(10): 15 | _port = random_port() 16 | 17 | if _port is not None and _port not in used_ports: 18 | used_ports.add(_port) 19 | return _port 20 | raise NoAvailablePortError 21 | 22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port) -------------------------------------------------------------------------------- /multires-lyrics-search/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.1 2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard] 3 | pytest==6.1.2 4 | kaggle==1.5.12 5 | docker 6 | git+git://github.com/jina-ai/jina-commons@v0.0.3 -------------------------------------------------------------------------------- /multires-lyrics-search/tests/test_flow_integration.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 
2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | import shutil 6 | import glob 7 | from typing import List 8 | from click.testing import CliRunner 9 | 10 | import pytest 11 | from jina import Flow, Document 12 | 13 | from app import main 14 | 15 | 16 | def get_files_with_patterns(directory: str, match_patterns: List[str]) -> List[str]: 17 | """ 18 | Returns all files from directory and subdirectories that match any of the patterns in the list. 19 | The returned list will only contain unique items. 20 | 21 | :param directory: Path to the directory 22 | :param match_patterns: A list of expressions to match the files against. E.g. `*.json` 23 | :return: List of matched files. 24 | """ 25 | index_files = [] 26 | for pattern in match_patterns: 27 | index_files += list(glob.glob(os.path.join(directory, '**', pattern), recursive=True)) 28 | return list(set(index_files)) 29 | 30 | 31 | @pytest.fixture(scope='session', autouse=True) 32 | def index(tmpdir_factory): 33 | """ 34 | This fixture runs automatically once before each test session. 35 | It indexes a small set of files into a test workspace and checks that the indexing 36 | completes correctly. 37 | 38 | Other tests can use the created workspace and test queries against it. 39 | """ 40 | assert os.getcwd().endswith('multires-lyrics-search'), \ 41 | "Please execute the tests from the root directory: >>> pytest tests/" 42 | 43 | workspace = os.path.join(tmpdir_factory.getbasetemp(), 'test-workspace') 44 | assert not os.path.isdir(workspace), 'Directory ./test-workspace exists. Please remove before testing' 45 | os.environ['JINA_WORKSPACE'] = workspace 46 | os.environ.setdefault('JINA_WORKSPACE_MOUNT', 47 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace') 48 | os.environ.setdefault('JINA_PORT', str(45678)) 49 | 50 | runner = CliRunner() 51 | result = runner.invoke(main, ['-t', 'index', '-n', '100']) 52 | assert result.stderr_bytes is None, f'Error messages found during indexing: {result.stderr}' 53 | 54 | assert os.path.isdir(workspace) 55 | index_files = get_files_with_patterns(workspace, ['*.bin', '*.lmdb', '*.lmdb-lock']) 56 | assert len(index_files) == 4, 'Expected four files in the workspace' 57 | for _file in index_files: 58 | assert os.path.getsize(_file) > 0, f'File {_file} is empty.' 59 | 60 | yield 61 | # shutil.rmtree(workspace) Not possible due to docker sudo rights 62 | 63 | 64 | def test_query_text(tmpdir_factory): 65 | def assert_result(response): 66 | docs = response.docs 67 | # check number of results 68 | assert len(docs) == 1 69 | assert len(docs[0].chunks) == 2 70 | parent_docs = docs[0].matches 71 | parent_ids = parent_docs.get_attributes('id') 72 | assert len(parent_docs) > 0 73 | for chunk in docs[0].chunks: 74 | assert len(chunk.matches) == 5 # top_k = 5 75 | match_ids = chunk.matches.get_attributes('id') 76 | assert len(match_ids) == len(list(set(match_ids))) 77 | for match in chunk.matches: 78 | assert match.text is not None 79 | assert match.location is not None 80 | assert match.parent_id in parent_ids 81 | assert match.text in parent_docs[parent_ids.index(match.parent_id)].text 82 | 83 | flow = Flow.load_config('flows/query.yml') 84 | with flow: 85 | search_text = 'looked through every window then. hello world.' 
86 | doc = Document(content=search_text, mime_type='text/plain') 87 | response = flow.post('/search', inputs=doc, parameters={'top_k': 5}, return_results=True) 88 | assert_result(response[0]) 89 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = cross-modal-search/img_emb/* cross-modal-search/txt_emb/* openapi/python-flask/openapi_server/* -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/.github/images/storage.svg: -------------------------------------------------------------------------------- 1 |
[storage.svg: diagram of the Storage Flow - gateway → storage_encoder (jinahub+docker://FlairTextEncoder) → storage_indexer (jinahub+docker://LMDBStorage) → gateway]
-------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/.gitignore: -------------------------------------------------------------------------------- 1 | workspace* 2 | env 3 | results 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/README.md: -------------------------------------------------------------------------------- 1 | # Querying While Indexing in the Wikipedia Search Example 2 | 3 | | About this example: | | 4 | | ------------- | ------------- | 5 | | Learnings | How to configure Jina for querying while indexing | 6 | | Used for indexing | Text data | 7 | | Used for querying | Text data | 8 | | Dataset used | [Wikipedia dataset from Kaggle](https://www.kaggle.com/mikeortman/wikipedia-sentences) | 9 | | Model used | [flair-text](https://github.com/flairNLP/flair) | 10 | 11 | This is an example of using [Jina](http://www.jina.ai) to support both querying and indexing simultaneously in our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences). 12 | 13 | ## Table of contents: 14 | 15 | * [Prerequisites](#prerequisites) 16 | * [What is querying while indexing?](#what-is-querying-while-indexing) 17 | * [Configuration changes](#configuration-changes) 18 | * [🐍 Build the app with Python](#-build-the-app-with-python) 19 | * [Flow diagrams](#flow-diagrams) 20 | * [🔮 Overview of the files](#-overview-of-the-files) 21 | * [Troubleshooting](#troubleshooting) 22 | * [⏭️ Next steps](#-next-steps) 23 | * [👩‍👩‍👧‍👦 Community](#-community) 24 | * [🦄 License](#-license) 25 | 26 | ## Prerequisites 27 | 28 | - Run and understand our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences) 29 | 30 | ## What is querying while indexing? 31 | 32 | Querying while indexing means you can still query your data while new data is simultaneously being inserted, updated, or deleted. 33 | Jina achieves this with its dump-reload feature. 34 | 35 | ## Configuration changes 36 | 37 | This feature requires you to split the Flow in two: one for Indexing (plus Updates and Deletes) and one for Querying, with both running at the same time. 38 | You will also need to replace the indexers in the Flows. 39 | The Index Flow (also referred to as the Storage Flow) will require a [Storage Indexer](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage), while the Query Flow requires a [Compound Searcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher). 40 | 41 | In our case we use: 42 | 43 | - [LMDBStorage](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage/LMDBStorage), which uses a disk-based key-value storage [LMDB](https://lmdb.readthedocs.io/) as a storage engine. 44 | - [FaissLMDBSearcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher/compound/FaissLMDBSearcher), which uses the [`faiss`](https://github.com/facebookresearch/faiss) algorithm to provide faster query results and LMDB to retrieve the metadata. 45 | 
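To make the dump-reload cycle concrete, here is a minimal sketch of one round, distilled from `app.py` further down in this example (the ports, pod names, and dump path are this example's defaults; the flow ID placeholder is hypothetical - `app.py` keeps the real value returned by JinaD):

```python
from jina import Client
from daemon.clients import JinaDClient

# ID returned by jinad.flows.create(...) when the Query Flow was started
query_flow_id = '<your-query-flow-id>'  # hypothetical placeholder

# 1. ask the storage indexer to dump its vectors and metadata to disk
Client(host='localhost', port=9000, protocol='http').post(
    on='/dump',
    parameters={'shards': 1, 'dump_path': '/jinad_workspace/dump/0'},
    target_peapod='storage_indexer',
)

# 2. roll the query indexer's replicas so they reload from the new dump;
#    one replica keeps answering queries while the other restarts
JinaDClient(host='localhost', port=8000).flows.update(
    id=query_flow_id,
    kind='rolling_update',
    pod_name='query_indexer',
    dump_path='/jinad_workspace/dump/0',
)
```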
46 | _____ 47 | 48 | ## 🐍 Build the app with Python 49 | 50 | These instructions explain how to run the example yourself and deploy it with Python. 51 | 52 | ### 🗝️ Requirements 53 | 54 | 1. Have a working Python 3.7 or 3.8 environment. 55 | 1. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts. 56 | 1. Install [Docker Engine](https://docs.docker.com/engine/install/). 57 | 1. Have at least 5 GB of free space on your hard drive. 58 | 59 | 60 | ### Running the example 61 | 62 | ### 👾 Step 1. Clone the repo and install Jina 63 | 64 | Begin by cloning the repo so you can get the required files and datasets. (If you already have the examples repository on your machine, make sure to fetch the most recent version.) 65 | 66 | ```sh 67 | git clone https://github.com/jina-ai/examples 68 | cd examples/wikipedia-sentences-query-while-indexing 69 | ``` 70 | 71 | Let's install `jina` and the other required libraries. For further information on installing jina, check out [our documentation](https://docs.jina.ai/get-started/install/). 72 | 73 | ```sh 74 | pip install -r requirements.txt 75 | ``` 76 | 77 | In order to run the example you will need to do the following: 78 | 79 | ### 📥 Step 2. Download your data to search (Optional) 80 | 81 | The repo includes a small subset of the Wikipedia dataset, for quick testing. You can just use that. 82 | 83 | If you want to use the entire dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant (in `app.py`) to point to that file. 84 | 85 | ### 🏃 Step 3. Running the Flows 86 | 87 | In this example, we use [JinaD](https://docs.jina.ai/advanced/daemon/#remote-management-via-jinad) to serve the two Flows (Index and Query) and listen to incoming requests. 88 | 89 | 1. Start the `JinaD` server using the command below. 90 | 91 | ```bash 92 | docker run --add-host host.docker.internal:host-gateway \ 93 | -v /var/run/docker.sock:/var/run/docker.sock \ 94 | -v /tmp/jinad:/tmp/jinad \ 95 | -p 8000:8000 \ 96 | --name jinad \ 97 | -d jinaai/jina:2.1.0-daemon 98 | ``` 99 | 100 | 2. Run `python app.py -t flows` 101 | 102 | This will create the two Flows and then repeatedly do the following every 10 seconds (each step can also be performed from any other REST client): 103 | 104 | 1. Index 5 Documents. 105 | 2. Send a `DUMP` request to the Storage (Index) Flow to dump its data to a specific location. 106 | 3. Send a `ROLLING_UPDATE` request to the Query Flow to take down its Indexers and start them again, with the new data located at the respective path. 107 | 108 | **Warning**: If you want to use the entire Wikipedia dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant to point to that file. 109 | 110 | ### 🔎 Step 4. Query your data 111 | 112 | Finally, in a second terminal, run `python app.py -t client`. 113 | 114 | This will prompt you for a query, send the query to the Query Flow, and then show you the results. 115 | 116 | Since the Flows use the `http` protocol, you can query the REST API with any `Client` provided by jina, or use `cURL`, `Postman`, the [custom Swagger UI provided with jina](https://docs.jina.ai/fundamentals/practice-your-learning/#query-via-swaggerui), etc. 117 | 
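For instance, the same query that `python app.py -t client` sends can be issued in a few lines of Python (a sketch; host and port match `QUERY_REST_PORT` in this example's `app.py`):

```python
from jina import Client, Document

# talk to the Query Flow's HTTP endpoint (port 9001, see flows/query.yml)
client = Client(host='localhost', port=9001, protocol='http')
response = client.search(inputs=Document(text='hello world'), return_results=True)

for match in response[0].data.docs[0].matches:
    print(f"{match.scores['euclidean'].value:.2f}  {match.text}")
```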
118 | #### Cleanup 119 | 120 | JinaD creates several containers during this process. In order to remove all the containers, do the following after you are done using the example: 121 | 122 | `docker stop $(docker ps -a -q)` 123 | and 124 | `docker rm $(docker ps -a -q)` 125 | 126 | ## Flow diagrams 127 | 128 | Below you can see a graphical representation of the Flow pipeline: 129 | 130 | #### Storage Flow 131 | 132 | ![](.github/images/storage.svg) 133 | 134 | #### Query Flow 135 | 136 | ![](.github/images/query.svg) 137 | 138 | Notice the following: 139 | 140 | - the encoder has the same configuration in both Flows 141 | - the Query Flow uses replicas. One replica continues to serve requests while the other is being reloaded. 142 | - the Indexer in the Query Flow is actually made up of two Indexers: one for vectors, one for Document metadata. In the Storage Flow, this data is stored in one Storage Indexer. 143 | 144 | ## 🔮 Overview of the files 145 | 146 | | File or folder | Contents | 147 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- | 148 | | 📂 `data/` | Folder where the data files are stored | 149 | | 📂 `flows/` | Folder to store Flow configuration | 150 | | --- 📃 `storage.yml` | YAML file to configure Storage (Index) Flow | 151 | | --- 📃 `query.yml` | YAML file to configure Querying Flow | 152 | | 🐍 `app.py` | Code file for the example | 153 | 154 | _________ 155 | 156 | ## ⏭️ Next steps 157 | 158 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app, check out the [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation. 159 | 160 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai). 161 | 162 | ## 👩‍👩‍👧‍👦 Community 163 | 164 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina. 165 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities. 166 | - [![Twitter Follow](https://img.shields.io/twitter/follow/JinaAI_?label=Follow%20%40JinaAI_&style=social)](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`. 167 | - [Company](https://jina.ai) - know more about our company. We are fully committed to open-source! 168 | 169 | ## 🦄 License 170 | 171 | Copyright (c) 2021 Jina AI Limited. All rights reserved. 172 | 173 | Jina is licensed under the Apache License, Version 2.0. See LICENSE for the full license text. 174 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/__init__.py -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/app.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 
2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | import time 6 | import traceback 7 | from typing import List, Dict 8 | 9 | import click 10 | from daemon.clients import JinaDClient 11 | from jina.logging.logger import JinaLogger 12 | from jina import __default_host__, Document, DocumentArray, Client 13 | 14 | os.environ['JINA_LOG_LEVEL'] = 'DEBUG' 15 | 16 | HOST = __default_host__ # change this if you are using remote jinad 17 | JINAD_PORT = 8000 # change this if you start jinad on a different port 18 | DUMP_PATH = '/jinad_workspace/dump' # the path to dump the index data to 19 | SHARDS = 1 # change this if you change pods/query_indexer.yml 20 | DUMP_RELOAD_INTERVAL = 10 # time between dump - rolling update calls 21 | DATA_FILE = 'data/toy.txt' # change this if you get the full data 22 | DOCS_PER_ROUND = 5 # number of documents to index in each round 23 | STORAGE_FLOW_YAML_FILE = 'storage.yml' # indexing Flow yaml name 24 | QUERY_FLOW_YAML_FILE = 'query.yml' # querying Flow yaml name 25 | STORAGE_REST_PORT = 9000 # REST port of storage Flow, defined in flows/storage.yml 26 | QUERY_REST_PORT = 9001 # REST port of Query Flow, defined in flows/query.yml 27 | 28 | logger = JinaLogger('jina') 29 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 30 | jinad_client = JinaDClient(host=HOST, port=JINAD_PORT, timeout=10 * 60) 31 | 32 | 33 | def docarray_from_file(filename): 34 | docs = [] 35 | with open(filename) as f: 36 | for line in f: 37 | docs.append(Document(text=line)) 38 | return DocumentArray(docs) 39 | 40 | 41 | def query_restful(): 42 | while True: 43 | text = input('please type a sentence: ') 44 | if not text: 45 | break 46 | 47 | query_doc = Document() 48 | query_doc.text = text 49 | response = query_docs(query_doc) 50 | matches = response[0].data.docs[0].matches 51 | len_matches = len(matches) 52 | logger.info(f'Ta-Dah🔮, we found {len_matches} matches for: "{text}":') 53 | 54 | for idx, match in enumerate(matches): 55 | score = match.scores['euclidean'].value 56 | if score < 0.0: 57 | continue 58 | logger.info(f'> {idx:>2d}({score:.2f}). {match.text}') 59 | 60 | 61 | def index_docs(docs: DocumentArray, round: int): 62 | docs_to_send = docs[round * DOCS_PER_ROUND : (round + 1) * DOCS_PER_ROUND] 63 | logger.info(f'Indexing {len(docs_to_send)} document(s)...') 64 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').index(inputs=docs_to_send) 65 | 66 | 67 | def query_docs(docs: Document): 68 | logger.info(f'Searching document {docs}...') 69 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True) 70 | 71 | 72 | def create_flows(): 73 | workspace_id = jinad_client.workspaces.create(paths=[os.path.join(cur_dir, 'flows')]) 74 | jinad_workspace = jinad_client.workspaces.get(workspace_id)['metadata']['workdir'] 75 | 76 | logger.info('Creating storage Flow...') 77 | storage_flow_id = jinad_client.flows.create( 78 | workspace_id=workspace_id, filename=STORAGE_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace} 79 | ) 80 | logger.info(f'Created successfully. Flow ID: {storage_flow_id}') 81 | logger.info('Creating Query Flow...') 82 | query_flow_id = jinad_client.flows.create( 83 | workspace_id=workspace_id, filename=QUERY_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace} 84 | ) 85 | logger.info(f'Created successfully. 
Flow ID: {query_flow_id}') 86 | return storage_flow_id, query_flow_id, workspace_id 87 | 88 | 89 | def dump_and_roll_update(storage_flow_id: str, query_flow_id: str): 90 | docs = docarray_from_file(DATA_FILE) 91 | logger.info(f'starting dump and rolling-update process') 92 | round = 0 93 | while True: 94 | logger.info(f'round {round}:') 95 | index_docs(docs, round) 96 | current_dump_path = os.path.join(DUMP_PATH, str(round)) 97 | 98 | logger.info(f'dumping...') 99 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').post( 100 | on='/dump', 101 | parameters={'shards': SHARDS, 'dump_path': current_dump_path}, 102 | target_peapod='storage_indexer', 103 | ) 104 | 105 | # JinaD is used for ctrl requests on Flows 106 | logger.info(f'performing rolling update across replicas...') 107 | jinad_client.flows.update( 108 | id=query_flow_id, 109 | kind='rolling_update', 110 | pod_name='query_indexer', 111 | dump_path=current_dump_path, 112 | ) 113 | logger.info(f'rolling update done. sleeping for {DUMP_RELOAD_INTERVAL}secs...') 114 | time.sleep(DUMP_RELOAD_INTERVAL) 115 | round += 1 116 | 117 | 118 | def cleanup(storage_flow_id, query_flow_id, workspace_id): 119 | jinad_client.flows.delete(storage_flow_id) 120 | jinad_client.flows.delete(query_flow_id) 121 | jinad_client.workspaces.delete(workspace_id) 122 | 123 | 124 | @click.command() 125 | @click.option( 126 | '--task', 127 | '-t', 128 | type=click.Choice(['flows', 'client'], case_sensitive=False), 129 | ) 130 | def main(task: str): 131 | """main entrypoint for this example""" 132 | if task == 'flows': 133 | # start a Index Flow, dump the data from the Index Flow, and load it into the Query Flow. 134 | try: 135 | storage_flow_id, query_flow_id, workspace_id = create_flows() 136 | # starting a loop that 137 | # - indexes some data in batches 138 | # - sends request to storage Flow in JinaD to dump its data to a location 139 | # - send request to Query Flow in JinaD to perform rolling update across its replicas, 140 | # which reads the new data in the dump 141 | dump_and_roll_update(storage_flow_id, query_flow_id) 142 | except (Exception, KeyboardInterrupt) as e: 143 | if e: 144 | logger.warning(f'Caught: {e}. Original stacktrace following:') 145 | logger.error(traceback.format_exc()) 146 | logger.info('Shutting down and cleaning Flows in JinaD...') 147 | cleanup(storage_flow_id, query_flow_id, workspace_id) 148 | 149 | elif task == 'client': 150 | query_restful() 151 | 152 | 153 | if __name__ == '__main__': 154 | main() 155 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/flows/query.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # we define the search Flow 2 | version: '1' 3 | with: 4 | protocol: http # we use the REST API 5 | port_expose: 9001 # the port the Flow will listen on 6 | executors: # the list of components in this Flow 7 | - name: query_encoder # the name of this executor. This one takes the text and transforms it into vectors to be used in searching 8 | uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor docker image 9 | timeout_ready: -1 # disable timing out. (downloading the image can take some time) 10 | - name: query_indexer # the name. This is a Compound Executor, formed of a vector searcher and a key-value db 11 | uses: jinahub+docker://FaissLMDBSearcher # again, the docker image 12 | replicas: 2 # we want to replicate this executor, for better performance. 
This creates two identical copies. Requests are passed to either one. 13 | timeout_ready: -1 # disable timing out. (downloading the image can take some time) 14 | volumes: $JINAD_WORKSPACE:/jinad_workspace # we need a workspace where the LMDB db file will be stored 15 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/flows/storage.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # we define the Flow used for storing (CRUD operations) 2 | version: '1' 3 | with: 4 | protocol: http # we want to use the REST HTTP API 5 | port_expose: 9000 # the port to listen on. This is referenced in `app.py` 6 | executors: # the components in this Flow 7 | - name: storage_encoder # the name. This is the Encoder (transforms the text into vectors) 8 | uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor from Jina Hub 9 | timeout_ready: -1 # disable timing out on startup (downloading image can take some time) 10 | - name: storage_indexer # the name. This stores the data in an LMDB db 11 | uses: jinahub+docker://LMDBStorage # again, we use a docker image 12 | timeout_ready: -1 # disable startup timeout 13 | volumes: $JINAD_WORKSPACE:/jinad_workspace # workspace where the file will be stored 14 | 15 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | DATASET="mikeortman/wikipedia-sentences" 3 | DATA_DIR="data" 4 | LINES=3000 5 | 6 | cd ${DATA_DIR} 7 | kaggle datasets download -d ${DATASET} 8 | unzip wikipedia-sentences.zip 9 | rm -f toy-data.txt 10 | rm -f wikipedia-sentences.zip 11 | mv wikisent2.txt input.txt 12 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/manifest.yml: -------------------------------------------------------------------------------- 1 | manifest_version: 1 2 | name: wikipedia-sentences-30k-query-while-indexing 3 | description: 'Example Jina app for searching 30,000 sentences from Wikipedia' 4 | author: Cristian Mitroi (cristian.mitroi@jina.ai) 5 | url: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing 6 | vendor: Jina AI Limited 7 | documentation: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing 8 | version: 0.1 9 | license: apache-2.0 10 | keywords: [NLP, wikipedia, text, distilbert, example, transformers] 11 | type: app 12 | kind: example 13 | avatar: None 14 | platform: "linux/amd64" 15 | update: "None" 16 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/requirements.txt: -------------------------------------------------------------------------------- 1 | jina[daemon]==2.1.0 2 | kaggle==1.5.12 3 | click==7.1.2 4 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/tests/__init__.py -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/tests/requirements.txt: 
-------------------------------------------------------------------------------- 1 | git+https://github.com/jina-ai/jina.git@v2.1.0#egg=jina[daemon] 2 | click==7.1.2 3 | -------------------------------------------------------------------------------- /wikipedia-sentences-query-while-indexing/tests/test_query_while_indexing.py: -------------------------------------------------------------------------------- 1 | import time 2 | from threading import Thread 3 | 4 | from jina import Document, __default_host__, Client 5 | from daemon.clients import JinaDClient 6 | from jina.logging.logger import JinaLogger 7 | 8 | HOST = __default_host__ 9 | JINAD_PORT = 8000 10 | QUERY_REST_PORT = 9001 11 | logger = JinaLogger('test') 12 | 13 | 14 | def query_docs(docs: Document): 15 | logger.info(f'Searching document {docs}...') 16 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True) 17 | 18 | 19 | def test_query_while_indexing(): 20 | try: 21 | from app import create_flows, dump_and_roll_update 22 | 23 | jinad_client = JinaDClient(host=HOST, port=JINAD_PORT) 24 | assert jinad_client.alive, 'cannot reach jinad' 25 | 26 | storage_flow_id, query_flow_id, workspace_id = create_flows() 27 | # start rolling update in the background 28 | Thread(target=dump_and_roll_update, args=(storage_flow_id, query_flow_id), daemon=True).start() 29 | 30 | logger.info('sleeping for 30 secs to allow 1 round of index, dump & rolling update') 31 | time.sleep(30) 32 | query_doc = Document(text='hello world') 33 | response = query_docs(query_doc) 34 | matches = response[0].data.docs[0].matches 35 | logger.info(f'got {len(matches)} matches') 36 | assert matches 37 | 38 | except (Exception, KeyboardInterrupt): 39 | raise 40 | 41 | finally: 42 | from app import cleanup 43 | 44 | cleanup(storage_flow_id, query_flow_id, workspace_id) 45 | -------------------------------------------------------------------------------- /wikipedia-sentences/.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | .git 3 | .github 4 | .gitignore 5 | data 6 | env 7 | get_data.sh 8 | tests 9 | -------------------------------------------------------------------------------- /wikipedia-sentences/.github/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/.github/flow.png -------------------------------------------------------------------------------- /wikipedia-sentences/.gitignore: -------------------------------------------------------------------------------- 1 | workspace* 2 | env 3 | results 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /wikipedia-sentences/README.md: -------------------------------------------------------------------------------- 1 | # Semantic Wikipedia Search with Transformers and DistilBERT 2 | 3 | ![](https://docs.jina.ai/_images/jinabox-wikipedia.gif) 4 | 5 | ## Table of contents: 6 | 7 | - [Overview](#overview) 8 | - [🐍 Build the app with Python](#-build-the-app-with-python) 9 | - [🔮 Overview of the files in this example](#-overview-of-the-files-in-this-example) 10 | - [🌀 Flow diagram](#-flow-diagram) 11 | - [🔨 Next steps, building your own app](#-next-steps-building-your-own-app) 12 | - [🙍 Community](#-community) 13 | - [🦄 License](#-license) 14 | 15 | ## Overview 16 | | | | 17 | | ------------- | ------------- | 18 | 
| Summary | This showcases a semantic text search app | 19 | | Data for indexing | Wikipedia corpus | 20 | | Data for querying | A text sentence | 21 | | Dataset used | [Kaggle Wikipedia corpus](https://www.kaggle.com/mikeortman/wikipedia-sentences) | 22 | | ML model used | [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) | 23 | 24 | This example shows you how to build a simple semantic search app powered by [Jina](http://www.jina.ai)'s neural search framework. You can index and search text sentences from Wikipedia using a state-of-the-art machine learning [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) language model from the [Transformers](https://huggingface.co) library. 25 | 26 | | item | content | 27 | |--------|--------------------------------------------------| 28 | | Input | 1 text file with 1 sentence per line | 29 | | Output | *top_k* number of sentences that match input query | 30 | 31 | ## 🐍 Build the app with Python 32 | 33 | These instructions explain how to build the example yourself and deploy it with Python. If you want to skip the building steps and just run the app, check out the [Docker section](#---deploy-the-prebuilt-application-using-docker) below. 34 | 35 | 36 | ### 🗝️ Requirements 37 | 1. You have a working Python 3.7 or 3.8 environment. 38 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts. 39 | 3. You have at least 2 GB of free space on your hard drive. 40 | 41 | ### 👾 Step 1. Clone the repo and install Jina 42 | 43 | 44 | Begin by cloning the repo, so you can get the required files and datasets. If you already have the examples repository on your machine, make sure to fetch the most recent version. 45 | 46 | ```sh 47 | git clone https://github.com/jina-ai/examples 48 | cd examples/wikipedia-sentences 49 | ``` 50 | 51 | In your terminal, you should now be located in the wikipedia-sentences folder. Let's install Jina and the other required Python libraries. For further information on installing Jina, check out our [documentation](https://docs.jina.ai/chapters/core/setup/). 52 | 53 | 54 | ```sh 55 | pip install -r requirements.txt 56 | ``` 57 | If this command runs without any error messages, you can then move on to step two. 58 | 59 | ### 📥 Step 2. Download your data to search 60 | 61 | By default, a small test dataset is used for indexing. This can lead to bad search results. 62 | 63 | To index the [full dataset](https://www.kaggle.com/mikeortman/wikipedia-sentences) (around 900 MB): 64 | 65 | 1. Set up [Kaggle](https://www.kaggle.com/docs/api#getting-started-installation-&-authentication) 66 | 2. Run the script: `sh get_data.sh` 67 | 3. Index your new dataset: `python app.py -t index -d full -n $num_docs` 68 | 69 | The whole dataset contains about 8 million Wikipedia sentences; indexing all of them will take a very long time. 70 | Therefore, we recommend selecting only a subset of the data; the number of elements can be set with the `-n` flag. 71 | We recommend values smaller than 100000. For larger indexes, the SimpleIndexer used in this example also becomes very slow at query time. 72 | In that case, it is recommended to use a more advanced indexer like the FaissIndexer. 73 | 
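To see why, here is a rough sketch (illustrative only; it is not the executor's actual code) of the exhaustive scan a simple indexer performs on every query - the query embedding is scored against every stored embedding, so latency grows linearly with the index size:

```python
import numpy as np

# 100,000 indexed sentence embeddings, 768-dim like DistilBERT's output
stored = np.random.rand(100_000, 768).astype(np.float32)
query = np.random.rand(768).astype(np.float32)

# brute-force cosine similarity against every stored embedding
sims = stored @ query / (np.linalg.norm(stored, axis=1) * np.linalg.norm(query))
top_5 = np.argsort(-sims)[:5]  # indices of the 5 closest sentences
```

An approximate index such as Faiss avoids this full scan by pruning the search space, at the cost of a small loss in accuracy.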
74 | ### 🏃 Step 3. Index your data 75 | 76 | Index your data by running: 77 | 78 | ```sh 79 | python app.py -t index 80 | ``` 81 | Here, we can also specify the number of documents to index with ```--num_docs``` / ```-n``` (default is 10000). 82 | 83 | ### 🔎 Step 4. Query your indexed data 84 | 85 | A search prompt will appear in your terminal after running: 86 | 87 | ```sh 88 | python app.py -t query 89 | ``` 90 | 91 | See the text below for an example search query and response. 92 | You can also specify the top k search results with ```--top_k``` / ```-k``` (default is 5). 93 | 94 | ``` 95 | please type a sentence: What is ROMEO 96 | 97 | Ta-Dah🔮, here's what we found for: What is ROMEO 98 | > 0(0.36). The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics. 99 | 100 | ``` 101 | 102 | ## 🔮 Overview of the files in this example 103 | Here is a small overview if you're interested in understanding what each file in this example is doing. 104 | 105 | | File | Explanation | 106 | |---|---| 107 | |📂 `tests/*` | Various maintenance tests to keep the example running. | 108 | |📃 `app.py` | The gateway code that runs the index & query Flow. | 109 | |📃 `get_data.sh` | Downloads the Kaggle dataset. | 110 | |📃 `requirements.txt` | Contains all required Python libraries. | 111 | 112 | 113 | ## 🌀 Flow diagram 114 | 115 | This diagram provides a visual representation of the flow in this example, showing which Executors are used in which order: 116 | 117 | ![wiki_flow](.github/flow.png) 118 | 119 | The flow for this example is quite simple. We receive input Documents from the gateway, 120 | which are then fed into a transformer. This transformer computes an embedding based on the text of the document. 121 | Then, the documents are sent to the indexer, which does the following: 122 | - Index time: Store all the documents on disk (in the workspace folder). 123 | - Query time: Compare the query document embedding with all stored embeddings and return the closest matches. 124 | 125 | ## ⏭️ Next steps, building your own app 126 | 127 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app, check out the [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation. 128 | 129 | - [Enable querying while indexing](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing) 130 | 131 | ## 👩‍👩‍👧‍👦 Community 132 | 133 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina 134 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities 135 | - [![Twitter Follow](https://img.shields.io/twitter/follow/JinaAI_?label=Follow%20%40JinaAI_&style=social)](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch` 136 | - [Company](https://jina.ai) - know more about our company, we are fully committed to open-source! 137 | 138 | ## 🦄 License 139 | 140 | Copyright (c) 2021 Jina AI Limited. All rights reserved. 141 | 142 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text. 
147 | -------------------------------------------------------------------------------- /wikipedia-sentences/app.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | import sys 6 | import click 7 | import random 8 | from jina import Flow, Document, DocumentArray 9 | from jina.logging.predefined import default_logger as logger 10 | 11 | # Maximum number of Documents to index; can be overridden with the JINA_MAX_DOCS environment variable 12 | MAX_DOCS = int(os.environ.get('JINA_MAX_DOCS', 10000)) 13 | 14 | 15 | # Select the data file for the chosen dataset and set default environment variables for the Flow 16 | def config(dataset: str): 17 | if dataset == 'toy': 18 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/toy-input.txt') 19 | elif dataset == 'full': 20 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/input.txt') 21 | os.environ['JINA_PORT'] = os.environ.get('JINA_PORT', str(45678)) 22 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 23 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace')) 24 | os.environ.setdefault('JINA_WORKSPACE_MOUNT', 25 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace') 26 | 27 | 28 | # Print the matches found for the query sentence, best match first 29 | def print_topk(resp, sentence): 30 | for doc in resp.data.docs: 31 | print(f"\n\n\nTa-Dah🔮, here's what we found for: {sentence}") 32 | for idx, match in enumerate(doc.matches): 33 | score = match.scores['cosine'].value 34 | print(f'> {idx:>2d}({score:.2f}). {match.text}') 35 | 36 | 37 | # Yield Documents built from randomly shuffled lines of the input file 38 | def input_generator(num_docs: int, file_path: str): 39 | with open(file_path) as file: 40 | lines = file.readlines() 41 | num_lines = len(lines) 42 | random.shuffle(lines) 43 | for i in range(min(num_docs, num_lines)): 44 | yield Document(text=lines[i]) 45 | 46 | 47 | # Send Documents from the data file to the /index endpoint of the Flow 48 | def index(num_docs): 49 | flow = Flow().load_config('flows/flow.yml') 50 | data_path = os.path.join(os.path.dirname(__file__), os.environ.get('JINA_DATA_FILE', None)) 51 | with flow: 52 | flow.post(on='/index', inputs=input_generator(num_docs, data_path), 53 | show_progress=True) 54 | 55 | 56 | # Read a sentence from stdin, send it to the /search endpoint and print the top_k closest matches 57 | def query(top_k): 58 | flow = Flow().load_config('flows/flow.yml') 59 | with flow: 60 | text = input('Please type a sentence: ') 61 | doc = Document(content=text) 62 | 63 | result = flow.post(on='/search', inputs=DocumentArray([doc]), 64 | parameters={'top_k': top_k}, 65 | line_format='text', 66 | return_results=True, 67 | ) 68 | print_topk(result[0], text) 69 | 70 | 71 | @click.command() 72 | @click.option( 73 | '--task', 74 | '-t', 75 | type=click.Choice(['index', 'query'], case_sensitive=False), 76 | ) 77 | @click.option('--num_docs', '-n', default=MAX_DOCS) 78 | @click.option('--top_k', '-k', default=5) 79 | @click.option('--dataset', '-d', type=click.Choice(['toy', 'full']), default='toy') 80 | def main(task, num_docs, top_k, dataset): 81 | config(dataset) 82 | if task == 'index': 83 | # Refuse to index into an existing workspace, to avoid mixing old and new data 84 | if os.path.exists(os.environ.get("JINA_WORKSPACE")): 85 | logger.error(f'\n +---------------------------------------------------------------------------------+ \ 86 | \n | 🤖🤖🤖 | \ 87 | \n | The directory {os.environ.get("JINA_WORKSPACE")} already exists. Please remove it before indexing again. 
| \ 88 | \n | 🤖🤖🤖 | \ 89 | \n +---------------------------------------------------------------------------------+') 90 | sys.exit(1) 91 | index(num_docs) 92 | elif task == 'query': 93 | query(top_k) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /wikipedia-sentences/data/toy-input.txt: -------------------------------------------------------------------------------- 1 | The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics. 2 | Once derided as corporate raiders, shareholder activists are now the recipients of admiration for sparking change in corporate boardrooms, leading to corporate boards developing best practices for responding to shareholder activism. 3 | Slc22a21 belongs to a protein family of solute carriers. 4 | Ajrara gharana or Ajrada gharana is one of the six main traditional schools in tabla drum. 5 | A few female specimens were found in a forest stream inside the shola forest. 6 | Ziggeo is the initiator and backer of BetaJS, an open-source framework. 7 | The three sports of aquatics were held at Aquatic Center in Sport Authority of Thailand Sport Complex, Bangkok, Thailand. 8 | Sugandha is the fourth generation of her family into singing and belongs to the Indore gharana. 9 | WYES is the only independently owned public television station in Louisiana as it is not part of Louisiana Public Broadcasting, which owns all of the PBS member stations in the state that are located outside of New Orleans, and maintains a programming agreement with and partial ownership of the city's independent public television station, WLAE-TV (channel 32). 10 | The reef divides the strait into the Apo East Pass and the Apo West Pass. 11 | His novel The Book of Evidence was shortlisted for the Booker Prize and won the Guinness Peat Aviation award in 1989. 12 | Andrea Kremer (born February 25, 1959 in Philadelphia, Pennsylvania) is a multi-Emmy Award Winning American television sports journalist. 13 | The book was the first published novel by O'Grady, with an initial print run of 6,000 hardback copies. 14 | After Alice performs several "miracle" cures in front of the tree, and claims to have seen the Virgin Mary there, it starts to be treated as a Lourdes-like shrine by Catholic pilgrims. 15 | Tovar is no longer involved with smuggling but acts as a consultant to Goldenvoice, which now operates the Coachella Valley Music and Arts Festival that has been compared to the Glastonbury Festival and is the most profitable music festival in the US. 16 | Tiwari worked as a producer with NDTV from 1996-2003. 17 | It is the home arena for SaPKo of the Mestis hockey league the second top league in Finland behind Liiga. 18 | As of the 2011 apportionment, the district includes the Middlesex County municipalities of East Brunswick Township, Edison Township, Helmetta Borough, Highland Park Borough, Metuchen Borough, South Plainfield Borough and South River Borough. 19 | Lembosiella is a genus of fungi in the Microthyriaceae family; according to the 2007 Outline of Ascomycota, the placement in this family is uncertain. 20 | Later, he resigned from his teaching profession in Jan 2013 and became a full time lyricist, dialog writer and part time researcher in Karky Research Foundation. 
21 | It is used in Intel Core microarchitecture based DP-capable server processors, the Dual-Core Xeon is codenamed Dempsey, Woodcrest, and Wolfdale and the Quad-Core processors Clovertown, Harpertown. 22 | The 35th Annual TV Week Logie Awards was held on Friday 19 March 1993 at the Grand Hyatt in Melbourne, and broadcast on Network Ten. 23 | Daund Patas Road railway station is a small railway station in Pune district, Maharashtra. 24 | Evagjelia Veli (born 16 July 1991) is an Albanian weightlifter. 25 | It was published in two volumes that appeared a decade apart. 26 | He is now professor of medicine (biotechnology in public health) at the University of Bergen and chairs the Faculty Council, Faculty of Medicine, Norwegian University of Science and Technology. 27 | Erin McGathy (born December 5, 1985) is an American podcast host, artist, and comedian. 28 | The song is the second single from their debut mini album First Invasion and it was released as a digital single on August 4, 2010. 29 | Chandru who makes his debut in direction after assisting few Tamil films. 30 | Mitchum got the tune for the song from a Norwegian folk-dance (Gammel Reinlender) song his mother used to sing to him. 31 | The shell of the No 69 grenade was composed entirely of the hard plastic, Bakelite, which shattered without producing fragments like a metal bodied grenade. 32 | It was released on 24 January 2014. 33 | Stafford Loans are available both as subsidized and unsubsidized loans. 34 | On the World Wide Web, a query string is the part of a uniform resource locator (URL) containing data that does not fit conveniently into a hierarchical path structure. 35 | They have bar eyes, bare metasternum, bare metapisternum, the anterior anepisternum is usually pillose. 36 | In 1904, the mayoral term was changed to two years. 37 | It was earlier known as Central Mall but underwent renovations and some parts were re-organized in 2017 and was re-branded and re-launced on 26 February 2018. 38 | Jang Young-sik (born 1935) is a South Korean economist. 39 | It is a medium-sized damselfly with a short stout body, it is black with blue markings, and has long dark wings with pterostigma. 40 | The first desegregated hotel casino, it was popular with many of the black entertainers of the time, who would entertain at the other hotels and casinos and stay at the Moulin Rouge. 41 | In February 2009, it was revealed that the site was projected onto a wall at The Daily Telegraph to allow journalists there to view breaking news posted by users to Twitter. 42 | His most recent novel in this series, The Bangkok Asset, was published on 4 August 2015. 43 | He served as the 24th Governor of Nevada from 1979 to 1983. 44 | The soils which range from acid to alkaline and front wet to dry gives rise to a diverse woodland structure. 45 | Their land was taken back by the Spanish Crown; and then irretrievably lost however, when California became part of the United States. 46 | With annual billings of $220 million, Tombras is one of the top 25 largest independent national advertising agencies. 47 | The couple intended to retire to China and purchased a property in Canton; however the Communist victory in 1949 changed their plans and in 1950 the couple sold the vineyard and moved to Blockhouse Bay, Auckland. 48 | "Super Scooter Happy" was covered by Kyary Pamyu Pamyu on her 2013 album, Nanda Collection. 49 | Filling four CD-ROMs, Final Fantasy IX featured a cast containing a variety of major and minor characters. 
50 | The album was produced by Billy Harvey, and featured contributions by Rafael Gayol and the Tosca String Quartet. 51 | -------------------------------------------------------------------------------- /wikipedia-sentences/flows/flow.yml: -------------------------------------------------------------------------------- 1 | jtype: Flow # This file defines the Flow (both index and query) for the wikipedia sentences example 2 | version: '1' # This is the yml file version 3 | with: # Additional arguments for the Flow 4 | workspace: $JINA_WORKSPACE # Workspace folder path 5 | port_expose: $JINA_PORT # Network port for the Flow 6 | executors: # Now, define the Executors that run on this Flow 7 | - name: transformer # This Executor computes an embedding based on the input text documents 8 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a Transformer Torch Encoder from the hub as a Docker container 9 | - name: indexer # Now, index the text documents with the embeddings 10 | uses: 'jinahub://SimpleIndexer/old' # We use the SimpleIndexer for this purpose -------------------------------------------------------------------------------- /wikipedia-sentences/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Download the Wikipedia sentences dataset from Kaggle and prepare it as data/input.txt 3 | DATASET="mikeortman/wikipedia-sentences" 4 | DATA_DIR="data" 5 | 6 | cd ${DATA_DIR} 7 | kaggle datasets download -d ${DATASET} 8 | unzip wikipedia-sentences.zip 9 | rm -f wikipedia-sentences.zip 10 | mv wikisent2.txt input.txt 11 | -------------------------------------------------------------------------------- /wikipedia-sentences/requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.1 2 | jina[standard]==2.0.18 3 | git+https://github.com/jina-ai/jina-commons@v0.0.3 -------------------------------------------------------------------------------- /wikipedia-sentences/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/tests/__init__.py -------------------------------------------------------------------------------- /wikipedia-sentences/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Patch random_port so tests do not pick the same port twice (the birthday problem).""" 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope='function', autouse=True) 7 | def patched_random_port(mocker): 8 | used_ports = set() 9 | from jina.helper import random_port 10 | from jina.excepts import NoAvailablePortError 11 | 12 | # Retry a bounded number of times until an unused port is found 13 | def _random_port(): 14 | for _ in range(10): 15 | _port = random_port() 16 | if _port is not None and _port not in used_ports: 17 | used_ports.add(_port) 18 | return _port 19 | raise NoAvailablePortError 20 | 21 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port) -------------------------------------------------------------------------------- /wikipedia-sentences/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.4 2 | click==8.0.1 3 | jina[standard]==2.0.18 4 | git+https://github.com/jina-ai/jina-commons@v0.0.3 -------------------------------------------------------------------------------- /wikipedia-sentences/tests/test_wikipediasearch.py: -------------------------------------------------------------------------------- 1 | 
__copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | import sys 6 | from click.testing import CliRunner 7 | 8 | sys.path.append('..') 9 | from app import main 10 | 11 | 12 | def config(tmpdir): 13 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace') 14 | 15 | 16 | def test_wikipedia_sentences(tmpdir): 17 | config(tmpdir) 18 | runner = CliRunner() 19 | result = runner.invoke(main, ['-t', 'index']) 20 | assert "done in" in result.stdout 21 | assert result.stderr_bytes is None 22 | result = runner.invoke(main, ['-t', 'query']) 23 | print(result.stdout) 24 | assert result.stderr_bytes is None 25 | -------------------------------------------------------------------------------- /wikipedia-sentences/tests/toy-input.txt: -------------------------------------------------------------------------------- 1 | ../data/toy-input.txt --------------------------------------------------------------------------------