example. It downloads AudioSet data and indexes 1,000 10-second audio clips via the Jina search framework. We then randomly sample query clips (roughly five seconds each) extracted from the indexed audio and ask Jina to retrieve relevant results. Below are Jina's retrievals, where the left-most column is the query audio; a minimal usage sketch follows the demo.html fragment below.
25 |
26 | Intrigued? Learn more about Jina and check out our GitHub!
27 |
28 |
29 |
30 |
Precision@{% TOP_K %}: {% PRECISION_EVALUATION %}
31 |
32 |
33 |
34 |
Query
Top-K Results
{% RESULT %}
35 |
36 |
37 |
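For reference, a minimal way to drive this example end to end, mirroring what the test suite below does. It assumes app.py in this example directory exposes the click `cli` with `index` and `search` commands and the `-s`/`-e`/`-n` options exercised in tests/test_audio_to_audio_search.py; app.py itself is not shown in this excerpt.

    from click.testing import CliRunner
    from app import cli  # app.py ships with the example but is not part of this excerpt

    runner = CliRunner()
    # Index with the time segmenter and the VGGish encoder, then search with 3 query clips.
    runner.invoke(cli, ['index', '-s', 'time', '-e', 'vgg'])
    runner.invoke(cli, ['search', '-s', 'time', '-e', 'vgg', '-n', 3])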
--------------------------------------------------------------------------------
/audio-to-audio-search/executors.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Tuple, Dict, Optional
3 |
4 | import torch
5 | import numpy as np
6 | import librosa as lr
7 | import torchaudio
8 | from jina import Executor, DocumentArray, requests, Document
9 | from jina_commons import get_logger
10 |
11 | from vggish.vggish_input import waveform_to_examples
12 | from vggish.vggish_params import SAMPLE_RATE
13 | from jina.excepts import BadDocType
14 |
15 | class Wav2MelCrafter(Executor):
16 | def __init__(self, *args, **kwargs):
17 | super().__init__(*args, **kwargs)
18 | self.logger = get_logger(self)
19 |
20 | @requests
21 | def segment(self, docs: Optional[DocumentArray] = None, **kwargs):
22 | if not docs:
23 | return
24 | for doc in docs:
25 | result_chunk = []
26 | for chunk in doc.chunks:
27 | mel_data = waveform_to_examples(chunk.blob, chunk.tags['sample_rate'])
28 | if mel_data.ndim != 3:
29 | self.logger.warning(
30 | f'failed to convert from wave to mel, chunk.blob: {chunk.blob.shape}, sample_rate: {SAMPLE_RATE}'
31 | )
32 | continue
33 | if mel_data.shape[0] <= 0:
34 | self.logger.warning(
35 | f'chunk between {chunk.location} is skipped because its duration is too short'
36 | )
37 | if mel_data.ndim == 2:
38 | mel_data = np.atleast_3d(mel_data)
39 | mel_data = mel_data.reshape(1, mel_data.shape[0], mel_data.shape[1])
40 | chunk.blob = mel_data
41 | if mel_data.size > 0:
42 | result_chunk.append(chunk)
43 | doc.chunks = result_chunk
44 |
45 |
46 | class TimeSegmenter(Executor):
47 | def __init__(self, chunk_duration: int = 10, chunk_strip: int = 1, *args, **kwargs):
48 | super().__init__(*args, **kwargs)
49 | self.chunk_duration = chunk_duration # seconds
50 | self.strip = chunk_strip
51 |
52 | @requests(on=['/search', '/index'])
53 | def segment(
54 | self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
55 | ):
56 | if not docs:
57 | return
58 | for idx, doc in enumerate(docs):
59 | doc.blob, sample_rate = self._load_raw_audio(doc)
60 | doc.tags['sample_rate'] = sample_rate
61 | chunk_size = int(self.chunk_duration * sample_rate)
62 | strip = parameters.get('chunk_strip', self.strip)
63 | strip_size = int(strip * sample_rate)
64 | num_chunks = max(1, int((doc.blob.shape[0] - chunk_size) / strip_size))
65 | for chunk_id in range(num_chunks):
66 | beg = chunk_id * strip_size
67 | end = beg + chunk_size
68 | if beg > doc.blob.shape[0]:
69 | break
70 | doc.chunks.append(
71 | Document(
72 | blob=doc.blob[beg:end],
73 | offset=idx,
74 | location=[beg, end],
75 | tags=doc.tags,
76 | )
77 | )
78 |
79 | def _load_raw_audio(self, doc: Document) -> Tuple[np.ndarray, int]:
80 | if doc.blob is not None and doc.tags.get('sample_rate', None) is None:
81 | raise BadDocType('data is blob but sample rate is not provided')
82 | elif doc.blob is not None:
83 | return doc.blob, int(doc.tags['sample_rate'])
84 | elif doc.uri is not None and doc.uri.endswith('.mp3'):
85 | return self._read_mp3(doc.uri)
86 | elif doc.uri is not None and doc.uri.endswith('.wav'):
87 | return self._read_wav(doc.uri)
88 | else:
89 | raise BadDocType('doc needs to have either a blob or a wav/mp3 uri')
90 |
91 | def _read_wav(self, file_path: str) -> Tuple[np.ndarray, int]:
92 | data, sample_rate = torchaudio.load(file_path)
93 | data = np.mean(data.cpu().numpy(), axis=0)
94 | return data, sample_rate
95 |
96 | def _read_mp3(self, file_path: str) -> Tuple[np.ndarray, int]:
97 | return lr.load(file_path)
98 |
99 |
100 | class DebugExecutor(Executor):
101 | @requests
102 | def debug(self, docs: Optional[DocumentArray] = None, **kwargs):
103 | logger = get_logger(self)
104 | if not docs:
105 | return
106 | for i, doc in enumerate(docs):
107 | for match in doc.matches:
108 | logger.info(f"doc {doc.tags['file']} match: ", match.tags['file'])
109 |
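To see how TimeSegmenter slices a waveform, here is a small self-contained sketch, assuming jina 2.x is installed and this module is importable as `executors`. A 12-second blob at 16 kHz with the default 1-second stride yields two overlapping 10-second chunks:

    import numpy as np
    from jina import Document, DocumentArray
    from executors import TimeSegmenter  # module name is an assumption of this sketch

    # 12 seconds of silence at 16 kHz, passed in as a raw blob
    doc = Document(blob=np.zeros(12 * 16000, dtype=np.float32),
                   tags={'sample_rate': 16000})
    segmenter = TimeSegmenter(chunk_duration=10, chunk_strip=1)
    segmenter.segment(DocumentArray([doc]))
    for chunk in doc.chunks:
        print(chunk.location)  # [0, 160000] then [16000, 176000]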
--------------------------------------------------------------------------------
/audio-to-audio-search/helper.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 | from pathlib import Path
5 | import re
6 | import random
7 | import shutil
8 | import subprocess
9 | from typing import List, Optional
10 |
11 | from prettytable import PrettyTable
12 | from jina import Document, DocumentArray
13 | from jina.types.request import Response
14 | import webbrowser
15 |
16 |
17 | ID_LEN = 11
18 |
19 |
20 | def get_logger():
21 | """
22 | Method to get logger.
23 | """
24 | logger = logging.getLogger('app')
25 | logger.setLevel(logging.INFO)
26 | formatter = logging.Formatter('%(message)s')
27 | sh = logging.StreamHandler()
28 | sh.setFormatter(formatter)
29 | sh.setLevel(logging.INFO)
30 | logger.addHandler(sh)
31 | return logger
32 |
33 |
34 | logger = get_logger()
35 |
36 |
37 | def create_docs(filefolder_path: str):
38 | """
39 | Method to create Jina documents.
40 |
41 | :param filefolder_path: glob pattern matching the audio files
42 | """
43 | docs = []
44 | import librosa as lr
45 |
46 | logger.info('Creating docs..')
47 | for file_path in sorted(glob.glob(filefolder_path)):
48 | id = os.path.basename(file_path).split('.')[0]
49 | blob, sample_rate = lr.load(file_path)
50 | docs.append(
51 | Document(
52 | id=id, blob=blob, tags={'file': file_path, 'sample_rate': sample_rate}
53 | )
54 | )
55 | logger.info('docs created')
56 | return DocumentArray(docs)
57 |
58 |
59 | def create_query_audios(num_docs: int, data_folder: Path):
60 | """
61 | Method to create query audio clips.
62 |
63 | :param num_docs: number of query docs
64 | :param data_folder: path to data folder
65 | """
66 | input_docs_folder = data_folder / 'index'
67 | output_docs_folder = data_folder / 'query'
68 | if output_docs_folder.is_dir():
69 | shutil.rmtree(output_docs_folder)
70 | output_docs_folder.mkdir()
71 | input_docs_filenames = glob.glob(str(input_docs_folder / '*.mp3'))
72 |
73 | if len(input_docs_filenames) < num_docs:
74 | raise FileNotFoundError(
75 | 'cannot find sufficient '
76 | f'index audio clips. Number of index audio clips found: {len(input_docs_filenames)}, '
77 | f'number of requested query docs: {num_docs}'
78 | )
79 |
80 | for input_file in random.sample(input_docs_filenames, k=num_docs):
81 | id = re.match(r'index_(.*)\.mp3', os.path.basename(input_file))[1][-ID_LEN:]
82 | output_file = f"query_{id}.mp3"
83 | start_time = random.random() * 5
84 | end_time = start_time + random.random() * 4 + 3
85 | cmd = [
86 | 'ffmpeg',
87 | '-i',
88 | os.path.abspath(input_file),
89 | '-ss',
90 | str(start_time),
91 | '-to',
92 | str(end_time),
93 | '-async',
94 | '1',
95 | output_file,
96 | ]
97 | subprocess.call(cmd, cwd=str(output_docs_folder))
98 |
99 |
100 | def report_results(responses: List[Response], threshold: Optional[float], top_k: int):
101 | """
102 | Method to report results
103 |
104 | :param responses: returned responses with data
105 | :param threshold: threshold for search
106 | :param top_k: top k number
107 | """
108 | pred_list = []
109 | table = PrettyTable()
110 | table.field_names = ['target', 'prediction', 'is_correct']
111 | result_html = []
112 | for i, response in enumerate(responses):
113 | for j, doc in enumerate(response.docs):
114 | if not doc.matches:
115 | continue
116 | match = doc.matches[0]
117 | target_result = os.path.basename(doc.tags["file"]).split('.')[0][-ID_LEN:]
118 | pred_result = os.path.basename(match.tags["file"]).split('.')[0][-ID_LEN:]
119 | pred_result = (
120 | pred_result
121 | if threshold is None or 1 - match.scores['cosine'].value > threshold
122 | else 'None'
123 | )
124 | table.add_row([target_result, pred_result, target_result == pred_result])
125 | pred_list.append(target_result == pred_result)
126 |
127 | query_html = f"""
128 |
129 |
132 | """
133 | seen = set()
134 | result_html.append(f'<tr>{query_html}')
135 | logger.debug(f'number of matches: {len(doc.matches)}')
136 | for k, match in enumerate(doc.matches):
137 | if len(seen) >= top_k:
138 | break
139 | if match.tags['file'] in seen:
140 | continue
141 | seen.add(match.tags['file'])
142 | match_html = f"""
143 |
144 |
147 | """
148 | result_html.append(match_html)
149 | result_html.append('</tr>\n')
150 |
151 | logger.info(table)
152 |
153 | if not pred_list:
154 | return [], float('nan')
155 |
156 | accuracy = sum(pred_list) / len(pred_list)
157 | logger.info(f'accuracy: {accuracy}')
158 | return result_html, accuracy
159 |
160 |
161 | def write_html(html_path: str, result_html: List[str], accuracy: float, top_k: int):
162 | """
163 | Method to present results in browser.
164 |
165 | :param html_path: path of the written html
166 | :param result_html: list of HTML fragments to be inserted into the template
167 | :param accuracy: accuracy of search
168 | :param top_k: top k number
169 | """
170 | with open(
171 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 'demo.html')
172 | ) as fp, open(html_path, 'w') as fw:
173 | t = fp.read()
174 | t = t.replace('{% RESULT %}', '\n'.join(result_html))
175 | t = t.replace(
176 | '{% PRECISION_EVALUATION %}',
177 | '{:.2f}%'.format(accuracy * 100.0),
178 | )
179 | t = t.replace('{% TOP_K %}', str(top_k))
180 | fw.write(t)
181 |
182 | url_html_path = 'file://' + os.path.abspath(html_path)
183 |
184 | try:
185 | webbrowser.open(url_html_path, new=2)
186 | except Exception:
187 | pass # intentional pass, browser support isn't cross-platform
188 | finally:
189 | logger.info(
190 | f'You should see a "demo.html" opened in your browser, '
191 | f'if not you may open {url_html_path} manually'
192 | )
193 |
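The acceptance rule in report_results compares cosine *similarity* (1 minus the stored cosine distance) against the threshold. A tiny illustration with made-up numbers:

    # a match counts only when its cosine similarity (1 - distance) beats the threshold
    cosine_distance = 0.15   # as stored in match.scores['cosine'].value
    threshold = 0.8          # passed in by the caller; None disables the check
    keep = threshold is None or 1 - cosine_distance > threshold
    print(keep)              # True, since similarity 0.85 > 0.8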
--------------------------------------------------------------------------------
/audio-to-audio-search/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina-commons@v0.0.3
2 | click
3 | jina~=2.0
4 | numpy==1.20.0
5 | soundfile==0.10.3.post1
6 | librosa==0.8.0
7 | visdom==0.1.8.9
8 | ffmpeg
9 | torchaudio
10 | prettytable
11 |
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/__init__.py
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina-commons@v0.0.3
2 | click
3 | pytest~=6.1.2
4 | jina~=2.0
5 | numpy==1.20.0
6 | soundfile==0.10.3.post1
7 | librosa==0.8.0
8 | visdom==0.1.8.9
9 | ffmpeg
10 | torchaudio
11 | prettytable
12 |
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/test_audio_to_audio_search.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | import pytest
6 | from click.testing import CliRunner
7 | from app import cli
8 | from pathlib import Path
9 |
10 |
11 | @pytest.mark.parametrize('segmenter', ['vad', 'time'])
12 | @pytest.mark.parametrize('encoder', ['vgg', 'clip'])
13 | def test_exec(tmp_path, segmenter, encoder):
14 | assert os.getcwd().endswith(
15 | 'audio-to-audio-search'
16 | ), "Please execute the tests from the root directory: >>> pytest tests/"
17 | os.environ['JINA_DATA_FILE'] = os.path.join('tests', 'data', 'mp3')
18 | workspace = os.environ['JINA_WORKSPACE'] = os.path.join(tmp_path, 'workspace')
19 | os.environ['JINA_WORKSPACE_MOUNT'] = f'{workspace}:/workspace/workspace'
20 | runner = CliRunner()
21 | _test_index(runner, workspace, segmenter, encoder)
22 | _test_query(runner, segmenter, encoder)
23 |
24 |
25 | def _test_index(runner, workspace, segmenter, encoder):
26 | result = runner.invoke(cli, ['index', '-s', segmenter, '-e', encoder])
27 | assert result.exception is None
28 | assert result.exit_code == 0
29 | assert Path(workspace).is_dir()
30 | assert (
31 | len(set(glob.glob(os.path.join(workspace, '**', '*.bin'), recursive=True))) == 2
32 | )
33 |
34 |
35 | def _test_query(runner, segmenter, encoder):
36 | # test error case: query more docs than indexed
37 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 10])
38 |
39 | with pytest.raises(
40 | FileNotFoundError,
41 | match='cannot find sufficient index audio clips. '
42 | 'Number of index audio clips found: 5, number of requested query docs: 10',
43 | ):
44 | assert result.exception is not None
45 | raise result.exception
46 |
47 | assert result.exit_code != 0
48 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 3])
49 | assert result.exception is None
50 | assert result.exit_code == 0
51 |
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/mel_features.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Defines routines to compute mel spectrogram features from audio waveform."""
20 |
21 | import numpy as np
22 |
23 |
24 | def frame(data, window_length, hop_length):
25 | """Convert array into a sequence of successive possibly overlapping frames.
26 |
27 | An n-dimensional array of shape (num_samples, ...) is converted into an
28 | (n+1)-D array of shape (num_frames, window_length, ...), where each frame
29 | starts hop_length points after the preceding one.
30 |
31 | This is accomplished using stride_tricks, so the original data is not
32 | copied. However, there is no zero-padding, so any incomplete frames at the
33 | end are not included.
34 |
35 | Args:
36 | data: np.array of dimension N >= 1.
37 | window_length: Number of samples in each frame.
38 | hop_length: Advance (in samples) between each window.
39 |
40 | Returns:
41 | (N+1)-D np.array with as many rows as there are complete frames that can be
42 | extracted.
43 | """
44 | num_samples = data.shape[0]
45 | num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
46 | shape = (num_frames, window_length) + data.shape[1:]
47 | strides = (data.strides[0] * hop_length,) + data.strides
48 | return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
49 |
50 |
51 | def periodic_hann(window_length):
52 | """Calculate a "periodic" Hann window.
53 |
54 | The classic Hann window is defined as a raised cosine that starts and
55 | ends on zero, and where every value appears twice, except the middle
56 | point for an odd-length window. Matlab calls this a "symmetric" window
57 | and np.hanning() returns it. However, for Fourier analysis, this
58 | actually represents just over one cycle of a period N-1 cosine, and
59 | thus is not compactly expressed on a length-N Fourier basis. Instead,
60 | it's better to use a raised cosine that ends just before the final
61 | zero value - i.e. a complete cycle of a period-N cosine. Matlab
62 | calls this a "periodic" window. This routine calculates it.
63 |
64 | Args:
65 | window_length: The number of points in the returned window.
66 |
67 | Returns:
68 | A 1D np.array containing the periodic hann window.
69 | """
70 | return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
71 | np.arange(window_length)))
72 |
73 |
74 | def stft_magnitude(signal, fft_length,
75 | hop_length=None,
76 | window_length=None):
77 | """Calculate the short-time Fourier transform magnitude.
78 |
79 | Args:
80 | signal: 1D np.array of the input time-domain signal.
81 | fft_length: Size of the FFT to apply.
82 | hop_length: Advance (in samples) between each frame passed to FFT.
83 | window_length: Length of each block of samples to pass to FFT.
84 |
85 | Returns:
86 | 2D np.array where each row contains the magnitudes of the fft_length/2+1
87 | unique values of the FFT for the corresponding frame of input samples.
88 | """
89 | frames = frame(signal, window_length, hop_length)
90 | # Apply frame window to each frame. We use a periodic Hann (cosine of period
91 | # window_length) instead of the symmetric Hann of np.hanning (period
92 | # window_length-1).
93 | window = periodic_hann(window_length)
94 | windowed_frames = frames * window
95 | return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
96 |
97 |
98 | # Mel spectrum constants and functions.
99 | _MEL_BREAK_FREQUENCY_HERTZ = 700.0
100 | _MEL_HIGH_FREQUENCY_Q = 1127.0
101 |
102 |
103 | def hertz_to_mel(frequencies_hertz):
104 | """Convert frequencies to mel scale using HTK formula.
105 |
106 | Args:
107 | frequencies_hertz: Scalar or np.array of frequencies in hertz.
108 |
109 | Returns:
110 | Object of same size as frequencies_hertz containing corresponding values
111 | on the mel scale.
112 | """
113 | return _MEL_HIGH_FREQUENCY_Q * np.log(
114 | 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
115 |
116 |
117 | def spectrogram_to_mel_matrix(num_mel_bins=20,
118 | num_spectrogram_bins=129,
119 | audio_sample_rate=8000,
120 | lower_edge_hertz=125.0,
121 | upper_edge_hertz=3800.0):
122 | """Return a matrix that can post-multiply spectrogram rows to make mel.
123 |
124 | Returns a np.array matrix A that can be used to post-multiply a matrix S of
125 | spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
126 | "mel spectrogram" M of frames x num_mel_bins. M = S A.
127 |
128 | The classic HTK algorithm exploits the complementarity of adjacent mel bands
129 | to multiply each FFT bin by only one mel weight, then add it, with positive
130 | and negative signs, to the two adjacent mel bands to which that bin
131 | contributes. Here, by expressing this operation as a matrix multiply, we go
132 | from num_fft multiplies per frame (plus around 2*num_fft adds) to around
133 | num_fft^2 multiplies and adds. However, because these are all presumably
134 | accomplished in a single call to np.dot(), it's not clear which approach is
135 | faster in Python. The matrix multiplication has the attraction of being more
136 | general and flexible, and much easier to read.
137 |
138 | Args:
139 | num_mel_bins: How many bands in the resulting mel spectrum. This is
140 | the number of columns in the output matrix.
141 | num_spectrogram_bins: How many bins there are in the source spectrogram
142 | data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
143 | only contains the nonredundant FFT bins.
144 | audio_sample_rate: Samples per second of the audio at the input to the
145 | spectrogram. We need this to figure out the actual frequencies for
146 | each spectrogram bin, which dictates how they are mapped into mel.
147 | lower_edge_hertz: Lower bound on the frequencies to be included in the mel
148 | spectrum. This corresponds to the lower edge of the lowest triangular
149 | band.
150 | upper_edge_hertz: The desired top edge of the highest frequency band.
151 |
152 | Returns:
153 | An np.array with shape (num_spectrogram_bins, num_mel_bins).
154 |
155 | Raises:
156 | ValueError: if frequency edges are incorrectly ordered or out of range.
157 | """
158 | nyquist_hertz = audio_sample_rate / 2.
159 | if lower_edge_hertz < 0.0:
160 | raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
161 | if lower_edge_hertz >= upper_edge_hertz:
162 | raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
163 | (lower_edge_hertz, upper_edge_hertz))
164 | if upper_edge_hertz > nyquist_hertz:
165 | raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
166 | (upper_edge_hertz, nyquist_hertz))
167 | spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
168 | spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
169 | # The i'th mel band (starting from i=1) has center frequency
170 | # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
171 | # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
172 | # the band_edges_mel arrays.
173 | band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
174 | hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
175 | # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
176 | # of spectrogram values.
177 | mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
178 | for i in range(num_mel_bins):
179 | lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
180 | # Calculate lower and upper slopes for every spectrogram bin.
181 | # Line segments are linear in the *mel* domain, not hertz.
182 | lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
183 | (center_mel - lower_edge_mel))
184 | upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
185 | (upper_edge_mel - center_mel))
186 | # .. then intersect them with each other and zero.
187 | mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
188 | upper_slope))
189 | # HTK excludes the spectrogram DC bin; make sure it always gets a zero
190 | # coefficient.
191 | mel_weights_matrix[0, :] = 0.0
192 | return mel_weights_matrix
193 |
194 |
195 | def log_mel_spectrogram(data,
196 | audio_sample_rate=8000,
197 | log_offset=0.0,
198 | window_length_secs=0.025,
199 | hop_length_secs=0.010,
200 | **kwargs):
201 | """Convert waveform to a log magnitude mel-frequency spectrogram.
202 |
203 | Args:
204 | data: 1D np.array of waveform data.
205 | audio_sample_rate: The sampling rate of data.
206 | log_offset: Add this to values when taking log to avoid -Infs.
207 | window_length_secs: Duration of each window to analyze.
208 | hop_length_secs: Advance between successive analysis windows.
209 | **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
210 |
211 | Returns:
212 | 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
213 | magnitudes for successive frames.
214 | """
215 | window_length_samples = int(round(audio_sample_rate * window_length_secs))
216 | hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
217 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
218 | spectrogram = stft_magnitude(
219 | data,
220 | fft_length=fft_length,
221 | hop_length=hop_length_samples,
222 | window_length=window_length_samples)
223 | mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
224 | num_spectrogram_bins=spectrogram.shape[1],
225 | audio_sample_rate=audio_sample_rate, **kwargs))
226 | return np.log(mel_spectrogram + log_offset)
227 |
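To make frame()'s stride-tricks windowing concrete, a quick check with a toy array (assuming the module is importable as vggish.mel_features, as the example's own imports suggest):

    import numpy as np
    from vggish.mel_features import frame

    data = np.arange(10)
    print(frame(data, window_length=4, hop_length=2))
    # [[0 1 2 3]
    #  [2 3 4 5]
    #  [4 5 6 7]
    #  [6 7 8 9]]
    # num_frames = 1 + floor((10 - 4) / 2) = 4; the trailing samples that
    # do not fill a complete window would simply be dropped.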
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_input.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Compute input examples for VGGish from audio waveform."""
20 |
21 | import resampy
22 |
23 |
24 | from vggish.mel_features import *
25 | from vggish.vggish_params import *
26 | import librosa
27 |
28 | try:
29 | import soundfile as sf
30 |
31 |
32 | def wav_read(wav_file):
33 | wav_data, sr = sf.read(wav_file, dtype='int16')
34 | return wav_data, sr
35 |
36 | except ImportError:
37 |
38 | def wav_read(wav_file):
39 | raise NotImplementedError('WAV file reading requires soundfile package.')
40 |
41 |
42 | def waveform_to_examples(data, sample_rate):
43 | """Converts audio waveform into an array of examples for VGGish.
44 |
45 | Args:
46 | data: np.array of either one dimension (mono) or two dimensions
47 | (multi-channel, with the outer dimension representing channels).
48 | Each sample is generally expected to lie in the range [-1.0, +1.0],
49 | although this is not required.
50 | sample_rate: Sample rate of data.
51 |
52 | Returns:
53 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents
54 | a sequence of examples, each of which contains a patch of log mel
55 | spectrogram, covering num_frames frames of audio and num_bands mel frequency
56 | bands, where the frame length is STFT_HOP_LENGTH_SECONDS.
57 | """
58 | # Convert to mono.
59 | if len(data.shape) > 1:
60 | data = np.mean(data, axis=1)
61 | # Resample to the rate assumed by VGGish.
62 | if sample_rate != SAMPLE_RATE:
63 | data = resampy.resample(data, sample_rate, SAMPLE_RATE)
64 |
65 | # Compute log mel spectrogram features.
66 | log_mel = log_mel_spectrogram(
67 | data,
68 | audio_sample_rate=SAMPLE_RATE,
69 | log_offset=LOG_OFFSET,
70 | window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
71 | hop_length_secs=STFT_HOP_LENGTH_SECONDS,
72 | num_mel_bins=NUM_MEL_BINS,
73 | lower_edge_hertz=MEL_MIN_HZ,
74 | upper_edge_hertz=MEL_MAX_HZ)
75 |
76 | # Frame features into examples.
77 | features_sample_rate = 1.0 / STFT_HOP_LENGTH_SECONDS
78 | example_window_length = int(round(
79 | EXAMPLE_WINDOW_SECONDS * features_sample_rate))
80 | example_hop_length = int(round(
81 | EXAMPLE_HOP_SECONDS * features_sample_rate))
82 | log_mel_examples = frame(
83 | log_mel,
84 | window_length=example_window_length,
85 | hop_length=example_hop_length)
86 | return log_mel_examples
87 |
88 |
89 | def wavfile_to_examples(wav_file):
90 | """Convenience wrapper around waveform_to_examples() for a common WAV format.
91 |
92 | Args:
93 | wav_file: String path to a file, or a file-like object. The file
94 | is assumed to contain WAV audio data with signed 16-bit PCM samples.
95 |
96 | Returns:
97 | See waveform_to_examples.
98 | """
99 | wav_data, sr = wav_read(wav_file)
100 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
101 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0]
102 | return waveform_to_examples(samples, sr)
103 |
104 |
105 | def mp3_to_examples(mp3_file):
106 | """Convenience wrapper around waveform_to_examples() for a common mp3 format.
107 |
108 | Args:
109 | mp3_file: String path to a file, or a file-like object. The file
110 | is assumed to contain mp3 audio data.
111 |
112 | Returns:
113 | See waveform_to_examples.
114 | """
115 | x_data, sr = librosa.load(mp3_file)
116 | # librosa.load already returns float32 samples in [-1.0, +1.0],
117 | # so no int16 -> float conversion is needed here (cf. wavfile_to_examples).
118 | return waveform_to_examples(x_data, sr)
119 |
120 |
121 |
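Given the constants in vggish_params.py below (SAMPLE_RATE = 16000, 25 ms windows, 10 ms hops, 64 mel bins, 0.96 s examples), one second of audio should produce exactly one VGGish patch. A small sketch, assuming the package layout used by this example:

    import numpy as np
    from vggish.vggish_input import waveform_to_examples

    one_second = np.random.uniform(-1.0, 1.0, 16000)  # mono, already at 16 kHz
    examples = waveform_to_examples(one_second, sample_rate=16000)
    print(examples.shape)  # (1, 96, 64): one 0.96 s patch of 96 frames x 64 mel bands
    # 1 + floor((16000 - 400) / 160) = 98 spectrogram frames, and
    # 1 + floor((98 - 96) / 96) = 1 example window of 96 frames.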
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_params.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Global parameters for the VGGish model.
20 |
21 | See vggish_slim.py for more information.
22 | """
23 |
24 | # Architectural constants.
25 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch.
26 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.
27 | EMBEDDING_SIZE = 128 # Size of embedding layer.
28 |
29 | # Hyperparameters used in feature and example generation.
30 | SAMPLE_RATE = 16000
31 | STFT_WINDOW_LENGTH_SECONDS = 0.025
32 | STFT_HOP_LENGTH_SECONDS = 0.010
33 | NUM_MEL_BINS = NUM_BANDS
34 | MEL_MIN_HZ = 125
35 | MEL_MAX_HZ = 7500
36 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.
37 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames
38 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap.
39 |
40 | # Parameters used for embedding postprocessing.
41 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
42 | PCA_MEANS_NAME = 'pca_means'
43 | QUANTIZE_MIN_VAL = -2.0
44 | QUANTIZE_MAX_VAL = +2.0
45 |
46 | # Hyperparameters used in training.
47 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights.
48 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.
49 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.
50 |
51 | # Names of ops, tensors, and features.
52 | INPUT_OP_NAME = 'vggish/input_features'
53 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
54 | OUTPUT_OP_NAME = 'vggish/embedding'
55 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
56 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'
57 |
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_postprocess.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Post-process embeddings from VGGish."""
20 |
21 | import numpy as np
22 |
23 | from vggish.vggish_params import *
24 |
25 |
26 | class Postprocessor(object):
27 | """Post-processes VGGish embeddings.
28 |
29 | The initial release of AudioSet included 128-D VGGish embeddings for each
30 | segment of AudioSet. These released embeddings were produced by applying
31 | a PCA transformation (technically, a whitening transform is included as well)
32 | and 8-bit quantization to the raw embedding output from VGGish, in order to
33 | stay compatible with the YouTube-8M project which provides visual embeddings
34 | in the same format for a large set of YouTube videos. This class implements
35 | the same PCA (with whitening) and quantization transformations.
36 | """
37 |
38 | def __init__(self, pca_params_npz_path):
39 | """Constructs a postprocessor.
40 |
41 | Args:
42 | pca_params_npz_path: Path to a NumPy-format .npz file that
43 | contains the PCA parameters used in postprocessing.
44 | """
45 | params = np.load(pca_params_npz_path)
46 | self._pca_matrix = params[PCA_EIGEN_VECTORS_NAME]
47 | # Load means into a column vector for easier broadcasting later.
48 | self._pca_means = params[PCA_MEANS_NAME].reshape(-1, 1)
49 | assert self._pca_matrix.shape == (
50 | EMBEDDING_SIZE, EMBEDDING_SIZE), (
51 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
52 | assert self._pca_means.shape == (EMBEDDING_SIZE, 1), (
53 | 'Bad PCA means shape: %r' % (self._pca_means.shape,))
54 |
55 | def postprocess(self, embeddings_batch):
56 | """Applies postprocessing to a batch of embeddings.
57 |
58 | Args:
59 | embeddings_batch: An nparray of shape [batch_size, embedding_size]
60 | containing output from the embedding layer of VGGish.
61 |
62 | Returns:
63 | An nparray of the same shape as the input but of type uint8,
64 | containing the PCA-transformed and quantized version of the input.
65 | """
66 | assert len(embeddings_batch.shape) == 2, (
67 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
68 | assert embeddings_batch.shape[1] == EMBEDDING_SIZE, (
69 | 'Bad batch shape: %r' % (embeddings_batch.shape,))
70 |
71 | # Apply PCA.
72 | # - Embeddings come in as [batch_size, embedding_size].
73 | # - Transpose to [embedding_size, batch_size].
74 | # - Subtract pca_means column vector from each column.
75 | # - Premultiply by PCA matrix of shape [output_dims, input_dims]
76 | # where both are are equal to embedding_size in our case.
77 | # - Transpose result back to [batch_size, embedding_size].
78 | pca_applied = np.dot(self._pca_matrix,
79 | (embeddings_batch.T - self._pca_means)).T
80 |
81 | # Quantize by:
82 | # - clipping to [min, max] range
83 | clipped_embeddings = np.clip(
84 | pca_applied, QUANTIZE_MIN_VAL,
85 | QUANTIZE_MAX_VAL)
86 | # - convert to 8-bit in range [0.0, 255.0]
87 | quantized_embeddings = (
88 | (clipped_embeddings - QUANTIZE_MIN_VAL) *
89 | (255.0 /
90 | (QUANTIZE_MAX_VAL - QUANTIZE_MIN_VAL)))
91 | # - cast 8-bit float to uint8
92 | quantized_embeddings = quantized_embeddings.astype(np.uint8)
93 |
94 | return quantized_embeddings
95 |
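The quantization step above maps the clipped range [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL] = [-2, 2] onto [0, 255]. A stand-alone sketch with illustrative values:

    import numpy as np

    # clip to [-2, 2], rescale to [0, 255], cast to uint8 (as in Postprocessor.postprocess)
    x = np.array([-3.0, -2.0, 0.0, 2.0])
    clipped = np.clip(x, -2.0, 2.0)
    quantized = ((clipped + 2.0) * (255.0 / 4.0)).astype(np.uint8)
    print(quantized)  # [  0   0 127 255] -- note 0.0 lands on 127 after truncation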
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_slim.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Defines the 'VGGish' model used to generate AudioSet embedding features.
20 |
21 | The public AudioSet release (https://research.google.com/audioset/download.html)
22 | includes 128-D features extracted from the embedding layer of a VGG-like model
23 | that was trained on a large Google-internal YouTube dataset. Here we provide
24 | a TF-Slim definition of the same model, without any dependences on libraries
25 | internal to Google. We call it 'VGGish'.
26 |
27 | Note that we only define the model up to the embedding layer, which is the
28 | penultimate layer before the final classifier layer. We also provide various
29 | hyperparameter values (in vggish_params.py) that were used to train this model
30 | internally.
31 |
32 | For comparison, here is TF-Slim's VGG definition:
33 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py
34 | """
35 |
36 | import tensorflow.compat.v1 as tf
37 | import tf_slim as slim
38 |
39 | from vggish.vggish_params import *
40 |
41 |
42 | def define_vggish_slim(features_tensor=None, training=False):
43 | """Defines the VGGish TensorFlow model.
44 |
45 | All ops are created in the current default graph, under the scope 'vggish/'.
46 |
47 | The input is either a tensor passed in via the optional 'features_tensor'
48 | argument or a placeholder created below named 'vggish/input_features'. The
49 | input is expected to have dtype float32 and shape [batch_size, num_frames,
50 | num_bands] where batch_size is variable and num_frames and num_bands are
51 | constants, and [num_frames, num_bands] represents a log-mel-scale spectrogram
52 | patch covering num_bands frequency bands and num_frames time frames (where
53 | each frame step is usually 10ms). This is produced by computing the stabilized
54 | log(mel-spectrogram + LOG_OFFSET). The output is a tensor named
55 | 'vggish/embedding' which produces the pre-activation values of a 128-D
56 | embedding layer, which is usually the penultimate layer when used as part of a
57 | full model with a final classifier layer.
58 |
59 | Args:
60 | features_tensor: If not None, the tensor containing the input features.
61 | If None, a placeholder input is created.
62 | training: If true, all parameters are marked trainable.
63 |
64 | Returns:
65 | The op 'vggish/embeddings'.
66 | """
67 | # Defaults:
68 | # - All weights are initialized to N(0, INIT_STDDEV).
69 | # - All biases are initialized to 0.
70 | # - All activations are ReLU.
71 | # - All convolutions are 3x3 with stride 1 and SAME padding.
72 | # - All max-pools are 2x2 with stride 2 and SAME padding.
73 | with slim.arg_scope([slim.conv2d, slim.fully_connected],
74 | weights_initializer=tf.truncated_normal_initializer(
75 | stddev=INIT_STDDEV),
76 | biases_initializer=tf.zeros_initializer(),
77 | activation_fn=tf.nn.relu,
78 | trainable=training), \
79 | slim.arg_scope([slim.conv2d],
80 | kernel_size=[3, 3], stride=1, padding='SAME'), \
81 | slim.arg_scope([slim.max_pool2d],
82 | kernel_size=[2, 2], stride=2, padding='SAME'), \
83 | tf.variable_scope('vggish'):
84 | # Input: a batch of 2-D log-mel-spectrogram patches.
85 | if features_tensor is None:
86 | features_tensor = tf.placeholder(
87 | tf.float32, shape=(None, NUM_FRAMES, NUM_BANDS),
88 | name='input_features')
89 | # Reshape to 4-D so that we can convolve a batch with conv2d().
90 | net = tf.reshape(features_tensor,
91 | [-1, NUM_FRAMES, NUM_BANDS, 1])
92 |
93 | # The VGG stack of alternating convolutions and max-pools.
94 | net = slim.conv2d(net, 64, scope='conv1')
95 | net = slim.max_pool2d(net, scope='pool1')
96 | net = slim.conv2d(net, 128, scope='conv2')
97 | net = slim.max_pool2d(net, scope='pool2')
98 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
99 | net = slim.max_pool2d(net, scope='pool3')
100 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
101 | net = slim.max_pool2d(net, scope='pool4')
102 |
103 | # Flatten before entering fully-connected layers
104 | net = slim.flatten(net)
105 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
106 | # The embedding layer.
107 | net = slim.fully_connected(net, EMBEDDING_SIZE, scope='fc2',
108 | activation_fn=None)
109 | return tf.identity(net, name='embedding')
110 |
111 |
112 | def load_vggish_slim_checkpoint(session, checkpoint_path):
113 | """Loads a pre-trained VGGish-compatible checkpoint.
114 |
115 | This function can be used as an initialization function (referred to as
116 | init_fn in TensorFlow documentation) which is called in a Session after
117 | initializating all variables. When used as an init_fn, this will load
118 | a pre-trained checkpoint that is compatible with the VGGish model
119 | definition. Only variables defined by VGGish will be loaded.
120 |
121 | Args:
122 | session: an active TensorFlow session.
123 | checkpoint_path: path to a file containing a checkpoint that is
124 | compatible with the VGGish model definition.
125 | """
126 | # Get the list of names of all VGGish variables that exist in
127 | # the checkpoint (i.e., all inference-mode VGGish variables).
128 | with tf.Graph().as_default():
129 | define_vggish_slim(training=False)
130 | vggish_var_names = [v.name for v in tf.global_variables()]
131 |
132 | # Get the list of all currently existing variables that match
133 | # the list of variable names we just computed.
134 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names]
135 |
136 | # Use a Saver to restore just the variables selected above.
137 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained',
138 | write_version=1)
139 | saver.restore(session, checkpoint_path)
140 |
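A minimal sketch of wiring these two functions together, assuming the TF1-compat setup this module already uses; 'vggish_model.ckpt' stands in for the published VGGish checkpoint and its local path is an assumption here:

    import tensorflow.compat.v1 as tf
    from vggish import vggish_slim

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        # path to the pre-trained checkpoint is an assumption of this sketch
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        # The graph now exposes 'vggish/input_features:0' and 'vggish/embedding:0',
        # matching INPUT_TENSOR_NAME and OUTPUT_TENSOR_NAME in vggish_params.py.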
--------------------------------------------------------------------------------
/cross-modal-search/.dockerignore:
--------------------------------------------------------------------------------
1 | workspace
2 | venv
3 | .venv
4 |
--------------------------------------------------------------------------------
/cross-modal-search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/__init__.py
--------------------------------------------------------------------------------
/cross-modal-search/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 |
7 | import click
8 | from jina import Flow, Document, DocumentArray
9 | import logging
10 | import matplotlib.pyplot as plt
11 |
12 | from dataset import input_index_data
13 |
14 | MAX_DOCS = int(os.environ.get("JINA_MAX_DOCS", 10000))
15 | cur_dir = os.path.dirname(os.path.abspath(__file__))
16 | DEFAULT_QUERY_IMAGE = 'toy-data/images/1000268201_693b08cb0e.jpg'
17 | DEFAULT_QUERY_TEXT = 'a black dog and a spotted dog are fighting'
18 |
19 |
20 | def config():
21 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
22 | os.environ.setdefault(
23 | 'JINA_WORKSPACE_MOUNT',
24 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
25 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO')
26 | os.environ.setdefault('JINA_PORT', str(45678))
27 |
28 |
29 | def index_restful():
30 | flow = Flow().load_config('flows/flow-index.yml', override_with={'protocol': 'http'})
31 | with flow:
32 | flow.block()
33 |
34 |
35 | def check_query_result(text_doc, image_doc, img_uri):
36 | # Image doc matches are text:
37 | print(f'Searching with image {img_uri}. Matches:')
38 | if image_doc.matches:
39 | for m in image_doc.matches:
40 | print(
41 | f'\t-- text: "{m.text}" '
42 | f'score: {m.scores["cosine"].value:.4f},'
43 | )
44 |
45 | # Text doc matches are images
46 | print(f'\nSearching with text "{text_doc.text}". Matches:')
47 | if text_doc.matches:
48 | f, axarr = plt.subplots(1, len(text_doc.matches))
49 |
50 | for i, m in enumerate(text_doc.matches):
51 | axarr[i].title.set_text(f'score={m.scores["cosine"].value:.4f}')
52 | axarr[i].imshow(m.blob)
53 | axarr[i].axes.xaxis.set_visible(False)
54 | axarr[i].axes.yaxis.set_visible(False)
55 | plt.suptitle(f"Best matches for '{text_doc.text}'")
56 | plt.show()
57 |
58 |
59 | def index(data_set, num_docs, request_size):
60 | flow = Flow().load_config('flows/flow-index.yml')
61 | with flow:
62 | flow.post(on='/index',
63 | inputs=input_index_data(num_docs, request_size, data_set),
64 | request_size=request_size,
65 | show_progress=True)
66 |
67 |
68 | def query(query_image, query_text):
69 | flow = Flow().load_config('flows/flow-query.yml')
70 | with flow:
71 | img_uri = query_image
72 | text_doc = Document(text=query_text,
73 | modality='text')
74 | image_doc = Document(uri=img_uri,
75 | modality='image')
76 | import time
77 | start = time.time()
78 | result_text = flow.post(on='/search', inputs=text_doc,
79 | return_results=True)
80 | result_image = flow.post(on='/search', inputs=image_doc,
81 | return_results=True)
82 | print(f'Request duration: {time.time() - start}')
83 | check_query_result(result_text[0].docs[0], result_image[0].docs[0], img_uri)
84 |
85 |
86 |
87 | def query_restful():
88 | flow = Flow(cors=True).load_config('flows/flow-query.yml')
89 | flow.rest_api = True
90 | flow.protocol = 'http'
91 | with flow:
92 | flow.block()
93 |
94 |
95 | @click.command()
96 | @click.option('--task', '-t', type=click.Choice(['index', 'index_restful', 'query_restful', 'query']), default='index')
97 | @click.option("--num_docs", "-n", default=MAX_DOCS)
98 | @click.option('--request_size', '-s', default=16)
99 | @click.option('--data_set', '-d', type=click.Choice(['f30k', 'f8k', 'toy-data'], case_sensitive=False), default='toy-data')
100 | @click.option('--query-image', '-i', type=str, default=DEFAULT_QUERY_IMAGE)
101 | @click.option('--query-text', '-q', type=str, default=DEFAULT_QUERY_TEXT)
102 | def main(task, num_docs, request_size, data_set, query_image, query_text):
103 | config()
104 | workspace = os.environ['JINA_WORKSPACE']
105 | logger = logging.getLogger('cross-modal-search')
106 | if 'index' in task:
107 | if os.path.exists(workspace):
108 | logger.error(
109 | f'\n +------------------------------------------------------------------------------------+ \
110 | \n | 🤖🤖🤖 | \
111 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \
112 | \n | 🤖🤖🤖 | \
113 | \n +------------------------------------------------------------------------------------+'
114 | )
115 | sys.exit(1)
116 | if 'query' in task:
117 | if not os.path.exists(workspace):
118 | logger.error(f'The directory {workspace} does not exist. Please index first via `python app.py -t index`')
119 | sys.exit(1)
120 |
121 | if task == 'index':
122 | index(data_set, num_docs, request_size)
123 | elif task == 'index_restful':
124 | index_restful()
125 | elif task == 'query':
126 | query(query_image, query_text)
127 | elif task == 'query_restful':
128 | query_restful()
129 |
130 |
131 | if __name__ == '__main__':
132 | main()
133 |
--------------------------------------------------------------------------------
/cross-modal-search/dataset.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 |
5 | import os
6 | import json as jsonmod
7 | import hashlib
8 |
9 | import torch
10 | import torch.utils.data as data
11 | from jina import Document
12 |
13 |
14 | cur_dir = os.path.dirname(os.path.abspath(__file__))
15 |
16 |
17 | class Flickr30kDataset(data.Dataset):
18 | """
19 | Dataset loader for Flickr30k full datasets.
20 | """
21 |
22 | def __init__(self, images_root, json, split):
23 | self.images_root = images_root
24 | self.dataset = jsonmod.load(open(json, 'r'))['images']
25 | self.ids = []
26 | for i, d in enumerate(self.dataset):
27 | if d['split'] == split:
28 | self.ids += [(i, x) for x in range(len(d['sentences']))]
29 |
30 | def __getitem__(self, index):
31 | """This function returns a tuple that is further passed to collate_fn
32 | """
33 | images_root = self.images_root
34 | ann_id = self.ids[index]
35 | img_id = ann_id[0]
36 | caption = self.dataset[img_id]['sentences'][ann_id[1]]['raw']
37 | img_file_name = self.dataset[img_id]['filename']
38 |
39 | image_file_path = os.path.join(images_root, img_file_name)
40 | with open(image_file_path, 'rb') as fp:
41 | image_buffer = fp.read()
42 | return image_buffer, str(caption).lower()
43 |
44 | def __len__(self):
45 | return len(self.ids)
46 |
47 |
48 | class FlickrDataset(data.Dataset):
49 | """
50 | Dataset loader for Flickr8k full datasets.
51 | """
52 |
53 | def __init__(self, images_root, captions_file_path):
54 | self.images_root = images_root
55 | self.captions_file_path = captions_file_path
56 | with open(self.captions_file_path, 'r') as cf:
57 | self.lines = cf.readlines()[1:]
58 |
59 | def __getitem__(self, index):
60 | """This function returns a tuple that is further passed to collate_fn
61 | """
62 | image_file_name, caption = self.lines[index*5].split(',', 1)
63 | with open(os.path.join(self.images_root, image_file_name), 'rb') as fp:
64 | image_buffer = fp.read()
65 | return image_buffer, str(caption).lower().rstrip()
66 |
67 | def __len__(self):
68 | return int(len(self.lines)/5)
69 |
70 |
71 | def collate_fn(data):
72 | # Not sure this is actually needed
73 | images, captions = zip(*data)
74 | return images, captions
75 |
76 |
77 | def get_data_loader(split, root, captions, batch_size=8, dataset_type='f30k', shuffle=False,
78 | num_workers=1, collate_fn=collate_fn):
79 | """Returns torch.utils.data.DataLoader for custom coco dataset."""
80 |
81 | if dataset_type == 'f30k':
82 | dataset = Flickr30kDataset(images_root=root, split=split, json=captions)
83 | elif dataset_type == 'f8k' or dataset_type == 'toy-data':
84 | dataset = FlickrDataset(images_root=root, captions_file_path=captions)
85 | else:
86 | raise NotImplementedError(f'Invalid dataset type: {dataset_type}')
87 | # Data loader
88 | data_loader = torch.utils.data.DataLoader(dataset=dataset,
89 | batch_size=batch_size,
90 | shuffle=shuffle,
91 | pin_memory=True,
92 | num_workers=num_workers,
93 | collate_fn=collate_fn)
94 |
95 | return data_loader
96 |
97 |
98 | def input_index_data(num_docs=None, batch_size=8, dataset_type='f30k'):
99 | captions = 'dataset_flickr30k.json' if dataset_type == 'f30k' else 'captions.txt'
100 | if dataset_type == 'toy-data':
101 | base_folder = '.'
102 | else:
103 | base_folder = 'data'
104 | data_loader = get_data_loader(
105 | root=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/images'),
106 | captions=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/{captions}'),
107 | split='test',
108 | batch_size=batch_size,
109 | dataset_type=dataset_type
110 | )
111 |
112 | for i, (images, captions) in enumerate(data_loader):
113 | for image, caption in zip(images, captions):
114 | hashed = hashlib.sha1(image).hexdigest()
115 | document_img = Document()
116 |
117 | document_img.buffer = image
118 | document_img.modality = 'image'
119 | document_img.mime_type = 'image/jpeg'
120 |
121 | document_caption = Document(id=hashed)
122 |
123 | document_caption.text = caption
124 | document_caption.modality = 'text'
125 | document_caption.mime_type = 'text/plain'
126 | document_caption.tags['id'] = caption
127 |
128 | yield document_img
129 | yield document_caption
130 |
131 | if num_docs and (i + 1) * batch_size >= num_docs:
132 | break
133 |
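To see the alternating image/caption Documents this generator yields, a short inspection sketch; it assumes the toy-data folder bundled with the example (toy-data/images plus toy-data/captions.txt, as used by app.py) is present next to dataset.py:

    from dataset import input_index_data

    # Print modality and MIME type of the first four Documents.
    for i, doc in enumerate(input_index_data(num_docs=4, batch_size=2,
                                             dataset_type='toy-data')):
        print(doc.modality, doc.mime_type)  # image image/jpeg, text text/plain, ...
        if i >= 3:
            break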
--------------------------------------------------------------------------------
/cross-modal-search/flows/executors.py:
--------------------------------------------------------------------------------
1 | """ Implementation of filters for images and texts"""
2 |
3 | import numpy as np
4 | from jina import Executor, DocumentArray, requests
5 |
6 |
7 | class ImageReader(Executor):
8 | @requests(on='/index')
9 | def index_read(self, docs: 'DocumentArray', **kwargs):
10 | array = DocumentArray(list(filter(lambda doc: doc.modality == 'image', docs)))
11 | for doc in array:
12 | doc.convert_image_buffer_to_blob()
13 | doc.blob = np.array(doc.blob).astype(np.uint8)
14 | return array
15 |
16 | @requests(on='/search')
17 | def search_read(self, docs: 'DocumentArray', **kwargs):
18 | image_docs = DocumentArray(list(filter(lambda doc: doc.mime_type in ('image/jpeg', 'image/png'), docs)))
19 | if not image_docs:
20 | return DocumentArray([])
21 | for doc in image_docs:
22 | doc.convert_uri_to_buffer()
23 | doc.convert_image_buffer_to_blob()
24 | doc.blob = doc.blob.astype(np.uint8)
25 | return image_docs
26 |
27 |
28 | class TextFilter(Executor):
29 | @requests
30 | def filter_text(self, docs: 'DocumentArray', **kwargs):
31 | docs = DocumentArray(list(filter(lambda doc: doc.mime_type == 'text/plain', docs)))
32 | return docs
33 |
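These filters split one mixed request into modality-specific paths. A minimal check of TextFilter, assuming jina 2.x and that this file is importable as flows.executors (it is loaded via py_modules in the flow YAML below):

    from jina import Document, DocumentArray
    from flows.executors import TextFilter  # import path is an assumption of this sketch

    docs = DocumentArray([
        Document(text='a caption', mime_type='text/plain'),
        Document(mime_type='image/jpeg'),
    ])
    print(len(TextFilter().filter_text(docs)))  # 1: the image document is filtered out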
--------------------------------------------------------------------------------
/cross-modal-search/flows/flow-index.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # We configure the index flow here that is used for indexing images and captions
2 | version: '1' # yml version
3 | with: # Parameters for the flow are defined after with
4 | prefetch: 10 # Number of prefetched requests from the client
5 | port_expose: $JINA_PORT # Port defined in environment variable
6 | workspace: $JINA_WORKSPACE # Workspace folder
7 | pods: # Now, we define the pods that are used
8 | - name: image_loader # The first executor is an image loader that filters only image documents
9 | uses: ImageReader # Type of the executor
10 | py_modules: 'flows/executors.py' # The python file where the executor is implemented
11 | read_only: true # Executor does not modify files
12 | needs: gateway # Executor is after the gateway, this means at the start of the flow
13 | - name: image_encoder # After the images are read, compute their embedding in the encoder
14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1' # The type of the executor - here, we use a hub executor from the jinahub in the form of a docker container
15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount a volume into the executor
16 | timeout_ready: 600000 # Set a timeout for the executor
17 | read_only: true # Executor does not modify files
18 | needs: image_loader # This executor is located after the image loader in the flow
19 | - name: image_indexer # Executor that stores image embeddings
20 | uses: 'jinahub://SimpleIndexer/old' # Hub Executor - We use a SimpleIndexer here
21 | uses_with: # Define arguments for the SimpleIndexer
22 | index_file_name: 'image_index' # Folder path for this executor
23 | needs: image_encoder # This executor is after the image encoder in the flow
24 | - name: text_filter # Now, we define another path in the flow that is parallel in the execution
25 | uses: TextFilter # The first executor is a filter that filters all text documents and ignores images now
26 | py_modules: 'flows/executors.py' # File where the TextFilter is implemented
27 | needs: gateway # Start after the gateway, so at the beginning of the flow - this creates a second path in the flow
28 | - name: text_encoder # Create the next executor that computes embeddings for the text documents
29 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1' # Use a hub executor in docker
30 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount the models directory
31 | timeout_ready: 600000 # Set timeout
32 | read_only: true # Executor does not modify files
33 | needs: text_filter # Run this executor after the text filter
34 | - name: text_indexer # Finally, store the indexed text documents with embeddings on disk
35 | uses: 'jinahub://SimpleIndexer/old' # Use SimpleIndexer from hub in docker again
36 | uses_with: # Define parameters for the text indexer
37 | index_file_name: 'text_index' # Folder name in the workspace
38 | needs: text_encoder # Start after the text encoder executor is finished
39 | - name: join_all # This is the last executor - it waits until both paths in the flow are finished (image and text path)
40 | needs: [image_indexer, text_indexer] # Wait for these two executors to finish - only then we can continue
41 |
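Taken together, the comments above describe an index Flow with two parallel branches, one per modality, joined by `join_all`. A minimal driver for it might look like the sketch below; the environment defaults and the one-Document `toy_inputs` generator are illustrative stand-ins for the repo's own app and dataset loader:

```python
import os

from jina import Flow, Document

# The Flow above reads these from the environment (values here are arbitrary).
os.environ.setdefault('JINA_PORT', '45678')
os.environ.setdefault('JINA_WORKSPACE', './workspace')


def toy_inputs():
    # Stand-in for the dataset loader shown earlier.
    yield Document(text='A child in a pink dress', modality='text', mime_type='text/plain')


flow = Flow.load_config('flows/flow-index.yml')
with flow:
    flow.post(on='/index', inputs=toy_inputs(), request_size=10, show_progress=True)
```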
--------------------------------------------------------------------------------
/cross-modal-search/flows/flow-query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # This file defines the query flow which is used for searching in the indexed documents
2 | version: '1' # The query flow is very similar to the index flow - only the differences are explained here
3 | with:
4 | prefetch: 10
5 | port_expose: $JINA_PORT
6 | workspace: $JINA_WORKSPACE
7 | pods:
8 | - name: loader # Again, we start two paths in the flow - here we start the image path
9 | uses: ImageReader
10 | py_modules: 'flows/executors.py'
11 | read_only: true
12 | needs: [gateway]
13 | - name: image_encoder # Now, encode the images and compute the embeddings
14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1'
15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface
16 | timeout_ready: 600000
17 | read_only: true
18 | needs: loader
19 | - name: text_indexer # Now, we use the text indexer in the image path - This is how we achieve the cross-modality here
20 | uses: 'jinahub://SimpleIndexer/old' # The text indexer has indexed all text documents and stored them on disk.
21 | uses_with: # Then we return the closest matches as results
22 | index_file_name: 'text_index'
23 | needs: image_encoder
24 | force: True
25 | read_only: true
26 | - name: text_filter # Here, the text path starts
27 | uses: TextFilter
28 | py_modules: 'flows/executors.py'
29 | needs: [gateway]
30 | - name: text_encoder # Compute the embedding of the search text
31 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1'
32 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface
33 | timeout_ready: 600000
34 | read_only: true
35 | needs: text_filter
36 | - name: image_indexer # Now, we use the image indexer in the text path - this is again how we get cross-modality
37 | uses: 'jinahub://SimpleIndexer/old' # The image indexer has indexed all images and their embeddings
38 | uses_with:
39 | index_file_name: 'image_index'
40 | force: True
41 | read_only: true
42 | needs: text_encoder
43 | - name: join_all # Wait for both paths to finish and join the results
44 | needs: [image_indexer, text_indexer]
45 |
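Note how the branches are crossed at query time: the image branch ends in `text_indexer` and the text branch ends in `image_indexer`, so a text query returns image matches and vice versa. A hedged sketch of a text-to-image query against this Flow (environment defaults are again arbitrary):

```python
import os

from jina import Flow, Document

os.environ.setdefault('JINA_PORT', '45678')
os.environ.setdefault('JINA_WORKSPACE', './workspace')

flow = Flow.load_config('flows/flow-query.yml')
with flow:
    # The text query is encoded by CLIP and matched against the *image* index.
    response = flow.post(
        on='/search',
        inputs=Document(text='a black dog', mime_type='text/plain'),
        return_results=True,
    )
    for match in response[0].data.docs[0].matches:
        print(match.uri, match.scores)
```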
--------------------------------------------------------------------------------
/cross-modal-search/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="adityajn105/flickr8k"
3 | DATA_DIR="data/f8k"
4 |
5 | if [ -d ${DATA_DIR} ]; then
6 | echo "${DATA_DIR} exists, please remove it before running the script"
7 | exit 1
8 | fi
9 |
10 | mkdir -p ${DATA_DIR} && \
11 | kaggle datasets download -d ${DATASET} && \
12 | unzip -q flickr8k.zip && \
13 | rm flickr8k.zip && \
14 | mv Images data/f8k/images && \
15 | mv captions.txt data/f8k/captions.txt
--------------------------------------------------------------------------------
/cross-modal-search/get_data30k.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | pip install kaggle
3 | kaggle datasets download hsankesara/flickr-image-dataset && \
4 | unzip flickr-image-dataset.zip && \
5 | rm flickr-image-dataset.zip && \
6 | wget -q http://www.cs.toronto.edu/~faghri/vsepp/data.tar && \
7 | tar -xvf data.tar && \
8 | rm -rf data.tar && \
9 | rm -rf data/coco* && \
10 | rm -rf data/f8k* && \
11 | rm -rf data/*precomp* && \
12 | rm -rf data/f30k/images && \
13 | mv flickr-image-dataset data/f30k/images
14 |
--------------------------------------------------------------------------------
/cross-modal-search/requirements.txt:
--------------------------------------------------------------------------------
1 | jina[standard,rich]==2.0.18
2 | click==8.0.1
3 | kaggle==1.5.12
4 | git+https://github.com/jina-ai/jina-commons@v0.0.3
5 | matplotlib==3.4.3
6 | torch==1.9.0
--------------------------------------------------------------------------------
/cross-modal-search/setup_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | TEST_DATA_DIR=data/
4 |
5 | rm -rf ${TEST_DATA_DIR} && \
6 | mkdir -p ${TEST_DATA_DIR}/f8k/images && \
7 | python ../.github/util/pull_dataset.py -d cross-modal-search/f8k.zip -p ../ && \
8 | unzip -o f8k.zip -d ${TEST_DATA_DIR} && \
9 | rm f8k.zip && \
10 | mv ${TEST_DATA_DIR}/Images/* ${TEST_DATA_DIR}/f8k/images && \
11 | mv ${TEST_DATA_DIR}/captions.txt data/f8k/captions.txt && \
12 | rm -rf workspace && \
13 | python app.py -t index | tee metrics.txt && \
14 | rm -rf ${TEST_DATA_DIR}
--------------------------------------------------------------------------------
/cross-modal-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/tests/__init__.py
--------------------------------------------------------------------------------
/cross-modal-search/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/cross-modal-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==6.2.4
2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard,rich]
3 | click==8.0.1
4 | git+https://github.com/jina-ai/jina-commons@v0.0.3
5 | kaggle==1.5.12
6 | matplotlib==3.4.3
7 | torch==1.9.0
--------------------------------------------------------------------------------
/cross-modal-search/tests/test_cross_modal_search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append('..')
4 | from app import main
5 | from click.testing import CliRunner
6 |
7 |
8 | def config(tmpdir):
9 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace')
10 |
11 |
12 | def test_cross_modal_search(tmpdir):
13 | config(tmpdir)
14 | runner = CliRunner()
15 | result = runner.invoke(main, ['-t', 'index'])
16 | assert 'done in' in result.stdout
17 | assert result.stderr_bytes is None
18 | result = runner.invoke(main, ['-t', 'query'])
19 | assert result.stderr_bytes is None
20 |
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/captions.txt:
--------------------------------------------------------------------------------
1 | image,caption
2 | 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
3 | 1000268201_693b08cb0e.jpg,A girl going into a wooden building .
4 | 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
5 | 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .
6 | 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .
7 | 1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
8 | 1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .
9 | 1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .
10 | 1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road .
11 | 1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .
12 |
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-index-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-index-flow.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-query-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-query-flow.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-result.jpg
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/image_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/image_results.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/text_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/text_results.png
--------------------------------------------------------------------------------
/example-guidelines.md:
--------------------------------------------------------------------------------
1 | # Submit Your Community Example!
2 |
3 | Thanks for your interest in submitting your example! Here are some rules and guidelines:
4 |
5 | ## Rules
6 |
7 | ### `jina` in `requirements.txt`
8 |
9 | To be eligible for listing, you **must** have `jina==x.x.x` in your `requirements.txt`, where `x.x.x` refers to the semantic version number.
10 |
11 | Note: If you're building a front-end that just interfaces with Jina's API and doesn't rely on Jina core itself, there's no need to follow this requirement.
12 |
13 | ### `jina-` at start of name
14 |
15 | Your repo name should be `jina-xxxxxxx`.
16 |
17 | ### Clear README
18 |
19 | - Explain what your example does and how to run it
20 |
21 | ### Use scripts to get external resources
22 |
23 | - **For datasets:** Use a script named `get_data.sh`
24 | - **For models**: If you use an externally-hosted model, call your script `get_model.sh` or similar
25 | - **For other assets:** Follow the `get_xxx.sh` pattern
26 |
27 | ### `.gitignore` and `.dockerignore`
28 |
29 | Have a `.gitignore` file and list any directories that should be ignored. The same goes for `.dockerignore` if you have a `Dockerfile`:
30 |
31 | - `data` directory
32 | - `workspace` directory
33 | - virtual environment directories
34 | - directories that store assets retrieved by [scripts](#use-scripts-to-get-external-resources)
35 |
36 | ### License
37 |
38 | You **must** use an open-source license, specified in `LICENSE` in the root of your repo
39 |
40 | ## Guidelines
41 |
42 | We're more easy-going on these:
43 |
44 | ### One Example Per Repo
45 |
46 | To make code more maintainable and easier for end users, please include one example per repo.
47 |
48 | ### Tests
49 |
50 | Please include tests to ensure your app or Pod works correctly.
51 |
52 | ### File Structure
53 |
54 | - Please follow the file structure as created by `jina hub new --type app`
55 | - Store data in `data` and externally-downloaded models in `models`
56 |
57 | ### Dockerfile
58 |
59 | We highly encourage you to add a `Dockerfile`.
60 |
61 | ### Docker image
62 |
63 | For self-contained apps, we would love to host a Docker image on [Jina Hub](https://github.com/jina-ai/jina-hub)
64 |
--------------------------------------------------------------------------------
/example_template.md:
--------------------------------------------------------------------------------
1 | # Run the EXAMPLE NAME
2 | *You can also include a gif with a full demo of the example*
3 |
4 |
5 | *ADD A TABLE OF CONTENTS HERE*
6 |
7 | - [Overview](#overview)
8 | - [🐍 Build the app with Python](#-build-the-app-with-python)
9 | - [🔮 Overview of the files](#-overview-of-the-files)
10 | - [🌀 Flow diagram](#-flow-diagram)
11 | - [⏭️ Next steps](#-next-steps)
12 | - [🐋 Deploy with Docker](#-deploy-with-docker)
13 | - [🙍 Community](#-community)
14 | - [🦄 License](#-license)
15 |
16 |
17 | ## Overview
18 | | About this example: | |
19 | | ------------- | ------------- |
20 | | Learnings | *Describe what the user will learn after running this example* |
21 | | Used for indexing | *What is the datatype of the indexing input* |
22 | | Used for querying | *What is the data type of the query input* |
23 | | Dataset used | *Link to the datasets* |
24 | | Model used | *Link to the model* |
25 |
26 |
27 | ## 🐍 Build the app with Python
28 |
29 | These instructions explain how to build the example yourself and deploy it with Python. If you want to skip the building steps and just run the example with Docker, check [the Docker deployment instructions at the end of this README](#-deploy-with-docker)
30 |
31 |
32 | ### 🗝️ Requirements
33 |
34 | *Here outline in bullet points anything the user is expected to have before diving in.*
35 |
36 | For example:
37 |
38 | 1. You have a working Python 3.8 environment.
39 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
40 | 3. You have at least 2GB of free space on your hard drive.
41 |
42 | ### 👾 Step 1. Clone the repo and install Jina
43 |
44 | Begin by cloning the repo, so you can get the required files and datasets. (If you already have the examples repository on your machine make sure to fetch the most recent version)
45 |
46 | ```sh
47 | git clone https://github.com/jina-ai/examples
48 | ```
49 |
50 | And enter the correct folder:
51 |
52 | ```sh
53 | cd examples/example_to_use (replace as necessary)
54 | ```
55 |
56 | In your terminal, you should now be located in the *enter example name* folder. Let's install Jina and the other required Python libraries. For further information on installing Jina check out [our documentation](https://docs.jina.ai/chapters/core/setup/).
57 |
58 | ```sh
59 | pip install -r requirements.txt
60 | ```
61 |
62 | ### 📥 Step 2. Download your data to search (Optional)
63 |
64 | There are two different options here. You can either use the toy data we provide in this repo, which is quick to index but will give very poor results. Alternatively, you can download a larger dataset, which takes longer to index, but will have better results.
65 |
66 | 1. **Toy dataset:** Skip to step 3. No action is needed here.
67 |
68 | 2. **Full dataset:**
69 | In order to get the full dataset, follow the instructions below:
70 | - Register for a free [Kaggle account](https://www.kaggle.com/account/login?phase=startRegisterTab&returnUrl=%2F)
71 | - Set up your API token (see [authentication section of their API docs](https://www.kaggle.com/docs/api))
72 | - Run `pip install kaggle`
73 | - Run `sh get_data.sh`
74 |
75 | ### 🏃 Step 3. Index your data
76 | In this step, we will index our data.
77 |
78 | *Here describe the Index Flow. Be as specific as possible in describing how this Index Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.*
79 |
80 | ```sh
81 | python app.py -t index (replace as necessary)
82 | ```
83 |
84 | If you see the following output, it means your data has been correctly indexed.
85 |
86 | ```
87 | Flow@5162[S]:flow is closed and all resources are released, current build level is 0
88 | ```
89 |
90 | ### 🔎 Step 4: Query your data
91 | Next, we will deploy our query Flow.
92 |
93 | *Here describe the Query Flow. Be as specific as possible in describing how this Query Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.*
94 |
95 | Run the query Flow in your terminal like this:
96 |
97 | ```sh
98 | python app.py -t query (replace as necessary)
99 | ```
100 | ______
101 |
102 | ## 📉 Understanding your results
103 | *Here include a short description of the results and how to interpret them if needed.*
104 |
105 | ## 🌀 Flow diagram
106 | This diagram provides a visual representation of the Flows in this example, showing which executors are used in which order.
107 |
108 | *Here Show the Flow for this example.*
109 |
110 | ## 📖 Optional: Extra information useful for the user
111 |
112 | *Use this section to add extra information you think the user could benefit from.
113 | QueryLanguage, Faiss, Annoy for example.*
114 |
115 | ## 🔮 Overview of the files
116 |
117 | *Add a list with all folders/files in the example:*
118 |
119 | | | |
120 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- |
121 | | 📂 `flows/` | Folder to store Flow configuration |
122 | | --- 📃 `index.yml` | YAML file to configure indexing Flow |
123 | | --- 📃 `query.yml` | YAML file to configure querying Flow |
124 | | 📂 `pods/` | Folder to store Pod configuration |
125 | | --- 📃 `encoder.yml` | YAML file to configure encoder Pod |
126 | | 📂 `workspace/` | Folder to store indexed files (embeddings and documents). Automatically created after the first indexing |
127 |
128 | _____
129 |
130 | ## 🐋 Deploy with Docker
131 | To make it easier for you, we have built and published the Docker image for this example.
132 |
133 | ### ☑️ Requirements:
134 |
135 | 1. You have Docker installed and working.
136 | 2. You have at least 8GB of free space on your hard drive.
137 |
138 | ### 🏃🏿♂️ Pull and run the image
139 | Running the following command will pull the Docker image and run it.
140 |
141 | *Replace below with the command to run the Docker image of this example*
142 |
143 | ```bash
144 | docker .
145 | ```
146 |
147 | _______
148 |
149 | ## ⏭️ Next steps
150 |
151 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app check out [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
152 |
153 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai).
154 |
155 | ## 👩👩👧👦 Community
156 |
157 | - [Slack channel](https://slack.jina.ai/) - a communication platform for developers to discuss Jina.
158 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities.
159 | - [Twitter](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`.
160 | - [Company](https://jina.ai) - know more about our company, we are fully committed to open-source!
161 |
162 | ## 🦄 License
163 |
164 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
165 |
166 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text.
167 |
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/demo.gif
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/index.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/index.jpg
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/search.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/search.jpg
--------------------------------------------------------------------------------
/multires-lyrics-search/.gitignore:
--------------------------------------------------------------------------------
1 | lyrics-data/lyrics-data.csv
2 |
--------------------------------------------------------------------------------
/multires-lyrics-search/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | __version__ = '0.0.1'
5 |
6 | import os
7 | import sys
8 | import click
9 |
10 | from jina import Flow, Document
11 | from helper import input_generator
12 | from jina.logging.predefined import default_logger as logger
13 |
14 |
15 | def config():
16 | cur_dir = os.path.dirname(os.path.abspath(__file__))
17 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
18 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
19 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
20 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO')
21 | if os.path.exists('lyrics-data/lyrics-data.csv'):
22 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-data.csv')
23 | else:
24 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv')
25 | os.environ.setdefault('JINA_PORT', str(45678))
26 |
27 |
28 | # for index
29 | def index(num_docs):
30 | flow = Flow.load_config('flows/index.yml')
31 | with flow:
32 | input_docs = input_generator(num_docs=num_docs)
33 | data_path = os.path.join(os.path.dirname(__file__),
34 | os.environ.get('JINA_DATA_FILE', None))
35 | flow.logger.info(f'Indexing {data_path}')
36 | flow.post(on='/index', inputs=input_docs, request_size=10,
37 | show_progress=True)
38 |
39 |
40 | # for search
41 | def query():
42 | flow = Flow.load_config('flows/query.yml')
43 | flow.rest_api = True
44 | flow.protocol = 'http'
45 | with flow:
46 | flow.block()
47 |
48 |
49 | def query_text():
50 | def print_result(response):
51 | doc = response.docs[0]
52 | for index, parent in enumerate(doc.matches):
53 | print(f'Parent {index}: Song Name: {parent.tags["SName"]}\n{parent.text}')
54 | for index, chunk in enumerate(doc.chunks):
55 | print(f'Chunk {index}: {chunk.text}')
56 | for match in chunk.matches:
57 | print(f'\tMatch: {match.text}')
58 |
59 | f = Flow.load_config('flows/query.yml')
60 | with f:
61 | search_text = input('Please type a sentence: ')
62 | doc = Document(content=search_text, mime_type='text/plain')
63 | response = f.post('/search', inputs=doc, parameters={'lookup_type': 'parent'}, return_results=True)
64 | print_result(response[0].data)
65 |
66 |
67 | @click.command()
68 | @click.option('--task', '-t',
69 | type=click.Choice(['index', 'query', 'query_text'], case_sensitive=False))
70 | @click.option('--num_docs', '-n', default=10000)
71 | def main(task, num_docs):
72 | config()
73 | workspace = os.environ["JINA_WORKSPACE"]
74 | if task == 'index':
75 | if os.path.exists(workspace):
76 | logger.error(f'\n +---------------------------------------------------------------------------------+ \
77 | \n | 🤖🤖🤖 | \
78 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \
79 | \n | 🤖🤖🤖 | \
80 | \n +---------------------------------------------------------------------------------+')
81 | sys.exit(1)
82 | index(num_docs)
83 | elif task == 'query':
84 | query()
85 | elif task == 'query_text':
86 | query_text()
87 | else:
88 | raise NotImplementedError(
89 | f'Unknown task: {task}.')
90 |
91 |
92 | if __name__ == '__main__':
93 | main()
94 |
--------------------------------------------------------------------------------
/multires-lyrics-search/flows/index.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # We define the flow used for indexing here
2 | version: '1' # yml version
3 | with: # Parameters for the flow
4 | workspace: $JINA_WORKSPACE # Workspace folder
5 | executors: # Now, define all the executors that are used
6 | - name: segmenter # The first executor splits the input text into sentences which are stored as chunks in the original documents
7 | uses: 'jinahub+docker://Sentencizer' # The type of the executor is Sentencizer, we download it from the hub as a docker container
8 | - name: encoder # Then, compute the embeddings of the sentences in this executor
9 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a TransformerTorchEncoder from the hub
10 | volumes: '~/.cache/huggingface:/root/.cache/huggingface' # Mount the huggingface cache into the docker container
11 | uses_with: # Override some parameters for the executor
12 | pooling_strategy: 'cls' # This is the pooling strategy that is used by the encoder
13 | pretrained_model_name_or_path: distilbert-base-cased # The ML model that is used
14 | max_length: 96 # Max length argument for the tokenizer
15 | device: 'cpu' # Run the executor on CPU - For GPU, we would have to use another container!
16 | default_traversal_paths: ['c'] # Compute the embeddings on the chunk level - the sentences created before
17 | - name: indexer # Now, index the sentences and store them to disk.
18 | uses: 'jinahub://SimpleIndexer/old' # We use a simple indexer for that purpose (not in docker, but from source code - there are some bugs with docker for this executor)
19 | uses_metas: # Set some meta arguments for this executor
20 | workspace: $JINA_WORKSPACE # Define the workspace folder for the executor
21 | uses_with: # Override parameters for the executor
22 | default_traversal_paths: ['c'] # Store the sentences on disk - this means on chunk level
23 | - name: root_indexer # Additionally to the sentences, we also need to store the original songs which are not split into sentences
24 | uses: 'jinahub+docker://LMDBStorage' # Therefore, we use a LMDBStorage indexer
25 | volumes: $JINA_WORKSPACE_MOUNT # Again, mount the workspace
26 | uses_with: # Override some parameters for the LMDBStorage
27 | default_traversal_paths: ['r'] # Now, we store the root documents, not the sentence chunks
28 | needs: [gateway] # We can start this at the beginning - in parallel to the sentence flow
29 | - name: wait_both # Now, we wait for both the root indexing and the sentence path to finish
30 | needs: [indexer, root_indexer] # Continue once these two executors are finished
31 |
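The `default_traversal_paths` settings above are what make this a multi-resolution index: `['c']` points an executor at the sentence chunks, `['r']` at the whole song. The distinction can be seen on a plain Document; a small illustrative sketch, not repo code:

```python
from jina import Document

song = Document(text='Hello world. Goodbye world.')
# The Sentencizer in the Flow produces one chunk per sentence;
# here we mimic that by hand.
for sentence in ('Hello world.', 'Goodbye world.'):
    song.chunks.append(Document(text=sentence))

# 'r' (root) traversal sees the song itself, 'c' sees the sentences.
print(len(song.chunks))   # -> 2 (what the encoder and indexer operate on)
print(song.text)          # -> the full lyrics (what the root_indexer stores)
```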
--------------------------------------------------------------------------------
/multires-lyrics-search/flows/query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # Now, we define the search flow for this example
2 | version: '1' # It is quite similar to the index flow, only the differences are explained here
3 | with:
4 | port_expose: $JINA_PORT # Port to run the flow on
5 | cors: true # Add cross origin headers to the request responses
6 | executors:
7 | - name: segmenter # First, split the search text into sentences again
8 | uses: 'jinahub+docker://Sentencizer'
9 | - name: encoder # Encode the search sentences into embeddings
10 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1'
11 | volumes: '~/.cache/huggingface:/root/.cache/huggingface'
12 | uses_with:
13 | pooling_strategy: 'cls'
14 | pretrained_model_name_or_path: distilbert-base-cased
15 | max_length: 96
16 | device: 'cpu'
17 | default_traversal_paths: ['c']
18 | - name: indexer # Compare the search sentence embeddings to the stored sentence embeddings from the indexing
19 | uses: 'jinahub://SimpleIndexer/old' # Then, return the closest matches for every sentence
20 | uses_metas:
21 | workspace: $JINA_WORKSPACE
22 | uses_with:
23 | default_traversal_paths: ['c']
24 | read_only: True
25 | - name: ranker # Now, we need to use a special ranker in the query flow
26 | uses: 'jinahub+docker://SimpleRanker' # This ranker collects all the matches from the sentences and adds them to the root document
27 | uses_with: # It also orders the matches according to their minimum distance
28 | metric: 'cosine'
29 | - name: root_indexer # Now, we attach the stored metadata of the root documents to the matches collected by the SimpleRanker
30 | uses: 'jinahub+docker://LMDBStorage'
31 | volumes: $JINA_WORKSPACE_MOUNT
32 | uses_with:
33 | default_traversal_paths: ['m']
34 | read_only: True
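The `SimpleRanker` above lifts sentence-level matches up to their parent songs, ordering parents by the minimum cosine distance of any of their sentences. Stripped of Jina types, the aggregation reduces to the sketch below; the dict-based match records are an illustrative assumption:

```python
# Group chunk-level matches by their parent song and rank parents
# by the best (minimum) cosine distance among their sentences.
matches = [
    {'parent_id': 'song-a', 'cosine': 0.30},
    {'parent_id': 'song-b', 'cosine': 0.12},
    {'parent_id': 'song-a', 'cosine': 0.05},
]

best = {}
for m in matches:
    pid = m['parent_id']
    best[pid] = min(best.get(pid, float('inf')), m['cosine'])

ranking = sorted(best, key=best.get)
print(ranking)  # -> ['song-a', 'song-b']  (song-a wins via its 0.05 sentence)
```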
--------------------------------------------------------------------------------
/multires-lyrics-search/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | kaggle datasets download -d neisse/scrapped-lyrics-from-6-genres
3 | unzip scrapped-lyrics-from-6-genres.zip
4 | rm -rf scrapped-lyrics-from-6-genres.zip
5 | rm -rf artists-data.csv
6 | mv lyrics-data.csv lyrics-data/lyrics-data.csv
7 |
--------------------------------------------------------------------------------
/multires-lyrics-search/helper.py:
--------------------------------------------------------------------------------
1 | """Helper functions for the multires example"""
2 |
3 | import csv
4 | import itertools as it
5 | import os
6 | import numpy as np
7 |
8 | from jina import Document
9 |
10 |
11 | def input_generator(num_docs: int):
12 | lyrics_file = os.environ.setdefault('JINA_DATA_FILE',
13 | 'lyrics-data/lyrics-toy-data1000.csv')
14 | with open(lyrics_file, newline='', encoding='utf-8') as f:
15 | reader = csv.reader(f)
16 | for row in it.islice(reader, num_docs):
17 | if row[-1] == 'ENGLISH':
18 | d = Document(text=row[3])
19 | d.tags['ALink'] = row[0]
20 | d.tags['SName'] = row[1]
21 | d.tags['SLink'] = row[2]
22 | yield d
23 |
24 |
25 | def num_input_docs():
26 | lyrics_file = os.environ.setdefault(
27 | 'JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv'
28 | )
29 | with open(lyrics_file, newline='', encoding='utf-8') as f:
30 | reader = csv.reader(f)
31 | return len(list(reader))
32 |
33 | def _ext_A(A):
34 | nA, dim = A.shape
35 | A_ext = np.ones((nA, dim * 3))
36 | A_ext[:, dim : 2 * dim] = A
37 | A_ext[:, 2 * dim :] = A ** 2
38 | return A_ext
39 |
40 |
41 | def _ext_B(B):
42 | nB, dim = B.shape
43 | B_ext = np.ones((dim * 3, nB))
44 | B_ext[:dim] = (B ** 2).T
45 | B_ext[dim : 2 * dim] = -2.0 * B.T
46 | del B
47 | return B_ext
48 |
49 |
50 | def _norm(A):
51 | return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True)
52 |
53 |
54 | def _euclidean(A_ext, B_ext):
55 | sqdist = A_ext.dot(B_ext).clip(min=0)
56 | return np.sqrt(sqdist)
--------------------------------------------------------------------------------
/multires-lyrics-search/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | jina[standard]==2.0.18
3 | kaggle==1.5.12
4 | docker
5 | git+https://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/multires-lyrics-search/static/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Lyrics Search Demo
8 |
9 |
10 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | Query
38 |
39 |
40 |
41 |
42 |
{{ searchIndicator }}
43 |
44 |
45 |
46 |
47 | Breakdown
48 |
51 |
52 |
53 |
56 | only show distance < {{ distThreshold }}
57 |
58 |
59 |
151 |
152 |
153 |
154 |
157 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
/multires-lyrics-search/static/main.css:
--------------------------------------------------------------------------------
1 | .lyric-text {
2 | font-size: 10px;
3 | text-align: center;
4 | max-height: 30em;
5 | overflow: scroll;
6 | }
7 |
8 | .blockquote-footer {
9 | text-align: center;
10 | }
11 |
12 | .query-chunk {
13 | border-width: 1px;
14 | border-radius: 3px;
15 | border-style: solid;
16 | }
17 |
18 | .query-chunk-breakdown {
19 | margin: 5px;
20 | border-style: solid;
21 | border-width: 1px;
22 | border-radius: 5px;
23 | border-color: lightgray;
24 | padding: 5px;
25 | }
26 |
27 | .card {
28 | margin: 5px;
29 | }
--------------------------------------------------------------------------------
/multires-lyrics-search/static/vue-bindings.js:
--------------------------------------------------------------------------------
1 | var VueMasonryPlugin = window["vue-masonry-plugin"].VueMasonryPlugin;
2 | Vue.use(VueMasonryPlugin);
3 |
4 | const vm = new Vue({
5 | el: '#jina-ui',
6 | data: {
7 | serverUrl: 'http://localhost:45678/search',
8 | top_k: 50,
9 | topkDocs: [],
10 | topkDocsDict: {},
11 | results: [],
12 | searchQuery: '',
13 | queryChunks: [],
14 | selectQueryChunks: [],
15 | queryItem: [],
16 | docItem: null,
17 | loadedItem: 0,
18 | loadedQuery: 0,
19 | searchQueryIsDirty: false,
20 | isCalculating: false,
21 | distThreshold: 999,
22 | sliderOptions: {
23 | dotSize: 14,
24 | width: 'auto',
25 | height: 4,
26 | contained: false,
27 | direction: 'ltr',
28 | data: null,
29 | min: 999,
30 | max: 0,
31 | interval: 0.01,
32 | disabled: false,
33 | clickable: true,
34 | duration: 0.5,
35 | adsorb: false,
36 | lazy: false,
37 | tooltip: 'active',
38 | tooltipPlacement: 'top',
39 | tooltipFormatter: void 0,
40 | useKeyboard: false,
41 | keydownHook: null,
42 | dragOnClick: false,
43 | enableCross: true,
44 | fixed: false,
45 | minRange: void 0,
46 | maxRange: void 0,
47 | order: true,
48 | marks: false,
49 | dotOptions: void 0,
50 | process: true,
51 | dotStyle: void 0,
52 | railStyle: void 0,
53 | processStyle: void 0,
54 | tooltipStyle: void 0,
55 | stepStyle: void 0,
56 | stepActiveStyle: void 0,
57 | labelStyle: void 0,
58 | labelActiveStyle: void 0,
59 | }
60 | },
61 | mounted: function () {
62 |
63 | },
64 | components: {
65 | 'vueSlider': window['vue-slider-component'],
66 | },
67 | computed: {
68 | searchIndicator: function () {
69 | if (this.isCalculating) {
70 | return '⟳ Fetching new results...'
71 | } else if (this.searchQueryIsDirty) {
72 | return '... Typing'
73 | } else {
74 |
75 | return '✓ Done'
76 | }
77 | }
78 | },
79 | watch: {
80 | searchQuery: function () {
81 | this.searchQueryIsDirty = true
82 | this.expensiveOperation()
83 | },
84 | distThreshold: function () {
85 | this.refreshAllCards();
86 | }
87 | },
88 | methods: {
89 | clearAllSelect: function () {
90 | vm.queryChunks.forEach(function (item, i) {
91 | item['isSelect'] = !item['isSelect'];
92 | vm.refreshAllCards();
93 | });
94 | },
95 | selectChunk: function (item) {
96 | item['isSelect'] = !item['isSelect'];
97 | vm.refreshAllCards();
98 | },
99 | refreshAllCards: function () {
100 | vm.topkDocsDict = new Map(vm.topkDocs.map(i => [i.id, {
101 | 'text': i.text,
102 | 'hlchunk': [],
103 | 'renderHTML': i.text
104 | }]));
105 | vm.queryChunks.forEach(function (item, i) {
106 | if (!('isSelect' in item)) {
107 | item['isSelect'] = true;
108 | }
109 | if (item['isSelect']) {
110 | item.matches.forEach(function (r) {
111 | if (vm.topkDocsDict.has(r.parentId)) {
112 | let dist = r.scores['cosine'].value
113 | if (dist < vm.distThreshold) {
114 | // console.log(item)
115 | vm.topkDocsDict.get(r.parentId)['hlchunk'].push({
116 | 'range': r.location,
117 | 'idx': i,
118 | 'dist': dist,
119 | 'range_str': r.location[0] + ',' + r.location[1]
120 | });
121 | }
122 | if (dist < vm.sliderOptions.min) {
123 | vm.sliderOptions.min = dist.toFixed(2)
124 | }
125 | if (dist > vm.sliderOptions.max) {
126 | vm.sliderOptions.max = dist.toFixed(2)
127 | }
128 |
129 | } else {
130 | console.error(r.id);
131 | }
132 | });
133 | }
134 | });
135 | vm.topkDocsDict.forEach(function (value, key, map) {
136 | vm.topkDocsDict.get(key)['hlchunk'].sort(function (a, b) {
137 | return b['range'][0] - a['range'][0]
138 | })
139 | var replace_map = new Map();
140 | value['hlchunk'].forEach(function (item) {
141 | if (!replace_map.has(item['range_str'])) {
142 | replace_map.set(item['range_str'], [])
143 | }
144 | replace_map.get(item['range_str']).push(item)
145 |
146 | })
147 |
148 | replace_map.forEach(function (item, kk, mm) {
149 | value['renderHTML'] = replaceRange(value['renderHTML'], item[0]['range'][0], item[0]['range'][1], item)
150 | })
151 | })
152 | vm.$nextTick(function () {
153 | vm.$redrawVueMasonry('my-masonry');
154 | })
155 | },
156 | // This is where the debounce actually belongs.
157 | expensiveOperation: _.debounce(function () {
158 | this.isCalculating = true
159 | vm.selectQueryChunks.length = 0;
160 | $.ajax({
161 | url: this.serverUrl,
162 | type: "POST",
163 | contentType: "application/json",
164 | cache: false,
165 | data: JSON.stringify({
166 | "parameters": {"top_k": this.top_k},
167 | "data": [this.searchQuery]
168 | }),
169 | error: function (jqXHR, textStatus, errorThrown) {
170 | console.log(jqXHR);
171 | console.log(textStatus);
172 | console.log(errorThrown);
173 | },
174 | success: function (data) {
175 | vm.topkDocs = data.data.docs[0].matches;
176 | console.log('Number parents: ' + vm.topkDocs.length);
177 | vm.queryChunks = data.data.docs[0].chunks;
178 | console.log('Number chunks: ' + vm.queryChunks.length);
179 | vm.refreshAllCards();
180 | console.log('Success');
181 | },
182 | complete: function () {
183 | vm.isCalculating = false
184 | vm.searchQueryIsDirty = false
185 | vm.$nextTick(function () {
186 | vm.$redrawVueMasonry('my-masonry');
187 | })
188 | }
189 | });
190 |
191 | }, 500)
192 | }
193 | });
194 |
195 | function replaceRange(s, start, end, chunks) {
196 | var content = s.substring(start, end)
197 | chunks.forEach(function (c) {
198 | content = "" + content + ""
199 | })
200 | return s.substring(0, start) + content + s.substring(end);
201 | }
202 |
203 | function selectColor(number, colored) {
204 | if (!colored) {
205 | return `#fff`;
206 | }
207 | const hue = number * 137.508; // use golden angle approximation
208 | return `hsl(${hue},50%,75%)`;
209 | }
210 |
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/tests/__init__.py
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard]
3 | pytest==6.1.2
4 | kaggle==1.5.12
5 | docker
6 | git+https://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/test_flow_integration.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import shutil
6 | import glob
7 | from typing import List
8 | from click.testing import CliRunner
9 |
10 | import pytest
11 | from jina import Flow, Document
12 |
13 | from app import main
14 |
15 |
16 | def get_files_with_patterns(directory: str, match_patterns: List[str]) -> List[str]:
17 | """
18 | Returns all files from directory and subdirectories that match any of the patterns in the list.
19 | The returned list will only contain unique items.
20 |
21 | :param directory: Path to the directory
22 | :param match_patterns: A list of expressions to match the files against. E.g. `*.json`
23 | :return: List of matched files.
24 | """
25 | index_files = []
26 | for pattern in match_patterns:
27 | index_files += list(glob.glob(os.path.join(directory, '**', pattern), recursive=True))
28 | return list(set(index_files))
29 |
30 |
31 | @pytest.fixture(scope='session', autouse=True)
32 | def index(tmpdir_factory):
33 | """
34 | This fixtures runs automatically once before each test session.
35 | It indexes a small set of files into a test workspace and checks that the indexing
36 | completes correctly.
37 |
38 | Other tests can use the created workspace and test queries against it.
39 | """
40 | assert os.getcwd().endswith('multires-lyrics-search'), \
41 | "Please execute the tests from the root directory: >>> pytest tests/"
42 |
43 | workspace = os.path.join(tmpdir_factory.getbasetemp(), 'test-workspace')
44 | assert not os.path.isdir(workspace), f'Directory {workspace} exists. Please remove it before testing'
45 | os.environ['JINA_WORKSPACE'] = workspace
46 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
47 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
48 | os.environ.setdefault('JINA_PORT', str(45678))
49 |
50 | runner = CliRunner()
51 | result = runner.invoke(main, ['-t', 'index', '-n', '100'])
52 | assert result.stderr_bytes is None, f'Error messages found during indexing: {result.stderr}'
53 |
54 | assert os.path.isdir(workspace)
55 | index_files = get_files_with_patterns(workspace, ['*.bin', '*.lmdb', '*.lmdb-lock'])
56 | assert len(index_files) == 4, 'Expected four files in the workspace'
57 | for _file in index_files:
58 | assert os.path.getsize(_file) > 0, f'File {_file} is empty.'
59 |
60 | yield
61 | # shutil.rmtree(workspace) Not possible due to docker sudo rights
62 |
63 |
64 | def test_query_text(tmpdir_factory):
65 | def assert_result(response):
66 | docs = response.docs
67 | # check number of results
68 | assert len(docs) == 1
69 | assert len(docs[0].chunks) == 2
70 | parent_docs = docs[0].matches
71 | parent_ids = parent_docs.get_attributes('id')
72 | assert len(parent_docs) > 0
73 | for chunk in docs[0].chunks:
74 | assert len(chunk.matches) == 5 # top_k = 5
75 | match_ids = chunk.matches.get_attributes('id')
76 | assert len(match_ids) == len(list(set(match_ids)))
77 | for match in chunk.matches:
78 | assert match.text is not None
79 | assert match.location is not None
80 | assert match.parent_id in parent_ids
81 | assert match.text in parent_docs[parent_ids.index(match.parent_id)].text
82 |
83 | flow = Flow.load_config('flows/query.yml')
84 | with flow:
85 | search_text = 'looked through every window then. hello world.'
86 | doc = Document(content=search_text, mime_type='text/plain')
87 | response = flow.post('/search', inputs=doc, parameters={'top_k': 5}, return_results=True)
88 | assert_result(response[0])
89 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | norecursedirs = cross-modal-search/img_emb/* cross-modal-search/txt_emb/* openapi/python-flask/openapi_server/*
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/.github/images/storage.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/.gitignore:
--------------------------------------------------------------------------------
1 | workspace*
2 | env
3 | results
4 | __pycache__
5 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/README.md:
--------------------------------------------------------------------------------
1 | # Querying While Indexing in the Wikipedia Search Example
2 |
3 | | About this example: | |
4 | | ------------- | ------------- |
5 | | Learnings | How to configure Jina for querying while indexing |
6 | | Used for indexing | Text data |
7 | | Used for querying | Text data |
8 | | Dataset used | [Wikipedia dataset from kaggle](https://www.kaggle.com/mikeortman/wikipedia-sentences) |
9 | | Model used | [flair-text](https://github.com/flairNLP/flair) |
10 |
11 | This is an example of using [Jina](http://www.jina.ai) to support both querying and indexing simultaneously in our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences).
12 |
13 | ## Table of contents:
14 |
15 | * [Prerequisites](#prerequisites)
16 | * [What is querying while indexing?](#what-is-querying-while-indexing)
17 | * [Configuration changes](#configuration-changes)
18 | * [🐍 Build the app with Python](#-build-the-app-with-python)
19 | * [Flow diagrams](#flow-diagrams)
20 | * [🔮 Overview of the files](#-overview-of-the-files)
21 | * [Troubleshooting](#troubleshooting)
22 | * [⏭️ Next steps](#-next-steps)
23 | * [👩👩👧👦 Community](#-community)
24 | * [🦄 License](#-license)
25 |
26 | ## Prerequisites
27 |
28 | - Run and understand our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences)
29 |
30 | ## What is querying while indexing?
31 |
32 | Querying while indexing means you can still query your data while new data is being inserted, updated, or deleted.
33 | Jina achieves this with its dump-reload feature.
34 |
35 | ## Configuration changes
36 |
37 | This feature requires you to split the Flow into two, one for Indexing (plus Updates and Deletes) and one for Querying, and have them running at the same time.
38 | You also need to replace the indexers in the Flows.
39 | The Index Flow (also referred to as the Storage Flow) will require a [Storage Indexer](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage), while the Query Flow requires a [Compound Searcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher).
40 |
41 | In our case we use:
42 |
43 | - [LMDBStorage](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage/LMDBStorage), which uses a disk-based key-value storage [LMDB](https://lmdb.readthedocs.io/) as a storage engine.
44 | - [FaissLMDBSearcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher/compound/FaissLMDBSearcher), which uses the [`faiss`](https://github.com/facebookresearch/faiss) library to provide faster query results and LMDB to retrieve the metadata.
45 |
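Operationally, the dump-reload cycle is driven by control requests: the Storage Flow is asked to dump its data, after which the Query Flow is rolling-updated to read from the dump path (`app.py` below runs this loop). A sketch of the dump request, reusing the ports, dump path, and `target_peapod` defined in `app.py`:

```python
from jina import Client

# Ask the Storage Flow (REST port 9000, host as in __default_host__) to dump
# round 0 of the index; the Query Flow is then rolling-updated to serve
# from this path (see app.py for the full loop).
Client(host='0.0.0.0', port=9000, protocol='http').post(
    on='/dump',
    parameters={'shards': 1, 'dump_path': '/jinad_workspace/dump/0'},
    target_peapod='storage_indexer',
)
```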
46 | _____
47 |
48 | ## 🐍 Build the app with Python
49 |
50 | These instructions explain how to run the example yourself and deploy it with Python.
51 |
52 | ### 🗝️ Requirements
53 |
54 | 1. Have a working Python 3.7 or 3.8 environment.
55 | 1. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
56 | 1. Install [Docker Engine](https://docs.docker.com/engine/install/).
57 | 1. Have at least 5 GB of free space on your hard drive.
58 |
59 |
60 | ### Running the example
61 |
62 | ### 👾 Step 1. Clone the repo and install Jina
63 |
64 | Begin by cloning the repo so you can get the required files and datasets. (If you already have the examples repository on your machine make sure to fetch the most recent version)
65 |
66 | ```sh
67 | git clone https://github.com/jina-ai/examples
68 | cd examples/wikipedia-sentences-query-while-indexing
69 | ```
70 |
71 | Let's install `jina` and the other required libraries. For further information on installing jina check out [our documentation](https://docs.jina.ai/get-started/install/).
72 |
73 | ```sh
74 | pip install -r requirements.txt
75 | ```
76 |
77 | In order to run the example you will need to do the following:
78 |
79 | ### 📥 Step 2. Download your data to search (Optional)
80 |
81 | The repo includes a small subset of the Wikipedia dataset, for quick testing. You can just use that.
82 |
83 | If you want to use the entire dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant (in `app.py`) to point to that file.
84 |
85 | ### 🏃 Step 3. Running the Flows
86 |
87 | In this example, we use [JinaD](https://docs.jina.ai/advanced/daemon/#remote-management-via-jinad) to serve the two Flows (Index and Query) and listen to incoming requests.
88 |
89 | 1. Start `JinaD` server using the below command.
90 |
91 | ```bash
92 | docker run --add-host host.docker.internal:host-gateway \
93 | -v /var/run/docker.sock:/var/run/docker.sock \
94 | -v /tmp/jinad:/tmp/jinad \
95 | -p 8000:8000 \
96 | --name jinad \
97 | -d jinaai/jina:2.1.0-daemon
98 | ```
99 |
100 | 2. Run `python app.py -t flows`
101 |
102 | This will create the two Flows and then repeat the following steps every 10 seconds (each step can also be performed from any other REST client):
103 |
104 | 1. Index 5 Documents.
105 | 2. Send a `DUMP` request to the Storage (Index) Flow to dump its data to a specific location.
106 | 3. Send a `ROLLING_UPDATE` request to the Query Flow to take down its Indexers and start them again, with the new data located at the respective path.
107 |
108 | **Warning**: If you want to use the entire wikipedia dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant to point to that file.
109 |
110 | ### 🔎 Step 4: Query your data
111 |
112 | Finally, in a second terminal, run `python app.py -t client`
113 |
114 | This will prompt you for a query, send the query to the Query Flow, and then show you the results.
115 |
116 | Since the Flows use the `http` protocol, you can query the REST API with the `Client` provided with jina, or use `cURL`, `Postman`, or the [custom Swagger UI provided with jina](https://docs.jina.ai/fundamentals/practice-your-learning/#query-via-swaggerui).
117 |
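For instance, a search with the `Client` shipped with jina against the Query Flow's REST port (9001, as configured for this example) looks like this sketch, mirroring what `python app.py -t client` does:

```python
from jina import Client, Document

# Search the running Query Flow over HTTP and print the matched sentences.
client = Client(host='0.0.0.0', port=9001, protocol='http')
response = client.search(inputs=Document(text='hello world'), return_results=True)
for match in response[0].data.docs[0].matches:
    print(match.text)
```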
118 | #### Cleanup
119 |
120 | JinaD creates several containers during this process. In order to remove all the containers do the following after you are done using the example:
121 |
122 | `docker stop $(docker ps -a -q)`
123 | and
124 | `docker rm $(docker ps -a -q)`
125 |
126 | ## Flow diagrams
127 |
128 | Below you can see a graphical representation of the Flow pipeline:
129 |
130 | #### Storage Flow
131 |
132 | 
133 |
134 | #### Query Flow
135 |
136 | 
137 |
138 | Notice the following:
139 |
140 | - the encoder has the same configuration in both Flows
141 | - the Query Flow uses replicas. One replica continues to serve requests while the other is being reloaded.
142 | - the Indexer in the Query Flow is actually made up of two Indexers: one for vectors, one for Document metadata. In the Storage Flow, this data is stored in one Storage Indexer.
143 |
144 | ## 🔮 Overview of the files
145 |
146 | | File or folder | Contents |
147 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- |
148 | | 📂 `data/` | Folder where the data files are stored |
149 | | 📂 `flows/` | Folder to store Flow configuration |
150 | | --- 📃 `storage.yml` | YAML file to configure Storage (Index) Flow |
151 | | --- 📃 `query.yml` | YAML file to configure Querying Flow |
152 | | 🐍 `app.py` | Code file for the example |
153 |
154 | _________
155 |
156 | ## ⏭️ Next steps
157 |
158 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app check out [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
159 |
160 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai).
161 |
162 | ## 👩👩👧👦 Community
163 |
164 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina.
165 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities.
166 | - [Twitter](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`.
167 | - [Company](https://jina.ai) - know more about our company. We are fully committed to open-source!
168 |
169 | ## 🦄 License
170 |
171 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
172 |
173 | Jina is licensed under the Apache License, Version 2.0. See LICENSE for the full license text.
174 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import time
6 | import traceback
7 | from typing import List, Dict
8 |
9 | import click
10 | from daemon.clients import JinaDClient
11 | from jina.logging.logger import JinaLogger
12 | from jina import __default_host__, Document, DocumentArray, Client
13 |
14 | os.environ['JINA_LOG_LEVEL'] = 'DEBUG'
15 |
16 | HOST = __default_host__ # change this if you are using remote jinad
17 | JINAD_PORT = 8000 # change this if you start jinad on a different port
18 | DUMP_PATH = '/jinad_workspace/dump'  # the path to dump the storage Indexer data to (inside the JinaD workspace)
19 | SHARDS = 1 # change this if you change pods/query_indexer.yml
20 | DUMP_RELOAD_INTERVAL = 10 # time between dump - rolling update calls
21 | DATA_FILE = 'data/toy.txt' # change this if you get the full data
22 | DOCS_PER_ROUND = 5 # nr of documents to index in each round
23 | STORAGE_FLOW_YAML_FILE = 'storage.yml' # indexing Flow yaml name
24 | QUERY_FLOW_YAML_FILE = 'query.yml' # querying Flow yaml name
25 | STORAGE_REST_PORT = 9000 # REST port of storage Flow, defined in flows/storage.yml
26 | QUERY_REST_PORT = 9001 # REST port of Query Flow, defined in flows/query.yml
27 |
28 | logger = JinaLogger('jina')
29 | cur_dir = os.path.dirname(os.path.abspath(__file__))
30 | jinad_client = JinaDClient(host=HOST, port=JINAD_PORT, timeout=10 * 60)
31 |
32 |
33 | def docarray_from_file(filename):
34 | docs = []
35 | with open(filename) as f:
36 | for line in f:
37 | docs.append(Document(text=line))
38 | return DocumentArray(docs)
39 |
40 |
41 | def query_restful():
42 | while True:
43 | text = input('please type a sentence: ')
44 | if not text:
45 | break
46 |
47 | query_doc = Document()
48 | query_doc.text = text
49 | response = query_docs(query_doc)
50 | matches = response[0].data.docs[0].matches
51 | len_matches = len(matches)
52 |         logger.info(f'Ta-Dah🔮, we found {len_matches} match(es) for: "{text}":')
53 |
54 | for idx, match in enumerate(matches):
55 | score = match.scores['euclidean'].value
56 | if score < 0.0:
57 | continue
58 | logger.info(f'> {idx:>2d}({score:.2f}). {match.text}')
59 |
60 |
61 | def index_docs(docs: DocumentArray, round: int):
62 | docs_to_send = docs[round * DOCS_PER_ROUND : (round + 1) * DOCS_PER_ROUND]
63 | logger.info(f'Indexing {len(docs_to_send)} document(s)...')
64 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').index(inputs=docs_to_send)
65 |
66 |
67 | def query_docs(docs: Document):
68 | logger.info(f'Searching document {docs}...')
69 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True)
70 |
71 |
72 | def create_flows():
73 | workspace_id = jinad_client.workspaces.create(paths=[os.path.join(cur_dir, 'flows')])
74 | jinad_workspace = jinad_client.workspaces.get(workspace_id)['metadata']['workdir']
75 |
76 | logger.info('Creating storage Flow...')
77 | storage_flow_id = jinad_client.flows.create(
78 | workspace_id=workspace_id, filename=STORAGE_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace}
79 | )
80 | logger.info(f'Created successfully. Flow ID: {storage_flow_id}')
81 | logger.info('Creating Query Flow...')
82 | query_flow_id = jinad_client.flows.create(
83 | workspace_id=workspace_id, filename=QUERY_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace}
84 | )
85 | logger.info(f'Created successfully. Flow ID: {query_flow_id}')
86 | return storage_flow_id, query_flow_id, workspace_id
87 |
88 |
89 | def dump_and_roll_update(storage_flow_id: str, query_flow_id: str):
90 | docs = docarray_from_file(DATA_FILE)
91 | logger.info(f'starting dump and rolling-update process')
92 | round = 0
93 | while True:
94 | logger.info(f'round {round}:')
95 | index_docs(docs, round)
96 | current_dump_path = os.path.join(DUMP_PATH, str(round))
97 |
98 | logger.info(f'dumping...')
99 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').post(
100 | on='/dump',
101 | parameters={'shards': SHARDS, 'dump_path': current_dump_path},
102 | target_peapod='storage_indexer',
103 | )
104 |
105 | # JinaD is used for ctrl requests on Flows
106 | logger.info(f'performing rolling update across replicas...')
107 | jinad_client.flows.update(
108 | id=query_flow_id,
109 | kind='rolling_update',
110 | pod_name='query_indexer',
111 | dump_path=current_dump_path,
112 | )
113 | logger.info(f'rolling update done. sleeping for {DUMP_RELOAD_INTERVAL}secs...')
114 | time.sleep(DUMP_RELOAD_INTERVAL)
115 | round += 1
116 |
117 |
118 | def cleanup(storage_flow_id, query_flow_id, workspace_id):
119 | jinad_client.flows.delete(storage_flow_id)
120 | jinad_client.flows.delete(query_flow_id)
121 | jinad_client.workspaces.delete(workspace_id)
122 |
123 |
124 | @click.command()
125 | @click.option(
126 | '--task',
127 | '-t',
128 | type=click.Choice(['flows', 'client'], case_sensitive=False),
129 | )
130 | def main(task: str):
131 | """main entrypoint for this example"""
132 | if task == 'flows':
133 |         # start the storage Flow and the Query Flow, dump data from the former, and load it into the latter
134 |         storage_flow_id = query_flow_id = workspace_id = None
135 |         try:
136 |             storage_flow_id, query_flow_id, workspace_id = create_flows()
137 |             # loop forever:
138 |             # - index a batch of data into the storage Flow
139 |             # - ask the storage Flow (via its REST API) to dump its data to disk
140 |             # - ask JinaD to roll-update the Query Flow replicas, which load the dump
141 |             dump_and_roll_update(storage_flow_id, query_flow_id)
142 |         except (Exception, KeyboardInterrupt) as e:
143 |             logger.warning(f'Caught: {e}. Original stacktrace follows:')
144 |             logger.error(traceback.format_exc())
145 |             logger.info('Shutting down and cleaning Flows in JinaD...')
146 |             if workspace_id is not None:  # create_flows() may have failed before returning
147 |                 cleanup(storage_flow_id, query_flow_id, workspace_id)
148 |
149 | elif task == 'client':
150 | query_restful()
151 |
152 |
153 | if __name__ == '__main__':
154 | main()
155 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/flows/query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # we define the search Flow
2 | version: '1'
3 | with:
4 |   protocol: http # we use the REST API
5 |   port_expose: 9001 # the port the Flow will listen on
6 | executors: # the list of components in this Flow
7 |   - name: query_encoder # the name of this executor. It transforms the text into vectors to be used for searching
8 |     uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor docker image
9 |     timeout_ready: -1 # disable timing out (downloading the image can take some time)
10 |   - name: query_indexer # the name. This is a compound Executor, made of a vector searcher and a key-value db
11 |     uses: jinahub+docker://FaissLMDBSearcher # again, the docker image
12 |     replicas: 2 # we replicate this Executor for better availability. Requests are served by either of the two identical copies
13 |     timeout_ready: -1 # disable timing out (downloading the image can take some time)
14 |     volumes: $JINAD_WORKSPACE:/jinad_workspace # we need a workspace where the LMDB db file will be stored
15 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/flows/storage.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # we define the Flow used for storing (CRUD operations)
2 | version: '1'
3 | with:
4 |   protocol: http # we want to use the REST HTTP API
5 |   port_expose: 9000 # the port to listen on. This is referenced in `app.py`
6 | executors: # the components in this Flow
7 |   - name: storage_encoder # the name. This is the Encoder (transforms the text into vectors)
8 |     uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor from Jina Hub
9 |     timeout_ready: -1 # disable timing out on startup (downloading the image can take some time)
10 |   - name: storage_indexer # the name. This stores the data in an LMDB db
11 |     uses: jinahub+docker://LMDBStorage # again, we use a docker image
12 |     timeout_ready: -1 # disable timing out on startup (downloading the image can take some time)
13 |     volumes: $JINAD_WORKSPACE:/jinad_workspace # workspace where the db file will be stored
14 |
15 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="mikeortman/wikipedia-sentences"
3 | DATA_DIR="data"
4 | LINES=3000
5 |
6 | cd ${DATA_DIR}
7 | kaggle datasets download -d ${DATASET}
8 | unzip wikipedia-sentences.zip
9 | rm -f toy-data.txt
10 | rm -f wikipedia-sentences.zip
11 | mv wikisent2.txt input.txt
12 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/manifest.yml:
--------------------------------------------------------------------------------
1 | manifest_version: 1
2 | name: wikipedia-sentences-30k-query-while-indexing
3 | description: 'Example Jina app for searching 30,000 sentences from Wikipedia'
4 | author: Cristian Mitroi (cristian.mitroi@jina.ai)
5 | url: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing
6 | vendor: Jina AI Limited
7 | documentation: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing
8 | version: 0.1
9 | license: apache-2.0
10 | keywords: [NLP, wikipedia, text, distilbert, example, transformers]
11 | type: app
12 | kind: example
13 | avatar: None
14 | platform: "linux/amd64"
15 | update: "None"
16 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/requirements.txt:
--------------------------------------------------------------------------------
1 | jina[daemon]==2.1.0
2 | kaggle==1.5.12
3 | click==7.1.2
4 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/tests/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina.git@v2.1.0#egg=jina[daemon]
2 | click==7.1.2
3 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/test_query_while_indexing.py:
--------------------------------------------------------------------------------
1 | import time
2 | from threading import Thread
3 |
4 | from jina import Document, __default_host__, Client
5 | from daemon.clients import JinaDClient
6 | from jina.logging.logger import JinaLogger
7 |
8 | HOST = __default_host__
9 | JINAD_PORT = 8000
10 | QUERY_REST_PORT = 9001
11 | logger = JinaLogger('test')
12 |
13 |
14 | def query_docs(docs: Document):
15 | logger.info(f'Searching document {docs}...')
16 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True)
17 |
18 |
19 | def test_query_while_indexing():
20 |     from app import create_flows, dump_and_roll_update
21 |     storage_flow_id = query_flow_id = workspace_id = None  # so the cleanup in `finally` can be guarded
22 |     try:
23 |         jinad_client = JinaDClient(host=HOST, port=JINAD_PORT)
24 | assert jinad_client.alive, 'cannot reach jinad'
25 |
26 | storage_flow_id, query_flow_id, workspace_id = create_flows()
27 | # start rolling update in the background
28 | Thread(target=dump_and_roll_update, args=(storage_flow_id, query_flow_id), daemon=True).start()
29 |
30 | logger.info('sleeping for 30 secs to allow 1 round of index, dump & rolling update')
31 | time.sleep(30)
32 | query_doc = Document(text='hello world')
33 | response = query_docs(query_doc)
34 | matches = response[0].data.docs[0].matches
35 | logger.info(f'got {len(matches)} matches')
36 | assert matches
37 |
38 |     finally:
39 |         from app import cleanup
40 |
41 |         # create_flows() may have failed before returning the ids,
42 |         # in which case there is nothing to clean up
43 |         if workspace_id is not None:
44 |             cleanup(storage_flow_id, query_flow_id, workspace_id)
45 |
--------------------------------------------------------------------------------
/wikipedia-sentences/.dockerignore:
--------------------------------------------------------------------------------
1 | .dockerignore
2 | .git
3 | .github
4 | .gitignore
5 | data
6 | env
7 | get_data.sh
8 | tests
9 |
--------------------------------------------------------------------------------
/wikipedia-sentences/.github/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/.github/flow.png
--------------------------------------------------------------------------------
/wikipedia-sentences/.gitignore:
--------------------------------------------------------------------------------
1 | workspace*
2 | env
3 | results
4 | __pycache__
5 |
--------------------------------------------------------------------------------
/wikipedia-sentences/README.md:
--------------------------------------------------------------------------------
1 | # Semantic Wikipedia Search with Transformers and DistilBERT
2 |
3 | 
4 |
5 | ## Table of contents:
6 |
7 | - [Overview](#overview)
8 | - [🐍 Build the app with Python](#-build-the-app-with-python)
9 | - [🔮 Overview of the files in this example](#-overview-of-the-files-in-this-example)
10 | - [🌀 Flow diagram](#-flow-diagram)
11 | - [⏭️ Next steps, building your own app](#-next-steps-building-your-own-app)
12 | - [👩👩👧👦 Community](#-community)
13 | - [🦄 License](#-license)
14 |
15 | ## Overview
16 | | | |
17 | | ------------- | ------------- |
18 | | Summary | This showcases a semantic text search app |
19 | | Data for indexing | Wikipedia corpus |
20 | | Data for querying | A text sentence |
21 | | Dataset used | [Kaggle Wikipedia corpus](https://www.kaggle.com/mikeortman/wikipedia-sentences) |
22 | | ML model used | [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) |
23 |
24 | This example shows you how to build a simple semantic search app powered by [Jina](http://www.jina.ai)'s neural search framework. You can index and search text sentences from Wikipedia using the state-of-the-art [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) language model from the [Transformers](https://huggingface.co) library.
25 |
26 | | item | content |
27 | |--------|--------------------------------------------------|
28 | | Input | 1 text file with 1 sentence per line |
29 | | Output | *top_k* number of sentences that match input query |
30 |
31 | ## 🐍 Build the app with Python
32 |
33 | These instructions explain how to build the example yourself and run it with Python.
34 |
35 |
36 | ### 🗝️ Requirements
37 | 1. You have a working Python 3.7 or 3.8 environment.
38 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
39 | 3. You have at least 2 GB of free space on your hard drive.
40 |
41 | ### 👾 Step 1. Clone the repo and install Jina
42 |
43 |
44 | Begin by cloning the repo, so you can get the required files and datasets. If you already have the examples repository on your machine, make sure to fetch the most recent version.
45 |
46 | ```sh
47 | git clone https://github.com/jina-ai/examples
48 | cd examples/wikipedia-sentences
49 | ```
50 |
51 | In your terminal, you should now be located in the wikipedia-sentences folder. Let's install Jina and the other required Python libraries. For further information on installing Jina check out our [documentation](https://docs.jina.ai/chapters/core/setup/).
52 |
53 |
54 | ```sh
55 | pip install -r requirements.txt
56 | ```
57 | If this command runs without any error messages, you can move on to step two.
58 |
59 | ### 📥 Step 2. Download your data to search
60 |
61 | By default, a small test dataset is used for indexing. This can lead to bad search results.
62 |
63 | To index the [full dataset](https://www.kaggle.com/mikeortman/wikipedia-sentences) (around 900 MB):
64 |
65 | 1. Set up [Kaggle](https://www.kaggle.com/docs/api#getting-started-installation-&-authentication)
66 | 2. Run the script: `sh get_data.sh`
67 | 3. Index your new dataset: `python app.py -t index -d full -n $num_docs`
68 |
69 | The whole dataset contains about 8 million Wikipedia sentences, and indexing all of them takes a very long time.
70 | We therefore recommend indexing only a subset of the data; the number of sentences can be set with the `-n` flag.
71 | We recommend values smaller than 100,000: for larger indexes, the SimpleIndexer used in this example also becomes very slow at query time,
72 | and a more advanced indexer, such as a Faiss-based one, is recommended instead. The sketch below shows how the `-n` cap is applied.
73 |
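For the curious, here is a minimal sketch of how that cap works, mirroring `input_generator` in `app.py`: the sentences are shuffled and at most `num_docs` of them are turned into Documents:

```python
import random

from jina import Document


def input_generator(num_docs: int, file_path: str):
    # read all sentences, shuffle them, and yield at most `num_docs` Documents
    with open(file_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    for line in lines[:num_docs]:
        yield Document(text=line)
```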
74 | ### 🏃 Step 3. Index your data
75 |
76 | Index your data by running:
77 |
78 | ```sh
79 | python app.py -t index
80 | ```
81 | Here, we can also specify the number of documents to index with `--num_docs` / `-n` (default is 10000).
82 |
83 | ### 🔎 Step 4. Query your indexed data
84 |
85 | A search prompt will appear in your terminal after running:
86 |
87 | ```sh
88 | python app.py -t query
89 | ```
90 |
91 | See the text below for an example search query and response.
92 | You can also specify the top k search results with `--top_k` / `-k` (default is 5).
93 |
94 | ```
95 | please type a sentence: What is ROMEO
96 |
97 | Ta-Dah🔮, here's what we found for: What is ROMEO
98 | > 0(0.36). The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics.
99 |
100 | ```
101 |
102 | ## 🔮 Overview of the files in this example
103 | Here is a small overview if you're interested in understanding what each file in this example is doing.
104 |
105 | | File | Explanation |
106 | |---|---|
107 | |📂 `tests/*` | Various maintenance tests to keep the example running. |
108 | |📃 `app.py` | The code that runs the index and query Flows. |
109 | |📃 `flows/flow.yml` | YAML configuration of the Flow (encoder plus indexer). |
110 | |📃 `get_data.sh` | Downloads the Kaggle dataset. |
111 | |📃 `requirements.txt` | Contains all required Python libraries. |
112 |
113 | ## 🌀 Flow diagram
114 |
115 | This diagram provides a visual representation of the flow in this example, showing which Executors are used in which order:
116 |
117 | 
118 |
119 | As you can see, the Flow for this example is quite simple. Input Documents arrive at the gateway
120 | and are fed into the transformer, which computes an embedding from each Document's text.
121 | The Documents are then sent to the indexer, which does the following (a Python sketch of this Flow follows below):
122 | - Index time: store all the Documents, with their embeddings, on disk (in the workspace folder).
123 | - Query time: compare the query Document's embedding with all stored embeddings and return the closest matches.
124 |
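If you prefer Python over YAML, the same pipeline can also be assembled programmatically. The sketch below is roughly equivalent to loading `flows/flow.yml`; the workspace path and port are illustrative values, not taken from this example's configuration:

```python
from jina import Document, DocumentArray, Flow

# the same two-step pipeline as flows/flow.yml: encode, then index/search
flow = (
    Flow(workspace='workspace', port_expose=45678)
    .add(name='transformer', uses='jinahub+docker://TransformerTorchEncoder/v0.1')
    .add(name='indexer', uses='jinahub://SimpleIndexer/old')
)

with flow:
    # index a toy sentence, then search for it
    flow.post(on='/index', inputs=DocumentArray([Document(text='hello world')]))
    results = flow.post(on='/search',
                        inputs=DocumentArray([Document(text='hello')]),
                        parameters={'top_k': 5},
                        return_results=True)
    print(results[0].data.docs[0].matches)
```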
125 | ## ⏭️ Next steps, building your own app
126 |
127 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app, check out the [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
128 |
129 | - [Enable querying while indexing](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing)
130 |
131 | ## 👩👩👧👦 Community
132 |
133 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina
134 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities
135 | - [](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`
136 | - [Company](https://jina.ai) - learn more about our company. We are fully committed to open-source!
137 |
138 | ## 🦄 License
139 |
140 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
141 |
142 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text.
143 |
--------------------------------------------------------------------------------
/wikipedia-sentences/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 | import click
7 | import random
8 | from jina import Flow, Document, DocumentArray
9 | from jina.logging.predefined import default_logger as logger
10 |
11 | MAX_DOCS = int(os.environ.get('JINA_MAX_DOCS', 10000))
12 |
13 |
14 | def config(dataset: str):
15 | if dataset == 'toy':
16 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/toy-input.txt')
17 | elif dataset == 'full':
18 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/input.txt')
19 | os.environ['JINA_PORT'] = os.environ.get('JINA_PORT', str(45678))
20 | cur_dir = os.path.dirname(os.path.abspath(__file__))
21 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
22 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
23 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
24 |
25 |
26 | def print_topk(resp, sentence):
27 | for doc in resp.data.docs:
28 | print(f"\n\n\nTa-Dah🔮, here's what we found for: {sentence}")
29 | for idx, match in enumerate(doc.matches):
30 | score = match.scores['cosine'].value
31 | print(f'> {idx:>2d}({score:.2f}). {match.text}')
32 |
33 |
34 | def input_generator(num_docs: int, file_path: str):
35 | with open(file_path) as file:
36 | lines = file.readlines()
37 | num_lines = len(lines)
38 | random.shuffle(lines)
39 | for i in range(min(num_docs, num_lines)):
40 | yield Document(text=lines[i])
41 |
42 |
43 | def index(num_docs):
44 | flow = Flow().load_config('flows/flow.yml')
45 | data_path = os.path.join(os.path.dirname(__file__), os.environ.get('JINA_DATA_FILE', None))
46 | with flow:
47 | flow.post(on='/index', inputs=input_generator(num_docs, data_path),
48 | show_progress=True)
49 |
50 |
51 | def query(top_k):
52 | flow = Flow().load_config('flows/flow.yml')
53 | with flow:
54 | text = input('Please type a sentence: ')
55 | doc = Document(content=text)
56 |
57 |         result = flow.post(on='/search',
58 |                            inputs=DocumentArray([doc]),
59 |                            parameters={'top_k': top_k},
60 |                            return_results=True,
61 |                            )
62 | print_topk(result[0], text)
63 |
64 |
65 | @click.command()
66 | @click.option(
67 | '--task',
68 | '-t',
69 | type=click.Choice(['index', 'query'], case_sensitive=False),
70 | )
71 | @click.option('--num_docs', '-n', default=MAX_DOCS)
72 | @click.option('--top_k', '-k', default=5)
73 | @click.option('--dataset', '-d', type=click.Choice(['toy', 'full']), default='toy')
74 | def main(task, num_docs, top_k, dataset):
75 | config(dataset)
76 | if task == 'index':
77 | if os.path.exists(os.environ.get("JINA_WORKSPACE")):
78 | logger.error(f'\n +---------------------------------------------------------------------------------+ \
79 | \n | 🤖🤖🤖 | \
80 | \n | The directory {os.environ.get("JINA_WORKSPACE")} already exists. Please remove it before indexing again. | \
81 | \n | 🤖🤖🤖 | \
82 | \n +---------------------------------------------------------------------------------+')
83 | sys.exit(1)
84 | index(num_docs)
85 | elif task == 'query':
86 | query(top_k)
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
/wikipedia-sentences/data/toy-input.txt:
--------------------------------------------------------------------------------
1 | The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics.
2 | Once derided as corporate raiders, shareholder activists are now the recipients of admiration for sparking change in corporate boardrooms, leading to corporate boards developing best practices for responding to shareholder activism.
3 | Slc22a21 belongs to a protein family of solute carriers.
4 | Ajrara gharana or Ajrada gharana is one of the six main traditional schools in tabla drum.
5 | A few female specimens were found in a forest stream inside the shola forest.
6 | Ziggeo is the initiator and backer of BetaJS, an open-source framework.
7 | The three sports of aquatics were held at Aquatic Center in Sport Authority of Thailand Sport Complex, Bangkok, Thailand.
8 | Sugandha is the fourth generation of her family into singing and belongs to the Indore gharana.
9 | WYES is the only independently owned public television station in Louisiana as it is not part of Louisiana Public Broadcasting, which owns all of the PBS member stations in the state that are located outside of New Orleans, and maintains a programming agreement with and partial ownership of the city's independent public television station, WLAE-TV (channel 32).
10 | The reef divides the strait into the Apo East Pass and the Apo West Pass.
11 | His novel The Book of Evidence was shortlisted for the Booker Prize and won the Guinness Peat Aviation award in 1989.
12 | Andrea Kremer (born February 25, 1959 in Philadelphia, Pennsylvania) is a multi-Emmy Award Winning American television sports journalist.
13 | The book was the first published novel by O'Grady, with an initial print run of 6,000 hardback copies.
14 | After Alice performs several "miracle" cures in front of the tree, and claims to have seen the Virgin Mary there, it starts to be treated as a Lourdes-like shrine by Catholic pilgrims.
15 | Tovar is no longer involved with smuggling but acts as a consultant to Goldenvoice, which now operates the Coachella Valley Music and Arts Festival that has been compared to the Glastonbury Festival and is the most profitable music festival in the US.
16 | Tiwari worked as a producer with NDTV from 1996-2003.
17 | It is the home arena for SaPKo of the Mestis hockey league the second top league in Finland behind Liiga.
18 | As of the 2011 apportionment, the district includes the Middlesex County municipalities of East Brunswick Township, Edison Township, Helmetta Borough, Highland Park Borough, Metuchen Borough, South Plainfield Borough and South River Borough.
19 | Lembosiella is a genus of fungi in the Microthyriaceae family; according to the 2007 Outline of Ascomycota, the placement in this family is uncertain.
20 | Later, he resigned from his teaching profession in Jan 2013 and became a full time lyricist, dialog writer and part time researcher in Karky Research Foundation.
21 | It is used in Intel Core microarchitecture based DP-capable server processors, the Dual-Core Xeon is codenamed Dempsey, Woodcrest, and Wolfdale and the Quad-Core processors Clovertown, Harpertown.
22 | The 35th Annual TV Week Logie Awards was held on Friday 19 March 1993 at the Grand Hyatt in Melbourne, and broadcast on Network Ten.
23 | Daund Patas Road railway station is a small railway station in Pune district, Maharashtra.
24 | Evagjelia Veli (born 16 July 1991) is an Albanian weightlifter.
25 | It was published in two volumes that appeared a decade apart.
26 | He is now professor of medicine (biotechnology in public health) at the University of Bergen and chairs the Faculty Council, Faculty of Medicine, Norwegian University of Science and Technology.
27 | Erin McGathy (born December 5, 1985) is an American podcast host, artist, and comedian.
28 | The song is the second single from their debut mini album First Invasion and it was released as a digital single on August 4, 2010.
29 | Chandru who makes his debut in direction after assisting few Tamil films.
30 | Mitchum got the tune for the song from a Norwegian folk-dance (Gammel Reinlender) song his mother used to sing to him.
31 | The shell of the No 69 grenade was composed entirely of the hard plastic, Bakelite, which shattered without producing fragments like a metal bodied grenade.
32 | It was released on 24 January 2014.
33 | Stafford Loans are available both as subsidized and unsubsidized loans.
34 | On the World Wide Web, a query string is the part of a uniform resource locator (URL) containing data that does not fit conveniently into a hierarchical path structure.
35 | They have bar eyes, bare metasternum, bare metapisternum, the anterior anepisternum is usually pillose.
36 | In 1904, the mayoral term was changed to two years.
37 | It was earlier known as Central Mall but underwent renovations and some parts were re-organized in 2017 and was re-branded and re-launced on 26 February 2018.
38 | Jang Young-sik (born 1935) is a South Korean economist.
39 | It is a medium-sized damselfly with a short stout body, it is black with blue markings, and has long dark wings with pterostigma.
40 | The first desegregated hotel casino, it was popular with many of the black entertainers of the time, who would entertain at the other hotels and casinos and stay at the Moulin Rouge.
41 | In February 2009, it was revealed that the site was projected onto a wall at The Daily Telegraph to allow journalists there to view breaking news posted by users to Twitter.
42 | His most recent novel in this series, The Bangkok Asset, was published on 4 August 2015.
43 | He served as the 24th Governor of Nevada from 1979 to 1983.
44 | The soils which range from acid to alkaline and front wet to dry gives rise to a diverse woodland structure.
45 | Their land was taken back by the Spanish Crown; and then irretrievably lost however, when California became part of the United States.
46 | With annual billings of $220 million, Tombras is one of the top 25 largest independent national advertising agencies.
47 | The couple intended to retire to China and purchased a property in Canton; however the Communist victory in 1949 changed their plans and in 1950 the couple sold the vineyard and moved to Blockhouse Bay, Auckland.
48 | "Super Scooter Happy" was covered by Kyary Pamyu Pamyu on her 2013 album, Nanda Collection.
49 | Filling four CD-ROMs, Final Fantasy IX featured a cast containing a variety of major and minor characters.
50 | The album was produced by Billy Harvey, and featured contributions by Rafael Gayol and the Tosca String Quartet.
51 |
--------------------------------------------------------------------------------
/wikipedia-sentences/flows/flow.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # This file defines the Flow (both index and query) for the wikipedia sentences example
2 | version: '1' # This is the yml file version
3 | with: # Additional arguments for the Flow
4 |   workspace: $JINA_WORKSPACE # Workspace folder path
5 |   port_expose: $JINA_PORT # Network port for the Flow
6 | executors: # Now, define the Executors that run on this Flow
7 |   - name: transformer # This Executor computes an embedding based on the input text Documents
8 |     uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a Transformer Torch Encoder from the hub as a docker container
9 |   - name: indexer # Now, index the text Documents with the embeddings
10 |     uses: 'jinahub://SimpleIndexer/old' # We use the SimpleIndexer for this purpose
--------------------------------------------------------------------------------
/wikipedia-sentences/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="mikeortman/wikipedia-sentences"
3 | DATA_DIR="data"
4 | LINES=3000
5 |
6 |
7 |
8 | cd ${DATA_DIR}
9 | kaggle datasets download -d ${DATASET}
10 | unzip wikipedia-sentences.zip
11 | rm -f toy-data.txt
12 | rm -f wikipedia-sentences.zip
13 | mv wikisent2.txt input.txt
14 |
--------------------------------------------------------------------------------
/wikipedia-sentences/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | jina[standard]==2.0.18
3 | git+git://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/tests/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==6.2.4
2 | click==8.0.1
3 | jina[standard]==2.0.18
4 | git+git://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/test_wikipediasearch.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 | from click.testing import CliRunner
7 |
8 | sys.path.append('..')
9 | from app import main
10 |
11 |
12 | def config(tmpdir):
13 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace')
14 |
15 |
16 | def test_wikipedia_sentences(tmpdir):
17 | config(tmpdir)
18 | runner = CliRunner()
19 | result = runner.invoke(main, ['-t', 'index'])
20 | assert "done in" in result.stdout
21 | assert result.stderr_bytes is None
22 | result = runner.invoke(main, ['-t', 'query'])
23 | print(result.stdout)
24 | assert result.stderr_bytes is None
25 |
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/toy-input.txt:
--------------------------------------------------------------------------------
1 | ../data/toy-input.txt
--------------------------------------------------------------------------------