example. It downloads AudioSet data and indexes 1,000 10-second audio clips via the Jina search framework. We then randomly sample query clips (roughly five seconds each) extracted from the indexed audio and ask Jina to retrieve relevant results. Below are Jina's retrievals, where the left-most column is the query audio; a minimal usage sketch follows the demo.html fragment below.
25 |
26 | Intrigued? Learn more about Jina and check out our GitHub!
27 |
28 |
29 |
30 |
Precision@{% TOP_K %}: {% PRECISION_EVALUATION %}
31 |
32 |
33 |
34 |
Query
Top-K Results
{% RESULT %}
35 |
36 |
37 |
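For reference, a minimal way to drive this example end to end, mirroring what the test suite below does. It assumes app.py in this example directory exposes the click `cli` with `index` and `search` commands and the `-s`/`-e`/`-n` options exercised in tests/test_audio_to_audio_search.py; app.py itself is not shown in this excerpt.

    from click.testing import CliRunner
    from app import cli  # app.py ships with the example but is not part of this excerpt

    runner = CliRunner()
    # Index with the time segmenter and the VGGish encoder, then search with 3 query clips.
    runner.invoke(cli, ['index', '-s', 'time', '-e', 'vgg'])
    runner.invoke(cli, ['search', '-s', 'time', '-e', 'vgg', '-n', 3])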
--------------------------------------------------------------------------------
/audio-to-audio-search/executors.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Tuple, Dict, Optional
3 |
4 | import torch
5 | import numpy as np
6 | import librosa as lr
7 | import torchaudio
8 | from jina import Executor, DocumentArray, requests, Document
9 | from jina_commons import get_logger
10 |
11 | from vggish.vggish_input import waveform_to_examples
12 | from vggish.vggish_params import SAMPLE_RATE
13 | from jina.excepts import BadDocType
14 |
15 | class Wav2MelCrafter(Executor):
16 | def __init__(self, *args, **kwargs):
17 | super().__init__(*args, **kwargs)
18 | self.logger = get_logger(self)
19 |
20 | @requests
21 | def segment(self, docs: Optional[DocumentArray] = None, **kwargs):
22 | if not docs:
23 | return
24 | for doc in docs:
25 | result_chunk = []
26 | for chunk in doc.chunks:
27 | mel_data = waveform_to_examples(chunk.blob, chunk.tags['sample_rate'])
28 | if mel_data.ndim != 3:
29 | self.logger.warning(
30 | f'failed to convert from wave to mel, chunk.blob: {chunk.blob.shape}, sample_rate: {SAMPLE_RATE}'
31 | )
32 | continue
33 | if mel_data.shape[0] <= 0:
34 | self.logger.warning(
35 | f'chunk between {chunk.location} is skipped because its duration is too short'
36 | )
37 | if mel_data.ndim == 2:
38 | mel_data = np.atleast_3d(mel_data)
39 | mel_data = mel_data.reshape(1, mel_data.shape[0], mel_data.shape[1])
40 | chunk.blob = mel_data
41 | if mel_data.size > 0:
42 | result_chunk.append(chunk)
43 | doc.chunks = result_chunk
44 |
45 |
46 | class TimeSegmenter(Executor):
47 | def __init__(self, chunk_duration: int = 10, chunk_strip: int = 1, *args, **kwargs):
48 | super().__init__(*args, **kwargs)
49 | self.chunk_duration = chunk_duration # seconds
50 | self.strip = chunk_strip
51 |
52 | @requests(on=['/search', '/index'])
53 | def segment(
54 | self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
55 | ):
56 | if not docs:
57 | return
58 | for idx, doc in enumerate(docs):
59 | doc.blob, sample_rate = self._load_raw_audio(doc)
60 | doc.tags['sample_rate'] = sample_rate
61 | chunk_size = int(self.chunk_duration * sample_rate)
62 | strip = parameters.get('chunk_strip', self.strip)
63 | strip_size = int(strip * sample_rate)
64 | num_chunks = max(1, int((doc.blob.shape[0] - chunk_size) / strip_size))
65 | for chunk_id in range(num_chunks):
66 | beg = chunk_id * strip_size
67 | end = beg + chunk_size
68 | if beg > doc.blob.shape[0]:
69 | break
70 | doc.chunks.append(
71 | Document(
72 | blob=doc.blob[beg:end],
73 | offset=idx,
74 | location=[beg, end],
75 | tags=doc.tags,
76 | )
77 | )
78 |
79 | def _load_raw_audio(self, doc: Document) -> Tuple[np.ndarray, int]:
80 | if doc.blob is not None and doc.tags.get('sample_rate', None) is None:
81 | raise BadDocType('data is blob but sample rate is not provided')
82 | elif doc.blob is not None:
83 | return doc.blob, int(doc.tags['sample_rate'])
84 | elif doc.uri is not None and doc.uri.endswith('.mp3'):
85 | return self._read_mp3(doc.uri)
86 | elif doc.uri is not None and doc.uri.endswith('.wav'):
87 | return self._read_wav(doc.uri)
88 | else:
89 | raise BadDocType('doc needs to have either a blob or a wav/mp3 uri')
90 |
91 | def _read_wav(self, file_path: str) -> Tuple[np.ndarray, int]:
92 | data, sample_rate = torchaudio.load(file_path)
93 | data = np.mean(data.cpu().numpy(), axis=0)
94 | return data, sample_rate
95 |
96 | def _read_mp3(self, file_path: str) -> Tuple[np.ndarray, int]:
97 | return lr.load(file_path)
98 |
99 |
100 | class DebugExecutor(Executor):
101 | @requests
102 | def debug(self, docs: Optional[DocumentArray] = None, **kwargs):
103 | logger = get_logger(self)
104 | if not docs:
105 | return
106 | for i, doc in enumerate(docs):
107 | for match in doc.matches:
108 | logger.info(f"doc {doc.tags['file']} match: ", match.tags['file'])
109 |
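To see how TimeSegmenter slices a waveform, here is a small self-contained sketch, assuming jina 2.x is installed and this module is importable as `executors`. A 12-second blob at 16 kHz with the default 1-second stride yields two overlapping 10-second chunks:

    import numpy as np
    from jina import Document, DocumentArray
    from executors import TimeSegmenter  # module name is an assumption of this sketch

    # 12 seconds of silence at 16 kHz, passed in as a raw blob
    doc = Document(blob=np.zeros(12 * 16000, dtype=np.float32),
                   tags={'sample_rate': 16000})
    segmenter = TimeSegmenter(chunk_duration=10, chunk_strip=1)
    segmenter.segment(DocumentArray([doc]))
    for chunk in doc.chunks:
        print(chunk.location)  # [0, 160000] then [16000, 176000]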
--------------------------------------------------------------------------------
/audio-to-audio-search/helper.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 | from pathlib import Path
5 | import re
6 | import random
7 | import shutil
8 | import subprocess
9 | from typing import List, Optional
10 |
11 | from prettytable import PrettyTable
12 | from jina import Document, DocumentArray
13 | from jina.types.request import Response
14 | import webbrowser
15 |
16 |
17 | ID_LEN = 11
18 |
19 |
20 | def get_logger():
21 | """
22 | Method to get logger.
23 | """
24 | logger = logging.getLogger('app')
25 | logger.setLevel(logging.INFO)
26 | formatter = logging.Formatter('%(message)s')
27 | sh = logging.StreamHandler()
28 | sh.setFormatter(formatter)
29 | sh.setLevel(logging.INFO)
30 | logger.addHandler(sh)
31 | return logger
32 |
33 |
34 | logger = get_logger()
35 |
36 |
37 | def create_docs(filefolder_path: str):
38 | """
39 | Method to create Jina documents.
40 |
41 | :param filefolder_path: glob pattern matching the audio files
42 | """
43 | docs = []
44 | import librosa as lr
45 |
46 | logger.info('Creating docs..')
47 | for file_path in sorted(glob.glob(filefolder_path)):
48 | id = os.path.basename(file_path).split('.')[0]
49 | blob, sample_rate = lr.load(file_path)
50 | docs.append(
51 | Document(
52 | id=id, blob=blob, tags={'file': file_path, 'sample_rate': sample_rate}
53 | )
54 | )
55 | logger.info('docs created')
56 | return DocumentArray(docs)
57 |
58 |
59 | def create_query_audios(num_docs: int, data_folder: Path):
60 | """
61 | Method to create query audio clips.
62 |
63 | :param num_docs: number of query docs
64 | :param data_folder: path to data folder
65 | """
66 | input_docs_folder = data_folder / 'index'
67 | output_docs_folder = data_folder / 'query'
68 | if output_docs_folder.is_dir():
69 | shutil.rmtree(output_docs_folder)
70 | output_docs_folder.mkdir()
71 | input_docs_filenames = glob.glob(str(input_docs_folder / '*.mp3'))
72 |
73 | if len(input_docs_filenames) < num_docs:
74 | raise FileNotFoundError(
75 | 'cannot find sufficient '
76 | f'index audio clips. Number of index audio clips found: {len(input_docs_filenames)}, '
77 | f'number of requested query docs: {num_docs}'
78 | )
79 |
80 | for input_file in random.sample(input_docs_filenames, k=num_docs):
81 | id = re.match(r'index_(.*)\.mp3', os.path.basename(input_file))[1][-ID_LEN:]
82 | output_file = f"query_{id}.mp3"
83 | start_time = random.random() * 5
84 | end_time = start_time + random.random() * 4 + 3
85 | cmd = [
86 | 'ffmpeg',
87 | '-i',
88 | os.path.abspath(input_file),
89 | '-ss',
90 | str(start_time),
91 | '-to',
92 | str(end_time),
93 | '-async',
94 | '1',
95 | output_file,
96 | ]
97 | subprocess.call(cmd, cwd=str(output_docs_folder))
98 |
99 |
100 | def report_results(responses: List[Response], threshold: Optional[float], top_k: int):
101 | """
102 | Method to report results
103 |
104 | :param responses: returned responses with data
105 | :param threshold: threshold for search
106 | :param top_k: top k number
107 | """
108 | pred_list = []
109 | table = PrettyTable()
110 | table.field_names = ['target', 'prediction', 'is_correct']
111 | result_html = []
112 | for i, response in enumerate(responses):
113 | for j, doc in enumerate(response.docs):
114 | if not doc.matches:
115 | continue
116 | match = doc.matches[0]
117 | target_result = os.path.basename(doc.tags["file"]).split('.')[0][-ID_LEN:]
118 | pred_result = os.path.basename(match.tags["file"]).split('.')[0][-ID_LEN:]
119 | pred_result = (
120 | pred_result
121 | if threshold is None or 1 - match.scores['cosine'].value > threshold
122 | else 'None'
123 | )
124 | table.add_row([target_result, pred_result, target_result == pred_result])
125 | pred_list.append(target_result == pred_result)
126 |
127 | query_html = f"""
128 |
129 |
132 | """
133 | seen = set()
134 | result_html.append(f'<tr>{query_html}')
135 | logger.debug(f'number of matches: {len(doc.matches)}')
136 | for k, match in enumerate(doc.matches):
137 | if len(seen) >= top_k:
138 | break
139 | if match.tags['file'] in seen:
140 | continue
141 | seen.add(match.tags['file'])
142 | match_html = f"""
143 |
144 |
147 | """
148 | result_html.append(match_html)
149 | result_html.append('</tr>\n')
150 |
151 | logger.info(table)
152 |
153 | if not pred_list:
154 | return [], float('nan')
155 |
156 | accuracy = sum(pred_list) / len(pred_list)
157 | logger.info(f'accuracy: {accuracy}')
158 | return result_html, accuracy
159 |
160 |
161 | def write_html(html_path: str, result_html: List[str], accuracy: float, top_k: int):
162 | """
163 | Method to present results in browser.
164 |
165 | :param html_path: path of the written html
166 | :param result_html: list of HTML fragments to be inserted into the template
167 | :param accuracy: accuracy of search
168 | :param top_k: top k number
169 | """
170 | with open(
171 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 'demo.html')
172 | ) as fp, open(html_path, 'w') as fw:
173 | t = fp.read()
174 | t = t.replace('{% RESULT %}', '\n'.join(result_html))
175 | t = t.replace(
176 | '{% PRECISION_EVALUATION %}',
177 | '{:.2f}%'.format(accuracy * 100.0),
178 | )
179 | t = t.replace('{% TOP_K %}', str(top_k))
180 | fw.write(t)
181 |
182 | url_html_path = 'file://' + os.path.abspath(html_path)
183 |
184 | try:
185 | webbrowser.open(url_html_path, new=2)
186 | except Exception:
187 | pass # intentional pass, browser support isn't cross-platform
188 | finally:
189 | logger.info(
190 | f'You should see a "demo.html" opened in your browser, '
191 | f'if not you may open {url_html_path} manually'
192 | )
193 |
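The acceptance rule in report_results compares cosine *similarity* (1 minus the stored cosine distance) against the threshold. A tiny illustration with made-up numbers:

    # a match counts only when its cosine similarity (1 - distance) beats the threshold
    cosine_distance = 0.15   # as stored in match.scores['cosine'].value
    threshold = 0.8          # passed in by the caller; None disables the check
    keep = threshold is None or 1 - cosine_distance > threshold
    print(keep)              # True, since similarity 0.85 > 0.8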
--------------------------------------------------------------------------------
/audio-to-audio-search/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina-commons@v0.0.3
2 | click
3 | jina~=2.0
4 | numpy==1.20.0
5 | soundfile==0.10.3.post1
6 | librosa==0.8.0
7 | visdom==0.1.8.9
8 | ffmpeg
9 | torchaudio
10 | prettytable
11 |
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/__init__.py
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-Bu7YaslRW0.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-D--GWwca0g.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_-nlkWWphiaM.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0bRUkLsttto.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/audio-to-audio-search/tests/data/mp3/index/index_0slyl34xWug.mp3
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina-commons@v0.0.3
2 | click
3 | pytest~=6.1.2
4 | jina~=2.0
5 | numpy==1.20.0
6 | soundfile==0.10.3.post1
7 | librosa==0.8.0
8 | visdom==0.1.8.9
9 | ffmpeg
10 | torchaudio
11 | prettytable
12 |
--------------------------------------------------------------------------------
/audio-to-audio-search/tests/test_audio_to_audio_search.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | import pytest
6 | from click.testing import CliRunner
7 | from app import cli
8 | from pathlib import Path
9 |
10 |
11 | @pytest.mark.parametrize('segmenter', ['vad', 'time'])
12 | @pytest.mark.parametrize('encoder', ['vgg', 'clip'])
13 | def test_exec(tmp_path, segmenter, encoder):
14 | assert os.getcwd().endswith(
15 | 'audio-to-audio-search'
16 | ), "Please execute the tests from the root directory: >>> pytest tests/"
17 | os.environ['JINA_DATA_FILE'] = os.path.join('tests', 'data', 'mp3')
18 | workspace = os.environ['JINA_WORKSPACE'] = os.path.join(tmp_path, 'workspace')
19 | os.environ['JINA_WORKSPACE_MOUNT'] = f'{workspace}:/workspace/workspace'
20 | runner = CliRunner()
21 | _test_index(runner, workspace, segmenter, encoder)
22 | _test_query(runner, segmenter, encoder)
23 |
24 |
25 | def _test_index(runner, workspace, segmenter, encoder):
26 | result = runner.invoke(cli, ['index', '-s', segmenter, '-e', encoder])
27 | assert result.exception is None
28 | assert result.exit_code == 0
29 | assert Path(workspace).is_dir()
30 | assert (
31 | len(set(glob.glob(os.path.join(workspace, '**', '*.bin'), recursive=True))) == 2
32 | )
33 |
34 |
35 | def _test_query(runner, segmenter, encoder):
36 | # test error case: query more docs than indexed
37 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 10])
38 |
39 | with pytest.raises(
40 | FileNotFoundError,
41 | match='cannot find sufficient index audio clips. '
42 | 'Number of index audio clips found: 5, number of requested query docs: 10',
43 | ):
44 | assert result.exception is not None
45 | raise result.exception
46 |
47 | assert result.exit_code != 0
48 | result = runner.invoke(cli, ['search', '-s', segmenter, '-e', encoder, '-n', 3])
49 | assert result.exception is None
50 | assert result.exit_code == 0
51 |
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/mel_features.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Defines routines to compute mel spectrogram features from audio waveform."""
20 |
21 | import numpy as np
22 |
23 |
24 | def frame(data, window_length, hop_length):
25 | """Convert array into a sequence of successive possibly overlapping frames.
26 |
27 | An n-dimensional array of shape (num_samples, ...) is converted into an
28 | (n+1)-D array of shape (num_frames, window_length, ...), where each frame
29 | starts hop_length points after the preceding one.
30 |
31 | This is accomplished using stride_tricks, so the original data is not
32 | copied. However, there is no zero-padding, so any incomplete frames at the
33 | end are not included.
34 |
35 | Args:
36 | data: np.array of dimension N >= 1.
37 | window_length: Number of samples in each frame.
38 | hop_length: Advance (in samples) between each window.
39 |
40 | Returns:
41 | (N+1)-D np.array with as many rows as there are complete frames that can be
42 | extracted.
43 | """
44 | num_samples = data.shape[0]
45 | num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
46 | shape = (num_frames, window_length) + data.shape[1:]
47 | strides = (data.strides[0] * hop_length,) + data.strides
48 | return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
49 |
50 |
51 | def periodic_hann(window_length):
52 | """Calculate a "periodic" Hann window.
53 |
54 | The classic Hann window is defined as a raised cosine that starts and
55 | ends on zero, and where every value appears twice, except the middle
56 | point for an odd-length window. Matlab calls this a "symmetric" window
57 | and np.hanning() returns it. However, for Fourier analysis, this
58 | actually represents just over one cycle of a period N-1 cosine, and
59 | thus is not compactly expressed on a length-N Fourier basis. Instead,
60 | it's better to use a raised cosine that ends just before the final
61 | zero value - i.e. a complete cycle of a period-N cosine. Matlab
62 | calls this a "periodic" window. This routine calculates it.
63 |
64 | Args:
65 | window_length: The number of points in the returned window.
66 |
67 | Returns:
68 | A 1D np.array containing the periodic hann window.
69 | """
70 | return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
71 | np.arange(window_length)))
72 |
73 |
74 | def stft_magnitude(signal, fft_length,
75 | hop_length=None,
76 | window_length=None):
77 | """Calculate the short-time Fourier transform magnitude.
78 |
79 | Args:
80 | signal: 1D np.array of the input time-domain signal.
81 | fft_length: Size of the FFT to apply.
82 | hop_length: Advance (in samples) between each frame passed to FFT.
83 | window_length: Length of each block of samples to pass to FFT.
84 |
85 | Returns:
86 | 2D np.array where each row contains the magnitudes of the fft_length/2+1
87 | unique values of the FFT for the corresponding frame of input samples.
88 | """
89 | frames = frame(signal, window_length, hop_length)
90 | # Apply frame window to each frame. We use a periodic Hann (cosine of period
91 | # window_length) instead of the symmetric Hann of np.hanning (period
92 | # window_length-1).
93 | window = periodic_hann(window_length)
94 | windowed_frames = frames * window
95 | return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
96 |
97 |
98 | # Mel spectrum constants and functions.
99 | _MEL_BREAK_FREQUENCY_HERTZ = 700.0
100 | _MEL_HIGH_FREQUENCY_Q = 1127.0
101 |
102 |
103 | def hertz_to_mel(frequencies_hertz):
104 | """Convert frequencies to mel scale using HTK formula.
105 |
106 | Args:
107 | frequencies_hertz: Scalar or np.array of frequencies in hertz.
108 |
109 | Returns:
110 | Object of same size as frequencies_hertz containing corresponding values
111 | on the mel scale.
112 | """
113 | return _MEL_HIGH_FREQUENCY_Q * np.log(
114 | 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
115 |
116 |
117 | def spectrogram_to_mel_matrix(num_mel_bins=20,
118 | num_spectrogram_bins=129,
119 | audio_sample_rate=8000,
120 | lower_edge_hertz=125.0,
121 | upper_edge_hertz=3800.0):
122 | """Return a matrix that can post-multiply spectrogram rows to make mel.
123 |
124 | Returns a np.array matrix A that can be used to post-multiply a matrix S of
125 | spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
126 | "mel spectrogram" M of frames x num_mel_bins. M = S A.
127 |
128 | The classic HTK algorithm exploits the complementarity of adjacent mel bands
129 | to multiply each FFT bin by only one mel weight, then add it, with positive
130 | and negative signs, to the two adjacent mel bands to which that bin
131 | contributes. Here, by expressing this operation as a matrix multiply, we go
132 | from num_fft multiplies per frame (plus around 2*num_fft adds) to around
133 | num_fft^2 multiplies and adds. However, because these are all presumably
134 | accomplished in a single call to np.dot(), it's not clear which approach is
135 | faster in Python. The matrix multiplication has the attraction of being more
136 | general and flexible, and much easier to read.
137 |
138 | Args:
139 | num_mel_bins: How many bands in the resulting mel spectrum. This is
140 | the number of columns in the output matrix.
141 | num_spectrogram_bins: How many bins there are in the source spectrogram
142 | data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
143 | only contains the nonredundant FFT bins.
144 | audio_sample_rate: Samples per second of the audio at the input to the
145 | spectrogram. We need this to figure out the actual frequencies for
146 | each spectrogram bin, which dictates how they are mapped into mel.
147 | lower_edge_hertz: Lower bound on the frequencies to be included in the mel
148 | spectrum. This corresponds to the lower edge of the lowest triangular
149 | band.
150 | upper_edge_hertz: The desired top edge of the highest frequency band.
151 |
152 | Returns:
153 | An np.array with shape (num_spectrogram_bins, num_mel_bins).
154 |
155 | Raises:
156 | ValueError: if frequency edges are incorrectly ordered or out of range.
157 | """
158 | nyquist_hertz = audio_sample_rate / 2.
159 | if lower_edge_hertz < 0.0:
160 | raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
161 | if lower_edge_hertz >= upper_edge_hertz:
162 | raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
163 | (lower_edge_hertz, upper_edge_hertz))
164 | if upper_edge_hertz > nyquist_hertz:
165 | raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
166 | (upper_edge_hertz, nyquist_hertz))
167 | spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
168 | spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
169 | # The i'th mel band (starting from i=1) has center frequency
170 | # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
171 | # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
172 | # the band_edges_mel arrays.
173 | band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
174 | hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
175 | # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
176 | # of spectrogram values.
177 | mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
178 | for i in range(num_mel_bins):
179 | lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
180 | # Calculate lower and upper slopes for every spectrogram bin.
181 | # Line segments are linear in the *mel* domain, not hertz.
182 | lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
183 | (center_mel - lower_edge_mel))
184 | upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
185 | (upper_edge_mel - center_mel))
186 | # .. then intersect them with each other and zero.
187 | mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
188 | upper_slope))
189 | # HTK excludes the spectrogram DC bin; make sure it always gets a zero
190 | # coefficient.
191 | mel_weights_matrix[0, :] = 0.0
192 | return mel_weights_matrix
193 |
194 |
195 | def log_mel_spectrogram(data,
196 | audio_sample_rate=8000,
197 | log_offset=0.0,
198 | window_length_secs=0.025,
199 | hop_length_secs=0.010,
200 | **kwargs):
201 | """Convert waveform to a log magnitude mel-frequency spectrogram.
202 |
203 | Args:
204 | data: 1D np.array of waveform data.
205 | audio_sample_rate: The sampling rate of data.
206 | log_offset: Add this to values when taking log to avoid -Infs.
207 | window_length_secs: Duration of each window to analyze.
208 | hop_length_secs: Advance between successive analysis windows.
209 | **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
210 |
211 | Returns:
212 | 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
213 | magnitudes for successive frames.
214 | """
215 | window_length_samples = int(round(audio_sample_rate * window_length_secs))
216 | hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
217 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
218 | spectrogram = stft_magnitude(
219 | data,
220 | fft_length=fft_length,
221 | hop_length=hop_length_samples,
222 | window_length=window_length_samples)
223 | mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
224 | num_spectrogram_bins=spectrogram.shape[1],
225 | audio_sample_rate=audio_sample_rate, **kwargs))
226 | return np.log(mel_spectrogram + log_offset)
227 |
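To make frame()'s stride-tricks windowing concrete, a quick check with a toy array (assuming the module is importable as vggish.mel_features, as the example's own imports suggest):

    import numpy as np
    from vggish.mel_features import frame

    data = np.arange(10)
    print(frame(data, window_length=4, hop_length=2))
    # [[0 1 2 3]
    #  [2 3 4 5]
    #  [4 5 6 7]
    #  [6 7 8 9]]
    # num_frames = 1 + floor((10 - 4) / 2) = 4; the trailing samples that
    # do not fill a complete window would simply be dropped.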
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_input.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Compute input examples for VGGish from audio waveform."""
20 |
21 | import resampy
22 |
23 |
24 | from vggish.mel_features import *
25 | from vggish.vggish_params import *
26 | import librosa
27 |
28 | try:
29 | import soundfile as sf
30 |
31 |
32 | def wav_read(wav_file):
33 | wav_data, sr = sf.read(wav_file, dtype='int16')
34 | return wav_data, sr
35 |
36 | except ImportError:
37 |
38 | def wav_read(wav_file):
39 | raise NotImplementedError('WAV file reading requires soundfile package.')
40 |
41 |
42 | def waveform_to_examples(data, sample_rate):
43 | """Converts audio waveform into an array of examples for VGGish.
44 |
45 | Args:
46 | data: np.array of either one dimension (mono) or two dimensions
47 | (multi-channel, with the outer dimension representing channels).
48 | Each sample is generally expected to lie in the range [-1.0, +1.0],
49 | although this is not required.
50 | sample_rate: Sample rate of data.
51 |
52 | Returns:
53 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents
54 | a sequence of examples, each of which contains a patch of log mel
55 | spectrogram, covering num_frames frames of audio and num_bands mel frequency
56 | bands, where the frame length is STFT_HOP_LENGTH_SECONDS.
57 | """
58 | # Convert to mono.
59 | if len(data.shape) > 1:
60 | data = np.mean(data, axis=1)
61 | # Resample to the rate assumed by VGGish.
62 | if sample_rate != SAMPLE_RATE:
63 | data = resampy.resample(data, sample_rate, SAMPLE_RATE)
64 |
65 | # Compute log mel spectrogram features.
66 | log_mel = log_mel_spectrogram(
67 | data,
68 | audio_sample_rate=SAMPLE_RATE,
69 | log_offset=LOG_OFFSET,
70 | window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
71 | hop_length_secs=STFT_HOP_LENGTH_SECONDS,
72 | num_mel_bins=NUM_MEL_BINS,
73 | lower_edge_hertz=MEL_MIN_HZ,
74 | upper_edge_hertz=MEL_MAX_HZ)
75 |
76 | # Frame features into examples.
77 | features_sample_rate = 1.0 / STFT_HOP_LENGTH_SECONDS
78 | example_window_length = int(round(
79 | EXAMPLE_WINDOW_SECONDS * features_sample_rate))
80 | example_hop_length = int(round(
81 | EXAMPLE_HOP_SECONDS * features_sample_rate))
82 | log_mel_examples = frame(
83 | log_mel,
84 | window_length=example_window_length,
85 | hop_length=example_hop_length)
86 | return log_mel_examples
87 |
88 |
89 | def wavfile_to_examples(wav_file):
90 | """Convenience wrapper around waveform_to_examples() for a common WAV format.
91 |
92 | Args:
93 | wav_file: String path to a file, or a file-like object. The file
94 | is assumed to contain WAV audio data with signed 16-bit PCM samples.
95 |
96 | Returns:
97 | See waveform_to_examples.
98 | """
99 | wav_data, sr = wav_read(wav_file)
100 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
101 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0]
102 | return waveform_to_examples(samples, sr)
103 |
104 |
105 | def mp3_to_examples(mp3_file):
106 | """Convenience wrapper around waveform_to_examples() for a common mp3 format.
107 |
108 | Args:
109 | mp3_file: String path to a file, or a file-like object. The file
110 | is assumed to contain mp3 audio data.
111 |
112 | Returns:
113 | See waveform_to_examples.
114 | """
115 | x_data, sr = librosa.load(mp3_file)
116 | # librosa.load already returns float32 samples in [-1.0, +1.0],
117 | # so no int16 -> float conversion is needed here (cf. wavfile_to_examples).
118 | return waveform_to_examples(x_data, sr)
119 |
120 |
121 |
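Given the constants in vggish_params.py below (SAMPLE_RATE = 16000, 25 ms windows, 10 ms hops, 64 mel bins, 0.96 s examples), one second of audio should produce exactly one VGGish patch. A small sketch, assuming the package layout used by this example:

    import numpy as np
    from vggish.vggish_input import waveform_to_examples

    one_second = np.random.uniform(-1.0, 1.0, 16000)  # mono, already at 16 kHz
    examples = waveform_to_examples(one_second, sample_rate=16000)
    print(examples.shape)  # (1, 96, 64): one 0.96 s patch of 96 frames x 64 mel bands
    # 1 + floor((16000 - 400) / 160) = 98 spectrogram frames, and
    # 1 + floor((98 - 96) / 96) = 1 example window of 96 frames.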
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_params.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Global parameters for the VGGish model.
20 |
21 | See vggish_slim.py for more information.
22 | """
23 |
24 | # Architectural constants.
25 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch.
26 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.
27 | EMBEDDING_SIZE = 128 # Size of embedding layer.
28 |
29 | # Hyperparameters used in feature and example generation.
30 | SAMPLE_RATE = 16000
31 | STFT_WINDOW_LENGTH_SECONDS = 0.025
32 | STFT_HOP_LENGTH_SECONDS = 0.010
33 | NUM_MEL_BINS = NUM_BANDS
34 | MEL_MIN_HZ = 125
35 | MEL_MAX_HZ = 7500
36 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.
37 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames
38 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap.
39 |
40 | # Parameters used for embedding postprocessing.
41 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
42 | PCA_MEANS_NAME = 'pca_means'
43 | QUANTIZE_MIN_VAL = -2.0
44 | QUANTIZE_MAX_VAL = +2.0
45 |
46 | # Hyperparameters used in training.
47 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights.
48 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.
49 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.
50 |
51 | # Names of ops, tensors, and features.
52 | INPUT_OP_NAME = 'vggish/input_features'
53 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
54 | OUTPUT_OP_NAME = 'vggish/embedding'
55 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
56 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'
57 |
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_postprocess.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Post-process embeddings from VGGish."""
20 |
21 | import numpy as np
22 |
23 | from vggish.vggish_params import *
24 |
25 |
26 | class Postprocessor(object):
27 | """Post-processes VGGish embeddings.
28 |
29 | The initial release of AudioSet included 128-D VGGish embeddings for each
30 | segment of AudioSet. These released embeddings were produced by applying
31 | a PCA transformation (technically, a whitening transform is included as well)
32 | and 8-bit quantization to the raw embedding output from VGGish, in order to
33 | stay compatible with the YouTube-8M project which provides visual embeddings
34 | in the same format for a large set of YouTube videos. This class implements
35 | the same PCA (with whitening) and quantization transformations.
36 | """
37 |
38 | def __init__(self, pca_params_npz_path):
39 | """Constructs a postprocessor.
40 |
41 | Args:
42 | pca_params_npz_path: Path to a NumPy-format .npz file that
43 | contains the PCA parameters used in postprocessing.
44 | """
45 | params = np.load(pca_params_npz_path)
46 | self._pca_matrix = params[PCA_EIGEN_VECTORS_NAME]
47 | # Load means into a column vector for easier broadcasting later.
48 | self._pca_means = params[PCA_MEANS_NAME].reshape(-1, 1)
49 | assert self._pca_matrix.shape == (
50 | EMBEDDING_SIZE, EMBEDDING_SIZE), (
51 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
52 | assert self._pca_means.shape == (EMBEDDING_SIZE, 1), (
53 | 'Bad PCA means shape: %r' % (self._pca_means.shape,))
54 |
55 | def postprocess(self, embeddings_batch):
56 | """Applies postprocessing to a batch of embeddings.
57 |
58 | Args:
59 | embeddings_batch: An nparray of shape [batch_size, embedding_size]
60 | containing output from the embedding layer of VGGish.
61 |
62 | Returns:
63 | An nparray of the same shape as the input but of type uint8,
64 | containing the PCA-transformed and quantized version of the input.
65 | """
66 | assert len(embeddings_batch.shape) == 2, (
67 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
68 | assert embeddings_batch.shape[1] == EMBEDDING_SIZE, (
69 | 'Bad batch shape: %r' % (embeddings_batch.shape,))
70 |
71 | # Apply PCA.
72 | # - Embeddings come in as [batch_size, embedding_size].
73 | # - Transpose to [embedding_size, batch_size].
74 | # - Subtract pca_means column vector from each column.
75 | # - Premultiply by PCA matrix of shape [output_dims, input_dims]
76 | # where both are are equal to embedding_size in our case.
77 | # - Transpose result back to [batch_size, embedding_size].
78 | pca_applied = np.dot(self._pca_matrix,
79 | (embeddings_batch.T - self._pca_means)).T
80 |
81 | # Quantize by:
82 | # - clipping to [min, max] range
83 | clipped_embeddings = np.clip(
84 | pca_applied, QUANTIZE_MIN_VAL,
85 | QUANTIZE_MAX_VAL)
86 | # - convert to 8-bit in range [0.0, 255.0]
87 | quantized_embeddings = (
88 | (clipped_embeddings - QUANTIZE_MIN_VAL) *
89 | (255.0 /
90 | (QUANTIZE_MAX_VAL - QUANTIZE_MIN_VAL)))
91 | # - cast 8-bit float to uint8
92 | quantized_embeddings = quantized_embeddings.astype(np.uint8)
93 |
94 | return quantized_embeddings
95 |
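The quantization step above maps the clipped range [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL] = [-2, 2] onto [0, 255]. A stand-alone sketch with illustrative values:

    import numpy as np

    # clip to [-2, 2], rescale to [0, 255], cast to uint8 (as in Postprocessor.postprocess)
    x = np.array([-3.0, -2.0, 0.0, 2.0])
    clipped = np.clip(x, -2.0, 2.0)
    quantized = ((clipped + 2.0) * (255.0 / 4.0)).astype(np.uint8)
    print(quantized)  # [  0   0 127 255] -- note 0.0 lands on 127 after truncation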
--------------------------------------------------------------------------------
/audio-to-audio-search/vggish/vggish_slim.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | # Copyright 2017 The TensorFlow Authors All Rights Reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | # ==============================================================================
18 |
19 | """Defines the 'VGGish' model used to generate AudioSet embedding features.
20 |
21 | The public AudioSet release (https://research.google.com/audioset/download.html)
22 | includes 128-D features extracted from the embedding layer of a VGG-like model
23 | that was trained on a large Google-internal YouTube dataset. Here we provide
24 | a TF-Slim definition of the same model, without any dependences on libraries
25 | internal to Google. We call it 'VGGish'.
26 |
27 | Note that we only define the model up to the embedding layer, which is the
28 | penultimate layer before the final classifier layer. We also provide various
29 | hyperparameter values (in vggish_params.py) that were used to train this model
30 | internally.
31 |
32 | For comparison, here is TF-Slim's VGG definition:
33 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py
34 | """
35 |
36 | import tensorflow.compat.v1 as tf
37 | import tf_slim as slim
38 |
39 | from vggish.vggish_params import *
40 |
41 |
42 | def define_vggish_slim(features_tensor=None, training=False):
43 | """Defines the VGGish TensorFlow model.
44 |
45 | All ops are created in the current default graph, under the scope 'vggish/'.
46 |
47 | The input is either a tensor passed in via the optional 'features_tensor'
48 | argument or a placeholder created below named 'vggish/input_features'. The
49 | input is expected to have dtype float32 and shape [batch_size, num_frames,
50 | num_bands] where batch_size is variable and num_frames and num_bands are
51 | constants, and [num_frames, num_bands] represents a log-mel-scale spectrogram
52 | patch covering num_bands frequency bands and num_frames time frames (where
53 | each frame step is usually 10ms). This is produced by computing the stabilized
54 | log(mel-spectrogram + LOG_OFFSET). The output is a tensor named
55 | 'vggish/embedding' which produces the pre-activation values of a 128-D
56 | embedding layer, which is usually the penultimate layer when used as part of a
57 | full model with a final classifier layer.
58 |
59 | Args:
60 | features_tensor: If not None, the tensor containing the input features.
61 | If None, a placeholder input is created.
62 | training: If true, all parameters are marked trainable.
63 |
64 | Returns:
65 | The op 'vggish/embeddings'.
66 | """
67 | # Defaults:
68 | # - All weights are initialized to N(0, INIT_STDDEV).
69 | # - All biases are initialized to 0.
70 | # - All activations are ReLU.
71 | # - All convolutions are 3x3 with stride 1 and SAME padding.
72 | # - All max-pools are 2x2 with stride 2 and SAME padding.
73 | with slim.arg_scope([slim.conv2d, slim.fully_connected],
74 | weights_initializer=tf.truncated_normal_initializer(
75 | stddev=INIT_STDDEV),
76 | biases_initializer=tf.zeros_initializer(),
77 | activation_fn=tf.nn.relu,
78 | trainable=training), \
79 | slim.arg_scope([slim.conv2d],
80 | kernel_size=[3, 3], stride=1, padding='SAME'), \
81 | slim.arg_scope([slim.max_pool2d],
82 | kernel_size=[2, 2], stride=2, padding='SAME'), \
83 | tf.variable_scope('vggish'):
84 | # Input: a batch of 2-D log-mel-spectrogram patches.
85 | if features_tensor is None:
86 | features_tensor = tf.placeholder(
87 | tf.float32, shape=(None, NUM_FRAMES, NUM_BANDS),
88 | name='input_features')
89 | # Reshape to 4-D so that we can convolve a batch with conv2d().
90 | net = tf.reshape(features_tensor,
91 | [-1, NUM_FRAMES, NUM_BANDS, 1])
92 |
93 | # The VGG stack of alternating convolutions and max-pools.
94 | net = slim.conv2d(net, 64, scope='conv1')
95 | net = slim.max_pool2d(net, scope='pool1')
96 | net = slim.conv2d(net, 128, scope='conv2')
97 | net = slim.max_pool2d(net, scope='pool2')
98 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
99 | net = slim.max_pool2d(net, scope='pool3')
100 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
101 | net = slim.max_pool2d(net, scope='pool4')
102 |
103 | # Flatten before entering fully-connected layers
104 | net = slim.flatten(net)
105 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
106 | # The embedding layer.
107 | net = slim.fully_connected(net, EMBEDDING_SIZE, scope='fc2',
108 | activation_fn=None)
109 | return tf.identity(net, name='embedding')
110 |
111 |
112 | def load_vggish_slim_checkpoint(session, checkpoint_path):
113 | """Loads a pre-trained VGGish-compatible checkpoint.
114 |
115 | This function can be used as an initialization function (referred to as
116 | init_fn in TensorFlow documentation) which is called in a Session after
117 | initializating all variables. When used as an init_fn, this will load
118 | a pre-trained checkpoint that is compatible with the VGGish model
119 | definition. Only variables defined by VGGish will be loaded.
120 |
121 | Args:
122 | session: an active TensorFlow session.
123 | checkpoint_path: path to a file containing a checkpoint that is
124 | compatible with the VGGish model definition.
125 | """
126 | # Get the list of names of all VGGish variables that exist in
127 | # the checkpoint (i.e., all inference-mode VGGish variables).
128 | with tf.Graph().as_default():
129 | define_vggish_slim(training=False)
130 | vggish_var_names = [v.name for v in tf.global_variables()]
131 |
132 | # Get the list of all currently existing variables that match
133 | # the list of variable names we just computed.
134 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names]
135 |
136 | # Use a Saver to restore just the variables selected above.
137 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained',
138 | write_version=1)
139 | saver.restore(session, checkpoint_path)
140 |
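A minimal sketch of wiring these two functions together, assuming the TF1-compat setup this module already uses; 'vggish_model.ckpt' stands in for the published VGGish checkpoint and its local path is an assumption here:

    import tensorflow.compat.v1 as tf
    from vggish import vggish_slim

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        # path to the pre-trained checkpoint is an assumption of this sketch
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        # The graph now exposes 'vggish/input_features:0' and 'vggish/embedding:0',
        # matching INPUT_TENSOR_NAME and OUTPUT_TENSOR_NAME in vggish_params.py.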
--------------------------------------------------------------------------------
/cross-modal-search/.dockerignore:
--------------------------------------------------------------------------------
1 | workspace
2 | venv
3 | .venv
4 |
--------------------------------------------------------------------------------
/cross-modal-search/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/__init__.py
--------------------------------------------------------------------------------
/cross-modal-search/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 |
7 | import click
8 | from jina import Flow, Document, DocumentArray
9 | import logging
10 | import matplotlib.pyplot as plt
11 |
12 | from dataset import input_index_data
13 |
14 | MAX_DOCS = int(os.environ.get("JINA_MAX_DOCS", 10000))
15 | cur_dir = os.path.dirname(os.path.abspath(__file__))
16 | DEFAULT_QUERY_IMAGE = 'toy-data/images/1000268201_693b08cb0e.jpg'
17 | DEFAULT_QUERY_TEXT = 'a black dog and a spotted dog are fighting'
18 |
19 |
20 | def config():
21 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
22 | os.environ.setdefault(
23 | 'JINA_WORKSPACE_MOUNT',
24 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
25 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO')
26 | os.environ.setdefault('JINA_PORT', str(45678))
27 |
28 |
29 | def index_restful():
30 | flow = Flow().load_config('flows/flow-index.yml', override_with={'protocol': 'http'})
31 | with flow:
32 | flow.block()
33 |
34 |
35 | def check_query_result(text_doc, image_doc, img_uri):
36 | # Image doc matches are text:
37 | print(f'Searching with image {img_uri}. Matches:')
38 | if image_doc.matches:
39 | for m in image_doc.matches:
40 | print(
41 | f'\t-- text: "{m.text}" '
42 | f'score: {m.scores["cosine"].value:.4f},'
43 | )
44 |
45 | # Text doc matches are images
46 | print(f'\nSearching with text "{text_doc.text}". Matches:')
47 | if text_doc.matches:
48 | f, axarr = plt.subplots(1, len(text_doc.matches))
49 |
50 | for i, m in enumerate(text_doc.matches):
51 | axarr[i].title.set_text(f'score={m.scores["cosine"].value:.4f}')
52 | axarr[i].imshow(m.blob)
53 | axarr[i].axes.xaxis.set_visible(False)
54 | axarr[i].axes.yaxis.set_visible(False)
55 | plt.suptitle(f"Best matches for '{text_doc.text}'")
56 | plt.show()
57 |
58 |
59 | def index(data_set, num_docs, request_size):
60 | flow = Flow().load_config('flows/flow-index.yml')
61 | with flow:
62 | flow.post(on='/index',
63 | inputs=input_index_data(num_docs, request_size, data_set),
64 | request_size=request_size,
65 | show_progress=True)
66 |
67 |
68 | def query(query_image, query_text):
69 | flow = Flow().load_config('flows/flow-query.yml')
70 | with flow:
71 | img_uri = query_image
72 | text_doc = Document(text=query_text,
73 | modality='text')
74 | image_doc = Document(uri=img_uri,
75 | modality='image')
76 | import time
77 | start = time.time()
78 | result_text = flow.post(on='/search', inputs=text_doc,
79 | return_results=True)
80 | result_image = flow.post(on='/search', inputs=image_doc,
81 | return_results=True)
82 | print(f'Request duration: {time.time() - start}')
83 | check_query_result(result_text[0].docs[0], result_image[0].docs[0], img_uri)
84 |
85 |
86 |
87 | def query_restful():
88 | flow = Flow(cors=True).load_config('flows/flow-query.yml')
89 | flow.rest_api = True
90 | flow.protocol = 'http'
91 | with flow:
92 | flow.block()
93 |
94 |
95 | @click.command()
96 | @click.option('--task', '-t', type=click.Choice(['index', 'index_restful', 'query_restful', 'query']), default='index')
97 | @click.option("--num_docs", "-n", default=MAX_DOCS)
98 | @click.option('--request_size', '-s', default=16)
99 | @click.option('--data_set', '-d', type=click.Choice(['f30k', 'f8k', 'toy-data'], case_sensitive=False), default='toy-data')
100 | @click.option('--query-image', '-i', type=str, default=DEFAULT_QUERY_IMAGE)
101 | @click.option('--query-text', '-q', type=str, default=DEFAULT_QUERY_TEXT)
102 | def main(task, num_docs, request_size, data_set, query_image, query_text):
103 | config()
104 | workspace = os.environ['JINA_WORKSPACE']
105 | logger = logging.getLogger('cross-modal-search')
106 | if 'index' in task:
107 | if os.path.exists(workspace):
108 | logger.error(
109 | f'\n +------------------------------------------------------------------------------------+ \
110 | \n | 🤖🤖🤖 | \
111 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \
112 | \n | 🤖🤖🤖 | \
113 | \n +------------------------------------------------------------------------------------+'
114 | )
115 | sys.exit(1)
116 | if 'query' in task:
117 | if not os.path.exists(workspace):
118 | logger.error(f'The directory {workspace} does not exist. Please index first via `python app.py -t index`')
119 | sys.exit(1)
120 |
121 | if task == 'index':
122 | index(data_set, num_docs, request_size)
123 | elif task == 'index_restful':
124 | index_restful()
125 | elif task == 'query':
126 | query(query_image, query_text)
127 | elif task == 'query_restful':
128 | query_restful()
129 |
130 |
131 | if __name__ == '__main__':
132 | main()
133 |
--------------------------------------------------------------------------------
/cross-modal-search/dataset.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 |
5 | import os
6 | import json as jsonmod
7 | import hashlib
8 |
9 | import torch
10 | import torch.utils.data as data
11 | from jina import Document
12 |
13 |
14 | cur_dir = os.path.dirname(os.path.abspath(__file__))
15 |
16 |
17 | class Flickr30kDataset(data.Dataset):
18 | """
19 | Dataset loader for Flickr30k full datasets.
20 | """
21 |
22 | def __init__(self, images_root, json, split):
23 | self.images_root = images_root
24 | self.dataset = jsonmod.load(open(json, 'r'))['images']
25 | self.ids = []
26 | for i, d in enumerate(self.dataset):
27 | if d['split'] == split:
28 | self.ids += [(i, x) for x in range(len(d['sentences']))]
29 |
30 | def __getitem__(self, index):
31 | """This function returns a tuple that is further passed to collate_fn
32 | """
33 | images_root = self.images_root
34 | ann_id = self.ids[index]
35 | img_id = ann_id[0]
36 | caption = self.dataset[img_id]['sentences'][ann_id[1]]['raw']
37 | img_file_name = self.dataset[img_id]['filename']
38 |
39 | image_file_path = os.path.join(images_root, img_file_name)
40 | with open(image_file_path, 'rb') as fp:
41 | image_buffer = fp.read()
42 | return image_buffer, str(caption).lower()
43 |
44 | def __len__(self):
45 | return len(self.ids)
46 |
47 |
48 | class FlickrDataset(data.Dataset):
49 | """
50 | Dataset loader for Flickr8k full datasets.
51 | """
52 |
53 | def __init__(self, images_root, captions_file_path):
54 | self.images_root = images_root
55 | self.captions_file_path = captions_file_path
56 | with open(self.captions_file_path, 'r') as cf:
57 | self.lines = cf.readlines()[1:]
58 |
59 | def __getitem__(self, index):
60 | """This function returns a tuple that is further passed to collate_fn
61 | """
62 | image_file_name, caption = self.lines[index*5].split(',', 1)
63 | with open(os.path.join(self.images_root, image_file_name), 'rb') as fp:
64 | image_buffer = fp.read()
65 | return image_buffer, str(caption).lower().rstrip()
66 |
67 | def __len__(self):
68 | return int(len(self.lines)/5)
69 |
70 |
71 | def collate_fn(data):
72 | # Not sure this is actually needed
73 | images, captions = zip(*data)
74 | return images, captions
75 |
76 |
77 | def get_data_loader(split, root, captions, batch_size=8, dataset_type='f30k', shuffle=False,
78 | num_workers=1, collate_fn=collate_fn):
79 | """Returns torch.utils.data.DataLoader for custom coco dataset."""
80 |
81 | if dataset_type == 'f30k':
82 | dataset = Flickr30kDataset(images_root=root, split=split, json=captions)
83 | elif dataset_type == 'f8k' or dataset_type == 'toy-data':
84 | dataset = FlickrDataset(images_root=root, captions_file_path=captions)
85 | else:
86 | raise NotImplementedError(f'Invalid dataset type: {dataset_type}')
87 | # Data loader
88 | data_loader = torch.utils.data.DataLoader(dataset=dataset,
89 | batch_size=batch_size,
90 | shuffle=shuffle,
91 | pin_memory=True,
92 | num_workers=num_workers,
93 | collate_fn=collate_fn)
94 |
95 | return data_loader
96 |
97 |
98 | def input_index_data(num_docs=None, batch_size=8, dataset_type='f30k'):
99 | captions = 'dataset_flickr30k.json' if dataset_type == 'f30k' else 'captions.txt'
100 | if dataset_type == 'toy-data':
101 | base_folder = '.'
102 | else:
103 | base_folder = 'data'
104 | data_loader = get_data_loader(
105 | root=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/images'),
106 | captions=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/{captions}'),
107 | split='test',
108 | batch_size=batch_size,
109 | dataset_type=dataset_type
110 | )
111 |
112 | for i, (images, captions) in enumerate(data_loader):
113 | for image, caption in zip(images, captions):
114 | hashed = hashlib.sha1(image).hexdigest()
115 | document_img = Document()
116 |
117 | document_img.buffer = image
118 | document_img.modality = 'image'
119 | document_img.mime_type = 'image/jpeg'
120 |
121 | document_caption = Document(id=hashed)
122 |
123 | document_caption.text = caption
124 | document_caption.modality = 'text'
125 | document_caption.mime_type = 'text/plain'
126 | document_caption.tags['id'] = caption
127 |
128 | yield document_img
129 | yield document_caption
130 |
131 | if num_docs and (i + 1) * batch_size >= num_docs:
132 | break
133 |
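To see the alternating image/caption Documents this generator yields, a short inspection sketch; it assumes the toy-data folder bundled with the example (toy-data/images plus toy-data/captions.txt, as used by app.py) is present next to dataset.py:

    from dataset import input_index_data

    # Print modality and MIME type of the first four Documents.
    for i, doc in enumerate(input_index_data(num_docs=4, batch_size=2,
                                             dataset_type='toy-data')):
        print(doc.modality, doc.mime_type)  # image image/jpeg, text text/plain, ...
        if i >= 3:
            break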
--------------------------------------------------------------------------------
/cross-modal-search/flows/executors.py:
--------------------------------------------------------------------------------
1 | """ Implementation of filters for images and texts"""
2 |
3 | import numpy as np
4 | from jina import Executor, DocumentArray, requests
5 |
6 |
7 | class ImageReader(Executor):
8 | @requests(on='/index')
9 | def index_read(self, docs: 'DocumentArray', **kwargs):
10 | array = DocumentArray(list(filter(lambda doc: doc.modality == 'image', docs)))
11 | for doc in array:
12 | doc.convert_image_buffer_to_blob()
13 | doc.blob = np.array(doc.blob).astype(np.uint8)
14 | return array
15 |
16 | @requests(on='/search')
17 | def search_read(self, docs: 'DocumentArray', **kwargs):
18 | image_docs = DocumentArray(list(filter(lambda doc: doc.mime_type in ('image/jpeg', 'image/png'), docs)))
19 | if not image_docs:
20 | return DocumentArray([])
21 | for doc in image_docs:
22 | doc.convert_uri_to_buffer()
23 | doc.convert_image_buffer_to_blob()
24 | doc.blob = doc.blob.astype(np.uint8)
25 | return image_docs
26 |
27 |
28 | class TextFilter(Executor):
29 | @requests
30 | def filter_text(self, docs: 'DocumentArray', **kwargs):
31 | docs = DocumentArray(list(filter(lambda doc: doc.mime_type == 'text/plain', docs)))
32 | return docs
33 |
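These filters split one mixed request into modality-specific paths. A minimal check of TextFilter, assuming jina 2.x and that this file is importable as flows.executors (it is loaded via py_modules in the flow YAML below):

    from jina import Document, DocumentArray
    from flows.executors import TextFilter  # import path is an assumption of this sketch

    docs = DocumentArray([
        Document(text='a caption', mime_type='text/plain'),
        Document(mime_type='image/jpeg'),
    ])
    print(len(TextFilter().filter_text(docs)))  # 1: the image document is filtered out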
--------------------------------------------------------------------------------
/cross-modal-search/flows/flow-index.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # We configure the index flow here that is used for indexing images and captions
2 | version: '1' # yml version
3 | with: # Parameters for the flow are defined after with
4 | prefetch: 10 # Number of prefetched requests from the client
5 | port_expose: $JINA_PORT # Port defined in environment variable
6 | workspace: $JINA_WORKSPACE # Workspace folder
7 | pods: # Now, we define the pods that are used
8 | - name: image_loader # The first executor is an image loader that filters only image documents
9 | uses: ImageReader # Type of the executor
10 | py_modules: 'flows/executors.py' # The python file where the executor is implemented
11 | read_only: true # Executor does not modify files
12 | needs: gateway # Executor is after the gateway, this means at the start of the flow
13 | - name: image_encoder # After the images are read, compute their embedding in the encoder
14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1' # The type of the executor - here, we use a hub executor from the jinahub in the form of a docker container
15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount a volume into the executor
16 | timeout_ready: 600000 # Set a timeout for the executor
17 | read_only: true # Executor does not modify files
18 | needs: image_loader # This executor is located after the image loader in the flow
19 | - name: image_indexer # Executor that stores image embeddings
20 | uses: 'jinahub://SimpleIndexer/old' # Hub Executor - We use a SimpleIndexer here
21 | uses_with: # Define arguments for the SimpleIndexer
22 | index_file_name: 'image_index' # Folder path for this executor
23 | needs: image_encoder # This executor is after the image encoder in the flow
24 | - name: text_filter # Now, we define another path in the flow that is parallel in the execution
25 | uses: TextFilter # The first executor is a filter that filters all text documents and ignores images now
26 | py_modules: 'flows/executors.py' # File where the TextFilter is implemented
27 | needs: gateway # Start after the gateway, so at the beginning of the flow - this creates a second path in the flow
28 | - name: text_encoder # Create the next executor that computes embeddings for the text documents
29 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1' # Use a hub executor in docker
30 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface # Mount the models directory
31 | timeout_ready: 600000 # Set timeout
32 | read_only: true # Executor does not modify files
33 | needs: text_filter # Run this executor after the text filter
34 | - name: text_indexer # Finally, store the indexed text documents with embeddings on disk
35 | uses: 'jinahub://SimpleIndexer/old' # Use SimpleIndexer from hub in docker again
36 | uses_with: # Define parameters for the text indexer
37 | index_file_name: 'text_index' # Folder name in the workspace
38 | needs: text_encoder # Start after the text encoder executor is finished
39 | - name: join_all # This is the last executor - it waits until both paths in the flow are finished (image and text path)
40 | needs: [image_indexer, text_indexer] # Wait for these two executors to finish - only then we can continue
41 |
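Taken together, the comments above describe an index Flow with two parallel branches, one per modality, joined by `join_all`. A minimal driver for it might look like the sketch below; the environment defaults and the one-Document `toy_inputs` generator are illustrative stand-ins for the repo's own app and dataset loader:

```python
import os

from jina import Flow, Document

# The Flow above reads these from the environment (values here are arbitrary).
os.environ.setdefault('JINA_PORT', '45678')
os.environ.setdefault('JINA_WORKSPACE', './workspace')


def toy_inputs():
    # Stand-in for the dataset loader shown earlier.
    yield Document(text='A child in a pink dress', modality='text', mime_type='text/plain')


flow = Flow.load_config('flows/flow-index.yml')
with flow:
    flow.post(on='/index', inputs=toy_inputs(), request_size=10, show_progress=True)
```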
--------------------------------------------------------------------------------
/cross-modal-search/flows/flow-query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # This file defines the query flow which is used for searching in the indexed documents
2 | version: '1' # The query flow is very similar to the index flow - only the differences are explained here
3 | with:
4 | prefetch: 10
5 | port_expose: $JINA_PORT
6 | workspace: $JINA_WORKSPACE
7 | pods:
8 | - name: loader # Again, we start two paths in the flow - here we start the image path
9 | uses: ImageReader
10 | py_modules: 'flows/executors.py'
11 | read_only: true
12 | needs: [gateway]
13 | - name: image_encoder # Now, encode the images and compute the embeddings
14 | uses: 'jinahub+docker://CLIPImageEncoder/v0.1'
15 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface
16 | timeout_ready: 600000
17 | read_only: true
18 | needs: loader
19 | - name: text_indexer # Now, we use the text indexer in the image path - This is how we achieve the cross-modality here
20 | uses: 'jinahub://SimpleIndexer/old' # The text indexer has indexed all text documents and stored them on disk.
21 | uses_with: # Then we return the closest matches as results
22 | index_file_name: 'text_index'
23 | needs: image_encoder
24 | force: True
25 | read_only: true
26 | - name: text_filter # Here, the text path starts
27 | uses: TextFilter
28 | py_modules: 'flows/executors.py'
29 | needs: [gateway]
30 | - name: text_encoder # Compute the embedding of the search text
31 | uses: 'jinahub+docker://CLIPTextEncoder/v0.1'
32 | volumes: $HOME/.cache/huggingface:/root/.cache/huggingface
33 | timeout_ready: 600000
34 | read_only: true
35 | needs: text_filter
36 | - name: image_indexer # Now, we use the image indexer in the text path - this is again how we get cross-modality
37 | uses: 'jinahub://SimpleIndexer/old' # The image indexer has indexed all images and their embeddings
38 | uses_with:
39 | index_file_name: 'image_index'
40 | force: True
41 | read_only: true
42 | needs: text_encoder
43 | - name: join_all # Wait for both paths to finish and join the results
44 | needs: [image_indexer, text_indexer]
45 |
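Note how the branches are crossed at query time: the image branch ends in `text_indexer` and the text branch ends in `image_indexer`, so a text query returns image matches and vice versa. A hedged sketch of a text-to-image query against this Flow (environment defaults are again arbitrary):

```python
import os

from jina import Flow, Document

os.environ.setdefault('JINA_PORT', '45678')
os.environ.setdefault('JINA_WORKSPACE', './workspace')

flow = Flow.load_config('flows/flow-query.yml')
with flow:
    # The text query is encoded by CLIP and matched against the *image* index.
    response = flow.post(
        on='/search',
        inputs=Document(text='a black dog', mime_type='text/plain'),
        return_results=True,
    )
    for match in response[0].data.docs[0].matches:
        print(match.uri, match.scores)
```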
--------------------------------------------------------------------------------
/cross-modal-search/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="adityajn105/flickr8k"
3 | DATA_DIR="data/f8k"
4 |
5 | if [ -d ${DATA_DIR} ]; then
6 | echo "${DATA_DIR} exists, please remove it before running the script"
7 | exit 1
8 | fi
9 |
10 | mkdir -p ${DATA_DIR} && \
11 | kaggle datasets download -d ${DATASET} && \
12 | unzip -q flickr8k.zip && \
13 | rm flickr8k.zip && \
14 | mv Images data/f8k/images && \
15 | mv captions.txt data/f8k/captions.txt
--------------------------------------------------------------------------------
/cross-modal-search/get_data30k.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | pip install kaggle
3 | kaggle datasets download hsankesara/flickr-image-dataset && \
4 | unzip flickr-image-dataset.zip && \
5 | rm flickr-image-dataset.zip && \
6 | wget -q http://www.cs.toronto.edu/~faghri/vsepp/data.tar && \
7 | tar -xvf data.tar && \
8 | rm -rf data.tar && \
9 | rm -rf data/coco* && \
10 | rm -rf data/f8k* && \
11 | rm -rf data/*precomp* && \
12 | rm -rf data/f30k/images && \
13 | mv flickr-image-dataset data/f30k/images
14 |
--------------------------------------------------------------------------------
/cross-modal-search/requirements.txt:
--------------------------------------------------------------------------------
1 | jina[standard,rich]==2.0.18
2 | click==8.0.1
3 | kaggle==1.5.12
4 | git+https://github.com/jina-ai/jina-commons@v0.0.3
5 | matplotlib==3.4.3
6 | torch==1.9.0
--------------------------------------------------------------------------------
/cross-modal-search/setup_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | TEST_DATA_DIR=data/
4 |
5 | rm -rf ${TEST_DATA_DIR} && \
6 | mkdir -p ${TEST_DATA_DIR}/f8k/images && \
7 | python ../.github/util/pull_dataset.py -d cross-modal-search/f8k.zip -p ../ && \
8 | unzip -o f8k.zip -d ${TEST_DATA_DIR} && \
9 | rm f8k.zip && \
10 | mv ${TEST_DATA_DIR}/Images/* ${TEST_DATA_DIR}/f8k/images && \
11 | mv ${TEST_DATA_DIR}/captions.txt data/f8k/captions.txt && \
12 | rm -rf workspace && \
13 | python app.py -t index | tee metrics.txt && \
14 | rm -rf ${TEST_DATA_DIR}
--------------------------------------------------------------------------------
/cross-modal-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/tests/__init__.py
--------------------------------------------------------------------------------
/cross-modal-search/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/cross-modal-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==6.2.4
2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard,rich]
3 | click==8.0.1
4 | git+https://github.com/jina-ai/jina-commons@v0.0.3
5 | kaggle==1.5.12
6 | matplotlib==3.4.3
7 | torch==1.9.0
--------------------------------------------------------------------------------
/cross-modal-search/tests/test_cross_modal_search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append('..')
4 | from app import main
5 | from click.testing import CliRunner
6 |
7 |
8 | def config(tmpdir):
9 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace')
10 |
11 |
12 | def test_cross_modal_search(tmpdir):
13 | config(tmpdir)
14 | runner = CliRunner()
15 | result = runner.invoke(main, ['-t', 'index'])
16 | assert 'done in' in result.stdout
17 | assert result.stderr_bytes is None
18 | result = runner.invoke(main, ['-t', 'query'])
19 | assert result.stderr_bytes is None
20 |
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/captions.txt:
--------------------------------------------------------------------------------
1 | image,caption
2 | 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
3 | 1000268201_693b08cb0e.jpg,A girl going into a wooden building .
4 | 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
5 | 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .
6 | 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .
7 | 1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
8 | 1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .
9 | 1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .
10 | 1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road .
11 | 1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .
12 |
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1000268201_693b08cb0e.jpg
--------------------------------------------------------------------------------
/cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/toy-data/images/1001773457_577c3a7d70.jpg
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-index-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-index-flow.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-query-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-query-flow.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/cross-modal-result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/cross-modal-result.jpg
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/image_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/image_results.png
--------------------------------------------------------------------------------
/cross-modal-search/visualizations/text_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/cross-modal-search/visualizations/text_results.png
--------------------------------------------------------------------------------
/example-guidelines.md:
--------------------------------------------------------------------------------
1 | # Submit Your Community Example!
2 |
3 | Thanks for your interest in submitting your example! Here are some rules and guidelines:
4 |
5 | ## Rules
6 |
7 | ### `jina` in `requirements.txt`
8 |
9 | To be eligible for listing, you **must** have `jina==x.x.x` in your `requirements.txt`, where `x.x.x` refers to the semantic version number.
10 |
11 | Note: If you're building a front-end that just interfaces with Jina's API and doesn't rely on Jina core itself, there's no need to follow this requirement.
12 |
13 | ### `jina-` at start of name
14 |
15 | Your repo name should be `jina-xxxxxxx`.
16 |
17 | ### Clear README
18 |
19 | - Explain what your example does and how to run it
20 |
21 | ### Use scripts to get external resources
22 |
23 | - **For datasets:** Use a script named `get_data.sh`
24 | - **For models**: If you use an externally-hosted model, call your script `get_model.sh` or similar
25 | - **For other assets:** Follow the `get_xxx.sh` pattern
26 |
27 | ### `.gitignore` and `.dockerignore`
28 |
29 | Have a `.gitignore` file and list any directories that should be ignored. The same goes for `.dockerignore` if you have a `Dockerfile`:
30 |
31 | - `data` directory
32 | - `workspace` directory
33 | - virtual environment directories
34 | - directories that store assets retrieved by [scripts](#use-scripts-to-get-external-resources)
35 |
36 | ### License
37 |
38 | You **must** use an open-source license, specified in `LICENSE` in the root of your repo
39 |
40 | ## Guidelines
41 |
42 | We're more easy-going on these:
43 |
44 | ### One Example Per Repo
45 |
46 | To make code more maintainable and easier for end users, please include one example per repo.
47 |
48 | ### Tests
49 |
50 | Please include tests to ensure your app or Pod works correctly.
51 |
52 | ### File Structure
53 |
54 | - Please follow the file structure as created by `jina hub new --type app`
55 | - Store data in `data` and externally-downloaded models in `models`
56 |
57 | ### Dockerfile
58 |
59 | We highly encourage you to add a `Dockerfile`.
60 |
61 | ### Docker image
62 |
63 | For self-contained apps, we would love to host a Docker image on [Jina Hub](https://github.com/jina-ai/jina-hub)
64 |
--------------------------------------------------------------------------------
/example_template.md:
--------------------------------------------------------------------------------
1 | # Run the EXAMPLE NAME
2 | *You can also include a gif with a full demo of the example*
3 |
4 |
5 | *ADD A TABLE OF CONTENTS HERE*
6 |
7 | - [Overview](#overview)
8 | - [🐍 Build the app with Python](#-build-the-app-with-python)
9 | - [🔮 Overview of the files](#-overview-of-the-files)
10 | - [🌀 Flow diagram](#-flow-diagram)
11 | - [⏭️ Next steps](#-next-steps)
12 | - [🐋 Deploy with Docker](#-deploy-with-docker)
13 | - [🙍 Community](#-community)
14 | - [🦄 License](#-license)
15 |
16 |
17 | ## Overview
18 | | About this example: | |
19 | | ------------- | ------------- |
20 | | Learnings | *Describe what the user will learn after running this example* |
21 | | Used for indexing | *What is the datatype of the indexing input* |
22 | | Used for querying | *What is the data type of the query input* |
23 | | Dataset used | *Link to the datasets* |
24 | | Model used | *Link to the model* |
25 |
26 |
27 | ## 🐍 Build the app with Python
28 |
29 | These instructions explain how to build the example yourself and deploy it with Python. If you want to skip the building steps and just run the example with Docker, check [the Docker deployment instructions at the end of this README](#-deploy-with-docker)
30 |
31 |
32 | ### 🗝️ Requirements
33 |
34 | *Here outline in bullet points anything the user is expected to have before diving in.*
35 |
36 | For example:
37 |
38 | 1. You have a working Python 3.8 environment.
39 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
40 | 3. You have at least 2GB of free space on your hard drive.
41 |
42 | ### 👾 Step 1. Clone the repo and install Jina
43 |
44 | Begin by cloning the repo, so you can get the required files and datasets. (If you already have the examples repository on your machine make sure to fetch the most recent version)
45 |
46 | ```sh
47 | git clone https://github.com/jina-ai/examples
48 | ```
49 |
50 | And enter the correct folder:
51 |
52 | ```sh
53 | cd examples/example_to_use (replace as necessary)
54 | ```
55 |
56 | In your terminal, you should now be located in the *enter example name* folder. Let's install Jina and the other required Python libraries. For further information on installing Jina check out [our documentation](https://docs.jina.ai/chapters/core/setup/).
57 |
58 | ```sh
59 | pip install -r requirements.txt
60 | ```
61 |
62 | ### 📥 Step 2. Download your data to search (Optional)
63 |
64 | There are two different options here. You can either use the toy data we provide in this repo, which is quick to index but will give very poor results. Alternatively, you can download a larger dataset, which takes longer to index, but will have better results.
65 |
66 | 1. **Toy dataset:** Skip to step 3. No action is needed here.
67 |
68 | 2. **Full dataset:**
69 | In order to get the full dataset, follow the instructions below:
70 | - Register for a free [Kaggle account](https://www.kaggle.com/account/login?phase=startRegisterTab&returnUrl=%2F)
71 | - Set up your API token (see [authentication section of their API docs](https://www.kaggle.com/docs/api))
72 | - Run `pip install kaggle`
73 | - Run `sh get_data.sh`
74 |
75 | ### 🏃 Step 3. Index your data
76 | In this step, we will index our data.
77 |
78 | *Here describe the Index Flow. Be as specific as possible in describing how this Index Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.*
79 |
80 | ```sh
81 | python app.py -t index (replace as necessary)
82 | ```
83 |
84 | If you see the following output, it means your data has been correctly indexed.
85 |
86 | ```
87 | Flow@5162[S]:flow is closed and all resources are released, current build level is 0
88 | ```
89 |
90 | ### 🔎 Step 4: Query your data
91 | Next, we will deploy our query Flow.
92 |
93 | *Here describe the Query Flow. Be as specific as possible in describing how this Query Flow works and what is its input. You are encouraged to use code snippets, images, or whatever helps to clarify.*
94 |
95 | Run the query Flow in your terminal like this:
96 |
97 | ```sh
98 | python app.py -t query (replace as necessary)
99 | ```
100 | ______
101 |
102 | ## 📉 Understanding your results
103 | *Here include a short description of the results and how to interpret them if needed.*
104 |
105 | ## 🌀 Flow diagram
106 | This diagram provides a visual representation of the Flows in this example, showing which executors are used in which order.
107 |
108 | *Here Show the Flow for this example.*
109 |
110 | ## 📖 Optional: Extra information useful for the user
111 |
112 | *Use this section to add extra information you think the user could benefit from.
113 | QueryLanguage, Faiss, Annoy for example.*
114 |
115 | ## 🔮 Overview of the files
116 |
117 | *Add a list with all folders/files in the example:*
118 |
119 | | | |
120 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- |
121 | | 📂 `flows/` | Folder to store Flow configuration |
122 | | --- 📃 `index.yml` | YAML file to configure indexing Flow |
123 | | --- 📃 `query.yml` | YAML file to configure querying Flow |
124 | | 📂 `pods/` | Folder to store Pod configuration |
125 | | --- 📃 `encoder.yml` | YAML file to configure encoder Pod |
126 | | 📂 `workspace/` | Folder to store indexed files (embeddings and documents). Automatically created after the first indexing |
127 |
128 | _____
129 |
130 | ## 🐋 Deploy with Docker
131 | To make it easier for you, we have built and published the Docker image for this example.
132 |
133 | ### ☑️ Requirements:
134 |
135 | 1. You have Docker installed and working.
136 | 2. You have at least 8GB of free space on your hard drive.
137 |
138 | ### 🏃🏿♂️ Pull and run the image
139 | Running the following command will pull the Docker image and run it.
140 |
141 | *Replace below with the command to run the Docker image of this example*
142 |
143 | ```bash
144 | docker .
145 | ```
146 |
147 | _______
148 |
149 | ## ⏭️ Next steps
150 |
151 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app check out [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
152 |
153 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai).
154 |
155 | ## 👩👩👧👦 Community
156 |
157 | - [Slack channel](https://slack.jina.ai/) - a communication platform for developers to discuss Jina.
158 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities.
159 | - [Twitter](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`.
160 | - [Company](https://jina.ai) - know more about our company, we are fully committed to open-source!
161 |
162 | ## 🦄 License
163 |
164 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
165 |
166 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text.
167 |
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/demo.gif
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/index.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/index.jpg
--------------------------------------------------------------------------------
/multires-lyrics-search/.github/search.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/.github/search.jpg
--------------------------------------------------------------------------------
/multires-lyrics-search/.gitignore:
--------------------------------------------------------------------------------
1 | lyrics-data/lyrics-data.csv
2 |
--------------------------------------------------------------------------------
/multires-lyrics-search/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | __version__ = '0.0.1'
5 |
6 | import os
7 | import sys
8 | import click
9 |
10 | from jina import Flow, Document
11 | from helper import input_generator
12 | from jina.logging.predefined import default_logger as logger
13 |
14 |
15 | def config():
16 | cur_dir = os.path.dirname(os.path.abspath(__file__))
17 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
18 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
19 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
20 | os.environ.setdefault('JINA_LOG_LEVEL', 'INFO')
21 | if os.path.exists('lyrics-data/lyrics-data.csv'):
22 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-data.csv')
23 | else:
24 | os.environ.setdefault('JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv')
25 | os.environ.setdefault('JINA_PORT', str(45678))
26 |
27 |
28 | # for index
29 | def index(num_docs):
30 | flow = Flow.load_config('flows/index.yml')
31 | with flow:
32 | input_docs = input_generator(num_docs=num_docs)
33 | data_path = os.path.join(os.path.dirname(__file__),
34 | os.environ.get('JINA_DATA_FILE', None))
35 | flow.logger.info(f'Indexing {data_path}')
36 | flow.post(on='/index', inputs=input_docs, request_size=10,
37 | show_progress=True)
38 |
39 |
40 | # for search
41 | def query():
42 | flow = Flow.load_config('flows/query.yml')
43 | flow.rest_api = True
44 | flow.protocol = 'http'
45 | with flow:
46 | flow.block()
47 |
48 |
49 | def query_text():
50 | def print_result(response):
51 | doc = response.docs[0]
52 | for index, parent in enumerate(doc.matches):
53 | print(f'Parent {index}: Song Name: {parent.tags["SName"]}\n{parent.text}')
54 | for index, chunk in enumerate(doc.chunks):
55 | print(f'Chunk {index}: {chunk.text}')
56 | for match in chunk.matches:
57 | print(f'\tMatch: {match.text}')
58 |
59 | f = Flow.load_config('flows/query.yml')
60 | with f:
61 | search_text = input('Please type a sentence: ')
62 | doc = Document(content=search_text, mime_type='text/plain')
63 | response = f.post('/search', inputs=doc, parameters={'lookup_type': 'parent'}, return_results=True)
64 | print_result(response[0].data)
65 |
66 |
67 | @click.command()
68 | @click.option('--task', '-t',
69 | type=click.Choice(['index', 'query', 'query_text'], case_sensitive=False))
70 | @click.option('--num_docs', '-n', default=10000)
71 | def main(task, num_docs):
72 | config()
73 | workspace = os.environ["JINA_WORKSPACE"]
74 | if task == 'index':
75 | if os.path.exists(workspace):
76 | logger.error(f'\n +---------------------------------------------------------------------------------+ \
77 | \n | 🤖🤖🤖 | \
78 | \n | The directory {workspace} already exists. Please remove it before indexing again. | \
79 | \n | 🤖🤖🤖 | \
80 | \n +---------------------------------------------------------------------------------+')
81 | sys.exit(1)
82 | index(num_docs)
83 | elif task == 'query':
84 | query()
85 | elif task == 'query_text':
86 | query_text()
87 | else:
88 | raise NotImplementedError(
89 | f'Unknown task: {task}.')
90 |
91 |
92 | if __name__ == '__main__':
93 | main()
94 |
--------------------------------------------------------------------------------
/multires-lyrics-search/flows/index.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # We define the flow used for indexing here
2 | version: '1' # yml version
3 | with: # Parameters for the flow
4 | workspace: $JINA_WORKSPACE # Workspace folder
5 | executors: # Now, define all the executors that are used
6 | - name: segmenter # The first executor splits the input text into sentences which are stored as chunks in the original documents
7 | uses: 'jinahub+docker://Sentencizer' # The type of the executor is Sentencizer, we download it from the hub as a docker container
8 | - name: encoder # Then, compute the embeddings of the sentences in this executor
9 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a TransformerTorchEncoder from the hub
10 | volumes: '~/.cache/huggingface:/root/.cache/huggingface' # Mount the huggingface cache into the docker container
11 | uses_with: # Override some parameters for the executor
12 | pooling_strategy: 'cls' # This is the pooling strategy that is used by the encoder
13 | pretrained_model_name_or_path: distilbert-base-cased # The ML model that is used
14 | max_length: 96 # Max length argument for the tokenizer
15 | device: 'cpu' # Run the executor on CPU - For GPU, we would have to use another container!
16 | default_traversal_paths: ['c'] # Compute the embeddings on the chunk level - the sentences created before
17 | - name: indexer # Now, index the sentences and store them to disk.
18 | uses: 'jinahub://SimpleIndexer/old' # We use a simple indexer for that purpose (not in docker, but from source code - there are some bugs with docker for this executor)
19 | uses_metas: # Set some meta arguments for this executor
20 | workspace: $JINA_WORKSPACE # Define the workspace folder for the executor
21 | uses_with: # Override parameters for the executor
22 | default_traversal_paths: ['c'] # Store the sentences on disk - this means on chunk level
23 | - name: root_indexer # Additionally to the sentences, we also need to store the original songs which are not split into sentences
24 | uses: 'jinahub+docker://LMDBStorage' # Therefore, we use a LMDBStorage indexer
25 | volumes: $JINA_WORKSPACE_MOUNT # Again, mount the workspace
26 | uses_with: # Override some parameters for the LMDBStorage
27 | default_traversal_paths: ['r'] # Now, we store the root documents, not the sentence chunks
28 | needs: [gateway] # We can start this at the beginning - in parallel to the sentence flow
29 | - name: wait_both # Now, we wait for both the root indexing and the sentence path to finish
30 | needs: [indexer, root_indexer] # Continue once these two executors are finished
31 |
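The `default_traversal_paths` settings above are what make this a multi-resolution index: `['c']` points an executor at the sentence chunks, `['r']` at the whole song. The distinction can be seen on a plain Document; a small illustrative sketch, not repo code:

```python
from jina import Document

song = Document(text='Hello world. Goodbye world.')
# The Sentencizer in the Flow produces one chunk per sentence;
# here we mimic that by hand.
for sentence in ('Hello world.', 'Goodbye world.'):
    song.chunks.append(Document(text=sentence))

# 'r' (root) traversal sees the song itself, 'c' sees the sentences.
print(len(song.chunks))   # -> 2 (what the encoder and indexer operate on)
print(song.text)          # -> the full lyrics (what the root_indexer stores)
```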
--------------------------------------------------------------------------------
/multires-lyrics-search/flows/query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # Now, we define the search flow for this example
2 | version: '1' # It is quite similar to the index flow, only the differences are explained here
3 | with:
4 | port_expose: $JINA_PORT # Port to run the flow on
5 | cors: true # Add cross origin headers to the request responses
6 | executors:
7 | - name: segmenter # First, split the search text into sentences again
8 | uses: 'jinahub+docker://Sentencizer'
9 | - name: encoder # Encode the search sentences into embeddings
10 | uses: 'jinahub+docker://TransformerTorchEncoder/v0.1'
11 | volumes: '~/.cache/huggingface:/root/.cache/huggingface'
12 | uses_with:
13 | pooling_strategy: 'cls'
14 | pretrained_model_name_or_path: distilbert-base-cased
15 | max_length: 96
16 | device: 'cpu'
17 | default_traversal_paths: ['c']
18 | - name: indexer # Compare the search sentence embeddings to the stored sentence embeddings from the indexing
19 | uses: 'jinahub://SimpleIndexer/old' # Then, return the closest matches for every sentence
20 | uses_metas:
21 | workspace: $JINA_WORKSPACE
22 | uses_with:
23 | default_traversal_paths: ['c']
24 | read_only: True
25 | - name: ranker # Now, we need to use a special ranker in the query flow
26 | uses: 'jinahub+docker://SimpleRanker' # This ranker collects all the matches from the sentences and adds them to the root document
27 | uses_with: # It also orders the matches according to their minimum distance
28 | metric: 'cosine'
29 | - name: root_indexer # Now, we attach the stored metadata of the root documents to the matches collected by the SimpleRanker
30 | uses: 'jinahub+docker://LMDBStorage'
31 | volumes: $JINA_WORKSPACE_MOUNT
32 | uses_with:
33 | default_traversal_paths: ['m']
34 | read_only: True
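The `SimpleRanker` above lifts sentence-level matches up to their parent songs, ordering parents by the minimum cosine distance of any of their sentences. Stripped of Jina types, the aggregation reduces to the sketch below; the dict-based match records are an illustrative assumption:

```python
# Group chunk-level matches by their parent song and rank parents
# by the best (minimum) cosine distance among their sentences.
matches = [
    {'parent_id': 'song-a', 'cosine': 0.30},
    {'parent_id': 'song-b', 'cosine': 0.12},
    {'parent_id': 'song-a', 'cosine': 0.05},
]

best = {}
for m in matches:
    pid = m['parent_id']
    best[pid] = min(best.get(pid, float('inf')), m['cosine'])

ranking = sorted(best, key=best.get)
print(ranking)  # -> ['song-a', 'song-b']  (song-a wins via its 0.05 sentence)
```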
--------------------------------------------------------------------------------
/multires-lyrics-search/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | kaggle datasets download -d neisse/scrapped-lyrics-from-6-genres
3 | unzip scrapped-lyrics-from-6-genres.zip
4 | rm -rf scrapped-lyrics-from-6-genres.zip
5 | rm -rf artists-data.csv
6 | mv lyrics-data.csv lyrics-data/lyrics-data.csv
7 |
--------------------------------------------------------------------------------
/multires-lyrics-search/helper.py:
--------------------------------------------------------------------------------
1 | """Helper functions for the multires example"""
2 |
3 | import csv
4 | import itertools as it
5 | import os
6 | import numpy as np
7 |
8 | from jina import Document
9 |
10 |
11 | def input_generator(num_docs: int):
12 | lyrics_file = os.environ.setdefault('JINA_DATA_FILE',
13 | 'lyrics-data/lyrics-toy-data1000.csv')
14 | with open(lyrics_file, newline='', encoding='utf-8') as f:
15 | reader = csv.reader(f)
16 | for row in it.islice(reader, num_docs):
17 | if row[-1] == 'ENGLISH':
18 | d = Document(text=row[3])
19 | d.tags['ALink'] = row[0]
20 | d.tags['SName'] = row[1]
21 | d.tags['SLink'] = row[2]
22 | yield d
23 |
24 |
25 | def num_input_docs():
26 | lyrics_file = os.environ.setdefault(
27 | 'JINA_DATA_FILE', 'lyrics-data/lyrics-toy-data1000.csv'
28 | )
29 | with open(lyrics_file, newline='', encoding='utf-8') as f:
30 | reader = csv.reader(f)
31 | return len(list(reader))
32 |
33 | def _ext_A(A):
34 | nA, dim = A.shape
35 | A_ext = np.ones((nA, dim * 3))
36 | A_ext[:, dim : 2 * dim] = A
37 | A_ext[:, 2 * dim :] = A ** 2
38 | return A_ext
39 |
40 |
41 | def _ext_B(B):
42 | nB, dim = B.shape
43 | B_ext = np.ones((dim * 3, nB))
44 | B_ext[:dim] = (B ** 2).T
45 | B_ext[dim : 2 * dim] = -2.0 * B.T
46 | del B
47 | return B_ext
48 |
49 |
50 | def _norm(A):
51 | return A / np.linalg.norm(A, ord=2, axis=1, keepdims=True)
52 |
53 |
54 | def _euclidean(A_ext, B_ext):
55 | sqdist = A_ext.dot(B_ext).clip(min=0)
56 | return np.sqrt(sqdist)
--------------------------------------------------------------------------------
/multires-lyrics-search/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | jina[standard]==2.0.18
3 | kaggle==1.5.12
4 | docker
5 | git+https://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/multires-lyrics-search/static/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Lyrics Search Demo
8 |
9 |
10 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | Query
38 |
39 |
40 |
41 |
42 |
{{ searchIndicator }}
43 |
44 |
45 |
46 |
47 | Breakdown
48 |
51 |
52 |
53 |
56 | only show distance < {{ distThreshold }}
57 |
58 |
59 |
151 |
152 |
153 |
154 |
157 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
/multires-lyrics-search/static/main.css:
--------------------------------------------------------------------------------
1 | .lyric-text {
2 | font-size: 10px;
3 | text-align: center;
4 | max-height: 30em;
5 | overflow: scroll;
6 | }
7 |
8 | .blockquote-footer {
9 | text-align: center;
10 | }
11 |
12 | .query-chunk {
13 | border-width: 1px;
14 | border-radius: 3px;
15 | border-style: solid;
16 | }
17 |
18 | .query-chunk-breakdown {
19 | margin: 5px;
20 | border-style: solid;
21 | border-width: 1px;
22 | border-radius: 5px;
23 | border-color: lightgray;
24 | padding: 5px;
25 | }
26 |
27 | .card {
28 | margin: 5px;
29 | }
--------------------------------------------------------------------------------
/multires-lyrics-search/static/vue-bindings.js:
--------------------------------------------------------------------------------
1 | var VueMasonryPlugin = window["vue-masonry-plugin"].VueMasonryPlugin;
2 | Vue.use(VueMasonryPlugin);
3 |
4 | const vm = new Vue({
5 | el: '#jina-ui',
6 | data: {
7 | serverUrl: 'http://localhost:45678/search',
8 | top_k: 50,
9 | topkDocs: [],
10 | topkDocsDict: {},
11 | results: [],
12 | searchQuery: '',
13 | queryChunks: [],
14 | selectQueryChunks: [],
15 | queryItem: [],
16 | docItem: null,
17 | loadedItem: 0,
18 | loadedQuery: 0,
19 | searchQueryIsDirty: false,
20 | isCalculating: false,
21 | distThreshold: 999,
22 | sliderOptions: {
23 | dotSize: 14,
24 | width: 'auto',
25 | height: 4,
26 | contained: false,
27 | direction: 'ltr',
28 | data: null,
29 | min: 999,
30 | max: 0,
31 | interval: 0.01,
32 | disabled: false,
33 | clickable: true,
34 | duration: 0.5,
35 | adsorb: false,
36 | lazy: false,
37 | tooltip: 'active',
38 | tooltipPlacement: 'top',
39 | tooltipFormatter: void 0,
40 | useKeyboard: false,
41 | keydownHook: null,
42 | dragOnClick: false,
43 | enableCross: true,
44 | fixed: false,
45 | minRange: void 0,
46 | maxRange: void 0,
47 | order: true,
48 | marks: false,
49 | dotOptions: void 0,
50 | process: true,
51 | dotStyle: void 0,
52 | railStyle: void 0,
53 | processStyle: void 0,
54 | tooltipStyle: void 0,
55 | stepStyle: void 0,
56 | stepActiveStyle: void 0,
57 | labelStyle: void 0,
58 | labelActiveStyle: void 0,
59 | }
60 | },
61 | mounted: function () {
62 |
63 | },
64 | components: {
65 | 'vueSlider': window['vue-slider-component'],
66 | },
67 | computed: {
68 | searchIndicator: function () {
69 | if (this.isCalculating) {
70 | return '⟳ Fetching new results...'
71 | } else if (this.searchQueryIsDirty) {
72 | return '... Typing'
73 | } else {
74 |
75 | return '✓ Done'
76 | }
77 | }
78 | },
79 | watch: {
80 | searchQuery: function () {
81 | this.searchQueryIsDirty = true
82 | this.expensiveOperation()
83 | },
84 | distThreshold: function () {
85 | this.refreshAllCards();
86 | }
87 | },
88 | methods: {
89 | clearAllSelect: function () {
90 | vm.queryChunks.forEach(function (item, i) {
91 | item['isSelect'] = !item['isSelect'];
92 | vm.refreshAllCards();
93 | });
94 | },
95 | selectChunk: function (item) {
96 | item['isSelect'] = !item['isSelect'];
97 | vm.refreshAllCards();
98 | },
99 | refreshAllCards: function () {
100 | vm.topkDocsDict = new Map(vm.topkDocs.map(i => [i.id, {
101 | 'text': i.text,
102 | 'hlchunk': [],
103 | 'renderHTML': i.text
104 | }]));
105 | vm.queryChunks.forEach(function (item, i) {
106 | if (!('isSelect' in item)) {
107 | item['isSelect'] = true;
108 | }
109 | if (item['isSelect']) {
110 | item.matches.forEach(function (r) {
111 | if (vm.topkDocsDict.has(r.parentId)) {
112 | let dist = r.scores['cosine'].value
113 | if (dist < vm.distThreshold) {
114 | // console.log(item)
115 | vm.topkDocsDict.get(r.parentId)['hlchunk'].push({
116 | 'range': r.location,
117 | 'idx': i,
118 | 'dist': dist,
119 | 'range_str': r.location[0] + ',' + r.location[1]
120 | });
121 | }
122 | if (dist < vm.sliderOptions.min) {
123 | vm.sliderOptions.min = dist.toFixed(2)
124 | }
125 | if (dist > vm.sliderOptions.max) {
126 | vm.sliderOptions.max = dist.toFixed(2)
127 | }
128 |
129 | } else {
130 | console.error(r.id);
131 | }
132 | });
133 | }
134 | });
135 | vm.topkDocsDict.forEach(function (value, key, map) {
136 | vm.topkDocsDict.get(key)['hlchunk'].sort(function (a, b) {
137 | return b['range'][0] - a['range'][0]
138 | })
139 | var replace_map = new Map();
140 | value['hlchunk'].forEach(function (item) {
141 | if (!replace_map.has(item['range_str'])) {
142 | replace_map.set(item['range_str'], [])
143 | }
144 | replace_map.get(item['range_str']).push(item)
145 |
146 | })
147 |
148 | replace_map.forEach(function (item, kk, mm) {
149 | value['renderHTML'] = replaceRange(value['renderHTML'], item[0]['range'][0], item[0]['range'][1], item)
150 | })
151 | })
152 | vm.$nextTick(function () {
153 | vm.$redrawVueMasonry('my-masonry');
154 | })
155 | },
156 | // This is where the debounce actually belongs.
157 | expensiveOperation: _.debounce(function () {
158 | this.isCalculating = true
159 | vm.selectQueryChunks.length = 0;
160 | $.ajax({
161 | url: this.serverUrl,
162 | type: "POST",
163 | contentType: "application/json",
164 | cache: false,
165 | data: JSON.stringify({
166 | "parameters": {"top_k": this.top_k},
167 | "data": [this.searchQuery]
168 | }),
169 | error: function (jqXHR, textStatus, errorThrown) {
170 | console.log(jqXHR);
171 | console.log(textStatus);
172 | console.log(errorThrown);
173 | },
174 | success: function (data) {
175 | vm.topkDocs = data.data.docs[0].matches;
176 | console.log('Number parents: ' + vm.topkDocs.length);
177 | vm.queryChunks = data.data.docs[0].chunks;
178 | console.log('Number chunks: ' + vm.queryChunks.length);
179 | vm.refreshAllCards();
180 | console.log('Success');
181 | },
182 | complete: function () {
183 | vm.isCalculating = false
184 | vm.searchQueryIsDirty = false
185 | vm.$nextTick(function () {
186 | vm.$redrawVueMasonry('my-masonry');
187 | })
188 | }
189 | });
190 |
191 | }, 500)
192 | }
193 | });
194 |
195 | function replaceRange(s, start, end, chunks) {
196 | var content = s.substring(start, end)
197 | chunks.forEach(function (c) {
198 | content = "" + content + ""
199 | })
200 | return s.substring(0, start) + content + s.substring(end);
201 | }
202 |
203 | function selectColor(number, colored) {
204 | if (!colored) {
205 | return `#fff`;
206 | }
207 | const hue = number * 137.508; // use golden angle approximation
208 | return `hsl(${hue},50%,75%)`;
209 | }
210 |
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/multires-lyrics-search/tests/__init__.py
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | git+https://github.com/jina-ai/jina.git@v2.0.18#egg=jina[standard]
3 | pytest==6.1.2
4 | kaggle==1.5.12
5 | docker
6 | git+https://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/multires-lyrics-search/tests/test_flow_integration.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import shutil
6 | import glob
7 | from typing import List
8 | from click.testing import CliRunner
9 |
10 | import pytest
11 | from jina import Flow, Document
12 |
13 | from app import main
14 |
15 |
16 | def get_files_with_patterns(directory: str, match_patterns: List[str]) -> List[str]:
17 | """
18 | Returns all files from directory and subdirectories that match any of the patterns in the list.
19 | The returned list will only contain unique items.
20 |
21 | :param directory: Path to the directory
22 | :param match_patterns: A list of expressions to match the files against. E.g. `*.json`
23 | :return: List of matched files.
24 | """
25 | index_files = []
26 | for pattern in match_patterns:
27 | index_files += list(glob.glob(os.path.join(directory, '**', pattern), recursive=True))
28 | return list(set(index_files))
29 |
30 |
31 | @pytest.fixture(scope='session', autouse=True)
32 | def index(tmpdir_factory):
33 | """
34 | This fixtures runs automatically once before each test session.
35 | It indexes a small set of files into a test workspace and checks that the indexing
36 | completes correctly.
37 |
38 | Other tests can use the created workspace and test queries against it.
39 | """
40 | assert os.getcwd().endswith('multires-lyrics-search'), \
41 | "Please execute the tests from the root directory: >>> pytest tests/"
42 |
43 | workspace = os.path.join(tmpdir_factory.getbasetemp(), 'test-workspace')
44 | assert not os.path.isdir(workspace), f'Directory {workspace} exists. Please remove it before testing'
45 | os.environ['JINA_WORKSPACE'] = workspace
46 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
47 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
48 | os.environ.setdefault('JINA_PORT', str(45678))
49 |
50 | runner = CliRunner()
51 | result = runner.invoke(main, ['-t', 'index', '-n', '100'])
52 | assert result.stderr_bytes is None, f'Error messages found during indexing: {result.stderr}'
53 |
54 | assert os.path.isdir(workspace)
55 | index_files = get_files_with_patterns(workspace, ['*.bin', '*.lmdb', '*.lmdb-lock'])
56 | assert len(index_files) == 4, 'Expected four files in the workspace'
57 | for _file in index_files:
58 | assert os.path.getsize(_file) > 0, f'File {_file} is empty.'
59 |
60 | yield
61 | # shutil.rmtree(workspace) Not possible due to docker sudo rights
62 |
63 |
64 | def test_query_text(tmpdir_factory):
65 | def assert_result(response):
66 | docs = response.docs
67 | # check number of results
68 | assert len(docs) == 1
69 | assert len(docs[0].chunks) == 2
70 | parent_docs = docs[0].matches
71 | parent_ids = parent_docs.get_attributes('id')
72 | assert len(parent_docs) > 0
73 | for chunk in docs[0].chunks:
74 | assert len(chunk.matches) == 5 # top_k = 5
75 | match_ids = chunk.matches.get_attributes('id')
76 | assert len(match_ids) == len(list(set(match_ids)))
77 | for match in chunk.matches:
78 | assert match.text is not None
79 | assert match.location is not None
80 | assert match.parent_id in parent_ids
81 | assert match.text in parent_docs[parent_ids.index(match.parent_id)].text
82 |
83 | flow = Flow.load_config('flows/query.yml')
84 | with flow:
85 | search_text = 'looked through every window then. hello world.'
86 | doc = Document(content=search_text, mime_type='text/plain')
87 | response = flow.post('/search', inputs=doc, parameters={'top_k': 5}, return_results=True)
88 | assert_result(response[0])
89 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | norecursedirs = cross-modal-search/img_emb/* cross-modal-search/txt_emb/* openapi/python-flask/openapi_server/*
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/.github/images/storage.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/.gitignore:
--------------------------------------------------------------------------------
1 | workspace*
2 | env
3 | results
4 | __pycache__
5 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/README.md:
--------------------------------------------------------------------------------
1 | # Querying While Indexing in the Wikipedia Search Example
2 |
3 | | About this example: | |
4 | | ------------- | ------------- |
5 | | Learnings | How to configure Jina for querying while indexing |
6 | | Used for indexing | Text data |
7 | | Used for querying | Text data |
8 | | Dataset used | [Wikipedia dataset from kaggle](https://www.kaggle.com/mikeortman/wikipedia-sentences) |
9 | | Model used | [flair-text](https://github.com/flairNLP/flair) |
10 |
11 | This is an example of using [Jina](http://www.jina.ai) to support both querying and indexing simultaneously in our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences).
12 |
13 | ## Table of contents:
14 |
15 | * [Prerequisites](#prerequisites)
16 | * [What is querying while indexing?](#what-is-querying-while-indexing)
17 | * [Configuration changes](#configuration-changes)
18 | * [🐍 Build the app with Python](#-build-the-app-with-python)
19 | * [Flow diagrams](#flow-diagrams)
20 | * [🔮 Overview of the files](#-overview-of-the-files)
21 | * [Troubleshooting](#troubleshooting)
22 | * [⏭️ Next steps](#-next-steps)
23 | * [👩👩👧👦 Community](#-community)
24 | * [🦄 License](#-license)
25 |
26 | ## Prerequisites
27 |
28 | - Run and understand our [Wikipedia sentence search example](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences)
29 |
30 | ## What is querying while indexing?
31 |
32 | Querying while indexing means you can still query your data while new data is being inserted, updated, or deleted.
33 | Jina achieves this with its dump-reload feature.
34 |
35 | ## Configuration changes
36 |
37 | This feature requires you to split the Flow into two, one for Indexing (plus Updates and Deletes) and one for Querying, and have them running at the same time.
38 | You also need to replace the indexers in the Flows.
39 | The Index Flow (also referred to as the Storage Flow) will require a [Storage Indexer](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage), while the Query Flow requires a [Compound Searcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher).
40 |
41 | In our case we use:
42 |
43 | - [LMDBStorage](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/storage/LMDBStorage), which uses a disk-based key-value storage [LMDB](https://lmdb.readthedocs.io/) as a storage engine.
44 | - [FaissLMDBSearcher](https://github.com/jina-ai/executors/tree/main/jinahub/indexers/searcher/compound/FaissLMDBSearcher), which uses the [`faiss`](https://github.com/facebookresearch/faiss) library to provide faster query results and LMDB to retrieve the metadata.
45 |
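Operationally, the dump-reload cycle is driven by control requests: the Storage Flow is asked to dump its data, after which the Query Flow is rolling-updated to read from the dump path (`app.py` below runs this loop). A sketch of the dump request, reusing the ports, dump path, and `target_peapod` defined in `app.py`:

```python
from jina import Client

# Ask the Storage Flow (REST port 9000, host as in __default_host__) to dump
# round 0 of the index; the Query Flow is then rolling-updated to serve
# from this path (see app.py for the full loop).
Client(host='0.0.0.0', port=9000, protocol='http').post(
    on='/dump',
    parameters={'shards': 1, 'dump_path': '/jinad_workspace/dump/0'},
    target_peapod='storage_indexer',
)
```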
46 | _____
47 |
48 | ## 🐍 Build the app with Python
49 |
50 | These instructions explain how to run the example yourself and deploy it with Python.
51 |
52 | ### 🗝️ Requirements
53 |
54 | 1. Have a working Python 3.7 or 3.8 environment.
55 | 1. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
56 | 1. Install [Docker Engine](https://docs.docker.com/engine/install/).
57 | 1. Have at least 5 GB of free space on your hard drive.
58 |
59 |
60 | ### Running the example
61 |
62 | ### 👾 Step 1. Clone the repo and install Jina
63 |
64 | Begin by cloning the repo so you can get the required files and datasets. (If you already have the examples repository on your machine make sure to fetch the most recent version)
65 |
66 | ```sh
67 | git clone https://github.com/jina-ai/examples
68 | cd examples/wikipedia-sentences-query-while-indexing
69 | ```
70 |
71 | Let's install `jina` and the other required libraries. For further information on installing jina check out [our documentation](https://docs.jina.ai/get-started/install/).
72 |
73 | ```sh
74 | pip install -r requirements.txt
75 | ```
76 |
77 | In order to run the example you will need to do the following:
78 |
79 | ### 📥 Step 2. Download your data to search (Optional)
80 |
81 | The repo includes a small subset of the Wikipedia dataset, for quick testing. You can just use that.
82 |
83 | If you want to use the entire dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant (in `app.py`) to point to that file.
84 |
85 | ### 🏃 Step 3. Running the Flows
86 |
87 | In this example, we use [JinaD](https://docs.jina.ai/advanced/daemon/#remote-management-via-jinad) to serve the two Flows (Index and Query) and listen to incoming requests.
88 |
89 | 1. Start `JinaD` server using the below command.
90 |
91 | ```bash
92 | docker run --add-host host.docker.internal:host-gateway \
93 | -v /var/run/docker.sock:/var/run/docker.sock \
94 | -v /tmp/jinad:/tmp/jinad \
95 | -p 8000:8000 \
96 | --name jinad \
97 | -d jinaai/jina:2.1.0-daemon
98 | ```
99 |
100 | 2. Run `python app.py -t flows`
101 |
102 | This will create the two Flows and then repeat the following steps every 10 seconds (each step can also be performed from any other REST client):
103 |
104 | 1. Index 5 Documents.
105 | 2. Send a `DUMP` request to the Storage (Index) Flow to dump its data to a specific location.
106 | 3. Send a `ROLLING_UPDATE` request to the Query Flow to take down its Indexers and start them again, with the new data located at the respective path.
107 |
108 | **Warning**: If you want to use the entire wikipedia dataset, run `bash get_data.sh` and then modify the `DATA_FILE` constant to point to that file.
109 |
110 | ### 🔎 Step 4: Query your data
111 |
112 | Finally, in a second terminal, run `python app.py -t client`
113 |
114 | This will prompt you for a query, send the query to the Query Flow, and then show you the results.
115 |
116 | Since the Flows use the `http` protocol, you can query the REST API with the `Client` provided with jina, or use `cURL`, `Postman`, or the [custom Swagger UI provided with jina](https://docs.jina.ai/fundamentals/practice-your-learning/#query-via-swaggerui).
117 |
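For instance, a search with the `Client` shipped with jina against the Query Flow's REST port (9001, as configured for this example) looks like this sketch, mirroring what `python app.py -t client` does:

```python
from jina import Client, Document

# Search the running Query Flow over HTTP and print the matched sentences.
client = Client(host='0.0.0.0', port=9001, protocol='http')
response = client.search(inputs=Document(text='hello world'), return_results=True)
for match in response[0].data.docs[0].matches:
    print(match.text)
```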
118 | #### Cleanup
119 |
120 | JinaD creates several containers during this process. In order to remove all the containers do the following after you are done using the example:
121 |
122 | `docker stop $(docker ps -a -q)`
123 | and
124 | `docker rm $(docker ps -a -q)`
125 |
126 | ## Flow diagrams
127 |
128 | Below you can see a graphical representation of the Flow pipeline:
129 |
130 | #### Storage Flow
131 |
132 | 
133 |
134 | #### Query Flow
135 |
136 | 
137 |
138 | Notice the following:
139 |
140 | - the encoder has the same configuration in both Flows
141 | - the Query Flow uses replicas. One replica continues to serve requests while the other is being reloaded.
142 | - the Indexer in the Query Flow is actually made up of two Indexers: one for vectors, one for Document metadata. In the Storage Flow, this data is stored in one Storage Indexer.
143 |
144 | ## 🔮 Overview of the files
145 |
146 | | File or folder | Contents |
147 | | -------------------- | ---------------------------------------------------------------------------------------------------------------- |
148 | | 📂 `data/` | Folder where the data files are stored |
149 | | 📂 `flows/` | Folder to store Flow configuration |
150 | | --- 📃 `storage.yml` | YAML file to configure Storage (Index) Flow |
151 | | --- 📃 `query.yml` | YAML file to configure Querying Flow |
152 | | 🐍 `app.py` | Code file for the example |
153 |
154 | _________
155 |
156 | ## ⏭️ Next steps
157 |
158 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app check out [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
159 |
160 | If you have any issues following this guide, you can always get support from our [Slack community](https://slack.jina.ai).
161 |
162 | ## 👩👩👧👦 Community
163 |
164 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina.
165 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities.
166 | - [Twitter](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`.
167 | - [Company](https://jina.ai) - know more about our company. We are fully committed to open-source!
168 |
169 | ## 🦄 License
170 |
171 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
172 |
173 | Jina is licensed under the Apache License, Version 2.0. See LICENSE for the full license text.
174 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import time
6 | import traceback
7 | from typing import List, Dict
8 |
9 | import click
10 | from daemon.clients import JinaDClient
11 | from jina.logging.logger import JinaLogger
12 | from jina import __default_host__, Document, DocumentArray, Client
13 |
14 | os.environ['JINA_LOG_LEVEL'] = 'DEBUG'
15 |
16 | HOST = __default_host__ # change this if you are using remote jinad
17 | JINAD_PORT = 8000 # change this if you start jinad on a different port
18 | DUMP_PATH = '/jinad_workspace/dump'  # the path to dump the storage Indexer data to (inside the JinaD workspace)
19 | SHARDS = 1 # change this if you change pods/query_indexer.yml
20 | DUMP_RELOAD_INTERVAL = 10 # time between dump - rolling update calls
21 | DATA_FILE = 'data/toy.txt' # change this if you get the full data
22 | DOCS_PER_ROUND = 5 # nr of documents to index in each round
23 | STORAGE_FLOW_YAML_FILE = 'storage.yml' # indexing Flow yaml name
24 | QUERY_FLOW_YAML_FILE = 'query.yml' # querying Flow yaml name
25 | STORAGE_REST_PORT = 9000 # REST port of storage Flow, defined in flows/storage.yml
26 | QUERY_REST_PORT = 9001 # REST port of Query Flow, defined in flows/query.yml
27 |
28 | logger = JinaLogger('jina')
29 | cur_dir = os.path.dirname(os.path.abspath(__file__))
30 | jinad_client = JinaDClient(host=HOST, port=JINAD_PORT, timeout=10 * 60)
31 |
32 |
33 | def docarray_from_file(filename):
34 | docs = []
35 | with open(filename) as f:
36 | for line in f:
37 | docs.append(Document(text=line))
38 | return DocumentArray(docs)
39 |
40 |
41 | def query_restful():
42 | while True:
43 | text = input('please type a sentence: ')
44 | if not text:
45 | break
46 |
47 | query_doc = Document()
48 | query_doc.text = text
49 | response = query_docs(query_doc)
50 | matches = response[0].data.docs[0].matches
51 | len_matches = len(matches)
52 |         logger.info(f'Ta-Dah🔮, we found {len_matches} match(es) for: "{text}":')
53 |
54 | for idx, match in enumerate(matches):
55 | score = match.scores['euclidean'].value
56 | if score < 0.0:
57 | continue
58 | logger.info(f'> {idx:>2d}({score:.2f}). {match.text}')
59 |
60 |
61 | def index_docs(docs: DocumentArray, round: int):
62 | docs_to_send = docs[round * DOCS_PER_ROUND : (round + 1) * DOCS_PER_ROUND]
63 | logger.info(f'Indexing {len(docs_to_send)} document(s)...')
64 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').index(inputs=docs_to_send)
65 |
66 |
67 | def query_docs(docs: Document):
68 | logger.info(f'Searching document {docs}...')
69 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True)
70 |
71 |
72 | def create_flows():
73 | workspace_id = jinad_client.workspaces.create(paths=[os.path.join(cur_dir, 'flows')])
74 | jinad_workspace = jinad_client.workspaces.get(workspace_id)['metadata']['workdir']
75 |
76 | logger.info('Creating storage Flow...')
77 | storage_flow_id = jinad_client.flows.create(
78 | workspace_id=workspace_id, filename=STORAGE_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace}
79 | )
80 | logger.info(f'Created successfully. Flow ID: {storage_flow_id}')
81 | logger.info('Creating Query Flow...')
82 | query_flow_id = jinad_client.flows.create(
83 | workspace_id=workspace_id, filename=QUERY_FLOW_YAML_FILE, envs={'JINAD_WORKSPACE': jinad_workspace}
84 | )
85 | logger.info(f'Created successfully. Flow ID: {query_flow_id}')
86 | return storage_flow_id, query_flow_id, workspace_id
87 |
88 |
89 | def dump_and_roll_update(storage_flow_id: str, query_flow_id: str):
90 | docs = docarray_from_file(DATA_FILE)
91 | logger.info(f'starting dump and rolling-update process')
92 | round = 0
93 | while True:
94 | logger.info(f'round {round}:')
95 | index_docs(docs, round)
96 | current_dump_path = os.path.join(DUMP_PATH, str(round))
97 |
98 | logger.info(f'dumping...')
99 | Client(host=HOST, port=STORAGE_REST_PORT, protocol='http').post(
100 | on='/dump',
101 | parameters={'shards': SHARDS, 'dump_path': current_dump_path},
102 | target_peapod='storage_indexer',
103 | )
104 |
105 | # JinaD is used for ctrl requests on Flows
106 | logger.info(f'performing rolling update across replicas...')
107 | jinad_client.flows.update(
108 | id=query_flow_id,
109 | kind='rolling_update',
110 | pod_name='query_indexer',
111 | dump_path=current_dump_path,
112 | )
113 | logger.info(f'rolling update done. sleeping for {DUMP_RELOAD_INTERVAL}secs...')
114 | time.sleep(DUMP_RELOAD_INTERVAL)
115 | round += 1
116 |
117 |
118 | def cleanup(storage_flow_id, query_flow_id, workspace_id):
119 | jinad_client.flows.delete(storage_flow_id)
120 | jinad_client.flows.delete(query_flow_id)
121 | jinad_client.workspaces.delete(workspace_id)
122 |
123 |
124 | @click.command()
125 | @click.option(
126 | '--task',
127 | '-t',
128 | type=click.Choice(['flows', 'client'], case_sensitive=False),
129 | )
130 | def main(task: str):
131 | """main entrypoint for this example"""
132 | if task == 'flows':
133 |         # start the storage Flow and the Query Flow, dump data from the former, and load it into the latter
134 |         storage_flow_id = query_flow_id = workspace_id = None
135 |         try:
136 |             storage_flow_id, query_flow_id, workspace_id = create_flows()
137 |             # loop forever:
138 |             # - index a batch of data into the storage Flow
139 |             # - ask the storage Flow (via its REST API) to dump its data to disk
140 |             # - ask JinaD to roll-update the Query Flow replicas, which load the dump
141 |             dump_and_roll_update(storage_flow_id, query_flow_id)
142 |         except (Exception, KeyboardInterrupt) as e:
143 |             logger.warning(f'Caught: {e}. Original stacktrace follows:')
144 |             logger.error(traceback.format_exc())
145 |             logger.info('Shutting down and cleaning Flows in JinaD...')
146 |             if workspace_id is not None:  # create_flows() may have failed before returning
147 |                 cleanup(storage_flow_id, query_flow_id, workspace_id)
148 |
149 | elif task == 'client':
150 | query_restful()
151 |
152 |
153 | if __name__ == '__main__':
154 | main()
155 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/flows/query.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # we define the search Flow
2 | version: '1'
3 | with:
4 |   protocol: http # we use the REST API
5 |   port_expose: 9001 # the port the Flow will listen on
6 | executors: # the list of components in this Flow
7 |   - name: query_encoder # the name of this executor. It transforms the text into vectors to be used for searching
8 |     uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor docker image
9 |     timeout_ready: -1 # disable timing out (downloading the image can take some time)
10 |   - name: query_indexer # the name. This is a compound Executor, made of a vector searcher and a key-value db
11 |     uses: jinahub+docker://FaissLMDBSearcher # again, the docker image
12 |     replicas: 2 # we replicate this Executor for better availability. Requests are served by either of the two identical copies
13 |     timeout_ready: -1 # disable timing out (downloading the image can take some time)
14 |     volumes: $JINAD_WORKSPACE:/jinad_workspace # we need a workspace where the LMDB db file will be stored
15 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/flows/storage.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # we define the Flow used for storing (CRUD operations)
2 | version: '1'
3 | with:
4 |   protocol: http # we want to use the REST HTTP API
5 |   port_expose: 9000 # the port to listen on. This is referenced in `app.py`
6 | executors: # the components in this Flow
7 |   - name: storage_encoder # the name. This is the Encoder (transforms the text into vectors)
8 |     uses: jinahub+docker://FlairTextEncoder # we use a pre-built Executor from Jina Hub
9 |     timeout_ready: -1 # disable timing out on startup (downloading the image can take some time)
10 |   - name: storage_indexer # the name. This stores the data in an LMDB db
11 |     uses: jinahub+docker://LMDBStorage # again, we use a docker image
12 |     timeout_ready: -1 # disable timing out on startup (downloading the image can take some time)
13 |     volumes: $JINAD_WORKSPACE:/jinad_workspace # workspace where the db file will be stored
14 |
15 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="mikeortman/wikipedia-sentences"
3 | DATA_DIR="data"
4 | LINES=3000
5 |
6 | cd ${DATA_DIR}
7 | kaggle datasets download -d ${DATASET}
8 | unzip wikipedia-sentences.zip
9 | rm -f toy-data.txt
10 | rm -f wikipedia-sentences.zip
11 | mv wikisent2.txt input.txt
12 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/manifest.yml:
--------------------------------------------------------------------------------
1 | manifest_version: 1
2 | name: wikipedia-sentences-30k-query-while-indexing
3 | description: 'Example Jina app for searching 30,000 sentences from Wikipedia'
4 | author: Cristian Mitroi (cristian.mitroi@jina.ai)
5 | url: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing
6 | vendor: Jina AI Limited
7 | documentation: https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing
8 | version: 0.1
9 | license: apache-2.0
10 | keywords: [NLP, wikipedia, text, distilbert, example, transformers]
11 | type: app
12 | kind: example
13 | avatar: None
14 | platform: "linux/amd64"
15 | update: "None"
16 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/requirements.txt:
--------------------------------------------------------------------------------
1 | jina[daemon]==2.1.0
2 | kaggle==1.5.12
3 | click==7.1.2
4 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences-query-while-indexing/tests/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/jina-ai/jina.git@v2.1.0#egg=jina[daemon]
2 | click==7.1.2
3 |
--------------------------------------------------------------------------------
/wikipedia-sentences-query-while-indexing/tests/test_query_while_indexing.py:
--------------------------------------------------------------------------------
1 | import time
2 | from threading import Thread
3 |
4 | from jina import Document, __default_host__, Client
5 | from daemon.clients import JinaDClient
6 | from jina.logging.logger import JinaLogger
7 |
8 | HOST = __default_host__
9 | JINAD_PORT = 8000
10 | QUERY_REST_PORT = 9001
11 | logger = JinaLogger('test')
12 |
13 |
14 | def query_docs(docs: Document):
15 | logger.info(f'Searching document {docs}...')
16 | return Client(host=HOST, port=QUERY_REST_PORT, protocol='http').search(inputs=docs, return_results=True)
17 |
18 |
19 | def test_query_while_indexing():
20 |     from app import create_flows, dump_and_roll_update
21 |     storage_flow_id = query_flow_id = workspace_id = None  # so the cleanup in `finally` can be guarded
22 |     try:
23 |         jinad_client = JinaDClient(host=HOST, port=JINAD_PORT)
24 | assert jinad_client.alive, 'cannot reach jinad'
25 |
26 | storage_flow_id, query_flow_id, workspace_id = create_flows()
27 | # start rolling update in the background
28 | Thread(target=dump_and_roll_update, args=(storage_flow_id, query_flow_id), daemon=True).start()
29 |
30 | logger.info('sleeping for 30 secs to allow 1 round of index, dump & rolling update')
31 | time.sleep(30)
32 | query_doc = Document(text='hello world')
33 | response = query_docs(query_doc)
34 | matches = response[0].data.docs[0].matches
35 | logger.info(f'got {len(matches)} matches')
36 | assert matches
37 |
38 |     finally:
39 |         from app import cleanup
40 |
41 |         # create_flows() may have failed before returning the ids,
42 |         # in which case there is nothing to clean up
43 |         if workspace_id is not None:
44 |             cleanup(storage_flow_id, query_flow_id, workspace_id)
45 |
--------------------------------------------------------------------------------
/wikipedia-sentences/.dockerignore:
--------------------------------------------------------------------------------
1 | .dockerignore
2 | .git
3 | .github
4 | .gitignore
5 | data
6 | env
7 | get_data.sh
8 | tests
9 |
--------------------------------------------------------------------------------
/wikipedia-sentences/.github/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/.github/flow.png
--------------------------------------------------------------------------------
/wikipedia-sentences/.gitignore:
--------------------------------------------------------------------------------
1 | workspace*
2 | env
3 | results
4 | __pycache__
5 |
--------------------------------------------------------------------------------
/wikipedia-sentences/README.md:
--------------------------------------------------------------------------------
1 | # Semantic Wikipedia Search with Transformers and DistilBERT
2 |
3 | 
4 |
5 | ## Table of contents:
6 |
7 | - [Overview](#overview)
8 | - [🐍 Build the app with Python](#-build-the-app-with-python)
9 | - [🔮 Overview of the files in this example](#-overview-of-the-files-in-this-example)
10 | - [🌀 Flow diagram](#-flow-diagram)
11 | - [⏭️ Next steps, building your own app](#-next-steps-building-your-own-app)
12 | - [👩👩👧👦 Community](#-community)
13 | - [🦄 License](#-license)
14 |
15 | ## Overview
16 | | | |
17 | | ------------- | ------------- |
18 | | Summary | This showcases a semantic text search app |
19 | | Data for indexing | Wikipedia corpus |
20 | | Data for querying | A text sentence |
21 | | Dataset used | [Kaggle Wikipedia corpus](https://www.kaggle.com/mikeortman/wikipedia-sentences) |
22 | | ML model used | [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) |
23 |
24 | This example shows you how to build a simple semantic search app powered by [Jina](http://www.jina.ai)'s neural search framework. You can index and search text sentences from Wikipedia using the state-of-the-art [`distilbert-base-nli-stsb-mean-tokens`](https://huggingface.co/sentence-transformers/distilbert-base-nli-stsb-mean-tokens) language model from the [Transformers](https://huggingface.co) library.
25 |
26 | | item | content |
27 | |--------|--------------------------------------------------|
28 | | Input | 1 text file with 1 sentence per line |
29 | | Output | *top_k* number of sentences that match input query |
30 |
31 | ## 🐍 Build the app with Python
32 |
33 | These instructions explain how to build the example yourself and run it with Python.
34 |
35 |
36 | ### 🗝️ Requirements
37 | 1. You have a working Python 3.7 or 3.8 environment.
38 | 2. We recommend creating a [new Python virtual environment](https://docs.python.org/3/tutorial/venv.html) to have a clean installation of Jina and prevent dependency conflicts.
39 | 3. You have at least 2 GB of free space on your hard drive.
40 |
41 | ### 👾 Step 1. Clone the repo and install Jina
42 |
43 |
44 | Begin by cloning the repo, so you can get the required files and datasets. If you already have the examples repository on your machine, make sure to fetch the most recent version.
45 |
46 | ```sh
47 | git clone https://github.com/jina-ai/examples
48 | cd examples/wikipedia-sentences
49 | ```
50 |
51 | In your terminal, you should now be located in the wikipedia-sentences folder. Let's install Jina and the other required Python libraries. For further information on installing Jina check out our [documentation](https://docs.jina.ai/chapters/core/setup/).
52 |
53 |
54 | ```sh
55 | pip install -r requirements.txt
56 | ```
57 | If this command runs without any error messages, you can move on to step two.
58 |
59 | ### 📥 Step 2. Download your data to search
60 |
61 | By default, a small test dataset is used for indexing. This can lead to bad search results.
62 |
63 | To index the [full dataset](https://www.kaggle.com/mikeortman/wikipedia-sentences) (around 900 MB):
64 |
65 | 1. Set up [Kaggle](https://www.kaggle.com/docs/api#getting-started-installation-&-authentication)
66 | 2. Run the script: `sh get_data.sh`
67 | 3. Index your new dataset: `python app.py -t index -d full -n $num_docs`
68 |
69 | The whole dataset contains about 8 million Wikipedia sentences, and indexing all of them takes a very long time.
70 | We therefore recommend indexing only a subset of the data; the number of sentences can be set with the `-n` flag.
71 | We recommend values smaller than 100,000: for larger indexes, the SimpleIndexer used in this example also becomes very slow at query time,
72 | and a more advanced indexer, such as a Faiss-based one, is recommended instead. The sketch below shows how the `-n` cap is applied.
73 |
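For the curious, here is a minimal sketch of how that cap works, mirroring `input_generator` in `app.py`: the sentences are shuffled and at most `num_docs` of them are turned into Documents:

```python
import random

from jina import Document


def input_generator(num_docs: int, file_path: str):
    # read all sentences, shuffle them, and yield at most `num_docs` Documents
    with open(file_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    for line in lines[:num_docs]:
        yield Document(text=line)
```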
74 | ### 🏃 Step 3. Index your data
75 |
76 | Index your data by running:
77 |
78 | ```sh
79 | python app.py -t index
80 | ```
81 | Here, we can also specify the number of documents to index with `--num_docs` / `-n` (default is 10000).
82 |
83 | ### 🔎 Step 4. Query your indexed data
84 |
85 | A search prompt will appear in your terminal after running:
86 |
87 | ```sh
88 | python app.py -t query
89 | ```
90 |
91 | See the text below for an example search query and response.
92 | You can also specify the top k search results with `--top_k` / `-k` (default is 5).
93 |
94 | ```
95 | please type a sentence: What is ROMEO
96 |
97 | Ta-Dah🔮, here's what we found for: What is ROMEO
98 | > 0(0.36). The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics.
99 |
100 | ```
101 |
102 | ## 🔮 Overview of the files in this example
103 | Here is a small overview if you're interested in understanding what each file in this example is doing.
104 |
105 | | File | Explanation |
106 | |---|---|
107 | |📂 `tests/*` | Various maintenance tests to keep the example running. |
108 | |📃 `app.py` | The code that runs the index and query Flows. |
109 | |📃 `flows/flow.yml` | YAML configuration of the Flow (encoder plus indexer). |
110 | |📃 `get_data.sh` | Downloads the Kaggle dataset. |
111 | |📃 `requirements.txt` | Contains all required Python libraries. |
112 |
113 | ## 🌀 Flow diagram
114 |
115 | This diagram provides a visual representation of the flow in this example, showing which Executors are used in which order:
116 |
117 | 
118 |
119 | As you can see, the Flow for this example is quite simple. Input Documents arrive at the gateway
120 | and are fed into the transformer, which computes an embedding from each Document's text.
121 | The Documents are then sent to the indexer, which does the following (a Python sketch of this Flow follows below):
122 | - Index time: store all the Documents, with their embeddings, on disk (in the workspace folder).
123 | - Query time: compare the query Document's embedding with all stored embeddings and return the closest matches.
124 |
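If you prefer Python over YAML, the same pipeline can also be assembled programmatically. The sketch below is roughly equivalent to loading `flows/flow.yml`; the workspace path and port are illustrative values, not taken from this example's configuration:

```python
from jina import Document, DocumentArray, Flow

# the same two-step pipeline as flows/flow.yml: encode, then index/search
flow = (
    Flow(workspace='workspace', port_expose=45678)
    .add(name='transformer', uses='jinahub+docker://TransformerTorchEncoder/v0.1')
    .add(name='indexer', uses='jinahub://SimpleIndexer/old')
)

with flow:
    # index a toy sentence, then search for it
    flow.post(on='/index', inputs=DocumentArray([Document(text='hello world')]))
    results = flow.post(on='/search',
                        inputs=DocumentArray([Document(text='hello')]),
                        parameters={'top_k': 5},
                        return_results=True)
    print(results[0].data.docs[0].matches)
```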
125 | ## ⏭️ Next steps, building your own app
126 |
127 | Did you like this example and are you interested in building your own? For a detailed tutorial on how to build your Jina app, check out the [How to Build Your First Jina App](https://docs.jina.ai/chapters/my_first_jina_app/#how-to-build-your-first-jina-app) guide in our documentation.
128 |
129 | - [Enable querying while indexing](https://github.com/jina-ai/examples/tree/master/wikipedia-sentences-query-while-indexing)
130 |
131 | ## 👩👩👧👦 Community
132 |
133 | - [Slack channel](https://slack.jina.ai) - a communication platform for developers to discuss Jina
134 | - [LinkedIn](https://www.linkedin.com/company/jinaai/) - get to know Jina AI as a company and find job opportunities
135 | - [](https://twitter.com/JinaAI_) - follow us and interact with us using hashtag `#JinaSearch`
136 | - [Company](https://jina.ai) - learn more about our company. We are fully committed to open-source!
137 |
138 | ## 🦄 License
139 |
140 | Copyright (c) 2021 Jina AI Limited. All rights reserved.
141 |
142 | Jina is licensed under the Apache License, Version 2.0. See [LICENSE](https://github.com/jina-ai/examples/blob/master/LICENSE) for the full license text.
143 |
--------------------------------------------------------------------------------
/wikipedia-sentences/app.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 | import click
7 | import random
8 | from jina import Flow, Document, DocumentArray
9 | from jina.logging.predefined import default_logger as logger
10 |
11 | MAX_DOCS = int(os.environ.get('JINA_MAX_DOCS', 10000))
12 |
13 |
14 | def config(dataset: str):
15 | if dataset == 'toy':
16 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/toy-input.txt')
17 | elif dataset == 'full':
18 | os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/input.txt')
19 | os.environ['JINA_PORT'] = os.environ.get('JINA_PORT', str(45678))
20 | cur_dir = os.path.dirname(os.path.abspath(__file__))
21 | os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
22 | os.environ.setdefault('JINA_WORKSPACE_MOUNT',
23 | f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
24 |
25 |
26 | def print_topk(resp, sentence):
27 | for doc in resp.data.docs:
28 | print(f"\n\n\nTa-Dah🔮, here's what we found for: {sentence}")
29 | for idx, match in enumerate(doc.matches):
30 | score = match.scores['cosine'].value
31 | print(f'> {idx:>2d}({score:.2f}). {match.text}')
32 |
33 |
34 | def input_generator(num_docs: int, file_path: str):
35 | with open(file_path) as file:
36 | lines = file.readlines()
37 | num_lines = len(lines)
38 | random.shuffle(lines)
39 | for i in range(min(num_docs, num_lines)):
40 | yield Document(text=lines[i])
41 |
42 |
43 | def index(num_docs):
44 | flow = Flow().load_config('flows/flow.yml')
45 | data_path = os.path.join(os.path.dirname(__file__), os.environ.get('JINA_DATA_FILE', None))
46 | with flow:
47 | flow.post(on='/index', inputs=input_generator(num_docs, data_path),
48 | show_progress=True)
49 |
50 |
51 | def query(top_k):
52 | flow = Flow().load_config('flows/flow.yml')
53 | with flow:
54 | text = input('Please type a sentence: ')
55 | doc = Document(content=text)
56 |
57 |         result = flow.post(on='/search',
58 |                            inputs=DocumentArray([doc]),
59 |                            parameters={'top_k': top_k},
60 |                            return_results=True,
61 |                            )
62 | print_topk(result[0], text)
63 |
64 |
65 | @click.command()
66 | @click.option(
67 | '--task',
68 | '-t',
69 | type=click.Choice(['index', 'query'], case_sensitive=False),
70 | )
71 | @click.option('--num_docs', '-n', default=MAX_DOCS)
72 | @click.option('--top_k', '-k', default=5)
73 | @click.option('--dataset', '-d', type=click.Choice(['toy', 'full']), default='toy')
74 | def main(task, num_docs, top_k, dataset):
75 | config(dataset)
76 | if task == 'index':
77 | if os.path.exists(os.environ.get("JINA_WORKSPACE")):
78 | logger.error(f'\n +---------------------------------------------------------------------------------+ \
79 | \n | 🤖🤖🤖 | \
80 | \n | The directory {os.environ.get("JINA_WORKSPACE")} already exists. Please remove it before indexing again. | \
81 | \n | 🤖🤖🤖 | \
82 | \n +---------------------------------------------------------------------------------+')
83 | sys.exit(1)
84 | index(num_docs)
85 | elif task == 'query':
86 | query(top_k)
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
/wikipedia-sentences/data/toy-input.txt:
--------------------------------------------------------------------------------
1 | The ROMEO website, iOS app and Android app are commonly used by the male gay community to find friends, dates, love or get informed about LGBT+ topics.
2 | Once derided as corporate raiders, shareholder activists are now the recipients of admiration for sparking change in corporate boardrooms, leading to corporate boards developing best practices for responding to shareholder activism.
3 | Slc22a21 belongs to a protein family of solute carriers.
4 | Ajrara gharana or Ajrada gharana is one of the six main traditional schools in tabla drum.
5 | A few female specimens were found in a forest stream inside the shola forest.
6 | Ziggeo is the initiator and backer of BetaJS, an open-source framework.
7 | The three sports of aquatics were held at Aquatic Center in Sport Authority of Thailand Sport Complex, Bangkok, Thailand.
8 | Sugandha is the fourth generation of her family into singing and belongs to the Indore gharana.
9 | WYES is the only independently owned public television station in Louisiana as it is not part of Louisiana Public Broadcasting, which owns all of the PBS member stations in the state that are located outside of New Orleans, and maintains a programming agreement with and partial ownership of the city's independent public television station, WLAE-TV (channel 32).
10 | The reef divides the strait into the Apo East Pass and the Apo West Pass.
11 | His novel The Book of Evidence was shortlisted for the Booker Prize and won the Guinness Peat Aviation award in 1989.
12 | Andrea Kremer (born February 25, 1959 in Philadelphia, Pennsylvania) is a multi-Emmy Award Winning American television sports journalist.
13 | The book was the first published novel by O'Grady, with an initial print run of 6,000 hardback copies.
14 | After Alice performs several "miracle" cures in front of the tree, and claims to have seen the Virgin Mary there, it starts to be treated as a Lourdes-like shrine by Catholic pilgrims.
15 | Tovar is no longer involved with smuggling but acts as a consultant to Goldenvoice, which now operates the Coachella Valley Music and Arts Festival that has been compared to the Glastonbury Festival and is the most profitable music festival in the US.
16 | Tiwari worked as a producer with NDTV from 1996-2003.
17 | It is the home arena for SaPKo of the Mestis hockey league the second top league in Finland behind Liiga.
18 | As of the 2011 apportionment, the district includes the Middlesex County municipalities of East Brunswick Township, Edison Township, Helmetta Borough, Highland Park Borough, Metuchen Borough, South Plainfield Borough and South River Borough.
19 | Lembosiella is a genus of fungi in the Microthyriaceae family; according to the 2007 Outline of Ascomycota, the placement in this family is uncertain.
20 | Later, he resigned from his teaching profession in Jan 2013 and became a full time lyricist, dialog writer and part time researcher in Karky Research Foundation.
21 | It is used in Intel Core microarchitecture based DP-capable server processors, the Dual-Core Xeon is codenamed Dempsey, Woodcrest, and Wolfdale and the Quad-Core processors Clovertown, Harpertown.
22 | The 35th Annual TV Week Logie Awards was held on Friday 19 March 1993 at the Grand Hyatt in Melbourne, and broadcast on Network Ten.
23 | Daund Patas Road railway station is a small railway station in Pune district, Maharashtra.
24 | Evagjelia Veli (born 16 July 1991) is an Albanian weightlifter.
25 | It was published in two volumes that appeared a decade apart.
26 | He is now professor of medicine (biotechnology in public health) at the University of Bergen and chairs the Faculty Council, Faculty of Medicine, Norwegian University of Science and Technology.
27 | Erin McGathy (born December 5, 1985) is an American podcast host, artist, and comedian.
28 | The song is the second single from their debut mini album First Invasion and it was released as a digital single on August 4, 2010.
29 | Chandru who makes his debut in direction after assisting few Tamil films.
30 | Mitchum got the tune for the song from a Norwegian folk-dance (Gammel Reinlender) song his mother used to sing to him.
31 | The shell of the No 69 grenade was composed entirely of the hard plastic, Bakelite, which shattered without producing fragments like a metal bodied grenade.
32 | It was released on 24 January 2014.
33 | Stafford Loans are available both as subsidized and unsubsidized loans.
34 | On the World Wide Web, a query string is the part of a uniform resource locator (URL) containing data that does not fit conveniently into a hierarchical path structure.
35 | They have bar eyes, bare metasternum, bare metapisternum, the anterior anepisternum is usually pillose.
36 | In 1904, the mayoral term was changed to two years.
37 | It was earlier known as Central Mall but underwent renovations and some parts were re-organized in 2017 and was re-branded and re-launced on 26 February 2018.
38 | Jang Young-sik (born 1935) is a South Korean economist.
39 | It is a medium-sized damselfly with a short stout body, it is black with blue markings, and has long dark wings with pterostigma.
40 | The first desegregated hotel casino, it was popular with many of the black entertainers of the time, who would entertain at the other hotels and casinos and stay at the Moulin Rouge.
41 | In February 2009, it was revealed that the site was projected onto a wall at The Daily Telegraph to allow journalists there to view breaking news posted by users to Twitter.
42 | His most recent novel in this series, The Bangkok Asset, was published on 4 August 2015.
43 | He served as the 24th Governor of Nevada from 1979 to 1983.
44 | The soils which range from acid to alkaline and front wet to dry gives rise to a diverse woodland structure.
45 | Their land was taken back by the Spanish Crown; and then irretrievably lost however, when California became part of the United States.
46 | With annual billings of $220 million, Tombras is one of the top 25 largest independent national advertising agencies.
47 | The couple intended to retire to China and purchased a property in Canton; however the Communist victory in 1949 changed their plans and in 1950 the couple sold the vineyard and moved to Blockhouse Bay, Auckland.
48 | "Super Scooter Happy" was covered by Kyary Pamyu Pamyu on her 2013 album, Nanda Collection.
49 | Filling four CD-ROMs, Final Fantasy IX featured a cast containing a variety of major and minor characters.
50 | The album was produced by Billy Harvey, and featured contributions by Rafael Gayol and the Tosca String Quartet.
51 |
--------------------------------------------------------------------------------
/wikipedia-sentences/flows/flow.yml:
--------------------------------------------------------------------------------
1 | jtype: Flow # This file defines the Flow (both index and query) for the wikipedia sentences example
2 | version: '1' # This is the yml file version
3 | with: # Additional arguments for the Flow
4 |   workspace: $JINA_WORKSPACE # Workspace folder path
5 |   port_expose: $JINA_PORT # Network port for the Flow
6 | executors: # Now, define the Executors that run on this Flow
7 |   - name: transformer # This Executor computes an embedding based on the input text Documents
8 |     uses: 'jinahub+docker://TransformerTorchEncoder/v0.1' # We use a Transformer Torch Encoder from the hub as a docker container
9 |   - name: indexer # Now, index the text Documents with the embeddings
10 |     uses: 'jinahub://SimpleIndexer/old' # We use the SimpleIndexer for this purpose
--------------------------------------------------------------------------------
/wikipedia-sentences/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | DATASET="mikeortman/wikipedia-sentences"
3 | DATA_DIR="data"
4 | LINES=3000
5 |
6 |
7 |
8 | cd ${DATA_DIR}
9 | kaggle datasets download -d ${DATASET}
10 | unzip wikipedia-sentences.zip
11 | rm -f toy-data.txt
12 | rm -f wikipedia-sentences.zip
13 | mv wikisent2.txt input.txt
14 |
--------------------------------------------------------------------------------
/wikipedia-sentences/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.1
2 | jina[standard]==2.0.18
3 | git+git://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/examples/d8f903278597254bd96b1ed64fe8c9feefaa265c/wikipedia-sentences/tests/__init__.py
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Patch the birthday problem for random parts"""
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope='function', autouse=True)
7 | def patched_random_port(mocker):
8 | used_ports = set()
9 | from jina.helper import random_port
10 | from jina.excepts import NoAvailablePortError
11 |
12 | def _random_port():
13 |
14 | for i in range(10):
15 | _port = random_port()
16 |
17 | if _port is not None and _port not in used_ports:
18 | used_ports.add(_port)
19 | return _port
20 | raise NoAvailablePortError
21 |
22 | mocker.patch('jina.helper.random_port', new_callable=lambda: _random_port)
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==6.2.4
2 | click==8.0.1
3 | jina[standard]==2.0.18
4 | git+git://github.com/jina-ai/jina-commons@v0.0.3
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/test_wikipediasearch.py:
--------------------------------------------------------------------------------
1 | __copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
2 | __license__ = "Apache-2.0"
3 |
4 | import os
5 | import sys
6 | from click.testing import CliRunner
7 |
8 | sys.path.append('..')
9 | from app import main
10 |
11 |
12 | def config(tmpdir):
13 | os.environ['JINA_WORKSPACE'] = os.path.join(tmpdir, 'workspace')
14 |
15 |
16 | def test_wikipedia_sentences(tmpdir):
17 | config(tmpdir)
18 | runner = CliRunner()
19 | result = runner.invoke(main, ['-t', 'index'])
20 | assert "done in" in result.stdout
21 | assert result.stderr_bytes is None
22 | result = runner.invoke(main, ['-t', 'query'])
23 | print(result.stdout)
24 | assert result.stderr_bytes is None
25 |
--------------------------------------------------------------------------------
/wikipedia-sentences/tests/toy-input.txt:
--------------------------------------------------------------------------------
1 | ../data/toy-input.txt
--------------------------------------------------------------------------------