├── src ├── __init__.py ├── utils │ ├── __init__.py │ ├── test.png │ ├── timecontext.py │ ├── profiler.py │ └── benchmark.py ├── document_attributes.py ├── document_non_empty_fields.py ├── document_uri.py ├── document_chunks.py ├── document_matches.py ├── document_pop.py ├── document_content_hash.py ├── document_content_type.py ├── document_array_reverse.py ├── document_plot.py ├── document_copy_from.py ├── document_merge_from.py ├── executor.py ├── document_array_insert.py ├── document_get_sparse_blob.py ├── document_get_sparse_embedding.py ├── document_parent_id.py ├── pages.py ├── document_array_sort.py ├── document_graph_adjacency.py ├── document_dict.py ├── document_json.py ├── document_id.py ├── document_array_shuffle.py ├── document_array_clear.py ├── document_content.py ├── document_embedding.py ├── document_weight.py ├── document_granularity.py ├── document_mime_type.py ├── document_array_append.py ├── document_clear.py ├── document_scores.py ├── document_tags.py ├── document_evaluations.py ├── document_update.py ├── document_modality.py ├── document_array_persistence.py ├── document_array_save_json_load_json.py ├── document_array_save_binary_load_binary.py ├── document_array_embeddings.py ├── document_array_save.py ├── zed_runtime_callback.py ├── document_set_attributes.py ├── document_get_attributes.py ├── document_array_extend.py ├── document_array_traverse.py ├── document_property_getter.py ├── document_array_match.py ├── flow.py ├── document_array_construct.py ├── document_array_get_attributes.py ├── document_graph_construction.py ├── searchers_compare.py ├── document_conversions_blob_image_uri_text.py └── document_construct.py ├── .github ├── CODEOWNERS └── workflows │ ├── create-pr.yml │ ├── gh-page.yml │ └── pr.yml ├── docs ├── static │ ├── CNAME │ └── artifacts │ │ ├── 2.1.2 │ │ └── searchers_compare.json │ │ └── 2.0.12 │ │ └── report.json ├── archetypes │ └── default.md └── config.yml ├── pyproject.toml ├── .gitmodules ├── requirements.txt ├── Dockerfile ├── .pre-commit-config.yaml ├── README.md ├── conftest.py ├── .gitignore ├── LICENSE └── scripts └── site_generator.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @maateen 2 | -------------------------------------------------------------------------------- /docs/static/CNAME: -------------------------------------------------------------------------------- 1 | benchmark.jina.ai -------------------------------------------------------------------------------- /src/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/benchmark/HEAD/src/utils/test.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | testpaths = ["src"] 3 | python_files = "*.py" 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule 
"docs/themes/book"] 2 | path = docs/themes/book 3 | url = https://github.com/alex-shpak/hugo-book 4 | -------------------------------------------------------------------------------- /docs/archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | draft: true 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pympler==0.9 2 | faker==8.11.0 3 | packaging==21.0 4 | pytest==6.2.4 5 | pytest-json-report==1.4.0 6 | scipy==1.7.1 7 | Pillow==8.3.2 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG JINA_VER 2 | 3 | FROM jinaai/jina:$JINA_VER 4 | 5 | WORKDIR /app 6 | 7 | ADD requirements.txt . 8 | 9 | # install dependencies 10 | RUN apt-get update && \ 11 | apt-get install -y gcc && \ 12 | pip3 install -r requirements.txt 13 | 14 | # run benchmark 15 | ENTRYPOINT ["pytest"] 16 | -------------------------------------------------------------------------------- /src/utils/timecontext.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimeContext: 5 | """Timing a code snippet with a context manager.""" 6 | 7 | def __enter__(self): 8 | self.start = time.time_ns() 9 | return self 10 | 11 | def __exit__(self, typ, value, traceback): 12 | self.duration = self.time_since_start() 13 | 14 | def time_since_start(self): 15 | return time.time_ns() - self.start 16 | -------------------------------------------------------------------------------- /.github/workflows/create-pr.yml: -------------------------------------------------------------------------------- 1 | name: Create PR 2 | 3 | on: 4 | push: 5 | branches: 6 | - "benchmark-*" 7 | 8 | jobs: 9 | create-pr: 10 | runs-on: ubuntu-latest 11 | if: ${{ github.actor == 'jina-bot' }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: create PR 16 | id: open-pr 17 | uses: repo-sync/pull-request@v2 18 | with: 19 | pr_label: automerge 20 | destination_branch: "main" 21 | pr_body: "This is an automated PR." 
22 | github_token: ${{ secrets.JINA_DEV_BOT }} 23 | -------------------------------------------------------------------------------- /src/document_attributes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 9 | def test_document_attributes(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 12 | 13 | def _attributes(docs): 14 | for d in docs: 15 | aux = d.attributes() 16 | 17 | result = benchmark_time(setup=_input_docs, func=_attributes) 18 | 19 | json_writer.append( 20 | page=Pages.DOCUMENT_HELPER, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | -------------------------------------------------------------------------------- /src/document_non_empty_fields.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 9 | def test_document_non_empty_fields(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 12 | 13 | def _non_empty_fields(docs): 14 | for d in docs: 15 | aux = d.dict() 16 | 17 | result = benchmark_time(setup=_input_docs, func=_non_empty_fields) 18 | 19 | json_writer.append( 20 | page=Pages.DOCUMENT_HELPER, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | -------------------------------------------------------------------------------- /src/document_uri.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 10_000]) 9 | def test_document_uri(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 14 | ) 15 | 16 | def _doc_uri(docs): 17 | for doc in docs: 18 | _ = doc.uri 19 | 20 | result = benchmark_time(setup=_input_docs, func=_doc_uri) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_CONTENT, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | -------------------------------------------------------------------------------- /src/document_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_chunks(num_docs, json_writer): 10 | def _input_docs(): 11 | doc = Document() 12 | doc.chunks = [Document(text=f"d{i}") for i in range(num_docs)] 13 | return ((), {"doc": doc}) 14 | 15 | def _get_chunks(doc): 16 | return doc.chunks 17 | 18 | result = benchmark_time(setup=_input_docs, func=_get_chunks) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_RECURSIVE, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | -------------------------------------------------------------------------------- /src/document_matches.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_matches(num_docs, json_writer): 10 | def _input_docs(): 11 | doc = Document(text="d1") 12 | doc.matches = [Document(text=f"d{i}") for i in range(num_docs)] 13 | return ((), {"doc": doc}) 14 | 15 | def _get_matches(doc): 16 | return doc.matches 17 | 18 | result = benchmark_time(setup=_input_docs, func=_get_matches) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_RECURSIVE, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | -------------------------------------------------------------------------------- /src/document_pop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 9 | def test_document_document_pop(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict( 12 | docs=DocumentArray([Document(text='hey here') for _ in range(num_docs)]) 13 | ) 14 | 15 | def _pop_text(docs): 16 | for d in docs: 17 | d.pop('text') 18 | 19 | result = benchmark_time(setup=_input_docs, func=_pop_text) 20 | 21 | json_writer.append( 22 | page=Pages.DOCUMENT_HELPER, 23 | result=result, 24 | metadata=dict(num_docs=num_docs), 25 | ) 26 | -------------------------------------------------------------------------------- /src/document_content_hash.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 9 | def test_document_document_content_hash(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {'docs': [Document(text=f'text doc {i}') for i in range(num_docs)]}, 14 | ) 15 | 16 | def _content_hash(docs): 17 | for d in docs: 18 | d.content_hash 19 | 20 | result = benchmark_time(setup=_input_docs, func=_content_hash) 21 | json_writer.append( 22 | page=Pages.DOCUMENT_META, 23 | result=result, 24 | metadata=dict(num_docs=num_docs), 25 | ) 26 | -------------------------------------------------------------------------------- /src/document_content_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_content_type(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 14 | ) 15 | 16 | def _doc_content_type(docs): 17 | for doc in docs: 18 | _ = doc.content_type 19 | 20 | result = benchmark_time(setup=_input_docs, func=_doc_content_type) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_META, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | -------------------------------------------------------------------------------- /src/document_array_reverse.py: -------------------------------------------------------------------------------- 1 
| import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_reverse(num_docs, json_writer): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_reverse(da): 18 | da.reverse() 19 | 20 | result = benchmark_time( 21 | setup=_setup, 22 | func=_da_reverse, 23 | n=NUM_REPETITIONS, 24 | ) 25 | 26 | json_writer.append( 27 | page=Pages.DA_INSERT, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | -------------------------------------------------------------------------------- /src/document_plot.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | from jina.helper import random_identity 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | random_identity(use_uuid1=True) 9 | 10 | 11 | @pytest.mark.parametrize("num_docs", [1, 5]) 12 | def test_document_plot(num_docs, json_writer, ephemeral_tmpdir): 13 | def _input_docs(): 14 | return ( 15 | (), 16 | dict(docs=[Document(text="doc") for _ in range(num_docs)]), 17 | ) 18 | 19 | def _plot(docs): 20 | for d in docs: 21 | d.plot() 22 | 23 | result = benchmark_time(setup=_input_docs, func=_plot) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/document_copy_from.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [1, 100, 10_000]) 9 | def test_document_copy_from(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [Document(text=f"{i}") for i in range(num_docs)], 15 | "doc": Document(text="newdoc"), 16 | }, 17 | ) 18 | 19 | def _copy_from(docs, doc): 20 | for d in docs: 21 | d.CopyFrom(doc) 22 | 23 | result = benchmark_time(setup=_input_docs, func=_copy_from) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/document_merge_from.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [1, 100, 10_000]) 9 | def test_document_merge_from(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [Document(text=f"{i}") for i in range(num_docs)], 15 | "doc": Document(text="newdoc"), 16 | }, 17 | ) 18 | 19 | def _merge_from(docs, doc): 20 | for d in docs: 21 | d.MergeFrom(doc) 22 | 23 | result = benchmark_time(setup=_input_docs, func=_merge_from) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/executor.py: 
-------------------------------------------------------------------------------- 1 | from jina import Executor, requests 2 | 3 | from .pages import Pages 4 | from .utils.benchmark import benchmark_time 5 | 6 | NUM_REPETITIONS = 100 7 | 8 | 9 | class DummyLoadExecutor(Executor): 10 | def __init__(self, a, b, c, d, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | 13 | @requests 14 | def foo(self, **kwargs): 15 | pass 16 | 17 | 18 | executor_yaml = ''' 19 | jtype: DummyLoadExecutor 20 | with: 21 | a: 0 22 | b: 1 23 | c: 2 24 | d: 3 25 | metas: 26 | name: dummy-executor 27 | ''' 28 | 29 | 30 | def test_executor_load_config(json_writer): 31 | def _build(): 32 | _ = Executor.load_config(executor_yaml) 33 | 34 | result = benchmark_time(func=_build) 35 | 36 | json_writer.append( 37 | page=Pages.EXECUTOR, 38 | result=result, 39 | metadata={}, 40 | ) 41 | -------------------------------------------------------------------------------- /src/document_array_insert.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_insert(num_docs, json_writer): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray() 15 | return (), dict(da=da, docs=docs) 16 | 17 | def _insert_in_da(da, docs): 18 | for doc in docs: 19 | da.insert(index=0, doc=doc) 20 | 21 | result = benchmark_time( 22 | setup=_setup, 23 | func=_insert_in_da, 24 | n=NUM_REPETITIONS, 25 | ) 26 | 27 | json_writer.append( 28 | page=Pages.DA_INSERT, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | -------------------------------------------------------------------------------- /src/document_get_sparse_blob.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scipy.sparse as sp 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_get_sparse_blob_scipy(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(blob=sp.csr_matrix([0, 0, 4, 0, 1])) 17 | for _ in range(num_docs) 18 | ] 19 | }, 20 | ) 21 | 22 | def _get_sparse_blob(docs): 23 | for d in docs: 24 | d.blob 25 | 26 | result = benchmark_time(setup=_input_docs, func=_get_sparse_blob) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_CONTENT, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | -------------------------------------------------------------------------------- /src/document_get_sparse_embedding.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scipy.sparse as sp 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_get_sparse_embedding_scipy(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(blob=sp.csr_matrix([0, 0, 4, 0, 1])) 17 | for i in range(num_docs) 18 | ] 19 | }, 20 | ) 21 | 22 | def _get_sparse_blob(docs): 23 | for d in docs: 24 | d.embedding 25 | 26 
| result = benchmark_time(setup=_input_docs, func=_get_sparse_blob) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_CONTENT, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | -------------------------------------------------------------------------------- /src/document_parent_id.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_parent_id(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "chunks": [ 15 | Document( 16 | chunks=[Document(text="d1 original text", id=str(i))], id="123" 17 | ).chunks[0] 18 | for i in range(num_docs) 19 | ] 20 | }, 21 | ) 22 | 23 | def _parent_id(chunks): 24 | for c in chunks: 25 | c.parent_id 26 | 27 | result = benchmark_time(setup=_input_docs, func=_parent_id) 28 | 29 | json_writer.append( 30 | page=Pages.DOCUMENT_META, 31 | result=result, 32 | metadata=dict(num_docs=num_docs), 33 | ) 34 | -------------------------------------------------------------------------------- /src/pages.py: -------------------------------------------------------------------------------- 1 | class Pages: 2 | DA_APPEND = 'document_array_append' 3 | DA_CLEAR = 'document_array_clear' 4 | DA_CONSTRUCT = 'document_array_construct' 5 | DA_EXTEND = 'document_array_extend' 6 | DA_GET_ATTRIBUTES = 'document_array_get_attributes' 7 | DA_INSERT = 'document_array_insert' 8 | DA_MATCH = 'document_array_match' 9 | DA_PERSISTENCE = 'document_array_persistence' 10 | DA_SHUFFLE = 'document_array_shuffle' 11 | DA_SORT = 'document_array_sort' 12 | DA_TRAVERSE = 'document_array_traverse' 13 | DOCUMENT_CONSTRUCT = 'document_construct' 14 | DOCUMENT_CONTENT = 'document_content_attributes' 15 | DOCUMENT_CONVERSION = 'document_conversion' 16 | DOCUMENT_GRAPH = 'document_graph' 17 | DOCUMENT_HELPER = 'document_helper_functions' 18 | DOCUMENT_META = 'document_meta_attributes' 19 | DOCUMENT_RECURSIVE = 'document_recursive_attributes' 20 | DOCUMENT_RELEVANCE = 'document_relevance_attributes' 21 | EXECUTOR = 'executor' 22 | FLOW = 'flow' 23 | INDEXER_COMPARISON = 'indexer_comparison' 24 | -------------------------------------------------------------------------------- /src/document_array_sort.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | from jina import Document, DocumentArray 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | NUM_REPETITIONS = 25 11 | NUM_DOCS = 1000 12 | CHARS = tuple(string.ascii_uppercase + string.digits) 13 | 14 | 15 | def _get_docs(num_docs): 16 | return [Document(scores={'cosine': random.random()}) for _ in range(num_docs)] 17 | 18 | 19 | @pytest.mark.parametrize('num_docs', [100, 100_000]) 20 | def test_da_sort(num_docs, json_writer): 21 | def _sort(da): 22 | da.sort(key=lambda x: x.scores['cosine'].value) 23 | 24 | def _build_da(**kwargs): 25 | docs = kwargs.get('docs') 26 | da = DocumentArray(docs) 27 | return (), dict(da=da) 28 | 29 | result = benchmark_time( 30 | setup=_build_da, 31 | func=_sort, 32 | n=NUM_REPETITIONS, 33 | kwargs=dict(docs=_get_docs(num_docs)), 34 | ) 35 | 36 | json_writer.append( 37 | page=Pages.DA_SORT, 38 | result=result, 39 | metadata=dict(num_docs=num_docs), 40 | ) 41 | 
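All of the benchmarks in this repository follow the same calling convention: a `_setup` callable returns an `(args, kwargs)` pair for the timed function, `benchmark_time(setup=..., func=..., teardown=..., n=...)` runs and times it, and the returned result is handed to the `json_writer` fixture together with a page constant from `src/pages.py`. The timing helper itself lives in `src/utils/benchmark.py` and is not reproduced in this snapshot; the sketch below only illustrates that calling convention on top of `TimeContext`, and every name, field and default in it is an assumption rather than the actual implementation.

```python
# Illustrative sketch only -- src/utils/benchmark.py is not shown in this snapshot,
# so the names, fields and defaults below are assumptions, not the real helper.
import statistics

from .timecontext import TimeContext  # assumes the sketch sits next to timecontext.py


def benchmark_time(func, setup=None, teardown=None, n=1, kwargs=None):
    """Time `func` for `n` repetitions, rebuilding its inputs via `setup` each time."""
    durations_ns = []
    for _ in range(n):
        if setup is not None:
            # the tests define setup() so that it returns the (args, kwargs) for func
            args, func_kwargs = setup(**(kwargs or {}))
        else:
            args, func_kwargs = (), dict(kwargs or {})
        with TimeContext() as timer:
            func(*args, **func_kwargs)
        durations_ns.append(timer.duration)
        if teardown is not None:
            teardown()  # e.g. remove files written during this repetition
    # the real helper also supports memory profiling (see `profile_cls` and
    # `result.profiles` in src/document_embedding.py); that part is omitted here
    return {
        'iterations': n,
        'mean_ns': statistics.mean(durations_ns),
        'std_ns': statistics.pstdev(durations_ns),
    }
```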
-------------------------------------------------------------------------------- /.github/workflows/gh-page.yml: -------------------------------------------------------------------------------- 1 | name: Github Page 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | inputs: 9 | reason: 10 | description: "Why?" 11 | required: true 12 | default: "Just casually!" 13 | 14 | jobs: 15 | build-deploy: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | with: 20 | submodules: true 21 | fetch-depth: 0 22 | 23 | - name: Set up Python 3.9 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: 3.9 27 | 28 | - name: Setup Hugo 29 | uses: peaceiris/actions-hugo@v2 30 | with: 31 | hugo-version: "0.82.0" 32 | extended: true 33 | 34 | - name: Generate site 35 | run: python scripts/site_generator.py 36 | 37 | - name: Build Site 38 | run: | 39 | cd docs 40 | hugo --minify 41 | 42 | - name: Deploy Site 43 | uses: peaceiris/actions-gh-pages@v3 44 | with: 45 | personal_token: ${{ secrets.JINA_DEV_BOT }} 46 | publish_branch: gh-pages 47 | publish_dir: ./docs/public 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: check-ast 8 | - id: check-added-large-files 9 | - id: check-yaml 10 | - id: end-of-file-fixer 11 | - id: trailing-whitespace 12 | 13 | - repo: https://github.com/psf/black 14 | rev: 20.8b1 15 | hooks: 16 | - id: black 17 | name: Black 18 | types: [python] 19 | files: ^(src/|scripts/) 20 | args: 21 | - -S 22 | 23 | - repo: https://github.com/PyCQA/isort 24 | rev: 5.9.3 25 | hooks: 26 | - id: isort 27 | name: Sort Python Imports 28 | 29 | - repo: https://github.com/sirosen/check-jsonschema 30 | rev: 0.4.1 31 | hooks: 32 | - id: check-github-workflows 33 | files: ^(.github/workflows/) 34 | 35 | - repo: local 36 | hooks: 37 | - id: site_generator 38 | name: Site Generator 39 | entry: python scripts/site_generator.py 40 | language: python 41 | types: [python] 42 | additional_dependencies: ['packaging==21.0', 'requests==2.26.0'] 43 | -------------------------------------------------------------------------------- /src/document_graph_adjacency.py: -------------------------------------------------------------------------------- 1 | # from jina import Document 2 | # from jina.types.document.graph import GraphDocument 3 | # 4 | # from .pages import Pages 5 | # from .utils.benchmark import benchmark_time 6 | # 7 | # 8 | # def test_empty_document_graph_adjacency(json_writer): 9 | # def _input_graphdoc(): 10 | # return ((), {"gdoc": GraphDocument()}) 11 | # 12 | # def _doc_get_adjacency(gdoc): 13 | # _ = gdoc.adjacency 14 | # 15 | # result = benchmark_time(setup=_input_graphdoc, func=_doc_get_adjacency) 16 | # 17 | # json_writer.append( 18 | # page=Pages.DOCUMENT_META, 19 | # result=result, 20 | # ) 21 | # 22 | # 23 | # def test_document_graph_adjacency(json_writer): 24 | # def _input_graphdoc(): 25 | # gdoc = GraphDocument() 26 | # gdoc.add_edges( 27 | # [Document(id=1), Document(id=2)], [Document(id=3), Document(id=1)] 28 | # ) 29 | # 30 | # return ((), {"gdoc": gdoc}) 31 | # 32 | # def _doc_get_adjacency(gdoc): 33 | # _ = gdoc.adjacency 34 | # 35 | # result = benchmark_time(setup=_input_graphdoc, 
func=_doc_get_adjacency) 36 | # 37 | # json_writer.append( 38 | # page=Pages.DOCUMENT_META, 39 | # result=result, 40 | # ) 41 | -------------------------------------------------------------------------------- /src/document_dict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 10 | def test_document_dict_with_text(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 13 | 14 | def _dict(docs): 15 | for d in docs: 16 | aux = d.dict() 17 | 18 | result = benchmark_time(setup=_input_docs, func=_dict) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 28 | def test_document_dict_with_array(num_docs, json_writer): 29 | def _input_docs(): 30 | return (), dict(docs=[Document(blob=np.array([1, 2])) for _ in range(num_docs)]) 31 | 32 | def _dict(docs): 33 | for d in docs: 34 | aux = d.dict() 35 | 36 | result = benchmark_time(setup=_input_docs, func=_dict) 37 | 38 | json_writer.append( 39 | page=Pages.DOCUMENT_HELPER, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | -------------------------------------------------------------------------------- /src/document_json.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 10 | def test_document_json_with_text(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 13 | 14 | def _dict(docs): 15 | for d in docs: 16 | aux = d.json() 17 | 18 | result = benchmark_time(setup=_input_docs, func=_dict) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 28 | def test_document_json_with_array(num_docs, json_writer): 29 | def _input_docs(): 30 | return (), dict(docs=[Document(blob=np.array([1, 2])) for _ in range(num_docs)]) 31 | 32 | def _dict(docs): 33 | for d in docs: 34 | aux = d.json() 35 | 36 | result = benchmark_time(setup=_input_docs, func=_dict) 37 | 38 | json_writer.append( 39 | page=Pages.DOCUMENT_HELPER, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | -------------------------------------------------------------------------------- /src/document_id.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | from jina.helper import random_identity 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | random_identity(use_uuid1=True) 9 | 10 | 11 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 12 | @pytest.mark.parametrize("use_uuid1", [True, False]) 13 | def test_document_document_generate_id(num_docs, use_uuid1, json_writer): 14 | def _generate_id(): 15 | for _ in range(num_docs): 16 | random_identity(use_uuid1) 17 | 18 | result = 
benchmark_time(func=_generate_id) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 28 | def test_document_document_get_id(num_docs, json_writer): 29 | def _input_docs(): 30 | return ( 31 | (), 32 | dict( 33 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 34 | ), 35 | ) 36 | 37 | def _get_id(docs): 38 | for d in docs: 39 | aux = d.id 40 | 41 | result = benchmark_time(setup=_input_docs, func=_get_id) 42 | 43 | json_writer.append( 44 | page=Pages.DOCUMENT_META, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | -------------------------------------------------------------------------------- /src/document_array_shuffle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('memmap', [False, True]) 9 | @pytest.mark.parametrize('n_docs', [1000, 10_000]) 10 | def test_da_shuffle(name, memmap, n_docs, ephemeral_tmpdir, json_writer): 11 | def _setup(memmap, n_docs): 12 | docs = [Document(text=f'Document{i}') for i in range(n_docs)] 13 | da = ( 14 | DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 15 | if memmap 16 | else DocumentArray() 17 | ) 18 | da.extend(docs) 19 | return (), dict(da=da) 20 | 21 | def _shuffle_da(da): 22 | da.shuffle() 23 | 24 | def _teardown(): 25 | import os 26 | import shutil 27 | 28 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 29 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 30 | 31 | result = benchmark_time( 32 | setup=_setup, 33 | func=_shuffle_da, 34 | teardown=_teardown, 35 | kwargs=dict(memmap=memmap, n_docs=n_docs), 36 | ) 37 | if memmap: 38 | name = name.replace('_da_', '_dam_') 39 | json_writer.append( 40 | name=name, 41 | page=Pages.DA_SHUFFLE, 42 | result=result, 43 | metadata=dict(n_nodes=memmap, n_docs=n_docs), 44 | ) 45 | -------------------------------------------------------------------------------- /src/document_array_clear.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 9 | def test_da_clear(num_docs, json_writer): 10 | def _setup(): 11 | da = DocumentArray([Document(text=f'doc{i}') for i in range(num_docs)]) 12 | return (), dict(da=da) 13 | 14 | def _da_clear(da): 15 | da.clear() 16 | 17 | result = benchmark_time(setup=_setup, func=_da_clear) 18 | 19 | json_writer.append( 20 | page=Pages.DA_CLEAR, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 27 | def test_dam_clear(num_docs, json_writer, ephemeral_tmpdir): 28 | def _setup(): 29 | dam = DocumentArrayMemmap((f'{str(ephemeral_tmpdir)}/memmap')) 30 | dam.extend([Document(text=f'doc{i}') for i in range(num_docs)]) 31 | return (), dict(dam=dam) 32 | 33 | def _dam_clear(dam): 34 | dam.clear() 35 | 36 | def _teardown(): 37 | import shutil 38 | 39 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 40 | 41 | result = benchmark_time(setup=_setup, func=_dam_clear, teardown=_teardown) 42 | 43 | json_writer.append( 44 | 
page=Pages.DA_CLEAR, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | -------------------------------------------------------------------------------- /src/document_content.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_get_content(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 15 | ) 16 | 17 | def _doc_get_content(docs): 18 | for doc in docs: 19 | _ = doc.content 20 | 21 | result = benchmark_time(setup=_input_docs, func=_doc_get_content) 22 | 23 | json_writer.append( 24 | page=Pages.DOCUMENT_CONTENT, 25 | result=result, 26 | metadata=dict(num_docs=num_docs), 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 31 | def test_document_set_content(num_docs, json_writer): 32 | def _input_docs(): 33 | return ( 34 | (), 35 | {"docs": [Document(blob=np.array([1, 2])) for i in range(num_docs)]}, 36 | ) 37 | 38 | def _doc_get_content(docs): 39 | x = np.array([2, 3, 4]) 40 | for doc in docs: 41 | doc.content = x 42 | 43 | result = benchmark_time(setup=_input_docs, func=_doc_get_content) 44 | 45 | json_writer.append( 46 | page=Pages.DOCUMENT_CONTENT, 47 | result=result, 48 | metadata=dict(num_docs=num_docs), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, Executor, requests 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | NUM_DOCS = 100 9 | 10 | 11 | class DummyEncoder(Executor): 12 | @requests 13 | def encode(self, docs, **kwargs): 14 | texts = docs.get_attributes('text') 15 | embeddings = [np.random.rand(1, 1024) for _ in texts] 16 | for doc, embedding in zip(docs, embeddings): 17 | doc.embedding = embedding 18 | 19 | 20 | @pytest.fixture() 21 | def input_docs(): 22 | return DocumentArray([Document(text='hey here') for _ in range(NUM_DOCS)]) 23 | 24 | 25 | @pytest.fixture() 26 | def executor(): 27 | return DummyEncoder() 28 | 29 | 30 | @pytest.mark.skip() 31 | def test_document_encoder_executor(executor, input_docs, json_writer): 32 | def _function(**kwargs): 33 | executor.encode(input_docs) 34 | 35 | result = benchmark_time(profile_cls=[Document, DocumentArray], func=_function) 36 | profiles = result.profiles 37 | document_profile = profiles[0] 38 | document_array_profile = profiles[1] 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict( 44 | profiles=dict( 45 | Document=document_profile, DocumentArray=document_array_profile 46 | ), 47 | num_docs=NUM_DOCS, 48 | ), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_weight.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_set_weight(num_docs, json_writer): 10 | def _input_docs(): 
11 | return ( 12 | (), 13 | dict( 14 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 15 | ), 16 | ) 17 | 18 | def _set_weight(docs): 19 | for d in docs: 20 | d.weight = 2.3 21 | 22 | result = benchmark_time(setup=_input_docs, func=_set_weight) 23 | 24 | json_writer.append( 25 | page=Pages.DOCUMENT_META, 26 | result=result, 27 | metadata=dict(num_docs=num_docs), 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 32 | def test_document_document_get_weight(num_docs, json_writer): 33 | def _input_docs(): 34 | return ( 35 | (), 36 | dict( 37 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 38 | ), 39 | ) 40 | 41 | def _get_weight(docs): 42 | for d in docs: 43 | aux = d.weight 44 | 45 | result = benchmark_time(setup=_input_docs, func=_get_weight) 46 | 47 | json_writer.append( 48 | page=Pages.DOCUMENT_META, 49 | result=result, 50 | metadata=dict(num_docs=num_docs), 51 | ) 52 | -------------------------------------------------------------------------------- /src/document_granularity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_get_granularity(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | {"docs": [Document(text=f"d{i}", granularity=2) for i in range(num_docs)]}, 15 | ) 16 | 17 | def _doc_get_granularity(docs): 18 | for doc in docs: 19 | _ = doc.granularity 20 | 21 | result = benchmark_time(setup=_input_docs, func=_doc_get_granularity) 22 | 23 | json_writer.append( 24 | page=Pages.DOCUMENT_META, 25 | result=result, 26 | metadata=dict(num_docs=num_docs), 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 31 | def test_document_set_granularity(num_docs, json_writer): 32 | def _input_docs(): 33 | return ( 34 | (), 35 | {"docs": [Document(text=f"d{i}", granularity=2) for i in range(num_docs)]}, 36 | ) 37 | 38 | def _doc_set_granularity(docs): 39 | x = np.array([2, 3, 4]) 40 | for doc in docs: 41 | doc.granularity = 3 42 | 43 | result = benchmark_time(setup=_input_docs, func=_doc_set_granularity) 44 | 45 | json_writer.append( 46 | page=Pages.DOCUMENT_META, 47 | result=result, 48 | metadata=dict(num_docs=num_docs), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_mime_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_mime_type(num_docs, json_writer): 10 | def _input_docs(): 11 | docs = [] 12 | for i in range(num_docs): 13 | d = Document(text=f"d{i}") 14 | d.mime_type = "text" 15 | docs.append(d) 16 | 17 | return ( 18 | (), 19 | {"docs": docs}, 20 | ) 21 | 22 | def _get_mime_type(docs): 23 | for doc in docs: 24 | _ = doc.mime_type 25 | 26 | result = benchmark_time(setup=_input_docs, func=_get_mime_type) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_META, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 36 | def test_document_set_mime_type(num_docs, json_writer): 37 | def 
_input_docs(): 38 | return ( 39 | (), 40 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 41 | ) 42 | 43 | def _set_mime_type(docs): 44 | for doc in docs: 45 | doc.mime_type = "text" 46 | 47 | result = benchmark_time(setup=_input_docs, func=_set_mime_type) 48 | 49 | json_writer.append( 50 | page=Pages.DOCUMENT_META, 51 | result=result, 52 | metadata=dict(num_docs=num_docs), 53 | ) 54 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: PR Tests 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | check-black: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | with: 11 | fetch-depth: 0 12 | 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.9 17 | 18 | - name: check black 19 | run: | 20 | pip install black==20.8b1 21 | black -S --check src/ 22 | black -S --check scripts/ 23 | 24 | check-site-generation: 25 | runs-on: ubuntu-latest 26 | needs: check-black 27 | steps: 28 | - uses: actions/checkout@v2 29 | with: 30 | fetch-depth: 0 31 | 32 | - name: Set up Python 3.9 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: 3.9 36 | 37 | - name: check site generation 38 | run: | 39 | pip install requests==2.26.0 packaging==21.0 40 | python scripts/site_generator.py 41 | git status 42 | git diff-index --quiet HEAD -- || exit 1 43 | 44 | - name: automerge 45 | uses: "pascalgn/automerge-action@v0.14.2" 46 | if: ${{ github.actor == 'jina-bot' }} 47 | env: 48 | GITHUB_TOKEN: "${{ secrets.JINA_DEV_BOT }}" 49 | MERGE_LABELS: automerge 50 | MERGE_METHOD: merge 51 | MERGE_COMMIT_MESSAGE: automatic 52 | MERGE_FILTER_AUTHOR: jina-bot 53 | MERGE_FORKS: false 54 | MERGE_DELETE_BRANCH: true 55 | -------------------------------------------------------------------------------- /src/document_array_append.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from faker import Faker 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | fake = Faker() 9 | Faker.seed(42) 10 | NUM_DOCS = 10000 11 | 12 | 13 | @pytest.fixture 14 | def docs(): 15 | return [Document(text=fake.text()) for _ in range(NUM_DOCS)] 16 | 17 | 18 | def test_da_append(docs, json_writer): 19 | def _append(da): 20 | for doc in docs: 21 | da.append(doc) 22 | 23 | def _setup(**kwargs): 24 | return (), dict(da=DocumentArray()) 25 | 26 | result = benchmark_time(setup=_setup, func=_append) 27 | 28 | json_writer.append( 29 | page=Pages.DA_APPEND, 30 | result=result, 31 | metadata=dict(num_docs_append=NUM_DOCS), 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize('flush', [True, False]) 36 | def test_dam_append(docs, flush, json_writer, ephemeral_tmpdir): 37 | def _append(da): 38 | for doc in docs: 39 | da.append(doc, flush=flush) 40 | 41 | def _setup(**kwargs): 42 | return (), dict(da=DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap')) 43 | 44 | def _teardown(): 45 | import shutil 46 | 47 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 48 | 49 | result = benchmark_time(setup=_setup, func=_append, teardown=_teardown) 50 | 51 | json_writer.append( 52 | page=Pages.DA_APPEND, 53 | result=result, 54 | metadata=dict(num_docs_append=NUM_DOCS, flush=flush), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_clear.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 10 | def test_document_document_clear_doc_with_1_field(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict( 13 | docs=DocumentArray([Document(text='hey here') for _ in range(num_docs)]) 14 | ) 15 | 16 | def _pop_text(docs): 17 | for d in docs: 18 | d.clear() 19 | 20 | result = benchmark_time(setup=_input_docs, func=_pop_text) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_HELPER, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | 28 | 29 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 30 | def test_document_document_clear_doc_with_2_fields(num_docs, json_writer): 31 | def _input_docs(): 32 | return (), dict( 33 | docs=DocumentArray( 34 | [ 35 | Document(text='hey here', embedding=np.array([1, 2, 3])) 36 | for _ in range(num_docs) 37 | ] 38 | ) 39 | ) 40 | 41 | def _pop_text(docs): 42 | for d in docs: 43 | d.pop('text') 44 | 45 | result = benchmark_time(setup=_input_docs, func=_pop_text) 46 | 47 | json_writer.append( 48 | page=Pages.DOCUMENT_HELPER, 49 | result=result, 50 | metadata=dict(num_docs=num_docs), 51 | ) 52 | -------------------------------------------------------------------------------- /src/document_scores.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_scores(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [ 15 | Document(text=f"d{i}", scores={"euclidean": 5, "cosine": 0.5}) 16 | for i in range(num_docs) 17 | ] 18 | }, 19 | ) 20 | 21 | def _doc_get_scores(docs): 22 | for doc in docs: 23 | _ = doc.scores["euclidean"] 24 | 25 | result = benchmark_time(setup=_input_docs, func=_doc_get_scores) 26 | 27 | json_writer.append( 28 | page=Pages.DOCUMENT_RELEVANCE, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | 33 | 34 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 35 | def test_document_set_scores(num_docs, json_writer): 36 | def _input_docs(): 37 | return ( 38 | (), 39 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 40 | ) 41 | 42 | def _doc_set_scores(docs): 43 | for doc in docs: 44 | doc.scores["euclidean"] = 23 45 | 46 | result = benchmark_time(setup=_input_docs, func=_doc_set_scores) 47 | 48 | json_writer.append( 49 | page=Pages.DOCUMENT_RELEVANCE, 50 | result=result, 51 | metadata=dict(num_docs=num_docs), 52 | ) 53 | -------------------------------------------------------------------------------- /src/document_tags.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_tags_setter(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | dict( 14 | docs=DocumentArray( 15 | [Document(tags={"tag1": "val1"}) for _ in range(num_docs)] 16 | ) 17 | ), 18 | ) 19 | 20 | def 
_tags_set(docs): 21 | for d in docs: 22 | d.tags["tag1"] = "newval1" 23 | 24 | result = benchmark_time(setup=_input_docs, func=_tags_set) 25 | 26 | json_writer.append( 27 | page=Pages.DOCUMENT_CONTENT, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 34 | def test_document_document_tags_getter(num_docs, json_writer): 35 | def _input_docs(): 36 | return ( 37 | (), 38 | dict( 39 | docs=DocumentArray( 40 | [Document(tags={"tag1": "val1"}) for _ in range(num_docs)] 41 | ) 42 | ), 43 | ) 44 | 45 | def _get_tags_tag1(docs): 46 | for d in docs: 47 | tag = d.tags.get("tag1") 48 | 49 | result = benchmark_time(setup=_input_docs, func=_get_tags_tag1) 50 | 51 | json_writer.append( 52 | page=Pages.DOCUMENT_CONTENT, 53 | result=result, 54 | metadata=dict(num_docs=num_docs), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_evaluations.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_evaluations(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [ 15 | Document(evaluations={'precision': 0.9}) for i in range(num_docs) 16 | ] 17 | }, 18 | ) 19 | 20 | def _doc_get_evaluations(docs): 21 | for doc in docs: 22 | _ = doc.evaluations['precision'].value 23 | 24 | result = benchmark_time(setup=_input_docs, func=_doc_get_evaluations) 25 | 26 | json_writer.append( 27 | page=Pages.DOCUMENT_RELEVANCE, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 34 | def test_document_set_evaluations(num_docs, json_writer): 35 | def _input_docs(): 36 | return ( 37 | (), 38 | { 39 | "docs": [ 40 | Document(evaluations={'precision': 0.9}) for i in range(num_docs) 41 | ] 42 | }, 43 | ) 44 | 45 | def _doc_set_evaluations(docs): 46 | for doc in docs: 47 | doc.evaluations['precision'] = 0.99 48 | 49 | result = benchmark_time(setup=_input_docs, func=_doc_set_evaluations) 50 | 51 | json_writer.append( 52 | page=Pages.DOCUMENT_RELEVANCE, 53 | result=result, 54 | metadata=dict(num_docs=num_docs), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_update.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_update_embedding(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(embedding=np.array([1, 2, 3])) for _ in range(num_docs) 17 | ], 18 | "new_doc": Document(embedding=np.array([4, 5, 6])), 19 | }, 20 | ) 21 | 22 | def _update_embedding(docs, new_doc): 23 | for d in docs: 24 | d.update(new_doc) 25 | 26 | result = benchmark_time(setup=_input_docs, func=_update_embedding) 27 | json_writer.append( 28 | page=Pages.DOCUMENT_HELPER, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | 33 | 34 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 35 | def test_document_document_update_text(num_docs, 
json_writer): 36 | def _input_docs(): 37 | return ( 38 | (), 39 | { 40 | "docs": [Document(text="original text") for _ in range(num_docs)], 41 | "new_doc": Document(text="new text"), 42 | }, 43 | ) 44 | 45 | def _update_text(docs, new_doc): 46 | for d in docs: 47 | d.update(new_doc) 48 | 49 | result = benchmark_time(setup=_input_docs, func=_update_text) 50 | json_writer.append( 51 | page=Pages.DOCUMENT_HELPER, 52 | result=result, 53 | metadata=dict(num_docs=num_docs), 54 | ) 55 | -------------------------------------------------------------------------------- /src/document_modality.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_modality_setter(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | dict( 15 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 16 | ), 17 | ) 18 | 19 | def _set_modality(docs): 20 | for d in docs: 21 | d.modality = "modality" 22 | 23 | result = benchmark_time(setup=_input_docs, func=_set_modality) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_META, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | 31 | 32 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 33 | def test_document_document_modality_getter(num_docs, json_writer): 34 | def _input_docs(): 35 | return ( 36 | (), 37 | dict( 38 | docs=DocumentArray( 39 | [ 40 | Document(text="hey here", embedding=np.array([1, 2, 3])) 41 | for _ in range(num_docs) 42 | ] 43 | ) 44 | ), 45 | ) 46 | 47 | def _get_modality(docs): 48 | for d in docs: 49 | aux = d.modality 50 | 51 | result = benchmark_time(setup=_input_docs, func=_get_modality) 52 | 53 | json_writer.append( 54 | page=Pages.DOCUMENT_META, 55 | result=result, 56 | metadata=dict(num_docs=num_docs), 57 | ) 58 | -------------------------------------------------------------------------------- /src/document_array_persistence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_DOCS = 100000 8 | 9 | 10 | @pytest.fixture 11 | def doc_array(): 12 | return DocumentArray( 13 | (Document(text=f'This is the document number: {i}') for i in range(NUM_DOCS)) 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize('file_format', ['json', 'binary']) 18 | def test_da_save(doc_array, file_format, json_writer, ephemeral_tmpdir): 19 | extension = 'bin' if file_format == 'binary' else 'json' 20 | file = f'{str(ephemeral_tmpdir)}/doc_array.{extension}' 21 | 22 | def _save(): 23 | doc_array.save(file, file_format=file_format) 24 | 25 | def _teardown(): 26 | import os 27 | 28 | os.remove(file) 29 | 30 | result = benchmark_time(func=_save, teardown=_teardown) 31 | 32 | json_writer.append( 33 | page=Pages.DA_PERSISTENCE, 34 | result=result, 35 | metadata=dict(num_docs_append=NUM_DOCS, file_format=file_format), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('file_format', ['json', 'binary']) 40 | def test_da_load(doc_array, file_format, json_writer, ephemeral_tmpdir): 41 | extension = 'bin' if file_format == 'binary' else 'json' 42 | file = f'{str(ephemeral_tmpdir)}/doc_array.{extension}' 43 | 44 | def _save(): 45 | 
doc_array.save(file, file_format=file_format) 46 | return (), dict() 47 | 48 | def _load(): 49 | DocumentArray.load(file, file_format=file_format) 50 | 51 | def _teardown(): 52 | import os 53 | 54 | os.remove(file) 55 | 56 | result = benchmark_time(setup=_save, func=_load, teardown=_teardown) 57 | 58 | json_writer.append( 59 | page=Pages.DA_PERSISTENCE, 60 | result=result, 61 | metadata=dict(num_docs_append=NUM_DOCS, file_format=file_format), 62 | ) 63 | -------------------------------------------------------------------------------- /src/document_array_save_json_load_json.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_save_json(num_docs, json_writer, ephemeral_tmpdir): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_save_json(da): 18 | da.save_json(f'{str(ephemeral_tmpdir)}/docarray.json') 19 | 20 | def _teardown(): 21 | import os 22 | 23 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.json') 24 | 25 | result = benchmark_time( 26 | setup=_setup, 27 | func=_da_save_json, 28 | teardown=_teardown, 29 | n=NUM_REPETITIONS, 30 | ) 31 | 32 | json_writer.append( 33 | page=Pages.DA_INSERT, 34 | result=result, 35 | metadata=dict(num_docs=num_docs), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 40 | def test_da_load_json(num_docs, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 43 | da = DocumentArray(docs) 44 | da.save_json(f'{str(ephemeral_tmpdir)}/docarray.json') 45 | return (), dict(da=da) 46 | 47 | def _da_load_json(da): 48 | da.load_json(f'{str(ephemeral_tmpdir)}/docarray.json') 49 | 50 | def _teardown(): 51 | import os 52 | 53 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.json') 54 | 55 | result = benchmark_time( 56 | setup=_setup, func=_da_load_json, teardown=_teardown, n=NUM_REPETITIONS 57 | ) 58 | 59 | json_writer.append( 60 | page=Pages.DA_INSERT, 61 | result=result, 62 | metadata=dict(num_docs=num_docs), 63 | ) 64 | -------------------------------------------------------------------------------- /src/document_array_save_binary_load_binary.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_save_binary(num_docs, json_writer, ephemeral_tmpdir): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_save_binary(da): 18 | da.save_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 19 | 20 | def _teardown(): 21 | import os 22 | 23 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.bin') 24 | 25 | result = benchmark_time( 26 | setup=_setup, 27 | func=_da_save_binary, 28 | teardown=_teardown, 29 | n=NUM_REPETITIONS, 30 | ) 31 | 32 | json_writer.append( 33 | page=Pages.DA_INSERT, 34 | result=result, 35 | metadata=dict(num_docs=num_docs), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 40 | def 
test_da_load_binary(num_docs, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 43 | da = DocumentArray(docs) 44 | da.save_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 45 | return (), dict(da=da) 46 | 47 | def _da_load_binary(da): 48 | da.load_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 49 | 50 | def _teardown(): 51 | import os 52 | 53 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.bin') 54 | 55 | result = benchmark_time( 56 | setup=_setup, 57 | func=_da_load_binary, 58 | teardown=_teardown, 59 | n=NUM_REPETITIONS, 60 | ) 61 | 62 | json_writer.append( 63 | page=Pages.DA_INSERT, 64 | result=result, 65 | metadata=dict(num_docs=num_docs), 66 | ) 67 | -------------------------------------------------------------------------------- /src/document_array_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | NUM_REPETITIONS = 10 9 | 10 | 11 | @pytest.mark.parametrize( 12 | 'num_docs,num_feat', [(100, 128), (10_000, 128), (10_000, 256)] 13 | ) 14 | def test_da_embeddings(num_docs, num_feat, json_writer): 15 | def _setup(): 16 | da = DocumentArray( 17 | [Document(embedding=np.random.random(num_feat)) for i in range(num_docs)] 18 | ) 19 | return (), dict(da=da) 20 | 21 | def _da_embeddings(da): 22 | embeddings = da.embeddings 23 | 24 | result = benchmark_time( 25 | setup=_setup, 26 | func=_da_embeddings, 27 | n=NUM_REPETITIONS, 28 | ) 29 | 30 | json_writer.append( 31 | page=Pages.DA_GET_ATTRIBUTES, 32 | result=result, 33 | metadata=dict(num_docs=num_docs, num_feat=num_feat), 34 | ) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | 'num_docs,num_feat', [(100, 128), (10_000, 128), (10_000, 256)] 39 | ) 40 | def test_dam_embeddings(num_docs, num_feat, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 43 | dam.extend( 44 | [Document(embedding=np.random.rand(num_feat)) for i in range(num_docs)] 45 | ) 46 | return (), dict(dam=dam) 47 | 48 | def _dam_embeddings(dam): 49 | embeddings = dam.embeddings 50 | 51 | def _teardown(): 52 | import shutil 53 | 54 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 55 | 56 | result = benchmark_time( 57 | setup=_setup, 58 | func=_dam_embeddings, 59 | teardown=_teardown, 60 | n=NUM_REPETITIONS, 61 | ) 62 | 63 | json_writer.append( 64 | page=Pages.DA_GET_ATTRIBUTES, 65 | result=result, 66 | metadata=dict(num_docs=num_docs, num_feat=num_feat), 67 | ) 68 | -------------------------------------------------------------------------------- /src/document_array_save.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | # IMPORTANT: This benchmark is currently covered by 10 | # - document_array_save_binary_load_binary.py 11 | # - document_array_save_json_load_json.py 12 | # Only relevant if `.save` expands to other methods in future releases 13 | 14 | 15 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 16 | def test_da_save(num_docs, json_writer, ephemeral_tmpdir): 17 | def _setup(): 18 | da = DocumentArray([Document(text=f'doc{i}') for i in range(num_docs)]) 19 | return (), dict(da=da) 20 | 21 | def
_da_save(da): 22 | da.save(f'{str(ephemeral_tmpdir)}/docarray') 23 | 24 | def _teardown(): 25 | import os 26 | 27 | os.remove(f'{str(ephemeral_tmpdir)}/docarray') 28 | 29 | result = benchmark_time( 30 | setup=_setup, func=_da_save, teardown=_teardown, n=NUM_REPETITIONS 31 | ) 32 | 33 | def _teardown(): 34 | import shutil 35 | 36 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/save') 37 | 38 | json_writer.append( 39 | page=Pages.DA_CLEAR, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 46 | def test_dam_save(num_docs, json_writer, ephemeral_tmpdir): 47 | def _setup(): 48 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 49 | dam.extend([Document(text=f'doc{i}') for i in range(num_docs)]) 50 | return (), dict(dam=dam) 51 | 52 | def _dam_clear(dam): 53 | dam.clear() 54 | 55 | def _teardown(): 56 | import shutil 57 | 58 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 59 | 60 | result = benchmark_time( 61 | setup=_setup, func=_dam_clear, teardown=_teardown, n=NUM_REPETITIONS 62 | ) 63 | 64 | json_writer.append( 65 | page=Pages.DA_CLEAR, 66 | result=result, 67 | metadata=dict(num_docs=num_docs), 68 | ) 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Jina 2 | 3 | We currently use time metrics to benchmark Jina features and [pytest](https://docs.pytest.org) to run these tests. 4 | 5 | ## Playbook 6 | 7 | ### Prepare environment 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | pip install pre-commit==2.13.0 12 | pre-commit install 13 | git submodule update --init 14 | ``` 15 | 16 | ### Run Locally 17 | 18 | ```bash 19 | pytest 20 | ``` 21 | 22 | ### Run on Docker 23 | 24 | ```bash 25 | JINA_VER=master 26 | docker build --build-arg JINA_VER=$JINA_VER -t benchmark . 27 | docker run -v $(pwd):/app benchmark:latest 28 | ``` 29 | 30 | ### Generate docs locally and run server 31 | 32 | ```bash 33 | python scripts/site_generator.py 34 | cd docs 35 | hugo server -D 36 | ``` 37 | 38 | ## Machine 39 | 40 | We run all tests for a given version sequentially on a single machine with the following properties: 41 | 42 | | Item | Value | 43 | | :---: | :---: | 44 | | Cloud Vendor | AWS | 45 | | Instance | c5.xlarge | 46 | | Memory | 8 GiB | 47 | | vCPU | 4 | 48 | | Processor | Intel Xeon Platinum 8124M | 49 | | Clock Speed | 3 GHz | 50 | | Storage | EBS (gp2) | 51 | 52 | ## Contributing 53 | 54 | We welcome all kinds of contributions from the open-source community, individuals and partners. We owe our success to your active involvement. 55 | 56 | Here are some quick notes you need to know before starting to contribute: 57 | 58 | - Please keep all of your tests under the `src` folder and ensure they behave as expected with `pytest`. 59 | - Please save the benchmarking artifacts in `JSON` format in the `docs/static/artifacts/${JINA_VERSION}/report.json` file. 60 | - Please add any Python dependency to the `requirements.txt` file. 61 | - Please run `scripts/site_generator.py` to generate the website every time you generate new benchmarking artifacts.
62 | - The `report.json` file should have the following schema: 63 | 64 | ```json 65 | [ 66 | { 67 | "name": "document_array_append/test_docarray_append", 68 | "iterations": 5, 69 | "mean_time": 0.007944801799999368, 70 | "std_time": 0.0012715548259231583, 71 | "metadata": { 72 | "num_docs_append": 10000 73 | } 74 | } 75 | ] 76 | ``` 77 | -------------------------------------------------------------------------------- /src/zed_runtime_callback.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, Executor, requests 4 | from jina.clients.request import request_generator 5 | from jina.parsers import set_pea_parser 6 | from jina.peapods.runtimes.zmq.zed import ZEDRuntime 7 | from jina.types.message import Message 8 | from jina.types.request import Request 9 | 10 | from .utils.benchmark import benchmark_time 11 | 12 | NUM_DOCS = 100 13 | 14 | 15 | class DummyEncoder(Executor): 16 | @requests 17 | def encode(self, docs, **kwargs): 18 | texts = docs.get_attributes('text') 19 | embeddings = [np.random.rand(1, 1024) for _ in texts] 20 | for doc, embedding in zip(docs, embeddings): 21 | doc.embedding = embedding 22 | 23 | 24 | @pytest.fixture() 25 | def process_message(): 26 | req = list( 27 | request_generator( 28 | '/', 29 | DocumentArray([Document(text='input document') for _ in range(NUM_DOCS)]), 30 | ) 31 | )[0] 32 | msg = Message(None, req, 'test', '123') 33 | return msg 34 | 35 | 36 | @pytest.fixture() 37 | def runtime(): 38 | args = set_pea_parser().parse_args(['--uses', 'DummyEncoder']) 39 | return ZEDRuntime(args) 40 | 41 | 42 | @pytest.mark.skip() 43 | def test_zed_runtime_callback(runtime, process_message, json_writer): 44 | def _function(**kwargs): 45 | runtime._callback(process_message) 46 | 47 | result = benchmark_time( 48 | profile_cls=[Document, DocumentArray, Message, Request], 49 | func=_function, 50 | ) 51 | profiles = result.profiles 52 | document_profile = profiles[0] 53 | document_array_profile = profiles[1] 54 | message_profile = profiles[2] 55 | request_profile = profiles[3] 56 | 57 | json_writer.append( 58 | name='zed_runtime_callback/test_zed_runtime_callback', 59 | result=result, 60 | metadata=dict( 61 | profiles=dict( 62 | Document=document_profile, 63 | DocumentArray=document_array_profile, 64 | Message=message_profile, 65 | Request=request_profile, 66 | ), 67 | num_docs=NUM_DOCS, 68 | ), 69 | ) 70 | -------------------------------------------------------------------------------- /src/document_set_attributes.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images.
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_set_attribute_text(text_length, json_writer): 32 | def _set_doc(doc): 33 | doc._set_attributes(text=_generate_random_text(text_length)) 34 | 35 | result = benchmark_time( 36 | func=_set_doc, 37 | kwargs=dict(doc=Document()), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_set_attribute_blob(num_dims, json_writer): 49 | def _set_doc(doc): 50 | doc._set_attributes(blob=_generate_random_blob(num_dims)) 51 | 52 | result = benchmark_time( 53 | func=_set_doc, 54 | kwargs=dict(doc=Document()), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_set_attribute_buffer(buffer_length, json_writer): 66 | def _set_doc(doc): 67 | doc._set_attributes(buffer=_generate_random_buffer(buffer_length)) 68 | 69 | result = benchmark_time( 70 | func=_set_doc, 71 | kwargs=dict(doc=Document()), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | -------------------------------------------------------------------------------- /src/document_get_attributes.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images. 
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_get_attributes_text(text_length, json_writer): 32 | def _doc_get(doc): 33 | _ = doc.get_attributes(*['text']) 34 | 35 | result = benchmark_time( 36 | func=_doc_get, 37 | kwargs=dict(doc=Document(text=_generate_random_text(text_length))), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_get_attribute_blob(num_dims, json_writer): 49 | def _doc_get(doc): 50 | _ = doc.get_attributes(*['blob']) 51 | 52 | result = benchmark_time( 53 | func=_doc_get, 54 | kwargs=dict(doc=Document(blob=_generate_random_blob(num_dims))), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_get_attribute_buffer(buffer_length, json_writer): 66 | def _doc_get(doc): 67 | _ = doc.get_attributes(*['buffer']) 68 | 69 | result = benchmark_time( 70 | func=_doc_get, 71 | kwargs=dict(doc=Document(buffer=_generate_random_buffer(buffer_length))), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | -------------------------------------------------------------------------------- /src/document_array_extend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document, DocumentArray, DocumentArrayMemmap 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | NUM_REPETITIONS = 25 12 | NUM_DOCS = 1000 13 | CHARS = tuple(string.ascii_uppercase + string.digits) 14 | 15 | 16 | def _generate_random_text(): 17 | return ''.join(np.random.choice(CHARS, 256)) 18 | 19 | 20 | def _generate_random_blob(): 21 | return np.random.random(512) 22 | 23 | 24 | def _generate_random_buffer(): 25 | return bytes(bytearray(os.urandom(512 * 4))) 26 | 27 | 28 | def empty_docs(): 29 | return [Document() for _ in range(NUM_DOCS)] 30 | 31 | 32 | def text_docs(): 33 | return [Document(text=_generate_random_text()) for _ in range(NUM_DOCS)] 34 | 35 | 36 | def blob_docs(): 37 | return [Document(blob=_generate_random_blob()) for _ in range(NUM_DOCS)] 38 | 39 | 40 | def buffer_docs(): 41 | return [Document(buffer=_generate_random_buffer()) for _ in range(NUM_DOCS)] 42 | 43 | 44 | @pytest.mark.parametrize('memmap', [False, True]) 45 | @pytest.mark.parametrize( 46 | 'docs, label', 47 | [ 48 | (empty_docs(), 'empty'), 49 | (blob_docs(), 'blob'), 50 | (text_docs(), 'text'), 51 | (buffer_docs(), 'buffer'), 52 | ], 53 | ) 54 | def test_da_extend(docs, label, memmap, json_writer, ephemeral_tmpdir): 55 | def _extend(da): 56 | da.extend(docs) 57 | 58 | def _build_da(**kwargs): 59 | memmap = kwargs.get('memmap', False) 60 | da = ( 61 | DocumentArray() 62 | if not memmap 63 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 64 | ) 65 | return (), dict(da=da) 66 | 67 | def _teardown(): 68 | import os 69 | import shutil 70 | 71 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 72 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 73 | 74 | result = 
benchmark_time( 75 | setup=_build_da, 76 | func=_extend, 77 | teardown=_teardown, 78 | n=NUM_REPETITIONS, 79 | kwargs=dict(memmap=memmap), 80 | ) 81 | 82 | json_writer.append( 83 | page=Pages.DA_EXTEND, 84 | result=result, 85 | metadata=dict(num_docs=len(docs), label=label, memmap=memmap), 86 | ) 87 | -------------------------------------------------------------------------------- /src/document_array_traverse.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import pytest 4 | from jina import Document, DocumentArray, DocumentArrayMemmap 5 | 6 | from .pages import Pages 7 | from .utils.benchmark import benchmark_time 8 | 9 | 10 | def _get_docs(num_docs): 11 | return [Document(text=f'This is the document number: {i}') for i in range(num_docs)] 12 | 13 | 14 | def _build_da(num_docs, num_matches, num_chunks): 15 | da = DocumentArray(_get_docs(num_docs)) 16 | for doc in da: 17 | if num_matches > 0: 18 | doc.matches.extend(_get_docs(num_matches)) 19 | if num_chunks > 0: 20 | doc.chunks.extend(_get_docs(num_chunks)) 21 | 22 | return da 23 | 24 | 25 | @pytest.mark.parametrize( 26 | 'num_docs,num_matches,num_chunks,traversal_paths', 27 | [ 28 | (10, 10, 10, 'r,c,m'), 29 | (100, 100, 100, 'r,c,m'), 30 | (1000, 100, 100, 'r,c,m'), 31 | (1000, 10, 10, 'r'), 32 | (1000, 10, 100, 'c'), 33 | (1000, 100, 10, 'm'), 34 | ], 35 | ) 36 | @pytest.mark.parametrize('memmap', [False, True]) 37 | def test_da_traverse_flat( 38 | name, 39 | num_docs, 40 | num_matches, 41 | num_chunks, 42 | traversal_paths, 43 | memmap, 44 | json_writer, 45 | ephemeral_tmpdir, 46 | ): 47 | if num_docs == 1000 and num_chunks == 1000 and num_matches == 1000: 48 | pytest.skip('problems with memory') 49 | 50 | def _traverse_flat(da): 51 | for d in da.traverse_flat(traversal_paths): 52 | pass 53 | 54 | def _build_da(): 55 | docs = _get_docs(num_docs) 56 | for doc in docs: 57 | if num_matches > 0: 58 | doc.matches.extend(_get_docs(num_matches)) 59 | if num_chunks > 0: 60 | doc.chunks.extend(_get_docs(num_chunks)) 61 | 62 | da = ( 63 | DocumentArray() 64 | if not memmap 65 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 66 | ) 67 | da.extend(docs) 68 | 69 | return (), dict(da=da) 70 | 71 | def _teardown(): 72 | try: 73 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 74 | except FileNotFoundError: 75 | pass 76 | 77 | result = benchmark_time(setup=_build_da, func=_traverse_flat, teardown=_teardown) 78 | if memmap: 79 | name = name.replace('_da_', '_dam_') 80 | json_writer.append( 81 | name=name, 82 | page=Pages.DA_TRAVERSE, 83 | result=result, 84 | metadata=dict( 85 | num_docs=num_docs, 86 | num_matches=num_matches, 87 | num_chunks=num_chunks, 88 | traversal_paths=traversal_paths, 89 | memmap=memmap, 90 | ), 91 | ) 92 | -------------------------------------------------------------------------------- /src/document_property_getter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | 
# 1 and 3 can cover from audio signals to images. 3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_get_content_text(text_length, json_writer): 32 | def _doc_get(doc): 33 | _ = doc.text 34 | 35 | result = benchmark_time( 36 | func=_doc_get, 37 | kwargs=dict(doc=Document(text=_generate_random_text(text_length))), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_get_content_blob(num_dims, json_writer): 49 | def _doc_get(doc): 50 | _ = doc.blob 51 | 52 | result = benchmark_time( 53 | func=_doc_get, 54 | kwargs=dict(doc=Document(blob=_generate_random_blob(num_dims))), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_get_content_buffer(buffer_length, json_writer): 66 | def _doc_get(doc): 67 | _ = doc.buffer 68 | 69 | result = benchmark_time( 70 | func=_doc_get, 71 | kwargs=dict(doc=Document(buffer=_generate_random_buffer(buffer_length))), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | 80 | 81 | @pytest.mark.parametrize('num_dims', [1, 2]) 82 | def test_get_embedding(num_dims, json_writer): 83 | def _doc_get(doc): 84 | _ = doc.embedding 85 | 86 | result = benchmark_time( 87 | func=_doc_get, 88 | kwargs=dict(doc=Document(embedding=_generate_random_blob(num_dims))), 89 | ) 90 | 91 | json_writer.append( 92 | page=Pages.DOCUMENT_CONTENT, 93 | result=result, 94 | metadata=dict(buffer_length=num_dims), 95 | ) 96 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | 5 | from collections import defaultdict 6 | 7 | import pytest 8 | from jina import __version__ 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption('--output-file', action='store', default='report.json') 13 | 14 | 15 | class ResultsCollector: 16 | def __init__(self, output_dir, default_filename): 17 | self.results = defaultdict(list) 18 | self.output_dir = output_dir 19 | self.default_filename = default_filename 20 | 21 | def get_test_name(): 22 | test = os.environ['PYTEST_CURRENT_TEST'] 23 | removed_head = test.split('::')[-1] 24 | return removed_head.split('[')[0].split(' (')[0] 25 | 26 | def append(self, page, result, metadata=None, name=None, target_file=None): 27 | if metadata is None: 28 | metadata = {} 29 | 30 | if name is None: 31 | name = ResultsCollector.get_test_name() 32 | if target_file is None: 33 | target_file = self.default_filename 34 | 35 | self.results[target_file].append( 36 | dict( 37 | name=name, 38 | page=page, 39 | iterations=result.iterations, 40 | mean_time=result.mean, 41 | std_time=result.std, 42 | metadata=metadata, 43 | ) 44 | ) 45 | 46 | def append_raw(self, dict_, target_file=None): 47 | if target_file is None: 48 | target_file = self.default_filename 49 | 50 | self.results[target_file].append(dict_) 51 | return self.results[target_file] 52 | 53 | def dump(self): 54 | 
Path(self.output_dir).mkdir(parents=True, exist_ok=True) 55 | for filename, content in self.results.items(): 56 | file_path = f'{self.output_dir}/{filename}' 57 | with open(file_path, 'w+') as file: 58 | json.dump(content, file) 59 | 60 | 61 | @pytest.fixture(scope='session') 62 | def json_writer(pytestconfig): 63 | version = os.environ.get('JINA_VERSION', __version__) 64 | 65 | if version == 'master': 66 | version = __version__ 67 | elif version.startswith('v'): 68 | version = version[1:] 69 | output_dir = f'docs/static/artifacts/{version}' 70 | 71 | collector = ResultsCollector(output_dir, pytestconfig.getoption('output_file')) 72 | yield collector 73 | 74 | collector.dump() 75 | 76 | 77 | @pytest.fixture() 78 | def ephemeral_tmpdir(tmpdir): 79 | yield tmpdir 80 | 81 | import shutil 82 | 83 | shutil.rmtree(str(tmpdir)) 84 | 85 | 86 | @pytest.fixture() 87 | def name(): 88 | test = os.environ['PYTEST_CURRENT_TEST'] 89 | removed_head = test.split('::')[-1] 90 | removed_tail = removed_head.split('[')[0].split(' (')[0] 91 | 92 | return removed_tail 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode configuration directory 132 | .vscode 133 | 134 | # output directory 135 | output/ 136 | outputs/ 137 | 138 | # indexer directory 139 | **/MyIndexer/ 140 | **/MyMemMap/ 141 | **/tmp/ 142 | 143 | # Hugo default output directory 144 | **/public 145 | **/resources 146 | 147 | # pycharm 148 | .idea/ 149 | 150 | # custom 151 | docs/content/ 152 | -------------------------------------------------------------------------------- /src/utils/profiler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from statistics import mean, stdev 3 | from typing import Dict, List 4 | 5 | from .timecontext import TimeContext 6 | 7 | 8 | def profile(profile, function, *args, **kwargs): 9 | def wrapper(*args, **kwargs): 10 | with TimeContext() as timer: 11 | func = function(*args, **kwargs) 12 | 13 | if function.__name__ in profile.keys(): 14 | profile[function.__name__]['time'] += timer.duration 15 | profile[function.__name__]['calls'] += 1 16 | else: 17 | profile[function.__name__] = {} 18 | profile[function.__name__]['time'] = timer.duration 19 | profile[function.__name__]['calls'] = 1 20 | return func 21 | 22 | return wrapper 23 | 24 | 25 | def merge_profiles(profiles: List[Dict]) -> Dict: 26 | avg_profile = {} 27 | for profile in profiles: 28 | for function in profile.keys(): 29 | if function in avg_profile: 30 | avg_profile[function]['time'].append(profile[function]['time']) 31 | avg_profile[function]['calls'].append(profile[function]['calls']) 32 | else: 33 | avg_profile[function] = {} 34 | avg_profile[function]['time'] = [] 35 | avg_profile[function]['calls'] = [] 36 | avg_profile[function]['time'].append(profile[function]['time']) 37 | avg_profile[function]['calls'].append(profile[function]['calls']) 38 | 39 | for function in avg_profile.keys(): 40 | avg_time = mean(avg_profile[function]['time']) 41 | stdev_time = ( 42 | stdev(avg_profile[function]['time']) 43 | if len(avg_profile[function]['time']) > 0 44 | else None 45 | ) 46 | avg_calls = mean(avg_profile[function]['calls']) 47 | stdev_calls = ( 48 | stdev(avg_profile[function]['calls']) 49 | if len(avg_profile[function]['calls']) > 0 50 | else None 51 | ) 52 | del avg_profile[function]['time'] 53 | del avg_profile[function]['calls'] 54 | avg_profile[function]['mean_time'] = avg_time 55 | avg_profile[function]['std_time'] = stdev_time 56 | avg_profile[function]['mean_calls'] = avg_calls 57 | avg_profile[function]['std_calls'] = stdev_calls 58 | 59 | return avg_profile 60 | 61 | 62 | class Profiler: 63 | def __init__(self, cls): 64 | self._cls = cls 65 | self.profile = {} 66 | self._old_funcs = {} 67 | 68 | def __enter__(self): 69 | for _, f in inspect.getmembers(self._cls, predicate=inspect.isfunction): 70 | self._old_funcs[f.__name__] = f 71 | setattr(self._cls, f.__name__, profile(self.profile, f)) 72 | return self 73 | 74 | def __exit__(self, exc_type, exc_val, exc_tb): 75 | for func_name, 
func_val in self._old_funcs.items(): 76 | setattr(self._cls, func_name, func_val) 77 | -------------------------------------------------------------------------------- /docs/config.yml: -------------------------------------------------------------------------------- 1 | baseURL: 'https://benchmark.jina.ai/' 2 | defaultContentLanguage: en 3 | title: Benchmark Jina 4 | theme: book 5 | 6 | # Book configuration 7 | disablePathToLower: true 8 | enableGitInfo: true 9 | 10 | # Needed for mermaid/katex shortcodes 11 | markup: 12 | goldmark: 13 | renderer: 14 | unsafe: true 15 | tableOfContents: 16 | startLevel: 1 17 | 18 | params: 19 | # (Optional, default light) Sets color theme: light, dark or auto. 20 | # Theme 'auto' switches between dark and light modes based on browser/os preferences 21 | BookTheme: light 22 | 23 | # (Optional, default true) Controls table of contents visibility on right side of pages. 24 | # Start and end levels can be controlled with markup.tableOfContents setting. 25 | # You can also specify this parameter per page in front matter. 26 | BookToC: true 27 | 28 | # (Optional, default none) Set the path to a logo for the book. If the logo is 29 | # /static/logo.png then the path would be logo.png 30 | # BookLogo: /img/logo-only.gif 31 | 32 | # (Optional, default none) Set leaf bundle to render as side menu 33 | # When not specified file structure and weights will be used 34 | BookMenuBundle: /menu 35 | 36 | # (Optional, default docs) Specify section of content to render as menu 37 | # You can also set value to "*" to render all sections to menu 38 | BookSection: docs 39 | 40 | # Set source repository location. 41 | # Used for 'Last Modified' and 'Edit this page' links. 42 | BookRepo: https://github.com/jina-ai/benchmark 43 | 44 | # Specifies commit portion of the link to the page's last modified commit hash for 'doc' page 45 | # type. 46 | # Required if 'BookRepo' param is set. 47 | # Value used to construct a URL consisting of BookRepo/BookCommitPath/ 48 | # Github uses 'commit', Bitbucket uses 'commits' 49 | BookCommitPath: commit 50 | 51 | # Enable 'Edit this page' links for 'doc' page type. 52 | # Disabled by default. Uncomment to enable. Requires 'BookRepo' param. 53 | # Path must point to the site directory. 54 | BookEditPath: edit/main/docs 55 | 56 | # (Optional, default January 2, 2006) Configure the date format used on the pages 57 | # - In git information 58 | # - In blog posts 59 | BookDateFormat: '2 January, 2006' 60 | 61 | # (Optional, default true) Enables search function with flexsearch, 62 | # Index is built on fly, therefore it might slowdown your website. 63 | # Configuration for indexing can be adjusted in i18n folder per language. 64 | BookSearch: false 65 | 66 | # (Optional, default true) Enables comments template on pages 67 | # By default partials/docs/comments.html includes Disqus template 68 | # See https://gohugo.io/content-management/comments/#configure-disqus 69 | # Can be overwritten by same param in page frontmatter 70 | BookComments: false 71 | 72 | # /!\ This is an experimental feature, might be removed or changed at any time 73 | # (Optional, experimental, default false) Enables service worker that caches visited pages and resources for offline use. 
74 | BookServiceWorker: true 75 | -------------------------------------------------------------------------------- /src/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from contextlib import ExitStack 3 | from statistics import mean, stdev 4 | from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple 5 | 6 | from .profiler import Profiler, merge_profiles 7 | from .timecontext import TimeContext 8 | 9 | BenchmarkResult = namedtuple( 10 | 'BenchmarkResult', ['mean', 'std', 'iterations', 'profiles'] 11 | ) 12 | 13 | 14 | def benchmark_time( 15 | func: Callable[[Any], Any], 16 | n: int = 5, 17 | setup: Optional[Callable[[Any], Optional[Tuple[Iterable, Dict[str, Any]]]]] = None, 18 | teardown: Optional[Callable[[None], None]] = None, 19 | profile_cls: Optional[List[type]] = [], 20 | args: Optional[Tuple] = None, 21 | kwargs: Optional[Dict] = None, 22 | ): 23 | """Get average time and std by benchmarking a function multiple times 24 | 25 | :param func: The function to benchmark 26 | :param setup: A setup function that can perform setup before running 27 | the ``func``. It should take as inputs the ``args`` and ``kwargs`` 28 | that you provided, and return a tuple of an iterable, which will 29 | be used to provide ``args`` to ``func``, and a dictionary, which 30 | will be used to provide ``kwargs`` to ``func``. 31 | :param teardown: A teardown function that can perform teardown/cleanup after running 32 | the ``func``. 33 | :param profile_cls: A list of the classes that want to be profiled 34 | :param n: Number of repetitions 35 | :param args: Positional arguments to pass to ``func`` (or ``setup``) 36 | :param kwargs: Keyword arguments to pass to ``func`` (or ``setup``) 37 | """ 38 | 39 | results = [] 40 | args = args if args is not None else () 41 | kwargs = kwargs if kwargs is not None else {} 42 | 43 | profiles_by_cls = {_cls: [] for _cls in profile_cls} 44 | 45 | with TimeContext() as test_timer: 46 | while test_timer.time_since_start() < 1e9 or len(results) < n: 47 | if setup is not None: 48 | new_args, new_kwargs = setup(*args, **kwargs) 49 | else: 50 | new_args, new_kwargs = args, kwargs 51 | 52 | ctx_manager = ExitStack() 53 | 54 | profiles = [ctx_manager.enter_context(Profiler(cls)) for cls in profile_cls] 55 | with ctx_manager: 56 | with TimeContext() as t: 57 | func(*new_args, **new_kwargs) 58 | 59 | for p in profiles: 60 | profiles_by_cls[p._cls].append(p.profile) 61 | 62 | if teardown is not None: 63 | teardown() 64 | 65 | results.append(t.duration) 66 | 67 | mean_profiles = [] 68 | for profile_cls, profile_list in profiles_by_cls.items(): 69 | mean_profiles.append(merge_profiles(profile_list)) 70 | 71 | m = int(mean(results)) 72 | s = int(stdev(results)) if len(results) > 1 else None 73 | print( 74 | f'----> mean_time={round(m,3)}, std_time={round(s,3)}, iterations={len(results)}' 75 | ) 76 | 77 | return BenchmarkResult(m, s, len(results), mean_profiles) 78 | -------------------------------------------------------------------------------- /src/document_array_match.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Generator, Union 2 | 3 | import numpy as np 4 | import pytest 5 | from jina import Document, DocumentArray, DocumentArrayMemmap 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | 11 | def _generate_docs_with_embs( 12 | n_docs: int, emb_size: int 13 | ) -> 
Generator[Document, None, None]: 14 | embedings = np.random.random((n_docs, emb_size)) 15 | for emb in embedings: 16 | yield Document(embedding=emb) 17 | 18 | 19 | def match_arrays( 20 | array1: Union[DocumentArray, DocumentArrayMemmap], 21 | array2: Union[DocumentArray, DocumentArrayMemmap], 22 | topk: int, 23 | metric: str, 24 | use_scipy: bool, 25 | ): 26 | array1.match(array2, limit=topk, metric=metric, use_scipy=use_scipy) 27 | 28 | 29 | def _prepare_inputs_standard( 30 | size1: int = 10, 31 | size2: int = 10_000, 32 | emb_size: int = 128, 33 | topk: int = 10, 34 | metric: str = 'cosine', 35 | use_scipy: bool = False, 36 | dam_x: bool = False, 37 | dam_y: bool = False, 38 | dam_path: str = './', 39 | ) -> Dict: 40 | if not dam_x: 41 | x = DocumentArray(_generate_docs_with_embs(size1, emb_size)) 42 | else: 43 | x = DocumentArrayMemmap(f'{dam_path}/x') 44 | x.extend(_generate_docs_with_embs(size1, emb_size)) 45 | if not dam_y: 46 | y = DocumentArray(_generate_docs_with_embs(size2, emb_size)) 47 | else: 48 | y = DocumentArrayMemmap(f'{dam_path}/y') 49 | y.extend(_generate_docs_with_embs(size2, emb_size)) 50 | return dict( 51 | array1=x, 52 | array2=y, 53 | topk=topk, 54 | metric=metric, 55 | use_scipy=use_scipy, 56 | ) 57 | 58 | 59 | @pytest.mark.parametrize('size_X', [10]) 60 | @pytest.mark.parametrize('size_Y', [100000]) 61 | @pytest.mark.parametrize('dam_x', [False]) 62 | @pytest.mark.parametrize('dam_y', [False]) 63 | @pytest.mark.parametrize('emb_size', [256]) 64 | @pytest.mark.parametrize('use_scipy', [False]) 65 | @pytest.mark.parametrize('metric', ['euclidean']) 66 | @pytest.mark.parametrize('top_k', [3]) 67 | def test_match( 68 | size_X: int, 69 | size_Y: int, 70 | dam_x: bool, 71 | dam_y: bool, 72 | emb_size: int, 73 | use_scipy: bool, 74 | metric: str, 75 | top_k: int, 76 | ephemeral_tmpdir, 77 | json_writer, 78 | ): 79 | result = benchmark_time( 80 | match_arrays, 81 | kwargs=_prepare_inputs_standard( 82 | size1=size_X, 83 | size2=size_Y, 84 | dam_x=dam_x, 85 | dam_y=dam_y, 86 | emb_size=emb_size, 87 | use_scipy=use_scipy, 88 | metric=metric, 89 | dam_path=str(ephemeral_tmpdir), 90 | topk=top_k, 91 | ), 92 | ) 93 | 94 | json_writer.append( 95 | page=Pages.DA_MATCH, 96 | result=result, 97 | metadata=dict( 98 | size_X=size_X, 99 | size_Y=size_Y, 100 | dam_x=dam_x, 101 | dam_y=dam_y, 102 | emb_size=emb_size, 103 | use_scipy=use_scipy, 104 | metric=metric, 105 | top_k=top_k, 106 | ), 107 | ) 108 | -------------------------------------------------------------------------------- /src/flow.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Flow 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_PODS = 10 8 | 9 | 10 | def _long_flow(): 11 | f = Flow() 12 | for _ in range(NUM_PODS): 13 | f = f.add() 14 | 15 | return f 16 | 17 | 18 | def _wide_flow(): 19 | f = Flow().add(name='pod0') 20 | for i in range(NUM_PODS): 21 | f = f.add(needs=['pod0'], name=f'wide_{i}') 22 | f = f.add(name='join', needs=[f'wide_{i}' for i in range(NUM_PODS)]) 23 | return f 24 | 25 | 26 | @pytest.mark.parametrize( 27 | 'flow, ftype', [(_long_flow(), 'long'), (_wide_flow(), 'wide')] 28 | ) 29 | def test_local_flow_start(flow, ftype, json_writer): 30 | def _start(): 31 | flow.start() 32 | 33 | def _close(): 34 | flow.close() 35 | 36 | result = benchmark_time(func=_start, teardown=_close) 37 | 38 | json_writer.append( 39 | page=Pages.FLOW, 40 | result=result, 41 | metadata=dict(flow=ftype, 
num_pods=NUM_PODS), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize( 46 | 'flow, ftype', [(_long_flow(), 'long'), (_wide_flow(), 'wide')] 47 | ) 48 | def test_local_flow_close(flow, ftype, json_writer): 49 | def _start(): 50 | flow.start() 51 | return (), {} 52 | 53 | def _close(): 54 | flow.close() 55 | 56 | result = benchmark_time(setup=_start, func=_close) 57 | 58 | json_writer.append( 59 | page=Pages.FLOW, 60 | result=result, 61 | metadata=dict(flow=ftype, num_pods=NUM_PODS), 62 | ) 63 | 64 | 65 | yaml_long = '''jtype: Flow 66 | version: '1' 67 | pods: 68 | - uses: 69 | name: pod1 70 | - uses: 71 | name: pod2 72 | - uses: 73 | name: pod3 74 | - uses: 75 | name: pod4 76 | - uses: 77 | name: pod5 78 | - uses: 79 | name: pod6 80 | - uses: 81 | name: pod7 82 | - uses: 83 | name: pod8 84 | - uses: 85 | name: pod9 86 | - uses: 87 | name: pod10 88 | ''' 89 | 90 | yaml_wide = '''jtype: Flow 91 | version: '1' 92 | pods: 93 | - uses: 94 | name: pod0 95 | - uses: 96 | name: wide_0 97 | needs: [pod0] 98 | - uses: 99 | name: wide_1 100 | needs: [pod0] 101 | - uses: 102 | name: wide_2 103 | needs: [pod0] 104 | - uses: 105 | name: wide_3 106 | needs: [pod0] 107 | - uses: 108 | name: wide_4 109 | needs: [pod0] 110 | - uses: 111 | name: wide_5 112 | needs: [pod0] 113 | - uses: 114 | name: wide_6 115 | needs: [pod0] 116 | - uses: 117 | name: wide_7 118 | needs: [pod0] 119 | - uses: 120 | name: wide_8 121 | needs: [pod0] 122 | - uses: 123 | name: wide_9 124 | needs: [pod0] 125 | - uses: 126 | name: join 127 | needs: [wide_0, wide_1, wide_2, wide_3, wide_4, wide_5, wide_6, wide_7, wide_8, wide_9] 128 | ''' 129 | 130 | 131 | @pytest.mark.parametrize('config, ftype', [(yaml_long, 'long'), (yaml_wide, 'wide')]) 132 | def test_flow_load_config(config, ftype, json_writer): 133 | def _build(): 134 | Flow.load_config(config) 135 | 136 | result = benchmark_time(func=_build) 137 | 138 | json_writer.append( 139 | page=Pages.FLOW, 140 | result=result, 141 | metadata=dict(flow=ftype, num_pods=NUM_PODS), 142 | ) 143 | -------------------------------------------------------------------------------- /src/document_array_construct.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from faker import Faker 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | fake = Faker() 9 | Faker.seed(42) 10 | NUM_DOCS = 10000 11 | 12 | 13 | @pytest.fixture 14 | def docs(): 15 | return [Document(text=fake.text()) for _ in range(NUM_DOCS)] 16 | 17 | 18 | @pytest.fixture 19 | def doc_with_chunks(): 20 | d = Document() 21 | for idx in range(NUM_DOCS): 22 | d.chunks.append(Document(text=fake.text())) 23 | return d 24 | 25 | 26 | @pytest.fixture() 27 | def tuple_docs(docs): 28 | return tuple(docs) 29 | 30 | 31 | @pytest.fixture 32 | def doc_array(docs): 33 | return DocumentArray(docs) 34 | 35 | 36 | @pytest.fixture 37 | def doc_array_memmap(docs, ephemeral_tmpdir): 38 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 39 | dam.extend(docs) 40 | return dam 41 | 42 | 43 | def test_construct_document_array_from_repeated_container(doc_with_chunks, json_writer): 44 | def _construct(): 45 | DocumentArray(doc_with_chunks.chunks) 46 | 47 | result = benchmark_time(func=_construct) 48 | 49 | json_writer.append( 50 | page=Pages.DA_CONSTRUCT, 51 | result=result, 52 | metadata=dict(num_chunks=NUM_DOCS), 53 | ) 54 | 55 | 56 | def 
test_construct_document_array_from_another_documentarray(doc_array, json_writer): 57 | def _construct(): 58 | DocumentArray(doc_array) 59 | 60 | result = benchmark_time(func=_construct) 61 | 62 | json_writer.append( 63 | page=Pages.DA_CONSTRUCT, 64 | result=result, 65 | metadata=dict(num_docs=len(doc_array)), 66 | ) 67 | 68 | 69 | def test_construct_document_array_from_list_of_documents(docs, json_writer): 70 | def _construct(): 71 | DocumentArray(docs) 72 | 73 | result = benchmark_time(func=_construct) 74 | 75 | json_writer.append( 76 | page=Pages.DA_CONSTRUCT, 77 | result=result, 78 | metadata=dict(num_docs=len(docs)), 79 | ) 80 | 81 | 82 | def test_construct_document_array_from_tuple_of_documents(tuple_docs, json_writer): 83 | def _construct(): 84 | DocumentArray(tuple_docs) 85 | 86 | result = benchmark_time(func=_construct) 87 | 88 | json_writer.append( 89 | page=Pages.DA_CONSTRUCT, 90 | result=result, 91 | metadata=dict(num_docs=len(tuple_docs)), 92 | ) 93 | 94 | 95 | def test_construct_document_array_from_generator(json_writer): 96 | def _yield_documents(): 97 | """Used to benchmark construct DocumentArray from a document generator.""" 98 | for idx in range(NUM_DOCS): 99 | yield Document(text=fake.text()) 100 | 101 | def _construct(): 102 | DocumentArray(_yield_documents()) 103 | 104 | result = benchmark_time(func=_construct) 105 | 106 | json_writer.append( 107 | page=Pages.DA_CONSTRUCT, 108 | result=result, 109 | metadata=dict(num_docs=NUM_DOCS), 110 | ) 111 | 112 | 113 | def test_construct_document_array_from_another_documentarray_memmap( 114 | doc_array_memmap, json_writer 115 | ): 116 | def _construct(): 117 | DocumentArray(doc_array_memmap) 118 | 119 | result = benchmark_time(func=_construct) 120 | 121 | json_writer.append( 122 | page=Pages.DA_CONSTRUCT, 123 | result=result, 124 | metadata=dict(num_docs=len(doc_array_memmap)), 125 | ) 126 | -------------------------------------------------------------------------------- /src/document_array_get_attributes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from faker import Faker 7 | from jina import Document, DocumentArray, DocumentArrayMemmap 8 | 9 | from .pages import Pages 10 | from .utils.benchmark import benchmark_time 11 | 12 | fake = Faker() 13 | Faker.seed(42) 14 | NUM_DOCS = 1000 15 | CHARS = tuple(string.ascii_uppercase + string.digits) 16 | 17 | 18 | def _generate_random_text(): 19 | return ''.join(np.random.choice(CHARS, 256)) 20 | 21 | 22 | def _generate_random_blob(): 23 | return np.random.random(512) 24 | 25 | 26 | def _generate_random_buffer(): 27 | return bytes(bytearray(os.urandom(512 * 4))) 28 | 29 | 30 | def empty_docs(): 31 | return [Document() for _ in range(NUM_DOCS)] 32 | 33 | 34 | def text_docs(num_docs): 35 | return [Document(text=_generate_random_text()) for _ in range(num_docs)] 36 | 37 | 38 | def blob_docs(num_docs): 39 | return [Document(blob=_generate_random_blob()) for _ in range(num_docs)] 40 | 41 | 42 | def buffer_docs(num_docs): 43 | return [Document(buffer=_generate_random_buffer()) for _ in range(num_docs)] 44 | 45 | 46 | def embedding_docs(num_docs): 47 | return [Document(embedding=_generate_random_blob()) for _ in range(num_docs)] 48 | 49 | 50 | @pytest.mark.parametrize('memmap', [False, True]) 51 | @pytest.mark.parametrize( 52 | 'field, docs_get_fn', 53 | [ 54 | ('blob', blob_docs), 55 | ('text', text_docs), 56 | ('buffer', buffer_docs), 57 | ('embedding', embedding_docs), 58 | 
], 59 | ) 60 | @pytest.mark.parametrize( 61 | 'num_docs', 62 | [100, 10000], 63 | ) 64 | def test_da_get_attributes( 65 | name, field, docs_get_fn, memmap, num_docs, json_writer, ephemeral_tmpdir 66 | ): 67 | def _get_attributes(da): 68 | da.get_attributes(*[field]) 69 | 70 | def _build_da(**kwargs): 71 | memmap = kwargs.get('memmap', False) 72 | docs = kwargs.get('docs', False) 73 | da = ( 74 | DocumentArray() 75 | if not memmap 76 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 77 | ) 78 | da.extend(docs) 79 | return (), dict(da=da) 80 | 81 | def _teardown(): 82 | import os 83 | import shutil 84 | 85 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 86 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 87 | 88 | result = benchmark_time( 89 | setup=_build_da, 90 | func=_get_attributes, 91 | teardown=_teardown, 92 | kwargs=dict(memmap=memmap, docs=docs_get_fn(num_docs)), 93 | ) 94 | if memmap: 95 | name = name.replace('_da_', '_dam_') 96 | json_writer.append( 97 | name=name, 98 | page=Pages.DA_GET_ATTRIBUTES, 99 | result=result, 100 | metadata=dict(num_docs=num_docs, field=field, memmap=memmap), 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize('memmap', [False, True]) 105 | @pytest.mark.parametrize( 106 | 'num_docs', 107 | [100, 10000], 108 | ) 109 | def test_da_embeddings_property(name, memmap, num_docs, json_writer, ephemeral_tmpdir): 110 | def _get_embeddings(da): 111 | da.embeddings 112 | 113 | def _build_da(**kwargs): 114 | memmap = kwargs.get('memmap', False) 115 | docs = embedding_docs(num_docs) 116 | da = ( 117 | DocumentArray() 118 | if not memmap 119 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 120 | ) 121 | da.extend(docs) 122 | return (), dict(da=da) 123 | 124 | def _teardown(): 125 | import os 126 | import shutil 127 | 128 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 129 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 130 | 131 | result = benchmark_time( 132 | setup=_build_da, 133 | func=_get_embeddings, 134 | teardown=_teardown, 135 | kwargs=dict(memmap=memmap), 136 | ) 137 | if memmap: 138 | name = name.replace('_da_', '_dam_') 139 | json_writer.append( 140 | name=name, 141 | page=Pages.DA_GET_ATTRIBUTES, 142 | result=result, 143 | metadata=dict(num_docs=num_docs, memmap=memmap), 144 | ) 145 | -------------------------------------------------------------------------------- /src/document_graph_construction.py: -------------------------------------------------------------------------------- 1 | # import random 2 | # 3 | # import pytest 4 | # from jina import Document 5 | # from jina.types.document.graph import GraphDocument 6 | # 7 | # from .pages import Pages 8 | # from .utils.benchmark import benchmark_time 9 | # 10 | # 11 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 12 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 13 | # def test_graph_add_edges_assuming_no_nodes_present(n_nodes, n_edges, json_writer): 14 | # def _setup(): 15 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 16 | # sources = [random.choice(docs) for i in range(n_edges)] 17 | # targets = [random.choice(docs) for i in range(n_edges)] 18 | # edge_features = [ 19 | # {'text': f'I connect Doc{i} and Doc{j}'} for i, j in zip(sources, targets) 20 | # ] 21 | # return (), dict(sources=sources, targets=targets, edge_features=edge_features) 22 | # 23 | # def _build_graph_doc(sources, targets, edge_features): 24 | # graph = GraphDocument() 25 | # graph.add_edges(sources, targets, edge_features=edge_features) 26 | # 27 | # result = 
benchmark_time( 28 | # setup=_setup, 29 | # func=_build_graph_doc, 30 | # ) 31 | # 32 | # json_writer.append( 33 | # page=Pages.DOCUMENT_GRAPH, 34 | # result=result, 35 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 36 | # ) 37 | # 38 | # 39 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 40 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 41 | # def test_graph_add_edges_assuming_all_nodes_present(n_nodes, n_edges, json_writer): 42 | # def _setup(): 43 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 44 | # sources = [random.choice(docs) for i in range(n_edges)] 45 | # targets = [random.choice(docs) for i in range(n_edges)] 46 | # edge_features = [ 47 | # {'text': f'I connect Doc{i} and Doc{j}'} for i, j in zip(sources, targets) 48 | # ] 49 | # graph = GraphDocument() 50 | # graph.add_nodes(docs) 51 | # return (), dict( 52 | # graph=graph, sources=sources, targets=targets, edge_features=edge_features 53 | # ) 54 | # 55 | # def _build_graph_doc(graph, sources, targets, edge_features): 56 | # graph.add_edges(sources, targets, edge_features=edge_features) 57 | # 58 | # result = benchmark_time( 59 | # setup=_setup, 60 | # func=_build_graph_doc, 61 | # ) 62 | # 63 | # json_writer.append( 64 | # page=Pages.DOCUMENT_GRAPH, 65 | # result=result, 66 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 67 | # ) 68 | # 69 | # 70 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 71 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 72 | # def test_graph_add_single_edge_assuming_all_nodes_present( 73 | # n_nodes, n_edges, json_writer 74 | # ): 75 | # def _setup(): 76 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 77 | # sources = [random.choice(docs) for i in range(n_edges)] 78 | # targets = [random.choice(docs) for i in range(n_edges)] 79 | # graph = GraphDocument() 80 | # graph.add_nodes(docs) 81 | # return (), dict(graph=graph, sources=sources, targets=targets) 82 | # 83 | # def _build_graph_doc(graph, sources, targets): 84 | # for source, target in zip(sources, targets): 85 | # graph.add_single_edge(source, target) 86 | # return graph 87 | # 88 | # result = benchmark_time(setup=_setup, func=_build_graph_doc) 89 | # 90 | # json_writer.append( 91 | # page=Pages.DOCUMENT_GRAPH, 92 | # result=result, 93 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 94 | # ) 95 | # 96 | # 97 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 98 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 99 | # def test_graph_add_single_edge_assuming_no_nodes_present(n_nodes, n_edges, json_writer): 100 | # def _setup(): 101 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 102 | # sources = [random.choice(docs) for i in range(n_edges)] 103 | # targets = [random.choice(docs) for i in range(n_edges)] 104 | # return (), dict(sources=sources, targets=targets) 105 | # 106 | # def _build_graph_doc(sources, targets): 107 | # graph = GraphDocument() 108 | # for source, target in zip(sources, targets): 109 | # graph.add_single_edge(source, target) 110 | # return graph 111 | # 112 | # result = benchmark_time(setup=_setup, func=_build_graph_doc) 113 | # 114 | # json_writer.append( 115 | # page=Pages.DOCUMENT_GRAPH, 116 | # result=result, 117 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 118 | # ) 119 | # 120 | # 121 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 122 | # def test_graph_add_single_node(n_nodes, json_writer): 123 | # def _setup(): 124 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 125 | # graph = 
GraphDocument() 126 | # return (), dict(graph=graph, docs=docs) 127 | # 128 | # def _build_graph_doc(graph, docs): 129 | # for doc in docs: 130 | # graph.add_single_node(doc) 131 | # 132 | # result = benchmark_time( 133 | # setup=_setup, 134 | # func=_build_graph_doc, 135 | # ) 136 | # 137 | # json_writer.append( 138 | # page=Pages.DOCUMENT_GRAPH, 139 | # result=result, 140 | # metadata=dict(n_nodes=n_nodes), 141 | # ) 142 | # 143 | # 144 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 145 | # def test_graph_add_nodes(n_nodes, json_writer): 146 | # def _setup(): 147 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 148 | # graph = GraphDocument() 149 | # graph.add_nodes(docs) 150 | # return (), dict(graph=graph, docs=docs) 151 | # 152 | # def _build_graph_doc(graph, docs): 153 | # graph.add_nodes(docs) 154 | # 155 | # result = benchmark_time( 156 | # setup=_setup, 157 | # func=_build_graph_doc, 158 | # ) 159 | # 160 | # json_writer.append( 161 | # page=Pages.DOCUMENT_GRAPH, 162 | # result=result, 163 | # metadata=dict(n_nodes=n_nodes), 164 | # ) 165 | -------------------------------------------------------------------------------- /src/searchers_compare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from collections import defaultdict 4 | from statistics import mean, stdev 5 | 6 | import numpy as np 7 | import pytest 8 | from jina import Document, DocumentArray, Executor, requests, DocumentArrayMemmap 9 | from pympler import asizeof, tracker 10 | 11 | from .pages import Pages 12 | from .utils.timecontext import TimeContext 13 | 14 | NUM_REPETITIONS = 5 15 | NUM_REQUESTS = 100 16 | TARGET_FILE = 'searchers_compare.json' 17 | 18 | 19 | def _get_docs(number_of_documents, embedding_size): 20 | return [ 21 | Document(embedding=np.random.rand(embedding_size), id=str(i)) 22 | for i in range(number_of_documents) 23 | ] 24 | 25 | 26 | def _get_dam(number_of_documents, embedding_size, dir_path, **kwargs): 27 | tmp_path = f'{dir_path}/memmap_{number_of_documents}_{embedding_size}_tmp' 28 | path = f'{dir_path}/memmap_{number_of_documents}_{embedding_size}' 29 | if os.path.exists(path): 30 | return path 31 | da = DocumentArrayMemmap(tmp_path) 32 | docs = _get_docs(number_of_documents, embedding_size) 33 | da.extend(docs) 34 | da.save() 35 | shutil.copytree(tmp_path, path) 36 | da.clear() 37 | da._last_mmap = None 38 | return path 39 | 40 | 41 | def _get_da(number_of_documents, embedding_size, dir_path, **kwargs): 42 | path = f'{dir_path}/docs.bin' 43 | if os.path.exists(path): 44 | return path 45 | da = DocumentArray() 46 | docs = _get_docs(number_of_documents, embedding_size) 47 | da.extend(docs) 48 | da.save(path, file_format='binary') 49 | da.clear() 50 | return path 51 | 52 | 53 | def _get_document_array(dam_index, **kwargs): 54 | return _get_dam(**kwargs) if dam_index else _get_da(**kwargs) 55 | 56 | 57 | class DocumentArraySearcher(Executor): 58 | def __init__( 59 | self, 60 | indexed_docs_path, 61 | dam_index, 62 | warmup=False, 63 | top_k: int = 50, 64 | *args, 65 | **kwargs, 66 | ): 67 | super().__init__(*args, **kwargs) 68 | self.indexed_docs_path = indexed_docs_path 69 | self._index_docs = ( 70 | DocumentArray.load(indexed_docs_path, file_format='binary') 71 | if not dam_index 72 | else DocumentArrayMemmap(indexed_docs_path) 73 | ) 74 | if warmup: 75 | self._index_docs.get_attributes('embedding') 76 | self._top_k = top_k 77 | 78 | @requests 79 | def search(self, docs, **kwargs): 80 | docs.match( 
81 | self._index_docs, 82 | metric='cosine', 83 | use_scipy=False, 84 | limit=self._top_k, 85 | ) 86 | 87 | 88 | @pytest.mark.skipif( 89 | 'JINA_BENCHMARK_SEARCHERS' not in os.environ, 90 | reason='This test take a lot of time, to be run explicitly and isolated from the rest', 91 | ) 92 | @pytest.mark.parametrize( 93 | 'name,indexed_docs,docs_per_request,emb_size', 94 | [ 95 | ('Tiny Index', 100, 1, 128), 96 | ('Small Index', 10000, 1, 128), 97 | ('Medium Index', 100000, 1, 128), 98 | # ('Big Index', 1000000, 1, 128), 99 | ('Batch requesting', 100000, 32, 128), 100 | ('Big embeddings', 100000, 1, 512), 101 | ], 102 | ) 103 | @pytest.mark.parametrize( 104 | 'dam_index,warmup', [(False, False), (True, False), (True, True)] 105 | ) 106 | def test_search_compare( 107 | name, 108 | indexed_docs, 109 | docs_per_request, 110 | emb_size, 111 | dam_index, 112 | warmup, 113 | ephemeral_tmpdir, 114 | json_writer, 115 | ): 116 | def _get_indexer(): 117 | path = _get_document_array( 118 | dam_index=dam_index, 119 | number_of_documents=indexed_docs, 120 | embedding_size=emb_size, 121 | dir_path=str(ephemeral_tmpdir), 122 | ) 123 | 124 | return DocumentArraySearcher( 125 | indexed_docs_path=path, dam_index=dam_index, warmup=warmup 126 | ) 127 | 128 | query_docs = [ 129 | DocumentArray(_get_docs(docs_per_request, embedding_size=emb_size)) 130 | ] * NUM_REQUESTS 131 | 132 | data_points = defaultdict(list) 133 | all_search_timings = [] 134 | 135 | def _func(): 136 | with TimeContext() as indexer_context: 137 | indexer = _get_indexer() 138 | print(f' indexer created/loaded in {indexer_context.duration / 1e6} ms') 139 | data_points['index_time'].append(indexer_context.duration) 140 | data_points['index_memory'].append(asizeof.asizeof(indexer)) 141 | 142 | tr = tracker.SummaryTracker() 143 | sum1 = tr.create_summary() 144 | timings = [] 145 | for i in range(NUM_REQUESTS): 146 | with TimeContext() as seach_context: 147 | indexer.search(query_docs[i]) 148 | timings.append(seach_context.duration) 149 | sum2 = tr.create_summary() 150 | diff = tr.diff(sum1, sum2) 151 | print(f' search finished in {sum(timings) / 1e6} ms') 152 | data_points['search_time'].append(sum(timings)) 153 | all_search_timings.extend(timings) 154 | data_points['search_memory'].append(sum([ob_sum[2] for ob_sum in diff])) 155 | 156 | shutil.rmtree(str(ephemeral_tmpdir), ignore_errors=True) 157 | os.makedirs(str(ephemeral_tmpdir)) 158 | 159 | for i in range(NUM_REPETITIONS): 160 | _func() 161 | 162 | results = {} 163 | 164 | for field in ['index_time', 'index_memory', 'search_time', 'search_memory']: 165 | results[f'mean_{field}'], results[f'std_{field}'] = get_mean_and_std( 166 | data_points[field] 167 | ) 168 | 169 | results['p90'] = get_percentile(all_search_timings, 90) 170 | results['p99'] = get_percentile(all_search_timings, 99) 171 | 172 | json_writer.append_raw( 173 | target_file=TARGET_FILE, 174 | dict_=dict( 175 | name=name, 176 | page=Pages.INDEXER_COMPARISON, 177 | iterations=NUM_REPETITIONS, 178 | results=results, 179 | metadata=dict( 180 | indexed_docs=indexed_docs, 181 | embedding_size=emb_size, 182 | docs_per_request=docs_per_request, 183 | num_requests=NUM_REQUESTS, 184 | dam_index=dam_index, 185 | warmup_embeddings=warmup, 186 | ), 187 | ), 188 | ) 189 | 190 | 191 | def get_mean_and_std(data): 192 | mean_ = mean(data) 193 | std_ = stdev(data) if len(data) > 1 else None 194 | return mean_, std_ 195 | 196 | 197 | def get_percentile(timings, percentile): 198 | array = np.array(timings) 199 | return np.percentile(array, percentile) 
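# ---------------------------------------------------------------------------
# Editor's note: the block below is an illustrative sketch, not part of the
# original benchmark suite. Assuming it lives in this module (so TimeContext,
# get_mean_and_std and get_percentile defined above are in scope), it shows the
# aggregation pattern used by test_search_compare: time each request with
# TimeContext, then summarize the nanosecond timings as mean/std and tail
# percentiles. The sleep-based workload and the name _demo_aggregation are
# made up purely for illustration.
def _demo_aggregation(num_requests: int = 20):
    import random
    import time

    timings = []
    for _ in range(num_requests):
        with TimeContext() as ctx:
            time.sleep(random.uniform(0.001, 0.005))  # stand-in for indexer.search(...)
        timings.append(ctx.duration)  # TimeContext reports nanoseconds

    mean_ns, std_ns = get_mean_and_std(timings)
    print(f'mean={mean_ns / 1e6:.2f} ms, std={(std_ns or 0) / 1e6:.2f} ms')
    print(f'p90={get_percentile(timings, 90) / 1e6:.2f} ms')
    print(f'p99={get_percentile(timings, 99) / 1e6:.2f} ms')
# ---------------------------------------------------------------------------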
200 | -------------------------------------------------------------------------------- /docs/static/artifacts/2.1.2/searchers_compare.json: -------------------------------------------------------------------------------- 1 | [{"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4663456, "std_index_time": 83310.14742514864, "mean_index_memory": 15688, "std_index_memory": 0.0, "mean_search_time": 464436898.6, "std_search_time": 4998683.976273986, "mean_search_memory": 1400770.6, "std_search_memory": 4516.044153902838, "p90": 4762196.6, "p99": 5214065.249999998}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 412290205, "std_index_time": 13893922.332277287, "mean_index_memory": 1177192, "std_index_memory": 0.0, "mean_search_time": 4712511737.6, "std_search_time": 10168175.533933548, "mean_search_memory": 1399055.6, "std_search_memory": 1360.870971106372, "p90": 47989977.9, "p99": 48727291.11}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4369986530.8, "std_index_time": 66034291.207184196, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 27651288787.6, "std_search_time": 674765844.5562556, "mean_search_memory": 1399217.8, "std_search_memory": 1660.9512936868439, "p90": 281231409.1, "p99": 283362721.43}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4350911889.2, "std_index_time": 15223257.526421051, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 40605954140.4, "std_search_time": 534816506.5463835, "mean_search_memory": 1398503, "std_search_memory": 0.0, "p90": 412417490.7, "p99": 414500806.53}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5331926431.2, "std_index_time": 149836770.8967121, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 69454878061.4, "std_search_time": 350069105.02848333, "mean_search_memory": 1398475, "std_search_memory": 0.0, "p90": 698528512.0, "p99": 701192757.06}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6735359.8, "std_index_time": 176449.9806579757, "mean_index_memory": 51968, "std_index_memory": 0.0, "mean_search_time": 574747374.4, "std_search_time": 24103476.25610035, "mean_search_memory": 1446180.2, "std_search_memory": 316.7983585816063, "p90": 5750141.4, "p99": 6004823.099999961}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 
5, "results": {"mean_index_time": 526890272.6, "std_index_time": 4792654.779521659, "mean_index_memory": 3639272, "std_index_memory": 0.0, "mean_search_time": 1258268522.8, "std_search_time": 89951360.81594121, "mean_search_memory": 1914582.8, "std_search_memory": 517.762204105321, "p90": 10897318.5, "p99": 13446589.099998713}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5441540046, "std_index_time": 15804107.99446163, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 10250244642.4, "std_search_time": 1403353517.6519396, "mean_search_memory": 1915834, "std_search_memory": 94.05317644821997, "p90": 85302622.7, "p99": 114642680.12997666}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5436396953.8, "std_index_time": 11955926.022260036, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 28176033346.2, "std_search_time": 603483342.9079329, "mean_search_memory": 1997230, "std_search_memory": 22.93468988235943, "p90": 269443261.2, "p99": 288813878.339986}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6557379258.6, "std_index_time": 146945815.77846104, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 32178833682, "std_search_time": 4081333685.770137, "mean_search_memory": 1915692.8, "std_search_memory": 138.3011207474473, "p90": 242671132.2, "p99": 295682987.7999634}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 10787957.6, "std_index_time": 286894.32551777666, "mean_index_memory": 99648, "std_index_memory": 0.0, "mean_search_time": 554426485.4, "std_search_time": 2681315.314795203, "mean_search_memory": 1399307, "std_search_memory": 0.0, "p90": 5701958.4, "p99": 6041982.189999999}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 669417835.2, "std_index_time": 5122590.498427109, "mean_index_memory": 4178768, "std_index_memory": 0.0, "mean_search_time": 1051474361.4, "std_search_time": 9151433.955484506, "mean_search_memory": 1403393.4, "std_search_memory": 71.07249819726334, "p90": 10893107.0, "p99": 12101473.249999993}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6861858093.8, "std_index_time": 21880322.70985081, "mean_index_memory": 39892816, "std_index_memory": 0.0, "mean_search_time": 5103391090.8, "std_search_time": 16373359.897277229, "mean_search_memory": 1403631.4, 
"std_search_memory": 51.699129586483366, "p90": 51674190.2, "p99": 55007636.15999998}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6920679618, "std_index_time": 20688891.064720158, "mean_index_memory": 39892816, "std_index_memory": 0.0, "mean_search_time": 24183200250.8, "std_search_time": 365741686.15848285, "mean_search_memory": 1485103.4, "std_search_memory": 28.29840984931839, "p90": 245789731.3, "p99": 247716863.32}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 11407799365.6, "std_index_time": 927906708.8663467, "mean_index_memory": 39892848, "std_index_memory": 0.0, "mean_search_time": 22219834503.4, "std_search_time": 3892685783.9269333, "mean_search_memory": 1403646.6, "std_search_memory": 45.41805808266135, "p90": 241667129.1, "p99": 249390008.06}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}] -------------------------------------------------------------------------------- /src/document_conversions_blob_image_uri_text.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pytest 5 | from jina import Document 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | """ 11 | This file contains tests for the following methods from Document: 12 | 13 | - load_uri_to_image_blob 14 | - convert_image_buffer_to_blob 15 | - convert_image_datauri_to_blob 16 | - convert_buffer_to_blob 17 | - convert_image_blob_to_uri 18 | - convert_blob_to_buffer 19 | - load_uri_to_buffer 20 | - convert_uri_to_datauri 21 | - convert_buffer_to_uri 22 | - convert_text_to_uri 23 | - load_uri_to_text 24 | - convert_content_to_uri 25 | """ 26 | 27 | 28 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 29 | 30 | 31 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 32 | def test_document_load_uri_to_image_blob(num_docs, json_writer): 33 | def _input_docs(): 34 | image_dir = os.path.join(cur_dir, "utils", "test.png") 35 | return (), dict(docs=[Document(uri=image_dir) for _ in range(num_docs)]) 36 | 37 | def _load_uri_to_image_blob(docs): 38 | for doc in docs: 39 | doc.load_uri_to_image_blob() 40 | 41 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_image_blob) 42 | 43 | json_writer.append( 44 | page=Pages.DOCUMENT_CONVERSION, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | 49 | 50 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 51 | def test_document_convert_uri_to_buffer(num_docs, json_writer): 52 | def _input_docs(): 53 | image_dir = os.path.join(cur_dir, "utils", "test.png") 54 | docs = [] 55 | for _ in range(num_docs): 56 | doc = Document(uri=image_dir) 57 | docs.append(doc) 58 | 59 | return (), dict(docs=docs) 60 | 61 | def _load_uri_to_buffer(docs): 62 | for doc in docs: 63 | doc.load_uri_to_buffer() 64 | 65 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_buffer) 66 | 67 | json_writer.append( 68 | page=Pages.DOCUMENT_CONVERSION, 69 | result=result, 70 | metadata=dict(num_docs=num_docs), 71 | ) 72 | 73 | 74 
| @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 75 | def test_document_convert_image_buffer_to_blob(num_docs, json_writer): 76 | def _input_docs(): 77 | image_dir = os.path.join(cur_dir, "utils", "test.png") 78 | docs = [] 79 | for _ in range(num_docs): 80 | doc = Document(uri=image_dir) 81 | doc.load_uri_to_buffer() 82 | docs.append(doc) 83 | 84 | return (), dict(docs=docs) 85 | 86 | def _image_buffer_to_blob(docs): 87 | for doc in docs: 88 | doc.convert_buffer_to_image_blob() 89 | 90 | result = benchmark_time(setup=_input_docs, func=_image_buffer_to_blob) 91 | 92 | json_writer.append( 93 | page=Pages.DOCUMENT_CONVERSION, 94 | result=result, 95 | metadata=dict(num_docs=num_docs), 96 | ) 97 | 98 | 99 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 100 | def test_document_convert_image_datauri_to_blob(num_docs, json_writer): 101 | def _input_docs(): 102 | image_dir = os.path.join(cur_dir, "utils", "test.png") 103 | docs = [] 104 | for _ in range(num_docs): 105 | doc = Document(uri=image_dir) 106 | doc.convert_uri_to_datauri() 107 | docs.append(doc) 108 | 109 | return (), dict(docs=docs) 110 | 111 | def _load_uri_to_image_blob(docs): 112 | for doc in docs: 113 | doc.load_uri_to_image_blob() 114 | 115 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_image_blob) 116 | 117 | json_writer.append( 118 | page=Pages.DOCUMENT_CONVERSION, 119 | result=result, 120 | metadata=dict(num_docs=num_docs), 121 | ) 122 | 123 | 124 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 125 | def test_document_convert_uri_to_datauri(num_docs, json_writer): 126 | def _input_docs(): 127 | image_dir = os.path.join(cur_dir, "utils", "test.png") 128 | docs = [] 129 | for _ in range(num_docs): 130 | doc = Document(uri=image_dir) 131 | docs.append(doc) 132 | 133 | return (), dict(docs=docs) 134 | 135 | def _convert_uri_to_datauri(docs): 136 | for doc in docs: 137 | doc.convert_uri_to_datauri() 138 | 139 | result = benchmark_time(setup=_input_docs, func=_convert_uri_to_datauri) 140 | 141 | json_writer.append( 142 | page=Pages.DOCUMENT_CONVERSION, 143 | result=result, 144 | metadata=dict(num_docs=num_docs), 145 | ) 146 | 147 | 148 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 149 | def test_document_convert_buffer_to_blob(num_docs, json_writer): 150 | def _input_docs(): 151 | return ( 152 | (), 153 | dict( 154 | docs=[ 155 | Document(content=np.random.random((85, 152, 3))) 156 | for _ in range(num_docs) 157 | ] 158 | ), 159 | ) 160 | 161 | def _convert_buffer_to_blob(docs): 162 | for doc in docs: 163 | doc.convert_buffer_to_blob() 164 | 165 | result = benchmark_time(setup=_input_docs, func=_convert_buffer_to_blob) 166 | 167 | json_writer.append( 168 | page=Pages.DOCUMENT_CONVERSION, 169 | result=result, 170 | metadata=dict(num_docs=num_docs), 171 | ) 172 | 173 | 174 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 175 | def test_document_convert_image_blob_to_uri(num_docs, json_writer): 176 | def _input_docs(): 177 | return ( 178 | (), 179 | dict( 180 | docs=[ 181 | Document(content=np.random.randint(0, 255, 32 * 28)) 182 | for _ in range(num_docs) 183 | ] 184 | ), 185 | ) 186 | 187 | def _convert_image_blob_to_uri(docs): 188 | for doc in docs: 189 | doc.convert_image_blob_to_uri() 190 | 191 | result = benchmark_time(setup=_input_docs, func=_convert_image_blob_to_uri) 192 | 193 | json_writer.append( 194 | page=Pages.DOCUMENT_CONVERSION, 195 | result=result, 196 | metadata=dict(num_docs=num_docs), 197 | ) 198 | 199 | 200 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 201 
| def test_document_convert_content_to_uri(num_docs, json_writer): 202 | def _input_docs(): 203 | return ( 204 | (), 205 | dict( 206 | docs=[ 207 | Document(content=np.random.randint(0, 255, 32 * 28)) 208 | for _ in range(num_docs) 209 | ] 210 | ), 211 | ) 212 | 213 | def _convert_content_to_uri(docs): 214 | for doc in docs: 215 | _ = doc.convert_content_to_uri() 216 | 217 | result = benchmark_time(setup=_input_docs, func=_convert_content_to_uri) 218 | 219 | json_writer.append( 220 | page=Pages.DOCUMENT_CONVERSION, 221 | result=result, 222 | metadata=dict(num_docs=num_docs), 223 | ) 224 | 225 | 226 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 227 | def test_document_convert_text_to_uri(num_docs, json_writer): 228 | def _input_docs(): 229 | return ( 230 | (), 231 | dict( 232 | docs=[ 233 | Document(content=np.random.randint(0, 255, 32 * 28)) 234 | for _ in range(num_docs) 235 | ] 236 | ), 237 | ) 238 | 239 | def _convert_text_to_uri(docs): 240 | for doc in docs: 241 | _ = doc.dump_text_to_datauri() 242 | 243 | result = benchmark_time(setup=_input_docs, func=_convert_text_to_uri) 244 | 245 | json_writer.append( 246 | page=Pages.DOCUMENT_CONVERSION, 247 | result=result, 248 | metadata=dict(num_docs=num_docs), 249 | ) 250 | 251 | 252 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 253 | def test_document_convert_buffer_to_uri(num_docs, json_writer): 254 | def _input_docs(): 255 | return ( 256 | (), 257 | dict( 258 | docs=[ 259 | Document(uri=os.path.join(cur_dir, "utils", "test.png")) 260 | for _ in range(num_docs) 261 | ] 262 | ), 263 | ) 264 | 265 | def _convert_buffer_to_uri(docs): 266 | for doc in docs: 267 | _ = doc.convert_buffer_to_uri() 268 | 269 | result = benchmark_time(setup=_input_docs, func=_convert_buffer_to_uri) 270 | 271 | json_writer.append( 272 | page=Pages.DOCUMENT_CONVERSION, 273 | result=result, 274 | metadata=dict(num_docs=num_docs), 275 | ) 276 | 277 | 278 | @pytest.mark.parametrize("num_docs", [1, 5]) 279 | def test_document_load_uri_to_text(num_docs, json_writer): 280 | def _input_docs(): 281 | return ( 282 | (), 283 | dict( 284 | docs=[ 285 | Document(uri="http://google.com/index.html", mime_type="text/html") 286 | for _ in range(num_docs) 287 | ] 288 | ), 289 | ) 290 | 291 | def _load_uri_to_text(docs): 292 | for doc in docs: 293 | _ = doc.load_uri_to_text() 294 | 295 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_text) 296 | 297 | json_writer.append( 298 | page=Pages.DOCUMENT_CONVERSION, 299 | result=result, 300 | metadata=dict(num_docs=num_docs), 301 | ) 302 | 303 | 304 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 305 | def test_document_convert_blob_to_buffer(num_docs, json_writer): 306 | def _input_docs(): 307 | return ( 308 | (), 309 | dict( 310 | docs=[ 311 | Document(content=np.random.randint(0, 255, 32 * 28)) 312 | for _ in range(num_docs) 313 | ] 314 | ), 315 | ) 316 | 317 | def _convert_blob_to_buffer(docs): 318 | for doc in docs: 319 | _ = doc.convert_blob_to_buffer() 320 | 321 | result = benchmark_time(setup=_input_docs, func=_convert_blob_to_buffer) 322 | 323 | json_writer.append( 324 | page=Pages.DOCUMENT_CONVERSION, 325 | result=result, 326 | metadata=dict(num_docs=num_docs), 327 | ) 328 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1.
Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/document_construct.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images. 
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | def _generate_random_document( 31 | origin, text_length=None, buffer_length=None, num_dims=None 32 | ): 33 | tags = {'tag1': [0, 2, 3], 'tag2': 'value of tag2'} 34 | if origin == 'text': 35 | return Document(text=_generate_random_text(text_length), tags=tags) 36 | if origin == 'blob': 37 | return Document(blob=_generate_random_blob(num_dims), tags=tags) 38 | if origin == 'buffer': 39 | return Document(buffer=_generate_random_buffer(buffer_length), tags=tags) 40 | 41 | 42 | def _generate_random_document_with_chunks_and_matches( 43 | origin, text_length=None, buffer_length=None, num_dims=None 44 | ): 45 | root = _generate_random_document(origin, text_length, buffer_length, num_dims) 46 | 47 | num_chunks = random.randint(1, 20) 48 | num_matches = random.randint(1, 20) 49 | for _ in range(num_chunks): 50 | root.chunks.append( 51 | _generate_random_document(origin, text_length, buffer_length, num_dims) 52 | ) 53 | for _ in range(num_matches): 54 | root.matches.append( 55 | _generate_random_document(origin, text_length, buffer_length, num_dims) 56 | ) 57 | return root 58 | 59 | 60 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 61 | def test_construct_text(text_length, json_writer): 62 | def _doc_build(text): 63 | Document(text=text) 64 | 65 | result = benchmark_time( 66 | func=_doc_build, kwargs=dict(text=_generate_random_text(text_length)) 67 | ) 68 | 69 | json_writer.append( 70 | page=Pages.DOCUMENT_CONSTRUCT, 71 | result=result, 72 | metadata=dict(text_length=text_length), 73 | ) 74 | 75 | 76 | @pytest.mark.parametrize('num_dims', [1, 2]) 77 | def test_construct_blob(num_dims, json_writer): 78 | def _doc_build(blob): 79 | Document(blob=blob) 80 | 81 | result = benchmark_time( 82 | func=_doc_build, kwargs=dict(blob=_generate_random_blob(num_dims)) 83 | ) 84 | 85 | json_writer.append( 86 | page=Pages.DOCUMENT_CONSTRUCT, 87 | result=result, 88 | metadata=dict(num_dims=num_dims), 89 | ) 90 | 91 | 92 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 93 | def test_construct_buffer(buffer_length, json_writer): 94 | def _doc_build(buffer): 95 | Document(buffer=buffer) 96 | 97 | result = benchmark_time( 98 | func=_doc_build, kwargs=dict(buffer=_generate_random_buffer(buffer_length)) 99 | ) 100 | 101 | json_writer.append( 102 | page=Pages.DOCUMENT_CONSTRUCT, 103 | result=result, 104 | metadata=dict(buffer_length=buffer_length), 105 | ) 106 | 107 | 108 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 109 | def test_construct_btyes_origin_text(text_length, json_writer): 110 | def _doc_build(b): 111 | Document(obj=b) 112 | 113 | result = benchmark_time( 114 | func=_doc_build, 115 | kwargs=dict( 116 | b=_generate_random_document( 117 | 'text', text_length=text_length 118 | ).proto.SerializeToString() 119 | ), 120 | ) 121 | 122 | json_writer.append( 123 | page=Pages.DOCUMENT_CONSTRUCT, 124 | result=result, 125 | metadata=dict(text_length=text_length), 126 | ) 127 | 128 | 129 | @pytest.mark.parametrize('num_dims', [1, 2]) 130 | def test_construct_btyes_origin_blob(num_dims, json_writer): 131 | def _doc_build(b): 132 | Document(obj=b) 133 | 134 | result = benchmark_time( 135 | func=_doc_build, 136 | kwargs=dict( 137 | b=_generate_random_document( 138 | 'blob', num_dims=num_dims 139 | ).proto.SerializeToString() 140 | ), 141 | ) 142 | 143 | json_writer.append( 144 | page=Pages.DOCUMENT_CONSTRUCT, 145 | result=result, 
146 | metadata=dict(num_dims=num_dims), 147 | ) 148 | 149 | 150 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 151 | def test_construct_btyes_origin_buffer(buffer_length, json_writer): 152 | def _doc_build(b): 153 | Document(obj=b) 154 | 155 | result = benchmark_time( 156 | func=_doc_build, 157 | kwargs=dict( 158 | b=_generate_random_document( 159 | 'buffer', buffer_length=buffer_length 160 | ).proto.SerializeToString() 161 | ), 162 | ) 163 | 164 | json_writer.append( 165 | page=Pages.DOCUMENT_CONSTRUCT, 166 | result=result, 167 | metadata=dict(buffer_length=buffer_length), 168 | ) 169 | 170 | 171 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 172 | def test_construct_str_json_origin_text(text_length, json_writer): 173 | def _doc_build(b): 174 | Document(obj=b) 175 | 176 | result = benchmark_time( 177 | func=_doc_build, 178 | kwargs=dict( 179 | b=_generate_random_document('text', text_length=text_length).json() 180 | ), 181 | ) 182 | 183 | json_writer.append( 184 | page=Pages.DOCUMENT_CONSTRUCT, 185 | result=result, 186 | metadata=dict(text_length=text_length), 187 | ) 188 | 189 | 190 | @pytest.mark.parametrize('num_dims', [1, 2]) 191 | def test_construct_str_json_origin_blob(num_dims, json_writer): 192 | def _doc_build(b): 193 | Document(obj=b) 194 | 195 | result = benchmark_time( 196 | func=_doc_build, 197 | kwargs=dict(b=_generate_random_document('blob', num_dims=num_dims).json()), 198 | ) 199 | 200 | json_writer.append( 201 | page=Pages.DOCUMENT_CONSTRUCT, 202 | result=result, 203 | metadata=dict(num_dims=num_dims), 204 | ) 205 | 206 | 207 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 208 | def test_construct_str_json_origin_buffer(buffer_length, json_writer): 209 | def _doc_build(b): 210 | Document(obj=b) 211 | 212 | result = benchmark_time( 213 | func=_doc_build, 214 | kwargs=dict( 215 | b=_generate_random_document('buffer', buffer_length=buffer_length).json() 216 | ), 217 | ) 218 | 219 | json_writer.append( 220 | page=Pages.DOCUMENT_CONSTRUCT, 221 | result=result, 222 | metadata=dict(buffer_length=buffer_length), 223 | ) 224 | 225 | 226 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 227 | def test_construct_dict_origin_text(text_length, json_writer): 228 | def _doc_build(b): 229 | Document(obj=b) 230 | 231 | result = benchmark_time( 232 | func=_doc_build, 233 | kwargs=dict( 234 | b=_generate_random_document('text', text_length=text_length).dict() 235 | ), 236 | ) 237 | 238 | json_writer.append( 239 | page=Pages.DOCUMENT_CONSTRUCT, 240 | result=result, 241 | metadata=dict(text_length=text_length), 242 | ) 243 | 244 | 245 | @pytest.mark.parametrize('num_dims', [1, 2]) 246 | def test_construct_dict_origin_blob(num_dims, json_writer): 247 | def _doc_build(b): 248 | Document(obj=b) 249 | 250 | result = benchmark_time( 251 | func=_doc_build, 252 | kwargs=dict(b=_generate_random_document('blob', num_dims=num_dims).dict()), 253 | ) 254 | 255 | json_writer.append( 256 | page=Pages.DOCUMENT_CONSTRUCT, 257 | result=result, 258 | metadata=dict(num_dims=num_dims), 259 | ) 260 | 261 | 262 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 263 | def test_construct_dict_origin_buffer(buffer_length, json_writer): 264 | def _doc_build(b): 265 | Document(obj=b) 266 | 267 | result = benchmark_time( 268 | func=_doc_build, 269 | kwargs=dict( 270 | b=_generate_random_document('buffer', buffer_length=buffer_length).dict() 271 | ), 272 | ) 273 | 274 | json_writer.append( 275 | page=Pages.DOCUMENT_CONSTRUCT, 276 | 
result=result, 277 | metadata=dict(buffer_length=buffer_length), 278 | ) 279 | 280 | 281 | @pytest.mark.parametrize('copy', [True, False]) 282 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 283 | def test_construct_document_origin_text(copy, text_length, json_writer): 284 | def _doc_build(d): 285 | Document(obj=d, copy=copy) 286 | 287 | _doc_build(d=_generate_random_document('text', text_length)) 288 | 289 | result = benchmark_time( 290 | func=_doc_build, 291 | kwargs=dict(d=_generate_random_document('text', text_length)), 292 | ) 293 | 294 | json_writer.append( 295 | page=Pages.DOCUMENT_CONSTRUCT, 296 | result=result, 297 | metadata=dict(text_length=text_length, copy=copy), 298 | ) 299 | 300 | 301 | @pytest.mark.parametrize('copy', [True, False]) 302 | @pytest.mark.parametrize('num_dims', [1, 2]) 303 | def test_construct_document_origin_blob(copy, num_dims, json_writer): 304 | def _doc_build(d): 305 | Document(obj=d, copy=copy) 306 | 307 | result = benchmark_time( 308 | func=_doc_build, 309 | kwargs=dict(d=_generate_random_document('blob', num_dims=num_dims)), 310 | ) 311 | 312 | json_writer.append( 313 | page=Pages.DOCUMENT_CONSTRUCT, 314 | result=result, 315 | metadata=dict(num_dims=num_dims, copy=copy), 316 | ) 317 | 318 | 319 | @pytest.mark.parametrize('copy', [True, False]) 320 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 321 | def test_construct_document_origin_buffer(copy, buffer_length, json_writer): 322 | def _doc_build(d): 323 | Document(obj=d, copy=copy) 324 | 325 | result = benchmark_time( 326 | func=_doc_build, 327 | kwargs=dict(d=_generate_random_document('buffer', buffer_length=buffer_length)), 328 | ) 329 | 330 | json_writer.append( 331 | page=Pages.DOCUMENT_CONSTRUCT, 332 | result=result, 333 | metadata=dict(buffer_length=buffer_length, copy=copy), 334 | ) 335 | 336 | 337 | @pytest.mark.parametrize('copy', [True, False]) 338 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 339 | def test_construct_document_origin_text_proto(copy, text_length, json_writer): 340 | def _doc_build(d): 341 | Document(obj=d, copy=copy) 342 | 343 | result = benchmark_time( 344 | func=_doc_build, 345 | kwargs=dict(d=_generate_random_document('text', text_length).proto), 346 | ) 347 | 348 | json_writer.append( 349 | page=Pages.DOCUMENT_CONSTRUCT, 350 | result=result, 351 | metadata=dict(text_length=text_length, copy=copy), 352 | ) 353 | 354 | 355 | @pytest.mark.parametrize('copy', [True, False]) 356 | @pytest.mark.parametrize('num_dims', [1, 2]) 357 | def test_construct_document_origin_blob_proto(copy, num_dims, json_writer): 358 | def _doc_build(d): 359 | Document(obj=d, copy=copy) 360 | 361 | result = benchmark_time( 362 | func=_doc_build, 363 | kwargs=dict(d=_generate_random_document('blob', num_dims=num_dims).proto), 364 | ) 365 | 366 | json_writer.append( 367 | page=Pages.DOCUMENT_CONSTRUCT, 368 | result=result, 369 | metadata=dict(num_dims=num_dims, copy=copy), 370 | ) 371 | 372 | 373 | @pytest.mark.parametrize('copy', [True, False]) 374 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 375 | def test_construct_document_origin_buffer_proto(copy, buffer_length, json_writer): 376 | def _doc_build(d): 377 | Document(obj=d, copy=copy) 378 | 379 | result = benchmark_time( 380 | func=_doc_build, 381 | kwargs=dict( 382 | d=_generate_random_document('buffer', buffer_length=buffer_length).proto 383 | ), 384 | ) 385 | 386 | json_writer.append( 387 | page=Pages.DOCUMENT_CONSTRUCT, 388 | result=result, 389 | 
metadata=dict(buffer_length=buffer_length, copy=copy), 390 | ) 391 | -------------------------------------------------------------------------------- /scripts/site_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import json 4 | import os 5 | import copy 6 | from collections import defaultdict 7 | from distutils.version import LooseVersion 8 | from pathlib import Path 9 | from typing import Any, Dict, List, Tuple, Union, Optional 10 | 11 | 12 | COLOR_VALUES = [ 13 | '#10a100', 14 | '#7ead14', 15 | '#bab73c', 16 | '#e8c268', 17 | '#e59838', 18 | '#e36717', 19 | '#de1414', 20 | ] 21 | 22 | COLOR_NAN = '#9b00a1' 23 | 24 | NOT_A_NUMBER = 'N/A' 25 | 26 | STD_MEAN_THRESHOLD = 0.5 27 | 28 | COLOR_LEGEND = ' | '.join( 29 | [ 30 | f'{i*10} - {(i+1)*10}%' 31 | for i, color in enumerate(COLOR_VALUES) 32 | ] 33 | ) 34 | 35 | LEGEND = f""" 36 | The following data should be read as follows: 37 | 38 | - Colors of cells display the percentage of the minimum value in the column:\n 39 | {COLOR_LEGEND} 40 | - 1337: unstable tests with "standard deviation / mean > {STD_MEAN_THRESHOLD}" 41 | """ 42 | 43 | 44 | def _format(data: Union[int, float]) -> Any: 45 | if isinstance(data, bool): 46 | return str(data) 47 | elif isinstance(data, int) or isinstance(data, float): 48 | if data >= 1000: 49 | _data = data 50 | i = 0 51 | while abs(_data) >= 1000: 52 | i += 1 53 | _data /= 1000 54 | 55 | if isinstance(data, int): 56 | return '%d%s' % (_data, ['', 'K', 'M', 'G', 'T', 'P'][i]) 57 | else: 58 | return '%.2f%s' % (_data, ['', 'K', 'M', 'G', 'T', 'P'][i]) 59 | else: 60 | i = 1 61 | _data = round(data, i) 62 | while _data == 0 and i <= 5: 63 | i += 1 64 | _data = round(data, i) 65 | 66 | return _data 67 | else: 68 | return data 69 | 70 | 71 | def _get_color(mean_time, master_mean_time): 72 | if mean_time is None or mean_time == NOT_A_NUMBER or master_mean_time == 0: 73 | return COLOR_NAN 74 | raw_bucket = int((float(mean_time) / float(master_mean_time) - 1) * 10) 75 | bucket = max(0, min(6, raw_bucket)) 76 | 77 | return COLOR_VALUES[bucket] 78 | 79 | 80 | def _get_cleaned_mean_time(time: Optional[int], scaling: int) -> str: 81 | """Return cleaned data""" 82 | 83 | if time is not None: 84 | return str(int(int(time) / scaling)) 85 | else: 86 | return NOT_A_NUMBER 87 | 88 | 89 | def _cleaned_title(raw_heading: str) -> str: 90 | """Return cleaned title of artifact name.""" 91 | return raw_heading.replace('test_', '').replace('_', ' ').title() 92 | 93 | 94 | def is_test_unstable(run_stats): 95 | mean = run_stats.get('mean_time', 1e20) 96 | return mean != 0 and run_stats.get('std_time', 0.0) / mean > STD_MEAN_THRESHOLD 97 | 98 | 99 | def _get_table_header(raw_data: List[Dict[str, Any]]) -> Tuple[str, str]: 100 | """Return metadata table title and table separator.""" 101 | titles = {} 102 | for test_run in raw_data: 103 | for name in test_run['metadata']: 104 | titles[name] = [] 105 | break 106 | separators = [] 107 | for result in raw_data: 108 | separators.append('---:') 109 | for field in titles: 110 | if 'metadata' in result: 111 | value = result['metadata'].get(field, 'N/A') 112 | titles[field].append(f'**{value}**') 113 | 114 | else: 115 | titles[field].append('**N/A**') 116 | final = [] 117 | for title, values in titles.items(): 118 | final.append(f'| **{title}** | {" | ".join(values)} |\n') 119 | header = f'{final[0]}| :---: | {" | ".join(separators)} |\n{"".join(final[1:])}' 120 | return header 121 | 122 | 123 | def 
_get_version_list(artifacts_dir: str) -> List[str]: 124 | """Generates sorted list of all versions found in reports. 125 | 126 | Args: 127 | artifacts_dir: Absolute path to artifact directory. 128 | 129 | Return: List of versions found in reports. 130 | """ 131 | lv = [] 132 | 133 | for folder in os.listdir(artifacts_dir): 134 | if os.path.isfile(os.path.join(artifacts_dir, folder, 'report.json')): 135 | lv.append(LooseVersion(folder)) 136 | 137 | lv.sort() 138 | sorted_dev = [v.vstring for v in lv] 139 | 140 | import re 141 | 142 | p = re.compile('dev\\d+$') 143 | 144 | i = 0 145 | while i + 1 < len(sorted_dev): 146 | tmp = sorted_dev[i] 147 | m = p.search(sorted_dev[i + 1]) 148 | if m and sorted_dev[i + 1].startswith(tmp): 149 | sorted_dev[i] = sorted_dev[i + 1] 150 | sorted_dev[i + 1] = tmp 151 | i += 1 152 | 153 | version_list = [sorted_dev[i - 1] for i in range(len(sorted_dev), 0, -1)] 154 | 155 | return version_list 156 | 157 | 158 | def _get_cum_data(version_list: List[str], artifacts_dir: str) -> Dict[Any, Any]: 159 | """Generates cumulative data and return in a dict. 160 | 161 | Args: 162 | version_list: List of versions found in reports. 163 | artifacts_dir: Absolute path to artifact directory. 164 | 165 | Return: Dict of cumulative data 166 | """ 167 | data: Dict[Any, Any] = defaultdict( 168 | lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) 169 | ) 170 | 171 | for version in version_list: 172 | report_file = os.path.join(artifacts_dir, version, 'report.json') 173 | searchers_compare_file = os.path.join( 174 | artifacts_dir, version, 'searchers_compare.json' 175 | ) 176 | 177 | if os.path.isfile(report_file): 178 | with open(report_file) as fp: 179 | _raw_data = json.load(fp) 180 | 181 | if os.path.isfile(searchers_compare_file): 182 | with open(searchers_compare_file) as fp: 183 | _raw_data.extend(json.load(fp)) 184 | 185 | for i in _raw_data: 186 | page = i.get('page', 'unsorted_tests') 187 | test_name = i['name'] 188 | metadata_hash = _hash_run(i) 189 | 190 | data[page][test_name][version][metadata_hash] = i 191 | 192 | return data 193 | 194 | 195 | def generate_homepage(output_dir: str) -> None: 196 | """This generate required homepage for the website. 197 | 198 | Args: 199 | output_dir: Absolute path to Hugo content directory. 
200 | """ 201 | src = os.path.join(os.getcwd(), 'README.md') 202 | dst = os.path.join(output_dir, '_index.md') 203 | Path(output_dir).mkdir(parents=True, exist_ok=True) 204 | 205 | if os.path.isfile(src): 206 | with open(src) as f: 207 | data = f.read() 208 | 209 | with open(dst, 'w') as fp: 210 | fp.write('---\n') 211 | fp.write('title: Benchmark Jina\n') 212 | fp.write('type: docs\n') 213 | fp.write('---\n') 214 | fp.write(data) 215 | 216 | 217 | def _hash_run(d): 218 | tmp_dict = copy.deepcopy(d) 219 | tmp_dict.pop('mean_time', None) 220 | tmp_dict.pop('std_time', None) 221 | tmp_dict.pop('iterations', None) 222 | tmp_dict.pop('results', None) 223 | 224 | return json.dumps(tmp_dict, sort_keys=True) 225 | 226 | 227 | def _get_stats(test_data, latest_version): 228 | results = defaultdict(dict) 229 | for version, test_results in test_data.items(): 230 | for test_result in test_results.values(): 231 | parameter_hash = _hash_run(test_result) 232 | metadata = test_result.get('metadata', {}) 233 | if not metadata: 234 | metadata = {'name': test_result['name']} 235 | results[parameter_hash]['metadata'] = metadata 236 | 237 | results[parameter_hash]['min'] = min( 238 | results[parameter_hash].get('min', 1e20), test_result['mean_time'] 239 | ) 240 | results[parameter_hash]['max'] = max( 241 | results[parameter_hash].get('max', 0), test_result['mean_time'] 242 | ) 243 | results[parameter_hash]['parameter_hash'] = parameter_hash 244 | 245 | if version == latest_version: 246 | results[parameter_hash]['last_version_mean'] = test_result['mean_time'] 247 | 248 | stats = list(results.values()) 249 | _add_scaling(stats) 250 | return stats 251 | 252 | 253 | def _get_one_version_stats(test_results): 254 | results = defaultdict(lambda x: 1e20) 255 | results['min_mean_docs_per_sec'] = 0 256 | 257 | for test in test_results: 258 | results['min_time'] = min(results['min_time'], test['mean_time']) 259 | results['min_memory'] = min(results['min_memory'], test['mean_memory']) 260 | results['min_indexer_memory'] = min( 261 | results['min_indexer_memory'], test['mean_indexer_memory'] 262 | ) 263 | results['min_mean_docs_per_sec'] = max( 264 | results['min_mean_docs_per_sec'], test['mean_mean_docs_per_sec'] 265 | ) 266 | results['min_latency'] = min(results['min_latency'], test['mean_latency']) 267 | 268 | return results 269 | 270 | 271 | def _add_scaling(stats): 272 | for run_stats in stats: 273 | if run_stats['min'] > 10_000_000_000: 274 | run_stats['scaling'] = 1_000_000_000 275 | run_stats['metadata']['unit'] = 's' 276 | if run_stats['min'] > 10_000_000: 277 | run_stats['scaling'] = 1_000_000 278 | run_stats['metadata']['unit'] = 'ms' 279 | elif run_stats['min'] > 10_000: 280 | run_stats['scaling'] = 1_000 281 | run_stats['metadata']['unit'] = 'μs' 282 | else: 283 | run_stats['scaling'] = 1 284 | run_stats['metadata']['unit'] = 'ns' 285 | run_stats['min'] = int(run_stats['min'] / run_stats['scaling']) 286 | run_stats['max'] = int(run_stats['max'] / run_stats['scaling']) 287 | 288 | 289 | def generate_docs( 290 | version_list: List[str], cum_data: Dict[Any, Any], output_dir: str 291 | ) -> None: 292 | """This generate required docs from artifacts. 293 | 294 | Args: 295 | version_list: List of versions found in reports. 296 | cum_data: Cumulative data in Dict. 297 | output_dir: Absolute path to Hugo docs directory. 
298 | """ 299 | Path(output_dir).mkdir(parents=True, exist_ok=True) 300 | 301 | for page, page_data in cum_data.items(): 302 | output_file = os.path.join(output_dir, f'{page}.md') 303 | if page == 'indexer_comparison': 304 | generate_comparison_test(page_data, output_file, _cleaned_title(page)) 305 | else: 306 | generate_versioned_test(page_data, output_file, _cleaned_title(page)) 307 | 308 | 309 | def _get_last_version(single_test_data): 310 | versions = list(single_test_data.keys()) 311 | if versions: 312 | return max(versions) 313 | else: 314 | return None 315 | 316 | 317 | def generate_versioned_test(page_data, output_file, title): 318 | with open(output_file, 'w') as fp: 319 | fp.write('---\n') 320 | fp.write(f'title: {title}\n') 321 | fp.write('---\n') 322 | fp.write(f'# {title}\n\n') 323 | 324 | fp.write(f'{LEGEND}\n') 325 | 326 | for test_name, single_test_data in page_data.items(): 327 | latest_version = _get_last_version(single_test_data) 328 | 329 | if latest_version is None: 330 | return 331 | 332 | stats = _get_stats(single_test_data, latest_version) 333 | header = _get_table_header(stats) 334 | 335 | fp.write(f'## {_cleaned_title(test_name)}\n') 336 | fp.write(header) 337 | 338 | for version, data_dict in single_test_data.items(): 339 | fp.write(f'| {version} |') 340 | for run in stats: 341 | run_data = data_dict[run['parameter_hash']] 342 | 343 | mean_time = _get_cleaned_mean_time( 344 | run_data.get('mean_time', None), run['scaling'] 345 | ) 346 | color = _get_color(mean_time, run['min']) 347 | 348 | if is_test_unstable(run_data): 349 | mean_time = f'{mean_time}' 350 | 351 | fp.write(f' {mean_time} |') 352 | fp.write('\n') 353 | fp.write('\n') 354 | 355 | 356 | def generate_comparison_test(page_data, output_file, title): 357 | with open(output_file, 'w') as fp: 358 | fp.write('---\n') 359 | fp.write(f'title: {title}\n') 360 | fp.write('---\n') 361 | fp.write(f'# {title}\n\n') 362 | 363 | for test_name, single_test_data in page_data.items(): 364 | latest_version = _get_last_version(single_test_data) 365 | 366 | if latest_version is None: 367 | continue 368 | 369 | table = [] 370 | 371 | test_data = single_test_data[latest_version] 372 | 373 | header = _get_table_header(list(test_data.values())) 374 | 375 | fp.write(f'## {_cleaned_title(test_name)}\n') 376 | fp.write(f'Tests were performed against Jina {latest_version}.\n\n') 377 | fp.write(header) 378 | 379 | table.append( 380 | [ 381 | 'index time in ms', 382 | 'search time in ms', 383 | 'index memory', 384 | 'search memory', 385 | 'p90 in ms', 386 | 'p99 in ms', 387 | 'RPS', 388 | 'Documents per second', 389 | ] 390 | ) 391 | 392 | for run in test_data.values(): 393 | 394 | table.append( 395 | [ 396 | _get_cleaned_mean_time(run['results']['mean_index_time'], 1e6), 397 | _get_cleaned_mean_time(run['results']['mean_search_time'], 1e6), 398 | get_readable_size(run['results']['mean_search_memory']), 399 | get_readable_size(run['results']['mean_index_memory']), 400 | _get_cleaned_mean_time(run['results']['p90'], 1e6), 401 | _get_cleaned_mean_time(run['results']['p99'], 1e6), 402 | get_rps(run), 403 | get_dps(run), 404 | ] 405 | ) 406 | 407 | transposed = list(map(list, zip(*table))) 408 | 409 | fp.write('|\n|'.join(' | '.join(row) for row in transposed)) 410 | fp.write('\n\n') 411 | 412 | 413 | def get_dps(run): 414 | total_docs = run['metadata']['docs_per_request'] * run['metadata']['num_requests'] 415 | dps = total_docs / (run['results']['mean_search_time'] / 1e9) 416 | return f'{dps:.2f}' 417 | 418 | 419 | def get_rps(run): 
420 | rps = run['metadata']['num_requests'] / (run['results']['mean_search_time'] / 1e9) 421 | return f'{rps:.2f}' 422 | 423 | 424 | def get_readable_size(num_bytes: Union[int, float]) -> str: 425 | """ 426 | Transform the bytes into readable value with different units (e.g. 1 KB, 20 MB, 30.1 GB). 427 | 428 | :param num_bytes: Number of bytes. 429 | :return: Human readable string representation. 430 | """ 431 | num_bytes = int(num_bytes) 432 | if num_bytes < 1024: 433 | return f'{num_bytes} Bytes' 434 | elif num_bytes < 1024 ** 2: 435 | return f'{num_bytes / 1024:.1f} KB' 436 | elif num_bytes < 1024 ** 3: 437 | return f'{num_bytes / (1024 ** 2):.1f} MB' 438 | else: 439 | return f'{num_bytes / (1024 ** 3):.1f} GB' 440 | 441 | 442 | def generate_menus(cum_data: Dict[Any, Any], output_dir: str) -> None: 443 | """This generate required menus from artifacts. 444 | 445 | Args: 446 | cum_data: Cumulative data in Dict. 447 | output_dir: Absolute path to Hugo menus directory. 448 | """ 449 | menu_dir = os.path.join(output_dir, 'menu') 450 | menu_index = os.path.join(menu_dir, 'index.md') 451 | Path(menu_dir).mkdir(parents=True, exist_ok=True) 452 | 453 | with open(menu_index, 'w') as fp: 454 | fp.write('---\n') 455 | fp.write('headless: true\n') 456 | fp.write('---\n\n') 457 | 458 | for page in cum_data: 459 | fp.write( 460 | '- [%s]({{< relref "/docs/%s.md" >}})\n' % (_cleaned_title(page), page) 461 | ) 462 | 463 | 464 | def main(): 465 | """This is the main function to call.""" 466 | base_dir = os.path.join(os.getcwd(), 'docs') 467 | content_dir = os.path.join(base_dir, 'content') 468 | docs_dir = os.path.join(content_dir, 'docs') 469 | artifacts_dir = os.path.join(base_dir, 'static/artifacts') 470 | 471 | version_list = _get_version_list(artifacts_dir) 472 | cum_data = _get_cum_data(version_list, artifacts_dir) 473 | 474 | generate_homepage(content_dir) 475 | generate_docs(version_list, cum_data, docs_dir) 476 | generate_menus(cum_data, content_dir) 477 | 478 | 479 | if __name__ == '__main__': 480 | main() 481 | -------------------------------------------------------------------------------- /docs/static/artifacts/2.0.12/report.json: -------------------------------------------------------------------------------- 1 | [{"name": "test_da_append", "page": "document_array_append", "iterations": 161, "mean_time": 12393200, "std_time": 86694, "metadata": {"num_docs_append": 10000}}, {"name": "test_dam_append", "page": "document_array_append", "iterations": 14, "mean_time": 148900621, "std_time": 6612586, "metadata": {"num_docs_append": 10000, "flush": true}}, {"name": "test_dam_append", "page": "document_array_append", "iterations": 22, "mean_time": 92221945, "std_time": 739450, "metadata": {"num_docs_append": 10000, "flush": false}}, {"name": "test_da_clear", "page": "document_array_clear", "iterations": 473, "mean_time": 59096, "std_time": 2631, "metadata": {"num_docs": 100}}, {"name": "test_da_clear", "page": "document_array_clear", "iterations": 5, "mean_time": 8685183, "std_time": 230697, "metadata": {"num_docs": 10000}}, {"name": "test_dam_clear", "page": "document_array_clear", "iterations": 364, "mean_time": 94221, "std_time": 3466, "metadata": {"num_docs": 100}}, {"name": "test_dam_clear", "page": "document_array_clear", "iterations": 5, "mean_time": 1129013, "std_time": 40003, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_repeated_container", "page": "document_array_construct", "iterations": 360373, "mean_time": 3762, "std_time": 357, "metadata": 
{"num_chunks": 10000}}, {"name": "test_construct_document_array_from_another_documentarray", "page": "document_array_construct", "iterations": 700784, "mean_time": 1196, "std_time": 174, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_list_of_documents", "page": "document_array_construct", "iterations": 1064, "mean_time": 1875242, "std_time": 24081, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_generator", "page": "document_array_construct", "iterations": 5, "mean_time": 992149743, "std_time": 13339544, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_another_documentarray_memmap", "page": "document_array_construct", "iterations": 5, "mean_time": 582987914, "std_time": 14897190, "metadata": {"num_docs": 10000}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 238, "mean_time": 111895, "std_time": 4055, "metadata": {"num_docs": 100, "num_feat": 128}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 10, "mean_time": 2394109, "std_time": 23696, "metadata": {"num_docs": 10000, "num_feat": 128}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 10, "mean_time": 3750944, "std_time": 78874, "metadata": {"num_docs": 10000, "num_feat": 256}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1640, "mean_time": 1211932, "std_time": 14254, "metadata": {"num_docs": 1000, "label": "empty", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 229, "mean_time": 8420927, "std_time": 60451, "metadata": {"num_docs": 1000, "label": "empty", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1629, "mean_time": 1220334, "std_time": 21265, "metadata": {"num_docs": 1000, "label": "blob", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 109, "mean_time": 17562986, "std_time": 207467, "metadata": {"num_docs": 1000, "label": "blob", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1638, "mean_time": 1213585, "std_time": 13941, "metadata": {"num_docs": 1000, "label": "text", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 212, "mean_time": 9137207, "std_time": 56778, "metadata": {"num_docs": 1000, "label": "text", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1638, "mean_time": 1213142, "std_time": 8774, "metadata": {"num_docs": 1000, "label": "buffer", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 142, "mean_time": 13528808, "std_time": 135447, "metadata": {"num_docs": 1000, "label": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 1131, "mean_time": 1617980, "std_time": 20063, "metadata": {"num_docs": 100, "field": "blob", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 175, "mean_time": 9298763, "std_time": 77533, "metadata": {"num_docs": 100, "field": "blob", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 2353, "mean_time": 715476, "std_time": 8404, "metadata": {"num_docs": 100, "field": "text", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", 
"iterations": 282, "mean_time": 5872680, "std_time": 34821, "metadata": {"num_docs": 100, "field": "text", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 2287, "mean_time": 735093, "std_time": 10244, "metadata": {"num_docs": 100, "field": "buffer", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 246, "mean_time": 6370017, "std_time": 44060, "metadata": {"num_docs": 100, "field": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 1121, "mean_time": 1634398, "std_time": 18609, "metadata": {"num_docs": 100, "field": "embedding", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 175, "mean_time": 9297756, "std_time": 62184, "metadata": {"num_docs": 100, "field": "embedding", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 11, "mean_time": 177432561, "std_time": 14403633, "metadata": {"num_docs": 10000, "field": "blob", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 965614153, "std_time": 27364696, "metadata": {"num_docs": 10000, "field": "blob", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 22, "mean_time": 80287663, "std_time": 13992505, "metadata": {"num_docs": 10000, "field": "text", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 601798762, "std_time": 18118377, "metadata": {"num_docs": 10000, "field": "text", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 21, "mean_time": 84370381, "std_time": 11803020, "metadata": {"num_docs": 10000, "field": "buffer", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 649465702, "std_time": 18064889, "metadata": {"num_docs": 10000, "field": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 11, "mean_time": 176325898, "std_time": 14181744, "metadata": {"num_docs": 10000, "field": "embedding", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 952172404, "std_time": 19682355, "metadata": {"num_docs": 10000, "field": "embedding", "memmap": true}}, {"name": "test_da_insert", "page": "document_array_insert", "iterations": 454, "mean_time": 168415, "std_time": 4970, "metadata": {"num_docs": 100}}, {"name": "test_da_insert", "page": "document_array_insert", "iterations": 10, "mean_time": 34379713, "std_time": 165342, "metadata": {"num_docs": 10000}}, {"name": "test_match", "page": "document_array_match", "iterations": 5, "mean_time": 2172444973, "std_time": 29236483, "metadata": {"size_X": 10, "size_Y": 100000, "dam_x": false, "dam_y": false, "emb_size": 256, "use_scipy": false, "metric": "euclidean", "top_k": 3}}, {"name": "test_da_save", "page": "document_array_persistence", "iterations": 5, "mean_time": 2532785116, "std_time": 24374942, "metadata": {"num_docs_append": 100000, "file_format": "json"}}, {"name": "test_da_save", "page": "document_array_persistence", "iterations": 5, "mean_time": 462705214, "std_time": 1265027, "metadata": {"num_docs_append": 100000, 
"file_format": "binary"}}, {"name": "test_da_load", "page": "document_array_persistence", "iterations": 5, "mean_time": 8371470563, "std_time": 27786218, "metadata": {"num_docs_append": 100000, "file_format": "json"}}, {"name": "test_da_load", "page": "document_array_persistence", "iterations": 5, "mean_time": 176426036, "std_time": 1138379, "metadata": {"num_docs_append": 100000, "file_format": "binary"}}, {"name": "test_da_reverse", "page": "document_array_insert", "iterations": 433, "mean_time": 370287, "std_time": 7839, "metadata": {"num_docs": 100}}, {"name": "test_da_reverse", "page": "document_array_insert", "iterations": 10, "mean_time": 37988003, "std_time": 153439, "metadata": {"num_docs": 10000}}, {"name": "test_da_save", "page": "document_array_clear", "iterations": 283, "mean_time": 2608163, "std_time": 33340, "metadata": {"num_docs": 100}}, {"name": "test_da_save", "page": "document_array_clear", "iterations": 10, "mean_time": 249012378, "std_time": 1041847, "metadata": {"num_docs": 10000}}, {"name": "test_dam_save", "page": "document_array_clear", "iterations": 362, "mean_time": 94501, "std_time": 4883, "metadata": {"num_docs": 100}}, {"name": "test_dam_save", "page": "document_array_clear", "iterations": 10, "mean_time": 984830, "std_time": 40911, "metadata": {"num_docs": 10000}}, {"name": "test_da_save_binary", "page": "document_array_insert", "iterations": 419, "mean_time": 453235, "std_time": 9762, "metadata": {"num_docs": 100}}, {"name": "test_da_save_binary", "page": "document_array_insert", "iterations": 10, "mean_time": 41394899, "std_time": 359549, "metadata": {"num_docs": 10000}}, {"name": "test_da_load_binary", "page": "document_array_insert", "iterations": 402, "mean_time": 195699, "std_time": 6539, "metadata": {"num_docs": 100}}, {"name": "test_da_load_binary", "page": "document_array_insert", "iterations": 10, "mean_time": 15253357, "std_time": 194289, "metadata": {"num_docs": 10000}}, {"name": "test_da_save_json", "page": "document_array_insert", "iterations": 281, "mean_time": 2679379, "std_time": 1333361, "metadata": {"num_docs": 100}}, {"name": "test_da_save_json", "page": "document_array_insert", "iterations": 10, "mean_time": 249181491, "std_time": 671351, "metadata": {"num_docs": 10000}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 128, "mean_time": 8424379, "std_time": 71019, "metadata": {"num_docs": 100}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 13, "mean_time": 82027979, "std_time": 243711, "metadata": {"num_docs": 1000}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 10, "mean_time": 818522000, "std_time": 8782199, "metadata": {"num_docs": 10000}}, {"name": "test_da_shuffle", "page": "document_array_shuffle", "iterations": 26, "mean_time": 34943751, "std_time": 4587628, "metadata": {"n_nodes": false, "n_docs": 1000}}, {"name": "test_dam_shuffle", "page": "document_array_shuffle", "iterations": 17, "mean_time": 70945594, "std_time": 150713, "metadata": {"n_nodes": true, "n_docs": 1000}}, {"name": "test_da_shuffle", "page": "document_array_shuffle", "iterations": 5, "mean_time": 346844400, "std_time": 2112699, "metadata": {"n_nodes": false, "n_docs": 10000}}, {"name": "test_dam_shuffle", "page": "document_array_shuffle", "iterations": 5, "mean_time": 733615093, "std_time": 16847624, "metadata": {"n_nodes": true, "n_docs": 10000}}, {"name": "test_da_sort", "page": "document_array_sort", "iterations": 20664, "mean_time": 75164, "std_time": 1906, 
"metadata": {"num_docs": 100}}, {"name": "test_da_sort", "page": "document_array_sort", "iterations": 25, "mean_time": 105843792, "std_time": 726113, "metadata": {"num_docs": 100000}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 81, "mean_time": 2997426, "std_time": 40195, "metadata": {"num_docs": 10, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 290059338, "std_time": 15122446, "metadata": {"num_docs": 100, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 3122751565, "std_time": 78540318, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 15347609, "std_time": 3680900, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1540620978, "std_time": 54161616, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 100, "traversal_paths": ["c"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1507878414, "std_time": 29284556, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 10, "traversal_paths": ["m"], "memmap": false}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 68, "mean_time": 6504295, "std_time": 29828, "metadata": {"num_docs": 10, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 421479163, "std_time": 722261, "metadata": {"num_docs": 100, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 4260193925, "std_time": 11129114, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 158296238, "std_time": 1791414, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1290982118, "std_time": 32703763, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 100, "traversal_paths": ["c"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1236756984, "std_time": 3084354, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 10, "traversal_paths": ["m"], "memmap": true}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 42374, "mean_time": 43724, "std_time": 3600, "metadata": {"text_length": 10}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 42268, "mean_time": 43846, "std_time": 3676, "metadata": {"text_length": 100}}, {"name": "test_construct_text", "page": 
"document_construct", "iterations": 40117, "mean_time": 46332, "std_time": 3744, "metadata": {"text_length": 1000}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 30383, "mean_time": 62289, "std_time": 3938, "metadata": {"text_length": 10000}}, {"name": "test_construct_blob", "page": "document_construct", "iterations": 29621, "mean_time": 63723, "std_time": 4448, "metadata": {"num_dims": 1}}, {"name": "test_construct_blob", "page": "document_construct", "iterations": 4095, "mean_time": 482268, "std_time": 9099, "metadata": {"num_dims": 2}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 44099, "mean_time": 41955, "std_time": 3585, "metadata": {"buffer_length": 10}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 42204, "mean_time": 43984, "std_time": 3672, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 9544, "mean_time": 205931, "std_time": 5771, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 32269, "mean_time": 58352, "std_time": 2364, "metadata": {"text_length": 10}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 32033, "mean_time": 58774, "std_time": 2674, "metadata": {"text_length": 100}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 30998, "mean_time": 60884, "std_time": 2343, "metadata": {"text_length": 1000}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 24223, "mean_time": 78870, "std_time": 2523, "metadata": {"text_length": 10000}}, {"name": "test_construct_btyes_origin_blob", "page": "document_construct", "iterations": 29278, "mean_time": 64568, "std_time": 2297, "metadata": {"num_dims": 1}}, {"name": "test_construct_btyes_origin_blob", "page": "document_construct", "iterations": 4572, "mean_time": 431850, "std_time": 6777, "metadata": {"num_dims": 2}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 32567, "mean_time": 57812, "std_time": 2113, "metadata": {"buffer_length": 10}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 31459, "mean_time": 59957, "std_time": 2253, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 8801, "mean_time": 223217, "std_time": 4620, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 16096, "mean_time": 119673, "std_time": 7330, "metadata": {"text_length": 10}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 15497, "mean_time": 124415, "std_time": 7154, "metadata": {"text_length": 100}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 12035, "mean_time": 161395, "std_time": 7675, "metadata": {"text_length": 1000}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 3936, "mean_time": 503038, "std_time": 9843, "metadata": {"text_length": 10000}}, {"name": "test_construct_str_json_origin_blob", "page": "document_construct", "iterations": 11874, "mean_time": 163144, "std_time": 8050, "metadata": {"num_dims": 1}}, {"name": "test_construct_str_json_origin_blob", "page": "document_construct", "iterations": 869, 
"mean_time": 2293109, "std_time": 19065, "metadata": {"num_dims": 2}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 16369, "mean_time": 117579, "std_time": 7311, "metadata": {"buffer_length": 10}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 14688, "mean_time": 131344, "std_time": 7391, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 1886, "mean_time": 1052675, "std_time": 13785, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 17670, "mean_time": 108815, "std_time": 7229, "metadata": {"text_length": 10}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 17078, "mean_time": 112703, "std_time": 7089, "metadata": {"text_length": 100}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 13112, "mean_time": 147994, "std_time": 7234, "metadata": {"text_length": 1000}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 4151, "mean_time": 477028, "std_time": 10060, "metadata": {"text_length": 10000}}, {"name": "test_construct_dict_origin_blob", "page": "document_construct", "iterations": 13169, "mean_time": 146925, "std_time": 7940, "metadata": {"num_dims": 1}}, {"name": "test_construct_dict_origin_blob", "page": "document_construct", "iterations": 1459, "mean_time": 1362811, "std_time": 15940, "metadata": {"num_dims": 2}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 17906, "mean_time": 107385, "std_time": 6940, "metadata": {"buffer_length": 10}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 16348, "mean_time": 117840, "std_time": 7103, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 2307, "mean_time": 859580, "std_time": 13879, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 34807, "mean_time": 53876, "std_time": 1945, "metadata": {"text_length": 10, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 39117, "mean_time": 47630, "std_time": 1723, "metadata": {"text_length": 10, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 34636, "mean_time": 54186, "std_time": 1908, "metadata": {"text_length": 100, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 38984, "mean_time": 47823, "std_time": 1852, "metadata": {"text_length": 100, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 33323, "mean_time": 56463, "std_time": 2106, "metadata": {"text_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 37478, "mean_time": 49873, "std_time": 1777, "metadata": {"text_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 26623, "mean_time": 71545, "std_time": 2205, "metadata": {"text_length": 10000, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 29069, "mean_time": 
65284, "std_time": 2220, "metadata": {"text_length": 10000, "copy": false}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 30779, "mean_time": 61366, "std_time": 2060, "metadata": {"num_dims": 1, "copy": true}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 35525, "mean_time": 52772, "std_time": 1993, "metadata": {"num_dims": 1, "copy": false}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 4348, "mean_time": 455206, "std_time": 10579, "metadata": {"num_dims": 2, "copy": true}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 3840, "mean_time": 515814, "std_time": 6556, "metadata": {"num_dims": 2, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 34993, "mean_time": 53621, "std_time": 1983, "metadata": {"buffer_length": 10, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 39380, "mean_time": 47328, "std_time": 1792, "metadata": {"buffer_length": 10, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 33838, "mean_time": 55580, "std_time": 1895, "metadata": {"buffer_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 37612, "mean_time": 49709, "std_time": 1829, "metadata": {"buffer_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 9251, "mean_time": 212441, "std_time": 3967, "metadata": {"buffer_length": 100000, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 9534, "mean_time": 206148, "std_time": 3760, "metadata": {"buffer_length": 100000, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 35327, "mean_time": 53068, "std_time": 1800, "metadata": {"text_length": 10, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 39744, "mean_time": 46794, "std_time": 1689, "metadata": {"text_length": 10, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 35344, "mean_time": 53039, "std_time": 1905, "metadata": {"text_length": 100, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 39792, "mean_time": 46851, "std_time": 1799, "metadata": {"text_length": 100, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 34108, "mean_time": 55133, "std_time": 1851, "metadata": {"text_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 38505, "mean_time": 48443, "std_time": 1758, "metadata": {"text_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 26900, "mean_time": 70817, "std_time": 2269, "metadata": {"text_length": 10000, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 29545, "mean_time": 64234, "std_time": 2155, "metadata": {"text_length": 10000, "copy": false}}, {"name": "test_construct_document_origin_blob_proto", 
"page": "document_construct", "iterations": 31489, "mean_time": 59972, "std_time": 1967, "metadata": {"num_dims": 1, "copy": true}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 35529, "mean_time": 52720, "std_time": 1824, "metadata": {"num_dims": 1, "copy": false}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 6729, "mean_time": 293396, "std_time": 4630, "metadata": {"num_dims": 2, "copy": true}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 6744, "mean_time": 292839, "std_time": 4633, "metadata": {"num_dims": 2, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 35779, "mean_time": 52402, "std_time": 1789, "metadata": {"buffer_length": 10, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 40479, "mean_time": 46001, "std_time": 1741, "metadata": {"buffer_length": 10, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 34578, "mean_time": 54367, "std_time": 2073, "metadata": {"buffer_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 38547, "mean_time": 48371, "std_time": 2248, "metadata": {"buffer_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 9301, "mean_time": 211422, "std_time": 4100, "metadata": {"buffer_length": 100000, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 9618, "mean_time": 204463, "std_time": 3658, "metadata": {"buffer_length": 100000, "copy": false}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 716198, "mean_time": 1099, "std_time": 189, "metadata": {"text_length": 10}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 707005, "mean_time": 1145, "std_time": 400, "metadata": {"text_length": 100}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 680565, "mean_time": 1249, "std_time": 210, "metadata": {"text_length": 1000}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 521967, "mean_time": 2038, "std_time": 257, "metadata": {"text_length": 10000}}, {"name": "test_get_attribute_blob", "page": "document_get_attributes", "iterations": 220350, "mean_time": 7114, "std_time": 547, "metadata": {"num_dims": 1}}, {"name": "test_get_attribute_blob", "page": "document_get_attributes", "iterations": 64347, "mean_time": 29120, "std_time": 2254, "metadata": {"num_dims": 2}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 734820, "mean_time": 1072, "std_time": 165, "metadata": {"buffer_length": 10}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 690877, "mean_time": 1206, "std_time": 187, "metadata": {"buffer_length": 1000}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 295309, "mean_time": 5019, "std_time": 925, "metadata": {"buffer_length": 100000}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 862175, "mean_time": 652, "std_time": 132, "metadata": {"text_length": 10}}, {"name": 
"test_get_content_text", "page": "document_property_getter", "iterations": 846074, "mean_time": 681, "std_time": 125, "metadata": {"text_length": 100}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 787490, "mean_time": 831, "std_time": 141, "metadata": {"text_length": 1000}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 594867, "mean_time": 1572, "std_time": 205, "metadata": {"text_length": 10000}}, {"name": "test_get_content_blob", "page": "document_property_getter", "iterations": 228749, "mean_time": 6713, "std_time": 462, "metadata": {"num_dims": 1}}, {"name": "test_get_content_blob", "page": "document_property_getter", "iterations": 77631, "mean_time": 23748, "std_time": 2999, "metadata": {"num_dims": 2}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 863141, "mean_time": 640, "std_time": 136, "metadata": {"buffer_length": 10}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 807124, "mean_time": 754, "std_time": 143, "metadata": {"buffer_length": 1000}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 303903, "mean_time": 4802, "std_time": 855, "metadata": {"buffer_length": 100000}}, {"name": "test_get_embedding", "page": "document_property_getter", "iterations": 224089, "mean_time": 6908, "std_time": 466, "metadata": {"buffer_length": 1}}, {"name": "test_get_embedding", "page": "document_property_getter", "iterations": 142062, "mean_time": 12093, "std_time": 1060, "metadata": {"buffer_length": 2}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 220972, "mean_time": 7172, "std_time": 535, "metadata": {"text_length": 10}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 40087, "mean_time": 47985, "std_time": 1413, "metadata": {"text_length": 100}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 4345, "mean_time": 458333, "std_time": 4506, "metadata": {"text_length": 1000}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 440, "mean_time": 4548337, "std_time": 18055, "metadata": {"text_length": 10000}}, {"name": "test_set_attribute_blob", "page": "document_set_attributes", "iterations": 138417, "mean_time": 12280, "std_time": 856, "metadata": {"num_dims": 1}}, {"name": "test_set_attribute_blob", "page": "document_set_attributes", "iterations": 11222, "mean_time": 175538, "std_time": 63380, "metadata": {"num_dims": 2}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 416221, "mean_time": 3026, "std_time": 348, "metadata": {"buffer_length": 10}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 23379, "mean_time": 83689, "std_time": 1746, "metadata": {"buffer_length": 1000}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 246, "mean_time": 8143859, "std_time": 47535, "metadata": {"buffer_length": 100000}}, {"name": "test_executor_load_config", "page": "executor", "iterations": 860, "mean_time": 2318388, "std_time": 1396410, "metadata": {}}, {"name": "test_local_flow_start", "page": "flow", "iterations": 5, "mean_time": 724767375, "std_time": 8320444, "metadata": {"flow": "wide", "num_pods": 10}}, {"name": "test_local_flow_close", "page": "flow", "iterations": 5, "mean_time": 256889969, "std_time": 34431738, "metadata": {"flow": "long", 
"num_pods": 10}}, {"name": "test_local_flow_close", "page": "flow", "iterations": 5, "mean_time": 305463673, "std_time": 1149236, "metadata": {"flow": "wide", "num_pods": 10}}, {"name": "test_flow_load_config", "page": "flow", "iterations": 70, "mean_time": 28705205, "std_time": 1096895, "metadata": {"flow": "long", "num_pods": 10}}, {"name": "test_flow_load_config", "page": "flow", "iterations": 54, "mean_time": 37559951, "std_time": 4789177, "metadata": {"flow": "wide", "num_pods": 10}}] --------------------------------------------------------------------------------