├── src ├── __init__.py ├── utils │ ├── __init__.py │ ├── test.png │ ├── timecontext.py │ ├── profiler.py │ └── benchmark.py ├── document_attributes.py ├── document_non_empty_fields.py ├── document_uri.py ├── document_chunks.py ├── document_matches.py ├── document_pop.py ├── document_content_hash.py ├── document_content_type.py ├── document_array_reverse.py ├── document_plot.py ├── document_copy_from.py ├── document_merge_from.py ├── executor.py ├── document_array_insert.py ├── document_get_sparse_blob.py ├── document_get_sparse_embedding.py ├── document_parent_id.py ├── pages.py ├── document_array_sort.py ├── document_graph_adjacency.py ├── document_dict.py ├── document_json.py ├── document_id.py ├── document_array_shuffle.py ├── document_array_clear.py ├── document_content.py ├── document_embedding.py ├── document_weight.py ├── document_granularity.py ├── document_mime_type.py ├── document_array_append.py ├── document_clear.py ├── document_scores.py ├── document_tags.py ├── document_evaluations.py ├── document_update.py ├── document_modality.py ├── document_array_persistence.py ├── document_array_save_json_load_json.py ├── document_array_save_binary_load_binary.py ├── document_array_embeddings.py ├── document_array_save.py ├── zed_runtime_callback.py ├── document_set_attributes.py ├── document_get_attributes.py ├── document_array_extend.py ├── document_array_traverse.py ├── document_property_getter.py ├── document_array_match.py ├── flow.py ├── document_array_construct.py ├── document_array_get_attributes.py ├── document_graph_construction.py ├── searchers_compare.py ├── document_conversions_blob_image_uri_text.py └── document_construct.py ├── .github ├── CODEOWNERS └── workflows │ ├── create-pr.yml │ ├── gh-page.yml │ └── pr.yml ├── docs ├── static │ ├── CNAME │ └── artifacts │ │ ├── 2.1.2 │ │ └── searchers_compare.json │ │ └── 2.0.12 │ │ └── report.json ├── archetypes │ └── default.md └── config.yml ├── pyproject.toml ├── .gitmodules ├── requirements.txt ├── Dockerfile ├── .pre-commit-config.yaml ├── README.md ├── conftest.py ├── .gitignore ├── LICENSE └── scripts └── site_generator.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @maateen 2 | -------------------------------------------------------------------------------- /docs/static/CNAME: -------------------------------------------------------------------------------- 1 | benchmark.jina.ai -------------------------------------------------------------------------------- /src/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/benchmark/HEAD/src/utils/test.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | testpaths = ["src"] 3 | python_files = "*.py" 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule 
"docs/themes/book"] 2 | path = docs/themes/book 3 | url = https://github.com/alex-shpak/hugo-book 4 | -------------------------------------------------------------------------------- /docs/archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | draft: true 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pympler==0.9 2 | faker==8.11.0 3 | packaging==21.0 4 | pytest==6.2.4 5 | pytest-json-report==1.4.0 6 | scipy==1.7.1 7 | Pillow==8.3.2 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG JINA_VER 2 | 3 | FROM jinaai/jina:$JINA_VER 4 | 5 | WORKDIR /app 6 | 7 | ADD requirements.txt . 8 | 9 | # install dependencies 10 | RUN apt-get update && \ 11 | apt-get install -y gcc && \ 12 | pip3 install -r requirements.txt 13 | 14 | # run benchmark 15 | ENTRYPOINT ["pytest"] 16 | -------------------------------------------------------------------------------- /src/utils/timecontext.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimeContext: 5 | """Timing a code snippet with a context manager.""" 6 | 7 | def __enter__(self): 8 | self.start = time.time_ns() 9 | return self 10 | 11 | def __exit__(self, typ, value, traceback): 12 | self.duration = self.time_since_start() 13 | 14 | def time_since_start(self): 15 | return time.time_ns() - self.start 16 | -------------------------------------------------------------------------------- /.github/workflows/create-pr.yml: -------------------------------------------------------------------------------- 1 | name: Create PR 2 | 3 | on: 4 | push: 5 | branches: 6 | - "benchmark-*" 7 | 8 | jobs: 9 | create-pr: 10 | runs-on: ubuntu-latest 11 | if: ${{ github.actor == 'jina-bot' }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: create PR 16 | id: open-pr 17 | uses: repo-sync/pull-request@v2 18 | with: 19 | pr_label: automerge 20 | destination_branch: "main" 21 | pr_body: "This is an automated PR." 
22 | github_token: ${{ secrets.JINA_DEV_BOT }} 23 | -------------------------------------------------------------------------------- /src/document_attributes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 9 | def test_document_attributes(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 12 | 13 | def _attributes(docs): 14 | for d in docs: 15 | aux = d.attributes() 16 | 17 | result = benchmark_time(setup=_input_docs, func=_attributes) 18 | 19 | json_writer.append( 20 | page=Pages.DOCUMENT_HELPER, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | -------------------------------------------------------------------------------- /src/document_non_empty_fields.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 9 | def test_document_non_empty_fields(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 12 | 13 | def _non_empty_fields(docs): 14 | for d in docs: 15 | aux = d.dict() 16 | 17 | result = benchmark_time(setup=_input_docs, func=_non_empty_fields) 18 | 19 | json_writer.append( 20 | page=Pages.DOCUMENT_HELPER, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | -------------------------------------------------------------------------------- /src/document_uri.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 10_000]) 9 | def test_document_uri(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 14 | ) 15 | 16 | def _doc_uri(docs): 17 | for doc in docs: 18 | _ = doc.uri 19 | 20 | result = benchmark_time(setup=_input_docs, func=_doc_uri) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_CONTENT, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | -------------------------------------------------------------------------------- /src/document_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_chunks(num_docs, json_writer): 10 | def _input_docs(): 11 | doc = Document() 12 | doc.chunks = [Document(text=f"d{i}") for i in range(num_docs)] 13 | return ((), {"doc": doc}) 14 | 15 | def _get_chunks(doc): 16 | return doc.chunks 17 | 18 | result = benchmark_time(setup=_input_docs, func=_get_chunks) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_RECURSIVE, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | -------------------------------------------------------------------------------- /src/document_matches.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_matches(num_docs, json_writer): 10 | def _input_docs(): 11 | doc = Document(text="d1") 12 | doc.matches = [Document(text=f"d{i}") for i in range(num_docs)] 13 | return ((), {"doc": doc}) 14 | 15 | def _get_matches(doc): 16 | return doc.matches 17 | 18 | result = benchmark_time(setup=_input_docs, func=_get_matches) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_RECURSIVE, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | -------------------------------------------------------------------------------- /src/document_pop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 9 | def test_document_document_pop(num_docs, json_writer): 10 | def _input_docs(): 11 | return (), dict( 12 | docs=DocumentArray([Document(text='hey here') for _ in range(num_docs)]) 13 | ) 14 | 15 | def _pop_text(docs): 16 | for d in docs: 17 | d.pop('text') 18 | 19 | result = benchmark_time(setup=_input_docs, func=_pop_text) 20 | 21 | json_writer.append( 22 | page=Pages.DOCUMENT_HELPER, 23 | result=result, 24 | metadata=dict(num_docs=num_docs), 25 | ) 26 | -------------------------------------------------------------------------------- /src/document_content_hash.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 9 | def test_document_document_content_hash(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {'docs': [Document(text=f'text doc {i}') for i in range(num_docs)]}, 14 | ) 15 | 16 | def _content_hash(docs): 17 | for d in docs: 18 | d.content_hash 19 | 20 | result = benchmark_time(setup=_input_docs, func=_content_hash) 21 | json_writer.append( 22 | page=Pages.DOCUMENT_META, 23 | result=result, 24 | metadata=dict(num_docs=num_docs), 25 | ) 26 | -------------------------------------------------------------------------------- /src/document_content_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_content_type(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 14 | ) 15 | 16 | def _doc_content_type(docs): 17 | for doc in docs: 18 | _ = doc.content_type 19 | 20 | result = benchmark_time(setup=_input_docs, func=_doc_content_type) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_META, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | -------------------------------------------------------------------------------- /src/document_array_reverse.py: -------------------------------------------------------------------------------- 1 
| import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_reverse(num_docs, json_writer): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_reverse(da): 18 | da.reverse() 19 | 20 | result = benchmark_time( 21 | setup=_setup, 22 | func=_da_reverse, 23 | n=NUM_REPETITIONS, 24 | ) 25 | 26 | json_writer.append( 27 | page=Pages.DA_INSERT, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | -------------------------------------------------------------------------------- /src/document_plot.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | from jina.helper import random_identity 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | random_identity(use_uuid1=True) 9 | 10 | 11 | @pytest.mark.parametrize("num_docs", [1, 5]) 12 | def test_document_plot(num_docs, json_writer, ephemeral_tmpdir): 13 | def _input_docs(): 14 | return ( 15 | (), 16 | dict(docs=[Document(text="doc") for _ in range(num_docs)]), 17 | ) 18 | 19 | def _plot(docs): 20 | for d in docs: 21 | d.plot() 22 | 23 | result = benchmark_time(setup=_input_docs, func=_plot) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/document_copy_from.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [1, 100, 10_000]) 9 | def test_document_copy_from(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [Document(text=f"{i}") for i in range(num_docs)], 15 | "doc": Document(text="newdoc"), 16 | }, 17 | ) 18 | 19 | def _copy_from(docs, doc): 20 | for d in docs: 21 | d.CopyFrom(doc) 22 | 23 | result = benchmark_time(setup=_input_docs, func=_copy_from) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/document_merge_from.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [1, 100, 10_000]) 9 | def test_document_merge_from(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [Document(text=f"{i}") for i in range(num_docs)], 15 | "doc": Document(text="newdoc"), 16 | }, 17 | ) 18 | 19 | def _merge_from(docs, doc): 20 | for d in docs: 21 | d.MergeFrom(doc) 22 | 23 | result = benchmark_time(setup=_input_docs, func=_merge_from) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_HELPER, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | -------------------------------------------------------------------------------- /src/executor.py: 
-------------------------------------------------------------------------------- 1 | from jina import Executor, requests 2 | 3 | from .pages import Pages 4 | from .utils.benchmark import benchmark_time 5 | 6 | NUM_REPETITIONS = 100 7 | 8 | 9 | class DummyLoadExecutor(Executor): 10 | def __init__(self, a, b, c, d, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | 13 | @requests 14 | def foo(self, **kwargs): 15 | pass 16 | 17 | 18 | executor_yaml = ''' 19 | jtype: DummyLoadExecutor 20 | with: 21 | a: 0 22 | b: 1 23 | c: 2 24 | d: 3 25 | metas: 26 | name: dummy-executor 27 | ''' 28 | 29 | 30 | def test_executor_load_config(json_writer): 31 | def _build(): 32 | _ = Executor.load_config(executor_yaml) 33 | 34 | result = benchmark_time(func=_build) 35 | 36 | json_writer.append( 37 | page=Pages.EXECUTOR, 38 | result=result, 39 | metadata={}, 40 | ) 41 | -------------------------------------------------------------------------------- /src/document_array_insert.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_insert(num_docs, json_writer): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray() 15 | return (), dict(da=da, docs=docs) 16 | 17 | def _insert_in_da(da, docs): 18 | for doc in docs: 19 | da.insert(index=0, doc=doc) 20 | 21 | result = benchmark_time( 22 | setup=_setup, 23 | func=_insert_in_da, 24 | n=NUM_REPETITIONS, 25 | ) 26 | 27 | json_writer.append( 28 | page=Pages.DA_INSERT, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | -------------------------------------------------------------------------------- /src/document_get_sparse_blob.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scipy.sparse as sp 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_get_sparse_blob_scipy(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(blob=sp.csr_matrix([0, 0, 4, 0, 1])) 17 | for _ in range(num_docs) 18 | ] 19 | }, 20 | ) 21 | 22 | def _get_sparse_blob(docs): 23 | for d in docs: 24 | d.blob 25 | 26 | result = benchmark_time(setup=_input_docs, func=_get_sparse_blob) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_CONTENT, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | -------------------------------------------------------------------------------- /src/document_get_sparse_embedding.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scipy.sparse as sp 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_get_sparse_embedding_scipy(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(blob=sp.csr_matrix([0, 0, 4, 0, 1])) 17 | for i in range(num_docs) 18 | ] 19 | }, 20 | ) 21 | 22 | def _get_sparse_blob(docs): 23 | for d in docs: 24 | d.embedding 25 | 26 
| result = benchmark_time(setup=_input_docs, func=_get_sparse_blob) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_CONTENT, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | -------------------------------------------------------------------------------- /src/document_parent_id.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_parent_id(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "chunks": [ 15 | Document( 16 | chunks=[Document(text="d1 original text", id=str(i))], id="123" 17 | ).chunks[0] 18 | for i in range(num_docs) 19 | ] 20 | }, 21 | ) 22 | 23 | def _parent_id(chunks): 24 | for c in chunks: 25 | c.parent_id 26 | 27 | result = benchmark_time(setup=_input_docs, func=_parent_id) 28 | 29 | json_writer.append( 30 | page=Pages.DOCUMENT_META, 31 | result=result, 32 | metadata=dict(num_docs=num_docs), 33 | ) 34 | -------------------------------------------------------------------------------- /src/pages.py: -------------------------------------------------------------------------------- 1 | class Pages: 2 | DA_APPEND = 'document_array_append' 3 | DA_CLEAR = 'document_array_clear' 4 | DA_CONSTRUCT = 'document_array_construct' 5 | DA_EXTEND = 'document_array_extend' 6 | DA_GET_ATTRIBUTES = 'document_array_get_attributes' 7 | DA_INSERT = 'document_array_insert' 8 | DA_MATCH = 'document_array_match' 9 | DA_PERSISTENCE = 'document_array_persistence' 10 | DA_SHUFFLE = 'document_array_shuffle' 11 | DA_SORT = 'document_array_sort' 12 | DA_TRAVERSE = 'document_array_traverse' 13 | DOCUMENT_CONSTRUCT = 'document_construct' 14 | DOCUMENT_CONTENT = 'document_content_attributes' 15 | DOCUMENT_CONVERSION = 'document_conversion' 16 | DOCUMENT_GRAPH = 'document_graph' 17 | DOCUMENT_HELPER = 'document_helper_functions' 18 | DOCUMENT_META = 'document_meta_attributes' 19 | DOCUMENT_RECURSIVE = 'document_recursive_attributes' 20 | DOCUMENT_RELEVANCE = 'document_relevance_attributes' 21 | EXECUTOR = 'executor' 22 | FLOW = 'flow' 23 | INDEXER_COMPARISON = 'indexer_comparison' 24 | -------------------------------------------------------------------------------- /src/document_array_sort.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | from jina import Document, DocumentArray 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | NUM_REPETITIONS = 25 11 | NUM_DOCS = 1000 12 | CHARS = tuple(string.ascii_uppercase + string.digits) 13 | 14 | 15 | def _get_docs(num_docs): 16 | return [Document(scores={'cosine': random.random()}) for _ in range(num_docs)] 17 | 18 | 19 | @pytest.mark.parametrize('num_docs', [100, 100_000]) 20 | def test_da_sort(num_docs, json_writer): 21 | def _sort(da): 22 | da.sort(key=lambda x: x.scores['cosine'].value) 23 | 24 | def _build_da(**kwargs): 25 | docs = kwargs.get('docs') 26 | da = DocumentArray(docs) 27 | return (), dict(da=da) 28 | 29 | result = benchmark_time( 30 | setup=_build_da, 31 | func=_sort, 32 | n=NUM_REPETITIONS, 33 | kwargs=dict(docs=_get_docs(num_docs)), 34 | ) 35 | 36 | json_writer.append( 37 | page=Pages.DA_SORT, 38 | result=result, 39 | metadata=dict(num_docs=num_docs), 40 | ) 41 | 
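All of the benchmarks in this repository follow the same calling convention: a `_setup` callable returns an `(args, kwargs)` pair for the timed function, `benchmark_time(setup=..., func=..., teardown=..., n=...)` runs and times it, and the returned result is handed to the `json_writer` fixture together with a page constant from `src/pages.py`. The timing helper itself lives in `src/utils/benchmark.py` and is not reproduced in this snapshot; the sketch below only illustrates that calling convention on top of `TimeContext`, and every name, field and default in it is an assumption rather than the actual implementation.

```python
# Illustrative sketch only -- src/utils/benchmark.py is not shown in this snapshot,
# so the names, fields and defaults below are assumptions, not the real helper.
import statistics

from .timecontext import TimeContext  # assumes the sketch sits next to timecontext.py


def benchmark_time(func, setup=None, teardown=None, n=1, kwargs=None):
    """Time `func` for `n` repetitions, rebuilding its inputs via `setup` each time."""
    durations_ns = []
    for _ in range(n):
        if setup is not None:
            # the tests define setup() so that it returns the (args, kwargs) for func
            args, func_kwargs = setup(**(kwargs or {}))
        else:
            args, func_kwargs = (), dict(kwargs or {})
        with TimeContext() as timer:
            func(*args, **func_kwargs)
        durations_ns.append(timer.duration)
        if teardown is not None:
            teardown()  # e.g. remove files written during this repetition
    # the real helper also supports memory profiling (see `profile_cls` and
    # `result.profiles` in src/document_embedding.py); that part is omitted here
    return {
        'iterations': n,
        'mean_ns': statistics.mean(durations_ns),
        'std_ns': statistics.pstdev(durations_ns),
    }
```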
-------------------------------------------------------------------------------- /.github/workflows/gh-page.yml: -------------------------------------------------------------------------------- 1 | name: Github Page 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | inputs: 9 | reason: 10 | description: "Why?" 11 | required: true 12 | default: "Just casually!" 13 | 14 | jobs: 15 | build-deploy: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | with: 20 | submodules: true 21 | fetch-depth: 0 22 | 23 | - name: Set up Python 3.9 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: 3.9 27 | 28 | - name: Setup Hugo 29 | uses: peaceiris/actions-hugo@v2 30 | with: 31 | hugo-version: "0.82.0" 32 | extended: true 33 | 34 | - name: Generate site 35 | run: python scripts/site_generator.py 36 | 37 | - name: Build Site 38 | run: | 39 | cd docs 40 | hugo --minify 41 | 42 | - name: Deploy Site 43 | uses: peaceiris/actions-gh-pages@v3 44 | with: 45 | personal_token: ${{ secrets.JINA_DEV_BOT }} 46 | publish_branch: gh-pages 47 | publish_dir: ./docs/public 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: check-ast 8 | - id: check-added-large-files 9 | - id: check-yaml 10 | - id: end-of-file-fixer 11 | - id: trailing-whitespace 12 | 13 | - repo: https://github.com/psf/black 14 | rev: 20.8b1 15 | hooks: 16 | - id: black 17 | name: Black 18 | types: [python] 19 | files: ^(src/|scripts/) 20 | args: 21 | - -S 22 | 23 | - repo: https://github.com/PyCQA/isort 24 | rev: 5.9.3 25 | hooks: 26 | - id: isort 27 | name: Sort Python Imports 28 | 29 | - repo: https://github.com/sirosen/check-jsonschema 30 | rev: 0.4.1 31 | hooks: 32 | - id: check-github-workflows 33 | files: ^(.github/workflows/) 34 | 35 | - repo: local 36 | hooks: 37 | - id: site_generator 38 | name: Site Generator 39 | entry: python scripts/site_generator.py 40 | language: python 41 | types: [python] 42 | additional_dependencies: ['packaging==21.0', 'requests==2.26.0'] 43 | -------------------------------------------------------------------------------- /src/document_graph_adjacency.py: -------------------------------------------------------------------------------- 1 | # from jina import Document 2 | # from jina.types.document.graph import GraphDocument 3 | # 4 | # from .pages import Pages 5 | # from .utils.benchmark import benchmark_time 6 | # 7 | # 8 | # def test_empty_document_graph_adjacency(json_writer): 9 | # def _input_graphdoc(): 10 | # return ((), {"gdoc": GraphDocument()}) 11 | # 12 | # def _doc_get_adjacency(gdoc): 13 | # _ = gdoc.adjacency 14 | # 15 | # result = benchmark_time(setup=_input_graphdoc, func=_doc_get_adjacency) 16 | # 17 | # json_writer.append( 18 | # page=Pages.DOCUMENT_META, 19 | # result=result, 20 | # ) 21 | # 22 | # 23 | # def test_document_graph_adjacency(json_writer): 24 | # def _input_graphdoc(): 25 | # gdoc = GraphDocument() 26 | # gdoc.add_edges( 27 | # [Document(id=1), Document(id=2)], [Document(id=3), Document(id=1)] 28 | # ) 29 | # 30 | # return ((), {"gdoc": gdoc}) 31 | # 32 | # def _doc_get_adjacency(gdoc): 33 | # _ = gdoc.adjacency 34 | # 35 | # result = benchmark_time(setup=_input_graphdoc, 
func=_doc_get_adjacency) 36 | # 37 | # json_writer.append( 38 | # page=Pages.DOCUMENT_META, 39 | # result=result, 40 | # ) 41 | -------------------------------------------------------------------------------- /src/document_dict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 10 | def test_document_dict_with_text(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 13 | 14 | def _dict(docs): 15 | for d in docs: 16 | aux = d.dict() 17 | 18 | result = benchmark_time(setup=_input_docs, func=_dict) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 28 | def test_document_dict_with_array(num_docs, json_writer): 29 | def _input_docs(): 30 | return (), dict(docs=[Document(blob=np.array([1, 2])) for _ in range(num_docs)]) 31 | 32 | def _dict(docs): 33 | for d in docs: 34 | aux = d.dict() 35 | 36 | result = benchmark_time(setup=_input_docs, func=_dict) 37 | 38 | json_writer.append( 39 | page=Pages.DOCUMENT_HELPER, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | -------------------------------------------------------------------------------- /src/document_json.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 10 | def test_document_json_with_text(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict(docs=[Document(text='doc') for _ in range(num_docs)]) 13 | 14 | def _dict(docs): 15 | for d in docs: 16 | aux = d.json() 17 | 18 | result = benchmark_time(setup=_input_docs, func=_dict) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize('num_docs', [1, 100, 10_000]) 28 | def test_document_json_with_array(num_docs, json_writer): 29 | def _input_docs(): 30 | return (), dict(docs=[Document(blob=np.array([1, 2])) for _ in range(num_docs)]) 31 | 32 | def _dict(docs): 33 | for d in docs: 34 | aux = d.json() 35 | 36 | result = benchmark_time(setup=_input_docs, func=_dict) 37 | 38 | json_writer.append( 39 | page=Pages.DOCUMENT_HELPER, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | -------------------------------------------------------------------------------- /src/document_id.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | from jina.helper import random_identity 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | random_identity(use_uuid1=True) 9 | 10 | 11 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 12 | @pytest.mark.parametrize("use_uuid1", [True, False]) 13 | def test_document_document_generate_id(num_docs, use_uuid1, json_writer): 14 | def _generate_id(): 15 | for _ in range(num_docs): 16 | random_identity(use_uuid1) 17 | 18 | result = 
benchmark_time(func=_generate_id) 19 | 20 | json_writer.append( 21 | page=Pages.DOCUMENT_HELPER, 22 | result=result, 23 | metadata=dict(num_docs=num_docs), 24 | ) 25 | 26 | 27 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 28 | def test_document_document_get_id(num_docs, json_writer): 29 | def _input_docs(): 30 | return ( 31 | (), 32 | dict( 33 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 34 | ), 35 | ) 36 | 37 | def _get_id(docs): 38 | for d in docs: 39 | aux = d.id 40 | 41 | result = benchmark_time(setup=_input_docs, func=_get_id) 42 | 43 | json_writer.append( 44 | page=Pages.DOCUMENT_META, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | -------------------------------------------------------------------------------- /src/document_array_shuffle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('memmap', [False, True]) 9 | @pytest.mark.parametrize('n_docs', [1000, 10_000]) 10 | def test_da_shuffle(name, memmap, n_docs, ephemeral_tmpdir, json_writer): 11 | def _setup(memmap, n_docs): 12 | docs = [Document(text=f'Document{i}') for i in range(n_docs)] 13 | da = ( 14 | DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 15 | if memmap 16 | else DocumentArray() 17 | ) 18 | da.extend(docs) 19 | return (), dict(da=da) 20 | 21 | def _shuffle_da(da): 22 | da.shuffle() 23 | 24 | def _teardown(): 25 | import os 26 | import shutil 27 | 28 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 29 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 30 | 31 | result = benchmark_time( 32 | setup=_setup, 33 | func=_shuffle_da, 34 | teardown=_teardown, 35 | kwargs=dict(memmap=memmap, n_docs=n_docs), 36 | ) 37 | if memmap: 38 | name = name.replace('_da_', '_dam_') 39 | json_writer.append( 40 | name=name, 41 | page=Pages.DA_SHUFFLE, 42 | result=result, 43 | metadata=dict(n_nodes=memmap, n_docs=n_docs), 44 | ) 45 | -------------------------------------------------------------------------------- /src/document_array_clear.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 9 | def test_da_clear(num_docs, json_writer): 10 | def _setup(): 11 | da = DocumentArray([Document(text=f'doc{i}') for i in range(num_docs)]) 12 | return (), dict(da=da) 13 | 14 | def _da_clear(da): 15 | da.clear() 16 | 17 | result = benchmark_time(setup=_setup, func=_da_clear) 18 | 19 | json_writer.append( 20 | page=Pages.DA_CLEAR, 21 | result=result, 22 | metadata=dict(num_docs=num_docs), 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 27 | def test_dam_clear(num_docs, json_writer, ephemeral_tmpdir): 28 | def _setup(): 29 | dam = DocumentArrayMemmap((f'{str(ephemeral_tmpdir)}/memmap')) 30 | dam.extend([Document(text=f'doc{i}') for i in range(num_docs)]) 31 | return (), dict(dam=dam) 32 | 33 | def _dam_clear(dam): 34 | dam.clear() 35 | 36 | def _teardown(): 37 | import shutil 38 | 39 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 40 | 41 | result = benchmark_time(setup=_setup, func=_dam_clear, teardown=_teardown) 42 | 43 | json_writer.append( 44 | 
page=Pages.DA_CLEAR, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | -------------------------------------------------------------------------------- /src/document_content.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_get_content(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 15 | ) 16 | 17 | def _doc_get_content(docs): 18 | for doc in docs: 19 | _ = doc.content 20 | 21 | result = benchmark_time(setup=_input_docs, func=_doc_get_content) 22 | 23 | json_writer.append( 24 | page=Pages.DOCUMENT_CONTENT, 25 | result=result, 26 | metadata=dict(num_docs=num_docs), 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 31 | def test_document_set_content(num_docs, json_writer): 32 | def _input_docs(): 33 | return ( 34 | (), 35 | {"docs": [Document(blob=np.array([1, 2])) for i in range(num_docs)]}, 36 | ) 37 | 38 | def _doc_get_content(docs): 39 | x = np.array([2, 3, 4]) 40 | for doc in docs: 41 | doc.content = x 42 | 43 | result = benchmark_time(setup=_input_docs, func=_doc_get_content) 44 | 45 | json_writer.append( 46 | page=Pages.DOCUMENT_CONTENT, 47 | result=result, 48 | metadata=dict(num_docs=num_docs), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, Executor, requests 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | NUM_DOCS = 100 9 | 10 | 11 | class DummyEncoder(Executor): 12 | @requests 13 | def encode(self, docs, **kwargs): 14 | texts = docs.get_attributes('text') 15 | embeddings = [np.random.rand(1, 1024) for _ in texts] 16 | for doc, embedding in zip(docs, embeddings): 17 | doc.embedding = embedding 18 | 19 | 20 | @pytest.fixture() 21 | def input_docs(): 22 | return DocumentArray([Document(text='hey here') for _ in range(NUM_DOCS)]) 23 | 24 | 25 | @pytest.fixture() 26 | def executor(): 27 | return DummyEncoder() 28 | 29 | 30 | @pytest.mark.skip() 31 | def test_document_encoder_executor(executor, input_docs, json_writer): 32 | def _function(**kwargs): 33 | executor.encode(input_docs) 34 | 35 | result = benchmark_time(profile_cls=[Document, DocumentArray], func=_function) 36 | profiles = result.profiles 37 | document_profile = profiles[0] 38 | document_array_profile = profiles[1] 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict( 44 | profiles=dict( 45 | Document=document_profile, DocumentArray=document_array_profile 46 | ), 47 | num_docs=NUM_DOCS, 48 | ), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_weight.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_set_weight(num_docs, json_writer): 10 | def _input_docs(): 
11 | return ( 12 | (), 13 | dict( 14 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 15 | ), 16 | ) 17 | 18 | def _set_weight(docs): 19 | for d in docs: 20 | d.weight = 2.3 21 | 22 | result = benchmark_time(setup=_input_docs, func=_set_weight) 23 | 24 | json_writer.append( 25 | page=Pages.DOCUMENT_META, 26 | result=result, 27 | metadata=dict(num_docs=num_docs), 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 32 | def test_document_document_get_weight(num_docs, json_writer): 33 | def _input_docs(): 34 | return ( 35 | (), 36 | dict( 37 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 38 | ), 39 | ) 40 | 41 | def _get_weight(docs): 42 | for d in docs: 43 | aux = d.weight 44 | 45 | result = benchmark_time(setup=_input_docs, func=_get_weight) 46 | 47 | json_writer.append( 48 | page=Pages.DOCUMENT_META, 49 | result=result, 50 | metadata=dict(num_docs=num_docs), 51 | ) 52 | -------------------------------------------------------------------------------- /src/document_granularity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_get_granularity(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | {"docs": [Document(text=f"d{i}", granularity=2) for i in range(num_docs)]}, 15 | ) 16 | 17 | def _doc_get_granularity(docs): 18 | for doc in docs: 19 | _ = doc.granularity 20 | 21 | result = benchmark_time(setup=_input_docs, func=_doc_get_granularity) 22 | 23 | json_writer.append( 24 | page=Pages.DOCUMENT_META, 25 | result=result, 26 | metadata=dict(num_docs=num_docs), 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 31 | def test_document_set_granularity(num_docs, json_writer): 32 | def _input_docs(): 33 | return ( 34 | (), 35 | {"docs": [Document(text=f"d{i}", granularity=2) for i in range(num_docs)]}, 36 | ) 37 | 38 | def _doc_set_granularity(docs): 39 | x = np.array([2, 3, 4]) 40 | for doc in docs: 41 | doc.granularity = 3 42 | 43 | result = benchmark_time(setup=_input_docs, func=_doc_set_granularity) 44 | 45 | json_writer.append( 46 | page=Pages.DOCUMENT_META, 47 | result=result, 48 | metadata=dict(num_docs=num_docs), 49 | ) 50 | -------------------------------------------------------------------------------- /src/document_mime_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_mime_type(num_docs, json_writer): 10 | def _input_docs(): 11 | docs = [] 12 | for i in range(num_docs): 13 | d = Document(text=f"d{i}") 14 | d.mime_type = "text" 15 | docs.append(d) 16 | 17 | return ( 18 | (), 19 | {"docs": docs}, 20 | ) 21 | 22 | def _get_mime_type(docs): 23 | for doc in docs: 24 | _ = doc.mime_type 25 | 26 | result = benchmark_time(setup=_input_docs, func=_get_mime_type) 27 | 28 | json_writer.append( 29 | page=Pages.DOCUMENT_META, 30 | result=result, 31 | metadata=dict(num_docs=num_docs), 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 36 | def test_document_set_mime_type(num_docs, json_writer): 37 | def 
_input_docs(): 38 | return ( 39 | (), 40 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 41 | ) 42 | 43 | def _set_mime_type(docs): 44 | for doc in docs: 45 | doc.mime_type = "text" 46 | 47 | result = benchmark_time(setup=_input_docs, func=_set_mime_type) 48 | 49 | json_writer.append( 50 | page=Pages.DOCUMENT_META, 51 | result=result, 52 | metadata=dict(num_docs=num_docs), 53 | ) 54 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: PR Tests 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | check-black: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | with: 11 | fetch-depth: 0 12 | 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.9 17 | 18 | - name: check black 19 | run: | 20 | pip install black==20.8b1 21 | black -S --check src/ 22 | black -S --check scripts/ 23 | 24 | check-site-generation: 25 | runs-on: ubuntu-latest 26 | needs: check-black 27 | steps: 28 | - uses: actions/checkout@v2 29 | with: 30 | fetch-depth: 0 31 | 32 | - name: Set up Python 3.9 33 | uses: actions/setup-python@v2 34 | with: 35 | python-version: 3.9 36 | 37 | - name: check site generation 38 | run: | 39 | pip install requests==2.26.0 packaging==21.0 40 | python scripts/site_generator.py 41 | git status 42 | git diff-index --quiet HEAD -- || exit 1 43 | 44 | - name: automerge 45 | uses: "pascalgn/automerge-action@v0.14.2" 46 | if: ${{ github.actor == 'jina-bot' }} 47 | env: 48 | GITHUB_TOKEN: "${{ secrets.JINA_DEV_BOT }}" 49 | MERGE_LABELS: automerge 50 | MERGE_METHOD: merge 51 | MERGE_COMMIT_MESSAGE: automatic 52 | MERGE_FILTER_AUTHOR: jina-bot 53 | MERGE_FORKS: false 54 | MERGE_DELETE_BRANCH: true 55 | -------------------------------------------------------------------------------- /src/document_array_append.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from faker import Faker 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | fake = Faker() 9 | Faker.seed(42) 10 | NUM_DOCS = 10000 11 | 12 | 13 | @pytest.fixture 14 | def docs(): 15 | return [Document(text=fake.text()) for _ in range(NUM_DOCS)] 16 | 17 | 18 | def test_da_append(docs, json_writer): 19 | def _append(da): 20 | for doc in docs: 21 | da.append(doc) 22 | 23 | def _setup(**kwargs): 24 | return (), dict(da=DocumentArray()) 25 | 26 | result = benchmark_time(setup=_setup, func=_append) 27 | 28 | json_writer.append( 29 | page=Pages.DA_APPEND, 30 | result=result, 31 | metadata=dict(num_docs_append=NUM_DOCS), 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize('flush', [True, False]) 36 | def test_dam_append(docs, flush, json_writer, ephemeral_tmpdir): 37 | def _append(da): 38 | for doc in docs: 39 | da.append(doc, flush=flush) 40 | 41 | def _setup(**kwargs): 42 | return (), dict(da=DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap')) 43 | 44 | def _teardown(): 45 | import shutil 46 | 47 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 48 | 49 | result = benchmark_time(setup=_setup, func=_append, teardown=_teardown) 50 | 51 | json_writer.append( 52 | page=Pages.DA_APPEND, 53 | result=result, 54 | metadata=dict(num_docs_append=NUM_DOCS, flush=flush), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_clear.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 10 | def test_document_document_clear_doc_with_1_field(num_docs, json_writer): 11 | def _input_docs(): 12 | return (), dict( 13 | docs=DocumentArray([Document(text='hey here') for _ in range(num_docs)]) 14 | ) 15 | 16 | def _pop_text(docs): 17 | for d in docs: 18 | d.clear() 19 | 20 | result = benchmark_time(setup=_input_docs, func=_pop_text) 21 | 22 | json_writer.append( 23 | page=Pages.DOCUMENT_HELPER, 24 | result=result, 25 | metadata=dict(num_docs=num_docs), 26 | ) 27 | 28 | 29 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 30 | def test_document_document_clear_doc_with_2_fields(num_docs, json_writer): 31 | def _input_docs(): 32 | return (), dict( 33 | docs=DocumentArray( 34 | [ 35 | Document(text='hey here', embedding=np.array([1, 2, 3])) 36 | for _ in range(num_docs) 37 | ] 38 | ) 39 | ) 40 | 41 | def _pop_text(docs): 42 | for d in docs: 43 | d.pop('text') 44 | 45 | result = benchmark_time(setup=_input_docs, func=_pop_text) 46 | 47 | json_writer.append( 48 | page=Pages.DOCUMENT_HELPER, 49 | result=result, 50 | metadata=dict(num_docs=num_docs), 51 | ) 52 | -------------------------------------------------------------------------------- /src/document_scores.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_scores(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [ 15 | Document(text=f"d{i}", scores={"euclidean": 5, "cosine": 0.5}) 16 | for i in range(num_docs) 17 | ] 18 | }, 19 | ) 20 | 21 | def _doc_get_scores(docs): 22 | for doc in docs: 23 | _ = doc.scores["euclidean"] 24 | 25 | result = benchmark_time(setup=_input_docs, func=_doc_get_scores) 26 | 27 | json_writer.append( 28 | page=Pages.DOCUMENT_RELEVANCE, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | 33 | 34 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 35 | def test_document_set_scores(num_docs, json_writer): 36 | def _input_docs(): 37 | return ( 38 | (), 39 | {"docs": [Document(text=f"d{i}") for i in range(num_docs)]}, 40 | ) 41 | 42 | def _doc_set_scores(docs): 43 | for doc in docs: 44 | doc.scores["euclidean"] = 23 45 | 46 | result = benchmark_time(setup=_input_docs, func=_doc_set_scores) 47 | 48 | json_writer.append( 49 | page=Pages.DOCUMENT_RELEVANCE, 50 | result=result, 51 | metadata=dict(num_docs=num_docs), 52 | ) 53 | -------------------------------------------------------------------------------- /src/document_tags.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_document_tags_setter(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | dict( 14 | docs=DocumentArray( 15 | [Document(tags={"tag1": "val1"}) for _ in range(num_docs)] 16 | ) 17 | ), 18 | ) 19 | 20 | def 
_tags_set(docs): 21 | for d in docs: 22 | d.tags["tag1"] = "newval1" 23 | 24 | result = benchmark_time(setup=_input_docs, func=_tags_set) 25 | 26 | json_writer.append( 27 | page=Pages.DOCUMENT_CONTENT, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 34 | def test_document_document_tags_getter(num_docs, json_writer): 35 | def _input_docs(): 36 | return ( 37 | (), 38 | dict( 39 | docs=DocumentArray( 40 | [Document(tags={"tag1": "val1"}) for _ in range(num_docs)] 41 | ) 42 | ), 43 | ) 44 | 45 | def _get_tags_tag1(docs): 46 | for d in docs: 47 | tag = d.tags.get("tag1") 48 | 49 | result = benchmark_time(setup=_input_docs, func=_get_tags_tag1) 50 | 51 | json_writer.append( 52 | page=Pages.DOCUMENT_CONTENT, 53 | result=result, 54 | metadata=dict(num_docs=num_docs), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_evaluations.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | 8 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 9 | def test_document_get_evaluations(num_docs, json_writer): 10 | def _input_docs(): 11 | return ( 12 | (), 13 | { 14 | "docs": [ 15 | Document(evaluations={'precision': 0.9}) for i in range(num_docs) 16 | ] 17 | }, 18 | ) 19 | 20 | def _doc_get_evaluations(docs): 21 | for doc in docs: 22 | _ = doc.evaluations['precision'].value 23 | 24 | result = benchmark_time(setup=_input_docs, func=_doc_get_evaluations) 25 | 26 | json_writer.append( 27 | page=Pages.DOCUMENT_RELEVANCE, 28 | result=result, 29 | metadata=dict(num_docs=num_docs), 30 | ) 31 | 32 | 33 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 34 | def test_document_set_evaluations(num_docs, json_writer): 35 | def _input_docs(): 36 | return ( 37 | (), 38 | { 39 | "docs": [ 40 | Document(evaluations={'precision': 0.9}) for i in range(num_docs) 41 | ] 42 | }, 43 | ) 44 | 45 | def _doc_set_evaluations(docs): 46 | for doc in docs: 47 | doc.evaluations['precision'] = 0.99 48 | 49 | result = benchmark_time(setup=_input_docs, func=_doc_set_evaluations) 50 | 51 | json_writer.append( 52 | page=Pages.DOCUMENT_RELEVANCE, 53 | result=result, 54 | metadata=dict(num_docs=num_docs), 55 | ) 56 | -------------------------------------------------------------------------------- /src/document_update.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_update_embedding(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | { 15 | "docs": [ 16 | Document(embedding=np.array([1, 2, 3])) for _ in range(num_docs) 17 | ], 18 | "new_doc": Document(embedding=np.array([4, 5, 6])), 19 | }, 20 | ) 21 | 22 | def _update_embedding(docs, new_doc): 23 | for d in docs: 24 | d.update(new_doc) 25 | 26 | result = benchmark_time(setup=_input_docs, func=_update_embedding) 27 | json_writer.append( 28 | page=Pages.DOCUMENT_HELPER, 29 | result=result, 30 | metadata=dict(num_docs=num_docs), 31 | ) 32 | 33 | 34 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 35 | def test_document_document_update_text(num_docs, 
json_writer): 36 | def _input_docs(): 37 | return ( 38 | (), 39 | { 40 | "docs": [Document(text="original text") for _ in range(num_docs)], 41 | "new_doc": Document(text="new text"), 42 | }, 43 | ) 44 | 45 | def _update_text(docs, new_doc): 46 | for d in docs: 47 | d.update(new_doc) 48 | 49 | result = benchmark_time(setup=_input_docs, func=_update_text) 50 | json_writer.append( 51 | page=Pages.DOCUMENT_HELPER, 52 | result=result, 53 | metadata=dict(num_docs=num_docs), 54 | ) 55 | -------------------------------------------------------------------------------- /src/document_modality.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | 9 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 10 | def test_document_document_modality_setter(num_docs, json_writer): 11 | def _input_docs(): 12 | return ( 13 | (), 14 | dict( 15 | docs=DocumentArray([Document(text="hey here") for _ in range(num_docs)]) 16 | ), 17 | ) 18 | 19 | def _set_modality(docs): 20 | for d in docs: 21 | d.modality = "modality" 22 | 23 | result = benchmark_time(setup=_input_docs, func=_set_modality) 24 | 25 | json_writer.append( 26 | page=Pages.DOCUMENT_META, 27 | result=result, 28 | metadata=dict(num_docs=num_docs), 29 | ) 30 | 31 | 32 | @pytest.mark.parametrize("num_docs", [100, 1000, 10_000]) 33 | def test_document_document_modality_getter(num_docs, json_writer): 34 | def _input_docs(): 35 | return ( 36 | (), 37 | dict( 38 | docs=DocumentArray( 39 | [ 40 | Document(text="hey here", embedding=np.array([1, 2, 3])) 41 | for _ in range(num_docs) 42 | ] 43 | ) 44 | ), 45 | ) 46 | 47 | def _get_modality(docs): 48 | for d in docs: 49 | aux = d.modality 50 | 51 | result = benchmark_time(setup=_input_docs, func=_get_modality) 52 | 53 | json_writer.append( 54 | page=Pages.DOCUMENT_META, 55 | result=result, 56 | metadata=dict(num_docs=num_docs), 57 | ) 58 | -------------------------------------------------------------------------------- /src/document_array_persistence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_DOCS = 100000 8 | 9 | 10 | @pytest.fixture 11 | def doc_array(): 12 | return DocumentArray( 13 | (Document(text=f'This is the document number: {i}') for i in range(NUM_DOCS)) 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize('file_format', ['json', 'binary']) 18 | def test_da_save(doc_array, file_format, json_writer, ephemeral_tmpdir): 19 | extension = 'bin' if file_format == 'binary' else 'json' 20 | file = f'{str(ephemeral_tmpdir)}/doc_array.{extension}' 21 | 22 | def _save(): 23 | doc_array.save(file, file_format=file_format) 24 | 25 | def _teardown(): 26 | import os 27 | 28 | os.remove(file) 29 | 30 | result = benchmark_time(func=_save, teardown=_teardown) 31 | 32 | json_writer.append( 33 | page=Pages.DA_PERSISTENCE, 34 | result=result, 35 | metadata=dict(num_docs_append=NUM_DOCS, file_format=file_format), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('file_format', ['json', 'binary']) 40 | def test_da_load(doc_array, file_format, json_writer, ephemeral_tmpdir): 41 | extension = 'bin' if file_format == 'binary' else 'json' 42 | file = f'{str(ephemeral_tmpdir)}/doc_array.{extension}' 43 | 44 | def _save(): 45 | 
doc_array.save(file, file_format=file_format) 46 | return (), dict() 47 | 48 | def _load(): 49 | DocumentArray.load(file, file_format=file_format) 50 | 51 | def _teardown(): 52 | import os 53 | 54 | os.remove(file) 55 | 56 | result = benchmark_time(setup=_save, func=_load, teardown=_teardown) 57 | 58 | json_writer.append( 59 | page=Pages.DA_PERSISTENCE, 60 | result=result, 61 | metadata=dict(num_docs_append=NUM_DOCS, file_format=file_format), 62 | ) 63 | -------------------------------------------------------------------------------- /src/document_array_save_json_load_json.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_save_json(num_docs, json_writer, ephemeral_tmpdir): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_save_json(da): 18 | da.save_json(f'{str(ephemeral_tmpdir)}/docarray.json') 19 | 20 | def _teardown(): 21 | import os 22 | 23 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.json') 24 | 25 | result = benchmark_time( 26 | setup=_setup, 27 | func=_da_save_json, 28 | teardown=_teardown, 29 | n=NUM_REPETITIONS, 30 | ) 31 | 32 | json_writer.append( 33 | page=Pages.DA_INSERT, 34 | result=result, 35 | metadata=dict(num_docs=num_docs), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('num_docs', [100, 1000, 10_000]) 40 | def test_da_load_json(num_docs, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 43 | da = DocumentArray(docs) 44 | da.save_json(f'{str(ephemeral_tmpdir)}/docarray.json') 45 | return (), dict(da=da) 46 | 47 | def _da_load_json(da): 48 | da.load_json(f'{str(ephemeral_tmpdir)}/docarray.json') 49 | 50 | def _teardown(): 51 | import os 52 | 53 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.json') 54 | 55 | result = benchmark_time( 56 | setup=_setup, func=_da_load_json, teardown=_teardown, n=NUM_REPETITIONS 57 | ) 58 | 59 | json_writer.append( 60 | page=Pages.DA_INSERT, 61 | result=result, 62 | metadata=dict(num_docs=num_docs), 63 | ) 64 | -------------------------------------------------------------------------------- /src/document_array_save_binary_load_binary.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | 10 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 11 | def test_da_save_binary(num_docs, json_writer, ephemeral_tmpdir): 12 | def _setup(): 13 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 14 | da = DocumentArray(docs) 15 | return (), dict(da=da) 16 | 17 | def _da_save_binary(da): 18 | da.save_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 19 | 20 | def _teardown(): 21 | import os 22 | 23 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.bin') 24 | 25 | result = benchmark_time( 26 | setup=_setup, 27 | func=_da_save_binary, 28 | teardown=_teardown, 29 | n=NUM_REPETITIONS, 30 | ) 31 | 32 | json_writer.append( 33 | page=Pages.DA_INSERT, 34 | result=result, 35 | metadata=dict(num_docs=num_docs), 36 | ) 37 | 38 | 39 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 40 | def 
test_da_load_binary(num_docs, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | docs = [Document(text=f'doc{i}') for i in range(num_docs)] 43 | da = DocumentArray(docs) 44 | da.save_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 45 | return (), dict(da=da) 46 | 47 | def _da_load_binary(da): 48 | da.load_binary(f'{str(ephemeral_tmpdir)}/docarray.bin') 49 | 50 | def _teardown(): 51 | import os 52 | 53 | os.remove(f'{str(ephemeral_tmpdir)}/docarray.bin') 54 | 55 | result = benchmark_time( 56 | setup=_setup, 57 | func=_da_load_binary, 58 | teardown=_teardown, 59 | n=NUM_REPETITIONS, 60 | ) 61 | 62 | json_writer.append( 63 | page=Pages.DA_INSERT, 64 | result=result, 65 | metadata=dict(num_docs=num_docs), 66 | ) 67 | -------------------------------------------------------------------------------- /src/document_array_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | NUM_REPETITIONS = 10 9 | 10 | 11 | @pytest.mark.parametrize( 12 | 'num_docs,num_feat', [(100, 128), (10_000, 128), (10_000, 256)] 13 | ) 14 | def test_da_embeddings(num_docs, num_feat, json_writer): 15 | def _setup(): 16 | da = DocumentArray( 17 | [Document(embedding=np.random.random(num_feat)) for i in range(num_docs)] 18 | ) 19 | return (), dict(da=da) 20 | 21 | def _da_embeddings(da): 22 | embeddings = da.embeddings 23 | 24 | result = benchmark_time( 25 | setup=_setup, 26 | func=_da_embeddings, 27 | n=NUM_REPETITIONS, 28 | ) 29 | 30 | json_writer.append( 31 | page=Pages.DA_GET_ATTRIBUTES, 32 | result=result, 33 | metadata=dict(num_docs=num_docs, num_feat=num_feat), 34 | ) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | 'num_docs,num_feat', [(100, 128), (10_000, 128), (10_000, 256)] 39 | ) 40 | def test_dam_embeddings(num_docs, num_feat, json_writer, ephemeral_tmpdir): 41 | def _setup(): 42 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 43 | dam.extend( 44 | [Document(embedding=np.random.rand(num_feat)) for i in range(num_docs)] 45 | ) 46 | return (), dict(dam=dam) 47 | 48 | def _dam_embeddings(dam): 49 | embeddings = dam.embeddings 50 | 51 | def _teardown(): 52 | import shutil 53 | 54 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 55 | 56 | result = benchmark_time( 57 | setup=_setup, 58 | func=_dam_embeddings, 59 | teardown=_teardown, 60 | n=NUM_REPETITIONS, 61 | ) 62 | 63 | json_writer.append( 64 | page=Pages.DA_GET_ATTRIBUTES, 65 | result=result, 66 | metadata=dict(num_docs=num_docs, num_feat=num_feat), 67 | ) 68 | -------------------------------------------------------------------------------- /src/document_array_save.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Document, DocumentArray, DocumentArrayMemmap 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_REPETITIONS = 10 8 | 9 | # IMPORTANT: This benchmark is currently covered by 10 | # - document_array_save_binary_load_binary.py 11 | # - document_array_save_json_load_json.py 12 | # Only relevant if `.save` expands to other methods in future releases 13 | 14 | 15 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 16 | def test_da_save(num_docs, json_writer, ephemeral_tmpdir): 17 | def _setup(): 18 | da = DocumentArray([Document(text=f'doc{i}') for i in range(num_docs)]) 19 | return (), dict(da=da) 20 | 21 | def
_da_save(da): 22 | da.save(f'{str(ephemeral_tmpdir)}/docarray') 23 | 24 | def _teardown(): 25 | import os 26 | 27 | os.remove(f'{str(ephemeral_tmpdir)}/docarray') 28 | 29 | result = benchmark_time( 30 | setup=_setup, func=_da_save, teardown=_teardown, n=NUM_REPETITIONS 31 | ) 32 | 33 | def _teardown(): 34 | import shutil 35 | 36 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/save') 37 | 38 | json_writer.append( 39 | page=Pages.DA_CLEAR, 40 | result=result, 41 | metadata=dict(num_docs=num_docs), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize('num_docs', [100, 10_000]) 46 | def test_dam_save(num_docs, json_writer, ephemeral_tmpdir): 47 | def _setup(): 48 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 49 | dam.extend([Document(text=f'doc{i}') for i in range(num_docs)]) 50 | return (), dict(dam=dam) 51 | 52 | def _dam_clear(dam): 53 | dam.clear() 54 | 55 | def _teardown(): 56 | import shutil 57 | 58 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 59 | 60 | result = benchmark_time( 61 | setup=_setup, func=_dam_clear, teardown=_teardown, n=NUM_REPETITIONS 62 | ) 63 | 64 | json_writer.append( 65 | page=Pages.DA_CLEAR, 66 | result=result, 67 | metadata=dict(num_docs=num_docs), 68 | ) 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Jina 2 | 3 | We currently use time metrics to benchmark Jina features and [pytest](https://docs.pytest.org) to run these tests. 4 | 5 | ## Playbook 6 | 7 | ### Prepare environment 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | pip install pre-commit==2.13.0 12 | pre-commit install 13 | git submodule update --init 14 | ``` 15 | 16 | ### Run Locally 17 | 18 | ```bash 19 | pytest 20 | ``` 21 | 22 | ### Run on Docker 23 | 24 | ```bash 25 | JINA_VER=master 26 | docker build --build-arg JINA_VER=$JINA_VER -t benchmark . 27 | docker run -v $(pwd):/app benchmark:latest 28 | ``` 29 | 30 | ### Generate docs locally and run server 31 | 32 | ```bash 33 | python scripts/site_generator.py 34 | cd docs 35 | hugo server -D 36 | ``` 37 | 38 | ## Machine 39 | 40 | We run all tests for a given version sequentially on a single machine with the following properties: 41 | 42 | | Item | Value | 43 | | :---: | :---: | 44 | | Cloud Vendor | AWS | 45 | | Instance | c5.xlarge | 46 | | Memory | 8 GiB | 47 | | vCPU | 4 | 48 | | Processor | Intel Xeon Platinum 8124M | 49 | | Clock Speed | 3 GHz | 50 | | Storage | EBS (gp2) | 51 | 52 | ## Contributing 53 | 54 | We welcome all kinds of contributions from the open-source community, individuals and partners. We owe our success to your active involvement. 55 | 56 | Here are some quick notes you need to know before starting to contribute: 57 | 58 | - Please keep all of your tests under the `src` folder and ensure they behave as expected with `pytest`. 59 | - Please save the benchmarking artifacts in `JSON` format in the `docs/static/artifacts/${JINA_VERSION}/report.json` file. 60 | - Please add any Python dependency to the `requirements.txt` file. 61 | - Please run `scripts/site_generator.py` to generate the website every time you generate new benchmarking artifacts.
62 | - The `report.json` file should have the following schema: 63 | 64 | ```json 65 | [ 66 | { 67 | "name": "document_array_append/test_docarray_append", 68 | "iterations": 5, 69 | "mean_time": 0.007944801799999368, 70 | "std_time": 0.0012715548259231583, 71 | "metadata": { 72 | "num_docs_append": 10000 73 | } 74 | } 75 | ] 76 | ``` 77 | -------------------------------------------------------------------------------- /src/zed_runtime_callback.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from jina import Document, DocumentArray, Executor, requests 4 | from jina.clients.request import request_generator 5 | from jina.parsers import set_pea_parser 6 | from jina.peapods.runtimes.zmq.zed import ZEDRuntime 7 | from jina.types.message import Message 8 | from jina.types.request import Request 9 | 10 | from .utils.benchmark import benchmark_time 11 | 12 | NUM_DOCS = 100 13 | 14 | 15 | class DummyEncoder(Executor): 16 | @requests 17 | def encode(self, docs, **kwargs): 18 | texts = docs.get_attributes('text') 19 | embeddings = [np.random.rand(1, 1024) for _ in texts] 20 | for doc, embedding in zip(docs, embeddings): 21 | doc.embedding = embedding 22 | 23 | 24 | @pytest.fixture() 25 | def process_message(): 26 | req = list( 27 | request_generator( 28 | '/', 29 | DocumentArray([Document(text='input document') for _ in range(NUM_DOCS)]), 30 | ) 31 | )[0] 32 | msg = Message(None, req, 'test', '123') 33 | return msg 34 | 35 | 36 | @pytest.fixture() 37 | def runtime(): 38 | args = set_pea_parser().parse_args(['--uses', 'DummyEncoder']) 39 | return ZEDRuntime(args) 40 | 41 | 42 | @pytest.mark.skip() 43 | def test_zed_runtime_callback(runtime, process_message, json_writer): 44 | def _function(**kwargs): 45 | runtime._callback(process_message) 46 | 47 | result = benchmark_time( 48 | profile_cls=[Document, DocumentArray, Message, Request], 49 | func=_function, 50 | ) 51 | profiles = result.profiles 52 | document_profile = profiles[0] 53 | document_array_profile = profiles[1] 54 | message_profile = profiles[2] 55 | request_profile = profiles[3] 56 | 57 | json_writer.append( 58 | name='zed_runtime_callback/test_zed_runtime_callback', 59 | result=result, 60 | metadata=dict( 61 | profiles=dict( 62 | Document=document_profile, 63 | DocumentArray=document_array_profile, 64 | Message=message_profile, 65 | Request=request_profile, 66 | ), 67 | num_docs=NUM_DOCS, 68 | ), 69 | ) 70 | -------------------------------------------------------------------------------- /src/document_set_attributes.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images.
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_set_attribute_text(text_length, json_writer): 32 | def _set_doc(doc): 33 | doc._set_attributes(text=_generate_random_text(text_length)) 34 | 35 | result = benchmark_time( 36 | func=_set_doc, 37 | kwargs=dict(doc=Document()), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_set_attribute_blob(num_dims, json_writer): 49 | def _set_doc(doc): 50 | doc._set_attributes(blob=_generate_random_blob(num_dims)) 51 | 52 | result = benchmark_time( 53 | func=_set_doc, 54 | kwargs=dict(doc=Document()), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_set_attribute_buffer(buffer_length, json_writer): 66 | def _set_doc(doc): 67 | doc._set_attributes(buffer=_generate_random_buffer(buffer_length)) 68 | 69 | result = benchmark_time( 70 | func=_set_doc, 71 | kwargs=dict(doc=Document()), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | -------------------------------------------------------------------------------- /src/document_get_attributes.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images. 
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_get_attributes_text(text_length, json_writer): 32 | def _doc_get(doc): 33 | _ = doc.get_attributes(*['text']) 34 | 35 | result = benchmark_time( 36 | func=_doc_get, 37 | kwargs=dict(doc=Document(text=_generate_random_text(text_length))), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_get_attribute_blob(num_dims, json_writer): 49 | def _doc_get(doc): 50 | _ = doc.get_attributes(*['blob']) 51 | 52 | result = benchmark_time( 53 | func=_doc_get, 54 | kwargs=dict(doc=Document(blob=_generate_random_blob(num_dims))), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_get_attribute_buffer(buffer_length, json_writer): 66 | def _doc_get(doc): 67 | _ = doc.get_attributes(*['buffer']) 68 | 69 | result = benchmark_time( 70 | func=_doc_get, 71 | kwargs=dict(doc=Document(buffer=_generate_random_buffer(buffer_length))), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | -------------------------------------------------------------------------------- /src/document_array_extend.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document, DocumentArray, DocumentArrayMemmap 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | NUM_REPETITIONS = 25 12 | NUM_DOCS = 1000 13 | CHARS = tuple(string.ascii_uppercase + string.digits) 14 | 15 | 16 | def _generate_random_text(): 17 | return ''.join(np.random.choice(CHARS, 256)) 18 | 19 | 20 | def _generate_random_blob(): 21 | return np.random.random(512) 22 | 23 | 24 | def _generate_random_buffer(): 25 | return bytes(bytearray(os.urandom(512 * 4))) 26 | 27 | 28 | def empty_docs(): 29 | return [Document() for _ in range(NUM_DOCS)] 30 | 31 | 32 | def text_docs(): 33 | return [Document(text=_generate_random_text()) for _ in range(NUM_DOCS)] 34 | 35 | 36 | def blob_docs(): 37 | return [Document(blob=_generate_random_blob()) for _ in range(NUM_DOCS)] 38 | 39 | 40 | def buffer_docs(): 41 | return [Document(buffer=_generate_random_buffer()) for _ in range(NUM_DOCS)] 42 | 43 | 44 | @pytest.mark.parametrize('memmap', [False, True]) 45 | @pytest.mark.parametrize( 46 | 'docs, label', 47 | [ 48 | (empty_docs(), 'empty'), 49 | (blob_docs(), 'blob'), 50 | (text_docs(), 'text'), 51 | (buffer_docs(), 'buffer'), 52 | ], 53 | ) 54 | def test_da_extend(docs, label, memmap, json_writer, ephemeral_tmpdir): 55 | def _extend(da): 56 | da.extend(docs) 57 | 58 | def _build_da(**kwargs): 59 | memmap = kwargs.get('memmap', False) 60 | da = ( 61 | DocumentArray() 62 | if not memmap 63 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 64 | ) 65 | return (), dict(da=da) 66 | 67 | def _teardown(): 68 | import os 69 | import shutil 70 | 71 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 72 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 73 | 74 | result = 
benchmark_time( 75 | setup=_build_da, 76 | func=_extend, 77 | teardown=_teardown, 78 | n=NUM_REPETITIONS, 79 | kwargs=dict(memmap=memmap), 80 | ) 81 | 82 | json_writer.append( 83 | page=Pages.DA_EXTEND, 84 | result=result, 85 | metadata=dict(num_docs=len(docs), label=label, memmap=memmap), 86 | ) 87 | -------------------------------------------------------------------------------- /src/document_array_traverse.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import pytest 4 | from jina import Document, DocumentArray, DocumentArrayMemmap 5 | 6 | from .pages import Pages 7 | from .utils.benchmark import benchmark_time 8 | 9 | 10 | def _get_docs(num_docs): 11 | return [Document(text=f'This is the document number: {i}') for i in range(num_docs)] 12 | 13 | 14 | def _build_da(num_docs, num_matches, num_chunks): 15 | da = DocumentArray(_get_docs(num_docs)) 16 | for doc in da: 17 | if num_matches > 0: 18 | doc.matches.extend(_get_docs(num_matches)) 19 | if num_chunks > 0: 20 | doc.chunks.extend(_get_docs(num_chunks)) 21 | 22 | return da 23 | 24 | 25 | @pytest.mark.parametrize( 26 | 'num_docs,num_matches,num_chunks,traversal_paths', 27 | [ 28 | (10, 10, 10, 'r,c,m'), 29 | (100, 100, 100, 'r,c,m'), 30 | (1000, 100, 100, 'r,c,m'), 31 | (1000, 10, 10, 'r'), 32 | (1000, 10, 100, 'c'), 33 | (1000, 100, 10, 'm'), 34 | ], 35 | ) 36 | @pytest.mark.parametrize('memmap', [False, True]) 37 | def test_da_traverse_flat( 38 | name, 39 | num_docs, 40 | num_matches, 41 | num_chunks, 42 | traversal_paths, 43 | memmap, 44 | json_writer, 45 | ephemeral_tmpdir, 46 | ): 47 | if num_docs == 1000 and num_chunks == 1000 and num_matches == 1000: 48 | pytest.skip('problems with memory') 49 | 50 | def _traverse_flat(da): 51 | for d in da.traverse_flat(traversal_paths): 52 | pass 53 | 54 | def _build_da(): 55 | docs = _get_docs(num_docs) 56 | for doc in docs: 57 | if num_matches > 0: 58 | doc.matches.extend(_get_docs(num_matches)) 59 | if num_chunks > 0: 60 | doc.chunks.extend(_get_docs(num_chunks)) 61 | 62 | da = ( 63 | DocumentArray() 64 | if not memmap 65 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 66 | ) 67 | da.extend(docs) 68 | 69 | return (), dict(da=da) 70 | 71 | def _teardown(): 72 | try: 73 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 74 | except FileNotFoundError: 75 | pass 76 | 77 | result = benchmark_time(setup=_build_da, func=_traverse_flat, teardown=_teardown) 78 | if memmap: 79 | name = name.replace('_da_', '_dam_') 80 | json_writer.append( 81 | name=name, 82 | page=Pages.DA_TRAVERSE, 83 | result=result, 84 | metadata=dict( 85 | num_docs=num_docs, 86 | num_matches=num_matches, 87 | num_chunks=num_chunks, 88 | traversal_paths=traversal_paths, 89 | memmap=memmap, 90 | ), 91 | ) 92 | -------------------------------------------------------------------------------- /src/document_property_getter.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | 
# 1 and 3 can cover from audio signals to images. 3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 31 | def test_get_content_text(text_length, json_writer): 32 | def _doc_get(doc): 33 | _ = doc.text 34 | 35 | result = benchmark_time( 36 | func=_doc_get, 37 | kwargs=dict(doc=Document(text=_generate_random_text(text_length))), 38 | ) 39 | 40 | json_writer.append( 41 | page=Pages.DOCUMENT_CONTENT, 42 | result=result, 43 | metadata=dict(text_length=text_length), 44 | ) 45 | 46 | 47 | @pytest.mark.parametrize('num_dims', [1, 2]) 48 | def test_get_content_blob(num_dims, json_writer): 49 | def _doc_get(doc): 50 | _ = doc.blob 51 | 52 | result = benchmark_time( 53 | func=_doc_get, 54 | kwargs=dict(doc=Document(blob=_generate_random_blob(num_dims))), 55 | ) 56 | 57 | json_writer.append( 58 | page=Pages.DOCUMENT_CONTENT, 59 | result=result, 60 | metadata=dict(num_dims=num_dims), 61 | ) 62 | 63 | 64 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 65 | def test_get_content_buffer(buffer_length, json_writer): 66 | def _doc_get(doc): 67 | _ = doc.buffer 68 | 69 | result = benchmark_time( 70 | func=_doc_get, 71 | kwargs=dict(doc=Document(buffer=_generate_random_buffer(buffer_length))), 72 | ) 73 | 74 | json_writer.append( 75 | page=Pages.DOCUMENT_CONTENT, 76 | result=result, 77 | metadata=dict(buffer_length=buffer_length), 78 | ) 79 | 80 | 81 | @pytest.mark.parametrize('num_dims', [1, 2]) 82 | def test_get_embedding(num_dims, json_writer): 83 | def _doc_get(doc): 84 | _ = doc.embedding 85 | 86 | result = benchmark_time( 87 | func=_doc_get, 88 | kwargs=dict(doc=Document(embedding=_generate_random_blob(num_dims))), 89 | ) 90 | 91 | json_writer.append( 92 | page=Pages.DOCUMENT_CONTENT, 93 | result=result, 94 | metadata=dict(buffer_length=num_dims), 95 | ) 96 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | 5 | from collections import defaultdict 6 | 7 | import pytest 8 | from jina import __version__ 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption('--output-file', action='store', default='report.json') 13 | 14 | 15 | class ResultsCollector: 16 | def __init__(self, output_dir, default_filename): 17 | self.results = defaultdict(list) 18 | self.output_dir = output_dir 19 | self.default_filename = default_filename 20 | 21 | def get_test_name(): 22 | test = os.environ['PYTEST_CURRENT_TEST'] 23 | removed_head = test.split('::')[-1] 24 | return removed_head.split('[')[0].split(' (')[0] 25 | 26 | def append(self, page, result, metadata=None, name=None, target_file=None): 27 | if metadata is None: 28 | metadata = {} 29 | 30 | if name is None: 31 | name = ResultsCollector.get_test_name() 32 | if target_file is None: 33 | target_file = self.default_filename 34 | 35 | self.results[target_file].append( 36 | dict( 37 | name=name, 38 | page=page, 39 | iterations=result.iterations, 40 | mean_time=result.mean, 41 | std_time=result.std, 42 | metadata=metadata, 43 | ) 44 | ) 45 | 46 | def append_raw(self, dict_, target_file=None): 47 | if target_file is None: 48 | target_file = self.default_filename 49 | 50 | self.results[target_file].append(dict_) 51 | return self.results[target_file] 52 | 53 | def dump(self): 54 | 
Path(self.output_dir).mkdir(parents=True, exist_ok=True) 55 | for filename, content in self.results.items(): 56 | file_path = f'{self.output_dir}/{filename}' 57 | with open(file_path, 'w+') as file: 58 | json.dump(content, file) 59 | 60 | 61 | @pytest.fixture(scope='session') 62 | def json_writer(pytestconfig): 63 | version = os.environ.get('JINA_VERSION', __version__) 64 | 65 | if version == 'master': 66 | version = __version__ 67 | elif version.startswith('v'): 68 | version = version[1:] 69 | output_dir = f'docs/static/artifacts/{version}' 70 | 71 | collector = ResultsCollector(output_dir, pytestconfig.getoption('output_file')) 72 | yield collector 73 | 74 | collector.dump() 75 | 76 | 77 | @pytest.fixture() 78 | def ephemeral_tmpdir(tmpdir): 79 | yield tmpdir 80 | 81 | import shutil 82 | 83 | shutil.rmtree(str(tmpdir)) 84 | 85 | 86 | @pytest.fixture() 87 | def name(): 88 | test = os.environ['PYTEST_CURRENT_TEST'] 89 | removed_head = test.split('::')[-1] 90 | removed_tail = removed_head.split('[')[0].split(' (')[0] 91 | 92 | return removed_tail 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode configuration directory 132 | .vscode 133 | 134 | # output directory 135 | output/ 136 | outputs/ 137 | 138 | # indexer directory 139 | **/MyIndexer/ 140 | **/MyMemMap/ 141 | **/tmp/ 142 | 143 | # Hugo default output directory 144 | **/public 145 | **/resources 146 | 147 | # pycharm 148 | .idea/ 149 | 150 | # custom 151 | docs/content/ 152 | -------------------------------------------------------------------------------- /src/utils/profiler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from statistics import mean, stdev 3 | from typing import Dict, List 4 | 5 | from .timecontext import TimeContext 6 | 7 | 8 | def profile(profile, function, *args, **kwargs): 9 | def wrapper(*args, **kwargs): 10 | with TimeContext() as timer: 11 | func = function(*args, **kwargs) 12 | 13 | if function.__name__ in profile.keys(): 14 | profile[function.__name__]['time'] += timer.duration 15 | profile[function.__name__]['calls'] += 1 16 | else: 17 | profile[function.__name__] = {} 18 | profile[function.__name__]['time'] = timer.duration 19 | profile[function.__name__]['calls'] = 1 20 | return func 21 | 22 | return wrapper 23 | 24 | 25 | def merge_profiles(profiles: List[Dict]) -> Dict: 26 | avg_profile = {} 27 | for profile in profiles: 28 | for function in profile.keys(): 29 | if function in avg_profile: 30 | avg_profile[function]['time'].append(profile[function]['time']) 31 | avg_profile[function]['calls'].append(profile[function]['calls']) 32 | else: 33 | avg_profile[function] = {} 34 | avg_profile[function]['time'] = [] 35 | avg_profile[function]['calls'] = [] 36 | avg_profile[function]['time'].append(profile[function]['time']) 37 | avg_profile[function]['calls'].append(profile[function]['calls']) 38 | 39 | for function in avg_profile.keys(): 40 | avg_time = mean(avg_profile[function]['time']) 41 | stdev_time = ( 42 | stdev(avg_profile[function]['time']) 43 | if len(avg_profile[function]['time']) > 0 44 | else None 45 | ) 46 | avg_calls = mean(avg_profile[function]['calls']) 47 | stdev_calls = ( 48 | stdev(avg_profile[function]['calls']) 49 | if len(avg_profile[function]['calls']) > 0 50 | else None 51 | ) 52 | del avg_profile[function]['time'] 53 | del avg_profile[function]['calls'] 54 | avg_profile[function]['mean_time'] = avg_time 55 | avg_profile[function]['std_time'] = stdev_time 56 | avg_profile[function]['mean_calls'] = avg_calls 57 | avg_profile[function]['std_calls'] = stdev_calls 58 | 59 | return avg_profile 60 | 61 | 62 | class Profiler: 63 | def __init__(self, cls): 64 | self._cls = cls 65 | self.profile = {} 66 | self._old_funcs = {} 67 | 68 | def __enter__(self): 69 | for _, f in inspect.getmembers(self._cls, predicate=inspect.isfunction): 70 | self._old_funcs[f.__name__] = f 71 | setattr(self._cls, f.__name__, profile(self.profile, f)) 72 | return self 73 | 74 | def __exit__(self, exc_type, exc_val, exc_tb): 75 | for func_name, 
func_val in self._old_funcs.items(): 76 | setattr(self._cls, func_name, func_val) 77 | -------------------------------------------------------------------------------- /docs/config.yml: -------------------------------------------------------------------------------- 1 | baseURL: 'https://benchmark.jina.ai/' 2 | defaultContentLanguage: en 3 | title: Benchmark Jina 4 | theme: book 5 | 6 | # Book configuration 7 | disablePathToLower: true 8 | enableGitInfo: true 9 | 10 | # Needed for mermaid/katex shortcodes 11 | markup: 12 | goldmark: 13 | renderer: 14 | unsafe: true 15 | tableOfContents: 16 | startLevel: 1 17 | 18 | params: 19 | # (Optional, default light) Sets color theme: light, dark or auto. 20 | # Theme 'auto' switches between dark and light modes based on browser/os preferences 21 | BookTheme: light 22 | 23 | # (Optional, default true) Controls table of contents visibility on right side of pages. 24 | # Start and end levels can be controlled with markup.tableOfContents setting. 25 | # You can also specify this parameter per page in front matter. 26 | BookToC: true 27 | 28 | # (Optional, default none) Set the path to a logo for the book. If the logo is 29 | # /static/logo.png then the path would be logo.png 30 | # BookLogo: /img/logo-only.gif 31 | 32 | # (Optional, default none) Set leaf bundle to render as side menu 33 | # When not specified file structure and weights will be used 34 | BookMenuBundle: /menu 35 | 36 | # (Optional, default docs) Specify section of content to render as menu 37 | # You can also set value to "*" to render all sections to menu 38 | BookSection: docs 39 | 40 | # Set source repository location. 41 | # Used for 'Last Modified' and 'Edit this page' links. 42 | BookRepo: https://github.com/jina-ai/benchmark 43 | 44 | # Specifies commit portion of the link to the page's last modified commit hash for 'doc' page 45 | # type. 46 | # Required if 'BookRepo' param is set. 47 | # Value used to construct a URL consisting of BookRepo/BookCommitPath/ 48 | # Github uses 'commit', Bitbucket uses 'commits' 49 | BookCommitPath: commit 50 | 51 | # Enable 'Edit this page' links for 'doc' page type. 52 | # Disabled by default. Uncomment to enable. Requires 'BookRepo' param. 53 | # Path must point to the site directory. 54 | BookEditPath: edit/main/docs 55 | 56 | # (Optional, default January 2, 2006) Configure the date format used on the pages 57 | # - In git information 58 | # - In blog posts 59 | BookDateFormat: '2 January, 2006' 60 | 61 | # (Optional, default true) Enables search function with flexsearch, 62 | # Index is built on fly, therefore it might slowdown your website. 63 | # Configuration for indexing can be adjusted in i18n folder per language. 64 | BookSearch: false 65 | 66 | # (Optional, default true) Enables comments template on pages 67 | # By default partials/docs/comments.html includes Disqus template 68 | # See https://gohugo.io/content-management/comments/#configure-disqus 69 | # Can be overwritten by same param in page frontmatter 70 | BookComments: false 71 | 72 | # /!\ This is an experimental feature, might be removed or changed at any time 73 | # (Optional, experimental, default false) Enables service worker that caches visited pages and resources for offline use. 
74 | BookServiceWorker: true 75 | -------------------------------------------------------------------------------- /src/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from contextlib import ExitStack 3 | from statistics import mean, stdev 4 | from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple 5 | 6 | from .profiler import Profiler, merge_profiles 7 | from .timecontext import TimeContext 8 | 9 | BenchmarkResult = namedtuple( 10 | 'BenchmarkResult', ['mean', 'std', 'iterations', 'profiles'] 11 | ) 12 | 13 | 14 | def benchmark_time( 15 | func: Callable[[Any], Any], 16 | n: int = 5, 17 | setup: Optional[Callable[[Any], Optional[Tuple[Iterable, Dict[str, Any]]]]] = None, 18 | teardown: Optional[Callable[[None], None]] = None, 19 | profile_cls: Optional[List[type]] = [], 20 | args: Optional[Tuple] = None, 21 | kwargs: Optional[Dict] = None, 22 | ): 23 | """Get average time and std by benchmarking a function multiple times 24 | 25 | :param func: The function to benchmark 26 | :param setup: A setup function that can perform setup before running 27 | the ``func``. It should take as inputs the ``args`` and ``kwargs`` 28 | that you provided, and return a tuple of an iterable, which will 29 | be used to provide ``args`` to ``func``, and a dictionary, which 30 | will be used to provide ``kwargs`` to ``func``. 31 | :param teardown: A teardown function that can perform teardown/cleanup after running 32 | the ``func``. 33 | :param profile_cls: A list of the classes that want to be profiled 34 | :param n: Number of repetitions 35 | :param args: Positional arguments to pass to ``func`` (or ``setup``) 36 | :param kwargs: Keyword arguments to pass to ``func`` (or ``setup``) 37 | """ 38 | 39 | results = [] 40 | args = args if args is not None else () 41 | kwargs = kwargs if kwargs is not None else {} 42 | 43 | profiles_by_cls = {_cls: [] for _cls in profile_cls} 44 | 45 | with TimeContext() as test_timer: 46 | while test_timer.time_since_start() < 1e9 or len(results) < n: 47 | if setup is not None: 48 | new_args, new_kwargs = setup(*args, **kwargs) 49 | else: 50 | new_args, new_kwargs = args, kwargs 51 | 52 | ctx_manager = ExitStack() 53 | 54 | profiles = [ctx_manager.enter_context(Profiler(cls)) for cls in profile_cls] 55 | with ctx_manager: 56 | with TimeContext() as t: 57 | func(*new_args, **new_kwargs) 58 | 59 | for p in profiles: 60 | profiles_by_cls[p._cls].append(p.profile) 61 | 62 | if teardown is not None: 63 | teardown() 64 | 65 | results.append(t.duration) 66 | 67 | mean_profiles = [] 68 | for profile_cls, profile_list in profiles_by_cls.items(): 69 | mean_profiles.append(merge_profiles(profile_list)) 70 | 71 | m = int(mean(results)) 72 | s = int(stdev(results)) if len(results) > 1 else None 73 | print( 74 | f'----> mean_time={round(m,3)}, std_time={round(s,3)}, iterations={len(results)}' 75 | ) 76 | 77 | return BenchmarkResult(m, s, len(results), mean_profiles) 78 | -------------------------------------------------------------------------------- /src/document_array_match.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Generator, Union 2 | 3 | import numpy as np 4 | import pytest 5 | from jina import Document, DocumentArray, DocumentArrayMemmap 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | 11 | def _generate_docs_with_embs( 12 | n_docs: int, emb_size: int 13 | ) -> 
Generator[Document, None, None]: 14 | embedings = np.random.random((n_docs, emb_size)) 15 | for emb in embedings: 16 | yield Document(embedding=emb) 17 | 18 | 19 | def match_arrays( 20 | array1: Union[DocumentArray, DocumentArrayMemmap], 21 | array2: Union[DocumentArray, DocumentArrayMemmap], 22 | topk: int, 23 | metric: str, 24 | use_scipy: bool, 25 | ): 26 | array1.match(array2, limit=topk, metric=metric, use_scipy=use_scipy) 27 | 28 | 29 | def _prepare_inputs_standard( 30 | size1: int = 10, 31 | size2: int = 10_000, 32 | emb_size: int = 128, 33 | topk: int = 10, 34 | metric: str = 'cosine', 35 | use_scipy: bool = False, 36 | dam_x: bool = False, 37 | dam_y: bool = False, 38 | dam_path: str = './', 39 | ) -> Dict: 40 | if not dam_x: 41 | x = DocumentArray(_generate_docs_with_embs(size1, emb_size)) 42 | else: 43 | x = DocumentArrayMemmap(f'{dam_path}/x') 44 | x.extend(_generate_docs_with_embs(size1, emb_size)) 45 | if not dam_y: 46 | y = DocumentArray(_generate_docs_with_embs(size2, emb_size)) 47 | else: 48 | y = DocumentArrayMemmap(f'{dam_path}/y') 49 | y.extend(_generate_docs_with_embs(size2, emb_size)) 50 | return dict( 51 | array1=x, 52 | array2=y, 53 | topk=topk, 54 | metric=metric, 55 | use_scipy=use_scipy, 56 | ) 57 | 58 | 59 | @pytest.mark.parametrize('size_X', [10]) 60 | @pytest.mark.parametrize('size_Y', [100000]) 61 | @pytest.mark.parametrize('dam_x', [False]) 62 | @pytest.mark.parametrize('dam_y', [False]) 63 | @pytest.mark.parametrize('emb_size', [256]) 64 | @pytest.mark.parametrize('use_scipy', [False]) 65 | @pytest.mark.parametrize('metric', ['euclidean']) 66 | @pytest.mark.parametrize('top_k', [3]) 67 | def test_match( 68 | size_X: int, 69 | size_Y: int, 70 | dam_x: bool, 71 | dam_y: bool, 72 | emb_size: int, 73 | use_scipy: bool, 74 | metric: str, 75 | top_k: int, 76 | ephemeral_tmpdir, 77 | json_writer, 78 | ): 79 | result = benchmark_time( 80 | match_arrays, 81 | kwargs=_prepare_inputs_standard( 82 | size1=size_X, 83 | size2=size_Y, 84 | dam_x=dam_x, 85 | dam_y=dam_y, 86 | emb_size=emb_size, 87 | use_scipy=use_scipy, 88 | metric=metric, 89 | dam_path=str(ephemeral_tmpdir), 90 | topk=top_k, 91 | ), 92 | ) 93 | 94 | json_writer.append( 95 | page=Pages.DA_MATCH, 96 | result=result, 97 | metadata=dict( 98 | size_X=size_X, 99 | size_Y=size_Y, 100 | dam_x=dam_x, 101 | dam_y=dam_y, 102 | emb_size=emb_size, 103 | use_scipy=use_scipy, 104 | metric=metric, 105 | top_k=top_k, 106 | ), 107 | ) 108 | -------------------------------------------------------------------------------- /src/flow.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from jina import Flow 3 | 4 | from .pages import Pages 5 | from .utils.benchmark import benchmark_time 6 | 7 | NUM_PODS = 10 8 | 9 | 10 | def _long_flow(): 11 | f = Flow() 12 | for _ in range(NUM_PODS): 13 | f = f.add() 14 | 15 | return f 16 | 17 | 18 | def _wide_flow(): 19 | f = Flow().add(name='pod0') 20 | for i in range(NUM_PODS): 21 | f = f.add(needs=['pod0'], name=f'wide_{i}') 22 | f = f.add(name='join', needs=[f'wide_{i}' for i in range(NUM_PODS)]) 23 | return f 24 | 25 | 26 | @pytest.mark.parametrize( 27 | 'flow, ftype', [(_long_flow(), 'long'), (_wide_flow(), 'wide')] 28 | ) 29 | def test_local_flow_start(flow, ftype, json_writer): 30 | def _start(): 31 | flow.start() 32 | 33 | def _close(): 34 | flow.close() 35 | 36 | result = benchmark_time(func=_start, teardown=_close) 37 | 38 | json_writer.append( 39 | page=Pages.FLOW, 40 | result=result, 41 | metadata=dict(flow=ftype, 
num_pods=NUM_PODS), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize( 46 | 'flow, ftype', [(_long_flow(), 'long'), (_wide_flow(), 'wide')] 47 | ) 48 | def test_local_flow_close(flow, ftype, json_writer): 49 | def _start(): 50 | flow.start() 51 | return (), {} 52 | 53 | def _close(): 54 | flow.close() 55 | 56 | result = benchmark_time(setup=_start, func=_close) 57 | 58 | json_writer.append( 59 | page=Pages.FLOW, 60 | result=result, 61 | metadata=dict(flow=ftype, num_pods=NUM_PODS), 62 | ) 63 | 64 | 65 | yaml_long = '''jtype: Flow 66 | version: '1' 67 | pods: 68 | - uses: 69 | name: pod1 70 | - uses: 71 | name: pod2 72 | - uses: 73 | name: pod3 74 | - uses: 75 | name: pod4 76 | - uses: 77 | name: pod5 78 | - uses: 79 | name: pod6 80 | - uses: 81 | name: pod7 82 | - uses: 83 | name: pod8 84 | - uses: 85 | name: pod9 86 | - uses: 87 | name: pod10 88 | ''' 89 | 90 | yaml_wide = '''jtype: Flow 91 | version: '1' 92 | pods: 93 | - uses: 94 | name: pod0 95 | - uses: 96 | name: wide_0 97 | needs: [pod0] 98 | - uses: 99 | name: wide_1 100 | needs: [pod0] 101 | - uses: 102 | name: wide_2 103 | needs: [pod0] 104 | - uses: 105 | name: wide_3 106 | needs: [pod0] 107 | - uses: 108 | name: wide_4 109 | needs: [pod0] 110 | - uses: 111 | name: wide_5 112 | needs: [pod0] 113 | - uses: 114 | name: wide_6 115 | needs: [pod0] 116 | - uses: 117 | name: wide_7 118 | needs: [pod0] 119 | - uses: 120 | name: wide_8 121 | needs: [pod0] 122 | - uses: 123 | name: wide_9 124 | needs: [pod0] 125 | - uses: 126 | name: join 127 | needs: [wide_0, wide_1, wide_2, wide_3, wide_4, wide_5, wide_6, wide_7, wide_8, wide_9] 128 | ''' 129 | 130 | 131 | @pytest.mark.parametrize('config, ftype', [(yaml_long, 'long'), (yaml_wide, 'wide')]) 132 | def test_flow_load_config(config, ftype, json_writer): 133 | def _build(): 134 | Flow.load_config(config) 135 | 136 | result = benchmark_time(func=_build) 137 | 138 | json_writer.append( 139 | page=Pages.FLOW, 140 | result=result, 141 | metadata=dict(flow=ftype, num_pods=NUM_PODS), 142 | ) 143 | -------------------------------------------------------------------------------- /src/document_array_construct.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from faker import Faker 3 | from jina import Document, DocumentArray, DocumentArrayMemmap 4 | 5 | from .pages import Pages 6 | from .utils.benchmark import benchmark_time 7 | 8 | fake = Faker() 9 | Faker.seed(42) 10 | NUM_DOCS = 10000 11 | 12 | 13 | @pytest.fixture 14 | def docs(): 15 | return [Document(text=fake.text()) for _ in range(NUM_DOCS)] 16 | 17 | 18 | @pytest.fixture 19 | def doc_with_chunks(): 20 | d = Document() 21 | for idx in range(NUM_DOCS): 22 | d.chunks.append(Document(text=fake.text())) 23 | return d 24 | 25 | 26 | @pytest.fixture() 27 | def tuple_docs(docs): 28 | return tuple(docs) 29 | 30 | 31 | @pytest.fixture 32 | def doc_array(docs): 33 | return DocumentArray(docs) 34 | 35 | 36 | @pytest.fixture 37 | def doc_array_memmap(docs, ephemeral_tmpdir): 38 | dam = DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 39 | dam.extend(docs) 40 | return dam 41 | 42 | 43 | def test_construct_document_array_from_repeated_container(doc_with_chunks, json_writer): 44 | def _construct(): 45 | DocumentArray(doc_with_chunks.chunks) 46 | 47 | result = benchmark_time(func=_construct) 48 | 49 | json_writer.append( 50 | page=Pages.DA_CONSTRUCT, 51 | result=result, 52 | metadata=dict(num_chunks=NUM_DOCS), 53 | ) 54 | 55 | 56 | def 
test_construct_document_array_from_another_documentarray(doc_array, json_writer): 57 | def _construct(): 58 | DocumentArray(doc_array) 59 | 60 | result = benchmark_time(func=_construct) 61 | 62 | json_writer.append( 63 | page=Pages.DA_CONSTRUCT, 64 | result=result, 65 | metadata=dict(num_docs=len(doc_array)), 66 | ) 67 | 68 | 69 | def test_construct_document_array_from_list_of_documents(docs, json_writer): 70 | def _construct(): 71 | DocumentArray(docs) 72 | 73 | result = benchmark_time(func=_construct) 74 | 75 | json_writer.append( 76 | page=Pages.DA_CONSTRUCT, 77 | result=result, 78 | metadata=dict(num_docs=len(docs)), 79 | ) 80 | 81 | 82 | def test_construct_document_array_from_tuple_of_documents(tuple_docs, json_writer): 83 | def _construct(): 84 | DocumentArray(tuple_docs) 85 | 86 | result = benchmark_time(func=_construct) 87 | 88 | json_writer.append( 89 | page=Pages.DA_CONSTRUCT, 90 | result=result, 91 | metadata=dict(num_docs=len(tuple_docs)), 92 | ) 93 | 94 | 95 | def test_construct_document_array_from_generator(json_writer): 96 | def _yield_documents(): 97 | """Used to benchmark construct DocumentArray from a document generator.""" 98 | for idx in range(NUM_DOCS): 99 | yield Document(text=fake.text()) 100 | 101 | def _construct(): 102 | DocumentArray(_yield_documents()) 103 | 104 | result = benchmark_time(func=_construct) 105 | 106 | json_writer.append( 107 | page=Pages.DA_CONSTRUCT, 108 | result=result, 109 | metadata=dict(num_docs=NUM_DOCS), 110 | ) 111 | 112 | 113 | def test_construct_document_array_from_another_documentarray_memmap( 114 | doc_array_memmap, json_writer 115 | ): 116 | def _construct(): 117 | DocumentArray(doc_array_memmap) 118 | 119 | result = benchmark_time(func=_construct) 120 | 121 | json_writer.append( 122 | page=Pages.DA_CONSTRUCT, 123 | result=result, 124 | metadata=dict(num_docs=len(doc_array_memmap)), 125 | ) 126 | -------------------------------------------------------------------------------- /src/document_array_get_attributes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from faker import Faker 7 | from jina import Document, DocumentArray, DocumentArrayMemmap 8 | 9 | from .pages import Pages 10 | from .utils.benchmark import benchmark_time 11 | 12 | fake = Faker() 13 | Faker.seed(42) 14 | NUM_DOCS = 1000 15 | CHARS = tuple(string.ascii_uppercase + string.digits) 16 | 17 | 18 | def _generate_random_text(): 19 | return ''.join(np.random.choice(CHARS, 256)) 20 | 21 | 22 | def _generate_random_blob(): 23 | return np.random.random(512) 24 | 25 | 26 | def _generate_random_buffer(): 27 | return bytes(bytearray(os.urandom(512 * 4))) 28 | 29 | 30 | def empty_docs(): 31 | return [Document() for _ in range(NUM_DOCS)] 32 | 33 | 34 | def text_docs(num_docs): 35 | return [Document(text=_generate_random_text()) for _ in range(num_docs)] 36 | 37 | 38 | def blob_docs(num_docs): 39 | return [Document(blob=_generate_random_blob()) for _ in range(num_docs)] 40 | 41 | 42 | def buffer_docs(num_docs): 43 | return [Document(buffer=_generate_random_buffer()) for _ in range(num_docs)] 44 | 45 | 46 | def embedding_docs(num_docs): 47 | return [Document(embedding=_generate_random_blob()) for _ in range(num_docs)] 48 | 49 | 50 | @pytest.mark.parametrize('memmap', [False, True]) 51 | @pytest.mark.parametrize( 52 | 'field, docs_get_fn', 53 | [ 54 | ('blob', blob_docs), 55 | ('text', text_docs), 56 | ('buffer', buffer_docs), 57 | ('embedding', embedding_docs), 58 | 
], 59 | ) 60 | @pytest.mark.parametrize( 61 | 'num_docs', 62 | [100, 10000], 63 | ) 64 | def test_da_get_attributes( 65 | name, field, docs_get_fn, memmap, num_docs, json_writer, ephemeral_tmpdir 66 | ): 67 | def _get_attributes(da): 68 | da.get_attributes(*[field]) 69 | 70 | def _build_da(**kwargs): 71 | memmap = kwargs.get('memmap', False) 72 | docs = kwargs.get('docs', False) 73 | da = ( 74 | DocumentArray() 75 | if not memmap 76 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 77 | ) 78 | da.extend(docs) 79 | return (), dict(da=da) 80 | 81 | def _teardown(): 82 | import os 83 | import shutil 84 | 85 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 86 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 87 | 88 | result = benchmark_time( 89 | setup=_build_da, 90 | func=_get_attributes, 91 | teardown=_teardown, 92 | kwargs=dict(memmap=memmap, docs=docs_get_fn(num_docs)), 93 | ) 94 | if memmap: 95 | name = name.replace('_da_', '_dam_') 96 | json_writer.append( 97 | name=name, 98 | page=Pages.DA_GET_ATTRIBUTES, 99 | result=result, 100 | metadata=dict(num_docs=num_docs, field=field, memmap=memmap), 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize('memmap', [False, True]) 105 | @pytest.mark.parametrize( 106 | 'num_docs', 107 | [100, 10000], 108 | ) 109 | def test_da_embeddings_property(name, memmap, num_docs, json_writer, ephemeral_tmpdir): 110 | def _get_embeddings(da): 111 | da.embeddings 112 | 113 | def _build_da(**kwargs): 114 | memmap = kwargs.get('memmap', False) 115 | docs = embedding_docs(num_docs) 116 | da = ( 117 | DocumentArray() 118 | if not memmap 119 | else DocumentArrayMemmap(f'{str(ephemeral_tmpdir)}/memmap') 120 | ) 121 | da.extend(docs) 122 | return (), dict(da=da) 123 | 124 | def _teardown(): 125 | import os 126 | import shutil 127 | 128 | if os.path.exists(f'{str(ephemeral_tmpdir)}/memmap'): 129 | shutil.rmtree(f'{str(ephemeral_tmpdir)}/memmap') 130 | 131 | result = benchmark_time( 132 | setup=_build_da, 133 | func=_get_embeddings, 134 | teardown=_teardown, 135 | kwargs=dict(memmap=memmap), 136 | ) 137 | if memmap: 138 | name = name.replace('_da_', '_dam_') 139 | json_writer.append( 140 | name=name, 141 | page=Pages.DA_GET_ATTRIBUTES, 142 | result=result, 143 | metadata=dict(num_docs=num_docs, memmap=memmap), 144 | ) 145 | -------------------------------------------------------------------------------- /src/document_graph_construction.py: -------------------------------------------------------------------------------- 1 | # import random 2 | # 3 | # import pytest 4 | # from jina import Document 5 | # from jina.types.document.graph import GraphDocument 6 | # 7 | # from .pages import Pages 8 | # from .utils.benchmark import benchmark_time 9 | # 10 | # 11 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 12 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 13 | # def test_graph_add_edges_assuming_no_nodes_present(n_nodes, n_edges, json_writer): 14 | # def _setup(): 15 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 16 | # sources = [random.choice(docs) for i in range(n_edges)] 17 | # targets = [random.choice(docs) for i in range(n_edges)] 18 | # edge_features = [ 19 | # {'text': f'I connect Doc{i} and Doc{j}'} for i, j in zip(sources, targets) 20 | # ] 21 | # return (), dict(sources=sources, targets=targets, edge_features=edge_features) 22 | # 23 | # def _build_graph_doc(sources, targets, edge_features): 24 | # graph = GraphDocument() 25 | # graph.add_edges(sources, targets, edge_features=edge_features) 26 | # 27 | # result = 
benchmark_time( 28 | # setup=_setup, 29 | # func=_build_graph_doc, 30 | # ) 31 | # 32 | # json_writer.append( 33 | # page=Pages.DOCUMENT_GRAPH, 34 | # result=result, 35 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 36 | # ) 37 | # 38 | # 39 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 40 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 41 | # def test_graph_add_edges_assuming_all_nodes_present(n_nodes, n_edges, json_writer): 42 | # def _setup(): 43 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 44 | # sources = [random.choice(docs) for i in range(n_edges)] 45 | # targets = [random.choice(docs) for i in range(n_edges)] 46 | # edge_features = [ 47 | # {'text': f'I connect Doc{i} and Doc{j}'} for i, j in zip(sources, targets) 48 | # ] 49 | # graph = GraphDocument() 50 | # graph.add_nodes(docs) 51 | # return (), dict( 52 | # graph=graph, sources=sources, targets=targets, edge_features=edge_features 53 | # ) 54 | # 55 | # def _build_graph_doc(graph, sources, targets, edge_features): 56 | # graph.add_edges(sources, targets, edge_features=edge_features) 57 | # 58 | # result = benchmark_time( 59 | # setup=_setup, 60 | # func=_build_graph_doc, 61 | # ) 62 | # 63 | # json_writer.append( 64 | # page=Pages.DOCUMENT_GRAPH, 65 | # result=result, 66 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 67 | # ) 68 | # 69 | # 70 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 71 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 72 | # def test_graph_add_single_edge_assuming_all_nodes_present( 73 | # n_nodes, n_edges, json_writer 74 | # ): 75 | # def _setup(): 76 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 77 | # sources = [random.choice(docs) for i in range(n_edges)] 78 | # targets = [random.choice(docs) for i in range(n_edges)] 79 | # graph = GraphDocument() 80 | # graph.add_nodes(docs) 81 | # return (), dict(graph=graph, sources=sources, targets=targets) 82 | # 83 | # def _build_graph_doc(graph, sources, targets): 84 | # for source, target in zip(sources, targets): 85 | # graph.add_single_edge(source, target) 86 | # return graph 87 | # 88 | # result = benchmark_time(setup=_setup, func=_build_graph_doc) 89 | # 90 | # json_writer.append( 91 | # page=Pages.DOCUMENT_GRAPH, 92 | # result=result, 93 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 94 | # ) 95 | # 96 | # 97 | # @pytest.mark.parametrize('n_edges', [200, 2_000]) 98 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 99 | # def test_graph_add_single_edge_assuming_no_nodes_present(n_nodes, n_edges, json_writer): 100 | # def _setup(): 101 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 102 | # sources = [random.choice(docs) for i in range(n_edges)] 103 | # targets = [random.choice(docs) for i in range(n_edges)] 104 | # return (), dict(sources=sources, targets=targets) 105 | # 106 | # def _build_graph_doc(sources, targets): 107 | # graph = GraphDocument() 108 | # for source, target in zip(sources, targets): 109 | # graph.add_single_edge(source, target) 110 | # return graph 111 | # 112 | # result = benchmark_time(setup=_setup, func=_build_graph_doc) 113 | # 114 | # json_writer.append( 115 | # page=Pages.DOCUMENT_GRAPH, 116 | # result=result, 117 | # metadata=dict(n_nodes=n_nodes, n_edges=n_edges), 118 | # ) 119 | # 120 | # 121 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 122 | # def test_graph_add_single_node(n_nodes, json_writer): 123 | # def _setup(): 124 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 125 | # graph = 
GraphDocument() 126 | # return (), dict(graph=graph, docs=docs) 127 | # 128 | # def _build_graph_doc(graph, docs): 129 | # for doc in docs: 130 | # graph.add_single_node(doc) 131 | # 132 | # result = benchmark_time( 133 | # setup=_setup, 134 | # func=_build_graph_doc, 135 | # ) 136 | # 137 | # json_writer.append( 138 | # page=Pages.DOCUMENT_GRAPH, 139 | # result=result, 140 | # metadata=dict(n_nodes=n_nodes), 141 | # ) 142 | # 143 | # 144 | # @pytest.mark.parametrize('n_nodes', [100, 1_000]) 145 | # def test_graph_add_nodes(n_nodes, json_writer): 146 | # def _setup(): 147 | # docs = [Document(text=f'Document{i}') for i in range(n_nodes)] 148 | # graph = GraphDocument() 149 | # graph.add_nodes(docs) 150 | # return (), dict(graph=graph, docs=docs) 151 | # 152 | # def _build_graph_doc(graph, docs): 153 | # graph.add_nodes(docs) 154 | # 155 | # result = benchmark_time( 156 | # setup=_setup, 157 | # func=_build_graph_doc, 158 | # ) 159 | # 160 | # json_writer.append( 161 | # page=Pages.DOCUMENT_GRAPH, 162 | # result=result, 163 | # metadata=dict(n_nodes=n_nodes), 164 | # ) 165 | -------------------------------------------------------------------------------- /src/searchers_compare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from collections import defaultdict 4 | from statistics import mean, stdev 5 | 6 | import numpy as np 7 | import pytest 8 | from jina import Document, DocumentArray, Executor, requests, DocumentArrayMemmap 9 | from pympler import asizeof, tracker 10 | 11 | from .pages import Pages 12 | from .utils.timecontext import TimeContext 13 | 14 | NUM_REPETITIONS = 5 15 | NUM_REQUESTS = 100 16 | TARGET_FILE = 'searchers_compare.json' 17 | 18 | 19 | def _get_docs(number_of_documents, embedding_size): 20 | return [ 21 | Document(embedding=np.random.rand(embedding_size), id=str(i)) 22 | for i in range(number_of_documents) 23 | ] 24 | 25 | 26 | def _get_dam(number_of_documents, embedding_size, dir_path, **kwargs): 27 | tmp_path = f'{dir_path}/memmap_{number_of_documents}_{embedding_size}_tmp' 28 | path = f'{dir_path}/memmap_{number_of_documents}_{embedding_size}' 29 | if os.path.exists(path): 30 | return path 31 | da = DocumentArrayMemmap(tmp_path) 32 | docs = _get_docs(number_of_documents, embedding_size) 33 | da.extend(docs) 34 | da.save() 35 | shutil.copytree(tmp_path, path) 36 | da.clear() 37 | da._last_mmap = None 38 | return path 39 | 40 | 41 | def _get_da(number_of_documents, embedding_size, dir_path, **kwargs): 42 | path = f'{dir_path}/docs.bin' 43 | if os.path.exists(path): 44 | return path 45 | da = DocumentArray() 46 | docs = _get_docs(number_of_documents, embedding_size) 47 | da.extend(docs) 48 | da.save(path, file_format='binary') 49 | da.clear() 50 | return path 51 | 52 | 53 | def _get_document_array(dam_index, **kwargs): 54 | return _get_dam(**kwargs) if dam_index else _get_da(**kwargs) 55 | 56 | 57 | class DocumentArraySearcher(Executor): 58 | def __init__( 59 | self, 60 | indexed_docs_path, 61 | dam_index, 62 | warmup=False, 63 | top_k: int = 50, 64 | *args, 65 | **kwargs, 66 | ): 67 | super().__init__(*args, **kwargs) 68 | self.indexed_docs_path = indexed_docs_path 69 | self._index_docs = ( 70 | DocumentArray.load(indexed_docs_path, file_format='binary') 71 | if not dam_index 72 | else DocumentArrayMemmap(indexed_docs_path) 73 | ) 74 | if warmup: 75 | self._index_docs.get_attributes('embedding') 76 | self._top_k = top_k 77 | 78 | @requests 79 | def search(self, docs, **kwargs): 80 | docs.match( 
81 | self._index_docs, 82 | metric='cosine', 83 | use_scipy=False, 84 | limit=self._top_k, 85 | ) 86 | 87 | 88 | @pytest.mark.skipif( 89 | 'JINA_BENCHMARK_SEARCHERS' not in os.environ, 90 | reason='This test take a lot of time, to be run explicitly and isolated from the rest', 91 | ) 92 | @pytest.mark.parametrize( 93 | 'name,indexed_docs,docs_per_request,emb_size', 94 | [ 95 | ('Tiny Index', 100, 1, 128), 96 | ('Small Index', 10000, 1, 128), 97 | ('Medium Index', 100000, 1, 128), 98 | # ('Big Index', 1000000, 1, 128), 99 | ('Batch requesting', 100000, 32, 128), 100 | ('Big embeddings', 100000, 1, 512), 101 | ], 102 | ) 103 | @pytest.mark.parametrize( 104 | 'dam_index,warmup', [(False, False), (True, False), (True, True)] 105 | ) 106 | def test_search_compare( 107 | name, 108 | indexed_docs, 109 | docs_per_request, 110 | emb_size, 111 | dam_index, 112 | warmup, 113 | ephemeral_tmpdir, 114 | json_writer, 115 | ): 116 | def _get_indexer(): 117 | path = _get_document_array( 118 | dam_index=dam_index, 119 | number_of_documents=indexed_docs, 120 | embedding_size=emb_size, 121 | dir_path=str(ephemeral_tmpdir), 122 | ) 123 | 124 | return DocumentArraySearcher( 125 | indexed_docs_path=path, dam_index=dam_index, warmup=warmup 126 | ) 127 | 128 | query_docs = [ 129 | DocumentArray(_get_docs(docs_per_request, embedding_size=emb_size)) 130 | ] * NUM_REQUESTS 131 | 132 | data_points = defaultdict(list) 133 | all_search_timings = [] 134 | 135 | def _func(): 136 | with TimeContext() as indexer_context: 137 | indexer = _get_indexer() 138 | print(f' indexer created/loaded in {indexer_context.duration / 1e6} ms') 139 | data_points['index_time'].append(indexer_context.duration) 140 | data_points['index_memory'].append(asizeof.asizeof(indexer)) 141 | 142 | tr = tracker.SummaryTracker() 143 | sum1 = tr.create_summary() 144 | timings = [] 145 | for i in range(NUM_REQUESTS): 146 | with TimeContext() as seach_context: 147 | indexer.search(query_docs[i]) 148 | timings.append(seach_context.duration) 149 | sum2 = tr.create_summary() 150 | diff = tr.diff(sum1, sum2) 151 | print(f' search finished in {sum(timings) / 1e6} ms') 152 | data_points['search_time'].append(sum(timings)) 153 | all_search_timings.extend(timings) 154 | data_points['search_memory'].append(sum([ob_sum[2] for ob_sum in diff])) 155 | 156 | shutil.rmtree(str(ephemeral_tmpdir), ignore_errors=True) 157 | os.makedirs(str(ephemeral_tmpdir)) 158 | 159 | for i in range(NUM_REPETITIONS): 160 | _func() 161 | 162 | results = {} 163 | 164 | for field in ['index_time', 'index_memory', 'search_time', 'search_memory']: 165 | results[f'mean_{field}'], results[f'std_{field}'] = get_mean_and_std( 166 | data_points[field] 167 | ) 168 | 169 | results['p90'] = get_percentile(all_search_timings, 90) 170 | results['p99'] = get_percentile(all_search_timings, 99) 171 | 172 | json_writer.append_raw( 173 | target_file=TARGET_FILE, 174 | dict_=dict( 175 | name=name, 176 | page=Pages.INDEXER_COMPARISON, 177 | iterations=NUM_REPETITIONS, 178 | results=results, 179 | metadata=dict( 180 | indexed_docs=indexed_docs, 181 | embedding_size=emb_size, 182 | docs_per_request=docs_per_request, 183 | num_requests=NUM_REQUESTS, 184 | dam_index=dam_index, 185 | warmup_embeddings=warmup, 186 | ), 187 | ), 188 | ) 189 | 190 | 191 | def get_mean_and_std(data): 192 | mean_ = mean(data) 193 | std_ = stdev(data) if len(data) > 1 else None 194 | return mean_, std_ 195 | 196 | 197 | def get_percentile(timings, percentile): 198 | array = np.array(timings) 199 | return np.percentile(array, percentile) 
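# ---------------------------------------------------------------------------
# Editor's note: the block below is an illustrative sketch, not part of the
# original benchmark suite. Assuming it lives in this module (so TimeContext,
# get_mean_and_std and get_percentile defined above are in scope), it shows the
# aggregation pattern used by test_search_compare: time each request with
# TimeContext, then summarize the nanosecond timings as mean/std and tail
# percentiles. The sleep-based workload and the name _demo_aggregation are
# made up purely for illustration.
def _demo_aggregation(num_requests: int = 20):
    import random
    import time

    timings = []
    for _ in range(num_requests):
        with TimeContext() as ctx:
            time.sleep(random.uniform(0.001, 0.005))  # stand-in for indexer.search(...)
        timings.append(ctx.duration)  # TimeContext reports nanoseconds

    mean_ns, std_ns = get_mean_and_std(timings)
    print(f'mean={mean_ns / 1e6:.2f} ms, std={(std_ns or 0) / 1e6:.2f} ms')
    print(f'p90={get_percentile(timings, 90) / 1e6:.2f} ms')
    print(f'p99={get_percentile(timings, 99) / 1e6:.2f} ms')
# ---------------------------------------------------------------------------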
200 | -------------------------------------------------------------------------------- /docs/static/artifacts/2.1.2/searchers_compare.json: -------------------------------------------------------------------------------- 1 | [{"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4663456, "std_index_time": 83310.14742514864, "mean_index_memory": 15688, "std_index_memory": 0.0, "mean_search_time": 464436898.6, "std_search_time": 4998683.976273986, "mean_search_memory": 1400770.6, "std_search_memory": 4516.044153902838, "p90": 4762196.6, "p99": 5214065.249999998}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 412290205, "std_index_time": 13893922.332277287, "mean_index_memory": 1177192, "std_index_memory": 0.0, "mean_search_time": 4712511737.6, "std_search_time": 10168175.533933548, "mean_search_memory": 1399055.6, "std_search_memory": 1360.870971106372, "p90": 47989977.9, "p99": 48727291.11}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4369986530.8, "std_index_time": 66034291.207184196, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 27651288787.6, "std_search_time": 674765844.5562556, "mean_search_memory": 1399217.8, "std_search_memory": 1660.9512936868439, "p90": 281231409.1, "p99": 283362721.43}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 4350911889.2, "std_index_time": 15223257.526421051, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 40605954140.4, "std_search_time": 534816506.5463835, "mean_search_memory": 1398503, "std_search_memory": 0.0, "p90": 412417490.7, "p99": 414500806.53}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5331926431.2, "std_index_time": 149836770.8967121, "mean_index_memory": 14045160, "std_index_memory": 0.0, "mean_search_time": 69454878061.4, "std_search_time": 350069105.02848333, "mean_search_memory": 1398475, "std_search_memory": 0.0, "p90": 698528512.0, "p99": 701192757.06}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": false, "warmup_embeddings": false}}, {"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6735359.8, "std_index_time": 176449.9806579757, "mean_index_memory": 51968, "std_index_memory": 0.0, "mean_search_time": 574747374.4, "std_search_time": 24103476.25610035, "mean_search_memory": 1446180.2, "std_search_memory": 316.7983585816063, "p90": 5750141.4, "p99": 6004823.099999961}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 
5, "results": {"mean_index_time": 526890272.6, "std_index_time": 4792654.779521659, "mean_index_memory": 3639272, "std_index_memory": 0.0, "mean_search_time": 1258268522.8, "std_search_time": 89951360.81594121, "mean_search_memory": 1914582.8, "std_search_memory": 517.762204105321, "p90": 10897318.5, "p99": 13446589.099998713}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5441540046, "std_index_time": 15804107.99446163, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 10250244642.4, "std_search_time": 1403353517.6519396, "mean_search_memory": 1915834, "std_search_memory": 94.05317644821997, "p90": 85302622.7, "p99": 114642680.12997666}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 5436396953.8, "std_index_time": 11955926.022260036, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 28176033346.2, "std_search_time": 603483342.9079329, "mean_search_memory": 1997230, "std_search_memory": 22.93468988235943, "p90": 269443261.2, "p99": 288813878.339986}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6557379258.6, "std_index_time": 146945815.77846104, "mean_index_memory": 39353320, "std_index_memory": 0.0, "mean_search_time": 32178833682, "std_search_time": 4081333685.770137, "mean_search_memory": 1915692.8, "std_search_memory": 138.3011207474473, "p90": 242671132.2, "p99": 295682987.7999634}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": false}}, {"name": "Tiny Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 10787957.6, "std_index_time": 286894.32551777666, "mean_index_memory": 99648, "std_index_memory": 0.0, "mean_search_time": 554426485.4, "std_search_time": 2681315.314795203, "mean_search_memory": 1399307, "std_search_memory": 0.0, "p90": 5701958.4, "p99": 6041982.189999999}, "metadata": {"indexed_docs": 100, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Small Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 669417835.2, "std_index_time": 5122590.498427109, "mean_index_memory": 4178768, "std_index_memory": 0.0, "mean_search_time": 1051474361.4, "std_search_time": 9151433.955484506, "mean_search_memory": 1403393.4, "std_search_memory": 71.07249819726334, "p90": 10893107.0, "p99": 12101473.249999993}, "metadata": {"indexed_docs": 10000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Medium Index", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6861858093.8, "std_index_time": 21880322.70985081, "mean_index_memory": 39892816, "std_index_memory": 0.0, "mean_search_time": 5103391090.8, "std_search_time": 16373359.897277229, "mean_search_memory": 1403631.4, 
"std_search_memory": 51.699129586483366, "p90": 51674190.2, "p99": 55007636.15999998}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Batch requesting", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 6920679618, "std_index_time": 20688891.064720158, "mean_index_memory": 39892816, "std_index_memory": 0.0, "mean_search_time": 24183200250.8, "std_search_time": 365741686.15848285, "mean_search_memory": 1485103.4, "std_search_memory": 28.29840984931839, "p90": 245789731.3, "p99": 247716863.32}, "metadata": {"indexed_docs": 100000, "embedding_size": 128, "docs_per_request": 32, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}, {"name": "Big embeddings", "page": "indexer_comparison", "iterations": 5, "results": {"mean_index_time": 11407799365.6, "std_index_time": 927906708.8663467, "mean_index_memory": 39892848, "std_index_memory": 0.0, "mean_search_time": 22219834503.4, "std_search_time": 3892685783.9269333, "mean_search_memory": 1403646.6, "std_search_memory": 45.41805808266135, "p90": 241667129.1, "p99": 249390008.06}, "metadata": {"indexed_docs": 100000, "embedding_size": 512, "docs_per_request": 1, "num_requests": 100, "dam_index": true, "warmup_embeddings": true}}] -------------------------------------------------------------------------------- /src/document_conversions_blob_image_uri_text.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pytest 5 | from jina import Document 6 | 7 | from .pages import Pages 8 | from .utils.benchmark import benchmark_time 9 | 10 | """ 11 | This file contains tests for the following methods from Document: 12 | 13 | - load_uri_to_image_blob 14 | - convert_image_buffer_to_blob 15 | - convert_image_datauri_to_blob 16 | - convert_buffer_to_blob 17 | - convert_image_blob_to_uri 18 | - convert_blob_to_buffer 19 | - load_uri_to_buffer 20 | - convert_uri_to_datauri 21 | - convert_buffer_to_uri 22 | - convert_text_to_uri 23 | - load_uri_to_text 24 | - convert_content_to_uri 25 | """ 26 | 27 | 28 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 29 | 30 | 31 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 32 | def test_document_load_uri_to_image_blob(num_docs, json_writer): 33 | def _input_docs(): 34 | image_dir = os.path.join(cur_dir, "utils", "test.png") 35 | return (), dict(docs=[Document(uri=image_dir) for _ in range(num_docs)]) 36 | 37 | def _load_uri_to_image_blob(docs): 38 | for doc in docs: 39 | doc.load_uri_to_image_blob() 40 | 41 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_image_blob) 42 | 43 | json_writer.append( 44 | page=Pages.DOCUMENT_CONVERSION, 45 | result=result, 46 | metadata=dict(num_docs=num_docs), 47 | ) 48 | 49 | 50 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 51 | def test_document_convert_uri_to_buffer(num_docs, json_writer): 52 | def _input_docs(): 53 | image_dir = os.path.join(cur_dir, "utils", "test.png") 54 | docs = [] 55 | for _ in range(num_docs): 56 | doc = Document(uri=image_dir) 57 | docs.append(doc) 58 | 59 | return (), dict(docs=docs) 60 | 61 | def _load_uri_to_buffer(docs): 62 | for doc in docs: 63 | doc.load_uri_to_buffer() 64 | 65 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_buffer) 66 | 67 | json_writer.append( 68 | page=Pages.DOCUMENT_CONVERSION, 69 | result=result, 70 | metadata=dict(num_docs=num_docs), 71 | ) 72 | 73 | 74 
| @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 75 | def test_document_convert_image_buffer_to_blob(num_docs, json_writer): 76 | def _input_docs(): 77 | image_dir = os.path.join(cur_dir, "utils", "test.png") 78 | docs = [] 79 | for _ in range(num_docs): 80 | doc = Document(uri=image_dir) 81 | doc.load_uri_to_buffer() 82 | docs.append(doc) 83 | 84 | return (), dict(docs=docs) 85 | 86 | def _image_buffer_to_blob(docs): 87 | for doc in docs: 88 | doc.convert_buffer_to_image_blob() 89 | 90 | result = benchmark_time(setup=_input_docs, func=_image_buffer_to_blob) 91 | 92 | json_writer.append( 93 | page=Pages.DOCUMENT_CONVERSION, 94 | result=result, 95 | metadata=dict(num_docs=num_docs), 96 | ) 97 | 98 | 99 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 100 | def test_document_convert_image_datauri_to_blob(num_docs, json_writer): 101 | def _input_docs(): 102 | image_dir = os.path.join(cur_dir, "utils", "test.png") 103 | docs = [] 104 | for _ in range(num_docs): 105 | doc = Document(uri=image_dir) 106 | doc.convert_uri_to_datauri() 107 | docs.append(doc) 108 | 109 | return (), dict(docs=docs) 110 | 111 | def _load_uri_to_image_blob(docs): 112 | for doc in docs: 113 | doc.load_uri_to_image_blob() 114 | 115 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_image_blob) 116 | 117 | json_writer.append( 118 | page=Pages.DOCUMENT_CONVERSION, 119 | result=result, 120 | metadata=dict(num_docs=num_docs), 121 | ) 122 | 123 | 124 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 125 | def test_document_convert_uri_to_datauri(num_docs, json_writer): 126 | def _input_docs(): 127 | image_dir = os.path.join(cur_dir, "utils", "test.png") 128 | docs = [] 129 | for _ in range(num_docs): 130 | doc = Document(uri=image_dir) 131 | docs.append(doc) 132 | 133 | return (), dict(docs=docs) 134 | 135 | def _convert_uri_to_datauri(docs): 136 | for doc in docs: 137 | doc.convert_uri_to_datauri() 138 | 139 | result = benchmark_time(setup=_input_docs, func=_convert_uri_to_datauri) 140 | 141 | json_writer.append( 142 | page=Pages.DOCUMENT_CONVERSION, 143 | result=result, 144 | metadata=dict(num_docs=num_docs), 145 | ) 146 | 147 | 148 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 149 | def test_document_convert_buffer_to_blob(num_docs, json_writer): 150 | def _input_docs(): 151 | return ( 152 | (), 153 | dict( 154 | docs=[ 155 | Document(content=np.random.random((85, 152, 3))) 156 | for _ in range(num_docs) 157 | ] 158 | ), 159 | ) 160 | 161 | def _convert_buffer_to_blob(docs): 162 | for doc in docs: 163 | doc.convert_buffer_to_blob() 164 | 165 | result = benchmark_time(setup=_input_docs, func=_convert_buffer_to_blob) 166 | 167 | json_writer.append( 168 | page=Pages.DOCUMENT_CONVERSION, 169 | result=result, 170 | metadata=dict(num_docs=num_docs), 171 | ) 172 | 173 | 174 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 175 | def test_document_convert_image_blob_to_uri(num_docs, json_writer): 176 | def _input_docs(): 177 | return ( 178 | (), 179 | dict( 180 | docs=[ 181 | Document(content=np.random.randint(0, 255, 32 * 28)) 182 | for _ in range(num_docs) 183 | ] 184 | ), 185 | ) 186 | 187 | def _convert_image_blob_to_uri(docs): 188 | for doc in docs: 189 | doc.convert_image_blob_to_uri() 190 | 191 | result = benchmark_time(setup=_input_docs, func=_convert_image_blob_to_uri) 192 | 193 | json_writer.append( 194 | page=Pages.DOCUMENT_CONVERSION, 195 | result=result, 196 | metadata=dict(num_docs=num_docs), 197 | ) 198 | 199 | 200 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 201 
| def test_document_convert_content_to_uri(num_docs, json_writer): 202 | def _input_docs(): 203 | return ( 204 | (), 205 | dict( 206 | docs=[ 207 | Document(content=np.random.randint(0, 255, 32 * 28)) 208 | for _ in range(num_docs) 209 | ] 210 | ), 211 | ) 212 | 213 | def _convert_content_to_uri(docs): 214 | for doc in docs: 215 | _ = doc.convert_content_to_uri() 216 | 217 | result = benchmark_time(setup=_input_docs, func=_convert_content_to_uri) 218 | 219 | json_writer.append( 220 | page=Pages.DOCUMENT_CONVERSION, 221 | result=result, 222 | metadata=dict(num_docs=num_docs), 223 | ) 224 | 225 | 226 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 227 | def test_document_convert_text_to_uri(num_docs, json_writer): 228 | def _input_docs(): 229 | return ( 230 | (), 231 | dict( 232 | docs=[ 233 | Document(content=np.random.randint(0, 255, 32 * 28)) 234 | for _ in range(num_docs) 235 | ] 236 | ), 237 | ) 238 | 239 | def _convert_text_to_uri(docs): 240 | for doc in docs: 241 | _ = doc.dump_text_to_datauri() 242 | 243 | result = benchmark_time(setup=_input_docs, func=_convert_text_to_uri) 244 | 245 | json_writer.append( 246 | page=Pages.DOCUMENT_CONVERSION, 247 | result=result, 248 | metadata=dict(num_docs=num_docs), 249 | ) 250 | 251 | 252 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 253 | def test_document_convert_buffer_to_uri(num_docs, json_writer): 254 | def _input_docs(): 255 | return ( 256 | (), 257 | dict( 258 | docs=[ 259 | Document(uri=os.path.join(cur_dir, "utils", "test.png")) 260 | for _ in range(num_docs) 261 | ] 262 | ), 263 | ) 264 | 265 | def _convert_buffer_to_uri(docs): 266 | for doc in docs: 267 | _ = doc.convert_buffer_to_uri() 268 | 269 | result = benchmark_time(setup=_input_docs, func=_convert_buffer_to_uri) 270 | 271 | json_writer.append( 272 | page=Pages.DOCUMENT_CONVERSION, 273 | result=result, 274 | metadata=dict(num_docs=num_docs), 275 | ) 276 | 277 | 278 | @pytest.mark.parametrize("num_docs", [1, 5]) 279 | def test_document_load_uri_to_text(num_docs, json_writer): 280 | def _input_docs(): 281 | return ( 282 | (), 283 | dict( 284 | docs=[ 285 | Document(uri="http://google.com/index.html", mime_type="text/html") 286 | for _ in range(num_docs) 287 | ] 288 | ), 289 | ) 290 | 291 | def _load_uri_to_text(docs): 292 | for doc in docs: 293 | _ = doc.load_uri_to_text() 294 | 295 | result = benchmark_time(setup=_input_docs, func=_load_uri_to_text) 296 | 297 | json_writer.append( 298 | page=Pages.DOCUMENT_CONVERSION, 299 | result=result, 300 | metadata=dict(num_docs=num_docs), 301 | ) 302 | 303 | 304 | @pytest.mark.parametrize("num_docs", [1, 100, 1000]) 305 | def test_document_convert_blob_to_buffer(num_docs, json_writer): 306 | def _input_docs(): 307 | return ( 308 | (), 309 | dict( 310 | docs=[ 311 | Document(content=np.random.randint(0, 255, 32 * 28)) 312 | for _ in range(num_docs) 313 | ] 314 | ), 315 | ) 316 | 317 | def _convert_blob_to_buffer(docs): 318 | for doc in docs: 319 | _ = doc.convert_blob_to_buffer() 320 | 321 | result = benchmark_time(setup=_input_docs, func=_convert_blob_to_buffer) 322 | 323 | json_writer.append( 324 | page=Pages.DOCUMENT_CONVERSION, 325 | result=result, 326 | metadata=dict(num_docs=num_docs), 327 | ) 328 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1.
Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/document_construct.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import numpy as np 5 | import pytest 6 | from jina import Document 7 | 8 | from .pages import Pages 9 | from .utils.benchmark import benchmark_time 10 | 11 | 12 | def _generate_random_text(text_length): 13 | return ''.join( 14 | random.choice(string.ascii_uppercase + string.digits) 15 | for _ in range(text_length) 16 | ) 17 | 18 | 19 | def _generate_random_buffer(buffer_length): 20 | return bytes(bytearray(random.getrandbits(8) for _ in range(buffer_length))) 21 | 22 | 23 | def _generate_random_blob(num_dims): 24 | # 1 and 3 can cover from audio signals to images. 
3 dimensions make the memory too high 25 | shape = [random.randint(100, 200)] * num_dims 26 | 27 | return np.random.rand(*shape) 28 | 29 | 30 | def _generate_random_document( 31 | origin, text_length=None, buffer_length=None, num_dims=None 32 | ): 33 | tags = {'tag1': [0, 2, 3], 'tag2': 'value of tag2'} 34 | if origin == 'text': 35 | return Document(text=_generate_random_text(text_length), tags=tags) 36 | if origin == 'blob': 37 | return Document(blob=_generate_random_blob(num_dims), tags=tags) 38 | if origin == 'buffer': 39 | return Document(buffer=_generate_random_buffer(buffer_length), tags=tags) 40 | 41 | 42 | def _generate_random_document_with_chunks_and_matches( 43 | origin, text_length=None, buffer_length=None, num_dims=None 44 | ): 45 | root = _generate_random_document(origin, text_length, buffer_length, num_dims) 46 | 47 | num_chunks = random.randint(1, 20) 48 | num_matches = random.randint(1, 20) 49 | for _ in range(num_chunks): 50 | root.chunks.append( 51 | _generate_random_document(origin, text_length, buffer_length, num_dims) 52 | ) 53 | for _ in range(num_matches): 54 | root.matches.append( 55 | _generate_random_document(origin, text_length, buffer_length, num_dims) 56 | ) 57 | return root 58 | 59 | 60 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 61 | def test_construct_text(text_length, json_writer): 62 | def _doc_build(text): 63 | Document(text=text) 64 | 65 | result = benchmark_time( 66 | func=_doc_build, kwargs=dict(text=_generate_random_text(text_length)) 67 | ) 68 | 69 | json_writer.append( 70 | page=Pages.DOCUMENT_CONSTRUCT, 71 | result=result, 72 | metadata=dict(text_length=text_length), 73 | ) 74 | 75 | 76 | @pytest.mark.parametrize('num_dims', [1, 2]) 77 | def test_construct_blob(num_dims, json_writer): 78 | def _doc_build(blob): 79 | Document(blob=blob) 80 | 81 | result = benchmark_time( 82 | func=_doc_build, kwargs=dict(blob=_generate_random_blob(num_dims)) 83 | ) 84 | 85 | json_writer.append( 86 | page=Pages.DOCUMENT_CONSTRUCT, 87 | result=result, 88 | metadata=dict(num_dims=num_dims), 89 | ) 90 | 91 | 92 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 93 | def test_construct_buffer(buffer_length, json_writer): 94 | def _doc_build(buffer): 95 | Document(buffer=buffer) 96 | 97 | result = benchmark_time( 98 | func=_doc_build, kwargs=dict(buffer=_generate_random_buffer(buffer_length)) 99 | ) 100 | 101 | json_writer.append( 102 | page=Pages.DOCUMENT_CONSTRUCT, 103 | result=result, 104 | metadata=dict(buffer_length=buffer_length), 105 | ) 106 | 107 | 108 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 109 | def test_construct_btyes_origin_text(text_length, json_writer): 110 | def _doc_build(b): 111 | Document(obj=b) 112 | 113 | result = benchmark_time( 114 | func=_doc_build, 115 | kwargs=dict( 116 | b=_generate_random_document( 117 | 'text', text_length=text_length 118 | ).proto.SerializeToString() 119 | ), 120 | ) 121 | 122 | json_writer.append( 123 | page=Pages.DOCUMENT_CONSTRUCT, 124 | result=result, 125 | metadata=dict(text_length=text_length), 126 | ) 127 | 128 | 129 | @pytest.mark.parametrize('num_dims', [1, 2]) 130 | def test_construct_btyes_origin_blob(num_dims, json_writer): 131 | def _doc_build(b): 132 | Document(obj=b) 133 | 134 | result = benchmark_time( 135 | func=_doc_build, 136 | kwargs=dict( 137 | b=_generate_random_document( 138 | 'blob', num_dims=num_dims 139 | ).proto.SerializeToString() 140 | ), 141 | ) 142 | 143 | json_writer.append( 144 | page=Pages.DOCUMENT_CONSTRUCT, 145 | result=result, 
146 | metadata=dict(num_dims=num_dims), 147 | ) 148 | 149 | 150 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 151 | def test_construct_btyes_origin_buffer(buffer_length, json_writer): 152 | def _doc_build(b): 153 | Document(obj=b) 154 | 155 | result = benchmark_time( 156 | func=_doc_build, 157 | kwargs=dict( 158 | b=_generate_random_document( 159 | 'buffer', buffer_length=buffer_length 160 | ).proto.SerializeToString() 161 | ), 162 | ) 163 | 164 | json_writer.append( 165 | page=Pages.DOCUMENT_CONSTRUCT, 166 | result=result, 167 | metadata=dict(buffer_length=buffer_length), 168 | ) 169 | 170 | 171 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 172 | def test_construct_str_json_origin_text(text_length, json_writer): 173 | def _doc_build(b): 174 | Document(obj=b) 175 | 176 | result = benchmark_time( 177 | func=_doc_build, 178 | kwargs=dict( 179 | b=_generate_random_document('text', text_length=text_length).json() 180 | ), 181 | ) 182 | 183 | json_writer.append( 184 | page=Pages.DOCUMENT_CONSTRUCT, 185 | result=result, 186 | metadata=dict(text_length=text_length), 187 | ) 188 | 189 | 190 | @pytest.mark.parametrize('num_dims', [1, 2]) 191 | def test_construct_str_json_origin_blob(num_dims, json_writer): 192 | def _doc_build(b): 193 | Document(obj=b) 194 | 195 | result = benchmark_time( 196 | func=_doc_build, 197 | kwargs=dict(b=_generate_random_document('blob', num_dims=num_dims).json()), 198 | ) 199 | 200 | json_writer.append( 201 | page=Pages.DOCUMENT_CONSTRUCT, 202 | result=result, 203 | metadata=dict(num_dims=num_dims), 204 | ) 205 | 206 | 207 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 208 | def test_construct_str_json_origin_buffer(buffer_length, json_writer): 209 | def _doc_build(b): 210 | Document(obj=b) 211 | 212 | result = benchmark_time( 213 | func=_doc_build, 214 | kwargs=dict( 215 | b=_generate_random_document('buffer', buffer_length=buffer_length).json() 216 | ), 217 | ) 218 | 219 | json_writer.append( 220 | page=Pages.DOCUMENT_CONSTRUCT, 221 | result=result, 222 | metadata=dict(buffer_length=buffer_length), 223 | ) 224 | 225 | 226 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 227 | def test_construct_dict_origin_text(text_length, json_writer): 228 | def _doc_build(b): 229 | Document(obj=b) 230 | 231 | result = benchmark_time( 232 | func=_doc_build, 233 | kwargs=dict( 234 | b=_generate_random_document('text', text_length=text_length).dict() 235 | ), 236 | ) 237 | 238 | json_writer.append( 239 | page=Pages.DOCUMENT_CONSTRUCT, 240 | result=result, 241 | metadata=dict(text_length=text_length), 242 | ) 243 | 244 | 245 | @pytest.mark.parametrize('num_dims', [1, 2]) 246 | def test_construct_dict_origin_blob(num_dims, json_writer): 247 | def _doc_build(b): 248 | Document(obj=b) 249 | 250 | result = benchmark_time( 251 | func=_doc_build, 252 | kwargs=dict(b=_generate_random_document('blob', num_dims=num_dims).dict()), 253 | ) 254 | 255 | json_writer.append( 256 | page=Pages.DOCUMENT_CONSTRUCT, 257 | result=result, 258 | metadata=dict(num_dims=num_dims), 259 | ) 260 | 261 | 262 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 263 | def test_construct_dict_origin_buffer(buffer_length, json_writer): 264 | def _doc_build(b): 265 | Document(obj=b) 266 | 267 | result = benchmark_time( 268 | func=_doc_build, 269 | kwargs=dict( 270 | b=_generate_random_document('buffer', buffer_length=buffer_length).dict() 271 | ), 272 | ) 273 | 274 | json_writer.append( 275 | page=Pages.DOCUMENT_CONSTRUCT, 276 | 
result=result, 277 | metadata=dict(buffer_length=buffer_length), 278 | ) 279 | 280 | 281 | @pytest.mark.parametrize('copy', [True, False]) 282 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 283 | def test_construct_document_origin_text(copy, text_length, json_writer): 284 | def _doc_build(d): 285 | Document(obj=d, copy=copy) 286 | 287 | _doc_build(d=_generate_random_document('text', text_length)) 288 | 289 | result = benchmark_time( 290 | func=_doc_build, 291 | kwargs=dict(d=_generate_random_document('text', text_length)), 292 | ) 293 | 294 | json_writer.append( 295 | page=Pages.DOCUMENT_CONSTRUCT, 296 | result=result, 297 | metadata=dict(text_length=text_length, copy=copy), 298 | ) 299 | 300 | 301 | @pytest.mark.parametrize('copy', [True, False]) 302 | @pytest.mark.parametrize('num_dims', [1, 2]) 303 | def test_construct_document_origin_blob(copy, num_dims, json_writer): 304 | def _doc_build(d): 305 | Document(obj=d, copy=copy) 306 | 307 | result = benchmark_time( 308 | func=_doc_build, 309 | kwargs=dict(d=_generate_random_document('blob', num_dims=num_dims)), 310 | ) 311 | 312 | json_writer.append( 313 | page=Pages.DOCUMENT_CONSTRUCT, 314 | result=result, 315 | metadata=dict(num_dims=num_dims, copy=copy), 316 | ) 317 | 318 | 319 | @pytest.mark.parametrize('copy', [True, False]) 320 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 321 | def test_construct_document_origin_buffer(copy, buffer_length, json_writer): 322 | def _doc_build(d): 323 | Document(obj=d, copy=copy) 324 | 325 | result = benchmark_time( 326 | func=_doc_build, 327 | kwargs=dict(d=_generate_random_document('buffer', buffer_length=buffer_length)), 328 | ) 329 | 330 | json_writer.append( 331 | page=Pages.DOCUMENT_CONSTRUCT, 332 | result=result, 333 | metadata=dict(buffer_length=buffer_length, copy=copy), 334 | ) 335 | 336 | 337 | @pytest.mark.parametrize('copy', [True, False]) 338 | @pytest.mark.parametrize('text_length', [10, 100, 1000, 10000]) 339 | def test_construct_document_origin_text_proto(copy, text_length, json_writer): 340 | def _doc_build(d): 341 | Document(obj=d, copy=copy) 342 | 343 | result = benchmark_time( 344 | func=_doc_build, 345 | kwargs=dict(d=_generate_random_document('text', text_length).proto), 346 | ) 347 | 348 | json_writer.append( 349 | page=Pages.DOCUMENT_CONSTRUCT, 350 | result=result, 351 | metadata=dict(text_length=text_length, copy=copy), 352 | ) 353 | 354 | 355 | @pytest.mark.parametrize('copy', [True, False]) 356 | @pytest.mark.parametrize('num_dims', [1, 2]) 357 | def test_construct_document_origin_blob_proto(copy, num_dims, json_writer): 358 | def _doc_build(d): 359 | Document(obj=d, copy=copy) 360 | 361 | result = benchmark_time( 362 | func=_doc_build, 363 | kwargs=dict(d=_generate_random_document('blob', num_dims=num_dims).proto), 364 | ) 365 | 366 | json_writer.append( 367 | page=Pages.DOCUMENT_CONSTRUCT, 368 | result=result, 369 | metadata=dict(num_dims=num_dims, copy=copy), 370 | ) 371 | 372 | 373 | @pytest.mark.parametrize('copy', [True, False]) 374 | @pytest.mark.parametrize('buffer_length', [10, 1000, 100000]) 375 | def test_construct_document_origin_buffer_proto(copy, buffer_length, json_writer): 376 | def _doc_build(d): 377 | Document(obj=d, copy=copy) 378 | 379 | result = benchmark_time( 380 | func=_doc_build, 381 | kwargs=dict( 382 | d=_generate_random_document('buffer', buffer_length=buffer_length).proto 383 | ), 384 | ) 385 | 386 | json_writer.append( 387 | page=Pages.DOCUMENT_CONSTRUCT, 388 | result=result, 389 | 
metadata=dict(buffer_length=buffer_length, copy=copy), 390 | ) 391 | -------------------------------------------------------------------------------- /scripts/site_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import json 4 | import os 5 | import copy 6 | from collections import defaultdict 7 | from distutils.version import LooseVersion 8 | from pathlib import Path 9 | from typing import Any, Dict, List, Tuple, Union, Optional 10 | 11 | 12 | COLOR_VALUES = [ 13 | '#10a100', 14 | '#7ead14', 15 | '#bab73c', 16 | '#e8c268', 17 | '#e59838', 18 | '#e36717', 19 | '#de1414', 20 | ] 21 | 22 | COLOR_NAN = '#9b00a1' 23 | 24 | NOT_A_NUMBER = 'N/A' 25 | 26 | STD_MEAN_THRESHOLD = 0.5 27 | 28 | COLOR_LEGEND = ' | '.join( 29 | [ 30 | f'{i*10} - {(i+1)*10}%' 31 | for i, color in enumerate(COLOR_VALUES) 32 | ] 33 | ) 34 | 35 | LEGEND = f""" 36 | The following data should be read as follows: 37 | 38 | - Colors of cells display the percentage of the minimum value in the column:\n 39 | {COLOR_LEGEND} 40 | - 1337: unstable tests with "standard deviation / mean > {STD_MEAN_THRESHOLD}" 41 | """ 42 | 43 | 44 | def _format(data: Union[int, float]) -> Any: 45 | if isinstance(data, bool): 46 | return str(data) 47 | elif isinstance(data, int) or isinstance(data, float): 48 | if data >= 1000: 49 | _data = data 50 | i = 0 51 | while abs(_data) >= 1000: 52 | i += 1 53 | _data /= 1000 54 | 55 | if isinstance(data, int): 56 | return '%d%s' % (_data, ['', 'K', 'M', 'G', 'T', 'P'][i]) 57 | else: 58 | return '%.2f%s' % (_data, ['', 'K', 'M', 'G', 'T', 'P'][i]) 59 | else: 60 | i = 1 61 | _data = round(data, i) 62 | while _data == 0 and i <= 5: 63 | i += 1 64 | _data = round(data, i) 65 | 66 | return _data 67 | else: 68 | return data 69 | 70 | 71 | def _get_color(mean_time, master_mean_time): 72 | if mean_time is None or mean_time == NOT_A_NUMBER or master_mean_time == 0: 73 | return COLOR_NAN 74 | raw_bucket = int((float(mean_time) / float(master_mean_time) - 1) * 10) 75 | bucket = max(0, min(6, raw_bucket)) 76 | 77 | return COLOR_VALUES[bucket] 78 | 79 | 80 | def _get_cleaned_mean_time(time: Optional[int], scaling: int) -> str: 81 | """Return cleaned data""" 82 | 83 | if time is not None: 84 | return str(int(int(time) / scaling)) 85 | else: 86 | return NOT_A_NUMBER 87 | 88 | 89 | def _cleaned_title(raw_heading: str) -> str: 90 | """Return cleaned title of artifact name.""" 91 | return raw_heading.replace('test_', '').replace('_', ' ').title() 92 | 93 | 94 | def is_test_unstable(run_stats): 95 | mean = run_stats.get('mean_time', 1e20) 96 | return mean != 0 and run_stats.get('std_time', 0.0) / mean > STD_MEAN_THRESHOLD 97 | 98 | 99 | def _get_table_header(raw_data: List[Dict[str, Any]]) -> Tuple[str, str]: 100 | """Return metadata table title and table separator.""" 101 | titles = {} 102 | for test_run in raw_data: 103 | for name in test_run['metadata']: 104 | titles[name] = [] 105 | break 106 | separators = [] 107 | for result in raw_data: 108 | separators.append('---:') 109 | for field in titles: 110 | if 'metadata' in result: 111 | value = result['metadata'].get(field, 'N/A') 112 | titles[field].append(f'**{value}**') 113 | 114 | else: 115 | titles[field].append('**N/A**') 116 | final = [] 117 | for title, values in titles.items(): 118 | final.append(f'| **{title}** | {" | ".join(values)} |\n') 119 | header = f'{final[0]}| :---: | {" | ".join(separators)} |\n{"".join(final[1:])}' 120 | return header 121 | 122 | 123 | def 
_get_version_list(artifacts_dir: str) -> List[str]: 124 | """Generates sorted list of all versions found in reports. 125 | 126 | Args: 127 | artifacts_dir: Absolute path to artifact directory. 128 | 129 | Return: List of versions found in reports. 130 | """ 131 | lv = [] 132 | 133 | for folder in os.listdir(artifacts_dir): 134 | if os.path.isfile(os.path.join(artifacts_dir, folder, 'report.json')): 135 | lv.append(LooseVersion(folder)) 136 | 137 | lv.sort() 138 | sorted_dev = [v.vstring for v in lv] 139 | 140 | import re 141 | 142 | p = re.compile('dev\\d+$') 143 | 144 | i = 0 145 | while i + 1 < len(sorted_dev): 146 | tmp = sorted_dev[i] 147 | m = p.search(sorted_dev[i + 1]) 148 | if m and sorted_dev[i + 1].startswith(tmp): 149 | sorted_dev[i] = sorted_dev[i + 1] 150 | sorted_dev[i + 1] = tmp 151 | i += 1 152 | 153 | version_list = [sorted_dev[i - 1] for i in range(len(sorted_dev), 0, -1)] 154 | 155 | return version_list 156 | 157 | 158 | def _get_cum_data(version_list: List[str], artifacts_dir: str) -> Dict[Any, Any]: 159 | """Generates cumulative data and return in a dict. 160 | 161 | Args: 162 | version_list: List of versions found in reports. 163 | artifacts_dir: Absolute path to artifact directory. 164 | 165 | Return: Dict of cumulative data 166 | """ 167 | data: Dict[Any, Any] = defaultdict( 168 | lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) 169 | ) 170 | 171 | for version in version_list: 172 | report_file = os.path.join(artifacts_dir, version, 'report.json') 173 | searchers_compare_file = os.path.join( 174 | artifacts_dir, version, 'searchers_compare.json' 175 | ) 176 | 177 | if os.path.isfile(report_file): 178 | with open(report_file) as fp: 179 | _raw_data = json.load(fp) 180 | 181 | if os.path.isfile(searchers_compare_file): 182 | with open(searchers_compare_file) as fp: 183 | _raw_data.extend(json.load(fp)) 184 | 185 | for i in _raw_data: 186 | page = i.get('page', 'unsorted_tests') 187 | test_name = i['name'] 188 | metadata_hash = _hash_run(i) 189 | 190 | data[page][test_name][version][metadata_hash] = i 191 | 192 | return data 193 | 194 | 195 | def generate_homepage(output_dir: str) -> None: 196 | """This generate required homepage for the website. 197 | 198 | Args: 199 | output_dir: Absolute path to Hugo content directory. 
200 | """ 201 | src = os.path.join(os.getcwd(), 'README.md') 202 | dst = os.path.join(output_dir, '_index.md') 203 | Path(output_dir).mkdir(parents=True, exist_ok=True) 204 | 205 | if os.path.isfile(src): 206 | with open(src) as f: 207 | data = f.read() 208 | 209 | with open(dst, 'w') as fp: 210 | fp.write('---\n') 211 | fp.write('title: Benchmark Jina\n') 212 | fp.write('type: docs\n') 213 | fp.write('---\n') 214 | fp.write(data) 215 | 216 | 217 | def _hash_run(d): 218 | tmp_dict = copy.deepcopy(d) 219 | tmp_dict.pop('mean_time', None) 220 | tmp_dict.pop('std_time', None) 221 | tmp_dict.pop('iterations', None) 222 | tmp_dict.pop('results', None) 223 | 224 | return json.dumps(tmp_dict, sort_keys=True) 225 | 226 | 227 | def _get_stats(test_data, latest_version): 228 | results = defaultdict(dict) 229 | for version, test_results in test_data.items(): 230 | for test_result in test_results.values(): 231 | parameter_hash = _hash_run(test_result) 232 | metadata = test_result.get('metadata', {}) 233 | if not metadata: 234 | metadata = {'name': test_result['name']} 235 | results[parameter_hash]['metadata'] = metadata 236 | 237 | results[parameter_hash]['min'] = min( 238 | results[parameter_hash].get('min', 1e20), test_result['mean_time'] 239 | ) 240 | results[parameter_hash]['max'] = max( 241 | results[parameter_hash].get('max', 0), test_result['mean_time'] 242 | ) 243 | results[parameter_hash]['parameter_hash'] = parameter_hash 244 | 245 | if version == latest_version: 246 | results[parameter_hash]['last_version_mean'] = test_result['mean_time'] 247 | 248 | stats = list(results.values()) 249 | _add_scaling(stats) 250 | return stats 251 | 252 | 253 | def _get_one_version_stats(test_results): 254 | results = defaultdict(lambda x: 1e20) 255 | results['min_mean_docs_per_sec'] = 0 256 | 257 | for test in test_results: 258 | results['min_time'] = min(results['min_time'], test['mean_time']) 259 | results['min_memory'] = min(results['min_memory'], test['mean_memory']) 260 | results['min_indexer_memory'] = min( 261 | results['min_indexer_memory'], test['mean_indexer_memory'] 262 | ) 263 | results['min_mean_docs_per_sec'] = max( 264 | results['min_mean_docs_per_sec'], test['mean_mean_docs_per_sec'] 265 | ) 266 | results['min_latency'] = min(results['min_latency'], test['mean_latency']) 267 | 268 | return results 269 | 270 | 271 | def _add_scaling(stats): 272 | for run_stats in stats: 273 | if run_stats['min'] > 10_000_000_000: 274 | run_stats['scaling'] = 1_000_000_000 275 | run_stats['metadata']['unit'] = 's' 276 | if run_stats['min'] > 10_000_000: 277 | run_stats['scaling'] = 1_000_000 278 | run_stats['metadata']['unit'] = 'ms' 279 | elif run_stats['min'] > 10_000: 280 | run_stats['scaling'] = 1_000 281 | run_stats['metadata']['unit'] = 'μs' 282 | else: 283 | run_stats['scaling'] = 1 284 | run_stats['metadata']['unit'] = 'ns' 285 | run_stats['min'] = int(run_stats['min'] / run_stats['scaling']) 286 | run_stats['max'] = int(run_stats['max'] / run_stats['scaling']) 287 | 288 | 289 | def generate_docs( 290 | version_list: List[str], cum_data: Dict[Any, Any], output_dir: str 291 | ) -> None: 292 | """This generate required docs from artifacts. 293 | 294 | Args: 295 | version_list: List of versions found in reports. 296 | cum_data: Cumulative data in Dict. 297 | output_dir: Absolute path to Hugo docs directory. 
298 | """ 299 | Path(output_dir).mkdir(parents=True, exist_ok=True) 300 | 301 | for page, page_data in cum_data.items(): 302 | output_file = os.path.join(output_dir, f'{page}.md') 303 | if page == 'indexer_comparison': 304 | generate_comparison_test(page_data, output_file, _cleaned_title(page)) 305 | else: 306 | generate_versioned_test(page_data, output_file, _cleaned_title(page)) 307 | 308 | 309 | def _get_last_version(single_test_data): 310 | versions = list(single_test_data.keys()) 311 | if versions: 312 | return max(versions) 313 | else: 314 | return None 315 | 316 | 317 | def generate_versioned_test(page_data, output_file, title): 318 | with open(output_file, 'w') as fp: 319 | fp.write('---\n') 320 | fp.write(f'title: {title}\n') 321 | fp.write('---\n') 322 | fp.write(f'# {title}\n\n') 323 | 324 | fp.write(f'{LEGEND}\n') 325 | 326 | for test_name, single_test_data in page_data.items(): 327 | latest_version = _get_last_version(single_test_data) 328 | 329 | if latest_version is None: 330 | return 331 | 332 | stats = _get_stats(single_test_data, latest_version) 333 | header = _get_table_header(stats) 334 | 335 | fp.write(f'## {_cleaned_title(test_name)}\n') 336 | fp.write(header) 337 | 338 | for version, data_dict in single_test_data.items(): 339 | fp.write(f'| {version} |') 340 | for run in stats: 341 | run_data = data_dict[run['parameter_hash']] 342 | 343 | mean_time = _get_cleaned_mean_time( 344 | run_data.get('mean_time', None), run['scaling'] 345 | ) 346 | color = _get_color(mean_time, run['min']) 347 | 348 | if is_test_unstable(run_data): 349 | mean_time = f'{mean_time}' 350 | 351 | fp.write(f' {mean_time} |') 352 | fp.write('\n') 353 | fp.write('\n') 354 | 355 | 356 | def generate_comparison_test(page_data, output_file, title): 357 | with open(output_file, 'w') as fp: 358 | fp.write('---\n') 359 | fp.write(f'title: {title}\n') 360 | fp.write('---\n') 361 | fp.write(f'# {title}\n\n') 362 | 363 | for test_name, single_test_data in page_data.items(): 364 | latest_version = _get_last_version(single_test_data) 365 | 366 | if latest_version is None: 367 | continue 368 | 369 | table = [] 370 | 371 | test_data = single_test_data[latest_version] 372 | 373 | header = _get_table_header(list(test_data.values())) 374 | 375 | fp.write(f'## {_cleaned_title(test_name)}\n') 376 | fp.write(f'Tests were performed against Jina {latest_version}.\n\n') 377 | fp.write(header) 378 | 379 | table.append( 380 | [ 381 | 'index time in ms', 382 | 'search time in ms', 383 | 'index memory', 384 | 'search memory', 385 | 'p90 in ms', 386 | 'p99 in ms', 387 | 'RPS', 388 | 'Documents per second', 389 | ] 390 | ) 391 | 392 | for run in test_data.values(): 393 | 394 | table.append( 395 | [ 396 | _get_cleaned_mean_time(run['results']['mean_index_time'], 1e6), 397 | _get_cleaned_mean_time(run['results']['mean_search_time'], 1e6), 398 | get_readable_size(run['results']['mean_search_memory']), 399 | get_readable_size(run['results']['mean_index_memory']), 400 | _get_cleaned_mean_time(run['results']['p90'], 1e6), 401 | _get_cleaned_mean_time(run['results']['p99'], 1e6), 402 | get_rps(run), 403 | get_dps(run), 404 | ] 405 | ) 406 | 407 | transposed = list(map(list, zip(*table))) 408 | 409 | fp.write('|\n|'.join(' | '.join(row) for row in transposed)) 410 | fp.write('\n\n') 411 | 412 | 413 | def get_dps(run): 414 | total_docs = run['metadata']['docs_per_request'] * run['metadata']['num_requests'] 415 | dps = total_docs / (run['results']['mean_search_time'] / 1e9) 416 | return f'{dps:.2f}' 417 | 418 | 419 | def get_rps(run): 
420 | rps = run['metadata']['num_requests'] / (run['results']['mean_search_time'] / 1e9) 421 | return f'{rps:.2f}' 422 | 423 | 424 | def get_readable_size(num_bytes: Union[int, float]) -> str: 425 | """ 426 | Transform the bytes into readable value with different units (e.g. 1 KB, 20 MB, 30.1 GB). 427 | 428 | :param num_bytes: Number of bytes. 429 | :return: Human readable string representation. 430 | """ 431 | num_bytes = int(num_bytes) 432 | if num_bytes < 1024: 433 | return f'{num_bytes} Bytes' 434 | elif num_bytes < 1024 ** 2: 435 | return f'{num_bytes / 1024:.1f} KB' 436 | elif num_bytes < 1024 ** 3: 437 | return f'{num_bytes / (1024 ** 2):.1f} MB' 438 | else: 439 | return f'{num_bytes / (1024 ** 3):.1f} GB' 440 | 441 | 442 | def generate_menus(cum_data: Dict[Any, Any], output_dir: str) -> None: 443 | """This generate required menus from artifacts. 444 | 445 | Args: 446 | cum_data: Cumulative data in Dict. 447 | output_dir: Absolute path to Hugo menus directory. 448 | """ 449 | menu_dir = os.path.join(output_dir, 'menu') 450 | menu_index = os.path.join(menu_dir, 'index.md') 451 | Path(menu_dir).mkdir(parents=True, exist_ok=True) 452 | 453 | with open(menu_index, 'w') as fp: 454 | fp.write('---\n') 455 | fp.write('headless: true\n') 456 | fp.write('---\n\n') 457 | 458 | for page in cum_data: 459 | fp.write( 460 | '- [%s]({{< relref "/docs/%s.md" >}})\n' % (_cleaned_title(page), page) 461 | ) 462 | 463 | 464 | def main(): 465 | """This is the main function to call.""" 466 | base_dir = os.path.join(os.getcwd(), 'docs') 467 | content_dir = os.path.join(base_dir, 'content') 468 | docs_dir = os.path.join(content_dir, 'docs') 469 | artifacts_dir = os.path.join(base_dir, 'static/artifacts') 470 | 471 | version_list = _get_version_list(artifacts_dir) 472 | cum_data = _get_cum_data(version_list, artifacts_dir) 473 | 474 | generate_homepage(content_dir) 475 | generate_docs(version_list, cum_data, docs_dir) 476 | generate_menus(cum_data, content_dir) 477 | 478 | 479 | if __name__ == '__main__': 480 | main() 481 | -------------------------------------------------------------------------------- /docs/static/artifacts/2.0.12/report.json: -------------------------------------------------------------------------------- 1 | [{"name": "test_da_append", "page": "document_array_append", "iterations": 161, "mean_time": 12393200, "std_time": 86694, "metadata": {"num_docs_append": 10000}}, {"name": "test_dam_append", "page": "document_array_append", "iterations": 14, "mean_time": 148900621, "std_time": 6612586, "metadata": {"num_docs_append": 10000, "flush": true}}, {"name": "test_dam_append", "page": "document_array_append", "iterations": 22, "mean_time": 92221945, "std_time": 739450, "metadata": {"num_docs_append": 10000, "flush": false}}, {"name": "test_da_clear", "page": "document_array_clear", "iterations": 473, "mean_time": 59096, "std_time": 2631, "metadata": {"num_docs": 100}}, {"name": "test_da_clear", "page": "document_array_clear", "iterations": 5, "mean_time": 8685183, "std_time": 230697, "metadata": {"num_docs": 10000}}, {"name": "test_dam_clear", "page": "document_array_clear", "iterations": 364, "mean_time": 94221, "std_time": 3466, "metadata": {"num_docs": 100}}, {"name": "test_dam_clear", "page": "document_array_clear", "iterations": 5, "mean_time": 1129013, "std_time": 40003, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_repeated_container", "page": "document_array_construct", "iterations": 360373, "mean_time": 3762, "std_time": 357, "metadata": 
{"num_chunks": 10000}}, {"name": "test_construct_document_array_from_another_documentarray", "page": "document_array_construct", "iterations": 700784, "mean_time": 1196, "std_time": 174, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_list_of_documents", "page": "document_array_construct", "iterations": 1064, "mean_time": 1875242, "std_time": 24081, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_generator", "page": "document_array_construct", "iterations": 5, "mean_time": 992149743, "std_time": 13339544, "metadata": {"num_docs": 10000}}, {"name": "test_construct_document_array_from_another_documentarray_memmap", "page": "document_array_construct", "iterations": 5, "mean_time": 582987914, "std_time": 14897190, "metadata": {"num_docs": 10000}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 238, "mean_time": 111895, "std_time": 4055, "metadata": {"num_docs": 100, "num_feat": 128}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 10, "mean_time": 2394109, "std_time": 23696, "metadata": {"num_docs": 10000, "num_feat": 128}}, {"name": "test_dam_embeddings", "page": "document_array_get_attributes", "iterations": 10, "mean_time": 3750944, "std_time": 78874, "metadata": {"num_docs": 10000, "num_feat": 256}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1640, "mean_time": 1211932, "std_time": 14254, "metadata": {"num_docs": 1000, "label": "empty", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 229, "mean_time": 8420927, "std_time": 60451, "metadata": {"num_docs": 1000, "label": "empty", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1629, "mean_time": 1220334, "std_time": 21265, "metadata": {"num_docs": 1000, "label": "blob", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 109, "mean_time": 17562986, "std_time": 207467, "metadata": {"num_docs": 1000, "label": "blob", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1638, "mean_time": 1213585, "std_time": 13941, "metadata": {"num_docs": 1000, "label": "text", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 212, "mean_time": 9137207, "std_time": 56778, "metadata": {"num_docs": 1000, "label": "text", "memmap": true}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 1638, "mean_time": 1213142, "std_time": 8774, "metadata": {"num_docs": 1000, "label": "buffer", "memmap": false}}, {"name": "test_da_extend", "page": "document_array_extend", "iterations": 142, "mean_time": 13528808, "std_time": 135447, "metadata": {"num_docs": 1000, "label": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 1131, "mean_time": 1617980, "std_time": 20063, "metadata": {"num_docs": 100, "field": "blob", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 175, "mean_time": 9298763, "std_time": 77533, "metadata": {"num_docs": 100, "field": "blob", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 2353, "mean_time": 715476, "std_time": 8404, "metadata": {"num_docs": 100, "field": "text", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", 
"iterations": 282, "mean_time": 5872680, "std_time": 34821, "metadata": {"num_docs": 100, "field": "text", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 2287, "mean_time": 735093, "std_time": 10244, "metadata": {"num_docs": 100, "field": "buffer", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 246, "mean_time": 6370017, "std_time": 44060, "metadata": {"num_docs": 100, "field": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 1121, "mean_time": 1634398, "std_time": 18609, "metadata": {"num_docs": 100, "field": "embedding", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 175, "mean_time": 9297756, "std_time": 62184, "metadata": {"num_docs": 100, "field": "embedding", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 11, "mean_time": 177432561, "std_time": 14403633, "metadata": {"num_docs": 10000, "field": "blob", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 965614153, "std_time": 27364696, "metadata": {"num_docs": 10000, "field": "blob", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 22, "mean_time": 80287663, "std_time": 13992505, "metadata": {"num_docs": 10000, "field": "text", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 601798762, "std_time": 18118377, "metadata": {"num_docs": 10000, "field": "text", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 21, "mean_time": 84370381, "std_time": 11803020, "metadata": {"num_docs": 10000, "field": "buffer", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 649465702, "std_time": 18064889, "metadata": {"num_docs": 10000, "field": "buffer", "memmap": true}}, {"name": "test_da_get_attributes", "page": "document_array_get_attributes", "iterations": 11, "mean_time": 176325898, "std_time": 14181744, "metadata": {"num_docs": 10000, "field": "embedding", "memmap": false}}, {"name": "test_dam_get_attributes", "page": "document_array_get_attributes", "iterations": 5, "mean_time": 952172404, "std_time": 19682355, "metadata": {"num_docs": 10000, "field": "embedding", "memmap": true}}, {"name": "test_da_insert", "page": "document_array_insert", "iterations": 454, "mean_time": 168415, "std_time": 4970, "metadata": {"num_docs": 100}}, {"name": "test_da_insert", "page": "document_array_insert", "iterations": 10, "mean_time": 34379713, "std_time": 165342, "metadata": {"num_docs": 10000}}, {"name": "test_match", "page": "document_array_match", "iterations": 5, "mean_time": 2172444973, "std_time": 29236483, "metadata": {"size_X": 10, "size_Y": 100000, "dam_x": false, "dam_y": false, "emb_size": 256, "use_scipy": false, "metric": "euclidean", "top_k": 3}}, {"name": "test_da_save", "page": "document_array_persistence", "iterations": 5, "mean_time": 2532785116, "std_time": 24374942, "metadata": {"num_docs_append": 100000, "file_format": "json"}}, {"name": "test_da_save", "page": "document_array_persistence", "iterations": 5, "mean_time": 462705214, "std_time": 1265027, "metadata": {"num_docs_append": 100000, 
"file_format": "binary"}}, {"name": "test_da_load", "page": "document_array_persistence", "iterations": 5, "mean_time": 8371470563, "std_time": 27786218, "metadata": {"num_docs_append": 100000, "file_format": "json"}}, {"name": "test_da_load", "page": "document_array_persistence", "iterations": 5, "mean_time": 176426036, "std_time": 1138379, "metadata": {"num_docs_append": 100000, "file_format": "binary"}}, {"name": "test_da_reverse", "page": "document_array_insert", "iterations": 433, "mean_time": 370287, "std_time": 7839, "metadata": {"num_docs": 100}}, {"name": "test_da_reverse", "page": "document_array_insert", "iterations": 10, "mean_time": 37988003, "std_time": 153439, "metadata": {"num_docs": 10000}}, {"name": "test_da_save", "page": "document_array_clear", "iterations": 283, "mean_time": 2608163, "std_time": 33340, "metadata": {"num_docs": 100}}, {"name": "test_da_save", "page": "document_array_clear", "iterations": 10, "mean_time": 249012378, "std_time": 1041847, "metadata": {"num_docs": 10000}}, {"name": "test_dam_save", "page": "document_array_clear", "iterations": 362, "mean_time": 94501, "std_time": 4883, "metadata": {"num_docs": 100}}, {"name": "test_dam_save", "page": "document_array_clear", "iterations": 10, "mean_time": 984830, "std_time": 40911, "metadata": {"num_docs": 10000}}, {"name": "test_da_save_binary", "page": "document_array_insert", "iterations": 419, "mean_time": 453235, "std_time": 9762, "metadata": {"num_docs": 100}}, {"name": "test_da_save_binary", "page": "document_array_insert", "iterations": 10, "mean_time": 41394899, "std_time": 359549, "metadata": {"num_docs": 10000}}, {"name": "test_da_load_binary", "page": "document_array_insert", "iterations": 402, "mean_time": 195699, "std_time": 6539, "metadata": {"num_docs": 100}}, {"name": "test_da_load_binary", "page": "document_array_insert", "iterations": 10, "mean_time": 15253357, "std_time": 194289, "metadata": {"num_docs": 10000}}, {"name": "test_da_save_json", "page": "document_array_insert", "iterations": 281, "mean_time": 2679379, "std_time": 1333361, "metadata": {"num_docs": 100}}, {"name": "test_da_save_json", "page": "document_array_insert", "iterations": 10, "mean_time": 249181491, "std_time": 671351, "metadata": {"num_docs": 10000}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 128, "mean_time": 8424379, "std_time": 71019, "metadata": {"num_docs": 100}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 13, "mean_time": 82027979, "std_time": 243711, "metadata": {"num_docs": 1000}}, {"name": "test_da_load_json", "page": "document_array_insert", "iterations": 10, "mean_time": 818522000, "std_time": 8782199, "metadata": {"num_docs": 10000}}, {"name": "test_da_shuffle", "page": "document_array_shuffle", "iterations": 26, "mean_time": 34943751, "std_time": 4587628, "metadata": {"n_nodes": false, "n_docs": 1000}}, {"name": "test_dam_shuffle", "page": "document_array_shuffle", "iterations": 17, "mean_time": 70945594, "std_time": 150713, "metadata": {"n_nodes": true, "n_docs": 1000}}, {"name": "test_da_shuffle", "page": "document_array_shuffle", "iterations": 5, "mean_time": 346844400, "std_time": 2112699, "metadata": {"n_nodes": false, "n_docs": 10000}}, {"name": "test_dam_shuffle", "page": "document_array_shuffle", "iterations": 5, "mean_time": 733615093, "std_time": 16847624, "metadata": {"n_nodes": true, "n_docs": 10000}}, {"name": "test_da_sort", "page": "document_array_sort", "iterations": 20664, "mean_time": 75164, "std_time": 1906, 
"metadata": {"num_docs": 100}}, {"name": "test_da_sort", "page": "document_array_sort", "iterations": 25, "mean_time": 105843792, "std_time": 726113, "metadata": {"num_docs": 100000}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 81, "mean_time": 2997426, "std_time": 40195, "metadata": {"num_docs": 10, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 290059338, "std_time": 15122446, "metadata": {"num_docs": 100, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 3122751565, "std_time": 78540318, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 15347609, "std_time": 3680900, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1540620978, "std_time": 54161616, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 100, "traversal_paths": ["c"], "memmap": false}}, {"name": "test_da_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1507878414, "std_time": 29284556, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 10, "traversal_paths": ["m"], "memmap": false}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 68, "mean_time": 6504295, "std_time": 29828, "metadata": {"num_docs": 10, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 421479163, "std_time": 722261, "metadata": {"num_docs": 100, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 4260193925, "std_time": 11129114, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 100, "traversal_paths": ["r", "c", "m"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 158296238, "std_time": 1791414, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 10, "traversal_paths": ["r"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1290982118, "std_time": 32703763, "metadata": {"num_docs": 1000, "num_matches": 10, "num_chunks": 100, "traversal_paths": ["c"], "memmap": true}}, {"name": "test_dam_traverse_flat", "page": "document_array_traverse", "iterations": 5, "mean_time": 1236756984, "std_time": 3084354, "metadata": {"num_docs": 1000, "num_matches": 100, "num_chunks": 10, "traversal_paths": ["m"], "memmap": true}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 42374, "mean_time": 43724, "std_time": 3600, "metadata": {"text_length": 10}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 42268, "mean_time": 43846, "std_time": 3676, "metadata": {"text_length": 100}}, {"name": "test_construct_text", "page": 
"document_construct", "iterations": 40117, "mean_time": 46332, "std_time": 3744, "metadata": {"text_length": 1000}}, {"name": "test_construct_text", "page": "document_construct", "iterations": 30383, "mean_time": 62289, "std_time": 3938, "metadata": {"text_length": 10000}}, {"name": "test_construct_blob", "page": "document_construct", "iterations": 29621, "mean_time": 63723, "std_time": 4448, "metadata": {"num_dims": 1}}, {"name": "test_construct_blob", "page": "document_construct", "iterations": 4095, "mean_time": 482268, "std_time": 9099, "metadata": {"num_dims": 2}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 44099, "mean_time": 41955, "std_time": 3585, "metadata": {"buffer_length": 10}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 42204, "mean_time": 43984, "std_time": 3672, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_buffer", "page": "document_construct", "iterations": 9544, "mean_time": 205931, "std_time": 5771, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 32269, "mean_time": 58352, "std_time": 2364, "metadata": {"text_length": 10}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 32033, "mean_time": 58774, "std_time": 2674, "metadata": {"text_length": 100}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 30998, "mean_time": 60884, "std_time": 2343, "metadata": {"text_length": 1000}}, {"name": "test_construct_btyes_origin_text", "page": "document_construct", "iterations": 24223, "mean_time": 78870, "std_time": 2523, "metadata": {"text_length": 10000}}, {"name": "test_construct_btyes_origin_blob", "page": "document_construct", "iterations": 29278, "mean_time": 64568, "std_time": 2297, "metadata": {"num_dims": 1}}, {"name": "test_construct_btyes_origin_blob", "page": "document_construct", "iterations": 4572, "mean_time": 431850, "std_time": 6777, "metadata": {"num_dims": 2}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 32567, "mean_time": 57812, "std_time": 2113, "metadata": {"buffer_length": 10}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 31459, "mean_time": 59957, "std_time": 2253, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_btyes_origin_buffer", "page": "document_construct", "iterations": 8801, "mean_time": 223217, "std_time": 4620, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 16096, "mean_time": 119673, "std_time": 7330, "metadata": {"text_length": 10}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 15497, "mean_time": 124415, "std_time": 7154, "metadata": {"text_length": 100}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 12035, "mean_time": 161395, "std_time": 7675, "metadata": {"text_length": 1000}}, {"name": "test_construct_str_json_origin_text", "page": "document_construct", "iterations": 3936, "mean_time": 503038, "std_time": 9843, "metadata": {"text_length": 10000}}, {"name": "test_construct_str_json_origin_blob", "page": "document_construct", "iterations": 11874, "mean_time": 163144, "std_time": 8050, "metadata": {"num_dims": 1}}, {"name": "test_construct_str_json_origin_blob", "page": "document_construct", "iterations": 869, 
"mean_time": 2293109, "std_time": 19065, "metadata": {"num_dims": 2}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 16369, "mean_time": 117579, "std_time": 7311, "metadata": {"buffer_length": 10}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 14688, "mean_time": 131344, "std_time": 7391, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_str_json_origin_buffer", "page": "document_construct", "iterations": 1886, "mean_time": 1052675, "std_time": 13785, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 17670, "mean_time": 108815, "std_time": 7229, "metadata": {"text_length": 10}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 17078, "mean_time": 112703, "std_time": 7089, "metadata": {"text_length": 100}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 13112, "mean_time": 147994, "std_time": 7234, "metadata": {"text_length": 1000}}, {"name": "test_construct_dict_origin_text", "page": "document_construct", "iterations": 4151, "mean_time": 477028, "std_time": 10060, "metadata": {"text_length": 10000}}, {"name": "test_construct_dict_origin_blob", "page": "document_construct", "iterations": 13169, "mean_time": 146925, "std_time": 7940, "metadata": {"num_dims": 1}}, {"name": "test_construct_dict_origin_blob", "page": "document_construct", "iterations": 1459, "mean_time": 1362811, "std_time": 15940, "metadata": {"num_dims": 2}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 17906, "mean_time": 107385, "std_time": 6940, "metadata": {"buffer_length": 10}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 16348, "mean_time": 117840, "std_time": 7103, "metadata": {"buffer_length": 1000}}, {"name": "test_construct_dict_origin_buffer", "page": "document_construct", "iterations": 2307, "mean_time": 859580, "std_time": 13879, "metadata": {"buffer_length": 100000}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 34807, "mean_time": 53876, "std_time": 1945, "metadata": {"text_length": 10, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 39117, "mean_time": 47630, "std_time": 1723, "metadata": {"text_length": 10, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 34636, "mean_time": 54186, "std_time": 1908, "metadata": {"text_length": 100, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 38984, "mean_time": 47823, "std_time": 1852, "metadata": {"text_length": 100, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 33323, "mean_time": 56463, "std_time": 2106, "metadata": {"text_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 37478, "mean_time": 49873, "std_time": 1777, "metadata": {"text_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 26623, "mean_time": 71545, "std_time": 2205, "metadata": {"text_length": 10000, "copy": true}}, {"name": "test_construct_document_origin_text", "page": "document_construct", "iterations": 29069, "mean_time": 
65284, "std_time": 2220, "metadata": {"text_length": 10000, "copy": false}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 30779, "mean_time": 61366, "std_time": 2060, "metadata": {"num_dims": 1, "copy": true}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 35525, "mean_time": 52772, "std_time": 1993, "metadata": {"num_dims": 1, "copy": false}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 4348, "mean_time": 455206, "std_time": 10579, "metadata": {"num_dims": 2, "copy": true}}, {"name": "test_construct_document_origin_blob", "page": "document_construct", "iterations": 3840, "mean_time": 515814, "std_time": 6556, "metadata": {"num_dims": 2, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 34993, "mean_time": 53621, "std_time": 1983, "metadata": {"buffer_length": 10, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 39380, "mean_time": 47328, "std_time": 1792, "metadata": {"buffer_length": 10, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 33838, "mean_time": 55580, "std_time": 1895, "metadata": {"buffer_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 37612, "mean_time": 49709, "std_time": 1829, "metadata": {"buffer_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 9251, "mean_time": 212441, "std_time": 3967, "metadata": {"buffer_length": 100000, "copy": true}}, {"name": "test_construct_document_origin_buffer", "page": "document_construct", "iterations": 9534, "mean_time": 206148, "std_time": 3760, "metadata": {"buffer_length": 100000, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 35327, "mean_time": 53068, "std_time": 1800, "metadata": {"text_length": 10, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 39744, "mean_time": 46794, "std_time": 1689, "metadata": {"text_length": 10, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 35344, "mean_time": 53039, "std_time": 1905, "metadata": {"text_length": 100, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 39792, "mean_time": 46851, "std_time": 1799, "metadata": {"text_length": 100, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 34108, "mean_time": 55133, "std_time": 1851, "metadata": {"text_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 38505, "mean_time": 48443, "std_time": 1758, "metadata": {"text_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 26900, "mean_time": 70817, "std_time": 2269, "metadata": {"text_length": 10000, "copy": true}}, {"name": "test_construct_document_origin_text_proto", "page": "document_construct", "iterations": 29545, "mean_time": 64234, "std_time": 2155, "metadata": {"text_length": 10000, "copy": false}}, {"name": "test_construct_document_origin_blob_proto", 
"page": "document_construct", "iterations": 31489, "mean_time": 59972, "std_time": 1967, "metadata": {"num_dims": 1, "copy": true}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 35529, "mean_time": 52720, "std_time": 1824, "metadata": {"num_dims": 1, "copy": false}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 6729, "mean_time": 293396, "std_time": 4630, "metadata": {"num_dims": 2, "copy": true}}, {"name": "test_construct_document_origin_blob_proto", "page": "document_construct", "iterations": 6744, "mean_time": 292839, "std_time": 4633, "metadata": {"num_dims": 2, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 35779, "mean_time": 52402, "std_time": 1789, "metadata": {"buffer_length": 10, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 40479, "mean_time": 46001, "std_time": 1741, "metadata": {"buffer_length": 10, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 34578, "mean_time": 54367, "std_time": 2073, "metadata": {"buffer_length": 1000, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 38547, "mean_time": 48371, "std_time": 2248, "metadata": {"buffer_length": 1000, "copy": false}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 9301, "mean_time": 211422, "std_time": 4100, "metadata": {"buffer_length": 100000, "copy": true}}, {"name": "test_construct_document_origin_buffer_proto", "page": "document_construct", "iterations": 9618, "mean_time": 204463, "std_time": 3658, "metadata": {"buffer_length": 100000, "copy": false}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 716198, "mean_time": 1099, "std_time": 189, "metadata": {"text_length": 10}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 707005, "mean_time": 1145, "std_time": 400, "metadata": {"text_length": 100}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 680565, "mean_time": 1249, "std_time": 210, "metadata": {"text_length": 1000}}, {"name": "test_get_attributes_text", "page": "document_get_attributes", "iterations": 521967, "mean_time": 2038, "std_time": 257, "metadata": {"text_length": 10000}}, {"name": "test_get_attribute_blob", "page": "document_get_attributes", "iterations": 220350, "mean_time": 7114, "std_time": 547, "metadata": {"num_dims": 1}}, {"name": "test_get_attribute_blob", "page": "document_get_attributes", "iterations": 64347, "mean_time": 29120, "std_time": 2254, "metadata": {"num_dims": 2}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 734820, "mean_time": 1072, "std_time": 165, "metadata": {"buffer_length": 10}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 690877, "mean_time": 1206, "std_time": 187, "metadata": {"buffer_length": 1000}}, {"name": "test_get_attribute_buffer", "page": "document_get_attributes", "iterations": 295309, "mean_time": 5019, "std_time": 925, "metadata": {"buffer_length": 100000}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 862175, "mean_time": 652, "std_time": 132, "metadata": {"text_length": 10}}, {"name": 
"test_get_content_text", "page": "document_property_getter", "iterations": 846074, "mean_time": 681, "std_time": 125, "metadata": {"text_length": 100}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 787490, "mean_time": 831, "std_time": 141, "metadata": {"text_length": 1000}}, {"name": "test_get_content_text", "page": "document_property_getter", "iterations": 594867, "mean_time": 1572, "std_time": 205, "metadata": {"text_length": 10000}}, {"name": "test_get_content_blob", "page": "document_property_getter", "iterations": 228749, "mean_time": 6713, "std_time": 462, "metadata": {"num_dims": 1}}, {"name": "test_get_content_blob", "page": "document_property_getter", "iterations": 77631, "mean_time": 23748, "std_time": 2999, "metadata": {"num_dims": 2}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 863141, "mean_time": 640, "std_time": 136, "metadata": {"buffer_length": 10}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 807124, "mean_time": 754, "std_time": 143, "metadata": {"buffer_length": 1000}}, {"name": "test_get_content_buffer", "page": "document_property_getter", "iterations": 303903, "mean_time": 4802, "std_time": 855, "metadata": {"buffer_length": 100000}}, {"name": "test_get_embedding", "page": "document_property_getter", "iterations": 224089, "mean_time": 6908, "std_time": 466, "metadata": {"buffer_length": 1}}, {"name": "test_get_embedding", "page": "document_property_getter", "iterations": 142062, "mean_time": 12093, "std_time": 1060, "metadata": {"buffer_length": 2}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 220972, "mean_time": 7172, "std_time": 535, "metadata": {"text_length": 10}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 40087, "mean_time": 47985, "std_time": 1413, "metadata": {"text_length": 100}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 4345, "mean_time": 458333, "std_time": 4506, "metadata": {"text_length": 1000}}, {"name": "test_set_attribute_text", "page": "document_set_attributes", "iterations": 440, "mean_time": 4548337, "std_time": 18055, "metadata": {"text_length": 10000}}, {"name": "test_set_attribute_blob", "page": "document_set_attributes", "iterations": 138417, "mean_time": 12280, "std_time": 856, "metadata": {"num_dims": 1}}, {"name": "test_set_attribute_blob", "page": "document_set_attributes", "iterations": 11222, "mean_time": 175538, "std_time": 63380, "metadata": {"num_dims": 2}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 416221, "mean_time": 3026, "std_time": 348, "metadata": {"buffer_length": 10}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 23379, "mean_time": 83689, "std_time": 1746, "metadata": {"buffer_length": 1000}}, {"name": "test_set_attribute_buffer", "page": "document_set_attributes", "iterations": 246, "mean_time": 8143859, "std_time": 47535, "metadata": {"buffer_length": 100000}}, {"name": "test_executor_load_config", "page": "executor", "iterations": 860, "mean_time": 2318388, "std_time": 1396410, "metadata": {}}, {"name": "test_local_flow_start", "page": "flow", "iterations": 5, "mean_time": 724767375, "std_time": 8320444, "metadata": {"flow": "wide", "num_pods": 10}}, {"name": "test_local_flow_close", "page": "flow", "iterations": 5, "mean_time": 256889969, "std_time": 34431738, "metadata": {"flow": "long", 
"num_pods": 10}}, {"name": "test_local_flow_close", "page": "flow", "iterations": 5, "mean_time": 305463673, "std_time": 1149236, "metadata": {"flow": "wide", "num_pods": 10}}, {"name": "test_flow_load_config", "page": "flow", "iterations": 70, "mean_time": 28705205, "std_time": 1096895, "metadata": {"flow": "long", "num_pods": 10}}, {"name": "test_flow_load_config", "page": "flow", "iterations": 54, "mean_time": 37559951, "std_time": 4789177, "metadata": {"flow": "wide", "num_pods": 10}}] --------------------------------------------------------------------------------