├── .gitattributes ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── VERSION.txt ├── applications ├── indexing │ ├── build_image.sh │ ├── create_plaid.py │ ├── indexing_pipeline.py │ ├── launch_indexing_workflow.py │ ├── marco_indexing │ │ ├── colbert_indexing_pipeline.yml │ │ ├── emr_indexing_pipeline.yml │ │ ├── faiss_indexing_pipeline.yml │ │ ├── marco_dataset.py │ │ └── prepare_env.sh │ ├── marco_indexing_workflow.yml │ ├── ray_indexing_pipeline.py │ ├── run-ray-cluster.sh │ ├── stackoverflow_indexing │ │ ├── colbert_indexing_pipeline.yml │ │ ├── emr_indexing_pipeline.yml │ │ ├── faiss_indexing_pipeline.yml │ │ ├── prepare_env.sh │ │ └── stackoverflow_dataset.py │ └── stackoverflow_indexing_workflow.yml └── odqa_pipelines │ ├── config │ ├── env.marco.esds_bm25r_colbert │ ├── env.marco.esds_emr_faq │ ├── env.marco.faiss_dpr │ ├── env.stackoverflow.esds_bm25r_colbert │ ├── env.stackoverflow.esds_emr_faq │ ├── env.stackoverflow.faiss_dpr │ └── env.wiki.plaid_colbertv2 │ ├── docker-compose │ ├── docker-compose-dpr.yml │ ├── docker-compose-gpu-dpr.yml │ ├── docker-compose-gpu.yml │ ├── docker-compose-nginx.yml │ ├── docker-compose-plaid.yml │ └── docker-compose.yml │ ├── launch_pipeline.sh │ └── ui_config │ ├── marco │ └── config.yml │ └── stackoverflow │ └── config.yml ├── conftest.py ├── doc └── workflow_stackoverflow.md ├── docker ├── Dockerfile └── Dockerfile-GPU ├── haystack ├── __init__.py ├── config.py ├── document_stores │ ├── __init__.py │ ├── base.py │ ├── deepsetcloud.py │ ├── elasticsearch.py │ ├── es_converter.py │ ├── faiss.py │ ├── filter_utils.py │ ├── graphdb.py │ ├── memory.py │ ├── memory_knowledgegraph.py │ ├── milvus.py │ ├── opensearch.py │ ├── pinecone.py │ ├── plaid.py │ ├── search_engine.py │ ├── sql.py │ ├── utils.py │ └── weaviate.py ├── environment.py ├── errors.py ├── modeling │ ├── __init__.py │ ├── data_handler │ │ ├── __init__.py │ │ ├── data_silo.py │ │ ├── 
dataloader.py │ │ ├── dataset.py │ │ ├── input_features.py │ │ ├── inputs.py │ │ ├── processor.py │ │ └── samples.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── metrics.py │ │ └── squad.py │ ├── infer.py │ ├── model │ │ ├── __init__.py │ │ ├── adaptive_model.py │ │ ├── biadaptive_model.py │ │ ├── feature_extraction.py │ │ ├── language_model.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── sentence_transformers.py │ │ ├── optimization.py │ │ ├── prediction_head.py │ │ ├── predictions.py │ │ └── triadaptive_model.py │ ├── training │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dpr.py │ │ └── question_answering.py │ ├── utils.py │ └── visual.py ├── nodes │ ├── __init__.py │ ├── _json_schema.py │ ├── answer_generator │ │ ├── __init__.py │ │ ├── base.py │ │ ├── openai.py │ │ └── transformers.py │ ├── audio │ │ ├── __init__.py │ │ ├── _text_to_speech.py │ │ ├── answer_to_speech.py │ │ └── document_to_speech.py │ ├── base.py │ ├── connector │ │ ├── __init__.py │ │ └── crawler.py │ ├── document_classifier │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py │ ├── evaluator │ │ ├── __init__.py │ │ └── evaluator.py │ ├── extractor │ │ ├── __init__.py │ │ └── entity.py │ ├── file_classifier │ │ ├── __init__.py │ │ └── file_type.py │ ├── file_converter │ │ ├── __init__.py │ │ ├── azure.py │ │ ├── base.py │ │ ├── docx.py │ │ ├── image.py │ │ ├── markdown.py │ │ ├── parsr.py │ │ ├── pdf.py │ │ ├── tika.py │ │ └── txt.py │ ├── label_generator │ │ ├── __init__.py │ │ └── pseudo_label_generator.py │ ├── other │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── docs2answers.py │ │ ├── document_merger.py │ │ ├── join.py │ │ ├── join_answers.py │ │ ├── join_docs.py │ │ └── route_documents.py │ ├── preprocessor │ │ ├── __init__.py │ │ ├── base.py │ │ └── preprocessor.py │ ├── prompt │ │ ├── __init__.py │ │ └── prompt_node.py │ ├── query_classifier │ │ ├── __init__.py │ │ ├── base.py │ │ ├── sklearn.py │ │ └── transformers.py │ ├── 
question_generator │ │ ├── __init__.py │ │ └── question_generator.py │ ├── ranker │ │ ├── __init__.py │ │ ├── base.py │ │ ├── colbert_modeling.py │ │ ├── sentence_transformers.py │ │ └── st_modeling.py │ ├── reader │ │ ├── __init__.py │ │ ├── base.py │ │ ├── farm.py │ │ ├── table.py │ │ └── transformers.py │ ├── retriever │ │ ├── __init__.py │ │ ├── _embedding_encoder.py │ │ ├── _losses.py │ │ ├── base.py │ │ ├── dense.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── embedder.py │ │ │ └── retriever.py │ │ ├── sparse.py │ │ └── text2sparql.py │ ├── summarizer │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py │ └── translator │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py ├── pipelines │ ├── __init__.py │ ├── base.py │ ├── config.py │ ├── ray.py │ ├── standard_pipelines.py │ └── utils.py ├── schema.py ├── telemetry.py └── utils │ ├── __init__.py │ ├── augment_squad.py │ ├── cleaning.py │ ├── context_matching.py │ ├── deepsetcloud.py │ ├── doc_store.py │ ├── docker.py │ ├── early_stopping.py │ ├── experiment_tracking.py │ ├── export_utils.py │ ├── import_utils.py │ ├── labels.py │ ├── preprocessing.py │ ├── reflection.py │ ├── squad_data.py │ ├── squad_to_dpr.py │ └── torch_utils.py ├── images ├── odqa_workflow.png ├── pipeline1.PNG ├── pipeline2.PNG ├── pipeline3.PNG └── ui.png ├── nginx ├── Dockerfile └── nginx.conf ├── prepare_env.sh ├── pyproject.toml ├── rest_api ├── LICENSE ├── README.md ├── pyproject.toml ├── rest_api │ ├── __about__.py │ ├── __init__.py │ ├── application.py │ ├── config.py │ ├── controller │ │ ├── __init__.py │ │ ├── document.py │ │ ├── errors │ │ │ ├── __init__.py │ │ │ └── http_error.py │ │ ├── feedback.py │ │ ├── file_upload.py │ │ ├── health.py │ │ ├── search.py │ │ └── utils.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── custom_component.py │ │ ├── pipeline_empty.haystack-pipeline.yml │ │ ├── pipeline_plaid_colbertv2.yml │ │ ├── pipelines.colbertRanker.haystack-pipeline.yml │ │ ├── 
pipelines.haystack-EmbeddingRetriever-pipeline.yml │ │ ├── pipelines.haystack-pipeline.yml │ │ └── pipelines_dpr.haystack-pipeline.yml │ ├── schema.py │ └── utils.py └── test │ ├── __init__.py │ ├── samples │ ├── pdf │ │ ├── sample_pdf_1.pdf │ │ └── sample_pdf_2.pdf │ └── test.haystack-pipeline.yml │ └── test_rest_api.py ├── test ├── __init__.py ├── benchmarks │ ├── README.md │ ├── config.json │ ├── data_scripts │ │ ├── embeddings_slice.py │ │ └── shuffle_passages.py │ ├── distillation_config.json │ ├── model_distillation.py │ ├── nq_to_squad.py │ ├── reader.py │ ├── reader_results.csv │ ├── results_to_json.py │ ├── retriever.py │ ├── retriever_index_results.csv │ ├── retriever_query_results.csv │ ├── retriever_query_results.md │ ├── retriever_simplified.py │ ├── run.py │ ├── templates.py │ └── utils.py ├── conftest.py ├── document_stores │ ├── __init__.py │ ├── test_document_store.py │ ├── test_faiss_and_milvus.py │ ├── test_knowledge_graph.py │ └── test_weaviate.py ├── modeling │ ├── __init__.py │ ├── test_distillation.py │ ├── test_modeling_dpr.py │ ├── test_modeling_inference.py │ ├── test_modeling_prediction_head.py │ ├── test_modeling_processor.py │ ├── test_modeling_processor_saving_loading.py │ ├── test_modeling_question_answering.py │ └── test_tokenization.py ├── nodes │ ├── __init__.py │ ├── test_connector.py │ ├── test_document_classifier.py │ ├── test_extractor.py │ ├── test_file_converter.py │ ├── test_filetype_classifier.py │ ├── test_generator.py │ ├── test_label_generator.py │ ├── test_preprocessor.py │ ├── test_question_generator.py │ ├── test_ranker.py │ ├── test_reader.py │ ├── test_retriever.py │ ├── test_summarizer.py │ ├── test_summarizer_translation.py │ ├── test_table_reader.py │ └── test_translator.py ├── others │ ├── __init__.py │ ├── test_schema.py │ ├── test_telemetry.py │ └── test_utils.py ├── pipelines │ ├── __init__.py │ ├── test_eval.py │ ├── test_pipeline.py │ ├── test_pipeline_debug_and_validation.py │ ├── 
test_pipeline_extractive_qa.py │ ├── test_pipeline_yaml.py │ ├── test_ray.py │ └── test_standard_pipelines.py └── samples │ ├── dc │ ├── documents-stream.response │ ├── matching_test_1.csv │ ├── pipeline_config.json │ └── query_winterfell.response │ ├── docs │ ├── doc_1.txt │ └── doc_2.txt │ ├── docx │ └── sample_docx.docx │ ├── dpr │ └── sample.json │ ├── extensionless_files │ ├── docx_file │ ├── gif_file │ ├── html_file │ ├── jpg_file │ ├── mp3_file │ ├── odt_file │ ├── pdf_file │ ├── png_file │ ├── pptx_file │ ├── txt_file │ ├── wav_file │ └── zip_file │ ├── glove │ └── tiny.txt │ ├── markdown │ └── sample.md │ ├── mmr │ └── sample.json │ ├── pdf │ ├── sample_pdf_1.pdf │ └── sample_pdf_2.pdf │ ├── pipeline │ ├── ray.haystack-pipeline.yml │ └── test.haystack-pipeline.yml │ ├── qa │ ├── answer-offset-wrong.json │ ├── answer-wrong.json │ ├── dev-sample.json │ ├── eval-sample.json │ ├── noanswer.json │ ├── train-sample.json │ └── vanilla.json │ └── squad │ ├── small.json │ ├── tiny.json │ ├── tiny_augmented.json │ └── tiny_passages.json ├── third-party-programs.txt └── ui ├── Dockerfile ├── LICENSE ├── README.md ├── pyproject.toml ├── test ├── __init__.py └── test_ui_utils.py └── ui ├── __about__.py ├── __init__.py ├── eval_labels_example.csv ├── utils.py └── webapp.py /.gitattributes: -------------------------------------------------------------------------------- 1 | examples/odqa_pipelines/faiss_data/marco filter=lfs diff=lfs merge=lfs -text 2 | examples/odqa_pipelines/faiss_data/stackoverflow filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local run files 2 | qa.db 3 | **/qa.db 4 | **/*qa*.db 5 | **/test-reports 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 
| build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 
98 | #Pipfile.lock 99 | 100 | # pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # PyCharm 138 | .idea 139 | 140 | # VSCode 141 | .vscode 142 | 143 | # haystack files 144 | haystack/document_store/qa.db 145 | data 146 | **/mlruns/** 147 | src 148 | tutorials/cache 149 | tutorials/mlruns 150 | tutorials/model 151 | models 152 | saved_models 153 | *_build 154 | rest_api/file-upload/* 155 | **/feedback_squad_direct.json 156 | .DS_Store 157 | 158 | # http cache (requests-cache) 159 | **/http_cache.sqlite 160 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/sentence-transformers"] 2 | path = third_party/sentence-transformers 3 | url = https://github.com/UKPLab/sentence-transformers.git 4 | [submodule "third_party/ColBERT"] 5 | path = third_party/ColBERT 6 | url = https://github.com/kaixuanliu/ColBERT.git 7 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. 
Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 
55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Report a Vulnerability 4 | 5 | Please report security issues or vulnerabilities to the [Intel® Security Center]. 6 | 7 | For more information on how Intel® works to resolve security issues, see 8 | [Vulnerability Handling Guidelines]. 9 | 10 | [Intel® Security Center]:https://www.intel.com/content/www/us/en/security-center/default.html 11 | 12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.12.2 2 | -------------------------------------------------------------------------------- /applications/indexing/build_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # You can remove build-arg http_proxy and https_proxy if your network doesn't need it 5 | 6 | DOCKER_BUILDKIT=0 docker build \ 7 | -f ../../docker/Dockerfile ../../ \ 8 | -t intel/ai-workflows:odqa-haystack-api \ 9 | --network=host \ 10 | --build-arg http_proxy=${http_proxy} \ 11 | --build-arg https_proxy=${https_proxy} \ 12 | --build-arg no_proxy=${no_proxy} 13 | -------------------------------------------------------------------------------- /applications/indexing/create_plaid.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from haystack.document_stores.plaid import PLAIDDocumentStore 6 | 7 | logger = 
logging.getLogger(__name__) 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser("Create an index using PLAID engine as a backend") 11 | parser.add_argument("--checkpoint", type=Path, required=True) 12 | parser.add_argument("--collection", type=Path, required=True) 13 | parser.add_argument("--index-save-path", type=Path, required=True) 14 | parser.add_argument("--gpus", type=int, default=0) 15 | parser.add_argument("--ranks", type=int, default=1) 16 | parser.add_argument("--doc-max-length", type=int, default=120) 17 | parser.add_argument("--query-max-length", type=int, default=60) 18 | parser.add_argument("--kmeans-iterations", type=int, default=4) 19 | parser.add_argument("--name", type=str, default="plaid_index") 20 | parser.add_argument("--nbits", type=int, default=2) 21 | 22 | args = parser.parse_args() 23 | 24 | if args.gpus > 1: 25 | args.ranks = args.gpus 26 | args.amp = True 27 | assert args.ranks > 0 28 | if args.gpus == 0: 29 | assert args.ranks > 0 30 | 31 | store = PLAIDDocumentStore( 32 | index_path=f"{args.index_save_path}", 33 | checkpoint_path=f"{args.checkpoint}", 34 | collection_path=f"{args.collection}", 35 | create=True, 36 | nbits=args.nbits, 37 | gpus=args.gpus, 38 | ranks=args.ranks, 39 | doc_maxlen=args.doc_max_length, 40 | query_maxlen=args.query_max_length, 41 | kmeans_niters=args.kmeans_iterations, 42 | ) 43 | logger.info("Done.") 44 | -------------------------------------------------------------------------------- /applications/indexing/indexing_pipeline.py: -------------------------------------------------------------------------------- 1 | from ray_indexing_pipeline import RayIndexingPipeline 2 | import argparse, time, os 3 | 4 | def run_indexing_pipeline(cfg): 5 | if cfg.enable_sample == 1: 6 | os.environ["ENABLE_SAMPLING_LIMIT"] = "1" 7 | else: 8 | os.environ["ENABLE_SAMPLING_LIMIT"] = "0" 9 | 10 | start = time.time() 11 | pipeline = RayIndexingPipeline.load_from_yaml(path=cfg.pipeline_yaml) 12 | pipeline.run() 13 
| cost = time.time() - start 14 | print(f'Spent {cost}s for pipeline: {cfg.pipeline_yaml}') 15 | 16 | 17 | def parse_cmd(): 18 | desc = 'generate documentstore for marco dataset...\n\n' 19 | args = argparse.ArgumentParser(description=desc, epilog=' ', formatter_class=argparse.RawTextHelpFormatter) 20 | args.add_argument('-p', type=str, default='faiss_indexing_pipeline.yml', dest='pipeline_yaml', help='pipeline config file') 21 | args.add_argument('-s', type=int, default=0, dest='enable_sample', help='Only retrieve 500 samples for indexing') 22 | return args.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | config = parse_cmd() 27 | run_indexing_pipeline(config) 28 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/colbert_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | custom_mapping: 10 | mappings: 11 | properties: 12 | content: 13 | type: text 14 | question_id: 15 | type: integer 16 | question-body: 17 | type: text 18 | index: false 19 | answer: 20 | type: text 21 | index: false 22 | colbert_emb: 23 | type: binary 24 | index: false 25 | 26 | - name: Ranker 27 | type: ColBERTRanker 28 | actor: True 29 | params: 30 | model_path: /home/user/data/colbertv2.0 31 | 32 | - name: Dataset 33 | type: MarcoDataset 34 | path: /home/user/workspace/marco_dataset.py 35 | actor: False 36 | params: 37 | file: /home/user/dataset/train_v2.1.json 38 | batch_size: 200000 39 | 40 | 41 | pipelines: 42 | - name: indexing 43 | nodes: 44 | - name: Dataset 45 | inputs: [File] 46 | - name: Ranker 47 | inputs: [Dataset] 48 | serve_deployment_kwargs: 49 | num_replicas: 128 # number of replicas to create on the Ray cluster 50 | 
batch_size: 256 51 | num_cpus: 2 52 | - name: DocumentStore 53 | inputs: [Ranker] 54 | serve_deployment_kwargs: 55 | num_replicas: 10 # number of replicas to create on the Ray cluster 56 | batch_size: 2000 57 | num_cpus: 8 58 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/emr_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | index: document 10 | embedding_field: question_emb 11 | embedding_dim: 768 12 | excluded_meta_data: ["question_emb"] 13 | 14 | - name: Retriever 15 | type: EmbeddingRetriever 16 | actor: True 17 | params: 18 | document_store: DocumentStore # params can reference other components defined in the YAML 19 | embedding_model: deepset/sentence_bert 20 | batch_size: 256 21 | 22 | - name: Dataset 23 | type: MarcoDataset 24 | path: /home/user/workspace/marco_dataset.py 25 | actor: False 26 | params: 27 | file: /home/user/dataset/train_v2.1.json 28 | batch_size: 200000 29 | 30 | 31 | pipelines: 32 | - name: indexing 33 | nodes: 34 | - name: Dataset 35 | inputs: [File] 36 | - name: Retriever 37 | inputs: [Dataset] 38 | serve_deployment_kwargs: 39 | num_replicas: 128 # number of replicas to create on the Ray cluster 40 | batch_size: 16 41 | num_cpus: 2 42 | - name: DocumentStore 43 | inputs: [Retriever] 44 | serve_deployment_kwargs: 45 | num_replicas: 10 # number of replicas to create on the Ray cluster 46 | batch_size: 2000 47 | num_cpus: 8 48 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/faiss_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: 
ray 3 | components: # define all the building-blocks for Pipeline 4 | 5 | - name: DocumentStore 6 | type: FAISSDocumentStore 7 | faiss_index_path: /home/user/data/faiss-index-so.faiss 8 | actor: False 9 | params: 10 | sql_url: postgresql://postgres:postgres@$host_ip/haystack # postgresql url, please set host_ip to host IP of head node 11 | faiss_index_factory_str: HNSW 12 | 13 | - name: Retriever 14 | type: DensePassageRetriever 15 | actor: True 16 | params: 17 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 18 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 19 | max_seq_len_query: 64 20 | max_seq_len_passage: 256 21 | batch_size: 16 22 | embed_title: True 23 | use_fast_tokenizers: True 24 | 25 | - name: Dataset 26 | type: MarcoDataset 27 | path: /home/user/workspace/marco_dataset.py 28 | actor: False 29 | params: 30 | file: /home/user/dataset/train_v2.1.json 31 | batch_size: 200000 32 | 33 | 34 | pipelines: 35 | - name: indexing 36 | nodes: 37 | - name: Dataset 38 | inputs: [File] 39 | - name: Retriever 40 | inputs: [Dataset] 41 | serve_deployment_kwargs: 42 | num_replicas: 128 # number of replicas to create on the Ray cluster 43 | batch_size: 256 44 | num_cpus: 2 45 | - name: DocumentStore 46 | inputs: [Retriever] 47 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/marco_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Optional 2 | from haystack.schema import Document 3 | from haystack.nodes.other import Dataset 4 | import time, ray 5 | import pandas as pd 6 | import modin.pandas as modin_pd 7 | import os 8 | import json 9 | os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1" 10 | 11 | def _generate_documents(batch: pd.DataFrame) -> List[Document]: 12 | documents = [] 13 | for _, data in batch.iterrows(): 14 | if isinstance(data['answers'], list) == False: 15 | continue 16 
| 17 | data['answers'] = data['answers'][0] 18 | if len(str(data['wellFormedAnswers'])) > 2: 19 | if isinstance(data['wellFormedAnswers'], list) : 20 | data['answers'] = data['wellFormedAnswers'][0] 21 | 22 | elif "No Answer Present." in data['answers']: 23 | data['answers'] = data['passages'] 24 | 25 | if len(str(data['answers'])) == 0: 26 | print("no answers, drop the document!") 27 | continue 28 | 29 | doc = {'content': str(data['query']), 'meta': {'answer': str(data['answers']), 'question_id': str(data['query_id']), 'question_type': str(data['query_type'])}} 30 | documents.append(Document.from_dict(doc)) 31 | return documents 32 | 33 | 34 | 35 | class MarcoDataset(Dataset): 36 | """ 37 | This Node is used to convert MS Marco dataset into ray.data.Dataset of Haystack Document format. 38 | """ 39 | 40 | outgoing_edges = 1 41 | 42 | def __init__(self, 43 | file: str, 44 | batch_size: Optional[int] = 4096, 45 | ) : 46 | 47 | super().__init__(batch_size=batch_size) 48 | self.file = file 49 | 50 | def convert(self) -> ray.data.Dataset: 51 | dataset = modin_pd.read_json(self.file) 52 | dataset = ray.data.from_modin(dataset) 53 | start = time.time() 54 | dataset = dataset.map_batches(_generate_documents) 55 | cost = time.time() - start 56 | return dataset 57 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip install modin[all] -------------------------------------------------------------------------------- /applications/indexing/marco_indexing_workflow.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - node: $host_ip #IP address of head node, Head node is launched in local machine 3 | type: head # value:[head, worker] Must exist. 
4 | cores: 0-64 #total cpu cores used 5 | image: intel/ai-workflows:odqa-haystack-api 6 | dataset_dir: $dataset_dir #mount to /home/user/dataset of container. It should include the dataset files. 7 | customer_dir: $customer_dir #mount to /home/user/data of container 8 | workspace_dir: $(pwd)/marco_indexing #mount to /home/user/workspace of container. $workspace_dir should be ./marco_indexing for marco indexing. 9 | 10 | - node: $host_ip #IP address of worker node 11 | type: worker 12 | image: intel/ai-workflows:odqa-haystack-api 13 | cores: 0-71 14 | user: $user #configure the user of worker node for remote access 15 | password: $password #configure the password of worker node for remote access and sudo executing 16 | dataset_dir: $dataset_dir #If you use the nfs share storage, it should be same with path of head node. 17 | customer_dir: $customer_dir #If you use the nfs share storage, it should be same with path of head node. 18 | workspace_dir: $workspace_dir #If you use the nfs share storage, it should be same with path of head node. 19 | 20 | 21 | pipelines: 22 | - name: colbert_indexing_pipeline.yml #The name should be same with file name of pipeline file which is included in $workspace_dir 23 | database: # The database containers will be launched in head node. 24 | type: elasticsearch # value:[elasticsearch, postgres]. Must be consistent with the database type of DocumentStore configured in the pipeline file 25 | image: elasticsearch:7.9.2 #For elasticsearch, the 7.9.2 is prefered. 
To change the version, you need to ensure that you use the same version as the query pipeline 26 | cores: 65-71 27 | data_dir: $data_dir_1 #The data directory of database which mountes to /usr/share/elasticsearch/data of elasticsearch container, or /var/lib/postgresql/data of postgresql container 28 | - name: emr_indexing_pipeline.yml 29 | database: 30 | type: elasticsearch 31 | image: elasticsearch:7.9.2 32 | cores: 65-71 33 | data_dir: $data_dir_2 34 | - name: faiss_indexing_pipeline.yml 35 | database: # The database containers will run in head node. 36 | type: postgres 37 | image: postgres:14.1-alpine 38 | cores: 65-71 39 | data_dir: $data_dir_3 40 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/colbert_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | custom_mapping: 10 | mappings: 11 | properties: 12 | content: 13 | type: text 14 | question_id: 15 | type: integer 16 | question-body: 17 | type: text 18 | index: false 19 | answer: 20 | type: text 21 | index: false 22 | colbert_emb: 23 | type: binary 24 | index: false 25 | 26 | - name: Ranker 27 | type: ColBERTRanker 28 | actor: True 29 | params: 30 | model_path: /home/user/data/colbertv2.0 31 | 32 | - name: Dataset 33 | type: StackoverflowDataset 34 | path: /home/user/workspace/stackoverflow_dataset.py 35 | actor: False 36 | params: 37 | question_file: /home/user/dataset/Questions.csv 38 | answer_file: /home/user/dataset/Answers.csv 39 | batch_size: 200000 40 | 41 | 42 | pipelines: 43 | - name: indexing 44 | nodes: 45 | - name: Dataset 46 | inputs: [File] 47 | - name: Ranker 48 | inputs: [Dataset] 49 | serve_deployment_kwargs: 50 | num_replicas: 80 # 
number of replicas to create on the Ray cluster 51 | batch_size: 256 52 | num_cpus: 2 53 | - name: DocumentStore 54 | inputs: [Ranker] 55 | serve_deployment_kwargs: 56 | num_replicas: 10 # number of replicas to create on the Ray cluster 57 | batch_size: 2000 58 | num_cpus: 8 59 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/emr_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | index: document 10 | embedding_field: question_emb 11 | embedding_dim: 768 12 | excluded_meta_data: ["question_emb"] 13 | 14 | - name: Retriever 15 | type: EmbeddingRetriever 16 | actor: True 17 | params: 18 | document_store: DocumentStore # params can reference other components defined in the YAML 19 | embedding_model: deepset/sentence_bert 20 | batch_size: 256 21 | 22 | - name: Dataset 23 | type: StackoverflowDataset 24 | path: /home/user/workspace/stackoverflow_dataset.py 25 | actor: False 26 | params: 27 | question_file: /home/user/dataset/Questions.csv 28 | answer_file: /home/user/dataset/Answers.csv 29 | batch_size: 200000 30 | 31 | 32 | pipelines: 33 | - name: indexing 34 | nodes: 35 | - name: Dataset 36 | inputs: [File] 37 | - name: Retriever 38 | inputs: [Dataset] 39 | serve_deployment_kwargs: 40 | num_replicas: 80 # number of replicas to create on the Ray cluster 41 | batch_size: 256 42 | num_cpus: 2 43 | - name: DocumentStore 44 | inputs: [Retriever] 45 | serve_deployment_kwargs: 46 | num_replicas: 10 # number of replicas to create on the Ray cluster 47 | batch_size: 2000 48 | num_cpus: 8 49 | -------------------------------------------------------------------------------- 
/applications/indexing/stackoverflow_indexing/faiss_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | 5 | - name: DocumentStore 6 | type: FAISSDocumentStore 7 | faiss_index_path: /home/user/data/faiss-index-so.faiss 8 | actor: False 9 | params: 10 | sql_url: postgresql://postgres:postgres@$host_ip/haystack # postgresql url, please set host_ip to host IP of head node 11 | faiss_index_factory_str: HNSW 12 | 13 | - name: Retriever 14 | type: DensePassageRetriever 15 | actor: True 16 | params: 17 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 18 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 19 | max_seq_len_query: 64 20 | max_seq_len_passage: 256 21 | batch_size: 16 22 | embed_title: True 23 | use_fast_tokenizers: True 24 | 25 | - name: Dataset 26 | type: StackoverflowDataset 27 | path: /home/user/workspace/stackoverflow_dataset.py 28 | actor: False 29 | params: 30 | question_file: /home/user/dataset/Questions.csv 31 | answer_file: /home/user/dataset/Answers.csv 32 | batch_size: 200000 33 | 34 | 35 | pipelines: 36 | - name: indexing 37 | nodes: 38 | - name: Dataset 39 | inputs: [File] 40 | - name: Retriever 41 | inputs: [Dataset] 42 | serve_deployment_kwargs: 43 | num_replicas: 140 # number of replicas to create on the Ray cluster 44 | batch_size: 256 45 | num_cpus: 2 46 | - name: DocumentStore 47 | inputs: [Retriever] 48 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip install modin[all]==0.18.0 -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing_workflow.yml: 
-------------------------------------------------------------------------------- 1 | nodes: 2 | - node: $host_ip 3 | type: head 4 | cores: 0-64 5 | image: intel/ai-workflows:odqa-haystack-api 6 | dataset_dir: $dataset_dir 7 | customer_dir: $customer_dir 8 | workspace_dir: $(pwd)/stackoverflow_indexing 9 | 10 | - node: $host_ip 11 | type: worker 12 | image: intel/ai-workflows:odqa-haystack-api 13 | cores: 0-71 14 | user: $user #configure the user of worker node for remote access 15 | password: $password #configure the password of worker node for remote access and sudo executing 16 | dataset_dir: $dataset_dir 17 | customer_dir: $customer_dir 18 | workspace_dir: $workspace_dir 19 | 20 | pipelines: 21 | - name: colbert_indexing_pipeline.yml 22 | database: 23 | type: elasticsearch 24 | image: elasticsearch:7.9.2 25 | cores: 65-71 26 | data_dir: $data_dir_1 27 | - name: emr_indexing_pipeline.yml 28 | database: 29 | type: elasticsearch 30 | image: elasticsearch:7.9.2 31 | cores: 65-71 32 | data_dir: $data_dir_2 33 | - name: faiss_indexing_pipeline.yml 34 | database: # The database containers will run in head node. 
35 | type: postgres 36 | image: postgres:14.1-alpine 37 | cores: 65-71 38 | data_dir: $data_dir_3 39 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.esds_bm25r_colbert: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | COLBERT_OPT=True 5 | ENABLE_IPEX=False 6 | IPEX_BF16=False 7 | CUSTOMER_DIR=$customer_dir 8 | DATA_DIR=$data_dir 9 | UI_CONFIG_DIR=../ui_config/marco 10 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.esds_emr_faq: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | ENABLE_IPEX=False 5 | IPEX_BF16=False 6 | DATA_DIR=$data_dir 7 | UI_CONFIG_DIR=../ui_config/marco 8 | 9 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.faiss_dpr: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 3 | POSTGRES_HOST_AUTH_METHOD=trust 4 | CUSTOMER_DIR=$customer_dir 5 | DATA_DIR=$data_dir 6 | UI_CONFIG_DIR=../ui_config/marco 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.esds_bm25r_colbert: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 
4 | ENABLE_IPEX=False 5 | IPEX_BF16=False 6 | COLBERT_OPT=True 7 | CUSTOMER_DIR=$customer_dir 8 | DATA_DIR=$data_dir 9 | UI_CONFIG_DIR=../ui_config/stackoverflow 10 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.esds_emr_faq: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | #CUSTOMER_DIR=/tmp/data 5 | ENABLE_IPEX=False 6 | IPEX_BF16=False 7 | DATA_DIR=$data_dir 8 | UI_CONFIG_DIR=../ui_config/stackoverflow 9 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.faiss_dpr: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 3 | POSTGRES_HOST_AUTH_METHOD=trust 4 | CUSTOMER_DIR=$customer_dir 5 | DATA_DIR=$data_dir 6 | UI_CONFIG_DIR=../ui_config/stackoverflow 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.wiki.plaid_colbertv2: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | MODE=1 3 | DATASET='wiki-dpr' 4 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipeline_plaid_colbertv2.yml 5 | CHECKPOINT_PATH=/localdisk/kaixuan/colbertv2.0/ 6 | CUSTOMER_DIR=/localdisk/kaixuan/plaid/ 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-dpr.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - 
http_proxy=$http_proxy 9 | - https_proxy=$https_proxy 10 | - no_proxy=$no_proxy 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | network_mode: host 13 | # Mount custom Pipeline YAML and custom Components. 14 | volumes: 15 | - $CUSTOMER_DIR:/home/user/data 16 | ports: 17 | - 8000:8000 18 | restart: on-failure 19 | environment: 20 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 21 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 22 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 23 | - INDEX_NAME=$INDEX_NAME 24 | - CONCURRENT_REQUEST_PER_WORKER 25 | - http_proxy=$http_proxy 26 | - https_proxy=$https_proxy 27 | - no_proxy=$no_proxy 28 | #- ONEDNN_VERBOSE=1 29 | - KMP_BLOCKTIME=20 30 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 31 | - LD_PRELOAD=/usr/local/lib/libiomp5.so 32 | - KMP_AFFINITY=granularity=fine,verbose,compact,1,0 33 | #- MKL_VERBOSE=1 34 | depends_on: 35 | - postsql-db 36 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 37 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 38 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 39 | postsql-db: 40 | image: "postgres:14.1-alpine" 41 | ports: 42 | - 5432:5432 43 | restart: on-failure 44 | environment: 45 | - POSTGRES_HOST_AUTH_METHOD=$POSTGRES_HOST_AUTH_METHOD 46 | volumes: 47 | - $DATA_DIR:/var/lib/postgresql/data 48 | 49 | ui: 50 | build: 51 | context: ../../../ui 52 | dockerfile: Dockerfile 53 | args: 54 | - http_proxy=$http_proxy 55 | - https_proxy=$https_proxy 56 | image: "intel/ai-workflows:odqa-haystack-ui" 57 | network_mode: host 58 | ports: 59 | - 8501:8501 60 | restart: on-failure 61 | volumes: 62 | - $UI_CONFIG_DIR:/home/user/data/ 63 | environment: 64 | - API_ENDPOINT=http://localhost:8000 65 | - EVAL_FILE=ui/eval_labels_example.csv 66 | - 
PIPELINE_PATH=$PIPELINE_PATH 67 | # The value fot the following variables will be read from the host, if present. 68 | # They can also be temporarily set for docker-compose, for example: 69 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 70 | - DISABLE_FILE_UPLOAD=True 71 | - DEFAULT_QUESTION_AT_STARTUP 72 | - DEFAULT_DOCS_FROM_RETRIEVER 73 | - DEFAULT_NUMBER_OF_ANSWERS 74 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 75 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-gpu-dpr.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | haystack-api: 5 | build: 6 | context: ../../../docker 7 | dockerfile: Dockerfile-GPU 8 | args: 9 | - http_proxy=$HTTP_PROXY 10 | - https_proxy=$HTTPS_PROXY 11 | - no_proxy=$NO_PROXY 12 | image: "intel/ai-workflows:odqa-haystack-api-gpu" 13 | # in recent docker-compose version you can enable GPU resources. Make sure to fulfill the prerequisites listed here: https://docs.docker.com/compose/gpu-support/ 14 | deploy: 15 | resources: 16 | reservations: 17 | devices: 18 | - driver: nvidia 19 | #count: 1 20 | capabilities: [gpu] 21 | # # Mount custom Pipeline YAML and custom Components. 22 | # volumes: 23 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 24 | ports: 25 | - 8000:8000 26 | restart: on-failure 27 | 28 | volumes: 29 | - $CUSTOMER_DIR:/home/user/data 30 | 31 | environment: 32 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
33 | #- DOCUMENTSTORE_PARAMS_HOST=elasticsearch 34 | #- PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 35 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 36 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 37 | #- INDEX_NAME=$INDEX_NAME 38 | - CONCURRENT_REQUEST_PER_WORKER 39 | - http_proxy=$HTTP_PROXY 40 | - https_proxy=$HTTPS_PROXY 41 | - no_proxy=$NO_PROXY 42 | #- COLBERT_OPT=$COLBERT_OPT 43 | depends_on: 44 | - postsql-db 45 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600 --graceful-timeout 600'" 46 | 47 | postsql-db: 48 | image: "postgres:14.1-alpine" 49 | ports: 50 | - 5432:5432 51 | restart: on-failure 52 | environment: 53 | - POSTGRES_HOST_AUTH_METHOD=$POSTGRES_HOST_AUTH_METHOD 54 | volumes: 55 | - $DATA_DIR:/var/lib/postgresql/data 56 | # environment: 57 | # - discovery.type=single-node 58 | 59 | ui: 60 | build: 61 | context: ../../../ui 62 | dockerfile: Dockerfile 63 | args: 64 | - http_proxy=$HTTP_PROXY 65 | - https_proxy=$HTTPS_PROXY 66 | image: "intel/ai-workflows:odqa-haystack-ui" 67 | ports: 68 | - 8501:8501 69 | restart: on-failure 70 | volumes: 71 | - $UI_CONFIG_DIR:/home/user/data/ 72 | environment: 73 | - API_ENDPOINT=http://haystack-api:8000 74 | - EVAL_FILE=ui/eval_labels_example.csv 75 | - PIPELINE_PATH=$PIPELINE_PATH 76 | # The value fot the following variables will be read from the host, if present. 
77 | # They can also be temporarily set for docker-compose, for example: 78 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 79 | - DISABLE_FILE_UPLOAD=True 80 | - DEFAULT_QUESTION_AT_STARTUP 81 | - DEFAULT_DOCS_FROM_RETRIEVER 82 | - DEFAULT_NUMBER_OF_ANSWERS 83 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 84 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | haystack-api: 5 | build: 6 | context: ../../../docker 7 | dockerfile: Dockerfile-GPU 8 | args: 9 | - http_proxy=$HTTP_PROXY 10 | - https_proxy=$HTTPS_PROXY 11 | - no_proxy=$NO_PROXY 12 | image: "intel/ai-workflows:odqa-haystack-api-gpu" 13 | # in recent docker-compose version you can enable GPU resources. Make sure to fulfill the prerequisites listed here: https://docs.docker.com/compose/gpu-support/ 14 | deploy: 15 | resources: 16 | reservations: 17 | devices: 18 | - driver: nvidia 19 | #count: 1 20 | capabilities: [gpu] 21 | # # Mount custom Pipeline YAML and custom Components. 22 | # volumes: 23 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 24 | ports: 25 | - 8000:8000 26 | restart: on-failure 27 | 28 | #volumes: 29 | # - $CUSTOMER_DIR:/home/user/data 30 | 31 | environment: 32 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
33 | - DOCUMENTSTORE_PARAMS_HOST=elasticsearch 34 | #- PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 35 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 36 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 37 | - INDEX_NAME=$INDEX_NAME 38 | - CONCURRENT_REQUEST_PER_WORKER 39 | - http_proxy=$HTTP_PROXY 40 | - https_proxy=$HTTPS_PROXY 41 | - no_proxy=$NO_PROXY 42 | - COLBERT_OPT=$COLBERT_OPT 43 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 44 | - IS_DICT_CHECKPOINT=$IS_DICT_CHECKPOINT 45 | depends_on: 46 | - elasticsearch 47 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600 --graceful-timeout 600'" 48 | 49 | elasticsearch: 50 | # This will start an empty elasticsearch instance (so you have to add your documents yourself) 51 | #image: "elasticsearch:7.9.2" 52 | # If you want a demo image instead that is "ready-to-query" with some indexed articles 53 | # about countries and capital cities from Wikipedia: 54 | #image: "dingke1980/elasticsearch-stack-overflow:1.0" 55 | image: $ELASTICSEARCH_IMG 56 | ports: 57 | - 9200:9200 58 | restart: on-failure 59 | volumes: 60 | - $DATA_DIR:/usr/share/elasticsearch/data 61 | environment: 62 | - discovery.type=single-node 63 | - ES_JAVA_OPTS=-Xmx4g -Xms4g 64 | 65 | ui: 66 | build: 67 | context: ../../../ui 68 | dockerfile: Dockerfile 69 | args: 70 | - http_proxy=$HTTP_PROXY 71 | - https_proxy=$HTTPS_PROXY 72 | image: "intel/ai-workflows:odqa-haystack-ui" 73 | ports: 74 | - 8501:8501 75 | restart: on-failure 76 | volumes: 77 | - $UI_CONFIG_DIR:/home/user/data/ 78 | environment: 79 | - API_ENDPOINT=http://haystack-api:8000 80 | - EVAL_FILE=ui/eval_labels_example.csv 81 | - PIPELINE_PATH=$PIPELINE_PATH 82 | # The value fot the following variables will be read from the host, if present. 
83 | # They can also be temporarily set for docker-compose, for example: 84 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 85 | - DISABLE_FILE_UPLOAD=True 86 | - DEFAULT_QUESTION_AT_STARTUP 87 | - DEFAULT_DOCS_FROM_RETRIEVER 88 | - DEFAULT_NUMBER_OF_ANSWERS 89 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 90 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-plaid.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - http_proxy=$HTTP_PROXY 9 | - https_proxy=$HTTPS_PROXY 10 | - no_proxy=$NO_PROXY 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | # Mount custom Pipeline YAML and custom Components. 13 | # volumes: 14 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 15 | #network_mode: host 16 | ports: 17 | - 8000:8000 18 | restart: on-failure 19 | volumes: 20 | - $CUSTOMER_DIR:/home/user/data 21 | - $CHECKPOINT_PATH:/home/user/model 22 | environment: 23 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
24 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 25 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 26 | - INDEX_NAME=$INDEX_NAME 27 | - CONCURRENT_REQUEST_PER_WORKER=48 28 | - http_proxy=$HTTP_PROXY 29 | - https_proxy=$HTTPS_PROXY 30 | - no_proxy=$NO_PROXY 31 | - KMP_BLOCKTIME=20 32 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 33 | - ENABLE_IPEX=$ENABLE_IPEX 34 | - IPEX_BF16=$IPEX_BF16 35 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 36 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 37 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 38 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 39 | 40 | ui: 41 | build: 42 | context: ../../../ui 43 | dockerfile: Dockerfile 44 | args: 45 | - http_proxy=$HTTP_PROXY 46 | - https_proxy=$HTTPS_PROXY 47 | - no_proxy=$NO_PROXY 48 | image: "intel/ai-workflows:odqa-haystack-ui" 49 | #network_mode: host 50 | ports: 51 | - 8501:8501 52 | restart: on-failure 53 | volumes: 54 | - $UI_CONFIG_DIR:/home/user/data/ 55 | environment: 56 | - API_ENDPOINT=http://haystack-api:8000 57 | #- API_ENDPOINT=http://localhost:8000 58 | - EVAL_FILE=ui/eval_labels_example.csv 59 | - PIPELINE_PATH=$PIPELINE_PATH 60 | # The value fot the following variables will be read from the host, if present. 
61 | # They can also be temporarily set for docker-compose, for example: 62 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 63 | - DISABLE_FILE_UPLOAD=True 64 | - DEFAULT_QUESTION_AT_STARTUP 65 | - DEFAULT_DOCS_FROM_RETRIEVER 66 | - DEFAULT_NUMBER_OF_ANSWERS 67 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 68 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - http_proxy=$http_proxy 9 | - https_proxy=$https_proxy 10 | - no_proxy=$no_proxy 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | network_mode: host 13 | ports: 14 | - 8000:8000 15 | restart: on-failure 16 | #cpuset: "0" 17 | # Mount custom Pipeline YAML and custom Components. 18 | #volumes: 19 | # - $CUSTOMER_DIR:/home/user/data 20 | environment: 21 | - DOCUMENTSTORE_PARAMS_HOST=localhost 22 | - DOCUMENTSTORE_PARAMS_PORT=9200 23 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
24 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 25 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 26 | - CONCURRENT_REQUEST_PER_WORKER=48 27 | - http_proxy=$http_proxy 28 | - https_proxy=$https_proxy 29 | - no_proxy=$no_proxy 30 | - KMP_BLOCKTIME=20 31 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 32 | - COLBERT_OPT=$COLBERT_OPT 33 | - ENABLE_IPEX=$ENABLE_IPEX 34 | - IPEX_BF16=$IPEX_BF16 35 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 36 | - IS_DICT_CHECKPOINT=$IS_DICT_CHECKPOINT 37 | depends_on: 38 | - elasticsearch 39 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 40 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 41 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 42 | elasticsearch: 43 | # This will start an empty elasticsearch instance (so you have to add your documents yourself) 44 | # If you want a demo image instead that is "ready-to-query" with some indexed articles 45 | # about countries and capital cities from Wikipedia: 46 | #image: "deepset/elasticsearch-countries-and-capitals" 47 | image: $ELASTICSEARCH_IMG 48 | ports: 49 | - 9200:9200 50 | restart: on-failure 51 | environment: 52 | - discovery.type=single-node 53 | - ES_JAVA_OPTS=-Xmx4g -Xms4g 54 | volumes: 55 | - $DATA_DIR:/usr/share/elasticsearch/data 56 | ui: 57 | build: 58 | context: ../../../ui 59 | dockerfile: Dockerfile 60 | args: 61 | - http_proxy=$http_proxy 62 | - https_proxy=$https_proxy 63 | - no_proxy=$no_proxy 64 | image: "intel/ai-workflows:odqa-haystack-ui" 65 | network_mode: host 66 | ports: 67 | - 8501:8501 68 | restart: on-failure 69 | volumes: 70 | - $UI_CONFIG_DIR:/home/user/data/ 71 | environment: 72 | - PIPELINE_PATH=$PIPELINE_PATH 73 | # - API_ENDPOINT=http://haystack-api:8000 74 | - API_ENDPOINT=http://localhost:8000 75 | # The value fot the following variables will be read from the 
host, if present. 76 | # They can also be temporarily set for docker-compose, for example: 77 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 78 | - DISABLE_FILE_UPLOAD=True 79 | - DEFAULT_QUESTION_AT_STARTUP 80 | - DEFAULT_DOCS_FROM_RETRIEVER 81 | - DEFAULT_NUMBER_OF_ANSWERS 82 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 83 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/launch_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A POSIX variable 3 | OPTIND=1 # Reset in case getopts has been used previously in the shell. 4 | rebuild='1' 5 | nginx='0' 6 | pipeline='emr_faq' 7 | device='cpu' 8 | database='marco' 9 | 10 | usage() { 11 | echo "Usage: $0 -p [pipeline] [optional parameters]" 12 | echo " options:" 13 | echo " -h Display usage" 14 | echo " -p pipeline" 15 | echo " pipelines = [emr_faq, faiss_faq, colbert_faq, colbert_opt_faq]" 16 | echo " -r rebuild" 17 | echo " rebuild the images [1 : yes, 0 : no] " 18 | echo " -d device" 19 | echo " devices = [cpu, gpu]" 20 | echo " -n nginx" 21 | echo " Use nginx for load balance [1 : yes, 0 : no]" 22 | echo " -e database" 23 | echo " Use database [stackoverflow, marco] for searching" 24 | echo "" 25 | echo " examples:" 26 | echo " Run emr_faq pipeline on CPU " 27 | echo " $0 -r 1 -d cpu -n 0 -p emr_faq" 28 | echo "" 29 | } 30 | 31 | while getopts "h?r:d:n:p:e:" opt; do 32 | case "$opt" in 33 | h|\?) 34 | usage 35 | exit 1 36 | ;; 37 | r) rebuild=$OPTARG 38 | ;; 39 | n) nginx=$OPTARG 40 | ;; 41 | p) pipeline=$OPTARG 42 | ;; 43 | d) device=$OPTARG 44 | ;; 45 | e) database=$OPTARG 46 | esac 47 | done 48 | 49 | shift $((OPTIND-1)) 50 | 51 | [ "${1:-}" = "--" ] && shift 52 | 53 | 54 | ## Override default values for values specified by the user 55 | if [ ! -z "$rebuild" ]; then 56 | rebuild=$rebuild 57 | fi 58 | 59 | 60 | if [ ! 
-z "$device" ]; then 61 | device=$device 62 | fi 63 | 64 | if [ ! -z "$pipeline" ]; then 65 | pipeline=$pipeline 66 | fi 67 | 68 | if [ ! -z "$nginx" ]; then 69 | nginx=$nginx 70 | fi 71 | 72 | if [ ! -z "$database" ]; then 73 | database=$database 74 | fi 75 | 76 | config='config/env.stackoverflow.esds_emr_yml_faq' 77 | build='' 78 | yaml_file='docker-compose.yml' 79 | 80 | if [[ $pipeline = "emr_faq" ]]; then 81 | config='config/env.'${database}'.esds_emr_faq' 82 | 83 | elif [[ $pipeline = "faiss_faq" ]]; then 84 | config='config/env.'${database}'.faiss_dpr' 85 | yaml_file='docker-compose-dpr.yml' 86 | 87 | elif [[ $pipeline = "colbert_faq" ]]; then 88 | config='config/env.'${database}'.esds_bm25r_colbert' 89 | if [[ $database = "stackoverflow" ]]; then 90 | echo "Cannot support ${pipeline} with ${database}, need the fine-tuned colbert model with ${database}" 91 | exit 0 92 | fi 93 | 94 | elif [[ $pipeline = "colbert_opt_faq" ]]; then 95 | config='config/env.'${database}'.esds_bm25r_colbert_opt' 96 | if [[ $database = "stackoverflow" ]]; then 97 | echo "Cannot support ${pipeline} with ${database}, need the fine-tuned colbert model with ${database}" 98 | exit 0 99 | fi 100 | fi 101 | 102 | if [[ $rebuild = "1" ]]; then 103 | echo "rebuild docker images" 104 | build='--build' 105 | fi 106 | echo "device = ${device}" 107 | if [[ $device = "gpu" ]]; then 108 | yaml_file='docker-compose-gpu.yml' 109 | if [[ $pipeline = "faiss_faq" ]]; then 110 | yaml_file='docker-compose-gpu-dpr.yml' 111 | fi 112 | fi 113 | 114 | if [[ $nginx = "1" ]]; then 115 | echo "use the nginx for load balance, only CPU mode supported!" 
116 | yaml_file='docker-compose-nginx.yml' 117 | fi 118 | 119 | echo "run the ${pipeline} with ${database} on ${device}" 120 | yaml_file='docker-compose/'$yaml_file 121 | 122 | docker-compose --env-file $config -f $yaml_file up $build 123 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/ui_config/marco/config.yml: -------------------------------------------------------------------------------- 1 | dataset: 2 | name: Marco 3 | questions: 4 | - question: "What's the deepest lake in the world?" 5 | answer: "Lake Bajkal" 6 | - question: "Which Chinese city is the largest?" 7 | answer: "Shanghai" 8 | - question: "What's the longest river of Europe?" 9 | answer: "The Volga" 10 | - question: "What's the tallest mountain in Africa?" 11 | answer: "Mount Kilimanjaro" 12 | pipelines: 13 | - name: pipelines.haystack-EmbeddingRetriever-pipeline.yml 14 | top_k_sliders: 15 | - name: answer 16 | desc: Max. number of answers 17 | default_value: 3 18 | keys: 19 | - key: Retriever 20 | param: top_k 21 | 22 | - name: pipelines_dpr.haystack-pipeline.yml 23 | top_k_sliders: 24 | - name: answer 25 | desc: "Max. number of answers" 26 | default_value: 3 27 | keys: 28 | - key: Retriever 29 | param: top_k 30 | 31 | - name: pipelines.colbertRanker.haystack-pipeline.yml 32 | top_k_sliders: 33 | - name: answer 34 | desc: "Max. number of answers" 35 | default_value: 3 36 | keys: 37 | - key: Ranker 38 | param: top_k 39 | 40 | - name: retriever 41 | desc: "Max. number of documents from retriever" 42 | default_value: 3 43 | keys: 44 | - key: Retriever 45 | param: top_k -------------------------------------------------------------------------------- /applications/odqa_pipelines/ui_config/stackoverflow/config.yml: -------------------------------------------------------------------------------- 1 | dataset: 2 | name: StackOverflow 3 | questions: 4 | - question: "How to pass a function as a parameter in C?" 
5 | - question: "How to open a file in C++?" 6 | - question: "How to convert a string to integer in C?" 7 | - question: "How to get local IP-Address from an udp-socket (C/C++)?" 8 | 9 | pipelines: 10 | - name: pipelines.haystack-EmbeddingRetriever-pipeline.yml 11 | top_k_sliders: 12 | - name: answer 13 | desc: Max. number of answers 14 | default_value: 3 15 | keys: 16 | - key: Retriever 17 | param: top_k 18 | 19 | - name: pipelines_dpr.haystack-pipeline.yml 20 | top_k_sliders: 21 | - name: answer 22 | desc: "Max. number of answers" 23 | default_value: 3 24 | keys: 25 | - key: Retriever 26 | param: top_k 27 | 28 | - name: pipelines.colbertRanker.haystack-pipeline.yml 29 | top_k_sliders: 30 | - name: answer 31 | desc: "Max. number of answers" 32 | default_value: 3 33 | keys: 34 | - key: Ranker 35 | param: top_k 36 | 37 | - name: retriever 38 | desc: "Max. number of documents from retriever" 39 | default_value: 3 40 | keys: 41 | - key: Retriever 42 | param: top_k -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | def pytest_addoption(parser): 2 | parser.addoption( 3 | "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate" 4 | ) 5 | 6 | 7 | def pytest_generate_tests(metafunc): 8 | # Get selected docstores from CLI arg 9 | document_store_type = metafunc.config.option.document_store_type 10 | selected_doc_stores = [item.strip() for item in document_store_type.split(",")] 11 | 12 | # parametrize document_store fixture if it's in the test function argument list 13 | # but does not have an explicit parametrize annotation e.g 14 | # @pytest.mark.parametrize("document_store", ["memory"], indirect=False) 15 | found_mark_parametrize_document_store = False 16 | for marker in metafunc.definition.iter_markers("parametrize"): 17 | if "document_store" in marker.args[0]: 18 | 
found_mark_parametrize_document_store = True 19 | break 20 | # for all others that don't have explicit parametrization, we add the ones from the CLI arg 21 | if "document_store" in metafunc.fixturenames and not found_mark_parametrize_document_store: 22 | metafunc.parametrize("document_store", selected_doc_stores, indirect=True) 23 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /home/user 4 | 5 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ 6 | python3 \ 7 | python3-pip \ 8 | libpoppler-cpp-dev \ 9 | tesseract-ocr \ 10 | wget \ 11 | git \ 12 | libtesseract-dev \ 13 | poppler-utils \ 14 | libmkl-dev 15 | 16 | # Install PDF converter 17 | RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \ 18 | tar -xvf xpdf-tools-linux-4.04.tar.gz && cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin 19 | 20 | #clone the code repo 21 | #RUN git clone --depth=1 https://github.com/intel/open-domain-question-and-answer.git 22 | #WORKDIR /home/user/open-domain-question-and-answer 23 | #RUN git submodule update --init --recursive 24 | 25 | COPY / /home/user/open-domain-question-and-answer/ 26 | WORKDIR /home/user/open-domain-question-and-answer 27 | # Install package 28 | RUN pip install torch torchvision --force-reinstall --extra-index-url https://download.pytorch.org/whl/cpu 29 | RUN pip install --upgrade pip 30 | # RUN pip install --no-cache-dir .[docstores,crawler,preprocessing,ocr,ray] 31 | RUN pip install --no-cache-dir .[faiss,preprocessing,ocr,ray] 32 | RUN pip install --no-cache-dir rest_api/ 33 | RUN pip install --no-cache-dir third_party/ColBERT/ 34 | RUN pip install numba 35 | #RUN pip install faiss-1.6.3-py3-none-any.whl 36 | RUN python3 -m pip install intel-extension-for-pytorch 37 | RUN pip install intel-openmp 38 | RUN ls 
/home/user 39 | RUN pip freeze 40 | RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()" 41 | 42 | # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH 43 | RUN mkdir -p /home/user/open-domain-question-and-answer/rest_api/file-upload 44 | RUN chmod 777 /home/user/open-domain-question-and-answer/rest_api/file-upload 45 | RUN ln -s /usr/bin/python3.8 /usr/bin/python 46 | 47 | EXPOSE 8000 48 | ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_CPU_CONTAINER" 49 | 50 | # cmd for running the API 51 | CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"] 52 | -------------------------------------------------------------------------------- /docker/Dockerfile-GPU: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.3.0-runtime-ubuntu20.04 2 | 3 | WORKDIR /home/user 4 | 5 | ENV LC_ALL=C.UTF-8 6 | ENV LANG=C.UTF-8 7 | 8 | # Install software dependencies 9 | RUN apt-get update && apt-get install -y software-properties-common && \ 10 | add-apt-repository ppa:deadsnakes/ppa && \ 11 | apt-get install -y \ 12 | cmake \ 13 | curl \ 14 | git \ 15 | libpoppler-cpp-dev \ 16 | libtesseract-dev \ 17 | pkg-config \ 18 | poppler-utils \ 19 | python3-pip \ 20 | python3.7 \ 21 | python3.7-dev \ 22 | python3.7-distutils \ 23 | swig \ 24 | tesseract-ocr \ 25 | wget && \ 26 | rm -rf /var/lib/apt/lists/* 27 | 28 | # Install PDF converter 29 | RUN wget https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \ 30 | tar -xvzf xpdf-tools-linux-4.04.tar.gz -C /usr/local/bin --strip-components=2 xpdf-tools-linux-4.04/bin64/pdftotext 31 | 32 | # Set default Python version 33 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 && \ 34 | update-alternatives --set python3 /usr/bin/python3.7 35 | 36 | #clone the code repo 37 | RUN git clone --depth=1 
https://github.com/intel/open-domain-question-and-answer.git 38 | WORKDIR /home/user/open-domain-question-and-answer 39 | RUN git submodule update --init --recursive 40 | 41 | # Install package 42 | RUN pip install --upgrade pip 43 | # RUN pip install --no-cache-dir .[docstores-gpu,crawler,preprocessing,ocr,ray] 44 | RUN pip install --no-cache-dir .[faiss,preprocessing,ocr,ray] 45 | RUN pip install --no-cache-dir rest_api/ 46 | RUN pip install --no-cache-dir third_party/ColBERT/ 47 | RUN pip install numba 48 | # Install PyTorch for CUDA 11 49 | RUN pip3 install --no-cache-dir torch==1.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 50 | 51 | # Cache Roberta and NLTK data 52 | RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()" 53 | 54 | # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH 55 | RUN mkdir -p /home/user/open-domain-question-and-answer/rest_api/file-upload 56 | RUN chmod 777 /home/user/open-domain-question-and-answer/rest_api/file-upload 57 | 58 | 59 | EXPOSE 8000 60 | ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_GPU_CONTAINER" 61 | 62 | # cmd for running the API (note: "--preload" is not working with cuda) 63 | CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"] 64 | -------------------------------------------------------------------------------- /haystack/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wrong-import-order 2 | 3 | from typing import Union 4 | from types import ModuleType 5 | 6 | try: 7 | from importlib import metadata 8 | except (ModuleNotFoundError, ImportError): 9 | # Python <= 3.7 10 | import importlib_metadata as metadata # type: ignore 11 | 12 | __version__: str = str(metadata.version("farm-haystack")) 13 | 14 | 15 | # Logging is not configured here 
import os
from pathlib import Path  # NOTE(review): unused in this module — possibly kept for re-export; confirm before removing

# Runtime feature flags, read once at import time. Values stay as the raw
# environment *strings* (default "False"), not booleans — consumers compare
# them against literal strings such as "True".
COLBERT_OPT = os.getenv("COLBERT_OPT", "False")  # presumably toggles an optimized ColBERT code path — confirm against consumers
ENABLE_IPEX = os.getenv("ENABLE_IPEX", "False")  # presumably enables Intel Extension for PyTorch — confirm
IPEX_BF16 = os.getenv("IPEX_BF16", "False")  # presumably selects bfloat16 precision when IPEX is on — confirm
IS_DICT_CHECKPOINT = os.getenv("IS_DICT_CHECKPOINT", "False")  # presumably marks checkpoints stored as state dicts — confirm
"opensearch" 19 | ) 20 | SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql") 21 | FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss") 22 | PLAIDDocumentStore = safe_import("haystack.document_stores.plaid", "PLAIDDocumentStore", "plaid") 23 | PineconeDocumentStore = safe_import("haystack.document_stores.pinecone", "PineconeDocumentStore", "pinecone") 24 | MilvusDocumentStore = safe_import("haystack.document_stores.milvus", "MilvusDocumentStore", "milvus") 25 | WeaviateDocumentStore = safe_import("haystack.document_stores.weaviate", "WeaviateDocumentStore", "weaviate") 26 | GraphDBKnowledgeGraph = safe_import("haystack.document_stores.graphdb", "GraphDBKnowledgeGraph", "graphdb") 27 | InMemoryKnowledgeGraph = safe_import( 28 | "haystack.document_stores.memory_knowledgegraph", "InMemoryKnowledgeGraph", "inmemorygraph" 29 | ) 30 | -------------------------------------------------------------------------------- /haystack/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import sys 4 | from typing import Any, Dict 5 | import torch 6 | import transformers 7 | 8 | from haystack import __version__ 9 | 10 | 11 | HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT" 12 | HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER" 13 | 14 | 15 | env_meta_data: Dict[str, Any] = {} 16 | 17 | 18 | def get_or_create_env_meta_data() -> Dict[str, Any]: 19 | """ 20 | Collects meta data about the setup that is used with Haystack, such as: operating system, python version, Haystack version, transformers version, pytorch version, number of GPUs, execution environment, and the value stored in the env variable HAYSTACK_EXECUTION_CONTEXT. 
def _get_execution_environment():
    """
    Identifies the execution environment that Haystack is running in.
    Options are: colab notebook, kubernetes, CPU/GPU docker container, test environment, jupyter notebook, python script
    """
    # Guard clauses ordered by specificity; the first match wins.
    if os.environ.get("CI", "False").lower() == "true":
        return "ci"
    if "google.colab" in sys.modules:
        return "colab"
    if "KUBERNETES_SERVICE_HOST" in os.environ:
        return "kubernetes"
    if HAYSTACK_DOCKER_CONTAINER in os.environ:
        return os.environ.get(HAYSTACK_DOCKER_CONTAINER)
    # A loaded pytest module means we are running under a test runner
    if "pytest" in sys.modules:
        return "test"
    try:
        # get_ipython is injected into builtins by IPython/Jupyter
        return get_ipython().__class__.__name__  # pylint: disable=undefined-variable
    except NameError:
        return "script"
from typing import Optional, List

from math import ceil

import torch
from torch.utils.data import DataLoader, Dataset, Sampler

from haystack.errors import ModelingError


class NamedDataLoader(DataLoader):
    # DataLoader variant whose batches are dicts keyed by tensor name
    # instead of positional tuples.
    def __init__(
        self,
        dataset: Dataset,
        batch_size: int,
        sampler: Optional[Sampler] = None,
        tensor_names: Optional[List[str]] = None,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        """
        A modified version of the PyTorch DataLoader that returns a dictionary where the key is
        the name of the tensor and the value is the tensor itself.

        :param dataset: The dataset that will be wrapped by this NamedDataLoader
        :param sampler: The sampler used by the NamedDataLoader to choose which samples to include in the batch
        :param batch_size: The size of the batch to be returned by the NamedDataLoader
        :param tensor_names: The names of the tensor, in the order that the dataset returns them in.
        :param num_workers: number of workers to use for the DataLoader
        :param pin_memory: argument for Data Loader to use page-locked memory for faster transfer of data to GPU
        """

        def collate_fn(batch):
            """
            A custom collate function that formats the batch as a dictionary where the key is
            the name of the tensor and the value is the tensor itself
            """
            # Streaming datasets carry their own tensor names; matched by class
            # name rather than isinstance — presumably to avoid importing the
            # streaming dataset class here. TODO confirm.
            if type(dataset).__name__ == "_StreamingDataSet":
                _tensor_names = dataset.tensor_names
            else:
                _tensor_names = tensor_names

            # A nested first element means the whole batch arrived as a single item;
            # unwrap it so `batch` is a flat list of examples.
            if type(batch[0]) == list:
                batch = batch[0]

            if len(batch[0]) != len(_tensor_names):
                raise ModelingError(
                    f"Dataset contains {len(batch[0])} tensors while there are {len(_tensor_names)} tensor names supplied: {_tensor_names}"
                )
            # Transpose the batch: collect one list of tensors per tensor name ...
            lists_temp = [[] for _ in range(len(_tensor_names))]
            ret = dict(zip(_tensor_names, lists_temp))

            for example in batch:
                for name, tensor in zip(_tensor_names, example):
                    ret[name].append(tensor)

            # ... then stack each list into a single batched tensor.
            for key in ret:
                ret[key] = torch.stack(ret[key])

            return ret

        super().__init__(
            dataset=dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            num_workers=num_workers,
        )

    def __len__(self) -> int:
        # Streaming datasets don't support the default length computation:
        # derive the number of batches from the sample count instead.
        if type(self.dataset).__name__ == "_StreamingDataSet":
            num_samples = len(self.dataset)
            num_batches = ceil(num_samples / self.dataset.batch_size)
            return num_batches
        else:
            return super().__len__()
"""
This is a copy of the official evaluation script for SQuAD version 2.0.
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import collections
import re
import string


def normalize_answer(s: str):
    """
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def strip_punct(text):
        punct = set(string.punctuation)
        return "".join(ch for ch in text if ch not in punct)

    def drop_articles(text):
        return re.compile(r"\b(a|an|the)\b", re.UNICODE).sub(" ", text)

    def squeeze_spaces(text):
        return " ".join(text.split())

    # Order matters: lowercase first, strip punctuation, drop articles,
    # then collapse the leftover whitespace.
    return squeeze_spaces(drop_articles(strip_punct(s.lower())))


def get_tokens(s: str):
    """Split *s* into normalized tokens; a falsy input yields an empty list."""
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold: str, a_pred: str):
    """Exact-match score: 1 if the normalized answers are identical, else 0."""
    return 1 if normalize_answer(a_gold) == normalize_answer(a_pred) else 0


def compute_f1(a_gold: str, a_pred: str):
    """Token-level F1 between the gold and predicted answers (SQuAD definition)."""
    gold = get_tokens(a_gold)
    pred = get_tokens(a_pred)
    # A no-answer on either side scores 1 only when both sides agree.
    if not gold or not pred:
        return int(gold == pred)
    overlap = collections.Counter(gold) & collections.Counter(pred)
    shared = sum(overlap.values())
    if shared == 0:
        return 0
    precision = shared / len(pred)
    recall = shared / len(gold)
    return (2 * precision * recall) / (precision + recall)
from typing import Any, List, Union, Optional

import logging
from pathlib import Path
from abc import ABC, abstractmethod

import torch


logger = logging.getLogger(__name__)


class HaystackModel(ABC):
    """
    The interface on top of HaystackTransformer and HaystackSentenceTransformer.
    """

    def __init__(
        self, pretrained_model_name_or_path: Union[str, Path], model_type: Optional[str], content_type: str
    ):  # replace the type of content_type with ContentTypes starting Python3.8
        """
        :param pretrained_model_name_or_path: The name of the model to load
        :param model_type: the value of `model_type` from the model's `Config` class.
        :param content_type: The type of data (such as "text", "image" and so on) the model should process.
            See the values of `haystack.schema.ContentTypes`.
        """
        # Lazy %-style logging args: the message is only formatted when INFO
        # is actually enabled (the original built the f-string eagerly).
        logger.info(
            " 🤖 Loading '%s' (%s of type '%s' for %s data)",
            pretrained_model_name_or_path,
            self.__class__.__name__,
            model_type if model_type else "",
            content_type,
        )
        self.model_name_or_path = pretrained_model_name_or_path
        self.model_type = model_type
        self.content_type = content_type

    @abstractmethod
    def encode(self, data: List[Any], **kwargs) -> torch.Tensor:
        """
        Run the model on the input data to obtain output vectors.
        """
        raise NotImplementedError("Abstract method, use a subclass.")

    @abstractmethod
    def to(self, devices: Optional[List[torch.device]]) -> None:
        """
        Send the model to the specified PyTorch device(s)
        """
        raise NotImplementedError("Abstract method, use a subclass.")

    @property
    @abstractmethod
    def embedding_dim(self) -> int:
        """
        The output embedding size.
        """
        raise NotImplementedError("Abstract method, use a subclass.")
| (___ __ _ _ __ ___ _ __ | | ___ 15 | "||" \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 16 | || /\ ____) | (_| | | | | | | |_) | | __/ 17 | /\ ||//\) |_____/ \__,_|_| |_| |_| .__/|_|\___| 18 | (/\\||/ |_| 19 | ______\||/___________________________________________ 20 | """ 21 | 22 | FENCE = r""" 23 | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 24 | _| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_ 25 | -| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |- 26 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 27 | _| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_ 28 | -| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |- 29 | |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| 30 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 31 | 32 | """ 33 | 34 | TRACTOR_SMALL = r""" 35 | ______ 36 | |o | ! 37 | __ |:`_|---'-. 38 | |__|______.-/ _ \-----.| 39 | (o)(o)------'\ _ / ( ) 40 | """ 41 | 42 | 43 | TRACTOR_WITH_SILO_LINE = r""" 44 | ____ 45 | /____\ 46 | ______ | | 47 | |o | ! | | 48 | __ |:`_|---'-. | | 49 | |__|______.-/ _ \-----.| |______| 50 | (o)(o)------'\ _ / ( ) | | 51 | """ 52 | 53 | 54 | ROOSTER = r""" 55 | _ m 56 | ,`.\/'> 57 | (`\<_/` 58 | `<< 59 | """ 60 | 61 | PIG = r""" 62 | 63 | .-~~~~-. |\\_ 64 | @_/ / oo\_ 65 | | \ \ _(") 66 | \ /-| ||'--' 67 | \_\ \_\\ 68 | 69 | """ 70 | SMALL_PIG = r""" 71 | @___,__ 72 | ( ^'_] 73 | //-\\' 74 | ^^ ^^ 75 | """ 76 | FENCE_SEP = r""" 77 | |---||---|---|---|---|---|---|---| 78 | """ 79 | 80 | BUSH_SEP = r"""\\|// \\|// \\|// \\|// \\|// 81 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^""" 82 | 83 | WATERING_CAN = r""" 84 | ______ 85 | _ ,',----.`. 86 | '.`-. .-' '----. 
|| 87 | `.`-'--------| ;; 88 | `.|--------|// 89 | \ / 90 | '--------' 91 | """ 92 | 93 | WORKER_M = r""" 0 94 | /|\ 95 | /'\ """ 96 | 97 | WORKER_F = r""" 0 98 | /w\ 99 | / \ """ 100 | 101 | WORKER_X = r""" 0 102 | /w\ 103 | /'\ """ 104 | -------------------------------------------------------------------------------- /haystack/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | from haystack.nodes.base import BaseComponent 4 | 5 | from haystack.nodes.answer_generator import BaseGenerator, RAGenerator, Seq2SeqGenerator, OpenAIAnswerGenerator 6 | from haystack.nodes.document_classifier import BaseDocumentClassifier, TransformersDocumentClassifier 7 | from haystack.nodes.evaluator import EvalDocuments, EvalAnswers 8 | from haystack.nodes.extractor import EntityExtractor, simplify_ner_for_qa 9 | from haystack.nodes.file_classifier import FileTypeClassifier 10 | from haystack.nodes.file_converter import ( 11 | BaseConverter, 12 | DocxToTextConverter, 13 | ImageToTextConverter, 14 | MarkdownConverter, 15 | PDFToTextConverter, 16 | PDFToTextOCRConverter, 17 | TikaConverter, 18 | TikaXHTMLParser, 19 | TextConverter, 20 | AzureConverter, 21 | ParsrConverter, 22 | ) 23 | from haystack.nodes.label_generator import PseudoLabelGenerator 24 | from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger, Dataset 25 | from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor 26 | from haystack.nodes.prompt import PromptNode, PromptTemplate, PromptModel 27 | from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier 28 | from haystack.nodes.question_generator import QuestionGenerator 29 | from haystack.nodes.ranker import BaseRanker, SentenceTransformersRanker, ColBERTRanker 30 | from haystack.nodes.reader import BaseReader, FARMReader, TransformersReader, TableReader, 
RCIReader 31 | from haystack.nodes.retriever import ( 32 | BaseRetriever, 33 | DenseRetriever, 34 | DensePassageRetriever, 35 | EmbeddingRetriever, 36 | BM25Retriever, 37 | ElasticsearchRetriever, 38 | FilterRetriever, 39 | MultihopEmbeddingRetriever, 40 | ElasticsearchFilterOnlyRetriever, 41 | TfidfRetriever, 42 | Text2SparqlRetriever, 43 | TableTextRetriever, 44 | MultiModalRetriever, 45 | ) 46 | from haystack.nodes.summarizer import BaseSummarizer, TransformersSummarizer 47 | from haystack.nodes.translator import BaseTranslator, TransformersTranslator 48 | 49 | Crawler = safe_import("haystack.nodes.connector.crawler", "Crawler", "crawler") # Has optional dependencies 50 | AnswerToSpeech = safe_import( 51 | "haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio" 52 | ) # Has optional dependencies 53 | DocumentToSpeech = safe_import( 54 | "haystack.nodes.audio.document_to_speech", "DocumentToSpeech", "audio" 55 | ) # Has optional dependencies 56 | -------------------------------------------------------------------------------- /haystack/nodes/answer_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.answer_generator.base import BaseGenerator 2 | from haystack.nodes.answer_generator.transformers import RAGenerator, Seq2SeqGenerator 3 | from haystack.nodes.answer_generator.openai import OpenAIAnswerGenerator 4 | -------------------------------------------------------------------------------- /haystack/nodes/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | AnswerToSpeech = safe_import( 4 | "haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio" 5 | ) # Has optional dependencies 6 | DocumentToSpeech = safe_import( 7 | "haystack.nodes.audio.document_to_speech", "DocumentToSpeech", "audio" 8 | ) # Has optional dependencies 9 | 
from typing import List, Union, Optional

import logging
from abc import abstractmethod
from functools import wraps
from time import perf_counter

from haystack.schema import Document
from haystack.nodes.base import BaseComponent


logger = logging.getLogger(__name__)


class BaseDocumentClassifier(BaseComponent):
    """
    Abstract base for document classifier nodes: subclasses implement
    `predict` / `predict_batch`, while this class supplies the pipeline
    `run` / `run_batch` plumbing and simple query-timing statistics.
    """

    outgoing_edges = 1
    # Class-level defaults for the timing stats; run() and timing() write the
    # updated values onto the instance, shadowing these.
    query_count = 0
    query_time = 0

    @abstractmethod
    def predict(self, documents: List[Document]):
        # Classify `documents`; implemented by subclasses.
        pass

    @abstractmethod
    def predict_batch(
        self, documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None
    ) -> Union[List[Document], List[List[Document]]]:
        # Classify one flat or several nested lists of documents; implemented by subclasses.
        pass

    def run(self, documents: Union[List[dict], List[Document]], root_node: str):  # type: ignore
        """
        Pipeline entry point: classify `documents` and emit them on "output_1".
        `root_node == "File"` indicates an indexing pipeline, in which case the
        results are converted back to dicts.
        """
        self.query_count += 1
        if documents:
            predict = self.timing(self.predict, "query_time")
            # Indexing pipelines may hand us plain dicts; normalize to Document first.
            documents = [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in documents]
            results = predict(documents=documents)
        else:
            results = []

        document_ids = [doc.id for doc in results]
        logger.debug("Classified documents with IDs: %s", document_ids)

        # convert back to dicts if we are in an indexing pipeline
        if root_node == "File":
            results = [doc.to_dict() for doc in results]

        output = {"documents": results}

        return output, "output_1"

    def run_batch(self, documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None):  # type: ignore
        """Batch counterpart of run(); accepts a flat or nested list of Documents."""
        predict_batch = self.timing(self.predict_batch, "query_time")
        results = predict_batch(documents=documents, batch_size=batch_size)
        output = {"documents": results}

        # Nested input produces nested results, so log IDs accordingly.
        if isinstance(documents[0], Document):
            document_ids = [doc.id for doc in results]
            logger.debug("Classified documents with IDs: %s", document_ids)
        else:
            for doc_list in results:
                document_ids = [doc.id for doc in doc_list]
                logger.debug("Classified documents with IDs: %s", document_ids)

        return output, "output_1"

    def timing(self, fn, attr_name):
        """Wrapper method used to time functions."""

        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Accumulate elapsed wall-clock time onto the named instance attribute.
            if attr_name not in self.__dict__:
                self.__dict__[attr_name] = 0
            tic = perf_counter()
            ret = fn(*args, **kwargs)
            toc = perf_counter()
            self.__dict__[attr_name] += toc - tic
            return ret

        return wrapper

    def print_time(self):
        """Print aggregate query count and timing statistics to stdout."""
        print("Classifier (Speed)")
        print("---------------")
        if not self.query_count:
            print("No querying performed via Classifier.run()")
        else:
            print(f"Queries Performed: {self.query_count}")
            print(f"Query time: {self.query_time}s")
            print(f"{self.query_time / self.query_count} seconds per query")
-------------------------------------------------------------------------------- 1 | from haystack.nodes.evaluator.evaluator import EvalDocuments, EvalAnswers 2 | -------------------------------------------------------------------------------- /haystack/nodes/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.extractor.entity import EntityExtractor, simplify_ner_for_qa 2 | -------------------------------------------------------------------------------- /haystack/nodes/file_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | FileTypeClassifier = safe_import( 4 | "haystack.nodes.file_classifier.file_type", "FileTypeClassifier", "preprocessing" 5 | ) # Has optional dependencies 6 | -------------------------------------------------------------------------------- /haystack/nodes/file_converter/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.file_converter.base import BaseConverter 2 | 3 | from haystack.utils.import_utils import safe_import 4 | 5 | from haystack.nodes.file_converter.docx import DocxToTextConverter 6 | from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser 7 | from haystack.nodes.file_converter.txt import TextConverter 8 | from haystack.nodes.file_converter.azure import AzureConverter 9 | from haystack.nodes.file_converter.parsr import ParsrConverter 10 | 11 | MarkdownConverter = safe_import( 12 | "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing" 13 | ) # Has optional dependencies 14 | ImageToTextConverter = safe_import( 15 | "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr" 16 | ) # Has optional dependencies 17 | PDFToTextConverter = safe_import( 18 | "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr" 19 | ) # 
class DocxToTextConverter(BaseConverter):
    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Extract text from a .docx file.
        Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
        For compliance with other converters we nevertheless opted for keeping the methods name.

        :param file_path: Path to the .docx file you want to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: Not supported by this converter; passing a truthy value raises.
        :param valid_languages: Not supported by this converter; passing a non-empty list raises.
        :param encoding: Not applicable
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :raises Exception: if `remove_numeric_tables` or `valid_languages` is requested,
            since neither feature is implemented for .docx input.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if remove_numeric_tables:
            raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
        # BUGFIX: the previous check was `valid_languages is True`, which can never
        # be satisfied because `valid_languages` is a list (or None), not the bool
        # True. As a result the "not supported" error was silently skipped and the
        # option was ignored. A truthiness check raises as the message intends.
        if valid_languages:
            raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        file = docx.Document(file_path)  # Creating word reader object.
        # A .docx has no page structure, so each paragraph becomes one text line.
        paragraphs = [para.text for para in file.paragraphs]
        text = "\n".join(paragraphs)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]
13 | """ 14 | 15 | outgoing_edges = 1 16 | 17 | def __init__(self, batch_size: Optional[int]): 18 | super().__init__() 19 | self.dataset = None 20 | self.batch_size = batch_size 21 | 22 | @abstractmethod 23 | def convert(self) -> ray.data.Dataset: 24 | """ 25 | Convert a dataset to ray.data.Dataset of Haystack Documents or files path. 26 | """ 27 | pass 28 | 29 | def run(self): # type: ignore 30 | # conversion from dataset-> Documents or files path 31 | self.dataset = self.convert() 32 | enable_sample = os.getenv('ENABLE_SAMPLING_LIMIT', default="0") 33 | if enable_sample == "1" : 34 | self.dataset = self.dataset.limit(500) 35 | return {}, "output_1" 36 | 37 | def run_batch(self): # type: ignore 38 | return self.run() 39 | 40 | def dataset_batched_generator(self) : 41 | """ 42 | Generator to generate the batched haystack Documents or batched files path 43 | """ 44 | return self.dataset.iter_batches(batch_size=self.batch_size) -------------------------------------------------------------------------------- /haystack/nodes/other/docs2answers.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Dict 2 | 3 | from tqdm.auto import tqdm 4 | 5 | from haystack.errors import HaystackError 6 | from haystack.schema import Document, Answer, Span 7 | from haystack.nodes.base import BaseComponent 8 | 9 | 10 | class Docs2Answers(BaseComponent): 11 | """ 12 | This Node is used to convert retrieved documents into predicted answers format. 13 | It is useful for situations where you are calling a Retriever only pipeline via REST API. 14 | This ensures that your output is in a compatible format. 
class Docs2Answers(BaseComponent):
    """
    Convert retrieved Documents into the predicted-Answer format.

    Handy when a Retriever-only pipeline is served via the REST API and the
    client expects answer-shaped output.

    :param progress_bar: Whether to show a progress bar
    """

    outgoing_edges = 1

    def __init__(self, progress_bar: bool = True):
        super().__init__()
        self.progress_bar = progress_bar

    def run(self, query: str, documents: List[Document]):  # type: ignore
        # Document -> Answer, one answer per retrieved document.
        answers: List[Answer] = [self._convert_doc_to_answer(doc) for doc in documents]
        return {"query": query, "answers": answers}, "output_1"

    def run_batch(self, queries: List[str], documents: Union[List[Document], List[List[Document]]]):  # type: ignore
        output: Dict = {"queries": queries, "answers": []}

        # Case 1: a single flat list of Documents.
        if len(documents) > 0 and isinstance(documents[0], Document):
            for doc in tqdm(documents, disable=not self.progress_bar, desc="Converting to answers"):
                if not isinstance(doc, Document):
                    raise HaystackError(f"doc was of type {type(doc)}, but expected a Document.")
                output["answers"].append([self._convert_doc_to_answer(doc)])

        # Case 2: one list of Documents per query.
        elif len(documents) > 0 and isinstance(documents[0], list):
            for doc_list in tqdm(documents, disable=not self.progress_bar, desc="Converting to answers"):
                if not isinstance(doc_list, list):
                    raise HaystackError(f"docs was of type {type(doc_list)}, but expected a list of Documents.")
                output["answers"].append([self._convert_doc_to_answer(doc) for doc in doc_list])

        return output, "output_1"

    @staticmethod
    def _convert_doc_to_answer(doc: Document) -> Answer:
        # For FAQ-style documents the stored answer is the answer text; for
        # regular documents the whole content acts as the "answer". Either way
        # the context and offsets span the full answer text.
        if "answer" in doc.meta:
            doc.meta["query"] = doc.content  # question from the existing FAQ
            answer_text = doc.meta["answer"]
        else:
            answer_text = doc.content

        return Answer(
            answer=answer_text,
            type="other",
            score=doc.score,
            context=answer_text,
            offsets_in_context=[Span(start=0, end=len(answer_text))],
            document_id=doc.id,
            meta=doc.meta,
        )
class JoinNode(BaseComponent):
    """Base class for nodes that merge the outputs of several upstream nodes."""

    outgoing_edges: int = 1

    def run(  # type: ignore
        self,
        inputs: Optional[List[dict]] = None,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
        answers: Optional[List[Answer]] = None,
        top_k_join: Optional[int] = None,
    ) -> Tuple[Dict, str]:
        # Normal path: several upstream results have been accumulated.
        if inputs:
            return self.run_accumulated(inputs, top_k_join=top_k_join)
        # Degenerate path: a single input was wired in directly.
        warnings.warn("You are using a JoinNode with only one input. This is usually equivalent to a no-op.")
        single_input = {
            "query": query,
            "file_paths": file_paths,
            "labels": labels,
            "documents": documents,
            "meta": meta,
            "answers": answers,
        }
        return self.run_accumulated(inputs=[single_input], top_k_join=top_k_join)

    @abstractmethod
    def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None) -> Tuple[Dict, str]:
        pass

    def run_batch(  # type: ignore
        self,
        inputs: Optional[List[dict]] = None,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
        answers: Optional[List[Answer]] = None,
        top_k_join: Optional[int] = None,
    ) -> Tuple[Dict, str]:
        if inputs:
            return self.run_batch_accumulated(inputs=inputs, top_k_join=top_k_join)
        warnings.warn("You are using a JoinNode with only one input. This is usually equivalent to a no-op.")
        single_input = {
            "queries": queries,
            "file_paths": file_paths,
            "labels": labels,
            "documents": documents,
            "meta": meta,
            "params": params,
            "debug": debug,
            "answers": answers,
        }
        return self.run_batch_accumulated(inputs=[single_input], top_k_join=top_k_join)

    @abstractmethod
    def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None) -> Tuple[Dict, str]:
        pass
class STRanker(BaseRanker):
    """Rank Documents by cosine similarity of SentenceTransformer embeddings."""

    def __init__(
        self,
        model_name_or_path: Union[str, Path],
        top_k: int = 10,
    ):
        self.model = SentenceTransformer(model_name_or_path)
        self.top_k = top_k
        self.model.eval()

    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
        """Return ``documents`` sorted by similarity to ``query``, best first, truncated to ``top_k``."""
        k = self.top_k if top_k is None else top_k

        contents = [doc.content for doc in documents]
        doc_vectors = self.model.encode(contents)
        query_vector = self.model.encode(query)
        similarities = util.cos_sim(query_vector, doc_vectors).flatten()

        # Highest similarity first; the key function keeps Document objects
        # out of the comparison (sort is stable for equal scores).
        ranked = sorted(
            zip(similarities, documents),
            key=lambda pair: pair[0],
            reverse=True,
        )
        ordered_documents = [doc for _, doc in ranked]
        return ordered_documents[:k]

    def predict_batch(
        self,
        query_doc_list: List[dict],
        top_k: Optional[int] = None,
    ):
        raise NotImplementedError
class BaseSummarizer(BaseComponent):
    """
    Abstract class for Summarizer
    """

    outgoing_edges = 1

    @abstractmethod
    def predict(self, documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]:
        """
        Abstract method for creating a summary.

        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param generate_single_summary: This parameter is deprecated and will be removed in Haystack 1.12
        :return: List of Documents, where Document.meta["summary"] contains the summarization
        """
        pass

    @abstractmethod
    def predict_batch(
        self,
        documents: Union[List[Document], List[List[Document]]],
        generate_single_summary: Optional[bool] = None,
        batch_size: Optional[int] = None,
    ) -> Union[List[Document], List[List[Document]]]:
        pass

    def run(self, documents: List[Document], generate_single_summary: Optional[bool] = None):  # type: ignore
        # An empty input short-circuits to an empty result without calling predict().
        summarized: Dict = {"documents": []}
        if documents:
            summarized["documents"] = self.predict(
                documents=documents, generate_single_summary=generate_single_summary
            )
        return summarized, "output_1"

    def run_batch(  # type: ignore
        self,
        documents: Union[List[Document], List[List[Document]]],
        generate_single_summary: Optional[bool] = None,
        batch_size: Optional[int] = None,
    ):
        batch_results = self.predict_batch(
            documents=documents, batch_size=batch_size, generate_single_summary=generate_single_summary
        )
        return {"documents": batch_results}, "output_1"
def clean_wiki_text(text: str) -> str:
    """
    Clean wikipedia text by removing multiple new lines, removing extremely short lines,
    adding paragraph breaks and removing empty paragraphs
    """
    # Collapse any run of consecutive newlines into a single newline.
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    # Keep only lines longer than 30 chars, plus wiki section headers
    # (always of the form "==Some Title==").
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > 30 or (line[:2] == "==" and line[-2:] == "==")
    ]
    text = "\n".join(kept_lines)

    # Re-introduce paragraph breaks before every section header.
    text = text.replace("\n==", "\n\n\n==")

    # Drop sections that ended up with no body (header followed by the break only).
    text = re.sub(r"(==.*==\n\n\n)", "", text)

    return text
def args_to_kwargs(args: Tuple, func: Callable) -> Dict[str, Any]:
    """Map positional ``args`` onto ``func``'s parameter names as a kwargs dict."""
    parameter_names = list(inspect.signature(func).parameters.keys())
    # Instance/class methods expose self/cls as the first parameter, which the
    # positional args do not include — shift the names by one to realign.
    if any(parameter_names) and parameter_names[0] in ("self", "cls"):
        parameter_names = parameter_names[1 : 1 + len(args)]
    return {name: value for value, name in zip(args, parameter_names)}
def retry_with_exponential_backoff(
    backoff_in_seconds: float = 1, max_retries: int = 10, errors: tuple = (OpenAIRateLimitError,)
):
    """
    Decorator to retry a function with exponential backoff.
    :param backoff_in_seconds: The initial backoff in seconds.
    :param max_retries: The maximum number of retries.
    :param errors: The errors to catch retry on.
    """

    def decorator(function):
        def wrapper(*args, **kwargs):
            attempt = 0
            # Loop until success, max_retries is exceeded, or another exception escapes.
            while True:
                try:
                    return function(*args, **kwargs)
                except errors as e:
                    # Give up once the retry budget is spent.
                    if attempt > max_retries:
                        raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")

                    # Exponential delay with a little jitter to avoid thundering herds.
                    delay = backoff_in_seconds * 2**attempt + random()
                    logger.warning(
                        f"{e.__class__.__name__ } - {e}, "
                        f"retry {function.__name__} in {'{0:.2f}'.format(delay)} seconds..."
                    )
                    time.sleep(delay)
                    attempt += 1
                except Exception as e:
                    # Anything outside `errors` is not retried.
                    raise e

        return wrapper

    return decorator
23 | """ 24 | if isinstance(inputs, dict): 25 | return {name: ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()} 26 | elif isinstance(inputs, list): 27 | return [ensure_tensor_on_device(item, device) for item in inputs] 28 | elif isinstance(inputs, tuple): 29 | return tuple(ensure_tensor_on_device(item, device) for item in inputs) 30 | elif isinstance(inputs, torch.Tensor): 31 | if device == torch.device("cpu") and inputs.dtype in {torch.float16, torch.bfloat16}: 32 | inputs = inputs.float() 33 | return inputs.to(device) 34 | else: 35 | return inputs 36 | 37 | 38 | def get_devices(devices: Optional[List[Union[str, torch.device]]]) -> List[torch.device]: 39 | """ 40 | Convert a list of device names into a list of Torch devices, 41 | depending on the system's configuration and hardware. 42 | """ 43 | if devices is not None: 44 | return [torch.device(device) for device in devices] 45 | elif torch.cuda.is_available(): 46 | return [torch.device(device) for device in range(torch.cuda.device_count())] 47 | return [torch.device("cpu")] 48 | -------------------------------------------------------------------------------- /images/odqa_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/images/odqa_workflow.png -------------------------------------------------------------------------------- /images/pipeline1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/images/pipeline1.PNG -------------------------------------------------------------------------------- /images/pipeline2.PNG: -------------------------------------------------------------------------------- 
#!/bin/bash
# Provision a CentOS host with Docker and docker-compose.

# install required software packages
# BUGFIX: was "yum updtate" (typo — the command failed outright);
# -y added so the update does not block a non-interactive provisioning run.
yum update -y
yum install -y yum-utils

# set up the repository
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo

# install and start service
yum install -y docker-ce
systemctl start docker

# install docker compose
curl -SL https://github.com/docker/compose/releases/download/v2.6.1/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
ln -sf /usr/local/bin/docker-compose /usr/bin/docker-compose
# check the installation
echo `docker-compose version`
form data 33 | "pynvml", 34 | "psutil" 35 | ] 36 | dynamic = ["version"] 37 | 38 | [project.optional-dependencies] 39 | dev = [ 40 | "httpx" 41 | ] 42 | 43 | [project.urls] 44 | Documentation = "https://github.com/deepset-ai/haystack/tree/main/rest_api#readme" 45 | Issues = "https://github.com/deepset-ai/haystack/issues" 46 | Source = "https://github.com/deepset-ai/haystack/tree/main/rest_api" 47 | 48 | [tool.hatch.version] 49 | path = "rest_api/__about__.py" 50 | 51 | [tool.hatch.build.targets.sdist] 52 | [tool.hatch.build.targets.wheel] 53 | 54 | [tool.hatch.envs.default] 55 | dependencies = [ 56 | "pytest", 57 | "pytest-cov", 58 | ] 59 | [tool.hatch.envs.default.scripts] 60 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=rest_api --cov=tests" 61 | no-cov = "cov --no-cov" 62 | 63 | [[tool.hatch.envs.test.matrix]] 64 | python = ["37", "38", "39", "310"] 65 | 66 | [tool.coverage.run] 67 | branch = true 68 | parallel = true 69 | omit = [ 70 | "rest_api/__about__.py", 71 | ] 72 | 73 | [tool.coverage.report] 74 | exclude_lines = [ 75 | "no cov", 76 | "if __name__ == .__main__.:", 77 | "if TYPE_CHECKING:", 78 | ] 79 | 80 | [tool.black] 81 | line-length = 120 82 | skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. 
83 | -------------------------------------------------------------------------------- /rest_api/rest_api/__about__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pathlib import Path 4 | 5 | 6 | __version__ = "0.0.0" 7 | try: 8 | __version__ = open(Path(__file__).parent.parent / "VERSION.txt", "r").read() 9 | except Exception as e: 10 | logging.exception("No VERSION.txt found!") 11 | -------------------------------------------------------------------------------- /rest_api/rest_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/rest_api/__init__.py -------------------------------------------------------------------------------- /rest_api/rest_api/application.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import uvicorn 4 | from rest_api.utils import get_app, get_pipelines 5 | 6 | 7 | logging.basicConfig(format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p") 8 | logger = logging.getLogger(__name__) 9 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 10 | logging.getLogger("haystack").setLevel(logging.INFO) 11 | 12 | 13 | app = get_app() 14 | pipelines = get_pipelines() # Unused here, called to init the pipelines early 15 | 16 | 17 | logger.info("Open http://127.0.0.1:8000/docs to see Swagger API Documentation.") 18 | logger.info( 19 | """ 20 | Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/query' 21 | -H "Content-Type: application/json" --data '{"query": "Who is the father of Arya Stark?"}' 22 | """ 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | uvicorn.run(app, host="0.0.0.0", port=8000) 28 | -------------------------------------------------------------------------------- /rest_api/rest_api/config.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | PIPELINE_YAML_PATH = os.getenv( 6 | "PIPELINE_YAML_PATH", str((Path(__file__).parent / "pipeline" / "pipelines.haystack-pipeline.yml").absolute()) 7 | ) 8 | QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query") 9 | INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing") 10 | INDEX_NAME = os.getenv("INDEX_NAME", "document") 11 | DOCUMENTSTORE_PARAMS_HOST = os.getenv("DOCUMENTSTORE_PARAMS_HOST", "elasticsearch") 12 | DOCUMENTSTORE_PARAMS_PORT = os.getenv("DOCUMENTSTORE_PARAMS_PORT", "9200") 13 | FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", str((Path(__file__).parent / "file-upload").absolute())) 14 | LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") 15 | ROOT_PATH = os.getenv("ROOT_PATH", "/") 16 | CHECKPOINT_PATH = os.getenv("CHECKPOINT_PATH","/home/user/data/colbert.dnn") 17 | FAISS_DB_PATH = os.getenv("FAISS_DB_PATH","/home/user/data/faiss-index-so.faiss") 18 | MODEL_PATH = os.getenv("MODEL_PATH", "/home/user/model") 19 | PLAID_INDEX_PATH = os.getenv("PLAID_INDEX_PATH", "/home/user/data/plaid_indexing/") 20 | PLAID_COLLECTION_PATH=os.getenv("PLAID_COLLECTION_PATH", "/home/user/data/psgs_w100.tsv") 21 | CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", "4")) 22 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/__init__.py: -------------------------------------------------------------------------------- 1 | from rest_api.pipeline import custom_component # this import is required for the Custom Components to be registered 2 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/document.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import logging 4 | 5 | from fastapi import FastAPI, 
APIRouter 6 | from haystack.document_stores import BaseDocumentStore 7 | from haystack.schema import Document 8 | 9 | from rest_api.utils import get_app, get_pipelines 10 | from rest_api.config import LOG_LEVEL 11 | from rest_api.schema import FilterRequest 12 | 13 | 14 | logging.getLogger("haystack").setLevel(LOG_LEVEL) 15 | logger = logging.getLogger("haystack") 16 | 17 | 18 | router = APIRouter() 19 | app: FastAPI = get_app() 20 | document_store: BaseDocumentStore = get_pipelines().get("document_store", None) 21 | 22 | 23 | @router.post("/documents/get_by_filters", response_model=List[Document], response_model_exclude_none=True) 24 | def get_documents(filters: FilterRequest): 25 | """ 26 | This endpoint allows you to retrieve documents contained in your document store. 27 | You can filter the documents to retrieve by metadata (like the document's name), 28 | or provide an empty JSON object to clear the document store. 29 | 30 | Example of filters: 31 | `'{"filters": {{"name": ["some", "more"], "category": ["only_one"]}}'` 32 | 33 | To get all documents you should provide an empty dict, like: 34 | `'{"filters": {}}'` 35 | """ 36 | docs = document_store.get_all_documents(filters=filters.filters) 37 | for doc in docs: 38 | doc.embedding = None 39 | return docs 40 | 41 | 42 | @router.post("/documents/delete_by_filters", response_model=bool) 43 | def delete_documents(filters: FilterRequest): 44 | """ 45 | This endpoint allows you to delete documents contained in your document store. 46 | You can filter the documents to delete by metadata (like the document's name), 47 | or provide an empty JSON object to clear the document store. 
48 | 49 | Example of filters: 50 | `'{"filters": {{"name": ["some", "more"], "category": ["only_one"]}}'` 51 | 52 | To get all documents you should provide an empty dict, like: 53 | `'{"filters": {}}'` 54 | """ 55 | document_store.delete_documents(filters=filters.filters) 56 | return True 57 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/errors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/rest_api/controller/errors/__init__.py -------------------------------------------------------------------------------- /rest_api/rest_api/controller/errors/http_error.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | from starlette.requests import Request 3 | from starlette.responses import JSONResponse 4 | 5 | 6 | async def http_error_handler(_: Request, exc: HTTPException) -> JSONResponse: 7 | return JSONResponse({"errors": [exc.detail]}, status_code=exc.status_code) 8 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/file_upload.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | import json 4 | import shutil 5 | import uuid 6 | from pathlib import Path 7 | 8 | from fastapi import FastAPI, APIRouter, UploadFile, File, Form, HTTPException, Depends 9 | from pydantic import BaseModel 10 | from haystack import Pipeline 11 | from haystack.nodes import BaseConverter, PreProcessor 12 | 13 | from rest_api.utils import get_app, get_pipelines 14 | from rest_api.config import FILE_UPLOAD_PATH 15 | from rest_api.controller.utils import as_form 16 | 17 | 18 | router = APIRouter() 19 | app: FastAPI = get_app() 20 | indexing_pipeline: Pipeline 
= get_pipelines().get("indexing_pipeline", None) 21 | 22 | 23 | @as_form 24 | class FileConverterParams(BaseModel): 25 | remove_numeric_tables: Optional[bool] = None 26 | valid_languages: Optional[List[str]] = None 27 | 28 | 29 | @as_form 30 | class PreprocessorParams(BaseModel): 31 | clean_whitespace: Optional[bool] = None 32 | clean_empty_lines: Optional[bool] = None 33 | clean_header_footer: Optional[bool] = None 34 | split_by: Optional[str] = None 35 | split_length: Optional[int] = None 36 | split_overlap: Optional[int] = None 37 | split_respect_sentence_boundary: Optional[bool] = None 38 | 39 | 40 | class Response(BaseModel): 41 | file_id: str 42 | 43 | 44 | @router.post("/file-upload") 45 | def upload_file( 46 | files: List[UploadFile] = File(...), 47 | # JSON serialized string 48 | meta: Optional[str] = Form("null"), # type: ignore 49 | fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form), # type: ignore 50 | preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form), # type: ignore 51 | ): 52 | """ 53 | You can use this endpoint to upload a file for indexing 54 | (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store). 
55 | """ 56 | if not indexing_pipeline: 57 | raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.") 58 | 59 | file_paths: list = [] 60 | file_metas: list = [] 61 | 62 | meta_form = json.loads(meta) or {} # type: ignore 63 | if not isinstance(meta_form, dict): 64 | raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}") 65 | 66 | for file in files: 67 | try: 68 | file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}" 69 | with file_path.open("wb") as buffer: 70 | shutil.copyfileobj(file.file, buffer) 71 | 72 | file_paths.append(file_path) 73 | meta_form["name"] = file.filename 74 | file_metas.append(meta_form) 75 | finally: 76 | file.file.close() 77 | 78 | # Find nodes names 79 | converters = indexing_pipeline.get_nodes_by_class(BaseConverter) 80 | preprocessors = indexing_pipeline.get_nodes_by_class(PreProcessor) 81 | 82 | params = {} 83 | for converter in converters: 84 | params[converter.name] = fileconverter_params.dict() 85 | for preprocessor in preprocessors: 86 | params[preprocessor.name] = preprocessor_params.dict() 87 | 88 | indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params) 89 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Type, NewType 2 | 3 | import inspect 4 | from contextlib import contextmanager 5 | from threading import Semaphore 6 | 7 | from fastapi import Form, HTTPException 8 | from pydantic import BaseModel 9 | 10 | 11 | class RequestLimiter: 12 | def __init__(self, limit): 13 | self.semaphore = Semaphore(limit - 1) 14 | 15 | @contextmanager 16 | def run(self): 17 | acquired = self.semaphore.acquire(blocking=False) 18 | if not acquired: 19 | raise HTTPException(status_code=503, detail="The server is busy processing requests.") 20 | try: 21 | 
yield acquired 22 | finally: 23 | self.semaphore.release() 24 | 25 | 26 | StringId = NewType("StringId", str) 27 | 28 | 29 | def as_form(cls: Type[BaseModel]): 30 | """ 31 | Adds an as_form class method to decorated models. The as_form class method 32 | can be used with FastAPI endpoints 33 | """ 34 | new_params = [ 35 | inspect.Parameter( 36 | field.alias, 37 | inspect.Parameter.POSITIONAL_ONLY, 38 | default=(Form(field.default) if not field.required else Form(...)), 39 | ) 40 | for field in cls.__fields__.values() 41 | ] 42 | 43 | async def _as_form(**data): 44 | return cls(**data) 45 | 46 | sig = inspect.signature(_as_form) 47 | sig = sig.replace(parameters=new_params) 48 | _as_form.__signature__ = sig # type: ignore 49 | setattr(cls, "as_form", _as_form) 50 | return cls 51 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import os 4 | import logging 5 | from pathlib import Path 6 | 7 | from haystack.pipelines.base import Pipeline 8 | from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore 9 | from haystack.errors import PipelineConfigError 10 | 11 | from rest_api.controller.utils import RequestLimiter 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would 17 | # end up with different indices. The same applies for InMemoryDocumentStore. 
18 | UNSUPPORTED_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore) 19 | 20 | 21 | def setup_pipelines() -> Dict[str, Any]: 22 | # Re-import the configuration variables 23 | from rest_api import config # pylint: disable=reimported 24 | 25 | pipelines = {} 26 | 27 | # Load query pipeline 28 | query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME) 29 | logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys()) 30 | pipelines["query_pipeline"] = query_pipeline 31 | 32 | # Find document store 33 | document_store = query_pipeline.get_document_store() 34 | logging.info("Loaded docstore: %s", document_store) 35 | pipelines["document_store"] = document_store 36 | 37 | # Setup concurrency limiter 38 | concurrency_limiter = RequestLimiter(config.CONCURRENT_REQUEST_PER_WORKER) 39 | logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER) 40 | pipelines["concurrency_limiter"] = concurrency_limiter 41 | 42 | # Load indexing pipeline (if available) 43 | try: 44 | indexing_pipeline = Pipeline.load_from_yaml( 45 | Path(config.PIPELINE_YAML_PATH), pipeline_name=config.INDEXING_PIPELINE_NAME 46 | ) 47 | docstore = indexing_pipeline.get_document_store() 48 | if isinstance(docstore, UNSUPPORTED_DOC_STORES): 49 | indexing_pipeline = None 50 | raise PipelineConfigError( 51 | "Indexing pipelines with FAISSDocumentStore or InMemoryDocumentStore are not supported by the REST APIs." 
52 | ) 53 | 54 | except PipelineConfigError as e: 55 | indexing_pipeline = None 56 | logger.error("%s\nFile Upload API will not be available.", e.message) 57 | 58 | finally: 59 | pipelines["indexing_pipeline"] = indexing_pipeline 60 | 61 | # Create directory for uploaded files 62 | os.makedirs(config.FILE_UPLOAD_PATH, exist_ok=True) 63 | 64 | return pipelines 65 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/custom_component.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipelines allow putting together Components to build a graph. 3 | 4 | In addition to the standard Haystack Components, custom user-defined Components 5 | can be used in a Pipeline YAML configuration. 6 | 7 | The classes for the Custom Components must be defined in this file. 8 | """ 9 | 10 | 11 | from haystack.nodes.base import BaseComponent 12 | 13 | 14 | class SampleComponent(BaseComponent): 15 | outgoing_edges: int = 1 16 | 17 | def run(self, **kwargs): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipeline_empty.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # Dummy pipeline, used when the CI needs to load the REST API to extract the OpenAPI specs. DO NOT USE. 
2 | version: ignore 3 | 4 | components: 5 | - name: FileTypeClassifier 6 | type: FileTypeClassifier 7 | 8 | pipelines: 9 | - name: query 10 | nodes: 11 | - name: FileTypeClassifier 12 | inputs: [File] 13 | 14 | - name: indexing 15 | nodes: 16 | - name: FileTypeClassifier 17 | inputs: [File] -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipeline_plaid_colbertv2.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: PLAIDDocumentStore 8 | params: 9 | index_path: /home/user/data/plaid_indexing/ 10 | checkpoint_path: /home/user/model/ 11 | collection_path: /home/user/data/psgs_w100.tsv 12 | - name: Retriever 13 | type: ColBERTRetriever 14 | params: 15 | document_store: DocumentStore # params can reference other components defined in the YAML 16 | top_k: 5 17 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 18 | type: Docs2Answers 19 | pipelines: 20 | - name: query # a sample extractive-qa Pipeline 21 | nodes: 22 | - name: Retriever 23 | inputs: [Query] 24 | - name: Doc2Answers 25 | inputs: [Retriever] 26 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | index: document 11 | - name: Retriever 12 | type: BM25Retriever 13 | 
params: 14 | document_store: DocumentStore # params can reference other components defined in the YAML 15 | top_k: 5 16 | - name: Ranker 17 | type: ColBERTRanker 18 | params: 19 | model_path: /home/user/data 20 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 21 | type: Docs2Answers # Haystack Class name for the component 22 | - name: TextFileConverter 23 | type: TextConverter 24 | - name: PDFFileConverter 25 | type: PDFToTextConverter 26 | - name: Preprocessor 27 | type: PreProcessor 28 | params: 29 | split_by: word 30 | split_length: 1000 31 | - name: FileTypeClassifier 32 | type: FileTypeClassifier 33 | pipelines: 34 | - name: query # a sample extractive-qa Pipeline 35 | nodes: 36 | - name: Retriever 37 | inputs: [Query] 38 | - name: Ranker 39 | inputs: [Retriever] 40 | - name: Doc2Answers 41 | inputs: [Ranker] 42 | 43 | - name: indexing 44 | nodes: 45 | - name: FileTypeClassifier 46 | inputs: [File] 47 | - name: TextFileConverter 48 | inputs: [FileTypeClassifier.output_1] 49 | - name: PDFFileConverter 50 | inputs: [FileTypeClassifier.output_2] 51 | - name: Preprocessor 52 | inputs: [PDFFileConverter, TextFileConverter] 53 | - name: Ranker 54 | inputs: [Preprocessor] 55 | - name: DocumentStore 56 | inputs: [Ranker] 57 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | index: document 11 | embedding_field: question_emb 12 | embedding_dim: 768 13 | excluded_meta_data: ["question_emb"] 14 | - name: Retriever 15 | type: EmbeddingRetriever 
16 | params: 17 | document_store: DocumentStore # params can reference other components defined in the YAML 18 | embedding_model: deepset/sentence_bert 19 | top_k: 5 20 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 21 | type: Docs2Answers # Haystack Class name for the component 22 | pipelines: 23 | - name: query # a sample extractive-qa Pipeline 24 | nodes: 25 | - name: Retriever 26 | inputs: [Query] 27 | - name: Doc2Answers 28 | inputs: [Retriever] 29 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | - name: Retriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore # params can reference other components defined in the YAML 14 | top_k: 5 15 | - name: Reader # custom-name for the component; helpful for visualization & debugging 16 | type: FARMReader # Haystack Class name for the component 17 | params: 18 | model_name_or_path: deepset/roberta-base-squad2 19 | context_window_size: 500 20 | return_no_answer: true 21 | - name: TextFileConverter 22 | type: TextConverter 23 | - name: PDFFileConverter 24 | type: PDFToTextConverter 25 | - name: Preprocessor 26 | type: PreProcessor 27 | params: 28 | split_by: word 29 | split_length: 1000 30 | - name: FileTypeClassifier 31 | type: FileTypeClassifier 32 | 33 | pipelines: 34 | - name: query # a sample extractive-qa Pipeline 35 | nodes: 36 | - name: Retriever 37 | inputs: [Query] 38 | - name: Reader 39 | inputs: [Retriever] 40 | - name: indexing 41 | nodes: 42 | - name: FileTypeClassifier 43 
| inputs: [File] 44 | - name: TextFileConverter 45 | inputs: [FileTypeClassifier.output_1] 46 | - name: PDFFileConverter 47 | inputs: [FileTypeClassifier.output_2] 48 | - name: Preprocessor 49 | inputs: [PDFFileConverter, TextFileConverter] 50 | - name: Retriever 51 | inputs: [Preprocessor] 52 | - name: DocumentStore 53 | inputs: [Retriever] 54 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: FAISSDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents 8 | params: 9 | faiss_index_path: /home/user/data/faiss-index-so.faiss 10 | faiss_config_path: /home/user/data/faiss-index-so.json 11 | - name: Retriever 12 | type: DensePassageRetriever 13 | params: 14 | document_store: DocumentStore # params can reference other components defined in the YAML 15 | top_k: 5 16 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 17 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 18 | max_seq_len_query: 64 19 | max_seq_len_passage: 256 20 | batch_size: 16 21 | embed_title: True 22 | use_fast_tokenizers: True 23 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 24 | type: Docs2Answers 25 | pipelines: 26 | - name: query # a sample extractive-qa Pipeline 27 | nodes: 28 | - name: Retriever 29 | inputs: [Query] 30 | - name: Doc2Answers 31 | inputs: [Retriever] 32 | -------------------------------------------------------------------------------- /rest_api/rest_api/schema.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, List, Optional, Union 4 | import numpy as np 5 | import pandas as pd 6 | 7 | try: 8 | from typing import Literal 9 | except ImportError: 10 | from typing_extensions import Literal # type: ignore 11 | 12 | from pydantic import BaseModel, Field, Extra 13 | from pydantic import BaseConfig 14 | 15 | from haystack.schema import Answer, Document 16 | 17 | 18 | BaseConfig.arbitrary_types_allowed = True 19 | BaseConfig.json_encoders = {np.ndarray: lambda x: x.tolist(), pd.DataFrame: lambda x: x.to_dict(orient="records")} 20 | 21 | 22 | PrimitiveType = Union[str, int, float, bool] 23 | 24 | 25 | class RequestBaseModel(BaseModel): 26 | class Config: 27 | # Forbid any extra fields in the request to avoid silent failures 28 | extra = Extra.forbid 29 | 30 | 31 | class QueryRequest(RequestBaseModel): 32 | query: str 33 | pipeline: str = None 34 | mode: int = 0 35 | params: Optional[dict] = None 36 | debug: Optional[bool] = False 37 | 38 | 39 | class FilterRequest(RequestBaseModel): 40 | filters: Optional[Dict[str, Union[PrimitiveType, List[PrimitiveType], Dict[str, PrimitiveType]]]] = None 41 | 42 | 43 | class CreateLabelSerialized(RequestBaseModel): 44 | id: Optional[str] = None 45 | query: str 46 | document: Document 47 | is_correct_answer: bool 48 | is_correct_document: bool 49 | origin: Literal["user-feedback", "gold-label"] 50 | answer: Optional[Answer] = None 51 | no_answer: Optional[bool] = None 52 | pipeline_id: Optional[str] = None 53 | created_at: Optional[str] = None 54 | updated_at: Optional[str] = None 55 | meta: Optional[dict] = None 56 | filters: Optional[dict] = None 57 | 58 | 59 | class QueryResponse(BaseModel): 60 | query: str 61 | answers: List[Answer] = [] 62 | documents: List[Document] = [] 63 | debug: Optional[Dict] = Field(None, alias="_debug") 64 | 
-------------------------------------------------------------------------------- /rest_api/rest_api/utils.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, APIRouter 2 | from fastapi.routing import APIRoute 3 | from fastapi.openapi.utils import get_openapi 4 | from starlette.middleware.cors import CORSMiddleware 5 | from haystack import __version__ as haystack_version 6 | 7 | from rest_api.pipeline import setup_pipelines 8 | from rest_api.controller.errors.http_error import http_error_handler 9 | 10 | 11 | app = None 12 | pipelines = None 13 | 14 | 15 | def get_app() -> FastAPI: 16 | """ 17 | Initializes the App object and creates the global pipelines as possible. 18 | """ 19 | global app # pylint: disable=global-statement 20 | if app: 21 | return app 22 | 23 | from rest_api.config import ROOT_PATH 24 | 25 | app = FastAPI(title="Haystack REST API", debug=True, version=haystack_version, root_path=ROOT_PATH) 26 | 27 | # Creates the router for the API calls 28 | from rest_api.controller import file_upload, search, feedback, document, health 29 | 30 | router = APIRouter() 31 | router.include_router(search.router, tags=["search"]) 32 | router.include_router(feedback.router, tags=["feedback"]) 33 | router.include_router(file_upload.router, tags=["file-upload"]) 34 | router.include_router(document.router, tags=["document"]) 35 | router.include_router(health.router, tags=["health"]) 36 | 37 | # This middleware enables allow all cross-domain requests to the API from a browser. For production 38 | # deployments, it could be made more restrictive. 
39 | app.add_middleware( 40 | CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] 41 | ) 42 | app.add_exception_handler(HTTPException, http_error_handler) 43 | app.include_router(router) 44 | 45 | # Simplify operation IDs so that generated API clients have simpler function 46 | # names (see https://fastapi.tiangolo.com/advanced/path-operation-advanced-configuration/#using-the-path-operation-function-name-as-the-operationid). 47 | # The operation IDs will be the same as the route names (i.e. the python method names of the endpoints) 48 | # Should be called only after all routes have been added. 49 | for route in app.routes: 50 | if isinstance(route, APIRoute): 51 | route.operation_id = route.name 52 | 53 | return app 54 | 55 | 56 | def get_pipelines(): 57 | global pipelines # pylint: disable=global-statement 58 | if not pipelines: 59 | pipelines = setup_pipelines() 60 | return pipelines 61 | 62 | 63 | def get_openapi_specs() -> dict: 64 | """ 65 | Used to autogenerate OpenAPI specs file to use in the documentation. 
66 | 67 | See `docs/_src/api/openapi/generate_openapi_specs.py` 68 | """ 69 | app = get_app() 70 | return get_openapi( 71 | title=app.title, 72 | version=app.version, 73 | openapi_version=app.openapi_version, 74 | description=app.description, 75 | routes=app.routes, 76 | ) 77 | -------------------------------------------------------------------------------- /rest_api/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/__init__.py -------------------------------------------------------------------------------- /rest_api/test/samples/pdf/sample_pdf_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/samples/pdf/sample_pdf_1.pdf -------------------------------------------------------------------------------- /rest_api/test/samples/pdf/sample_pdf_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/samples/pdf/sample_pdf_2.pdf -------------------------------------------------------------------------------- /rest_api/test/samples/test.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 'ignore' 2 | 3 | components: 4 | - name: TestReader 5 | type: MockReader 6 | - name: TestRetriever 7 | type: MockRetriever 8 | params: 9 | document_store: TestDocumentStore 10 | - name: TestDocumentStore 11 | type: MockDocumentStore 12 | - name: TestPreprocessor 13 | type: PreProcessor 14 | params: 15 | clean_whitespace: true 16 | - name: TestPDFConverter 17 | type: MockPDFToTextConverter 18 | params: 19 | remove_numeric_tables: false 
20 | 21 | 22 | pipelines: 23 | - name: test-query 24 | nodes: 25 | - name: TestRetriever 26 | inputs: [Query] 27 | - name: TestReader 28 | inputs: [TestRetriever] 29 | 30 | - name: test-indexing 31 | nodes: 32 | - name: TestPDFConverter 33 | inputs: [File] 34 | - name: TestPreprocessor 35 | inputs: [TestPDFConverter] 36 | - name: TestDocumentStore 37 | inputs: [TestPreprocessor] -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/__init__.py -------------------------------------------------------------------------------- /test/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | Run the benchmarks with the following command: 4 | 5 | ``` 6 | python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update-json] 7 | ``` 8 | 9 | You can specify which components and processes to benchmark with the following flags. 10 | 11 | **--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set. 12 | 13 | **--retriever_index** will trigger indexing benchmarks 14 | 15 | **--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly) 16 | 17 | **--ci** will cause the the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retriever / Reader / DocStores. 18 | 19 | **--update-json** will cause the script to update the json files in docs/_src/benchmarks so that the website benchmarks will be updated. 
20 | -------------------------------------------------------------------------------- /test/benchmarks/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "params": { 3 | "full": { 4 | "retriever_doc_stores": [ 5 | [ 6 | "elastic", 7 | "elasticsearch" 8 | ], 9 | [ 10 | "elastic", 11 | "opensearch_flat" 12 | ], 13 | [ 14 | "dpr", 15 | "opensearch_flat" 16 | ], 17 | [ 18 | "dpr", 19 | "opensearch_hnsw" 20 | ], 21 | [ 22 | "dpr", 23 | "elasticsearch" 24 | ], 25 | [ 26 | "dpr", 27 | "milvus_flat" 28 | ], 29 | [ 30 | "dpr", 31 | "milvus_hnsw" 32 | ], 33 | [ 34 | "dpr", 35 | "faiss_flat" 36 | ], 37 | [ 38 | "dpr", 39 | "faiss_hnsw" 40 | ], 41 | [ 42 | "sentence_transformers", 43 | "elasticsearch" 44 | ] 45 | ], 46 | "n_docs_options": [ 47 | 1000, 48 | 10000, 49 | 100000, 50 | 500000 51 | ], 52 | "n_queries": null 53 | }, 54 | "ci": { 55 | "retriever_doc_stores": [ 56 | [ 57 | "elastic", 58 | "elasticsearch" 59 | ] 60 | ], 61 | "n_docs_options": [ 62 | 1000 63 | ], 64 | "n_queries": 100 65 | } 66 | }, 67 | "filenames": { 68 | "data_s3_url": "https://ext-haystack-retriever-eval.s3-eu-west-1.amazonaws.com/", 69 | "data_dir": "../../data/retriever/", 70 | "filename_gold": "nq2squad-dev.json", 71 | "filenames_negative": { 72 | "10000": "psgs_w100_minus_gold_10k.tsv", 73 | "100000": "psgs_w100_minus_gold_100k.tsv", 74 | "1000000": "psgs_w100_minus_gold_1m.tsv" 75 | }, 76 | "embeddings_dir": "embeddings/", 77 | "embeddings_filenames": { 78 | "10000": "wikipedia_passages_10k.pkl", 79 | "100000": "wikipedia_passages_100k.pkl", 80 | "1000000": "wikipedia_passages_1m.pkl"} 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test/benchmarks/data_scripts/embeddings_slice.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import json 5 | 6 | n_passages = 1_000_000 7 | 
embeddings_dir = Path("embeddings") 8 | embeddings_filenames = [f"wikipedia_passages_{i}.pkl" for i in range(50)] 9 | neg_passages_filename = "psgs_w100_minus_gold.tsv" 10 | gold_passages_filename = "nq2squad-dev.json" 11 | 12 | # Extract gold passage ids 13 | passage_ids = [] 14 | gold_data = json.load(open(gold_passages_filename))["data"] 15 | for d in gold_data: 16 | for p in d["paragraphs"]: 17 | passage_ids.append(str(p["passage_id"])) 18 | print("gold_ids") 19 | print(len(passage_ids)) 20 | print() 21 | 22 | # Extract neg passage ids 23 | with open(neg_passages_filename) as f: 24 | f.readline() # Ignore column headers 25 | for _ in range(n_passages - len(passage_ids)): 26 | l = f.readline() 27 | passage_ids.append(str(l.split()[0])) 28 | assert len(passage_ids) == len(set(passage_ids)) 29 | assert set([type(x) for x in passage_ids]) == {str} 30 | passage_ids = set(passage_ids) 31 | print("all_ids") 32 | print(len(passage_ids)) 33 | print() 34 | 35 | 36 | # Gather vectors for passages 37 | ret = [] 38 | for ef in tqdm(embeddings_filenames): 39 | curr = pickle.load(open(embeddings_dir / ef, "rb")) 40 | for i, vec in curr: 41 | if i in passage_ids: 42 | ret.append((i, vec)) 43 | print("n_vectors") 44 | print(len(ret)) 45 | print() 46 | 47 | # Write vectors to file 48 | with open(f"wikipedia_passages_{n_passages}.pkl", "wb") as f: 49 | pickle.dump(ret, f) 50 | -------------------------------------------------------------------------------- /test/benchmarks/data_scripts/shuffle_passages.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import time 4 | import random 5 | 6 | random.seed(42) 7 | 8 | lines = [] 9 | with open("psgs_w100_minus_gold_unshuffled.tsv") as f: 10 | f.readline() # Remove column header 11 | lines = [l for l in tqdm(f)] 12 | 13 | tic = time.perf_counter() 14 | random.shuffle(lines) 15 | toc = time.perf_counter() 16 | t = toc - tic 17 | print(t) 18 | with 
open("psgs_w100_minus_gold.tsv", "w") as f: 19 | f.write("id\ttext\title\n") 20 | for l in tqdm(lines): 21 | f.write(l) 22 | -------------------------------------------------------------------------------- /test/benchmarks/distillation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "student_model": { 3 | "model_name_or_path": "roberta-base", 4 | "batch_size": 80 5 | }, 6 | "teacher_model": { 7 | "model_name_or_path": "deepset/roberta-large-squad2", 8 | "batch_size": 512 9 | }, 10 | "distillation_settings": { 11 | "distillation_loss": "kl_div", 12 | "distillation_loss_weight": [0.75, 1], 13 | "temperature": [5, 10] 14 | }, 15 | "training_settings": { 16 | "n_epochs": 2, 17 | "max_seq_len": 384, 18 | "learning_rate": 3e-5 19 | }, 20 | "dataset": "squad2", 21 | "download_folder": "dataset/squad2", 22 | "evaluate_teacher": true, 23 | "evaluate_student_without_distillation": true, 24 | "evaluate_student_with_distillation": true 25 | } -------------------------------------------------------------------------------- /test/benchmarks/reader_results.csv: -------------------------------------------------------------------------------- 1 | ,EM,f1,top_n_accuracy,top_n,reader_time,seconds_per_query,passages_per_second,reader,error 2 | 0,0.7839204449688185,0.8258860575299658,0.9742120343839542,5,98.16358173700064,0.008272676701247315,125.81040525892847,deepset/roberta-base-squad2, 3 | 1,0.7438058317883027,0.7887858491007042,0.9719366256531266,5,47.38258053499885,0.003993138423647299,260.6443097981493,deepset/minilm-uncased-squad2, 4 | 2,0.6947581324793528,0.7431182400443286,0.9557559413450194,5,101.99811779300217,0.008595829916821352,121.08066567525722,deepset/bert-base-cased-squad2, 5 | 3,0.7897353783920446,0.8326306774734308,0.976908815101972,5,292.51886408200517,0.024651851009776266,42.21949937744112,deepset/bert-large-uncased-whole-word-masking-squad2, 6 | 
4,0.8021237148154391,0.8450422699207468,0.974043485589078,5,293.53038741600176,0.024737096529243364,42.07400844838984,deepset/xlm-roberta-large-squad2, 7 | 5,0.3729984830608461,0.4231925844723574,0.9539019046013821,5,55.403011280999635,0.004669055391960192,222.91207128366705,distilbert-base-uncased-distilled-squad, 8 | -------------------------------------------------------------------------------- /test/benchmarks/retriever_index_results.csv: -------------------------------------------------------------------------------- 1 | ,retriever,doc_store,n_docs,indexing_time,docs_per_second,date_time,error 2 | 9,dpr,elasticsearch,10000,139.7465313429998,71.55812673057035,2021-04-12 13:06:34.024778, 3 | 14,elastic,elasticsearch,100000,205.94765839000047,485.56026702003703,2021-04-12 13:44:31.464961, 4 | 8,elastic,elasticsearch,10000,19.96974077699997,500.7576268349683,2021-04-12 13:03:44.944941, 5 | 3,dpr,elasticsearch,1000,14.592372578999857,68.52895199777984,2021-04-12 12:58:01.128834, 6 | 2,elastic,elasticsearch,1000,2.1051091760000418,475.034744706267,2021-04-12 12:57:18.604681, 7 | 15,dpr,elasticsearch,100000,1401.1558383250003,71.36964873196699,2021-04-12 14:08:31.400192, 8 | 20,elastic,elasticsearch,500000,1027.416534557,486.6575368242339,2021-04-12 17:30:22.080196, 9 | 21,dpr,elasticsearch,500000,7010.269106937998,71.32393812174124,2021-04-12 19:28:39.657070, 10 | 4,dpr,faiss_flat,1000,9.570316116999948,104.48975642755202,2021-04-12 12:58:47.918981, 11 | 22,dpr,faiss_flat,500000,5041.962777018001,99.16772933728758,2021-04-12 20:55:28.443354, 12 | 10,dpr,faiss_flat,10000,95.71089355200002,104.48131481049198,2021-04-12 13:08:50.343175, 13 | 16,dpr,faiss_flat,100000,999.8815230299997,100.0118491008456,2021-04-12 14:26:14.495997, 14 | 11,dpr,faiss_hnsw,10000,108.9302881550002,91.80183188142033,2021-04-12 13:11:13.117266, 15 | 17,dpr,faiss_hnsw,100000,1112.2988848330006,89.90389306648807,2021-04-12 14:45:22.644624, 16 | 
23,dpr,faiss_hnsw,500000,5802.5877488399965,86.16845132586847,2021-04-12 22:32:53.095579, 17 | 5,dpr,faiss_hnsw,1000,9.837438108000242,101.65248197970928,2021-04-12 12:59:30.777696, 18 | 0,dpr,milvus_flat,1000,9.717840198999966,102.90352377917338,2021-04-12 12:56:32.363797, 19 | 6,dpr,milvus_flat,10000,87.06480573199997,114.85697252666792,2021-04-12 13:01:21.834327, 20 | 12,dpr,milvus_flat,100000,861.995940363,116.00982709720004,2021-04-12 13:26:00.742197, 21 | 18,dpr,milvus_flat,500000,4364.3841063849995,114.56370195934652,2021-04-12 15:58:40.069278, 22 | 1,dpr,milvus_hnsw,1000,8.522245804999784,117.33996212750934,2021-04-12 12:57:04.976604, 23 | 7,dpr,milvus_hnsw,10000,87.128293364,114.77327988306308,2021-04-12 13:03:13.381764, 24 | 19,dpr,milvus_hnsw,500000,4414.051032668,113.27463056035022,2021-04-12 17:12:50.943619, 25 | 13,dpr,milvus_hnsw,100000,864.9713281529998,115.61076852516385,2021-04-12 13:40:51.875517, 26 | 0,sentence_transformers,elasticsearch,1000,10.380210993000219,96.33715544648746,2021-06-02 08:49:29.922794, 27 | 1,sentence_transformers,elasticsearch,10000,82.89545158599958,120.63388049253265,2021-06-02 08:51:09.796056, 28 | 2,sentence_transformers,elasticsearch,100000,836.6144149759998,119.52937722555106,2021-06-02 09:05:26.454063, 29 | 3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575, 30 | 1,dpr,opensearch_flat,100000,1427.47408267,70.05381128388427,2021-07-22 12:33:02.890691, 31 | 0,elastic,opensearch_flat,100000,207.3902409509992,482.18276588833,2021-07-22 12:08:18.041527, 32 | 2,dpr,opensearch_hnsw,100000,1422.2719023249992,70.31004397719536,2021-07-22 12:57:54.770107, 33 | -------------------------------------------------------------------------------- /test/benchmarks/retriever_query_results.md: -------------------------------------------------------------------------------- 1 | | | retriever | doc_store | n_docs | n_queries | retrieve_time | queries_per_second | seconds_per_query | 
recall | map | top_k | date_time | error | 2 | |---:|:------------|:--------------|---------:|------------:|----------------:|---------------------:|--------------------:|---------:|---------:|--------:|:---------------------------|:--------| 3 | | 1 | dpr | elasticsearch | 1000 | 1064 | 34.6755 | 30.6845 | 0.0325897 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:27:43.048502 | | 4 | | 5 | dpr | elasticsearch | 10000 | 5637 | 288.061 | 19.5688 | 0.0511019 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:37:21.149887 | | 5 | | 9 | dpr | elasticsearch | 100000 | 5637 | 1225.63 | 4.59928 | 0.217425 | 0.957956 | 0.865456 | 10 | 2021-02-01 12:15:52.757320 | | 6 | | 13 | dpr | elasticsearch | 500000 | 5637 | 5339.01 | 1.05581 | 0.947136 | 0.930814 | 0.808614 | 10 | 2021-02-01 14:52:23.056230 | | 7 | | 0 | elastic | elasticsearch | 1000 | 1064 | 4.04654 | 262.941 | 0.00380314 | 0.890977 | 0.742044 | 10 | 2021-02-01 11:26:04.346134 | | 8 | | 4 | elastic | elasticsearch | 10000 | 5637 | 30.7014 | 183.607 | 0.00544641 | 0.81107 | 0.662063 | 10 | 2021-02-01 11:31:20.470092 | | 9 | | 8 | elastic | elasticsearch | 100000 | 5637 | 34.7055 | 162.424 | 0.00615673 | 0.719354 | 0.562596 | 10 | 2021-02-01 11:50:36.048887 | | 10 | | 12 | elastic | elasticsearch | 500000 | 5637 | 68.3838 | 82.4318 | 0.0121312 | 0.627461 | 0.455945 | 10 | 2021-02-01 13:02:16.905187 | | 11 | | 2 | dpr | faiss_flat | 1000 | 1064 | 30.0533 | 35.4038 | 0.0282456 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:28:55.544474 | | 12 | | 6 | dpr | faiss_flat | 10000 | 5637 | 218.594 | 25.7875 | 0.0387785 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:42:07.545869 | | 13 | | 10 | dpr | faiss_flat | 100000 | 5637 | 865.744 | 6.51116 | 0.153582 | 0.957956 | 0.865461 | 10 | 2021-02-01 12:34:29.493598 | | 14 | | 14 | dpr | faiss_flat | 500000 | 5637 | 3717.95 | 1.51616 | 0.659561 | 0.930814 | 0.808614 | 10 | 2021-02-01 16:12:52.804436 | | 15 | | 3 | dpr | faiss_hnsw | 1000 | 1064 | 27.1677 | 39.1641 | 0.0255336 | 0.991541 | 0.929511 | 10 
| 2021-02-01 11:30:02.684535 | | 16 | | 7 | dpr | faiss_hnsw | 10000 | 5637 | 167.552 | 33.6432 | 0.0297237 | 0.972503 | 0.896994 | 10 | 2021-02-01 11:46:07.130588 | | 17 | | 11 | dpr | faiss_hnsw | 100000 | 5637 | 167.482 | 33.6573 | 0.0297112 | 0.940216 | 0.850798 | 10 | 2021-02-01 12:43:21.697968 | | 18 | | 15 | dpr | faiss_hnsw | 500000 | 5637 | 164.456 | 34.2767 | 0.0291743 | 0.882562 | 0.769148 | 10 | 2021-02-01 16:47:01.710072 | | -------------------------------------------------------------------------------- /test/benchmarks/retriever_simplified.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script performs the same query benchmarking as `retriever.py` but with less of the loops that iterate 3 | over all the parameters so that it is easier to inspect what is happening 4 | """ 5 | 6 | 7 | from haystack.document_stores import MilvusDocumentStore, FAISSDocumentStore 8 | from haystack.nodes import DensePassageRetriever 9 | from retriever import prepare_data 10 | import datetime 11 | from pprint import pprint 12 | from milvus import IndexType 13 | from utils import get_document_store 14 | 15 | 16 | def benchmark_querying(index_type, n_docs=100_000, similarity="dot_product"): 17 | 18 | doc_index = "document" 19 | label_index = "label" 20 | 21 | docs, labels = prepare_data( 22 | data_dir="data/", 23 | filename_gold="nq2squad-dev.json", 24 | filename_negative="psgs_w100_minus_gold_100k.tsv", 25 | remote_url="https://ext-haystack-retriever-eval.s3-eu-west-1.amazonaws.com/", 26 | embeddings_filenames=["wikipedia_passages_100k.pkl"], 27 | embeddings_dir="embeddings/", 28 | n_docs=n_docs, 29 | add_precomputed=True, 30 | ) 31 | 32 | doc_store = get_document_store(document_store_type=index_type, similarity=similarity) 33 | 34 | # if index_type == "milvus_flat": 35 | # doc_store = MilvusDocumentStore(index=doc_index, similarity=similarity) 36 | # elif index_type == "milvus_hnsw": 37 | # index_param = {"M": 64, 
"efConstruction": 80} 38 | # search_param = {"ef": 20} 39 | # doc_store = MilvusDocumentStore( 40 | # index=doc_index, 41 | # index_type=IndexType.HNSW, 42 | # index_param=index_param, 43 | # search_param=search_param, 44 | # similarity=similarity 45 | # ) 46 | 47 | doc_store.write_documents(documents=docs, index=doc_index) 48 | doc_store.write_labels(labels=labels, index=label_index) 49 | 50 | retriever = DensePassageRetriever( 51 | document_store=doc_store, 52 | query_embedding_model="facebook/dpr-question_encoder-single-nq-base", 53 | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", 54 | use_gpu=True, 55 | use_fast_tokenizers=True, 56 | ) 57 | 58 | raw_results = retriever.eval(label_index=label_index, doc_index=doc_index) 59 | results = { 60 | "n_queries": raw_results["n_questions"], 61 | "retrieve_time": raw_results["retrieve_time"], 62 | "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"], 63 | "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"], 64 | "recall": raw_results["recall"] * 100, 65 | "map": raw_results["map"] * 100, 66 | "top_k": raw_results["top_k"], 67 | "date_time": datetime.datetime.now(), 68 | "error": None, 69 | } 70 | 71 | pprint(results) 72 | 73 | doc_store.delete_all_documents(index=doc_index) 74 | doc_store.delete_all_documents(index=label_index) 75 | 76 | 77 | if __name__ == "__main__": 78 | similarity = "l2" 79 | n_docs = 1000 80 | 81 | benchmark_querying(index_type="milvus_flat", similarity=similarity, n_docs=n_docs) 82 | benchmark_querying(index_type="milvus_hnsw", similarity=similarity, n_docs=n_docs) 83 | benchmark_querying(index_type="faiss_flat", similarity=similarity, n_docs=n_docs) 84 | benchmark_querying(index_type="faiss_hnsw", similarity=similarity, n_docs=n_docs) 85 | -------------------------------------------------------------------------------- /test/benchmarks/run.py: -------------------------------------------------------------------------------- 1 
| # The benchmarks use 2 | # - a variant of the Natural Questions Dataset (https://ai.google.com/research/NaturalQuestions) from Google Research 3 | # licensed under CC BY-SA 3.0 (https://creativecommons.org/licenses/by-sa/3.0/) 4 | # - the SQuAD 2.0 Dataset (https://rajpurkar.github.io/SQuAD-explorer/) from Rajpurkar et al. 5 | # licensed under CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0/legalcode) 6 | 7 | from retriever import benchmark_indexing, benchmark_querying 8 | from reader import benchmark_reader 9 | from utils import load_config 10 | import argparse 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument("--reader", default=False, action="store_true", help="Perform Reader benchmarks") 16 | parser.add_argument( 17 | "--retriever_index", default=False, action="store_true", help="Perform Retriever indexing benchmarks" 18 | ) 19 | parser.add_argument( 20 | "--retriever_query", default=False, action="store_true", help="Perform Retriever querying benchmarks" 21 | ) 22 | parser.add_argument( 23 | "--ci", default=False, action="store_true", help="Perform a smaller subset of benchmarks that are quicker to run" 24 | ) 25 | parser.add_argument( 26 | "--update_json", 27 | default=False, 28 | action="store_true", 29 | help="Update the json file with the results of this run so that the website can be updated", 30 | ) 31 | parser.add_argument( 32 | "--save_markdown", 33 | default=False, 34 | action="store_true", 35 | help="Update the json file with the results of this run so that the website can be updated", 36 | ) 37 | args = parser.parse_args() 38 | 39 | # load config 40 | params, filenames = load_config(config_filename="config.json", ci=args.ci) 41 | 42 | if args.retriever_index: 43 | benchmark_indexing( 44 | **params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown 45 | ) 46 | if args.retriever_query: 47 | benchmark_querying( 48 | **params, **filenames, ci=args.ci, 
update_json=args.update_json, save_markdown=args.save_markdown 49 | ) 50 | if args.reader: 51 | benchmark_reader(**params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown) 52 | -------------------------------------------------------------------------------- /test/document_stores/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/document_stores/__init__.py -------------------------------------------------------------------------------- /test/document_stores/test_knowledge_graph.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from haystack.nodes import Text2SparqlRetriever 6 | from haystack.document_stores import GraphDBKnowledgeGraph 7 | from haystack.utils import fetch_archive_from_http 8 | 9 | 10 | @pytest.mark.graphdb 11 | def test_graph_retrieval(): 12 | # TODO rename doc_dir 13 | graph_dir = "../data/tutorial10_knowledge_graph/" 14 | s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip" 15 | fetch_archive_from_http(url=s3_url, output_dir=graph_dir) 16 | 17 | # Fetch a pre-trained BART model that translates natural language questions to SPARQL queries 18 | model_dir = "../saved_models/tutorial10_knowledge_graph/" 19 | s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" 20 | fetch_archive_from_http(url=s3_url, output_dir=model_dir) 21 | 22 | kg = GraphDBKnowledgeGraph(index="tutorial_10_index") 23 | kg.delete_index() 24 | kg.create_index(config_path=Path(graph_dir + "repo-config.ttl")) 25 | kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) 26 | triple = { 27 | "p": {"type": "uri", "value": "https://deepset.ai/harry_potter/_paternalgrandfather"}, 28 | "s": {"type": "uri", "value": 
"https://deepset.ai/harry_potter/Melody_fawley"}, 29 | "o": {"type": "uri", "value": "https://deepset.ai/harry_potter/Marshall_fawley"}, 30 | } 31 | triples = kg.get_all_triples() 32 | assert len(triples) > 0 33 | assert triple in triples 34 | 35 | # Define prefixes for names of resources so that we can use shorter resource names in queries 36 | prefixes = """PREFIX rdf: 37 | PREFIX xsd: 38 | PREFIX hp: 39 | """ 40 | kg.prefixes = prefixes 41 | 42 | kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4") 43 | 44 | result = kgqa_retriever.retrieve(query="In which house is Harry Potter?") 45 | assert result[0] == { 46 | "answer": ["https://deepset.ai/harry_potter/Gryffindor"], 47 | "prediction_meta": { 48 | "model": "Text2SparqlRetriever", 49 | "sparql_query": "select ?a { hp:Harry_potter hp:house ?a . }", 50 | }, 51 | } 52 | 53 | result = kgqa_retriever._query_kg( 54 | sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }" 55 | ) 56 | assert result[0][0] == "https://deepset.ai/harry_potter/Rubeus_hagrid" 57 | 58 | result = kgqa_retriever._query_kg( 59 | sparql_query="select distinct ?obj where { ?obj . 
}" 60 | ) 61 | assert result[0][0] == "https://deepset.ai/harry_potter/Otter" 62 | -------------------------------------------------------------------------------- /test/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/modeling/__init__.py -------------------------------------------------------------------------------- /test/modeling/test_modeling_inference.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("multiprocessing_chunksize", [None, 2]) 5 | @pytest.mark.parametrize("num_processes", [2, 0, None], scope="module") 6 | def test_qa_format_and_results(adaptive_model_qa, multiprocessing_chunksize): 7 | qa_inputs_dicts = [ 8 | { 9 | "questions": ["In what country is Normandy"], 10 | "text": "The Normans are an ethnic group that arose in Normandy, a northern region " 11 | "of France, from contact between Viking settlers and indigenous Franks and Gallo-Romans", 12 | }, 13 | { 14 | "questions": ["Who counted the game among the best ever made?"], 15 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received " 16 | "perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic " 17 | "Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings " 18 | "and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores " 19 | "of 95% and 96 for the GameCube version. 
GameTrailers in their review called it one of the " 20 | "greatest games ever created.", 21 | }, 22 | ] 23 | ground_truths = ["France", "GameTrailers"] 24 | 25 | results = adaptive_model_qa.inference_from_dicts( 26 | dicts=qa_inputs_dicts, multiprocessing_chunksize=multiprocessing_chunksize 27 | ) 28 | # sample results 29 | # [ 30 | # { 31 | # "task": "qa", 32 | # "predictions": [ 33 | # { 34 | # "question": "In what country is Normandy", 35 | # "question_id": "None", 36 | # "ground_truth": None, 37 | # "answers": [ 38 | # { 39 | # "score": 1.1272038221359253, 40 | # "probability": -1, 41 | # "answer": "France", 42 | # "offset_answer_start": 54, 43 | # "offset_answer_end": 60, 44 | # "context": "The Normans gave their name to Normandy, a region in France.", 45 | # "offset_context_start": 0, 46 | # "offset_context_end": 60, 47 | # "document_id": None, 48 | # } 49 | # ] 50 | # } 51 | # ], 52 | # } 53 | # ] 54 | predictions = list(results)[0]["predictions"] 55 | 56 | for prediction, ground_truth, qa_input_dict in zip(predictions, ground_truths, qa_inputs_dicts): 57 | assert prediction["question"] == qa_input_dict["questions"][0] 58 | answer = prediction["answers"][0] 59 | assert answer["answer"] in answer["context"] 60 | assert answer["answer"] == ground_truth 61 | assert { 62 | "answer", 63 | "score", 64 | "probability", 65 | "offset_answer_start", 66 | "offset_answer_end", 67 | "context", 68 | "offset_context_start", 69 | "offset_context_end", 70 | "document_id", 71 | } == answer.keys() 72 | 73 | 74 | if __name__ == "__main__": 75 | test_qa_format_and_results() 76 | -------------------------------------------------------------------------------- /test/modeling/test_modeling_prediction_head.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from haystack.modeling.model.adaptive_model import AdaptiveModel 4 | from haystack.modeling.model.language_model import LanguageModel 5 | from 
haystack.modeling.model.prediction_head import QuestionAnsweringHead 6 | from haystack.modeling.utils import set_all_seeds, initialize_device_settings 7 | 8 | 9 | def test_prediction_head_load_save(tmp_path, caplog=None): 10 | if caplog: 11 | caplog.set_level(logging.CRITICAL) 12 | 13 | set_all_seeds(seed=42) 14 | devices, n_gpu = initialize_device_settings(use_cuda=False) 15 | lang_model = "bert-base-german-cased" 16 | 17 | language_model = LanguageModel.load(lang_model) 18 | prediction_head = QuestionAnsweringHead() 19 | 20 | model = AdaptiveModel( 21 | language_model=language_model, 22 | prediction_heads=[prediction_head], 23 | embeds_dropout_prob=0.1, 24 | lm_output_types=["per_sequence"], 25 | device=devices[0], 26 | ) 27 | 28 | model.save(tmp_path) 29 | model_loaded = AdaptiveModel.load(tmp_path, device="cpu") 30 | assert model_loaded is not None 31 | -------------------------------------------------------------------------------- /test/modeling/test_modeling_processor_saving_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from haystack.modeling.data_handler.processor import SquadProcessor 5 | from haystack.modeling.model.tokenization import Tokenizer 6 | from haystack.modeling.utils import set_all_seeds 7 | import torch 8 | 9 | from ..conftest import SAMPLES_PATH 10 | 11 | 12 | def test_processor_saving_loading(tmp_path, caplog): 13 | if caplog is not None: 14 | caplog.set_level(logging.CRITICAL) 15 | 16 | set_all_seeds(seed=42) 17 | lang_model = "roberta-base" 18 | 19 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) 20 | 21 | processor = SquadProcessor( 22 | tokenizer=tokenizer, 23 | max_seq_len=256, 24 | label_list=["start_token", "end_token"], 25 | train_filename="train-sample.json", 26 | dev_filename="dev-sample.json", 27 | test_filename=None, 28 | data_dir=SAMPLES_PATH / "qa", 29 | ) 30 | 31 | dicts = 
processor.file_to_dicts(file=SAMPLES_PATH / "qa" / "dev-sample.json") 32 | data, tensor_names, _ = processor.dataset_from_dicts(dicts=dicts, indices=[1]) 33 | 34 | save_dir = tmp_path / Path("testsave/processor") 35 | processor.save(save_dir) 36 | 37 | processor = processor.load_from_dir(save_dir) 38 | dicts = processor.file_to_dicts(file=SAMPLES_PATH / "qa" / "dev-sample.json") 39 | data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts, indices=[1]) 40 | 41 | assert tensor_names == tensor_names_loaded 42 | for i in range(len(data.tensors)): 43 | assert torch.all(torch.eq(data.tensors[i], data_loaded.tensors[i])) 44 | 45 | 46 | if __name__ == "__main__": 47 | test_processor_saving_loading(None) 48 | -------------------------------------------------------------------------------- /test/nodes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/nodes/__init__.py -------------------------------------------------------------------------------- /test/nodes/test_label_generator.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator 6 | from test.conftest import DOCS_WITH_EMBEDDINGS 7 | 8 | 9 | @pytest.mark.slow 10 | @pytest.mark.generator 11 | @pytest.mark.parametrize("document_store", ["memory"], indirect=True) 12 | @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) 13 | def test_pseudo_label_generator( 14 | document_store, retriever: EmbeddingRetriever, question_generator: QuestionGenerator, tmp_path: Path 15 | ): 16 | document_store.write_documents(DOCS_WITH_EMBEDDINGS) 17 | psg = PseudoLabelGenerator(question_generator, retriever) 18 | train_examples = [] 19 | for idx, doc in 
enumerate(document_store): 20 | output, stream = psg.run(documents=[doc]) 21 | assert "gpl_labels" in output 22 | for item in output["gpl_labels"]: 23 | assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item 24 | train_examples.append(item) 25 | 26 | assert len(train_examples) > 0 27 | retriever.train(train_examples) 28 | retriever.save(tmp_path) 29 | 30 | 31 | @pytest.mark.slow 32 | @pytest.mark.generator 33 | @pytest.mark.parametrize("document_store", ["memory"], indirect=True) 34 | @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) 35 | def test_pseudo_label_generator_using_question_document_pairs( 36 | document_store, retriever: EmbeddingRetriever, tmp_path: Path 37 | ): 38 | document_store.write_documents(DOCS_WITH_EMBEDDINGS) 39 | docs = [ 40 | { 41 | "question": "What is the capital of Germany?", 42 | "document": "Berlin is the capital and largest city of Germany by both area and population.", 43 | }, 44 | { 45 | "question": "What is the largest city in Germany by population and area?", 46 | "document": "Berlin is the capital and largest city of Germany by both area and population.", 47 | }, 48 | ] 49 | psg = PseudoLabelGenerator(docs, retriever) 50 | train_examples = [] 51 | for idx, doc in enumerate(document_store): 52 | # the documents passed here are ignored as we provided source documents in the constructor 53 | output, stream = psg.run(documents=[doc]) 54 | assert "gpl_labels" in output 55 | for item in output["gpl_labels"]: 56 | assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item 57 | train_examples.append(item) 58 | 59 | assert len(train_examples) > 0 60 | 61 | retriever.train(train_examples) 62 | retriever.save(tmp_path) 63 | -------------------------------------------------------------------------------- /test/nodes/test_question_generator.py: -------------------------------------------------------------------------------- 1 | from haystack.pipelines 
import ( 2 | QuestionAnswerGenerationPipeline, 3 | QuestionGenerationPipeline, 4 | RetrieverQuestionGenerationPipeline, 5 | ) 6 | from haystack.schema import Document 7 | import pytest 8 | 9 | 10 | text = 'The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United Kingdom. The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly and flat out rock) with great success, The Living End has managed to produce anthemic choruses and memorable songs in abundance".' 
11 | document = Document(content=text) 12 | query = "Living End" 13 | 14 | 15 | def test_qg_pipeline(question_generator): 16 | p = QuestionGenerationPipeline(question_generator) 17 | result = p.run(documents=[document]) 18 | keys = list(result) 19 | assert "generated_questions" in keys 20 | assert len(result["generated_questions"][0]["questions"]) > 0 21 | 22 | 23 | @pytest.mark.parametrize("retriever,document_store", [("tfidf", "memory")], indirect=True) 24 | def test_rqg_pipeline(question_generator, retriever): 25 | retriever.document_store.write_documents([document]) 26 | retriever.fit() 27 | p = RetrieverQuestionGenerationPipeline(retriever, question_generator) 28 | result = p.run(query) 29 | keys = list(result) 30 | assert "generated_questions" in keys 31 | assert len(result["generated_questions"][0]["questions"]) > 0 32 | 33 | 34 | @pytest.mark.parametrize("reader", ["farm"], indirect=True) 35 | def test_qag_pipeline(question_generator, reader): 36 | p = QuestionAnswerGenerationPipeline(question_generator, reader) 37 | results = p.run(documents=[document]) 38 | assert "queries" in results 39 | assert "answers" in results 40 | assert len(results["queries"]) == len(results["answers"]) 41 | assert len(results["answers"]) > 0 42 | assert results["answers"][0][0].answer is not None 43 | -------------------------------------------------------------------------------- /test/nodes/test_summarizer_translation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from haystack.pipelines import TranslationWrapperPipeline, SearchSummarizationPipeline 4 | from haystack.nodes import DensePassageRetriever, EmbeddingRetriever 5 | from .test_summarizer import SPLIT_DOCS 6 | 7 | # Keeping few (retriever,document_store) combination to reduce test time 8 | @pytest.mark.slow 9 | @pytest.mark.elasticsearch 10 | @pytest.mark.summarizer 11 | @pytest.mark.parametrize( 12 | "retriever,document_store", [("embedding", "memory"), 
("elasticsearch", "elasticsearch")], indirect=True 13 | ) 14 | def test_summarization_pipeline_with_translator( 15 | document_store, retriever, summarizer, en_to_de_translator, de_to_en_translator 16 | ): 17 | document_store.write_documents(SPLIT_DOCS) 18 | 19 | if isinstance(retriever, EmbeddingRetriever) or isinstance(retriever, DensePassageRetriever): 20 | document_store.update_embeddings(retriever=retriever) 21 | 22 | query = "Wo steht der Eiffelturm?" 23 | base_pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer) 24 | pipeline = TranslationWrapperPipeline( 25 | input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline 26 | ) 27 | output = pipeline.run( 28 | query=query, params={"Retriever": {"top_k": 2}, "Summarizer": {"generate_single_summary": True}} 29 | ) 30 | # SearchSummarizationPipeline return answers but Summarizer return documents 31 | documents = output["documents"] 32 | assert len(documents) == 1 33 | assert documents[0].content in [ 34 | "Der Eiffelturm ist ein Wahrzeichen in Paris, Frankreich.", 35 | "Der Eiffelturm, der 1889 in Paris, Frankreich, erbaut wurde, ist das höchste freistehende Bauwerk der Welt.", 36 | ] 37 | -------------------------------------------------------------------------------- /test/nodes/test_translator.py: -------------------------------------------------------------------------------- 1 | from haystack.schema import Document 2 | 3 | import pytest 4 | 5 | EXPECTED_OUTPUT = "Ich lebe in Berlin" 6 | INPUT = "I live in Berlin" 7 | 8 | 9 | def test_translator_with_query(en_to_de_translator): 10 | assert en_to_de_translator.translate(query=INPUT) == EXPECTED_OUTPUT 11 | 12 | 13 | def test_translator_with_list(en_to_de_translator): 14 | assert en_to_de_translator.translate(documents=[INPUT])[0] == EXPECTED_OUTPUT 15 | 16 | 17 | def test_translator_with_document(en_to_de_translator): 18 | assert 
en_to_de_translator.translate(documents=[Document(content=INPUT)])[0].content == EXPECTED_OUTPUT 19 | 20 | 21 | def test_translator_with_dictionary(en_to_de_translator): 22 | assert en_to_de_translator.translate(documents=[{"content": INPUT}])[0]["content"] == EXPECTED_OUTPUT 23 | 24 | 25 | def test_translator_with_dictionary_with_dict_key(en_to_de_translator): 26 | assert en_to_de_translator.translate(documents=[{"key": INPUT}], dict_key="key")[0]["key"] == EXPECTED_OUTPUT 27 | 28 | 29 | def test_translator_with_empty_input(en_to_de_translator): 30 | with pytest.raises(AttributeError): 31 | en_to_de_translator.translate() 32 | 33 | 34 | def test_translator_with_query_and_documents(en_to_de_translator): 35 | with pytest.raises(AttributeError): 36 | en_to_de_translator.translate(query=INPUT, documents=[INPUT]) 37 | 38 | 39 | def test_translator_with_dict_without_text_key(en_to_de_translator): 40 | with pytest.raises(AttributeError): 41 | en_to_de_translator.translate(documents=[{"text1": INPUT}]) 42 | 43 | 44 | def test_translator_with_dict_with_non_string_value(en_to_de_translator): 45 | with pytest.raises(AttributeError): 46 | en_to_de_translator.translate(documents=[{"text": 123}]) 47 | -------------------------------------------------------------------------------- /test/others/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/others/__init__.py -------------------------------------------------------------------------------- /test/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/pipelines/__init__.py -------------------------------------------------------------------------------- /test/pipelines/test_ray.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import ray 5 | 6 | from haystack.pipelines import RayPipeline 7 | 8 | from ..conftest import SAMPLES_PATH 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def shutdown_ray(): 13 | yield 14 | try: 15 | import ray 16 | 17 | ray.shutdown() 18 | except: 19 | pass 20 | 21 | 22 | @pytest.mark.integration 23 | @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) 24 | def test_load_pipeline(document_store_with_docs): 25 | pipeline = RayPipeline.load_from_yaml( 26 | SAMPLES_PATH / "pipeline" / "ray.haystack-pipeline.yml", 27 | pipeline_name="ray_query_pipeline", 28 | ray_args={"num_cpus": 8}, 29 | ) 30 | prediction = pipeline.run(query="Who lives in Berlin?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}) 31 | 32 | assert ray.serve.get_deployment(name="ESRetriever").num_replicas == 2 33 | assert ray.serve.get_deployment(name="Reader").num_replicas == 1 34 | assert prediction["query"] == "Who lives in Berlin?" 
35 | assert prediction["answers"][0].answer == "Carla" 36 | -------------------------------------------------------------------------------- /test/samples/dc/matching_test_1.csv: -------------------------------------------------------------------------------- 1 | query,text,context,file_name,answer_start,answer_end 2 | "What are Primitives?","These are classes that carry data through the system.","# Primitives\n\nIn Haystack, there are a handful of core classes that are regularly used in many different places.\nThese are classes that carry data through the system.\nUsers will likely interact with these as either the input or output of their pipeline.\n\n## Document\n\nThe Document class contains all the information regarding the contents of a document,\nincluding its id and metadata.\nIt may also contain information created in the pipeline including the confidence ","sample_pdf_1.pdf",113,166 3 | -------------------------------------------------------------------------------- /test/samples/dc/pipeline_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "master", 3 | "name": "document_retrieval_1", 4 | "components": [ 5 | { 6 | "name": "DocumentStore", 7 | "type": "DeepsetCloudDocumentStore", 8 | "params": { 9 | "similarity": "cosine" 10 | } 11 | }, 12 | { 13 | "name": "Retriever", 14 | "type": "BM25Retriever", 15 | "params": { 16 | "document_store": "DocumentStore", 17 | "top_k": 5 18 | } 19 | }, 20 | { 21 | "name": "Reader", 22 | "type": "FARMReader", 23 | "params": { 24 | "model_name_or_path": "deepset/minilm-uncased-squad2" 25 | } 26 | }, 27 | { 28 | "name": "TextFileConverter", 29 | "type": "TextConverter" 30 | }, 31 | { 32 | "name": "Preprocessor", 33 | "type": "PreProcessor", 34 | "params": { 35 | "split_by": "word", 36 | "split_length": 1000 37 | } 38 | } 39 | ], 40 | "pipelines": [ 41 | { 42 | "name": "query", 43 | "nodes": [ 44 | { 45 | "name": "Retriever", 46 | "inputs": [ 47 | "Query" 48 | ] 
49 | } 50 | ] 51 | }, 52 | { 53 | "name": "indexing", 54 | "nodes": [ 55 | { 56 | "name": "TextFileConverter", 57 | "inputs": [ 58 | "File" 59 | ] 60 | }, 61 | { 62 | "name": "Preprocessor", 63 | "inputs": [ 64 | "TextFileConverter" 65 | ] 66 | }, 67 | { 68 | "name": "Retriever", 69 | "inputs": [ 70 | "Preprocessor" 71 | ] 72 | }, 73 | { 74 | "name": "DocumentStore", 75 | "inputs": [ 76 | "Retriever" 77 | ] 78 | } 79 | ] 80 | } 81 | ] 82 | } -------------------------------------------------------------------------------- /test/samples/docs/doc_1.txt: -------------------------------------------------------------------------------- 1 | Some text for testing. 2 | Two lines in here. -------------------------------------------------------------------------------- /test/samples/docs/doc_2.txt: -------------------------------------------------------------------------------- 1 | A Doc specifically talking about haystack. 2 | Haystack can be used to scale QA models to large document collections. 
-------------------------------------------------------------------------------- /test/samples/docx/sample_docx.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/docx/sample_docx.docx -------------------------------------------------------------------------------- /test/samples/extensionless_files/docx_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/docx_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/gif_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/gif_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/html_file: -------------------------------------------------------------------------------- 1 | 2 | sample -------------------------------------------------------------------------------- /test/samples/extensionless_files/jpg_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/jpg_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/mp3_file: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/mp3_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/odt_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/odt_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/pdf_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/pdf_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/png_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/png_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/pptx_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/pptx_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/txt_file: -------------------------------------------------------------------------------- 1 | Sample -------------------------------------------------------------------------------- /test/samples/extensionless_files/wav_file: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/wav_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/zip_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/zip_file -------------------------------------------------------------------------------- /test/samples/pdf/sample_pdf_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/pdf/sample_pdf_1.pdf -------------------------------------------------------------------------------- /test/samples/pdf/sample_pdf_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/pdf/sample_pdf_2.pdf -------------------------------------------------------------------------------- /test/samples/pipeline/ray.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: ignore 2 | extras: ray 3 | 4 | components: 5 | - name: DocumentStore 6 | type: ElasticsearchDocumentStore 7 | params: 8 | index: haystack_test 9 | label_index: haystack_test_label 10 | - name: ESRetriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore 14 | - name: Reader 15 | type: FARMReader 16 | params: 17 | no_ans_boost: -10 18 | model_name_or_path: deepset/roberta-base-squad2 19 | num_processes: 0 20 | - name: PDFConverter 21 | type: PDFToTextConverter 
22 | params: 23 | remove_numeric_tables: false 24 | - name: Preprocessor 25 | type: PreProcessor 26 | params: 27 | clean_whitespace: true 28 | - name: IndexTimeDocumentClassifier 29 | type: TransformersDocumentClassifier 30 | params: 31 | batch_size: 16 32 | use_gpu: false 33 | - name: QueryTimeDocumentClassifier 34 | type: TransformersDocumentClassifier 35 | params: 36 | use_gpu: false 37 | 38 | 39 | pipelines: 40 | - name: ray_query_pipeline 41 | nodes: 42 | - name: ESRetriever 43 | replicas: 2 44 | inputs: [ Query ] 45 | - name: Reader 46 | inputs: [ ESRetriever ] 47 | -------------------------------------------------------------------------------- /test/samples/pipeline/test.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: ignore 2 | 3 | components: 4 | - name: Reader 5 | type: FARMReader 6 | params: 7 | no_ans_boost: -10 8 | model_name_or_path: deepset/roberta-base-squad2 9 | num_processes: 0 10 | - name: ESRetriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore 14 | - name: DocumentStore 15 | type: ElasticsearchDocumentStore 16 | params: 17 | index: haystack_test 18 | label_index: haystack_test_label 19 | - name: PDFConverter 20 | type: PDFToTextConverter 21 | params: 22 | remove_numeric_tables: false 23 | - name: TextConverter 24 | type: TextConverter 25 | - name: Preprocessor 26 | type: PreProcessor 27 | params: 28 | clean_whitespace: true 29 | - name: IndexTimeDocumentClassifier 30 | type: TransformersDocumentClassifier 31 | params: 32 | batch_size: 16 33 | use_gpu: false 34 | - name: QueryTimeDocumentClassifier 35 | type: TransformersDocumentClassifier 36 | params: 37 | use_gpu: false 38 | 39 | 40 | pipelines: 41 | - name: query_pipeline 42 | nodes: 43 | - name: ESRetriever 44 | inputs: [Query] 45 | - name: Reader 46 | inputs: [ESRetriever] 47 | 48 | - name: query_pipeline_with_document_classifier 49 | nodes: 50 | - name: ESRetriever 51 | inputs: [Query] 52 | - 
name: QueryTimeDocumentClassifier 53 | inputs: [ESRetriever] 54 | - name: Reader 55 | inputs: [QueryTimeDocumentClassifier] 56 | 57 | - name: indexing_pipeline 58 | nodes: 59 | - name: PDFConverter 60 | inputs: [File] 61 | - name: Preprocessor 62 | inputs: [PDFConverter] 63 | - name: ESRetriever 64 | inputs: [Preprocessor] 65 | - name: DocumentStore 66 | inputs: [ESRetriever] 67 | 68 | - name: indexing_text_pipeline 69 | nodes: 70 | - name: TextConverter 71 | inputs: [File] 72 | - name: Preprocessor 73 | inputs: [TextConverter] 74 | - name: ESRetriever 75 | inputs: [Preprocessor] 76 | - name: DocumentStore 77 | inputs: [ESRetriever] 78 | 79 | - name: indexing_pipeline_with_classifier 80 | nodes: 81 | - name: PDFConverter 82 | inputs: [File] 83 | - name: Preprocessor 84 | inputs: [PDFConverter] 85 | - name: IndexTimeDocumentClassifier 86 | inputs: [Preprocessor] 87 | - name: ESRetriever 88 | inputs: [IndexTimeDocumentClassifier] 89 | - name: DocumentStore 90 | inputs: [ESRetriever] 91 | -------------------------------------------------------------------------------- /test/samples/qa/answer-offset-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 0}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/answer-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": 
"5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "11", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/dev-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 53}], "is_impossible": false}], "context": "The Normans gave their name to Normandy, a region in France."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/eval-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c6", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | }, 19 | { 20 | "title": "Test2", 21 | "paragraphs": [ 22 | { 23 | "context": "Berlin has 10 inhabitants.", 24 | "qas": [ 25 | { 26 | "question": "How many people live in Berlin?", 27 | "id": "5ad3d560604f3c001a3ff2c7", 28 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 29 | "is_impossible": false 30 | }, 31 | { 32 | "question": "How many people live in Berlin?", 33 | "id": "5ad3d560604f3c001a3ff2c8", 34 | "answers": [{"text": "Berlin", "answer_start": 0}, {"text": "Berlin", "answer_start": 0}], 35 | "is_impossible": false 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | ], 42 | "version": "v2.0" 43 | } -------------------------------------------------------------------------------- /test/samples/qa/noanswer.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/train-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 159}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. 
They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/vanilla.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/squad/tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "test1", 5 | "paragraphs": [ 6 | { 7 | "context": "My name is Carla and I live together with Abdul in Berlin", 8 | "qas": [ 9 | { 10 | "answers": [ 11 | { 12 | "answer_start": 11, 13 | "text": "Carla" 14 | }, 15 | { 16 | "answer_start": 42, 17 | "text": "Abdul" 18 | }, 19 | { 20 | "answer_start": 11, 21 | "text": "Carla and I live together with Abdul" 22 | } 23 | ], 24 | "id": 7211011040021040393, 25 | "question": "Who lives in Berlin?", 26 | "is_impossible": false 27 | } 28 | ] 29 | } 30 | ] 31 | }, 32 | { 33 | "title": "test2", 34 | "paragraphs": [ 35 | { 36 | "context": "This is another test context", 37 | "qas": [ 38 | { 39 | "answers": [ 40 | { 41 | "answer_start": 0, 42 | "text": "This" 43 | }, 44 | { 45 | "answer_start": 5, 46 | "text": "is" 47 | } 48 | ], 49 | "id": -5782547119306399562, 50 | "question": "The model can't answer this", 51 | "is_impossible": false 52 | } 53 | ] 54 | } 55 | ] 56 | } 57 | ] 
58 | } -------------------------------------------------------------------------------- /test/samples/squad/tiny_passages.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "test1", 5 | "paragraphs": [ 6 | { 7 | "context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.", 8 | "qas": [ 9 | { 10 | "answers": [ 11 | { 12 | "answer_start": 11, 13 | "text": "Carla" 14 | }, 15 | { 16 | "answer_start": 42, 17 | "text": "Abdul" 18 | }, 19 | { 20 | "answer_start": 89, 21 | "text": "Leila" 22 | } 23 | ], 24 | "id": 7211011040021040393, 25 | "question": "Who lives in Berlin?", 26 | "is_impossible": false 27 | } 28 | ] 29 | } 30 | ] 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | # RUN apt-get update && apt-get install -y curl git pkg-config cmake 4 | 5 | # copy code 6 | COPY . /ui 7 | 8 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ 9 | python3 \ 10 | python3-pip 11 | 12 | # install as a package 13 | RUN pip install --upgrade pip && \ 14 | pip install /ui/ \ 15 | pip install pyyaml 16 | 17 | RUN ln -s /usr/bin/python3.8 /usr/bin/python 18 | WORKDIR /ui 19 | EXPOSE 8501 20 | 21 | # cmd for running the API 22 | CMD ["python", "-m", "streamlit", "run", "ui/webapp.py"] 23 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | ## Demo UI 2 | 3 | This is a minimal UI that can spin up to test Haystack for your prototypes. It's based on streamlit and is very easy to extend for your purposes. 
4 | 5 | ![Screenshot](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/streamlit_ui_screenshot.png) 6 | 7 | ## Usage 8 | 9 | ### Get started with Haystack 10 | 11 | The UI interacts with the Haystack REST API. To get started with Haystack please visit the [README](https://github.com/deepset-ai/haystack/tree/main#key-components) or checko out our [tutorials](https://haystack.deepset.ai/tutorials/first-qa-system). 12 | 13 | ### Option 1: Local 14 | 15 | Execute in this folder: 16 | ``` 17 | streamlit run ui/webapp.py 18 | ``` 19 | 20 | Requirements: This expects a running Haystack REST API at `http://localhost:8000` 21 | 22 | ### Option 2: Container 23 | 24 | Just run 25 | ``` 26 | docker-compose up -d 27 | ``` 28 | in the root folder of the Haystack repository. This will start three containers (Elasticsearch, Haystack API, Haystack UI). 29 | You can find the UI at `http://localhost:8501` 30 | 31 | ## Evaluation Mode 32 | 33 | The evaluation mode leverages the feedback REST API endpoint of haystack. The user has the options "Wrong answer", "Wrong answer and wrong passage" and "Wrong answer and wrong passage" to give feedback. 34 | 35 | In order to use the UI in evaluation mode, you need an ElasticSearch instance with pre-indexed files and the Haystack REST API. You can set the environment up via docker images. For ElasticSearch, you can check out our [documentation](https://haystack.deepset.ai/usage/document-store#initialisation) and for setting up the REST API this [link](https://github.com/deepset-ai/haystack/blob/main/README.md#7-rest-api). 36 | 37 | To enter the evaluation mode, select the checkbox "Evaluation mode" in the sidebar. The UI will load the predefined questions from the file [`eval_labels_examples`](https://raw.githubusercontent.com/deepset-ai/haystack/main/ui/ui/eval_labels_example.csv). The file needs to be prefilled with your data. 
This way, the user will get a random question from the set and can give his feedback with the buttons below the questions. To load a new question, click the button "Get random question". 38 | 39 | The file just needs to have two columns separated by semicolon. You can add more columns but the UI will ignore them. Every line represents a questions answer pair. The columns with the questions needs to be named “Question Text” and the answer column “Answer” so that they can be loaded correctly. Currently, the easiest way to create the file is manually by adding question answer pairs. 40 | 41 | The feedback can be exported with the API endpoint `export-doc-qa-feedback`. To learn more about finetuning a model with user feedback, please check out our [docs](https://haystack.deepset.ai/usage/domain-adaptation#user-feedback). 42 | 43 | ![Screenshot](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/streamlit_ui_screenshot_eval_mode.png) -------------------------------------------------------------------------------- /ui/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "ui" 7 | description = 'Minimal UI for Haystack (https://github.com/deepset-ai/haystack)' 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | license = "Apache-2.0" 11 | keywords = [] 12 | authors = [ 13 | { name = "deepset.ai", email = "malte.pietsch@deepset.ai" }, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "Intended Audience :: Science/Research", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | 
"Programming Language :: Python :: Implementation :: CPython", 26 | ] 27 | dependencies = [ 28 | #"streamlit >= 1.9.0, < 2", 29 | "streamlit == 1.11.1", 30 | "st-annotated-text >= 2.0.0, < 3", 31 | "markdown >= 3.3.4, < 4" 32 | ] 33 | dynamic = ["version"] 34 | 35 | [project.urls] 36 | Documentation = "https://github.com/deepset-ai/haystack/tree/main/ui#readme" 37 | Issues = "https://github.com/deepset-ai/haystack/issues" 38 | Source = "https://github.com/deepset-ai/haystack/tree/main/ui" 39 | 40 | [tool.hatch.version] 41 | path = "ui/__about__.py" 42 | 43 | [tool.hatch.build.targets.sdist] 44 | [tool.hatch.build.targets.wheel] 45 | 46 | [tool.hatch.envs.default] 47 | dependencies = [ 48 | "pytest", 49 | "pytest-cov", 50 | ] 51 | [tool.hatch.envs.default.scripts] 52 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=ui --cov=tests" 53 | no-cov = "cov --no-cov" 54 | 55 | [[tool.hatch.envs.test.matrix]] 56 | python = ["37", "38", "39", "310"] 57 | 58 | [tool.coverage.run] 59 | branch = true 60 | parallel = true 61 | omit = [ 62 | "ui/__about__.py", 63 | ] 64 | 65 | [tool.coverage.report] 66 | exclude_lines = [ 67 | "no cov", 68 | "if __name__ == .__main__.:", 69 | "if TYPE_CHECKING:", 70 | ] 71 | 72 | [tool.black] 73 | line-length = 120 74 | skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. 
75 | -------------------------------------------------------------------------------- /ui/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/ui/test/__init__.py -------------------------------------------------------------------------------- /ui/test/test_ui_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from ui.utils import haystack_is_ready 4 | 5 | 6 | def test_haystack_is_ready(): 7 | with patch("requests.get") as mocked_get: 8 | mocked_get.return_value.status_code = 200 9 | assert haystack_is_ready() 10 | 11 | 12 | def test_haystack_is_ready_fail(): 13 | with patch("requests.get") as mocked_get: 14 | mocked_get.return_value.status_code = 400 15 | assert not haystack_is_ready() 16 | -------------------------------------------------------------------------------- /ui/ui/__about__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pathlib import Path 4 | 5 | 6 | __version__ = "0.0.0" 7 | try: 8 | __version__ = open(Path(__file__).parent.parent / "VERSION.txt", "r").read() 9 | except Exception as e: 10 | logging.exception("No VERSION.txt found!") 11 | -------------------------------------------------------------------------------- /ui/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/ui/ui/__init__.py -------------------------------------------------------------------------------- /ui/ui/eval_labels_example.csv: -------------------------------------------------------------------------------- 1 | "Question Text";"Answer" 2 | "What is the capital of France?";"Paris" 3 | "What's the tallest mountain in 
Africa?";"Mount Kilimanjaro" 4 | "What's the climate of Beijing?";"monsoon-influenced humid continental" 5 | "What's the longest river of Europe?";"The Volga" 6 | "What's the deepest lake in the world?";"Lake Bajkal" 7 | "How many people live in the capital of the US?";"689,545" 8 | "Which Chinese city is the largest?";"Shanghai" 9 | "What's the type of government of the UK?";"unitary parliamentary democracy and constitutional monarchy" 10 | "What currency is used in Hungary?";"Hungarian forint" 11 | "In which city is the Louvre?";"Paris" 12 | "Who is the current king of Spain?";"Felipe VI" 13 | "Which countries border with Mongolia?";"Russia and China" 14 | "What's the current name of Swaziland?";"Eswatini" --------------------------------------------------------------------------------