├── .gitattributes ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── VERSION.txt ├── applications ├── indexing │ ├── build_image.sh │ ├── create_plaid.py │ ├── indexing_pipeline.py │ ├── launch_indexing_workflow.py │ ├── marco_indexing │ │ ├── colbert_indexing_pipeline.yml │ │ ├── emr_indexing_pipeline.yml │ │ ├── faiss_indexing_pipeline.yml │ │ ├── marco_dataset.py │ │ └── prepare_env.sh │ ├── marco_indexing_workflow.yml │ ├── ray_indexing_pipeline.py │ ├── run-ray-cluster.sh │ ├── stackoverflow_indexing │ │ ├── colbert_indexing_pipeline.yml │ │ ├── emr_indexing_pipeline.yml │ │ ├── faiss_indexing_pipeline.yml │ │ ├── prepare_env.sh │ │ └── stackoverflow_dataset.py │ └── stackoverflow_indexing_workflow.yml └── odqa_pipelines │ ├── config │ ├── env.marco.esds_bm25r_colbert │ ├── env.marco.esds_emr_faq │ ├── env.marco.faiss_dpr │ ├── env.stackoverflow.esds_bm25r_colbert │ ├── env.stackoverflow.esds_emr_faq │ ├── env.stackoverflow.faiss_dpr │ └── env.wiki.plaid_colbertv2 │ ├── docker-compose │ ├── docker-compose-dpr.yml │ ├── docker-compose-gpu-dpr.yml │ ├── docker-compose-gpu.yml │ ├── docker-compose-nginx.yml │ ├── docker-compose-plaid.yml │ └── docker-compose.yml │ ├── launch_pipeline.sh │ └── ui_config │ ├── marco │ └── config.yml │ └── stackoverflow │ └── config.yml ├── conftest.py ├── doc └── workflow_stackoverflow.md ├── docker ├── Dockerfile └── Dockerfile-GPU ├── haystack ├── __init__.py ├── config.py ├── document_stores │ ├── __init__.py │ ├── base.py │ ├── deepsetcloud.py │ ├── elasticsearch.py │ ├── es_converter.py │ ├── faiss.py │ ├── filter_utils.py │ ├── graphdb.py │ ├── memory.py │ ├── memory_knowledgegraph.py │ ├── milvus.py │ ├── opensearch.py │ ├── pinecone.py │ ├── plaid.py │ ├── search_engine.py │ ├── sql.py │ ├── utils.py │ └── weaviate.py ├── environment.py ├── errors.py ├── modeling │ ├── __init__.py │ ├── data_handler │ │ ├── __init__.py │ │ ├── data_silo.py │ │ ├── 
dataloader.py │ │ ├── dataset.py │ │ ├── input_features.py │ │ ├── inputs.py │ │ ├── processor.py │ │ └── samples.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── eval.py │ │ ├── metrics.py │ │ └── squad.py │ ├── infer.py │ ├── model │ │ ├── __init__.py │ │ ├── adaptive_model.py │ │ ├── biadaptive_model.py │ │ ├── feature_extraction.py │ │ ├── language_model.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── sentence_transformers.py │ │ ├── optimization.py │ │ ├── prediction_head.py │ │ ├── predictions.py │ │ └── triadaptive_model.py │ ├── training │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dpr.py │ │ └── question_answering.py │ ├── utils.py │ └── visual.py ├── nodes │ ├── __init__.py │ ├── _json_schema.py │ ├── answer_generator │ │ ├── __init__.py │ │ ├── base.py │ │ ├── openai.py │ │ └── transformers.py │ ├── audio │ │ ├── __init__.py │ │ ├── _text_to_speech.py │ │ ├── answer_to_speech.py │ │ └── document_to_speech.py │ ├── base.py │ ├── connector │ │ ├── __init__.py │ │ └── crawler.py │ ├── document_classifier │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py │ ├── evaluator │ │ ├── __init__.py │ │ └── evaluator.py │ ├── extractor │ │ ├── __init__.py │ │ └── entity.py │ ├── file_classifier │ │ ├── __init__.py │ │ └── file_type.py │ ├── file_converter │ │ ├── __init__.py │ │ ├── azure.py │ │ ├── base.py │ │ ├── docx.py │ │ ├── image.py │ │ ├── markdown.py │ │ ├── parsr.py │ │ ├── pdf.py │ │ ├── tika.py │ │ └── txt.py │ ├── label_generator │ │ ├── __init__.py │ │ └── pseudo_label_generator.py │ ├── other │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── docs2answers.py │ │ ├── document_merger.py │ │ ├── join.py │ │ ├── join_answers.py │ │ ├── join_docs.py │ │ └── route_documents.py │ ├── preprocessor │ │ ├── __init__.py │ │ ├── base.py │ │ └── preprocessor.py │ ├── prompt │ │ ├── __init__.py │ │ └── prompt_node.py │ ├── query_classifier │ │ ├── __init__.py │ │ ├── base.py │ │ ├── sklearn.py │ │ └── transformers.py │ ├── 
question_generator │ │ ├── __init__.py │ │ └── question_generator.py │ ├── ranker │ │ ├── __init__.py │ │ ├── base.py │ │ ├── colbert_modeling.py │ │ ├── sentence_transformers.py │ │ └── st_modeling.py │ ├── reader │ │ ├── __init__.py │ │ ├── base.py │ │ ├── farm.py │ │ ├── table.py │ │ └── transformers.py │ ├── retriever │ │ ├── __init__.py │ │ ├── _embedding_encoder.py │ │ ├── _losses.py │ │ ├── base.py │ │ ├── dense.py │ │ ├── multimodal │ │ │ ├── __init__.py │ │ │ ├── embedder.py │ │ │ └── retriever.py │ │ ├── sparse.py │ │ └── text2sparql.py │ ├── summarizer │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py │ └── translator │ │ ├── __init__.py │ │ ├── base.py │ │ └── transformers.py ├── pipelines │ ├── __init__.py │ ├── base.py │ ├── config.py │ ├── ray.py │ ├── standard_pipelines.py │ └── utils.py ├── schema.py ├── telemetry.py └── utils │ ├── __init__.py │ ├── augment_squad.py │ ├── cleaning.py │ ├── context_matching.py │ ├── deepsetcloud.py │ ├── doc_store.py │ ├── docker.py │ ├── early_stopping.py │ ├── experiment_tracking.py │ ├── export_utils.py │ ├── import_utils.py │ ├── labels.py │ ├── preprocessing.py │ ├── reflection.py │ ├── squad_data.py │ ├── squad_to_dpr.py │ └── torch_utils.py ├── images ├── odqa_workflow.png ├── pipeline1.PNG ├── pipeline2.PNG ├── pipeline3.PNG └── ui.png ├── nginx ├── Dockerfile └── nginx.conf ├── prepare_env.sh ├── pyproject.toml ├── rest_api ├── LICENSE ├── README.md ├── pyproject.toml ├── rest_api │ ├── __about__.py │ ├── __init__.py │ ├── application.py │ ├── config.py │ ├── controller │ │ ├── __init__.py │ │ ├── document.py │ │ ├── errors │ │ │ ├── __init__.py │ │ │ └── http_error.py │ │ ├── feedback.py │ │ ├── file_upload.py │ │ ├── health.py │ │ ├── search.py │ │ └── utils.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── custom_component.py │ │ ├── pipeline_empty.haystack-pipeline.yml │ │ ├── pipeline_plaid_colbertv2.yml │ │ ├── pipelines.colbertRanker.haystack-pipeline.yml │ │ ├── 
pipelines.haystack-EmbeddingRetriever-pipeline.yml │ │ ├── pipelines.haystack-pipeline.yml │ │ └── pipelines_dpr.haystack-pipeline.yml │ ├── schema.py │ └── utils.py └── test │ ├── __init__.py │ ├── samples │ ├── pdf │ │ ├── sample_pdf_1.pdf │ │ └── sample_pdf_2.pdf │ └── test.haystack-pipeline.yml │ └── test_rest_api.py ├── test ├── __init__.py ├── benchmarks │ ├── README.md │ ├── config.json │ ├── data_scripts │ │ ├── embeddings_slice.py │ │ └── shuffle_passages.py │ ├── distillation_config.json │ ├── model_distillation.py │ ├── nq_to_squad.py │ ├── reader.py │ ├── reader_results.csv │ ├── results_to_json.py │ ├── retriever.py │ ├── retriever_index_results.csv │ ├── retriever_query_results.csv │ ├── retriever_query_results.md │ ├── retriever_simplified.py │ ├── run.py │ ├── templates.py │ └── utils.py ├── conftest.py ├── document_stores │ ├── __init__.py │ ├── test_document_store.py │ ├── test_faiss_and_milvus.py │ ├── test_knowledge_graph.py │ └── test_weaviate.py ├── modeling │ ├── __init__.py │ ├── test_distillation.py │ ├── test_modeling_dpr.py │ ├── test_modeling_inference.py │ ├── test_modeling_prediction_head.py │ ├── test_modeling_processor.py │ ├── test_modeling_processor_saving_loading.py │ ├── test_modeling_question_answering.py │ └── test_tokenization.py ├── nodes │ ├── __init__.py │ ├── test_connector.py │ ├── test_document_classifier.py │ ├── test_extractor.py │ ├── test_file_converter.py │ ├── test_filetype_classifier.py │ ├── test_generator.py │ ├── test_label_generator.py │ ├── test_preprocessor.py │ ├── test_question_generator.py │ ├── test_ranker.py │ ├── test_reader.py │ ├── test_retriever.py │ ├── test_summarizer.py │ ├── test_summarizer_translation.py │ ├── test_table_reader.py │ └── test_translator.py ├── others │ ├── __init__.py │ ├── test_schema.py │ ├── test_telemetry.py │ └── test_utils.py ├── pipelines │ ├── __init__.py │ ├── test_eval.py │ ├── test_pipeline.py │ ├── test_pipeline_debug_and_validation.py │ ├── 
test_pipeline_extractive_qa.py │ ├── test_pipeline_yaml.py │ ├── test_ray.py │ └── test_standard_pipelines.py └── samples │ ├── dc │ ├── documents-stream.response │ ├── matching_test_1.csv │ ├── pipeline_config.json │ └── query_winterfell.response │ ├── docs │ ├── doc_1.txt │ └── doc_2.txt │ ├── docx │ └── sample_docx.docx │ ├── dpr │ └── sample.json │ ├── extensionless_files │ ├── docx_file │ ├── gif_file │ ├── html_file │ ├── jpg_file │ ├── mp3_file │ ├── odt_file │ ├── pdf_file │ ├── png_file │ ├── pptx_file │ ├── txt_file │ ├── wav_file │ └── zip_file │ ├── glove │ └── tiny.txt │ ├── markdown │ └── sample.md │ ├── mmr │ └── sample.json │ ├── pdf │ ├── sample_pdf_1.pdf │ └── sample_pdf_2.pdf │ ├── pipeline │ ├── ray.haystack-pipeline.yml │ └── test.haystack-pipeline.yml │ ├── qa │ ├── answer-offset-wrong.json │ ├── answer-wrong.json │ ├── dev-sample.json │ ├── eval-sample.json │ ├── noanswer.json │ ├── train-sample.json │ └── vanilla.json │ └── squad │ ├── small.json │ ├── tiny.json │ ├── tiny_augmented.json │ └── tiny_passages.json ├── third-party-programs.txt └── ui ├── Dockerfile ├── LICENSE ├── README.md ├── pyproject.toml ├── test ├── __init__.py └── test_ui_utils.py └── ui ├── __about__.py ├── __init__.py ├── eval_labels_example.csv ├── utils.py └── webapp.py /.gitattributes: -------------------------------------------------------------------------------- 1 | examples/odqa_pipelines/faiss_data/marco filter=lfs diff=lfs merge=lfs -text 2 | examples/odqa_pipelines/faiss_data/stackoverflow filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local run files 2 | qa.db 3 | **/qa.db 4 | **/*qa*.db 5 | **/test-reports 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 
| build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 
98 | #Pipfile.lock 99 | 100 | # pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # PyCharm 138 | .idea 139 | 140 | # VSCode 141 | .vscode 142 | 143 | # haystack files 144 | haystack/document_store/qa.db 145 | data 146 | **/mlruns/** 147 | src 148 | tutorials/cache 149 | tutorials/mlruns 150 | tutorials/model 151 | models 152 | saved_models 153 | *_build 154 | rest_api/file-upload/* 155 | **/feedback_squad_direct.json 156 | .DS_Store 157 | 158 | # http cache (requests-cache) 159 | **/http_cache.sqlite 160 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/sentence-transformers"] 2 | path = third_party/sentence-transformers 3 | url = https://github.com/UKPLab/sentence-transformers.git 4 | [submodule "third_party/ColBERT"] 5 | path = third_party/ColBERT 6 | url = https://github.com/kaixuanliu/ColBERT.git 7 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. 
Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 
55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Report a Vulnerability 4 | 5 | Please report security issues or vulnerabilities to the [Intel® Security Center]. 6 | 7 | For more information on how Intel® works to resolve security issues, see 8 | [Vulnerability Handling Guidelines]. 9 | 10 | [Intel® Security Center]:https://www.intel.com/content/www/us/en/security-center/default.html 11 | 12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html -------------------------------------------------------------------------------- /VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.12.2 2 | -------------------------------------------------------------------------------- /applications/indexing/build_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # You can remove build-arg http_proxy and https_proxy if your network doesn't need it 5 | 6 | DOCKER_BUILDKIT=0 docker build \ 7 | -f ../../docker/Dockerfile ../../ \ 8 | -t intel/ai-workflows:odqa-haystack-api \ 9 | --network=host \ 10 | --build-arg http_proxy=${http_proxy} \ 11 | --build-arg https_proxy=${https_proxy} \ 12 | --build-arg no_proxy=${no_proxy} 13 | -------------------------------------------------------------------------------- /applications/indexing/create_plaid.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from haystack.document_stores.plaid import PLAIDDocumentStore 6 | 7 | logger = 
logging.getLogger(__name__) 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser("Create an index using PLAID engine as a backend") 11 | parser.add_argument("--checkpoint", type=Path, required=True) 12 | parser.add_argument("--collection", type=Path, required=True) 13 | parser.add_argument("--index-save-path", type=Path, required=True) 14 | parser.add_argument("--gpus", type=int, default=0) 15 | parser.add_argument("--ranks", type=int, default=1) 16 | parser.add_argument("--doc-max-length", type=int, default=120) 17 | parser.add_argument("--query-max-length", type=int, default=60) 18 | parser.add_argument("--kmeans-iterations", type=int, default=4) 19 | parser.add_argument("--name", type=str, default="plaid_index") 20 | parser.add_argument("--nbits", type=int, default=2) 21 | 22 | args = parser.parse_args() 23 | 24 | if args.gpus > 1: 25 | args.ranks = args.gpus 26 | args.amp = True 27 | assert args.ranks > 0 28 | if args.gpus == 0: 29 | assert args.ranks > 0 30 | 31 | store = PLAIDDocumentStore( 32 | index_path=f"{args.index_save_path}", 33 | checkpoint_path=f"{args.checkpoint}", 34 | collection_path=f"{args.collection}", 35 | create=True, 36 | nbits=args.nbits, 37 | gpus=args.gpus, 38 | ranks=args.ranks, 39 | doc_maxlen=args.doc_max_length, 40 | query_maxlen=args.query_max_length, 41 | kmeans_niters=args.kmeans_iterations, 42 | ) 43 | logger.info("Done.") 44 | -------------------------------------------------------------------------------- /applications/indexing/indexing_pipeline.py: -------------------------------------------------------------------------------- 1 | from ray_indexing_pipeline import RayIndexingPipeline 2 | import argparse, time, os 3 | 4 | def run_indexing_pipeline(cfg): 5 | if cfg.enable_sample == 1: 6 | os.environ["ENABLE_SAMPLING_LIMIT"] = "1" 7 | else: 8 | os.environ["ENABLE_SAMPLING_LIMIT"] = "0" 9 | 10 | start = time.time() 11 | pipeline = RayIndexingPipeline.load_from_yaml(path=cfg.pipeline_yaml) 12 | pipeline.run() 13 
| cost = time.time() - start 14 | print(f'Spent {cost}s for pipeline: {cfg.pipeline_yaml}') 15 | 16 | 17 | def parse_cmd(): 18 | desc = 'generate documentstore for marco dataset...\n\n' 19 | args = argparse.ArgumentParser(description=desc, epilog=' ', formatter_class=argparse.RawTextHelpFormatter) 20 | args.add_argument('-p', type=str, default='faiss_indexing_pipeline.yml', dest='pipeline_yaml', help='pipeline config file') 21 | args.add_argument('-s', type=int, default=0, dest='enable_sample', help='Only retrieve 500 samples for indexing') 22 | return args.parse_args() 23 | 24 | 25 | if __name__ == "__main__": 26 | config = parse_cmd() 27 | run_indexing_pipeline(config) 28 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/colbert_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | custom_mapping: 10 | mappings: 11 | properties: 12 | content: 13 | type: text 14 | question_id: 15 | type: integer 16 | question-body: 17 | type: text 18 | index: false 19 | answer: 20 | type: text 21 | index: false 22 | colbert_emb: 23 | type: binary 24 | index: false 25 | 26 | - name: Ranker 27 | type: ColBERTRanker 28 | actor: True 29 | params: 30 | model_path: /home/user/data/colbertv2.0 31 | 32 | - name: Dataset 33 | type: MarcoDataset 34 | path: /home/user/workspace/marco_dataset.py 35 | actor: False 36 | params: 37 | file: /home/user/dataset/train_v2.1.json 38 | batch_size: 200000 39 | 40 | 41 | pipelines: 42 | - name: indexing 43 | nodes: 44 | - name: Dataset 45 | inputs: [File] 46 | - name: Ranker 47 | inputs: [Dataset] 48 | serve_deployment_kwargs: 49 | num_replicas: 128 # number of replicas to create on the Ray cluster 50 | 
batch_size: 256 51 | num_cpus: 2 52 | - name: DocumentStore 53 | inputs: [Ranker] 54 | serve_deployment_kwargs: 55 | num_replicas: 10 # number of replicas to create on the Ray cluster 56 | batch_size: 2000 57 | num_cpus: 8 58 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/emr_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | index: document 10 | embedding_field: question_emb 11 | embedding_dim: 768 12 | excluded_meta_data: ["question_emb"] 13 | 14 | - name: Retriever 15 | type: EmbeddingRetriever 16 | actor: True 17 | params: 18 | document_store: DocumentStore # params can reference other components defined in the YAML 19 | embedding_model: deepset/sentence_bert 20 | batch_size: 256 21 | 22 | - name: Dataset 23 | type: MarcoDataset 24 | path: /home/user/workspace/marco_dataset.py 25 | actor: False 26 | params: 27 | file: /home/user/dataset/train_v2.1.json 28 | batch_size: 200000 29 | 30 | 31 | pipelines: 32 | - name: indexing 33 | nodes: 34 | - name: Dataset 35 | inputs: [File] 36 | - name: Retriever 37 | inputs: [Dataset] 38 | serve_deployment_kwargs: 39 | num_replicas: 128 # number of replicas to create on the Ray cluster 40 | batch_size: 16 41 | num_cpus: 2 42 | - name: DocumentStore 43 | inputs: [Retriever] 44 | serve_deployment_kwargs: 45 | num_replicas: 10 # number of replicas to create on the Ray cluster 46 | batch_size: 2000 47 | num_cpus: 8 48 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/faiss_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: 
ray 3 | components: # define all the building-blocks for Pipeline 4 | 5 | - name: DocumentStore 6 | type: FAISSDocumentStore 7 | faiss_index_path: /home/user/data/faiss-index-so.faiss 8 | actor: False 9 | params: 10 | sql_url: postgresql://postgres:postgres@$host_ip/haystack # postgresql url, please set host_ip to host IP of head node 11 | faiss_index_factory_str: HNSW 12 | 13 | - name: Retriever 14 | type: DensePassageRetriever 15 | actor: True 16 | params: 17 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 18 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 19 | max_seq_len_query: 64 20 | max_seq_len_passage: 256 21 | batch_size: 16 22 | embed_title: True 23 | use_fast_tokenizers: True 24 | 25 | - name: Dataset 26 | type: MarcoDataset 27 | path: /home/user/workspace/marco_dataset.py 28 | actor: False 29 | params: 30 | file: /home/user/dataset/train_v2.1.json 31 | batch_size: 200000 32 | 33 | 34 | pipelines: 35 | - name: indexing 36 | nodes: 37 | - name: Dataset 38 | inputs: [File] 39 | - name: Retriever 40 | inputs: [Dataset] 41 | serve_deployment_kwargs: 42 | num_replicas: 128 # number of replicas to create on the Ray cluster 43 | batch_size: 256 44 | num_cpus: 2 45 | - name: DocumentStore 46 | inputs: [Retriever] 47 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/marco_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Optional 2 | from haystack.schema import Document 3 | from haystack.nodes.other import Dataset 4 | import time, ray 5 | import pandas as pd 6 | import modin.pandas as modin_pd 7 | import os 8 | import json 9 | os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1" 10 | 11 | def _generate_documents(batch: pd.DataFrame) -> List[Document]: 12 | documents = [] 13 | for _, data in batch.iterrows(): 14 | if isinstance(data['answers'], list) == False: 15 | continue 16 
| 17 | data['answers'] = data['answers'][0] 18 | if len(str(data['wellFormedAnswers'])) > 2: 19 | if isinstance(data['wellFormedAnswers'], list) : 20 | data['answers'] = data['wellFormedAnswers'][0] 21 | 22 | elif "No Answer Present." in data['answers']: 23 | data['answers'] = data['passages'] 24 | 25 | if len(str(data['answers'])) == 0: 26 | print("no answers, drop the document!") 27 | continue 28 | 29 | doc = {'content': str(data['query']), 'meta': {'answer': str(data['answers']), 'question_id': str(data['query_id']), 'question_type': str(data['query_type'])}} 30 | documents.append(Document.from_dict(doc)) 31 | return documents 32 | 33 | 34 | 35 | class MarcoDataset(Dataset): 36 | """ 37 | This Node is used to convert MS Marco dataset into ray.data.Dataset of Haystack Document format. 38 | """ 39 | 40 | outgoing_edges = 1 41 | 42 | def __init__(self, 43 | file: str, 44 | batch_size: Optional[int] = 4096, 45 | ) : 46 | 47 | super().__init__(batch_size=batch_size) 48 | self.file = file 49 | 50 | def convert(self) -> ray.data.Dataset: 51 | dataset = modin_pd.read_json(self.file) 52 | dataset = ray.data.from_modin(dataset) 53 | start = time.time() 54 | dataset = dataset.map_batches(_generate_documents) 55 | cost = time.time() - start 56 | return dataset 57 | -------------------------------------------------------------------------------- /applications/indexing/marco_indexing/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip install modin[all] -------------------------------------------------------------------------------- /applications/indexing/marco_indexing_workflow.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - node: $host_ip #IP address of head node, Head node is launched in local machine 3 | type: head # value:[head, worker] Must exist. 
4 | cores: 0-64 #total cpu cores used 5 | image: intel/ai-workflows:odqa-haystack-api 6 | dataset_dir: $dataset_dir #mount to /home/user/dataset of container. It should include the dataset files. 7 | customer_dir: $customer_dir #mount to /home/user/data of container 8 | workspace_dir: $(pwd)/marco_indexing #mount to /home/user/workspace of container. $workspace_dir should be ./marco_indexing for marco indexing. 9 | 10 | - node: $host_ip #IP address of worker node 11 | type: worker 12 | image: intel/ai-workflows:odqa-haystack-api 13 | cores: 0-71 14 | user: $user #configure the user of worker node for remote access 15 | password: $password #configure the password of worker node for remote access and sudo executing 16 | dataset_dir: $dataset_dir #If you use the nfs share storage, it should be same with path of head node. 17 | customer_dir: $customer_dir #If you use the nfs share storage, it should be same with path of head node. 18 | workspace_dir: $workspace_dir #If you use the nfs share storage, it should be same with path of head node. 19 | 20 | 21 | pipelines: 22 | - name: colbert_indexing_pipeline.yml #The name should be same with file name of pipeline file which is included in $workspace_dir 23 | database: # The database containers will be launched in head node. 24 | type: elasticsearch # value:[elasticsearch, postgres]. Must be consistent with the database type of DocumentStore configured in the pipeline file 25 | image: elasticsearch:7.9.2 #For elasticsearch, the 7.9.2 is prefered. 
To change the version, you need to ensure that you use the same version as the query pipeline 26 | cores: 65-71 27 | data_dir: $data_dir_1 #The data directory of database which mountes to /usr/share/elasticsearch/data of elasticsearch container, or /var/lib/postgresql/data of postgresql container 28 | - name: emr_indexing_pipeline.yml 29 | database: 30 | type: elasticsearch 31 | image: elasticsearch:7.9.2 32 | cores: 65-71 33 | data_dir: $data_dir_2 34 | - name: faiss_indexing_pipeline.yml 35 | database: # The database containers will run in head node. 36 | type: postgres 37 | image: postgres:14.1-alpine 38 | cores: 65-71 39 | data_dir: $data_dir_3 40 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/colbert_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | custom_mapping: 10 | mappings: 11 | properties: 12 | content: 13 | type: text 14 | question_id: 15 | type: integer 16 | question-body: 17 | type: text 18 | index: false 19 | answer: 20 | type: text 21 | index: false 22 | colbert_emb: 23 | type: binary 24 | index: false 25 | 26 | - name: Ranker 27 | type: ColBERTRanker 28 | actor: True 29 | params: 30 | model_path: /home/user/data/colbertv2.0 31 | 32 | - name: Dataset 33 | type: StackoverflowDataset 34 | path: /home/user/workspace/stackoverflow_dataset.py 35 | actor: False 36 | params: 37 | question_file: /home/user/dataset/Questions.csv 38 | answer_file: /home/user/dataset/Answers.csv 39 | batch_size: 200000 40 | 41 | 42 | pipelines: 43 | - name: indexing 44 | nodes: 45 | - name: Dataset 46 | inputs: [File] 47 | - name: Ranker 48 | inputs: [Dataset] 49 | serve_deployment_kwargs: 50 | num_replicas: 80 # 
number of replicas to create on the Ray cluster 51 | batch_size: 256 52 | num_cpus: 2 53 | - name: DocumentStore 54 | inputs: [Ranker] 55 | serve_deployment_kwargs: 56 | num_replicas: 10 # number of replicas to create on the Ray cluster 57 | batch_size: 2000 58 | num_cpus: 8 59 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/emr_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | - name: DocumentStore 5 | type: ElasticsearchDocumentStore 6 | actor: True 7 | params: 8 | host: $host_ip #host IP of head node 9 | index: document 10 | embedding_field: question_emb 11 | embedding_dim: 768 12 | excluded_meta_data: ["question_emb"] 13 | 14 | - name: Retriever 15 | type: EmbeddingRetriever 16 | actor: True 17 | params: 18 | document_store: DocumentStore # params can reference other components defined in the YAML 19 | embedding_model: deepset/sentence_bert 20 | batch_size: 256 21 | 22 | - name: Dataset 23 | type: StackoverflowDataset 24 | path: /home/user/workspace/stackoverflow_dataset.py 25 | actor: False 26 | params: 27 | question_file: /home/user/dataset/Questions.csv 28 | answer_file: /home/user/dataset/Answers.csv 29 | batch_size: 200000 30 | 31 | 32 | pipelines: 33 | - name: indexing 34 | nodes: 35 | - name: Dataset 36 | inputs: [File] 37 | - name: Retriever 38 | inputs: [Dataset] 39 | serve_deployment_kwargs: 40 | num_replicas: 80 # number of replicas to create on the Ray cluster 41 | batch_size: 256 42 | num_cpus: 2 43 | - name: DocumentStore 44 | inputs: [Retriever] 45 | serve_deployment_kwargs: 46 | num_replicas: 10 # number of replicas to create on the Ray cluster 47 | batch_size: 2000 48 | num_cpus: 8 49 | -------------------------------------------------------------------------------- 
/applications/indexing/stackoverflow_indexing/faiss_indexing_pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 1.12.2 2 | extras: ray 3 | components: # define all the building-blocks for Pipeline 4 | 5 | - name: DocumentStore 6 | type: FAISSDocumentStore 7 | faiss_index_path: /home/user/data/faiss-index-so.faiss 8 | actor: False 9 | params: 10 | sql_url: postgresql://postgres:postgres@$host_ip/haystack # postgresql url, please set host_ip to host IP of head node 11 | faiss_index_factory_str: HNSW 12 | 13 | - name: Retriever 14 | type: DensePassageRetriever 15 | actor: True 16 | params: 17 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 18 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 19 | max_seq_len_query: 64 20 | max_seq_len_passage: 256 21 | batch_size: 16 22 | embed_title: True 23 | use_fast_tokenizers: True 24 | 25 | - name: Dataset 26 | type: StackoverflowDataset 27 | path: /home/user/workspace/stackoverflow_dataset.py 28 | actor: False 29 | params: 30 | question_file: /home/user/dataset/Questions.csv 31 | answer_file: /home/user/dataset/Answers.csv 32 | batch_size: 200000 33 | 34 | 35 | pipelines: 36 | - name: indexing 37 | nodes: 38 | - name: Dataset 39 | inputs: [File] 40 | - name: Retriever 41 | inputs: [Dataset] 42 | serve_deployment_kwargs: 43 | num_replicas: 140 # number of replicas to create on the Ray cluster 44 | batch_size: 256 45 | num_cpus: 2 46 | - name: DocumentStore 47 | inputs: [Retriever] 48 | -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip install modin[all]==0.18.0 -------------------------------------------------------------------------------- /applications/indexing/stackoverflow_indexing_workflow.yml: 
-------------------------------------------------------------------------------- 1 | nodes: 2 | - node: $host_ip 3 | type: head 4 | cores: 0-64 5 | image: intel/ai-workflows:odqa-haystack-api 6 | dataset_dir: $dataset_dir 7 | customer_dir: $customer_dir 8 | workspace_dir: $(pwd)/stackoverflow_indexing 9 | 10 | - node: $host_ip 11 | type: worker 12 | image: intel/ai-workflows:odqa-haystack-api 13 | cores: 0-71 14 | user: $user #configure the user of worker node for remote access 15 | password: $password #configure the password of worker node for remote access and sudo executing 16 | dataset_dir: $dataset_dir 17 | customer_dir: $customer_dir 18 | workspace_dir: $workspace_dir 19 | 20 | pipelines: 21 | - name: colbert_indexing_pipeline.yml 22 | database: 23 | type: elasticsearch 24 | image: elasticsearch:7.9.2 25 | cores: 65-71 26 | data_dir: $data_dir_1 27 | - name: emr_indexing_pipeline.yml 28 | database: 29 | type: elasticsearch 30 | image: elasticsearch:7.9.2 31 | cores: 65-71 32 | data_dir: $data_dir_2 33 | - name: faiss_indexing_pipeline.yml 34 | database: # The database containers will run in head node. 
35 | type: postgres 36 | image: postgres:14.1-alpine 37 | cores: 65-71 38 | data_dir: $data_dir_3 39 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.esds_bm25r_colbert: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | COLBERT_OPT=True 5 | ENABLE_IPEX=False 6 | IPEX_BF16=False 7 | CUSTOMER_DIR=$customer_dir 8 | DATA_DIR=$data_dir 9 | UI_CONFIG_DIR=../ui_config/marco 10 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.esds_emr_faq: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | ENABLE_IPEX=False 5 | IPEX_BF16=False 6 | DATA_DIR=$data_dir 7 | UI_CONFIG_DIR=../ui_config/marco 8 | 9 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.marco.faiss_dpr: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 3 | POSTGRES_HOST_AUTH_METHOD=trust 4 | CUSTOMER_DIR=$customer_dir 5 | DATA_DIR=$data_dir 6 | UI_CONFIG_DIR=../ui_config/marco 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.esds_bm25r_colbert: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 
4 | ENABLE_IPEX=False 5 | IPEX_BF16=False 6 | COLBERT_OPT=True 7 | CUSTOMER_DIR=$customer_dir 8 | DATA_DIR=$data_dir 9 | UI_CONFIG_DIR=../ui_config/stackoverflow 10 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.esds_emr_faq: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME=query 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml 3 | ELASTICSEARCH_IMG=elasticsearch:7.9.2 4 | #CUSTOMER_DIR=/tmp/data 5 | ENABLE_IPEX=False 6 | IPEX_BF16=False 7 | DATA_DIR=$data_dir 8 | UI_CONFIG_DIR=../ui_config/stackoverflow 9 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.stackoverflow.faiss_dpr: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 3 | POSTGRES_HOST_AUTH_METHOD=trust 4 | CUSTOMER_DIR=$customer_dir 5 | DATA_DIR=$data_dir 6 | UI_CONFIG_DIR=../ui_config/stackoverflow 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/config/env.wiki.plaid_colbertv2: -------------------------------------------------------------------------------- 1 | PIPELINE_NAME='query' 2 | MODE=1 3 | DATASET='wiki-dpr' 4 | PIPELINE_PATH=./rest_api/rest_api/pipeline/pipeline_plaid_colbertv2.yml 5 | CHECKPOINT_PATH=/localdisk/kaixuan/colbertv2.0/ 6 | CUSTOMER_DIR=/localdisk/kaixuan/plaid/ 7 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-dpr.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - 
http_proxy=$http_proxy 9 | - https_proxy=$https_proxy 10 | - no_proxy=$no_proxy 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | network_mode: host 13 | # Mount custom Pipeline YAML and custom Components. 14 | volumes: 15 | - $CUSTOMER_DIR:/home/user/data 16 | ports: 17 | - 8000:8000 18 | restart: on-failure 19 | environment: 20 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 21 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 22 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 23 | - INDEX_NAME=$INDEX_NAME 24 | - CONCURRENT_REQUEST_PER_WORKER 25 | - http_proxy=$http_proxy 26 | - https_proxy=$https_proxy 27 | - no_proxy=$no_proxy 28 | #- ONEDNN_VERBOSE=1 29 | - KMP_BLOCKTIME=20 30 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 31 | - LD_PRELOAD=/usr/local/lib/libiomp5.so 32 | - KMP_AFFINITY=granularity=fine,verbose,compact,1,0 33 | #- MKL_VERBOSE=1 34 | depends_on: 35 | - postsql-db 36 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 37 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 38 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 39 | postsql-db: 40 | image: "postgres:14.1-alpine" 41 | ports: 42 | - 5432:5432 43 | restart: on-failure 44 | environment: 45 | - POSTGRES_HOST_AUTH_METHOD=$POSTGRES_HOST_AUTH_METHOD 46 | volumes: 47 | - $DATA_DIR:/var/lib/postgresql/data 48 | 49 | ui: 50 | build: 51 | context: ../../../ui 52 | dockerfile: Dockerfile 53 | args: 54 | - http_proxy=$http_proxy 55 | - https_proxy=$https_proxy 56 | image: "intel/ai-workflows:odqa-haystack-ui" 57 | network_mode: host 58 | ports: 59 | - 8501:8501 60 | restart: on-failure 61 | volumes: 62 | - $UI_CONFIG_DIR:/home/user/data/ 63 | environment: 64 | - API_ENDPOINT=http://localhost:8000 65 | - EVAL_FILE=ui/eval_labels_example.csv 66 | - 
PIPELINE_PATH=$PIPELINE_PATH 67 | # The value fot the following variables will be read from the host, if present. 68 | # They can also be temporarily set for docker-compose, for example: 69 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 70 | - DISABLE_FILE_UPLOAD=True 71 | - DEFAULT_QUESTION_AT_STARTUP 72 | - DEFAULT_DOCS_FROM_RETRIEVER 73 | - DEFAULT_NUMBER_OF_ANSWERS 74 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 75 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-gpu-dpr.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | haystack-api: 5 | build: 6 | context: ../../../docker 7 | dockerfile: Dockerfile-GPU 8 | args: 9 | - http_proxy=$HTTP_PROXY 10 | - https_proxy=$HTTPS_PROXY 11 | - no_proxy=$NO_PROXY 12 | image: "intel/ai-workflows:odqa-haystack-api-gpu" 13 | # in recent docker-compose version you can enable GPU resources. Make sure to fulfill the prerequisites listed here: https://docs.docker.com/compose/gpu-support/ 14 | deploy: 15 | resources: 16 | reservations: 17 | devices: 18 | - driver: nvidia 19 | #count: 1 20 | capabilities: [gpu] 21 | # # Mount custom Pipeline YAML and custom Components. 22 | # volumes: 23 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 24 | ports: 25 | - 8000:8000 26 | restart: on-failure 27 | 28 | volumes: 29 | - $CUSTOMER_DIR:/home/user/data 30 | 31 | environment: 32 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
33 | #- DOCUMENTSTORE_PARAMS_HOST=elasticsearch 34 | #- PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 35 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 36 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 37 | #- INDEX_NAME=$INDEX_NAME 38 | - CONCURRENT_REQUEST_PER_WORKER 39 | - http_proxy=$HTTP_PROXY 40 | - https_proxy=$HTTPS_PROXY 41 | - no_proxy=$NO_PROXY 42 | #- COLBERT_OPT=$COLBERT_OPT 43 | depends_on: 44 | - postsql-db 45 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600 --graceful-timeout 600'" 46 | 47 | postsql-db: 48 | image: "postgres:14.1-alpine" 49 | ports: 50 | - 5432:5432 51 | restart: on-failure 52 | environment: 53 | - POSTGRES_HOST_AUTH_METHOD=$POSTGRES_HOST_AUTH_METHOD 54 | volumes: 55 | - $DATA_DIR:/var/lib/postgresql/data 56 | # environment: 57 | # - discovery.type=single-node 58 | 59 | ui: 60 | build: 61 | context: ../../../ui 62 | dockerfile: Dockerfile 63 | args: 64 | - http_proxy=$HTTP_PROXY 65 | - https_proxy=$HTTPS_PROXY 66 | image: "intel/ai-workflows:odqa-haystack-ui" 67 | ports: 68 | - 8501:8501 69 | restart: on-failure 70 | volumes: 71 | - $UI_CONFIG_DIR:/home/user/data/ 72 | environment: 73 | - API_ENDPOINT=http://haystack-api:8000 74 | - EVAL_FILE=ui/eval_labels_example.csv 75 | - PIPELINE_PATH=$PIPELINE_PATH 76 | # The value fot the following variables will be read from the host, if present. 
77 | # They can also be temporarily set for docker-compose, for example: 78 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 79 | - DISABLE_FILE_UPLOAD=True 80 | - DEFAULT_QUESTION_AT_STARTUP 81 | - DEFAULT_DOCS_FROM_RETRIEVER 82 | - DEFAULT_NUMBER_OF_ANSWERS 83 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 84 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | haystack-api: 5 | build: 6 | context: ../../../docker 7 | dockerfile: Dockerfile-GPU 8 | args: 9 | - http_proxy=$HTTP_PROXY 10 | - https_proxy=$HTTPS_PROXY 11 | - no_proxy=$NO_PROXY 12 | image: "intel/ai-workflows:odqa-haystack-api-gpu" 13 | # in recent docker-compose version you can enable GPU resources. Make sure to fulfill the prerequisites listed here: https://docs.docker.com/compose/gpu-support/ 14 | deploy: 15 | resources: 16 | reservations: 17 | devices: 18 | - driver: nvidia 19 | #count: 1 20 | capabilities: [gpu] 21 | # # Mount custom Pipeline YAML and custom Components. 22 | # volumes: 23 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 24 | ports: 25 | - 8000:8000 26 | restart: on-failure 27 | 28 | #volumes: 29 | # - $CUSTOMER_DIR:/home/user/data 30 | 31 | environment: 32 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
33 | - DOCUMENTSTORE_PARAMS_HOST=elasticsearch 34 | #- PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml 35 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 36 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 37 | - INDEX_NAME=$INDEX_NAME 38 | - CONCURRENT_REQUEST_PER_WORKER 39 | - http_proxy=$HTTP_PROXY 40 | - https_proxy=$HTTPS_PROXY 41 | - no_proxy=$NO_PROXY 42 | - COLBERT_OPT=$COLBERT_OPT 43 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 44 | - IS_DICT_CHECKPOINT=$IS_DICT_CHECKPOINT 45 | depends_on: 46 | - elasticsearch 47 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600 --graceful-timeout 600'" 48 | 49 | elasticsearch: 50 | # This will start an empty elasticsearch instance (so you have to add your documents yourself) 51 | #image: "elasticsearch:7.9.2" 52 | # If you want a demo image instead that is "ready-to-query" with some indexed articles 53 | # about countries and capital cities from Wikipedia: 54 | #image: "dingke1980/elasticsearch-stack-overflow:1.0" 55 | image: $ELASTICSEARCH_IMG 56 | ports: 57 | - 9200:9200 58 | restart: on-failure 59 | volumes: 60 | - $DATA_DIR:/usr/share/elasticsearch/data 61 | environment: 62 | - discovery.type=single-node 63 | - ES_JAVA_OPTS=-Xmx4g -Xms4g 64 | 65 | ui: 66 | build: 67 | context: ../../../ui 68 | dockerfile: Dockerfile 69 | args: 70 | - http_proxy=$HTTP_PROXY 71 | - https_proxy=$HTTPS_PROXY 72 | image: "intel/ai-workflows:odqa-haystack-ui" 73 | ports: 74 | - 8501:8501 75 | restart: on-failure 76 | volumes: 77 | - $UI_CONFIG_DIR:/home/user/data/ 78 | environment: 79 | - API_ENDPOINT=http://haystack-api:8000 80 | - EVAL_FILE=ui/eval_labels_example.csv 81 | - PIPELINE_PATH=$PIPELINE_PATH 82 | # The value fot the following variables will be read from the host, if present. 
83 | # They can also be temporarily set for docker-compose, for example: 84 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 85 | - DISABLE_FILE_UPLOAD=True 86 | - DEFAULT_QUESTION_AT_STARTUP 87 | - DEFAULT_DOCS_FROM_RETRIEVER 88 | - DEFAULT_NUMBER_OF_ANSWERS 89 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 90 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose-plaid.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - http_proxy=$HTTP_PROXY 9 | - https_proxy=$HTTPS_PROXY 10 | - no_proxy=$NO_PROXY 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | # Mount custom Pipeline YAML and custom Components. 13 | # volumes: 14 | # - ./rest_api/pipeline:/home/user/rest_api/pipeline 15 | #network_mode: host 16 | ports: 17 | - 8000:8000 18 | restart: on-failure 19 | volumes: 20 | - $CUSTOMER_DIR:/home/user/data 21 | - $CHECKPOINT_PATH:/home/user/model 22 | environment: 23 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
24 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 25 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 26 | - INDEX_NAME=$INDEX_NAME 27 | - CONCURRENT_REQUEST_PER_WORKER=48 28 | - http_proxy=$HTTP_PROXY 29 | - https_proxy=$HTTPS_PROXY 30 | - no_proxy=$NO_PROXY 31 | - KMP_BLOCKTIME=20 32 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 33 | - ENABLE_IPEX=$ENABLE_IPEX 34 | - IPEX_BF16=$IPEX_BF16 35 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 36 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 37 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 38 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 39 | 40 | ui: 41 | build: 42 | context: ../../../ui 43 | dockerfile: Dockerfile 44 | args: 45 | - http_proxy=$HTTP_PROXY 46 | - https_proxy=$HTTPS_PROXY 47 | - no_proxy=$NO_PROXY 48 | image: "intel/ai-workflows:odqa-haystack-ui" 49 | #network_mode: host 50 | ports: 51 | - 8501:8501 52 | restart: on-failure 53 | volumes: 54 | - $UI_CONFIG_DIR:/home/user/data/ 55 | environment: 56 | - API_ENDPOINT=http://haystack-api:8000 57 | #- API_ENDPOINT=http://localhost:8000 58 | - EVAL_FILE=ui/eval_labels_example.csv 59 | - PIPELINE_PATH=$PIPELINE_PATH 60 | # The value fot the following variables will be read from the host, if present. 
61 | # They can also be temporarily set for docker-compose, for example: 62 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 63 | - DISABLE_FILE_UPLOAD=True 64 | - DEFAULT_QUESTION_AT_STARTUP 65 | - DEFAULT_DOCS_FROM_RETRIEVER 66 | - DEFAULT_NUMBER_OF_ANSWERS 67 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 68 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/docker-compose/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | haystack-api: 4 | build: 5 | context: ../../../docker 6 | dockerfile: Dockerfile 7 | args: 8 | - http_proxy=$http_proxy 9 | - https_proxy=$https_proxy 10 | - no_proxy=$no_proxy 11 | image: "intel/ai-workflows:odqa-haystack-api" 12 | network_mode: host 13 | ports: 14 | - 8000:8000 15 | restart: on-failure 16 | #cpuset: "0" 17 | # Mount custom Pipeline YAML and custom Components. 18 | #volumes: 19 | # - $CUSTOMER_DIR:/home/user/data 20 | environment: 21 | - DOCUMENTSTORE_PARAMS_HOST=localhost 22 | - DOCUMENTSTORE_PARAMS_PORT=9200 23 | # See rest_api/pipeline/pipelines.haystack-pipeline.yml for configurations of Search & Indexing Pipeline. 
24 | - PIPELINE_YAML_PATH=$PIPELINE_PATH 25 | - QUERY_PIPELINE_NAME=$PIPELINE_NAME 26 | - CONCURRENT_REQUEST_PER_WORKER=48 27 | - http_proxy=$http_proxy 28 | - https_proxy=$https_proxy 29 | - no_proxy=$no_proxy 30 | - KMP_BLOCKTIME=20 31 | - MKL_ENABLE_INSTRUCTIONS=AVX512_E1 32 | - COLBERT_OPT=$COLBERT_OPT 33 | - ENABLE_IPEX=$ENABLE_IPEX 34 | - IPEX_BF16=$IPEX_BF16 35 | - CHECKPOINT_PATH=$CHECKPOINT_PATH 36 | - IS_DICT_CHECKPOINT=$IS_DICT_CHECKPOINT 37 | depends_on: 38 | - elasticsearch 39 | # Starts REST API with only 2 workers so that it can be run on systems with just 4GB of memory 40 | # If you need to handle large loads of incoming requests and have memory to spare, consider increasing the number of workers 41 | command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 1 --timeout 600'" 42 | elasticsearch: 43 | # This will start an empty elasticsearch instance (so you have to add your documents yourself) 44 | # If you want a demo image instead that is "ready-to-query" with some indexed articles 45 | # about countries and capital cities from Wikipedia: 46 | #image: "deepset/elasticsearch-countries-and-capitals" 47 | image: $ELASTICSEARCH_IMG 48 | ports: 49 | - 9200:9200 50 | restart: on-failure 51 | environment: 52 | - discovery.type=single-node 53 | - ES_JAVA_OPTS=-Xmx4g -Xms4g 54 | volumes: 55 | - $DATA_DIR:/usr/share/elasticsearch/data 56 | ui: 57 | build: 58 | context: ../../../ui 59 | dockerfile: Dockerfile 60 | args: 61 | - http_proxy=$http_proxy 62 | - https_proxy=$https_proxy 63 | - no_proxy=$no_proxy 64 | image: "intel/ai-workflows:odqa-haystack-ui" 65 | network_mode: host 66 | ports: 67 | - 8501:8501 68 | restart: on-failure 69 | volumes: 70 | - $UI_CONFIG_DIR:/home/user/data/ 71 | environment: 72 | - PIPELINE_PATH=$PIPELINE_PATH 73 | # - API_ENDPOINT=http://haystack-api:8000 74 | - API_ENDPOINT=http://localhost:8000 75 | # The value fot the following variables will be read from the 
host, if present. 76 | # They can also be temporarily set for docker-compose, for example: 77 | # DISABLE_FILE_UPLOAD=1 DEFAULT_DOCS_FROM_RETRIEVER=5 docker-compose up 78 | - DISABLE_FILE_UPLOAD=True 79 | - DEFAULT_QUESTION_AT_STARTUP 80 | - DEFAULT_DOCS_FROM_RETRIEVER 81 | - DEFAULT_NUMBER_OF_ANSWERS 82 | command: "/bin/bash -c 'sleep 15 && python -m streamlit run ui/webapp.py'" 83 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/launch_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A POSIX variable 3 | OPTIND=1 # Reset in case getopts has been used previously in the shell. 4 | rebuild='1' 5 | nginx='0' 6 | pipeline='emr_faq' 7 | device='cpu' 8 | database='marco' 9 | 10 | usage() { 11 | echo "Usage: $0 -p [pipeline] [optional parameters]" 12 | echo " options:" 13 | echo " -h Display usage" 14 | echo " -p pipeline" 15 | echo " pipelines = [emr_faq, faiss_faq, colbert_faq, colbert_opt_faq]" 16 | echo " -r rebuild" 17 | echo " rebuild the images [1 : yes, 0 : no] " 18 | echo " -d device" 19 | echo " devices = [cpu, gpu]" 20 | echo " -n nginx" 21 | echo " Use nginx for load balance [1 : yes, 0 : no]" 22 | echo " -e database" 23 | echo " Use database [stackoverflow, marco] for searching" 24 | echo "" 25 | echo " examples:" 26 | echo " Run emr_faq pipeline on CPU " 27 | echo " $0 -r 1 -d cpu -n 0 -p emr_faq" 28 | echo "" 29 | } 30 | 31 | while getopts "h?r:d:n:p:e:" opt; do 32 | case "$opt" in 33 | h|\?) 34 | usage 35 | exit 1 36 | ;; 37 | r) rebuild=$OPTARG 38 | ;; 39 | n) nginx=$OPTARG 40 | ;; 41 | p) pipeline=$OPTARG 42 | ;; 43 | d) device=$OPTARG 44 | ;; 45 | e) database=$OPTARG 46 | esac 47 | done 48 | 49 | shift $((OPTIND-1)) 50 | 51 | [ "${1:-}" = "--" ] && shift 52 | 53 | 54 | ## Override default values for values specified by the user 55 | if [ ! -z "$rebuild" ]; then 56 | rebuild=$rebuild 57 | fi 58 | 59 | 60 | if [ ! 
-z "$device" ]; then 61 | device=$device 62 | fi 63 | 64 | if [ ! -z "$pipeline" ]; then 65 | pipeline=$pipeline 66 | fi 67 | 68 | if [ ! -z "$nginx" ]; then 69 | nginx=$nginx 70 | fi 71 | 72 | if [ ! -z "$database" ]; then 73 | database=$database 74 | fi 75 | 76 | config='config/env.stackoverflow.esds_emr_yml_faq' 77 | build='' 78 | yaml_file='docker-compose.yml' 79 | 80 | if [[ $pipeline = "emr_faq" ]]; then 81 | config='config/env.'${database}'.esds_emr_faq' 82 | 83 | elif [[ $pipeline = "faiss_faq" ]]; then 84 | config='config/env.'${database}'.faiss_dpr' 85 | yaml_file='docker-compose-dpr.yml' 86 | 87 | elif [[ $pipeline = "colbert_faq" ]]; then 88 | config='config/env.'${database}'.esds_bm25r_colbert' 89 | if [[ $database = "stackoverflow" ]]; then 90 | echo "Cannot support ${pipeline} with ${database}, need the fine-tuned colbert model with ${database}" 91 | exit 0 92 | fi 93 | 94 | elif [[ $pipeline = "colbert_opt_faq" ]]; then 95 | config='config/env.'${database}'.esds_bm25r_colbert_opt' 96 | if [[ $database = "stackoverflow" ]]; then 97 | echo "Cannot support ${pipeline} with ${database}, need the fine-tuned colbert model with ${database}" 98 | exit 0 99 | fi 100 | fi 101 | 102 | if [[ $rebuild = "1" ]]; then 103 | echo "rebuild docker images" 104 | build='--build' 105 | fi 106 | echo "device = ${device}" 107 | if [[ $device = "gpu" ]]; then 108 | yaml_file='docker-compose-gpu.yml' 109 | if [[ $pipeline = "faiss_faq" ]]; then 110 | yaml_file='docker-compose-gpu-dpr.yml' 111 | fi 112 | fi 113 | 114 | if [[ $nginx = "1" ]]; then 115 | echo "use the nginx for load balance, only CPU mode supported!" 
116 | yaml_file='docker-compose-nginx.yml' 117 | fi 118 | 119 | echo "run the ${pipeline} with ${database} on ${device}" 120 | yaml_file='docker-compose/'$yaml_file 121 | 122 | docker-compose --env-file $config -f $yaml_file up $build 123 | -------------------------------------------------------------------------------- /applications/odqa_pipelines/ui_config/marco/config.yml: -------------------------------------------------------------------------------- 1 | dataset: 2 | name: Marco 3 | questions: 4 | - question: "What's the deepest lake in the world?" 5 | answer: "Lake Bajkal" 6 | - question: "Which Chinese city is the largest?" 7 | answer: "Shanghai" 8 | - question: "What's the longest river of Europe?" 9 | answer: "The Volga" 10 | - question: "What's the tallest mountain in Africa?" 11 | answer: "Mount Kilimanjaro" 12 | pipelines: 13 | - name: pipelines.haystack-EmbeddingRetriever-pipeline.yml 14 | top_k_sliders: 15 | - name: answer 16 | desc: Max. number of answers 17 | default_value: 3 18 | keys: 19 | - key: Retriever 20 | param: top_k 21 | 22 | - name: pipelines_dpr.haystack-pipeline.yml 23 | top_k_sliders: 24 | - name: answer 25 | desc: "Max. number of answers" 26 | default_value: 3 27 | keys: 28 | - key: Retriever 29 | param: top_k 30 | 31 | - name: pipelines.colbertRanker.haystack-pipeline.yml 32 | top_k_sliders: 33 | - name: answer 34 | desc: "Max. number of answers" 35 | default_value: 3 36 | keys: 37 | - key: Ranker 38 | param: top_k 39 | 40 | - name: retriever 41 | desc: "Max. number of documents from retriever" 42 | default_value: 3 43 | keys: 44 | - key: Retriever 45 | param: top_k -------------------------------------------------------------------------------- /applications/odqa_pipelines/ui_config/stackoverflow/config.yml: -------------------------------------------------------------------------------- 1 | dataset: 2 | name: StackOverflow 3 | questions: 4 | - question: "How to pass a function as a parameter in C?" 
5 | - question: "How to open a file in C++?" 6 | - question: "How to convert a string to integer in C?" 7 | - question: "How to get local IP-Address from an udp-socket (C/C++)?" 8 | 9 | pipelines: 10 | - name: pipelines.haystack-EmbeddingRetriever-pipeline.yml 11 | top_k_sliders: 12 | - name: answer 13 | desc: Max. number of answers 14 | default_value: 3 15 | keys: 16 | - key: Retriever 17 | param: top_k 18 | 19 | - name: pipelines_dpr.haystack-pipeline.yml 20 | top_k_sliders: 21 | - name: answer 22 | desc: "Max. number of answers" 23 | default_value: 3 24 | keys: 25 | - key: Retriever 26 | param: top_k 27 | 28 | - name: pipelines.colbertRanker.haystack-pipeline.yml 29 | top_k_sliders: 30 | - name: answer 31 | desc: "Max. number of answers" 32 | default_value: 3 33 | keys: 34 | - key: Ranker 35 | param: top_k 36 | 37 | - name: retriever 38 | desc: "Max. number of documents from retriever" 39 | default_value: 3 40 | keys: 41 | - key: Retriever 42 | param: top_k -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | def pytest_addoption(parser): 2 | parser.addoption( 3 | "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate" 4 | ) 5 | 6 | 7 | def pytest_generate_tests(metafunc): 8 | # Get selected docstores from CLI arg 9 | document_store_type = metafunc.config.option.document_store_type 10 | selected_doc_stores = [item.strip() for item in document_store_type.split(",")] 11 | 12 | # parametrize document_store fixture if it's in the test function argument list 13 | # but does not have an explicit parametrize annotation e.g 14 | # @pytest.mark.parametrize("document_store", ["memory"], indirect=False) 15 | found_mark_parametrize_document_store = False 16 | for marker in metafunc.definition.iter_markers("parametrize"): 17 | if "document_store" in marker.args[0]: 18 | 
found_mark_parametrize_document_store = True 19 | break 20 | # for all others that don't have explicit parametrization, we add the ones from the CLI arg 21 | if "document_store" in metafunc.fixturenames and not found_mark_parametrize_document_store: 22 | metafunc.parametrize("document_store", selected_doc_stores, indirect=True) 23 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | WORKDIR /home/user 4 | 5 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ 6 | python3 \ 7 | python3-pip \ 8 | libpoppler-cpp-dev \ 9 | tesseract-ocr \ 10 | wget \ 11 | git \ 12 | libtesseract-dev \ 13 | poppler-utils \ 14 | libmkl-dev 15 | 16 | # Install PDF converter 17 | RUN wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \ 18 | tar -xvf xpdf-tools-linux-4.04.tar.gz && cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin 19 | 20 | #clone the code repo 21 | #RUN git clone --depth=1 https://github.com/intel/open-domain-question-and-answer.git 22 | #WORKDIR /home/user/open-domain-question-and-answer 23 | #RUN git submodule update --init --recursive 24 | 25 | COPY / /home/user/open-domain-question-and-answer/ 26 | WORKDIR /home/user/open-domain-question-and-answer 27 | # Install package 28 | RUN pip install torch torchvision --force-reinstall --extra-index-url https://download.pytorch.org/whl/cpu 29 | RUN pip install --upgrade pip 30 | # RUN pip install --no-cache-dir .[docstores,crawler,preprocessing,ocr,ray] 31 | RUN pip install --no-cache-dir .[faiss,preprocessing,ocr,ray] 32 | RUN pip install --no-cache-dir rest_api/ 33 | RUN pip install --no-cache-dir third_party/ColBERT/ 34 | RUN pip install numba 35 | #RUN pip install faiss-1.6.3-py3-none-any.whl 36 | RUN python3 -m pip install intel-extension-for-pytorch 37 | RUN pip install intel-openmp 38 | RUN ls 
/home/user 39 | RUN pip freeze 40 | RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()" 41 | 42 | # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH 43 | RUN mkdir -p /home/user/open-domain-question-and-answer/rest_api/file-upload 44 | RUN chmod 777 /home/user/open-domain-question-and-answer/rest_api/file-upload 45 | RUN ln -s /usr/bin/python3.8 /usr/bin/python 46 | 47 | EXPOSE 8000 48 | ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_CPU_CONTAINER" 49 | 50 | # cmd for running the API 51 | CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"] 52 | -------------------------------------------------------------------------------- /docker/Dockerfile-GPU: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.3.0-runtime-ubuntu20.04 2 | 3 | WORKDIR /home/user 4 | 5 | ENV LC_ALL=C.UTF-8 6 | ENV LANG=C.UTF-8 7 | 8 | # Install software dependencies 9 | RUN apt-get update && apt-get install -y software-properties-common && \ 10 | add-apt-repository ppa:deadsnakes/ppa && \ 11 | apt-get install -y \ 12 | cmake \ 13 | curl \ 14 | git \ 15 | libpoppler-cpp-dev \ 16 | libtesseract-dev \ 17 | pkg-config \ 18 | poppler-utils \ 19 | python3-pip \ 20 | python3.7 \ 21 | python3.7-dev \ 22 | python3.7-distutils \ 23 | swig \ 24 | tesseract-ocr \ 25 | wget && \ 26 | rm -rf /var/lib/apt/lists/* 27 | 28 | # Install PDF converter 29 | RUN wget https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \ 30 | tar -xvzf xpdf-tools-linux-4.04.tar.gz -C /usr/local/bin --strip-components=2 xpdf-tools-linux-4.04/bin64/pdftotext 31 | 32 | # Set default Python version 33 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 && \ 34 | update-alternatives --set python3 /usr/bin/python3.7 35 | 36 | #clone the code repo 37 | RUN git clone --depth=1 
https://github.com/intel/open-domain-question-and-answer.git 38 | WORKDIR /home/user/open-domain-question-and-answer 39 | RUN git submodule update --init --recursive 40 | 41 | # Install package 42 | RUN pip install --upgrade pip 43 | # RUN pip install --no-cache-dir .[docstores-gpu,crawler,preprocessing,ocr,ray] 44 | RUN pip install --no-cache-dir .[faiss,preprocessing,ocr,ray] 45 | RUN pip install --no-cache-dir rest_api/ 46 | RUN pip install --no-cache-dir third_party/ColBERT/ 47 | RUN pip install numba 48 | # Install PyTorch for CUDA 11 49 | RUN pip3 install --no-cache-dir torch==1.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html 50 | 51 | # Cache Roberta and NLTK data 52 | RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()" 53 | 54 | # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH 55 | RUN mkdir -p /home/user/open-domain-question-and-answer/rest_api/file-upload 56 | RUN chmod 777 /home/user/open-domain-question-and-answer/rest_api/file-upload 57 | 58 | 59 | EXPOSE 8000 60 | ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_GPU_CONTAINER" 61 | 62 | # cmd for running the API (note: "--preload" is not working with cuda) 63 | CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"] 64 | -------------------------------------------------------------------------------- /haystack/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=wrong-import-position,wrong-import-order 2 | 3 | from typing import Union 4 | from types import ModuleType 5 | 6 | try: 7 | from importlib import metadata 8 | except (ModuleNotFoundError, ImportError): 9 | # Python <= 3.7 10 | import importlib_metadata as metadata # type: ignore 11 | 12 | __version__: str = str(metadata.version("farm-haystack")) 13 | 14 | 15 | # Logging is not configured here 
import os
from pathlib import Path  # NOTE(review): unused in this module — possibly kept for re-export; confirm before removing

# Runtime feature flags, read once at import time. Values stay as the raw
# environment *strings* (default "False"), not booleans — consumers compare
# them against literal strings such as "True".
COLBERT_OPT = os.getenv("COLBERT_OPT", "False")  # presumably toggles an optimized ColBERT code path — confirm against consumers
ENABLE_IPEX = os.getenv("ENABLE_IPEX", "False")  # presumably enables Intel Extension for PyTorch — confirm
IPEX_BF16 = os.getenv("IPEX_BF16", "False")  # presumably selects bfloat16 precision when IPEX is on — confirm
IS_DICT_CHECKPOINT = os.getenv("IS_DICT_CHECKPOINT", "False")  # presumably marks checkpoints stored as state dicts — confirm
"opensearch" 19 | ) 20 | SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql") 21 | FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss") 22 | PLAIDDocumentStore = safe_import("haystack.document_stores.plaid", "PLAIDDocumentStore", "plaid") 23 | PineconeDocumentStore = safe_import("haystack.document_stores.pinecone", "PineconeDocumentStore", "pinecone") 24 | MilvusDocumentStore = safe_import("haystack.document_stores.milvus", "MilvusDocumentStore", "milvus") 25 | WeaviateDocumentStore = safe_import("haystack.document_stores.weaviate", "WeaviateDocumentStore", "weaviate") 26 | GraphDBKnowledgeGraph = safe_import("haystack.document_stores.graphdb", "GraphDBKnowledgeGraph", "graphdb") 27 | InMemoryKnowledgeGraph = safe_import( 28 | "haystack.document_stores.memory_knowledgegraph", "InMemoryKnowledgeGraph", "inmemorygraph" 29 | ) 30 | -------------------------------------------------------------------------------- /haystack/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import sys 4 | from typing import Any, Dict 5 | import torch 6 | import transformers 7 | 8 | from haystack import __version__ 9 | 10 | 11 | HAYSTACK_EXECUTION_CONTEXT = "HAYSTACK_EXECUTION_CONTEXT" 12 | HAYSTACK_DOCKER_CONTAINER = "HAYSTACK_DOCKER_CONTAINER" 13 | 14 | 15 | env_meta_data: Dict[str, Any] = {} 16 | 17 | 18 | def get_or_create_env_meta_data() -> Dict[str, Any]: 19 | """ 20 | Collects meta data about the setup that is used with Haystack, such as: operating system, python version, Haystack version, transformers version, pytorch version, number of GPUs, execution environment, and the value stored in the env variable HAYSTACK_EXECUTION_CONTEXT. 
def _get_execution_environment():
    """
    Identifies the execution environment that Haystack is running in.
    Options are: colab notebook, kubernetes, CPU/GPU docker container, test environment, jupyter notebook, python script
    """
    # Guard clauses ordered by specificity; the first match wins.
    if os.environ.get("CI", "False").lower() == "true":
        return "ci"
    if "google.colab" in sys.modules:
        return "colab"
    if "KUBERNETES_SERVICE_HOST" in os.environ:
        return "kubernetes"
    if HAYSTACK_DOCKER_CONTAINER in os.environ:
        return os.environ.get(HAYSTACK_DOCKER_CONTAINER)
    # A loaded pytest module means we are running under a test runner
    if "pytest" in sys.modules:
        return "test"
    try:
        # get_ipython is injected into builtins by IPython/Jupyter
        return get_ipython().__class__.__name__  # pylint: disable=undefined-variable
    except NameError:
        return "script"
from typing import Optional, List

from math import ceil

import torch
from torch.utils.data import DataLoader, Dataset, Sampler

from haystack.errors import ModelingError


class NamedDataLoader(DataLoader):
    # DataLoader variant whose batches are dicts keyed by tensor name
    # instead of positional tuples.
    def __init__(
        self,
        dataset: Dataset,
        batch_size: int,
        sampler: Optional[Sampler] = None,
        tensor_names: Optional[List[str]] = None,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        """
        A modified version of the PyTorch DataLoader that returns a dictionary where the key is
        the name of the tensor and the value is the tensor itself.

        :param dataset: The dataset that will be wrapped by this NamedDataLoader
        :param sampler: The sampler used by the NamedDataLoader to choose which samples to include in the batch
        :param batch_size: The size of the batch to be returned by the NamedDataLoader
        :param tensor_names: The names of the tensor, in the order that the dataset returns them in.
        :param num_workers: number of workers to use for the DataLoader
        :param pin_memory: argument for Data Loader to use page-locked memory for faster transfer of data to GPU
        """

        def collate_fn(batch):
            """
            A custom collate function that formats the batch as a dictionary where the key is
            the name of the tensor and the value is the tensor itself
            """
            # Streaming datasets carry their own tensor names; matched by class
            # name rather than isinstance — presumably to avoid importing the
            # streaming dataset class here. TODO confirm.
            if type(dataset).__name__ == "_StreamingDataSet":
                _tensor_names = dataset.tensor_names
            else:
                _tensor_names = tensor_names

            # A nested first element means the whole batch arrived as a single item;
            # unwrap it so `batch` is a flat list of examples.
            if type(batch[0]) == list:
                batch = batch[0]

            if len(batch[0]) != len(_tensor_names):
                raise ModelingError(
                    f"Dataset contains {len(batch[0])} tensors while there are {len(_tensor_names)} tensor names supplied: {_tensor_names}"
                )
            # Transpose the batch: collect one list of tensors per tensor name ...
            lists_temp = [[] for _ in range(len(_tensor_names))]
            ret = dict(zip(_tensor_names, lists_temp))

            for example in batch:
                for name, tensor in zip(_tensor_names, example):
                    ret[name].append(tensor)

            # ... then stack each list into a single batched tensor.
            for key in ret:
                ret[key] = torch.stack(ret[key])

            return ret

        super().__init__(
            dataset=dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            num_workers=num_workers,
        )

    def __len__(self) -> int:
        # Streaming datasets don't support the default length computation:
        # derive the number of batches from the sample count instead.
        if type(self.dataset).__name__ == "_StreamingDataSet":
            num_samples = len(self.dataset)
            num_batches = ceil(num_samples / self.dataset.batch_size)
            return num_batches
        else:
            return super().__len__()
"""
This is a copy of the official evaluation script for SQuAD version 2.0.
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import collections
import re
import string


def normalize_answer(s: str):
    """
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def strip_punct(text):
        punct = set(string.punctuation)
        return "".join(ch for ch in text if ch not in punct)

    def drop_articles(text):
        return re.compile(r"\b(a|an|the)\b", re.UNICODE).sub(" ", text)

    def squeeze_spaces(text):
        return " ".join(text.split())

    # Order matters: lowercase first, strip punctuation, drop articles,
    # then collapse the leftover whitespace.
    return squeeze_spaces(drop_articles(strip_punct(s.lower())))


def get_tokens(s: str):
    """Split *s* into normalized tokens; a falsy input yields an empty list."""
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold: str, a_pred: str):
    """Exact-match score: 1 if the normalized answers are identical, else 0."""
    return 1 if normalize_answer(a_gold) == normalize_answer(a_pred) else 0


def compute_f1(a_gold: str, a_pred: str):
    """Token-level F1 between the gold and predicted answers (SQuAD definition)."""
    gold = get_tokens(a_gold)
    pred = get_tokens(a_pred)
    # A no-answer on either side scores 1 only when both sides agree.
    if not gold or not pred:
        return int(gold == pred)
    overlap = collections.Counter(gold) & collections.Counter(pred)
    shared = sum(overlap.values())
    if shared == 0:
        return 0
    precision = shared / len(pred)
    recall = shared / len(gold)
    return (2 * precision * recall) / (precision + recall)
from typing import Any, List, Union, Optional

import logging
from pathlib import Path
from abc import ABC, abstractmethod

import torch


logger = logging.getLogger(__name__)


class HaystackModel(ABC):
    """
    The interface on top of HaystackTransformer and HaystackSentenceTransformer.
    """

    def __init__(
        self, pretrained_model_name_or_path: Union[str, Path], model_type: Optional[str], content_type: str
    ):  # replace the type of content_type with ContentTypes starting Python3.8
        """
        :param pretrained_model_name_or_path: The name of the model to load
        :param model_type: the value of `model_type` from the model's `Config` class.
        :param content_type: The type of data (such as "text", "image" and so on) the model should process.
            See the values of `haystack.schema.ContentTypes`.
        """
        # Lazy %-style logging args: the message is only formatted when INFO
        # is actually enabled (the original built the f-string eagerly).
        logger.info(
            " 🤖 Loading '%s' (%s of type '%s' for %s data)",
            pretrained_model_name_or_path,
            self.__class__.__name__,
            model_type if model_type else "",
            content_type,
        )
        self.model_name_or_path = pretrained_model_name_or_path
        self.model_type = model_type
        self.content_type = content_type

    @abstractmethod
    def encode(self, data: List[Any], **kwargs) -> torch.Tensor:
        """
        Run the model on the input data to obtain output vectors.
        """
        raise NotImplementedError("Abstract method, use a subclass.")

    @abstractmethod
    def to(self, devices: Optional[List[torch.device]]) -> None:
        """
        Send the model to the specified PyTorch device(s)
        """
        raise NotImplementedError("Abstract method, use a subclass.")

    @property
    @abstractmethod
    def embedding_dim(self) -> int:
        """
        The output embedding size.
        """
        raise NotImplementedError("Abstract method, use a subclass.")
| (___ __ _ _ __ ___ _ __ | | ___ 15 | "||" \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 16 | || /\ ____) | (_| | | | | | | |_) | | __/ 17 | /\ ||//\) |_____/ \__,_|_| |_| |_| .__/|_|\___| 18 | (/\\||/ |_| 19 | ______\||/___________________________________________ 20 | """ 21 | 22 | FENCE = r""" 23 | _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 24 | _| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_ 25 | -| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |- 26 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 27 | _| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_ 28 | -| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |-| |- 29 | |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| |_| 30 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 31 | 32 | """ 33 | 34 | TRACTOR_SMALL = r""" 35 | ______ 36 | |o | ! 37 | __ |:`_|---'-. 38 | |__|______.-/ _ \-----.| 39 | (o)(o)------'\ _ / ( ) 40 | """ 41 | 42 | 43 | TRACTOR_WITH_SILO_LINE = r""" 44 | ____ 45 | /____\ 46 | ______ | | 47 | |o | ! | | 48 | __ |:`_|---'-. | | 49 | |__|______.-/ _ \-----.| |______| 50 | (o)(o)------'\ _ / ( ) | | 51 | """ 52 | 53 | 54 | ROOSTER = r""" 55 | _ m 56 | ,`.\/'> 57 | (`\<_/` 58 | `<< 59 | """ 60 | 61 | PIG = r""" 62 | 63 | .-~~~~-. |\\_ 64 | @_/ / oo\_ 65 | | \ \ _(") 66 | \ /-| ||'--' 67 | \_\ \_\\ 68 | 69 | """ 70 | SMALL_PIG = r""" 71 | @___,__ 72 | ( ^'_] 73 | //-\\' 74 | ^^ ^^ 75 | """ 76 | FENCE_SEP = r""" 77 | |---||---|---|---|---|---|---|---| 78 | """ 79 | 80 | BUSH_SEP = r"""\\|// \\|// \\|// \\|// \\|// 81 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^""" 82 | 83 | WATERING_CAN = r""" 84 | ______ 85 | _ ,',----.`. 86 | '.`-. .-' '----. 
|| 87 | `.`-'--------| ;; 88 | `.|--------|// 89 | \ / 90 | '--------' 91 | """ 92 | 93 | WORKER_M = r""" 0 94 | /|\ 95 | /'\ """ 96 | 97 | WORKER_F = r""" 0 98 | /w\ 99 | / \ """ 100 | 101 | WORKER_X = r""" 0 102 | /w\ 103 | /'\ """ 104 | -------------------------------------------------------------------------------- /haystack/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | from haystack.nodes.base import BaseComponent 4 | 5 | from haystack.nodes.answer_generator import BaseGenerator, RAGenerator, Seq2SeqGenerator, OpenAIAnswerGenerator 6 | from haystack.nodes.document_classifier import BaseDocumentClassifier, TransformersDocumentClassifier 7 | from haystack.nodes.evaluator import EvalDocuments, EvalAnswers 8 | from haystack.nodes.extractor import EntityExtractor, simplify_ner_for_qa 9 | from haystack.nodes.file_classifier import FileTypeClassifier 10 | from haystack.nodes.file_converter import ( 11 | BaseConverter, 12 | DocxToTextConverter, 13 | ImageToTextConverter, 14 | MarkdownConverter, 15 | PDFToTextConverter, 16 | PDFToTextOCRConverter, 17 | TikaConverter, 18 | TikaXHTMLParser, 19 | TextConverter, 20 | AzureConverter, 21 | ParsrConverter, 22 | ) 23 | from haystack.nodes.label_generator import PseudoLabelGenerator 24 | from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger, Dataset 25 | from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor 26 | from haystack.nodes.prompt import PromptNode, PromptTemplate, PromptModel 27 | from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier 28 | from haystack.nodes.question_generator import QuestionGenerator 29 | from haystack.nodes.ranker import BaseRanker, SentenceTransformersRanker, ColBERTRanker 30 | from haystack.nodes.reader import BaseReader, FARMReader, TransformersReader, TableReader, 
RCIReader 31 | from haystack.nodes.retriever import ( 32 | BaseRetriever, 33 | DenseRetriever, 34 | DensePassageRetriever, 35 | EmbeddingRetriever, 36 | BM25Retriever, 37 | ElasticsearchRetriever, 38 | FilterRetriever, 39 | MultihopEmbeddingRetriever, 40 | ElasticsearchFilterOnlyRetriever, 41 | TfidfRetriever, 42 | Text2SparqlRetriever, 43 | TableTextRetriever, 44 | MultiModalRetriever, 45 | ) 46 | from haystack.nodes.summarizer import BaseSummarizer, TransformersSummarizer 47 | from haystack.nodes.translator import BaseTranslator, TransformersTranslator 48 | 49 | Crawler = safe_import("haystack.nodes.connector.crawler", "Crawler", "crawler") # Has optional dependencies 50 | AnswerToSpeech = safe_import( 51 | "haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio" 52 | ) # Has optional dependencies 53 | DocumentToSpeech = safe_import( 54 | "haystack.nodes.audio.document_to_speech", "DocumentToSpeech", "audio" 55 | ) # Has optional dependencies 56 | -------------------------------------------------------------------------------- /haystack/nodes/answer_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.answer_generator.base import BaseGenerator 2 | from haystack.nodes.answer_generator.transformers import RAGenerator, Seq2SeqGenerator 3 | from haystack.nodes.answer_generator.openai import OpenAIAnswerGenerator 4 | -------------------------------------------------------------------------------- /haystack/nodes/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | AnswerToSpeech = safe_import( 4 | "haystack.nodes.audio.answer_to_speech", "AnswerToSpeech", "audio" 5 | ) # Has optional dependencies 6 | DocumentToSpeech = safe_import( 7 | "haystack.nodes.audio.document_to_speech", "DocumentToSpeech", "audio" 8 | ) # Has optional dependencies 9 | 
from typing import List, Union, Optional

import logging
from abc import abstractmethod
from functools import wraps
from time import perf_counter

from haystack.schema import Document
from haystack.nodes.base import BaseComponent


logger = logging.getLogger(__name__)


class BaseDocumentClassifier(BaseComponent):
    """
    Abstract base for document classifier nodes: subclasses implement
    `predict` / `predict_batch`, while this class supplies the pipeline
    `run` / `run_batch` plumbing and simple query-timing statistics.
    """

    outgoing_edges = 1
    # Class-level defaults for the timing stats; run() and timing() write the
    # updated values onto the instance, shadowing these.
    query_count = 0
    query_time = 0

    @abstractmethod
    def predict(self, documents: List[Document]):
        # Classify `documents`; implemented by subclasses.
        pass

    @abstractmethod
    def predict_batch(
        self, documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None
    ) -> Union[List[Document], List[List[Document]]]:
        # Classify one flat or several nested lists of documents; implemented by subclasses.
        pass

    def run(self, documents: Union[List[dict], List[Document]], root_node: str):  # type: ignore
        """
        Pipeline entry point: classify `documents` and emit them on "output_1".
        `root_node == "File"` indicates an indexing pipeline, in which case the
        results are converted back to dicts.
        """
        self.query_count += 1
        if documents:
            predict = self.timing(self.predict, "query_time")
            # Indexing pipelines may hand us plain dicts; normalize to Document first.
            documents = [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in documents]
            results = predict(documents=documents)
        else:
            results = []

        document_ids = [doc.id for doc in results]
        logger.debug("Classified documents with IDs: %s", document_ids)

        # convert back to dicts if we are in an indexing pipeline
        if root_node == "File":
            results = [doc.to_dict() for doc in results]

        output = {"documents": results}

        return output, "output_1"

    def run_batch(self, documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None):  # type: ignore
        """Batch counterpart of run(); accepts a flat or nested list of Documents."""
        predict_batch = self.timing(self.predict_batch, "query_time")
        results = predict_batch(documents=documents, batch_size=batch_size)
        output = {"documents": results}

        # Nested input produces nested results, so log IDs accordingly.
        if isinstance(documents[0], Document):
            document_ids = [doc.id for doc in results]
            logger.debug("Classified documents with IDs: %s", document_ids)
        else:
            for doc_list in results:
                document_ids = [doc.id for doc in doc_list]
                logger.debug("Classified documents with IDs: %s", document_ids)

        return output, "output_1"

    def timing(self, fn, attr_name):
        """Wrapper method used to time functions."""

        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Accumulate elapsed wall-clock time onto the named instance attribute.
            if attr_name not in self.__dict__:
                self.__dict__[attr_name] = 0
            tic = perf_counter()
            ret = fn(*args, **kwargs)
            toc = perf_counter()
            self.__dict__[attr_name] += toc - tic
            return ret

        return wrapper

    def print_time(self):
        """Print aggregate query count and timing statistics to stdout."""
        print("Classifier (Speed)")
        print("---------------")
        if not self.query_count:
            print("No querying performed via Classifier.run()")
        else:
            print(f"Queries Performed: {self.query_count}")
            print(f"Query time: {self.query_time}s")
            print(f"{self.query_time / self.query_count} seconds per query")
-------------------------------------------------------------------------------- 1 | from haystack.nodes.evaluator.evaluator import EvalDocuments, EvalAnswers 2 | -------------------------------------------------------------------------------- /haystack/nodes/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.extractor.entity import EntityExtractor, simplify_ner_for_qa 2 | -------------------------------------------------------------------------------- /haystack/nodes/file_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.utils.import_utils import safe_import 2 | 3 | FileTypeClassifier = safe_import( 4 | "haystack.nodes.file_classifier.file_type", "FileTypeClassifier", "preprocessing" 5 | ) # Has optional dependencies 6 | -------------------------------------------------------------------------------- /haystack/nodes/file_converter/__init__.py: -------------------------------------------------------------------------------- 1 | from haystack.nodes.file_converter.base import BaseConverter 2 | 3 | from haystack.utils.import_utils import safe_import 4 | 5 | from haystack.nodes.file_converter.docx import DocxToTextConverter 6 | from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser 7 | from haystack.nodes.file_converter.txt import TextConverter 8 | from haystack.nodes.file_converter.azure import AzureConverter 9 | from haystack.nodes.file_converter.parsr import ParsrConverter 10 | 11 | MarkdownConverter = safe_import( 12 | "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing" 13 | ) # Has optional dependencies 14 | ImageToTextConverter = safe_import( 15 | "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr" 16 | ) # Has optional dependencies 17 | PDFToTextConverter = safe_import( 18 | "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr" 19 | ) # 
class DocxToTextConverter(BaseConverter):
    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Extract text from a .docx file.
        Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
        For compliance with other converters we nevertheless opted for keeping the methods name.

        :param file_path: Path to the .docx file you want to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: Not supported by this converter; passing a truthy value raises.
        :param valid_languages: Not supported by this converter; passing a non-empty list raises.
        :param encoding: Not applicable
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :raises Exception: if `remove_numeric_tables` or `valid_languages` is requested,
            since neither feature is implemented for .docx input.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if remove_numeric_tables:
            raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
        # BUGFIX: the previous check was `valid_languages is True`, which can never
        # be satisfied because `valid_languages` is a list (or None), not the bool
        # True. As a result the "not supported" error was silently skipped and the
        # option was ignored. A truthiness check raises as the message intends.
        if valid_languages:
            raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        file = docx.Document(file_path)  # Creating word reader object.
        # A .docx has no page structure, so each paragraph becomes one text line.
        paragraphs = [para.text for para in file.paragraphs]
        text = "\n".join(paragraphs)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]
13 | """ 14 | 15 | outgoing_edges = 1 16 | 17 | def __init__(self, batch_size: Optional[int]): 18 | super().__init__() 19 | self.dataset = None 20 | self.batch_size = batch_size 21 | 22 | @abstractmethod 23 | def convert(self) -> ray.data.Dataset: 24 | """ 25 | Convert a dataset to ray.data.Dataset of Haystack Documents or files path. 26 | """ 27 | pass 28 | 29 | def run(self): # type: ignore 30 | # conversion from dataset-> Documents or files path 31 | self.dataset = self.convert() 32 | enable_sample = os.getenv('ENABLE_SAMPLING_LIMIT', default="0") 33 | if enable_sample == "1" : 34 | self.dataset = self.dataset.limit(500) 35 | return {}, "output_1" 36 | 37 | def run_batch(self): # type: ignore 38 | return self.run() 39 | 40 | def dataset_batched_generator(self) : 41 | """ 42 | Generator to generate the batched haystack Documents or batched files path 43 | """ 44 | return self.dataset.iter_batches(batch_size=self.batch_size) -------------------------------------------------------------------------------- /haystack/nodes/other/docs2answers.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Dict 2 | 3 | from tqdm.auto import tqdm 4 | 5 | from haystack.errors import HaystackError 6 | from haystack.schema import Document, Answer, Span 7 | from haystack.nodes.base import BaseComponent 8 | 9 | 10 | class Docs2Answers(BaseComponent): 11 | """ 12 | This Node is used to convert retrieved documents into predicted answers format. 13 | It is useful for situations where you are calling a Retriever only pipeline via REST API. 14 | This ensures that your output is in a compatible format. 
class Docs2Answers(BaseComponent):
    """
    Convert retrieved Documents into the predicted-Answer format.

    Handy when a Retriever-only pipeline is served via the REST API and the
    client expects answer-shaped output.

    :param progress_bar: Whether to show a progress bar
    """

    outgoing_edges = 1

    def __init__(self, progress_bar: bool = True):
        super().__init__()
        self.progress_bar = progress_bar

    def run(self, query: str, documents: List[Document]):  # type: ignore
        # Document -> Answer, one answer per retrieved document.
        answers: List[Answer] = [self._convert_doc_to_answer(doc) for doc in documents]
        return {"query": query, "answers": answers}, "output_1"

    def run_batch(self, queries: List[str], documents: Union[List[Document], List[List[Document]]]):  # type: ignore
        output: Dict = {"queries": queries, "answers": []}

        # Case 1: a single flat list of Documents.
        if len(documents) > 0 and isinstance(documents[0], Document):
            for doc in tqdm(documents, disable=not self.progress_bar, desc="Converting to answers"):
                if not isinstance(doc, Document):
                    raise HaystackError(f"doc was of type {type(doc)}, but expected a Document.")
                output["answers"].append([self._convert_doc_to_answer(doc)])

        # Case 2: one list of Documents per query.
        elif len(documents) > 0 and isinstance(documents[0], list):
            for doc_list in tqdm(documents, disable=not self.progress_bar, desc="Converting to answers"):
                if not isinstance(doc_list, list):
                    raise HaystackError(f"docs was of type {type(doc_list)}, but expected a list of Documents.")
                output["answers"].append([self._convert_doc_to_answer(doc) for doc in doc_list])

        return output, "output_1"

    @staticmethod
    def _convert_doc_to_answer(doc: Document) -> Answer:
        # For FAQ-style documents the stored answer is the answer text; for
        # regular documents the whole content acts as the "answer". Either way
        # the context and offsets span the full answer text.
        if "answer" in doc.meta:
            doc.meta["query"] = doc.content  # question from the existing FAQ
            answer_text = doc.meta["answer"]
        else:
            answer_text = doc.content

        return Answer(
            answer=answer_text,
            type="other",
            score=doc.score,
            context=answer_text,
            offsets_in_context=[Span(start=0, end=len(answer_text))],
            document_id=doc.id,
            meta=doc.meta,
        )
class JoinNode(BaseComponent):
    """Base class for nodes that merge the outputs of several upstream nodes."""

    outgoing_edges: int = 1

    def run(  # type: ignore
        self,
        inputs: Optional[List[dict]] = None,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
        answers: Optional[List[Answer]] = None,
        top_k_join: Optional[int] = None,
    ) -> Tuple[Dict, str]:
        # Normal path: several upstream results have been accumulated.
        if inputs:
            return self.run_accumulated(inputs, top_k_join=top_k_join)
        # Degenerate path: a single input was wired in directly.
        warnings.warn("You are using a JoinNode with only one input. This is usually equivalent to a no-op.")
        single_input = {
            "query": query,
            "file_paths": file_paths,
            "labels": labels,
            "documents": documents,
            "meta": meta,
            "answers": answers,
        }
        return self.run_accumulated(inputs=[single_input], top_k_join=top_k_join)

    @abstractmethod
    def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None) -> Tuple[Dict, str]:
        pass

    def run_batch(  # type: ignore
        self,
        inputs: Optional[List[dict]] = None,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
        answers: Optional[List[Answer]] = None,
        top_k_join: Optional[int] = None,
    ) -> Tuple[Dict, str]:
        if inputs:
            return self.run_batch_accumulated(inputs=inputs, top_k_join=top_k_join)
        warnings.warn("You are using a JoinNode with only one input. This is usually equivalent to a no-op.")
        single_input = {
            "queries": queries,
            "file_paths": file_paths,
            "labels": labels,
            "documents": documents,
            "meta": meta,
            "params": params,
            "debug": debug,
            "answers": answers,
        }
        return self.run_batch_accumulated(inputs=[single_input], top_k_join=top_k_join)

    @abstractmethod
    def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None) -> Tuple[Dict, str]:
        pass
class STRanker(BaseRanker):
    """Rank Documents by cosine similarity of SentenceTransformer embeddings."""

    def __init__(
        self,
        model_name_or_path: Union[str, Path],
        top_k: int = 10,
    ):
        self.model = SentenceTransformer(model_name_or_path)
        self.top_k = top_k
        self.model.eval()

    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
        """Return ``documents`` sorted by similarity to ``query``, best first, truncated to ``top_k``."""
        k = self.top_k if top_k is None else top_k

        contents = [doc.content for doc in documents]
        doc_vectors = self.model.encode(contents)
        query_vector = self.model.encode(query)
        similarities = util.cos_sim(query_vector, doc_vectors).flatten()

        # Highest similarity first; the key function keeps Document objects
        # out of the comparison (sort is stable for equal scores).
        ranked = sorted(
            zip(similarities, documents),
            key=lambda pair: pair[0],
            reverse=True,
        )
        ordered_documents = [doc for _, doc in ranked]
        return ordered_documents[:k]

    def predict_batch(
        self,
        query_doc_list: List[dict],
        top_k: Optional[int] = None,
    ):
        raise NotImplementedError
class BaseSummarizer(BaseComponent):
    """
    Abstract class for Summarizer
    """

    outgoing_edges = 1

    @abstractmethod
    def predict(self, documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]:
        """
        Abstract method for creating a summary.

        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param generate_single_summary: This parameter is deprecated and will be removed in Haystack 1.12
        :return: List of Documents, where Document.meta["summary"] contains the summarization
        """
        pass

    @abstractmethod
    def predict_batch(
        self,
        documents: Union[List[Document], List[List[Document]]],
        generate_single_summary: Optional[bool] = None,
        batch_size: Optional[int] = None,
    ) -> Union[List[Document], List[List[Document]]]:
        pass

    def run(self, documents: List[Document], generate_single_summary: Optional[bool] = None):  # type: ignore
        # An empty input short-circuits to an empty result without calling predict().
        summarized: Dict = {"documents": []}
        if documents:
            summarized["documents"] = self.predict(
                documents=documents, generate_single_summary=generate_single_summary
            )
        return summarized, "output_1"

    def run_batch(  # type: ignore
        self,
        documents: Union[List[Document], List[List[Document]]],
        generate_single_summary: Optional[bool] = None,
        batch_size: Optional[int] = None,
    ):
        batch_results = self.predict_batch(
            documents=documents, batch_size=batch_size, generate_single_summary=generate_single_summary
        )
        return {"documents": batch_results}, "output_1"
def clean_wiki_text(text: str) -> str:
    """
    Clean wikipedia text by removing multiple new lines, removing extremely short lines,
    adding paragraph breaks and removing empty paragraphs
    """
    # Collapse any run of consecutive newlines into a single newline.
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    # Keep only lines longer than 30 chars, plus wiki section headers
    # (always of the form "==Some Title==").
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > 30 or (line[:2] == "==" and line[-2:] == "==")
    ]
    text = "\n".join(kept_lines)

    # Re-introduce paragraph breaks before every section header.
    text = text.replace("\n==", "\n\n\n==")

    # Drop sections that ended up with no body (header followed by the break only).
    text = re.sub(r"(==.*==\n\n\n)", "", text)

    return text
def args_to_kwargs(args: Tuple, func: Callable) -> Dict[str, Any]:
    """Map positional ``args`` onto ``func``'s parameter names as a kwargs dict."""
    parameter_names = list(inspect.signature(func).parameters.keys())
    # Instance/class methods expose self/cls as the first parameter, which the
    # positional args do not include — shift the names by one to realign.
    if any(parameter_names) and parameter_names[0] in ("self", "cls"):
        parameter_names = parameter_names[1 : 1 + len(args)]
    return {name: value for value, name in zip(args, parameter_names)}
def retry_with_exponential_backoff(
    backoff_in_seconds: float = 1, max_retries: int = 10, errors: tuple = (OpenAIRateLimitError,)
):
    """
    Decorator to retry a function with exponential backoff.
    :param backoff_in_seconds: The initial backoff in seconds.
    :param max_retries: The maximum number of retries.
    :param errors: The errors to catch retry on.
    """

    def decorator(function):
        def wrapper(*args, **kwargs):
            attempt = 0
            # Loop until success, max_retries is exceeded, or another exception escapes.
            while True:
                try:
                    return function(*args, **kwargs)
                except errors as e:
                    # Give up once the retry budget is spent.
                    if attempt > max_retries:
                        raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")

                    # Exponential delay with a little jitter to avoid thundering herds.
                    delay = backoff_in_seconds * 2**attempt + random()
                    logger.warning(
                        f"{e.__class__.__name__ } - {e}, "
                        f"retry {function.__name__} in {'{0:.2f}'.format(delay)} seconds..."
                    )
                    time.sleep(delay)
                    attempt += 1
                except Exception as e:
                    # Anything outside `errors` is not retried.
                    raise e

        return wrapper

    return decorator
23 | """ 24 | if isinstance(inputs, dict): 25 | return {name: ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()} 26 | elif isinstance(inputs, list): 27 | return [ensure_tensor_on_device(item, device) for item in inputs] 28 | elif isinstance(inputs, tuple): 29 | return tuple(ensure_tensor_on_device(item, device) for item in inputs) 30 | elif isinstance(inputs, torch.Tensor): 31 | if device == torch.device("cpu") and inputs.dtype in {torch.float16, torch.bfloat16}: 32 | inputs = inputs.float() 33 | return inputs.to(device) 34 | else: 35 | return inputs 36 | 37 | 38 | def get_devices(devices: Optional[List[Union[str, torch.device]]]) -> List[torch.device]: 39 | """ 40 | Convert a list of device names into a list of Torch devices, 41 | depending on the system's configuration and hardware. 42 | """ 43 | if devices is not None: 44 | return [torch.device(device) for device in devices] 45 | elif torch.cuda.is_available(): 46 | return [torch.device(device) for device in range(torch.cuda.device_count())] 47 | return [torch.device("cpu")] 48 | -------------------------------------------------------------------------------- /images/odqa_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/images/odqa_workflow.png -------------------------------------------------------------------------------- /images/pipeline1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/images/pipeline1.PNG -------------------------------------------------------------------------------- /images/pipeline2.PNG: -------------------------------------------------------------------------------- 
#!/bin/bash
# Provision a CentOS host with Docker and docker-compose.

# install required software packages
# BUGFIX: was "yum updtate" (typo — the command failed outright);
# -y added so the update does not block a non-interactive provisioning run.
yum update -y
yum install -y yum-utils

# set up the repository
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo

# install and start service
yum install -y docker-ce
systemctl start docker

# install docker compose
curl -SL https://github.com/docker/compose/releases/download/v2.6.1/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
ln -sf /usr/local/bin/docker-compose /usr/bin/docker-compose
# check the installation
echo `docker-compose version`
form data 33 | "pynvml", 34 | "psutil" 35 | ] 36 | dynamic = ["version"] 37 | 38 | [project.optional-dependencies] 39 | dev = [ 40 | "httpx" 41 | ] 42 | 43 | [project.urls] 44 | Documentation = "https://github.com/deepset-ai/haystack/tree/main/rest_api#readme" 45 | Issues = "https://github.com/deepset-ai/haystack/issues" 46 | Source = "https://github.com/deepset-ai/haystack/tree/main/rest_api" 47 | 48 | [tool.hatch.version] 49 | path = "rest_api/__about__.py" 50 | 51 | [tool.hatch.build.targets.sdist] 52 | [tool.hatch.build.targets.wheel] 53 | 54 | [tool.hatch.envs.default] 55 | dependencies = [ 56 | "pytest", 57 | "pytest-cov", 58 | ] 59 | [tool.hatch.envs.default.scripts] 60 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=rest_api --cov=tests" 61 | no-cov = "cov --no-cov" 62 | 63 | [[tool.hatch.envs.test.matrix]] 64 | python = ["37", "38", "39", "310"] 65 | 66 | [tool.coverage.run] 67 | branch = true 68 | parallel = true 69 | omit = [ 70 | "rest_api/__about__.py", 71 | ] 72 | 73 | [tool.coverage.report] 74 | exclude_lines = [ 75 | "no cov", 76 | "if __name__ == .__main__.:", 77 | "if TYPE_CHECKING:", 78 | ] 79 | 80 | [tool.black] 81 | line-length = 120 82 | skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. 
83 | -------------------------------------------------------------------------------- /rest_api/rest_api/__about__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pathlib import Path 4 | 5 | 6 | __version__ = "0.0.0" 7 | try: 8 | __version__ = open(Path(__file__).parent.parent / "VERSION.txt", "r").read() 9 | except Exception as e: 10 | logging.exception("No VERSION.txt found!") 11 | -------------------------------------------------------------------------------- /rest_api/rest_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/rest_api/__init__.py -------------------------------------------------------------------------------- /rest_api/rest_api/application.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import uvicorn 4 | from rest_api.utils import get_app, get_pipelines 5 | 6 | 7 | logging.basicConfig(format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p") 8 | logger = logging.getLogger(__name__) 9 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 10 | logging.getLogger("haystack").setLevel(logging.INFO) 11 | 12 | 13 | app = get_app() 14 | pipelines = get_pipelines() # Unused here, called to init the pipelines early 15 | 16 | 17 | logger.info("Open http://127.0.0.1:8000/docs to see Swagger API Documentation.") 18 | logger.info( 19 | """ 20 | Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/query' 21 | -H "Content-Type: application/json" --data '{"query": "Who is the father of Arya Stark?"}' 22 | """ 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | uvicorn.run(app, host="0.0.0.0", port=8000) 28 | -------------------------------------------------------------------------------- /rest_api/rest_api/config.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | PIPELINE_YAML_PATH = os.getenv( 6 | "PIPELINE_YAML_PATH", str((Path(__file__).parent / "pipeline" / "pipelines.haystack-pipeline.yml").absolute()) 7 | ) 8 | QUERY_PIPELINE_NAME = os.getenv("QUERY_PIPELINE_NAME", "query") 9 | INDEXING_PIPELINE_NAME = os.getenv("INDEXING_PIPELINE_NAME", "indexing") 10 | INDEX_NAME = os.getenv("INDEX_NAME", "document") 11 | DOCUMENTSTORE_PARAMS_HOST = os.getenv("DOCUMENTSTORE_PARAMS_HOST", "elasticsearch") 12 | DOCUMENTSTORE_PARAMS_PORT = os.getenv("DOCUMENTSTORE_PARAMS_PORT", "9200") 13 | FILE_UPLOAD_PATH = os.getenv("FILE_UPLOAD_PATH", str((Path(__file__).parent / "file-upload").absolute())) 14 | LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") 15 | ROOT_PATH = os.getenv("ROOT_PATH", "/") 16 | CHECKPOINT_PATH = os.getenv("CHECKPOINT_PATH","/home/user/data/colbert.dnn") 17 | FAISS_DB_PATH = os.getenv("FAISS_DB_PATH","/home/user/data/faiss-index-so.faiss") 18 | MODEL_PATH = os.getenv("MODEL_PATH", "/home/user/model") 19 | PLAID_INDEX_PATH = os.getenv("PLAID_INDEX_PATH", "/home/user/data/plaid_indexing/") 20 | PLAID_COLLECTION_PATH=os.getenv("PLAID_COLLECTION_PATH", "/home/user/data/psgs_w100.tsv") 21 | CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", "4")) 22 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/__init__.py: -------------------------------------------------------------------------------- 1 | from rest_api.pipeline import custom_component # this import is required for the Custom Components to be registered 2 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/document.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import logging 4 | 5 | from fastapi import FastAPI, 
APIRouter 6 | from haystack.document_stores import BaseDocumentStore 7 | from haystack.schema import Document 8 | 9 | from rest_api.utils import get_app, get_pipelines 10 | from rest_api.config import LOG_LEVEL 11 | from rest_api.schema import FilterRequest 12 | 13 | 14 | logging.getLogger("haystack").setLevel(LOG_LEVEL) 15 | logger = logging.getLogger("haystack") 16 | 17 | 18 | router = APIRouter() 19 | app: FastAPI = get_app() 20 | document_store: BaseDocumentStore = get_pipelines().get("document_store", None) 21 | 22 | 23 | @router.post("/documents/get_by_filters", response_model=List[Document], response_model_exclude_none=True) 24 | def get_documents(filters: FilterRequest): 25 | """ 26 | This endpoint allows you to retrieve documents contained in your document store. 27 | You can filter the documents to retrieve by metadata (like the document's name), 28 | or provide an empty JSON object to clear the document store. 29 | 30 | Example of filters: 31 | `'{"filters": {{"name": ["some", "more"], "category": ["only_one"]}}'` 32 | 33 | To get all documents you should provide an empty dict, like: 34 | `'{"filters": {}}'` 35 | """ 36 | docs = document_store.get_all_documents(filters=filters.filters) 37 | for doc in docs: 38 | doc.embedding = None 39 | return docs 40 | 41 | 42 | @router.post("/documents/delete_by_filters", response_model=bool) 43 | def delete_documents(filters: FilterRequest): 44 | """ 45 | This endpoint allows you to delete documents contained in your document store. 46 | You can filter the documents to delete by metadata (like the document's name), 47 | or provide an empty JSON object to clear the document store. 
48 | 49 | Example of filters: 50 | `'{"filters": {{"name": ["some", "more"], "category": ["only_one"]}}'` 51 | 52 | To get all documents you should provide an empty dict, like: 53 | `'{"filters": {}}'` 54 | """ 55 | document_store.delete_documents(filters=filters.filters) 56 | return True 57 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/errors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/rest_api/controller/errors/__init__.py -------------------------------------------------------------------------------- /rest_api/rest_api/controller/errors/http_error.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | from starlette.requests import Request 3 | from starlette.responses import JSONResponse 4 | 5 | 6 | async def http_error_handler(_: Request, exc: HTTPException) -> JSONResponse: 7 | return JSONResponse({"errors": [exc.detail]}, status_code=exc.status_code) 8 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/file_upload.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | import json 4 | import shutil 5 | import uuid 6 | from pathlib import Path 7 | 8 | from fastapi import FastAPI, APIRouter, UploadFile, File, Form, HTTPException, Depends 9 | from pydantic import BaseModel 10 | from haystack import Pipeline 11 | from haystack.nodes import BaseConverter, PreProcessor 12 | 13 | from rest_api.utils import get_app, get_pipelines 14 | from rest_api.config import FILE_UPLOAD_PATH 15 | from rest_api.controller.utils import as_form 16 | 17 | 18 | router = APIRouter() 19 | app: FastAPI = get_app() 20 | indexing_pipeline: Pipeline 
= get_pipelines().get("indexing_pipeline", None) 21 | 22 | 23 | @as_form 24 | class FileConverterParams(BaseModel): 25 | remove_numeric_tables: Optional[bool] = None 26 | valid_languages: Optional[List[str]] = None 27 | 28 | 29 | @as_form 30 | class PreprocessorParams(BaseModel): 31 | clean_whitespace: Optional[bool] = None 32 | clean_empty_lines: Optional[bool] = None 33 | clean_header_footer: Optional[bool] = None 34 | split_by: Optional[str] = None 35 | split_length: Optional[int] = None 36 | split_overlap: Optional[int] = None 37 | split_respect_sentence_boundary: Optional[bool] = None 38 | 39 | 40 | class Response(BaseModel): 41 | file_id: str 42 | 43 | 44 | @router.post("/file-upload") 45 | def upload_file( 46 | files: List[UploadFile] = File(...), 47 | # JSON serialized string 48 | meta: Optional[str] = Form("null"), # type: ignore 49 | fileconverter_params: FileConverterParams = Depends(FileConverterParams.as_form), # type: ignore 50 | preprocessor_params: PreprocessorParams = Depends(PreprocessorParams.as_form), # type: ignore 51 | ): 52 | """ 53 | You can use this endpoint to upload a file for indexing 54 | (see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store). 
55 | """ 56 | if not indexing_pipeline: 57 | raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.") 58 | 59 | file_paths: list = [] 60 | file_metas: list = [] 61 | 62 | meta_form = json.loads(meta) or {} # type: ignore 63 | if not isinstance(meta_form, dict): 64 | raise HTTPException(status_code=500, detail=f"The meta field must be a dict or None, not {type(meta_form)}") 65 | 66 | for file in files: 67 | try: 68 | file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}" 69 | with file_path.open("wb") as buffer: 70 | shutil.copyfileobj(file.file, buffer) 71 | 72 | file_paths.append(file_path) 73 | meta_form["name"] = file.filename 74 | file_metas.append(meta_form) 75 | finally: 76 | file.file.close() 77 | 78 | # Find nodes names 79 | converters = indexing_pipeline.get_nodes_by_class(BaseConverter) 80 | preprocessors = indexing_pipeline.get_nodes_by_class(PreProcessor) 81 | 82 | params = {} 83 | for converter in converters: 84 | params[converter.name] = fileconverter_params.dict() 85 | for preprocessor in preprocessors: 86 | params[preprocessor.name] = preprocessor_params.dict() 87 | 88 | indexing_pipeline.run(file_paths=file_paths, meta=file_metas, params=params) 89 | -------------------------------------------------------------------------------- /rest_api/rest_api/controller/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Type, NewType 2 | 3 | import inspect 4 | from contextlib import contextmanager 5 | from threading import Semaphore 6 | 7 | from fastapi import Form, HTTPException 8 | from pydantic import BaseModel 9 | 10 | 11 | class RequestLimiter: 12 | def __init__(self, limit): 13 | self.semaphore = Semaphore(limit - 1) 14 | 15 | @contextmanager 16 | def run(self): 17 | acquired = self.semaphore.acquire(blocking=False) 18 | if not acquired: 19 | raise HTTPException(status_code=503, detail="The server is busy processing requests.") 20 | try: 21 | 
yield acquired 22 | finally: 23 | self.semaphore.release() 24 | 25 | 26 | StringId = NewType("StringId", str) 27 | 28 | 29 | def as_form(cls: Type[BaseModel]): 30 | """ 31 | Adds an as_form class method to decorated models. The as_form class method 32 | can be used with FastAPI endpoints 33 | """ 34 | new_params = [ 35 | inspect.Parameter( 36 | field.alias, 37 | inspect.Parameter.POSITIONAL_ONLY, 38 | default=(Form(field.default) if not field.required else Form(...)), 39 | ) 40 | for field in cls.__fields__.values() 41 | ] 42 | 43 | async def _as_form(**data): 44 | return cls(**data) 45 | 46 | sig = inspect.signature(_as_form) 47 | sig = sig.replace(parameters=new_params) 48 | _as_form.__signature__ = sig # type: ignore 49 | setattr(cls, "as_form", _as_form) 50 | return cls 51 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import os 4 | import logging 5 | from pathlib import Path 6 | 7 | from haystack.pipelines.base import Pipeline 8 | from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore 9 | from haystack.errors import PipelineConfigError 10 | 11 | from rest_api.controller.utils import RequestLimiter 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would 17 | # end up with different indices. The same applies for InMemoryDocumentStore. 
18 | UNSUPPORTED_DOC_STORES = (FAISSDocumentStore, InMemoryDocumentStore) 19 | 20 | 21 | def setup_pipelines() -> Dict[str, Any]: 22 | # Re-import the configuration variables 23 | from rest_api import config # pylint: disable=reimported 24 | 25 | pipelines = {} 26 | 27 | # Load query pipeline 28 | query_pipeline = Pipeline.load_from_yaml(Path(config.PIPELINE_YAML_PATH), pipeline_name=config.QUERY_PIPELINE_NAME) 29 | logging.info("Loaded pipeline nodes: %s", query_pipeline.graph.nodes.keys()) 30 | pipelines["query_pipeline"] = query_pipeline 31 | 32 | # Find document store 33 | document_store = query_pipeline.get_document_store() 34 | logging.info("Loaded docstore: %s", document_store) 35 | pipelines["document_store"] = document_store 36 | 37 | # Setup concurrency limiter 38 | concurrency_limiter = RequestLimiter(config.CONCURRENT_REQUEST_PER_WORKER) 39 | logging.info("Concurrent requests per worker: %s", config.CONCURRENT_REQUEST_PER_WORKER) 40 | pipelines["concurrency_limiter"] = concurrency_limiter 41 | 42 | # Load indexing pipeline (if available) 43 | try: 44 | indexing_pipeline = Pipeline.load_from_yaml( 45 | Path(config.PIPELINE_YAML_PATH), pipeline_name=config.INDEXING_PIPELINE_NAME 46 | ) 47 | docstore = indexing_pipeline.get_document_store() 48 | if isinstance(docstore, UNSUPPORTED_DOC_STORES): 49 | indexing_pipeline = None 50 | raise PipelineConfigError( 51 | "Indexing pipelines with FAISSDocumentStore or InMemoryDocumentStore are not supported by the REST APIs." 
52 | ) 53 | 54 | except PipelineConfigError as e: 55 | indexing_pipeline = None 56 | logger.error("%s\nFile Upload API will not be available.", e.message) 57 | 58 | finally: 59 | pipelines["indexing_pipeline"] = indexing_pipeline 60 | 61 | # Create directory for uploaded files 62 | os.makedirs(config.FILE_UPLOAD_PATH, exist_ok=True) 63 | 64 | return pipelines 65 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/custom_component.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipelines allow putting together Components to build a graph. 3 | 4 | In addition to the standard Haystack Components, custom user-defined Components 5 | can be used in a Pipeline YAML configuration. 6 | 7 | The classes for the Custom Components must be defined in this file. 8 | """ 9 | 10 | 11 | from haystack.nodes.base import BaseComponent 12 | 13 | 14 | class SampleComponent(BaseComponent): 15 | outgoing_edges: int = 1 16 | 17 | def run(self, **kwargs): 18 | raise NotImplementedError 19 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipeline_empty.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # Dummy pipeline, used when the CI needs to load the REST API to extract the OpenAPI specs. DO NOT USE. 
2 | version: ignore 3 | 4 | components: 5 | - name: FileTypeClassifier 6 | type: FileTypeClassifier 7 | 8 | pipelines: 9 | - name: query 10 | nodes: 11 | - name: FileTypeClassifier 12 | inputs: [File] 13 | 14 | - name: indexing 15 | nodes: 16 | - name: FileTypeClassifier 17 | inputs: [File] -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipeline_plaid_colbertv2.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: PLAIDDocumentStore 8 | params: 9 | index_path: /home/user/data/plaid_indexing/ 10 | checkpoint_path: /home/user/model/ 11 | collection_path: /home/user/data/psgs_w100.tsv 12 | - name: Retriever 13 | type: ColBERTRetriever 14 | params: 15 | document_store: DocumentStore # params can reference other components defined in the YAML 16 | top_k: 5 17 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 18 | type: Docs2Answers 19 | pipelines: 20 | - name: query # a sample extractive-qa Pipeline 21 | nodes: 22 | - name: Retriever 23 | inputs: [Query] 24 | - name: Doc2Answers 25 | inputs: [Retriever] 26 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.colbertRanker.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | index: document 11 | - name: Retriever 12 | type: BM25Retriever 13 | 
params: 14 | document_store: DocumentStore # params can reference other components defined in the YAML 15 | top_k: 5 16 | - name: Ranker 17 | type: ColBERTRanker 18 | params: 19 | model_path: /home/user/data 20 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 21 | type: Docs2Answers # Haystack Class name for the component 22 | - name: TextFileConverter 23 | type: TextConverter 24 | - name: PDFFileConverter 25 | type: PDFToTextConverter 26 | - name: Preprocessor 27 | type: PreProcessor 28 | params: 29 | split_by: word 30 | split_length: 1000 31 | - name: FileTypeClassifier 32 | type: FileTypeClassifier 33 | pipelines: 34 | - name: query # a sample extractive-qa Pipeline 35 | nodes: 36 | - name: Retriever 37 | inputs: [Query] 38 | - name: Ranker 39 | inputs: [Retriever] 40 | - name: Doc2Answers 41 | inputs: [Ranker] 42 | 43 | - name: indexing 44 | nodes: 45 | - name: FileTypeClassifier 46 | inputs: [File] 47 | - name: TextFileConverter 48 | inputs: [FileTypeClassifier.output_1] 49 | - name: PDFFileConverter 50 | inputs: [FileTypeClassifier.output_2] 51 | - name: Preprocessor 52 | inputs: [PDFFileConverter, TextFileConverter] 53 | - name: Ranker 54 | inputs: [Preprocessor] 55 | - name: DocumentStore 56 | inputs: [Ranker] 57 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.haystack-EmbeddingRetriever-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | index: document 11 | embedding_field: question_emb 12 | embedding_dim: 768 13 | excluded_meta_data: ["question_emb"] 14 | - name: Retriever 15 | type: EmbeddingRetriever 
16 | params: 17 | document_store: DocumentStore # params can reference other components defined in the YAML 18 | embedding_model: deepset/sentence_bert 19 | top_k: 5 20 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 21 | type: Docs2Answers # Haystack Class name for the component 22 | pipelines: 23 | - name: query # a sample extractive-qa Pipeline 24 | nodes: 25 | - name: Retriever 26 | inputs: [Query] 27 | - name: Doc2Answers 28 | inputs: [Retriever] 29 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: ElasticsearchDocumentStore 8 | params: 9 | host: localhost 10 | - name: Retriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore # params can reference other components defined in the YAML 14 | top_k: 5 15 | - name: Reader # custom-name for the component; helpful for visualization & debugging 16 | type: FARMReader # Haystack Class name for the component 17 | params: 18 | model_name_or_path: deepset/roberta-base-squad2 19 | context_window_size: 500 20 | return_no_answer: true 21 | - name: TextFileConverter 22 | type: TextConverter 23 | - name: PDFFileConverter 24 | type: PDFToTextConverter 25 | - name: Preprocessor 26 | type: PreProcessor 27 | params: 28 | split_by: word 29 | split_length: 1000 30 | - name: FileTypeClassifier 31 | type: FileTypeClassifier 32 | 33 | pipelines: 34 | - name: query # a sample extractive-qa Pipeline 35 | nodes: 36 | - name: Retriever 37 | inputs: [Query] 38 | - name: Reader 39 | inputs: [Retriever] 40 | - name: indexing 41 | nodes: 42 | - name: FileTypeClassifier 43 
| inputs: [File] 44 | - name: TextFileConverter 45 | inputs: [FileTypeClassifier.output_1] 46 | - name: PDFFileConverter 47 | inputs: [FileTypeClassifier.output_2] 48 | - name: Preprocessor 49 | inputs: [PDFFileConverter, TextFileConverter] 50 | - name: Retriever 51 | inputs: [Preprocessor] 52 | - name: DocumentStore 53 | inputs: [Retriever] 54 | -------------------------------------------------------------------------------- /rest_api/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | # To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml 2 | 3 | version: ignore 4 | 5 | components: # define all the building-blocks for Pipeline 6 | - name: DocumentStore 7 | type: FAISSDocumentStore # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents 8 | params: 9 | faiss_index_path: /home/user/data/faiss-index-so.faiss 10 | faiss_config_path: /home/user/data/faiss-index-so.json 11 | - name: Retriever 12 | type: DensePassageRetriever 13 | params: 14 | document_store: DocumentStore # params can reference other components defined in the YAML 15 | top_k: 5 16 | query_embedding_model: "facebook/dpr-question_encoder-single-nq-base" 17 | passage_embedding_model: "facebook/dpr-ctx_encoder-single-nq-base" 18 | max_seq_len_query: 64 19 | max_seq_len_passage: 256 20 | batch_size: 16 21 | embed_title: True 22 | use_fast_tokenizers: True 23 | - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging 24 | type: Docs2Answers 25 | pipelines: 26 | - name: query # a sample extractive-qa Pipeline 27 | nodes: 28 | - name: Retriever 29 | inputs: [Query] 30 | - name: Doc2Answers 31 | inputs: [Retriever] 32 | -------------------------------------------------------------------------------- /rest_api/rest_api/schema.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, List, Optional, Union 4 | import numpy as np 5 | import pandas as pd 6 | 7 | try: 8 | from typing import Literal 9 | except ImportError: 10 | from typing_extensions import Literal # type: ignore 11 | 12 | from pydantic import BaseModel, Field, Extra 13 | from pydantic import BaseConfig 14 | 15 | from haystack.schema import Answer, Document 16 | 17 | 18 | BaseConfig.arbitrary_types_allowed = True 19 | BaseConfig.json_encoders = {np.ndarray: lambda x: x.tolist(), pd.DataFrame: lambda x: x.to_dict(orient="records")} 20 | 21 | 22 | PrimitiveType = Union[str, int, float, bool] 23 | 24 | 25 | class RequestBaseModel(BaseModel): 26 | class Config: 27 | # Forbid any extra fields in the request to avoid silent failures 28 | extra = Extra.forbid 29 | 30 | 31 | class QueryRequest(RequestBaseModel): 32 | query: str 33 | pipeline: str = None 34 | mode: int = 0 35 | params: Optional[dict] = None 36 | debug: Optional[bool] = False 37 | 38 | 39 | class FilterRequest(RequestBaseModel): 40 | filters: Optional[Dict[str, Union[PrimitiveType, List[PrimitiveType], Dict[str, PrimitiveType]]]] = None 41 | 42 | 43 | class CreateLabelSerialized(RequestBaseModel): 44 | id: Optional[str] = None 45 | query: str 46 | document: Document 47 | is_correct_answer: bool 48 | is_correct_document: bool 49 | origin: Literal["user-feedback", "gold-label"] 50 | answer: Optional[Answer] = None 51 | no_answer: Optional[bool] = None 52 | pipeline_id: Optional[str] = None 53 | created_at: Optional[str] = None 54 | updated_at: Optional[str] = None 55 | meta: Optional[dict] = None 56 | filters: Optional[dict] = None 57 | 58 | 59 | class QueryResponse(BaseModel): 60 | query: str 61 | answers: List[Answer] = [] 62 | documents: List[Document] = [] 63 | debug: Optional[Dict] = Field(None, alias="_debug") 64 | 
-------------------------------------------------------------------------------- /rest_api/rest_api/utils.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, APIRouter 2 | from fastapi.routing import APIRoute 3 | from fastapi.openapi.utils import get_openapi 4 | from starlette.middleware.cors import CORSMiddleware 5 | from haystack import __version__ as haystack_version 6 | 7 | from rest_api.pipeline import setup_pipelines 8 | from rest_api.controller.errors.http_error import http_error_handler 9 | 10 | 11 | app = None 12 | pipelines = None 13 | 14 | 15 | def get_app() -> FastAPI: 16 | """ 17 | Initializes the App object and creates the global pipelines as possible. 18 | """ 19 | global app # pylint: disable=global-statement 20 | if app: 21 | return app 22 | 23 | from rest_api.config import ROOT_PATH 24 | 25 | app = FastAPI(title="Haystack REST API", debug=True, version=haystack_version, root_path=ROOT_PATH) 26 | 27 | # Creates the router for the API calls 28 | from rest_api.controller import file_upload, search, feedback, document, health 29 | 30 | router = APIRouter() 31 | router.include_router(search.router, tags=["search"]) 32 | router.include_router(feedback.router, tags=["feedback"]) 33 | router.include_router(file_upload.router, tags=["file-upload"]) 34 | router.include_router(document.router, tags=["document"]) 35 | router.include_router(health.router, tags=["health"]) 36 | 37 | # This middleware enables allow all cross-domain requests to the API from a browser. For production 38 | # deployments, it could be made more restrictive. 
39 | app.add_middleware( 40 | CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] 41 | ) 42 | app.add_exception_handler(HTTPException, http_error_handler) 43 | app.include_router(router) 44 | 45 | # Simplify operation IDs so that generated API clients have simpler function 46 | # names (see https://fastapi.tiangolo.com/advanced/path-operation-advanced-configuration/#using-the-path-operation-function-name-as-the-operationid). 47 | # The operation IDs will be the same as the route names (i.e. the python method names of the endpoints) 48 | # Should be called only after all routes have been added. 49 | for route in app.routes: 50 | if isinstance(route, APIRoute): 51 | route.operation_id = route.name 52 | 53 | return app 54 | 55 | 56 | def get_pipelines(): 57 | global pipelines # pylint: disable=global-statement 58 | if not pipelines: 59 | pipelines = setup_pipelines() 60 | return pipelines 61 | 62 | 63 | def get_openapi_specs() -> dict: 64 | """ 65 | Used to autogenerate OpenAPI specs file to use in the documentation. 
66 | 67 | See `docs/_src/api/openapi/generate_openapi_specs.py` 68 | """ 69 | app = get_app() 70 | return get_openapi( 71 | title=app.title, 72 | version=app.version, 73 | openapi_version=app.openapi_version, 74 | description=app.description, 75 | routes=app.routes, 76 | ) 77 | -------------------------------------------------------------------------------- /rest_api/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/__init__.py -------------------------------------------------------------------------------- /rest_api/test/samples/pdf/sample_pdf_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/samples/pdf/sample_pdf_1.pdf -------------------------------------------------------------------------------- /rest_api/test/samples/pdf/sample_pdf_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/rest_api/test/samples/pdf/sample_pdf_2.pdf -------------------------------------------------------------------------------- /rest_api/test/samples/test.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: 'ignore' 2 | 3 | components: 4 | - name: TestReader 5 | type: MockReader 6 | - name: TestRetriever 7 | type: MockRetriever 8 | params: 9 | document_store: TestDocumentStore 10 | - name: TestDocumentStore 11 | type: MockDocumentStore 12 | - name: TestPreprocessor 13 | type: PreProcessor 14 | params: 15 | clean_whitespace: true 16 | - name: TestPDFConverter 17 | type: MockPDFToTextConverter 18 | params: 19 | remove_numeric_tables: false 
20 | 21 | 22 | pipelines: 23 | - name: test-query 24 | nodes: 25 | - name: TestRetriever 26 | inputs: [Query] 27 | - name: TestReader 28 | inputs: [TestRetriever] 29 | 30 | - name: test-indexing 31 | nodes: 32 | - name: TestPDFConverter 33 | inputs: [File] 34 | - name: TestPreprocessor 35 | inputs: [TestPDFConverter] 36 | - name: TestDocumentStore 37 | inputs: [TestPreprocessor] -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/__init__.py -------------------------------------------------------------------------------- /test/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | Run the benchmarks with the following command: 4 | 5 | ``` 6 | python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update-json] 7 | ``` 8 | 9 | You can specify which components and processes to benchmark with the following flags. 10 | 11 | **--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set. 12 | 13 | **--retriever_index** will trigger indexing benchmarks 14 | 15 | **--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly) 16 | 17 | **--ci** will cause the the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retriever / Reader / DocStores. 18 | 19 | **--update-json** will cause the script to update the json files in docs/_src/benchmarks so that the website benchmarks will be updated. 
20 | -------------------------------------------------------------------------------- /test/benchmarks/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "params": { 3 | "full": { 4 | "retriever_doc_stores": [ 5 | [ 6 | "elastic", 7 | "elasticsearch" 8 | ], 9 | [ 10 | "elastic", 11 | "opensearch_flat" 12 | ], 13 | [ 14 | "dpr", 15 | "opensearch_flat" 16 | ], 17 | [ 18 | "dpr", 19 | "opensearch_hnsw" 20 | ], 21 | [ 22 | "dpr", 23 | "elasticsearch" 24 | ], 25 | [ 26 | "dpr", 27 | "milvus_flat" 28 | ], 29 | [ 30 | "dpr", 31 | "milvus_hnsw" 32 | ], 33 | [ 34 | "dpr", 35 | "faiss_flat" 36 | ], 37 | [ 38 | "dpr", 39 | "faiss_hnsw" 40 | ], 41 | [ 42 | "sentence_transformers", 43 | "elasticsearch" 44 | ] 45 | ], 46 | "n_docs_options": [ 47 | 1000, 48 | 10000, 49 | 100000, 50 | 500000 51 | ], 52 | "n_queries": null 53 | }, 54 | "ci": { 55 | "retriever_doc_stores": [ 56 | [ 57 | "elastic", 58 | "elasticsearch" 59 | ] 60 | ], 61 | "n_docs_options": [ 62 | 1000 63 | ], 64 | "n_queries": 100 65 | } 66 | }, 67 | "filenames": { 68 | "data_s3_url": "https://ext-haystack-retriever-eval.s3-eu-west-1.amazonaws.com/", 69 | "data_dir": "../../data/retriever/", 70 | "filename_gold": "nq2squad-dev.json", 71 | "filenames_negative": { 72 | "10000": "psgs_w100_minus_gold_10k.tsv", 73 | "100000": "psgs_w100_minus_gold_100k.tsv", 74 | "1000000": "psgs_w100_minus_gold_1m.tsv" 75 | }, 76 | "embeddings_dir": "embeddings/", 77 | "embeddings_filenames": { 78 | "10000": "wikipedia_passages_10k.pkl", 79 | "100000": "wikipedia_passages_100k.pkl", 80 | "1000000": "wikipedia_passages_1m.pkl"} 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /test/benchmarks/data_scripts/embeddings_slice.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from pathlib import Path 3 | from tqdm import tqdm 4 | import json 5 | 6 | n_passages = 1_000_000 7 | 
embeddings_dir = Path("embeddings") 8 | embeddings_filenames = [f"wikipedia_passages_{i}.pkl" for i in range(50)] 9 | neg_passages_filename = "psgs_w100_minus_gold.tsv" 10 | gold_passages_filename = "nq2squad-dev.json" 11 | 12 | # Extract gold passage ids 13 | passage_ids = [] 14 | gold_data = json.load(open(gold_passages_filename))["data"] 15 | for d in gold_data: 16 | for p in d["paragraphs"]: 17 | passage_ids.append(str(p["passage_id"])) 18 | print("gold_ids") 19 | print(len(passage_ids)) 20 | print() 21 | 22 | # Extract neg passage ids 23 | with open(neg_passages_filename) as f: 24 | f.readline() # Ignore column headers 25 | for _ in range(n_passages - len(passage_ids)): 26 | l = f.readline() 27 | passage_ids.append(str(l.split()[0])) 28 | assert len(passage_ids) == len(set(passage_ids)) 29 | assert set([type(x) for x in passage_ids]) == {str} 30 | passage_ids = set(passage_ids) 31 | print("all_ids") 32 | print(len(passage_ids)) 33 | print() 34 | 35 | 36 | # Gather vectors for passages 37 | ret = [] 38 | for ef in tqdm(embeddings_filenames): 39 | curr = pickle.load(open(embeddings_dir / ef, "rb")) 40 | for i, vec in curr: 41 | if i in passage_ids: 42 | ret.append((i, vec)) 43 | print("n_vectors") 44 | print(len(ret)) 45 | print() 46 | 47 | # Write vectors to file 48 | with open(f"wikipedia_passages_{n_passages}.pkl", "wb") as f: 49 | pickle.dump(ret, f) 50 | -------------------------------------------------------------------------------- /test/benchmarks/data_scripts/shuffle_passages.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import time 4 | import random 5 | 6 | random.seed(42) 7 | 8 | lines = [] 9 | with open("psgs_w100_minus_gold_unshuffled.tsv") as f: 10 | f.readline() # Remove column header 11 | lines = [l for l in tqdm(f)] 12 | 13 | tic = time.perf_counter() 14 | random.shuffle(lines) 15 | toc = time.perf_counter() 16 | t = toc - tic 17 | print(t) 18 | with 
open("psgs_w100_minus_gold.tsv", "w") as f: 19 | f.write("id\ttext\title\n") 20 | for l in tqdm(lines): 21 | f.write(l) 22 | -------------------------------------------------------------------------------- /test/benchmarks/distillation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "student_model": { 3 | "model_name_or_path": "roberta-base", 4 | "batch_size": 80 5 | }, 6 | "teacher_model": { 7 | "model_name_or_path": "deepset/roberta-large-squad2", 8 | "batch_size": 512 9 | }, 10 | "distillation_settings": { 11 | "distillation_loss": "kl_div", 12 | "distillation_loss_weight": [0.75, 1], 13 | "temperature": [5, 10] 14 | }, 15 | "training_settings": { 16 | "n_epochs": 2, 17 | "max_seq_len": 384, 18 | "learning_rate": 3e-5 19 | }, 20 | "dataset": "squad2", 21 | "download_folder": "dataset/squad2", 22 | "evaluate_teacher": true, 23 | "evaluate_student_without_distillation": true, 24 | "evaluate_student_with_distillation": true 25 | } -------------------------------------------------------------------------------- /test/benchmarks/reader_results.csv: -------------------------------------------------------------------------------- 1 | ,EM,f1,top_n_accuracy,top_n,reader_time,seconds_per_query,passages_per_second,reader,error 2 | 0,0.7839204449688185,0.8258860575299658,0.9742120343839542,5,98.16358173700064,0.008272676701247315,125.81040525892847,deepset/roberta-base-squad2, 3 | 1,0.7438058317883027,0.7887858491007042,0.9719366256531266,5,47.38258053499885,0.003993138423647299,260.6443097981493,deepset/minilm-uncased-squad2, 4 | 2,0.6947581324793528,0.7431182400443286,0.9557559413450194,5,101.99811779300217,0.008595829916821352,121.08066567525722,deepset/bert-base-cased-squad2, 5 | 3,0.7897353783920446,0.8326306774734308,0.976908815101972,5,292.51886408200517,0.024651851009776266,42.21949937744112,deepset/bert-large-uncased-whole-word-masking-squad2, 6 | 
4,0.8021237148154391,0.8450422699207468,0.974043485589078,5,293.53038741600176,0.024737096529243364,42.07400844838984,deepset/xlm-roberta-large-squad2, 7 | 5,0.3729984830608461,0.4231925844723574,0.9539019046013821,5,55.403011280999635,0.004669055391960192,222.91207128366705,distilbert-base-uncased-distilled-squad, 8 | -------------------------------------------------------------------------------- /test/benchmarks/retriever_index_results.csv: -------------------------------------------------------------------------------- 1 | ,retriever,doc_store,n_docs,indexing_time,docs_per_second,date_time,error 2 | 9,dpr,elasticsearch,10000,139.7465313429998,71.55812673057035,2021-04-12 13:06:34.024778, 3 | 14,elastic,elasticsearch,100000,205.94765839000047,485.56026702003703,2021-04-12 13:44:31.464961, 4 | 8,elastic,elasticsearch,10000,19.96974077699997,500.7576268349683,2021-04-12 13:03:44.944941, 5 | 3,dpr,elasticsearch,1000,14.592372578999857,68.52895199777984,2021-04-12 12:58:01.128834, 6 | 2,elastic,elasticsearch,1000,2.1051091760000418,475.034744706267,2021-04-12 12:57:18.604681, 7 | 15,dpr,elasticsearch,100000,1401.1558383250003,71.36964873196699,2021-04-12 14:08:31.400192, 8 | 20,elastic,elasticsearch,500000,1027.416534557,486.6575368242339,2021-04-12 17:30:22.080196, 9 | 21,dpr,elasticsearch,500000,7010.269106937998,71.32393812174124,2021-04-12 19:28:39.657070, 10 | 4,dpr,faiss_flat,1000,9.570316116999948,104.48975642755202,2021-04-12 12:58:47.918981, 11 | 22,dpr,faiss_flat,500000,5041.962777018001,99.16772933728758,2021-04-12 20:55:28.443354, 12 | 10,dpr,faiss_flat,10000,95.71089355200002,104.48131481049198,2021-04-12 13:08:50.343175, 13 | 16,dpr,faiss_flat,100000,999.8815230299997,100.0118491008456,2021-04-12 14:26:14.495997, 14 | 11,dpr,faiss_hnsw,10000,108.9302881550002,91.80183188142033,2021-04-12 13:11:13.117266, 15 | 17,dpr,faiss_hnsw,100000,1112.2988848330006,89.90389306648807,2021-04-12 14:45:22.644624, 16 | 
23,dpr,faiss_hnsw,500000,5802.5877488399965,86.16845132586847,2021-04-12 22:32:53.095579, 17 | 5,dpr,faiss_hnsw,1000,9.837438108000242,101.65248197970928,2021-04-12 12:59:30.777696, 18 | 0,dpr,milvus_flat,1000,9.717840198999966,102.90352377917338,2021-04-12 12:56:32.363797, 19 | 6,dpr,milvus_flat,10000,87.06480573199997,114.85697252666792,2021-04-12 13:01:21.834327, 20 | 12,dpr,milvus_flat,100000,861.995940363,116.00982709720004,2021-04-12 13:26:00.742197, 21 | 18,dpr,milvus_flat,500000,4364.3841063849995,114.56370195934652,2021-04-12 15:58:40.069278, 22 | 1,dpr,milvus_hnsw,1000,8.522245804999784,117.33996212750934,2021-04-12 12:57:04.976604, 23 | 7,dpr,milvus_hnsw,10000,87.128293364,114.77327988306308,2021-04-12 13:03:13.381764, 24 | 19,dpr,milvus_hnsw,500000,4414.051032668,113.27463056035022,2021-04-12 17:12:50.943619, 25 | 13,dpr,milvus_hnsw,100000,864.9713281529998,115.61076852516385,2021-04-12 13:40:51.875517, 26 | 0,sentence_transformers,elasticsearch,1000,10.380210993000219,96.33715544648746,2021-06-02 08:49:29.922794, 27 | 1,sentence_transformers,elasticsearch,10000,82.89545158599958,120.63388049253265,2021-06-02 08:51:09.796056, 28 | 2,sentence_transformers,elasticsearch,100000,836.6144149759998,119.52937722555106,2021-06-02 09:05:26.454063, 29 | 3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575, 30 | 1,dpr,opensearch_flat,100000,1427.47408267,70.05381128388427,2021-07-22 12:33:02.890691, 31 | 0,elastic,opensearch_flat,100000,207.3902409509992,482.18276588833,2021-07-22 12:08:18.041527, 32 | 2,dpr,opensearch_hnsw,100000,1422.2719023249992,70.31004397719536,2021-07-22 12:57:54.770107, 33 | -------------------------------------------------------------------------------- /test/benchmarks/retriever_query_results.md: -------------------------------------------------------------------------------- 1 | | | retriever | doc_store | n_docs | n_queries | retrieve_time | queries_per_second | seconds_per_query | 
recall | map | top_k | date_time | error | 2 | |---:|:------------|:--------------|---------:|------------:|----------------:|---------------------:|--------------------:|---------:|---------:|--------:|:---------------------------|:--------| 3 | | 1 | dpr | elasticsearch | 1000 | 1064 | 34.6755 | 30.6845 | 0.0325897 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:27:43.048502 | | 4 | | 5 | dpr | elasticsearch | 10000 | 5637 | 288.061 | 19.5688 | 0.0511019 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:37:21.149887 | | 5 | | 9 | dpr | elasticsearch | 100000 | 5637 | 1225.63 | 4.59928 | 0.217425 | 0.957956 | 0.865456 | 10 | 2021-02-01 12:15:52.757320 | | 6 | | 13 | dpr | elasticsearch | 500000 | 5637 | 5339.01 | 1.05581 | 0.947136 | 0.930814 | 0.808614 | 10 | 2021-02-01 14:52:23.056230 | | 7 | | 0 | elastic | elasticsearch | 1000 | 1064 | 4.04654 | 262.941 | 0.00380314 | 0.890977 | 0.742044 | 10 | 2021-02-01 11:26:04.346134 | | 8 | | 4 | elastic | elasticsearch | 10000 | 5637 | 30.7014 | 183.607 | 0.00544641 | 0.81107 | 0.662063 | 10 | 2021-02-01 11:31:20.470092 | | 9 | | 8 | elastic | elasticsearch | 100000 | 5637 | 34.7055 | 162.424 | 0.00615673 | 0.719354 | 0.562596 | 10 | 2021-02-01 11:50:36.048887 | | 10 | | 12 | elastic | elasticsearch | 500000 | 5637 | 68.3838 | 82.4318 | 0.0121312 | 0.627461 | 0.455945 | 10 | 2021-02-01 13:02:16.905187 | | 11 | | 2 | dpr | faiss_flat | 1000 | 1064 | 30.0533 | 35.4038 | 0.0282456 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:28:55.544474 | | 12 | | 6 | dpr | faiss_flat | 10000 | 5637 | 218.594 | 25.7875 | 0.0387785 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:42:07.545869 | | 13 | | 10 | dpr | faiss_flat | 100000 | 5637 | 865.744 | 6.51116 | 0.153582 | 0.957956 | 0.865461 | 10 | 2021-02-01 12:34:29.493598 | | 14 | | 14 | dpr | faiss_flat | 500000 | 5637 | 3717.95 | 1.51616 | 0.659561 | 0.930814 | 0.808614 | 10 | 2021-02-01 16:12:52.804436 | | 15 | | 3 | dpr | faiss_hnsw | 1000 | 1064 | 27.1677 | 39.1641 | 0.0255336 | 0.991541 | 0.929511 | 10 
| 2021-02-01 11:30:02.684535 | | 16 | | 7 | dpr | faiss_hnsw | 10000 | 5637 | 167.552 | 33.6432 | 0.0297237 | 0.972503 | 0.896994 | 10 | 2021-02-01 11:46:07.130588 | | 17 | | 11 | dpr | faiss_hnsw | 100000 | 5637 | 167.482 | 33.6573 | 0.0297112 | 0.940216 | 0.850798 | 10 | 2021-02-01 12:43:21.697968 | | 18 | | 15 | dpr | faiss_hnsw | 500000 | 5637 | 164.456 | 34.2767 | 0.0291743 | 0.882562 | 0.769148 | 10 | 2021-02-01 16:47:01.710072 | | -------------------------------------------------------------------------------- /test/benchmarks/retriever_simplified.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script performs the same query benchmarking as `retriever.py` but with less of the loops that iterate 3 | over all the parameters so that it is easier to inspect what is happening 4 | """ 5 | 6 | 7 | from haystack.document_stores import MilvusDocumentStore, FAISSDocumentStore 8 | from haystack.nodes import DensePassageRetriever 9 | from retriever import prepare_data 10 | import datetime 11 | from pprint import pprint 12 | from milvus import IndexType 13 | from utils import get_document_store 14 | 15 | 16 | def benchmark_querying(index_type, n_docs=100_000, similarity="dot_product"): 17 | 18 | doc_index = "document" 19 | label_index = "label" 20 | 21 | docs, labels = prepare_data( 22 | data_dir="data/", 23 | filename_gold="nq2squad-dev.json", 24 | filename_negative="psgs_w100_minus_gold_100k.tsv", 25 | remote_url="https://ext-haystack-retriever-eval.s3-eu-west-1.amazonaws.com/", 26 | embeddings_filenames=["wikipedia_passages_100k.pkl"], 27 | embeddings_dir="embeddings/", 28 | n_docs=n_docs, 29 | add_precomputed=True, 30 | ) 31 | 32 | doc_store = get_document_store(document_store_type=index_type, similarity=similarity) 33 | 34 | # if index_type == "milvus_flat": 35 | # doc_store = MilvusDocumentStore(index=doc_index, similarity=similarity) 36 | # elif index_type == "milvus_hnsw": 37 | # index_param = {"M": 64, 
"efConstruction": 80} 38 | # search_param = {"ef": 20} 39 | # doc_store = MilvusDocumentStore( 40 | # index=doc_index, 41 | # index_type=IndexType.HNSW, 42 | # index_param=index_param, 43 | # search_param=search_param, 44 | # similarity=similarity 45 | # ) 46 | 47 | doc_store.write_documents(documents=docs, index=doc_index) 48 | doc_store.write_labels(labels=labels, index=label_index) 49 | 50 | retriever = DensePassageRetriever( 51 | document_store=doc_store, 52 | query_embedding_model="facebook/dpr-question_encoder-single-nq-base", 53 | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", 54 | use_gpu=True, 55 | use_fast_tokenizers=True, 56 | ) 57 | 58 | raw_results = retriever.eval(label_index=label_index, doc_index=doc_index) 59 | results = { 60 | "n_queries": raw_results["n_questions"], 61 | "retrieve_time": raw_results["retrieve_time"], 62 | "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"], 63 | "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"], 64 | "recall": raw_results["recall"] * 100, 65 | "map": raw_results["map"] * 100, 66 | "top_k": raw_results["top_k"], 67 | "date_time": datetime.datetime.now(), 68 | "error": None, 69 | } 70 | 71 | pprint(results) 72 | 73 | doc_store.delete_all_documents(index=doc_index) 74 | doc_store.delete_all_documents(index=label_index) 75 | 76 | 77 | if __name__ == "__main__": 78 | similarity = "l2" 79 | n_docs = 1000 80 | 81 | benchmark_querying(index_type="milvus_flat", similarity=similarity, n_docs=n_docs) 82 | benchmark_querying(index_type="milvus_hnsw", similarity=similarity, n_docs=n_docs) 83 | benchmark_querying(index_type="faiss_flat", similarity=similarity, n_docs=n_docs) 84 | benchmark_querying(index_type="faiss_hnsw", similarity=similarity, n_docs=n_docs) 85 | -------------------------------------------------------------------------------- /test/benchmarks/run.py: -------------------------------------------------------------------------------- 1 
| # The benchmarks use 2 | # - a variant of the Natural Questions Dataset (https://ai.google.com/research/NaturalQuestions) from Google Research 3 | # licensed under CC BY-SA 3.0 (https://creativecommons.org/licenses/by-sa/3.0/) 4 | # - the SQuAD 2.0 Dataset (https://rajpurkar.github.io/SQuAD-explorer/) from Rajpurkar et al. 5 | # licensed under CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0/legalcode) 6 | 7 | from retriever import benchmark_indexing, benchmark_querying 8 | from reader import benchmark_reader 9 | from utils import load_config 10 | import argparse 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument("--reader", default=False, action="store_true", help="Perform Reader benchmarks") 16 | parser.add_argument( 17 | "--retriever_index", default=False, action="store_true", help="Perform Retriever indexing benchmarks" 18 | ) 19 | parser.add_argument( 20 | "--retriever_query", default=False, action="store_true", help="Perform Retriever querying benchmarks" 21 | ) 22 | parser.add_argument( 23 | "--ci", default=False, action="store_true", help="Perform a smaller subset of benchmarks that are quicker to run" 24 | ) 25 | parser.add_argument( 26 | "--update_json", 27 | default=False, 28 | action="store_true", 29 | help="Update the json file with the results of this run so that the website can be updated", 30 | ) 31 | parser.add_argument( 32 | "--save_markdown", 33 | default=False, 34 | action="store_true", 35 | help="Update the json file with the results of this run so that the website can be updated", 36 | ) 37 | args = parser.parse_args() 38 | 39 | # load config 40 | params, filenames = load_config(config_filename="config.json", ci=args.ci) 41 | 42 | if args.retriever_index: 43 | benchmark_indexing( 44 | **params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown 45 | ) 46 | if args.retriever_query: 47 | benchmark_querying( 48 | **params, **filenames, ci=args.ci, 
update_json=args.update_json, save_markdown=args.save_markdown 49 | ) 50 | if args.reader: 51 | benchmark_reader(**params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown) 52 | -------------------------------------------------------------------------------- /test/document_stores/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/document_stores/__init__.py -------------------------------------------------------------------------------- /test/document_stores/test_knowledge_graph.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from haystack.nodes import Text2SparqlRetriever 6 | from haystack.document_stores import GraphDBKnowledgeGraph 7 | from haystack.utils import fetch_archive_from_http 8 | 9 | 10 | @pytest.mark.graphdb 11 | def test_graph_retrieval(): 12 | # TODO rename doc_dir 13 | graph_dir = "../data/tutorial10_knowledge_graph/" 14 | s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip" 15 | fetch_archive_from_http(url=s3_url, output_dir=graph_dir) 16 | 17 | # Fetch a pre-trained BART model that translates natural language questions to SPARQL queries 18 | model_dir = "../saved_models/tutorial10_knowledge_graph/" 19 | s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" 20 | fetch_archive_from_http(url=s3_url, output_dir=model_dir) 21 | 22 | kg = GraphDBKnowledgeGraph(index="tutorial_10_index") 23 | kg.delete_index() 24 | kg.create_index(config_path=Path(graph_dir + "repo-config.ttl")) 25 | kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) 26 | triple = { 27 | "p": {"type": "uri", "value": "https://deepset.ai/harry_potter/_paternalgrandfather"}, 28 | "s": {"type": "uri", "value": 
"https://deepset.ai/harry_potter/Melody_fawley"}, 29 | "o": {"type": "uri", "value": "https://deepset.ai/harry_potter/Marshall_fawley"}, 30 | } 31 | triples = kg.get_all_triples() 32 | assert len(triples) > 0 33 | assert triple in triples 34 | 35 | # Define prefixes for names of resources so that we can use shorter resource names in queries 36 | prefixes = """PREFIX rdf: 37 | PREFIX xsd: 38 | PREFIX hp: 39 | """ 40 | kg.prefixes = prefixes 41 | 42 | kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4") 43 | 44 | result = kgqa_retriever.retrieve(query="In which house is Harry Potter?") 45 | assert result[0] == { 46 | "answer": ["https://deepset.ai/harry_potter/Gryffindor"], 47 | "prediction_meta": { 48 | "model": "Text2SparqlRetriever", 49 | "sparql_query": "select ?a { hp:Harry_potter hp:house ?a . }", 50 | }, 51 | } 52 | 53 | result = kgqa_retriever._query_kg( 54 | sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }" 55 | ) 56 | assert result[0][0] == "https://deepset.ai/harry_potter/Rubeus_hagrid" 57 | 58 | result = kgqa_retriever._query_kg( 59 | sparql_query="select distinct ?obj where { ?obj . 
}" 60 | ) 61 | assert result[0][0] == "https://deepset.ai/harry_potter/Otter" 62 | -------------------------------------------------------------------------------- /test/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/modeling/__init__.py -------------------------------------------------------------------------------- /test/modeling/test_modeling_inference.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("multiprocessing_chunksize", [None, 2]) 5 | @pytest.mark.parametrize("num_processes", [2, 0, None], scope="module") 6 | def test_qa_format_and_results(adaptive_model_qa, multiprocessing_chunksize): 7 | qa_inputs_dicts = [ 8 | { 9 | "questions": ["In what country is Normandy"], 10 | "text": "The Normans are an ethnic group that arose in Normandy, a northern region " 11 | "of France, from contact between Viking settlers and indigenous Franks and Gallo-Romans", 12 | }, 13 | { 14 | "questions": ["Who counted the game among the best ever made?"], 15 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received " 16 | "perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic " 17 | "Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings " 18 | "and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores " 19 | "of 95% and 96 for the GameCube version. 
GameTrailers in their review called it one of the " 20 | "greatest games ever created.", 21 | }, 22 | ] 23 | ground_truths = ["France", "GameTrailers"] 24 | 25 | results = adaptive_model_qa.inference_from_dicts( 26 | dicts=qa_inputs_dicts, multiprocessing_chunksize=multiprocessing_chunksize 27 | ) 28 | # sample results 29 | # [ 30 | # { 31 | # "task": "qa", 32 | # "predictions": [ 33 | # { 34 | # "question": "In what country is Normandy", 35 | # "question_id": "None", 36 | # "ground_truth": None, 37 | # "answers": [ 38 | # { 39 | # "score": 1.1272038221359253, 40 | # "probability": -1, 41 | # "answer": "France", 42 | # "offset_answer_start": 54, 43 | # "offset_answer_end": 60, 44 | # "context": "The Normans gave their name to Normandy, a region in France.", 45 | # "offset_context_start": 0, 46 | # "offset_context_end": 60, 47 | # "document_id": None, 48 | # } 49 | # ] 50 | # } 51 | # ], 52 | # } 53 | # ] 54 | predictions = list(results)[0]["predictions"] 55 | 56 | for prediction, ground_truth, qa_input_dict in zip(predictions, ground_truths, qa_inputs_dicts): 57 | assert prediction["question"] == qa_input_dict["questions"][0] 58 | answer = prediction["answers"][0] 59 | assert answer["answer"] in answer["context"] 60 | assert answer["answer"] == ground_truth 61 | assert { 62 | "answer", 63 | "score", 64 | "probability", 65 | "offset_answer_start", 66 | "offset_answer_end", 67 | "context", 68 | "offset_context_start", 69 | "offset_context_end", 70 | "document_id", 71 | } == answer.keys() 72 | 73 | 74 | if __name__ == "__main__": 75 | test_qa_format_and_results() 76 | -------------------------------------------------------------------------------- /test/modeling/test_modeling_prediction_head.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from haystack.modeling.model.adaptive_model import AdaptiveModel 4 | from haystack.modeling.model.language_model import LanguageModel 5 | from 
haystack.modeling.model.prediction_head import QuestionAnsweringHead 6 | from haystack.modeling.utils import set_all_seeds, initialize_device_settings 7 | 8 | 9 | def test_prediction_head_load_save(tmp_path, caplog=None): 10 | if caplog: 11 | caplog.set_level(logging.CRITICAL) 12 | 13 | set_all_seeds(seed=42) 14 | devices, n_gpu = initialize_device_settings(use_cuda=False) 15 | lang_model = "bert-base-german-cased" 16 | 17 | language_model = LanguageModel.load(lang_model) 18 | prediction_head = QuestionAnsweringHead() 19 | 20 | model = AdaptiveModel( 21 | language_model=language_model, 22 | prediction_heads=[prediction_head], 23 | embeds_dropout_prob=0.1, 24 | lm_output_types=["per_sequence"], 25 | device=devices[0], 26 | ) 27 | 28 | model.save(tmp_path) 29 | model_loaded = AdaptiveModel.load(tmp_path, device="cpu") 30 | assert model_loaded is not None 31 | -------------------------------------------------------------------------------- /test/modeling/test_modeling_processor_saving_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from haystack.modeling.data_handler.processor import SquadProcessor 5 | from haystack.modeling.model.tokenization import Tokenizer 6 | from haystack.modeling.utils import set_all_seeds 7 | import torch 8 | 9 | from ..conftest import SAMPLES_PATH 10 | 11 | 12 | def test_processor_saving_loading(tmp_path, caplog): 13 | if caplog is not None: 14 | caplog.set_level(logging.CRITICAL) 15 | 16 | set_all_seeds(seed=42) 17 | lang_model = "roberta-base" 18 | 19 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) 20 | 21 | processor = SquadProcessor( 22 | tokenizer=tokenizer, 23 | max_seq_len=256, 24 | label_list=["start_token", "end_token"], 25 | train_filename="train-sample.json", 26 | dev_filename="dev-sample.json", 27 | test_filename=None, 28 | data_dir=SAMPLES_PATH / "qa", 29 | ) 30 | 31 | dicts = 
processor.file_to_dicts(file=SAMPLES_PATH / "qa" / "dev-sample.json") 32 | data, tensor_names, _ = processor.dataset_from_dicts(dicts=dicts, indices=[1]) 33 | 34 | save_dir = tmp_path / Path("testsave/processor") 35 | processor.save(save_dir) 36 | 37 | processor = processor.load_from_dir(save_dir) 38 | dicts = processor.file_to_dicts(file=SAMPLES_PATH / "qa" / "dev-sample.json") 39 | data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts, indices=[1]) 40 | 41 | assert tensor_names == tensor_names_loaded 42 | for i in range(len(data.tensors)): 43 | assert torch.all(torch.eq(data.tensors[i], data_loaded.tensors[i])) 44 | 45 | 46 | if __name__ == "__main__": 47 | test_processor_saving_loading(None) 48 | -------------------------------------------------------------------------------- /test/nodes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/nodes/__init__.py -------------------------------------------------------------------------------- /test/nodes/test_label_generator.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator 6 | from test.conftest import DOCS_WITH_EMBEDDINGS 7 | 8 | 9 | @pytest.mark.slow 10 | @pytest.mark.generator 11 | @pytest.mark.parametrize("document_store", ["memory"], indirect=True) 12 | @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) 13 | def test_pseudo_label_generator( 14 | document_store, retriever: EmbeddingRetriever, question_generator: QuestionGenerator, tmp_path: Path 15 | ): 16 | document_store.write_documents(DOCS_WITH_EMBEDDINGS) 17 | psg = PseudoLabelGenerator(question_generator, retriever) 18 | train_examples = [] 19 | for idx, doc in 
enumerate(document_store): 20 | output, stream = psg.run(documents=[doc]) 21 | assert "gpl_labels" in output 22 | for item in output["gpl_labels"]: 23 | assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item 24 | train_examples.append(item) 25 | 26 | assert len(train_examples) > 0 27 | retriever.train(train_examples) 28 | retriever.save(tmp_path) 29 | 30 | 31 | @pytest.mark.slow 32 | @pytest.mark.generator 33 | @pytest.mark.parametrize("document_store", ["memory"], indirect=True) 34 | @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) 35 | def test_pseudo_label_generator_using_question_document_pairs( 36 | document_store, retriever: EmbeddingRetriever, tmp_path: Path 37 | ): 38 | document_store.write_documents(DOCS_WITH_EMBEDDINGS) 39 | docs = [ 40 | { 41 | "question": "What is the capital of Germany?", 42 | "document": "Berlin is the capital and largest city of Germany by both area and population.", 43 | }, 44 | { 45 | "question": "What is the largest city in Germany by population and area?", 46 | "document": "Berlin is the capital and largest city of Germany by both area and population.", 47 | }, 48 | ] 49 | psg = PseudoLabelGenerator(docs, retriever) 50 | train_examples = [] 51 | for idx, doc in enumerate(document_store): 52 | # the documents passed here are ignored as we provided source documents in the constructor 53 | output, stream = psg.run(documents=[doc]) 54 | assert "gpl_labels" in output 55 | for item in output["gpl_labels"]: 56 | assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item 57 | train_examples.append(item) 58 | 59 | assert len(train_examples) > 0 60 | 61 | retriever.train(train_examples) 62 | retriever.save(tmp_path) 63 | -------------------------------------------------------------------------------- /test/nodes/test_question_generator.py: -------------------------------------------------------------------------------- 1 | from haystack.pipelines 
import ( 2 | QuestionAnswerGenerationPipeline, 3 | QuestionGenerationPipeline, 4 | RetrieverQuestionGenerationPipeline, 5 | ) 6 | from haystack.schema import Document 7 | import pytest 8 | 9 | 10 | text = 'The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United Kingdom. The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly and flat out rock) with great success, The Living End has managed to produce anthemic choruses and memorable songs in abundance".' 
11 | document = Document(content=text) 12 | query = "Living End" 13 | 14 | 15 | def test_qg_pipeline(question_generator): 16 | p = QuestionGenerationPipeline(question_generator) 17 | result = p.run(documents=[document]) 18 | keys = list(result) 19 | assert "generated_questions" in keys 20 | assert len(result["generated_questions"][0]["questions"]) > 0 21 | 22 | 23 | @pytest.mark.parametrize("retriever,document_store", [("tfidf", "memory")], indirect=True) 24 | def test_rqg_pipeline(question_generator, retriever): 25 | retriever.document_store.write_documents([document]) 26 | retriever.fit() 27 | p = RetrieverQuestionGenerationPipeline(retriever, question_generator) 28 | result = p.run(query) 29 | keys = list(result) 30 | assert "generated_questions" in keys 31 | assert len(result["generated_questions"][0]["questions"]) > 0 32 | 33 | 34 | @pytest.mark.parametrize("reader", ["farm"], indirect=True) 35 | def test_qag_pipeline(question_generator, reader): 36 | p = QuestionAnswerGenerationPipeline(question_generator, reader) 37 | results = p.run(documents=[document]) 38 | assert "queries" in results 39 | assert "answers" in results 40 | assert len(results["queries"]) == len(results["answers"]) 41 | assert len(results["answers"]) > 0 42 | assert results["answers"][0][0].answer is not None 43 | -------------------------------------------------------------------------------- /test/nodes/test_summarizer_translation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from haystack.pipelines import TranslationWrapperPipeline, SearchSummarizationPipeline 4 | from haystack.nodes import DensePassageRetriever, EmbeddingRetriever 5 | from .test_summarizer import SPLIT_DOCS 6 | 7 | # Keeping few (retriever,document_store) combination to reduce test time 8 | @pytest.mark.slow 9 | @pytest.mark.elasticsearch 10 | @pytest.mark.summarizer 11 | @pytest.mark.parametrize( 12 | "retriever,document_store", [("embedding", "memory"), 
("elasticsearch", "elasticsearch")], indirect=True 13 | ) 14 | def test_summarization_pipeline_with_translator( 15 | document_store, retriever, summarizer, en_to_de_translator, de_to_en_translator 16 | ): 17 | document_store.write_documents(SPLIT_DOCS) 18 | 19 | if isinstance(retriever, EmbeddingRetriever) or isinstance(retriever, DensePassageRetriever): 20 | document_store.update_embeddings(retriever=retriever) 21 | 22 | query = "Wo steht der Eiffelturm?" 23 | base_pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer) 24 | pipeline = TranslationWrapperPipeline( 25 | input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline 26 | ) 27 | output = pipeline.run( 28 | query=query, params={"Retriever": {"top_k": 2}, "Summarizer": {"generate_single_summary": True}} 29 | ) 30 | # SearchSummarizationPipeline return answers but Summarizer return documents 31 | documents = output["documents"] 32 | assert len(documents) == 1 33 | assert documents[0].content in [ 34 | "Der Eiffelturm ist ein Wahrzeichen in Paris, Frankreich.", 35 | "Der Eiffelturm, der 1889 in Paris, Frankreich, erbaut wurde, ist das höchste freistehende Bauwerk der Welt.", 36 | ] 37 | -------------------------------------------------------------------------------- /test/nodes/test_translator.py: -------------------------------------------------------------------------------- 1 | from haystack.schema import Document 2 | 3 | import pytest 4 | 5 | EXPECTED_OUTPUT = "Ich lebe in Berlin" 6 | INPUT = "I live in Berlin" 7 | 8 | 9 | def test_translator_with_query(en_to_de_translator): 10 | assert en_to_de_translator.translate(query=INPUT) == EXPECTED_OUTPUT 11 | 12 | 13 | def test_translator_with_list(en_to_de_translator): 14 | assert en_to_de_translator.translate(documents=[INPUT])[0] == EXPECTED_OUTPUT 15 | 16 | 17 | def test_translator_with_document(en_to_de_translator): 18 | assert 
en_to_de_translator.translate(documents=[Document(content=INPUT)])[0].content == EXPECTED_OUTPUT 19 | 20 | 21 | def test_translator_with_dictionary(en_to_de_translator): 22 | assert en_to_de_translator.translate(documents=[{"content": INPUT}])[0]["content"] == EXPECTED_OUTPUT 23 | 24 | 25 | def test_translator_with_dictionary_with_dict_key(en_to_de_translator): 26 | assert en_to_de_translator.translate(documents=[{"key": INPUT}], dict_key="key")[0]["key"] == EXPECTED_OUTPUT 27 | 28 | 29 | def test_translator_with_empty_input(en_to_de_translator): 30 | with pytest.raises(AttributeError): 31 | en_to_de_translator.translate() 32 | 33 | 34 | def test_translator_with_query_and_documents(en_to_de_translator): 35 | with pytest.raises(AttributeError): 36 | en_to_de_translator.translate(query=INPUT, documents=[INPUT]) 37 | 38 | 39 | def test_translator_with_dict_without_text_key(en_to_de_translator): 40 | with pytest.raises(AttributeError): 41 | en_to_de_translator.translate(documents=[{"text1": INPUT}]) 42 | 43 | 44 | def test_translator_with_dict_with_non_string_value(en_to_de_translator): 45 | with pytest.raises(AttributeError): 46 | en_to_de_translator.translate(documents=[{"text": 123}]) 47 | -------------------------------------------------------------------------------- /test/others/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/others/__init__.py -------------------------------------------------------------------------------- /test/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/pipelines/__init__.py -------------------------------------------------------------------------------- /test/pipelines/test_ray.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import ray 5 | 6 | from haystack.pipelines import RayPipeline 7 | 8 | from ..conftest import SAMPLES_PATH 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def shutdown_ray(): 13 | yield 14 | try: 15 | import ray 16 | 17 | ray.shutdown() 18 | except: 19 | pass 20 | 21 | 22 | @pytest.mark.integration 23 | @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) 24 | def test_load_pipeline(document_store_with_docs): 25 | pipeline = RayPipeline.load_from_yaml( 26 | SAMPLES_PATH / "pipeline" / "ray.haystack-pipeline.yml", 27 | pipeline_name="ray_query_pipeline", 28 | ray_args={"num_cpus": 8}, 29 | ) 30 | prediction = pipeline.run(query="Who lives in Berlin?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}) 31 | 32 | assert ray.serve.get_deployment(name="ESRetriever").num_replicas == 2 33 | assert ray.serve.get_deployment(name="Reader").num_replicas == 1 34 | assert prediction["query"] == "Who lives in Berlin?" 
35 | assert prediction["answers"][0].answer == "Carla" 36 | -------------------------------------------------------------------------------- /test/samples/dc/matching_test_1.csv: -------------------------------------------------------------------------------- 1 | query,text,context,file_name,answer_start,answer_end 2 | "What are Primitives?","These are classes that carry data through the system.","# Primitives\n\nIn Haystack, there are a handful of core classes that are regularly used in many different places.\nThese are classes that carry data through the system.\nUsers will likely interact with these as either the input or output of their pipeline.\n\n## Document\n\nThe Document class contains all the information regarding the contents of a document,\nincluding its id and metadata.\nIt may also contain information created in the pipeline including the confidence ","sample_pdf_1.pdf",113,166 3 | -------------------------------------------------------------------------------- /test/samples/dc/pipeline_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "master", 3 | "name": "document_retrieval_1", 4 | "components": [ 5 | { 6 | "name": "DocumentStore", 7 | "type": "DeepsetCloudDocumentStore", 8 | "params": { 9 | "similarity": "cosine" 10 | } 11 | }, 12 | { 13 | "name": "Retriever", 14 | "type": "BM25Retriever", 15 | "params": { 16 | "document_store": "DocumentStore", 17 | "top_k": 5 18 | } 19 | }, 20 | { 21 | "name": "Reader", 22 | "type": "FARMReader", 23 | "params": { 24 | "model_name_or_path": "deepset/minilm-uncased-squad2" 25 | } 26 | }, 27 | { 28 | "name": "TextFileConverter", 29 | "type": "TextConverter" 30 | }, 31 | { 32 | "name": "Preprocessor", 33 | "type": "PreProcessor", 34 | "params": { 35 | "split_by": "word", 36 | "split_length": 1000 37 | } 38 | } 39 | ], 40 | "pipelines": [ 41 | { 42 | "name": "query", 43 | "nodes": [ 44 | { 45 | "name": "Retriever", 46 | "inputs": [ 47 | "Query" 48 | ] 
49 | } 50 | ] 51 | }, 52 | { 53 | "name": "indexing", 54 | "nodes": [ 55 | { 56 | "name": "TextFileConverter", 57 | "inputs": [ 58 | "File" 59 | ] 60 | }, 61 | { 62 | "name": "Preprocessor", 63 | "inputs": [ 64 | "TextFileConverter" 65 | ] 66 | }, 67 | { 68 | "name": "Retriever", 69 | "inputs": [ 70 | "Preprocessor" 71 | ] 72 | }, 73 | { 74 | "name": "DocumentStore", 75 | "inputs": [ 76 | "Retriever" 77 | ] 78 | } 79 | ] 80 | } 81 | ] 82 | } -------------------------------------------------------------------------------- /test/samples/docs/doc_1.txt: -------------------------------------------------------------------------------- 1 | Some text for testing. 2 | Two lines in here. -------------------------------------------------------------------------------- /test/samples/docs/doc_2.txt: -------------------------------------------------------------------------------- 1 | A Doc specifically talking about haystack. 2 | Haystack can be used to scale QA models to large document collections. 
-------------------------------------------------------------------------------- /test/samples/docx/sample_docx.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/docx/sample_docx.docx -------------------------------------------------------------------------------- /test/samples/extensionless_files/docx_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/docx_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/gif_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/gif_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/html_file: -------------------------------------------------------------------------------- 1 | 2 | sample -------------------------------------------------------------------------------- /test/samples/extensionless_files/jpg_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/jpg_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/mp3_file: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/mp3_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/odt_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/odt_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/pdf_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/pdf_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/png_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/png_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/pptx_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/pptx_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/txt_file: -------------------------------------------------------------------------------- 1 | Sample -------------------------------------------------------------------------------- /test/samples/extensionless_files/wav_file: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/wav_file -------------------------------------------------------------------------------- /test/samples/extensionless_files/zip_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/extensionless_files/zip_file -------------------------------------------------------------------------------- /test/samples/pdf/sample_pdf_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/pdf/sample_pdf_1.pdf -------------------------------------------------------------------------------- /test/samples/pdf/sample_pdf_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/test/samples/pdf/sample_pdf_2.pdf -------------------------------------------------------------------------------- /test/samples/pipeline/ray.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: ignore 2 | extras: ray 3 | 4 | components: 5 | - name: DocumentStore 6 | type: ElasticsearchDocumentStore 7 | params: 8 | index: haystack_test 9 | label_index: haystack_test_label 10 | - name: ESRetriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore 14 | - name: Reader 15 | type: FARMReader 16 | params: 17 | no_ans_boost: -10 18 | model_name_or_path: deepset/roberta-base-squad2 19 | num_processes: 0 20 | - name: PDFConverter 21 | type: PDFToTextConverter 
22 | params: 23 | remove_numeric_tables: false 24 | - name: Preprocessor 25 | type: PreProcessor 26 | params: 27 | clean_whitespace: true 28 | - name: IndexTimeDocumentClassifier 29 | type: TransformersDocumentClassifier 30 | params: 31 | batch_size: 16 32 | use_gpu: false 33 | - name: QueryTimeDocumentClassifier 34 | type: TransformersDocumentClassifier 35 | params: 36 | use_gpu: false 37 | 38 | 39 | pipelines: 40 | - name: ray_query_pipeline 41 | nodes: 42 | - name: ESRetriever 43 | replicas: 2 44 | inputs: [ Query ] 45 | - name: Reader 46 | inputs: [ ESRetriever ] 47 | -------------------------------------------------------------------------------- /test/samples/pipeline/test.haystack-pipeline.yml: -------------------------------------------------------------------------------- 1 | version: ignore 2 | 3 | components: 4 | - name: Reader 5 | type: FARMReader 6 | params: 7 | no_ans_boost: -10 8 | model_name_or_path: deepset/roberta-base-squad2 9 | num_processes: 0 10 | - name: ESRetriever 11 | type: BM25Retriever 12 | params: 13 | document_store: DocumentStore 14 | - name: DocumentStore 15 | type: ElasticsearchDocumentStore 16 | params: 17 | index: haystack_test 18 | label_index: haystack_test_label 19 | - name: PDFConverter 20 | type: PDFToTextConverter 21 | params: 22 | remove_numeric_tables: false 23 | - name: TextConverter 24 | type: TextConverter 25 | - name: Preprocessor 26 | type: PreProcessor 27 | params: 28 | clean_whitespace: true 29 | - name: IndexTimeDocumentClassifier 30 | type: TransformersDocumentClassifier 31 | params: 32 | batch_size: 16 33 | use_gpu: false 34 | - name: QueryTimeDocumentClassifier 35 | type: TransformersDocumentClassifier 36 | params: 37 | use_gpu: false 38 | 39 | 40 | pipelines: 41 | - name: query_pipeline 42 | nodes: 43 | - name: ESRetriever 44 | inputs: [Query] 45 | - name: Reader 46 | inputs: [ESRetriever] 47 | 48 | - name: query_pipeline_with_document_classifier 49 | nodes: 50 | - name: ESRetriever 51 | inputs: [Query] 52 | - 
name: QueryTimeDocumentClassifier 53 | inputs: [ESRetriever] 54 | - name: Reader 55 | inputs: [QueryTimeDocumentClassifier] 56 | 57 | - name: indexing_pipeline 58 | nodes: 59 | - name: PDFConverter 60 | inputs: [File] 61 | - name: Preprocessor 62 | inputs: [PDFConverter] 63 | - name: ESRetriever 64 | inputs: [Preprocessor] 65 | - name: DocumentStore 66 | inputs: [ESRetriever] 67 | 68 | - name: indexing_text_pipeline 69 | nodes: 70 | - name: TextConverter 71 | inputs: [File] 72 | - name: Preprocessor 73 | inputs: [TextConverter] 74 | - name: ESRetriever 75 | inputs: [Preprocessor] 76 | - name: DocumentStore 77 | inputs: [ESRetriever] 78 | 79 | - name: indexing_pipeline_with_classifier 80 | nodes: 81 | - name: PDFConverter 82 | inputs: [File] 83 | - name: Preprocessor 84 | inputs: [PDFConverter] 85 | - name: IndexTimeDocumentClassifier 86 | inputs: [Preprocessor] 87 | - name: ESRetriever 88 | inputs: [IndexTimeDocumentClassifier] 89 | - name: DocumentStore 90 | inputs: [ESRetriever] 91 | -------------------------------------------------------------------------------- /test/samples/qa/answer-offset-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 0}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/answer-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": 
"5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "11", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/dev-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 53}], "is_impossible": false}], "context": "The Normans gave their name to Normandy, a region in France."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/eval-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c6", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | }, 19 | { 20 | "title": "Test2", 21 | "paragraphs": [ 22 | { 23 | "context": "Berlin has 10 inhabitants.", 24 | "qas": [ 25 | { 26 | "question": "How many people live in Berlin?", 27 | "id": "5ad3d560604f3c001a3ff2c7", 28 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 29 | "is_impossible": false 30 | }, 31 | { 32 | "question": "How many people live in Berlin?", 33 | "id": "5ad3d560604f3c001a3ff2c8", 34 | "answers": [{"text": "Berlin", "answer_start": 0}, {"text": "Berlin", "answer_start": 0}], 35 | "is_impossible": false 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | ], 42 | "version": "v2.0" 43 | } -------------------------------------------------------------------------------- /test/samples/qa/noanswer.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/train-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 159}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. 
They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/vanilla.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/squad/tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "test1", 5 | "paragraphs": [ 6 | { 7 | "context": "My name is Carla and I live together with Abdul in Berlin", 8 | "qas": [ 9 | { 10 | "answers": [ 11 | { 12 | "answer_start": 11, 13 | "text": "Carla" 14 | }, 15 | { 16 | "answer_start": 42, 17 | "text": "Abdul" 18 | }, 19 | { 20 | "answer_start": 11, 21 | "text": "Carla and I live together with Abdul" 22 | } 23 | ], 24 | "id": 7211011040021040393, 25 | "question": "Who lives in Berlin?", 26 | "is_impossible": false 27 | } 28 | ] 29 | } 30 | ] 31 | }, 32 | { 33 | "title": "test2", 34 | "paragraphs": [ 35 | { 36 | "context": "This is another test context", 37 | "qas": [ 38 | { 39 | "answers": [ 40 | { 41 | "answer_start": 0, 42 | "text": "This" 43 | }, 44 | { 45 | "answer_start": 5, 46 | "text": "is" 47 | } 48 | ], 49 | "id": -5782547119306399562, 50 | "question": "The model can't answer this", 51 | "is_impossible": false 52 | } 53 | ] 54 | } 55 | ] 56 | } 57 | ] 
58 | } -------------------------------------------------------------------------------- /test/samples/squad/tiny_passages.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "test1", 5 | "paragraphs": [ 6 | { 7 | "context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.", 8 | "qas": [ 9 | { 10 | "answers": [ 11 | { 12 | "answer_start": 11, 13 | "text": "Carla" 14 | }, 15 | { 16 | "answer_start": 42, 17 | "text": "Abdul" 18 | }, 19 | { 20 | "answer_start": 89, 21 | "text": "Leila" 22 | } 23 | ], 24 | "id": 7211011040021040393, 25 | "question": "Who lives in Berlin?", 26 | "is_impossible": false 27 | } 28 | ] 29 | } 30 | ] 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | # RUN apt-get update && apt-get install -y curl git pkg-config cmake 4 | 5 | # copy code 6 | COPY . /ui 7 | 8 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ 9 | python3 \ 10 | python3-pip 11 | 12 | # install as a package 13 | RUN pip install --upgrade pip && \ 14 | pip install /ui/ \ 15 | pip install pyyaml 16 | 17 | RUN ln -s /usr/bin/python3.8 /usr/bin/python 18 | WORKDIR /ui 19 | EXPOSE 8501 20 | 21 | # cmd for running the API 22 | CMD ["python", "-m", "streamlit", "run", "ui/webapp.py"] 23 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | ## Demo UI 2 | 3 | This is a minimal UI that can spin up to test Haystack for your prototypes. It's based on streamlit and is very easy to extend for your purposes. 
4 | 5 | ![Screenshot](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/streamlit_ui_screenshot.png) 6 | 7 | ## Usage 8 | 9 | ### Get started with Haystack 10 | 11 | The UI interacts with the Haystack REST API. To get started with Haystack please visit the [README](https://github.com/deepset-ai/haystack/tree/main#key-components) or checko out our [tutorials](https://haystack.deepset.ai/tutorials/first-qa-system). 12 | 13 | ### Option 1: Local 14 | 15 | Execute in this folder: 16 | ``` 17 | streamlit run ui/webapp.py 18 | ``` 19 | 20 | Requirements: This expects a running Haystack REST API at `http://localhost:8000` 21 | 22 | ### Option 2: Container 23 | 24 | Just run 25 | ``` 26 | docker-compose up -d 27 | ``` 28 | in the root folder of the Haystack repository. This will start three containers (Elasticsearch, Haystack API, Haystack UI). 29 | You can find the UI at `http://localhost:8501` 30 | 31 | ## Evaluation Mode 32 | 33 | The evaluation mode leverages the feedback REST API endpoint of haystack. The user has the options "Wrong answer", "Wrong answer and wrong passage" and "Wrong answer and wrong passage" to give feedback. 34 | 35 | In order to use the UI in evaluation mode, you need an ElasticSearch instance with pre-indexed files and the Haystack REST API. You can set the environment up via docker images. For ElasticSearch, you can check out our [documentation](https://haystack.deepset.ai/usage/document-store#initialisation) and for setting up the REST API this [link](https://github.com/deepset-ai/haystack/blob/main/README.md#7-rest-api). 36 | 37 | To enter the evaluation mode, select the checkbox "Evaluation mode" in the sidebar. The UI will load the predefined questions from the file [`eval_labels_examples`](https://raw.githubusercontent.com/deepset-ai/haystack/main/ui/ui/eval_labels_example.csv). The file needs to be prefilled with your data. 
This way, the user will get a random question from the set and can give his feedback with the buttons below the questions. To load a new question, click the button "Get random question". 38 | 39 | The file just needs to have two columns separated by semicolon. You can add more columns but the UI will ignore them. Every line represents a questions answer pair. The columns with the questions needs to be named “Question Text” and the answer column “Answer” so that they can be loaded correctly. Currently, the easiest way to create the file is manually by adding question answer pairs. 40 | 41 | The feedback can be exported with the API endpoint `export-doc-qa-feedback`. To learn more about finetuning a model with user feedback, please check out our [docs](https://haystack.deepset.ai/usage/domain-adaptation#user-feedback). 42 | 43 | ![Screenshot](https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/streamlit_ui_screenshot_eval_mode.png) -------------------------------------------------------------------------------- /ui/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "ui" 7 | description = 'Minimal UI for Haystack (https://github.com/deepset-ai/haystack)' 8 | readme = "README.md" 9 | requires-python = ">=3.7" 10 | license = "Apache-2.0" 11 | keywords = [] 12 | authors = [ 13 | { name = "deepset.ai", email = "malte.pietsch@deepset.ai" }, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "Intended Audience :: Science/Research", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | 
"Programming Language :: Python :: Implementation :: CPython", 26 | ] 27 | dependencies = [ 28 | #"streamlit >= 1.9.0, < 2", 29 | "streamlit == 1.11.1", 30 | "st-annotated-text >= 2.0.0, < 3", 31 | "markdown >= 3.3.4, < 4" 32 | ] 33 | dynamic = ["version"] 34 | 35 | [project.urls] 36 | Documentation = "https://github.com/deepset-ai/haystack/tree/main/ui#readme" 37 | Issues = "https://github.com/deepset-ai/haystack/issues" 38 | Source = "https://github.com/deepset-ai/haystack/tree/main/ui" 39 | 40 | [tool.hatch.version] 41 | path = "ui/__about__.py" 42 | 43 | [tool.hatch.build.targets.sdist] 44 | [tool.hatch.build.targets.wheel] 45 | 46 | [tool.hatch.envs.default] 47 | dependencies = [ 48 | "pytest", 49 | "pytest-cov", 50 | ] 51 | [tool.hatch.envs.default.scripts] 52 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=ui --cov=tests" 53 | no-cov = "cov --no-cov" 54 | 55 | [[tool.hatch.envs.test.matrix]] 56 | python = ["37", "38", "39", "310"] 57 | 58 | [tool.coverage.run] 59 | branch = true 60 | parallel = true 61 | omit = [ 62 | "ui/__about__.py", 63 | ] 64 | 65 | [tool.coverage.report] 66 | exclude_lines = [ 67 | "no cov", 68 | "if __name__ == .__main__.:", 69 | "if TYPE_CHECKING:", 70 | ] 71 | 72 | [tool.black] 73 | line-length = 120 74 | skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. 
75 | -------------------------------------------------------------------------------- /ui/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/ui/test/__init__.py -------------------------------------------------------------------------------- /ui/test/test_ui_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from ui.utils import haystack_is_ready 4 | 5 | 6 | def test_haystack_is_ready(): 7 | with patch("requests.get") as mocked_get: 8 | mocked_get.return_value.status_code = 200 9 | assert haystack_is_ready() 10 | 11 | 12 | def test_haystack_is_ready_fail(): 13 | with patch("requests.get") as mocked_get: 14 | mocked_get.return_value.status_code = 400 15 | assert not haystack_is_ready() 16 | -------------------------------------------------------------------------------- /ui/ui/__about__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pathlib import Path 4 | 5 | 6 | __version__ = "0.0.0" 7 | try: 8 | __version__ = open(Path(__file__).parent.parent / "VERSION.txt", "r").read() 9 | except Exception as e: 10 | logging.exception("No VERSION.txt found!") 11 | -------------------------------------------------------------------------------- /ui/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/open-domain-question-and-answer/7869af3f98ff6b06d7e1d8e576a6c5ca8bbd573d/ui/ui/__init__.py -------------------------------------------------------------------------------- /ui/ui/eval_labels_example.csv: -------------------------------------------------------------------------------- 1 | "Question Text";"Answer" 2 | "What is the capital of France?";"Paris" 3 | "What's the tallest mountain in 
Africa?";"Mount Kilimanjaro" 4 | "What's the climate of Beijing?";"monsoon-influenced humid continental" 5 | "What's the longest river of Europe?";"The Volga" 6 | "What's the deepest lake in the world?";"Lake Bajkal" 7 | "How many people live in the capital of the US?";"689,545" 8 | "Which Chinese city is the largest?";"Shanghai" 9 | "What's the type of government of the UK?";"unitary parliamentary democracy and constitutional monarchy" 10 | "What currency is used in Hungary?";"Hungarian forint" 11 | "In which city is the Louvre?";"Paris" 12 | "Who is the current king of Spain?";"Felipe VI" 13 | "Which countries border with Mongolia?";"Russia and China" 14 | "What's the current name of Swaziland?";"Eswatini" --------------------------------------------------------------------------------