├── .dockerignore ├── .dvc ├── .gitignore ├── config └── plots │ ├── confusion.json │ ├── default.json │ ├── scatter.json │ └── smooth.json ├── .dvcignore ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── documentation.md │ ├── feature-request.md │ └── other-questions-help.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── ci.yaml ├── .gitignore ├── .mypy.ini ├── .readthedocs.yml ├── AUTHORS.md ├── CONTRIBUTING.md ├── COPYING ├── COPYING.LESSER ├── README.md ├── benchmarks ├── conftest.py ├── test_benchmark_insert.py └── test_benchmark_servers.py ├── data_and_models ├── annotations │ ├── README.md │ ├── ner │ │ ├── .gitignore │ │ ├── README.md │ │ ├── analyze.py │ │ ├── annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl.dvc │ │ ├── annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl.dvc │ │ ├── annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl.dvc │ │ ├── annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl.dvc │ │ ├── annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl.dvc │ │ ├── annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl.dvc │ │ ├── annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl.dvc │ │ ├── annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl.dvc │ │ ├── annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl.dvc │ │ ├── annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl.dvc │ │ ├── annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl.dvc │ │ ├── annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl.dvc │ │ ├── annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl.dvc │ │ ├── annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl.dvc │ │ ├── annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl.dvc │ │ ├── patterns │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── pathway_patterns.jsonl.dvc │ │ │ └── patterns.jsonl.dvc │ │ └── 
rule_based_patterns.jsonl.dvc │ └── sentence_embedding │ │ ├── .gitignore │ │ ├── README.md │ │ ├── cord19_v47_sentences_pre.txt.dvc │ │ └── sentence_similarity_cord19.csv.dvc ├── metrics │ ├── ner │ │ ├── cell_compartment.json │ │ ├── cell_type.json │ │ ├── chemical.json │ │ ├── disease.json │ │ ├── drug.json │ │ ├── interrater │ │ │ ├── cell_compartment.json │ │ │ ├── cell_type.json │ │ │ ├── chemical.json │ │ │ ├── condition.json │ │ │ ├── disease.json │ │ │ ├── drug.json │ │ │ ├── organ.json │ │ │ ├── organism.json │ │ │ ├── pathway.json │ │ │ └── protein.json │ │ ├── organ.json │ │ ├── organism.json │ │ ├── pathway.json │ │ └── protein.json │ └── sentence_embedding │ │ ├── .gitignore │ │ ├── biobert_nli_sts.json │ │ ├── biobert_nli_sts_cord19_v1.json │ │ ├── count.json │ │ ├── sbert.json │ │ ├── sbiobert.json │ │ └── tf_idf.json ├── models │ ├── language_modeling │ │ ├── .gitignore │ │ └── biobert_cord19_v1.dvc │ ├── ner │ │ └── .gitignore │ ├── ner_er │ │ └── .gitignore │ └── sentence_embedding │ │ ├── .gitignore │ │ └── biobert_nli_sts_cord19_v1.dvc ├── pipelines │ ├── README.md │ ├── ner │ │ ├── Dockerfile │ │ ├── add_er.py │ │ ├── clean.py │ │ ├── config.cfg │ │ ├── dvc.lock │ │ ├── dvc.yaml │ │ ├── eval_ner.py │ │ ├── interrater.py │ │ ├── params.yaml │ │ ├── preprocess.py │ │ └── transformers_vs_spacy │ │ │ ├── requirements.txt │ │ │ ├── spacy │ │ │ ├── .gitignore │ │ │ ├── compare_tokens.py │ │ │ ├── eval.sh │ │ │ └── eval_spacy.py │ │ │ └── transformers │ │ │ ├── .gitignore │ │ │ ├── 0_prepare_data.sh │ │ │ ├── 1_run_transformers_ner.py │ │ │ ├── 1_run_transformers_ner.sh │ │ │ ├── 2_eval_pred.py │ │ │ ├── 2_eval_pred.sh │ │ │ ├── 3_compare_tokens.py │ │ │ ├── create_pickle.py │ │ │ ├── francesco_script.py │ │ │ └── our_bert_classifier.py │ ├── relation_extraction │ │ ├── README.md │ │ └── convert_chemprot_fmt.py │ └── sentence_embedding │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── dvc.lock │ │ ├── dvc.yaml │ │ ├── eval_se.py │ │ ├── params.yaml │ │ 
├── train.py │ │ └── training_transformers │ │ ├── .gitignore │ │ ├── biosses_sentences.txt.dvc │ │ ├── fine_tune.py │ │ ├── sentences-filtered_11-527-877.txt.dvc │ │ └── train.py └── raw_sentences │ ├── .gitignore │ ├── README.md │ ├── raw1_2020-06-10_cord19_TestSet.jsonl.dvc │ ├── raw2_2020-06-29_cord19_Disease.jsonl.dvc │ ├── raw3_2020-06-30_cord19_Disease.jsonl.dvc │ ├── raw4_2020-07-02_cord19_ChemicalOrganism.jsonl.dvc │ ├── raw5_2020-07-08_cord19_Drug_TestSet.jsonl.dvc │ ├── raw6_2020-07-08_cord19_CelltypeProtein.jsonl.dvc │ ├── raw7_2020-09-01_cord19v35_CellCompartment.jsonl.dvc │ ├── raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl.dvc │ └── raw9_2020-09-02_cord19v35_Pathway.jsonl.dvc ├── docker-compose.yml ├── docker ├── base.Dockerfile ├── corenlp.Dockerfile ├── embedding.Dockerfile ├── grobid_quantities.Dockerfile ├── mining.Dockerfile ├── mining.sh ├── mining_cache.Dockerfile ├── mining_cache.sh ├── mysql-make-backup ├── mysql.Dockerfile ├── search.Dockerfile └── utils.sh ├── docs ├── Makefile ├── _static │ └── .keep ├── conf.py ├── index.rst └── source │ ├── _substitutions.rst │ ├── api │ ├── bluesearch.database.article.rst │ ├── bluesearch.database.cord_19.rst │ ├── bluesearch.database.download.rst │ ├── bluesearch.database.mesh.rst │ ├── bluesearch.database.mining_cache.rst │ ├── bluesearch.database.pdf.rst │ ├── bluesearch.database.rst │ ├── bluesearch.database.topic.rst │ ├── bluesearch.database.topic_info.rst │ ├── bluesearch.database.topic_rule.rst │ ├── bluesearch.embedding_models.rst │ ├── bluesearch.entrypoint.create_database.rst │ ├── bluesearch.entrypoint.database.add.rst │ ├── bluesearch.entrypoint.database.add_es.rst │ ├── bluesearch.entrypoint.database.convert_pdf.rst │ ├── bluesearch.entrypoint.database.download.rst │ ├── bluesearch.entrypoint.database.init.rst │ ├── bluesearch.entrypoint.database.parent.rst │ ├── bluesearch.entrypoint.database.parse.rst │ ├── bluesearch.entrypoint.database.parse_mesh_rdf.rst │ ├── 
bluesearch.entrypoint.database.rst │ ├── bluesearch.entrypoint.database.run.rst │ ├── bluesearch.entrypoint.database.schemas.rst │ ├── bluesearch.entrypoint.database.topic_extract.rst │ ├── bluesearch.entrypoint.database.topic_filter.rst │ ├── bluesearch.entrypoint.embedding_server.rst │ ├── bluesearch.entrypoint.embeddings.rst │ ├── bluesearch.entrypoint.mining_cache.rst │ ├── bluesearch.entrypoint.mining_server.rst │ ├── bluesearch.entrypoint.rst │ ├── bluesearch.entrypoint.search_server.rst │ ├── bluesearch.k8s.connect.rst │ ├── bluesearch.k8s.create_indices.rst │ ├── bluesearch.k8s.rst │ ├── bluesearch.mining.attribute.rst │ ├── bluesearch.mining.entity.rst │ ├── bluesearch.mining.eval.rst │ ├── bluesearch.mining.pipeline.rst │ ├── bluesearch.mining.relation.rst │ ├── bluesearch.mining.rst │ ├── bluesearch.rst │ ├── bluesearch.search.rst │ ├── bluesearch.server.embedding_server.rst │ ├── bluesearch.server.invalid_usage_exception.rst │ ├── bluesearch.server.mining_server.rst │ ├── bluesearch.server.rst │ ├── bluesearch.server.search_server.rst │ ├── bluesearch.sql.rst │ ├── bluesearch.utils.rst │ ├── bluesearch.widgets.article_saver.rst │ ├── bluesearch.widgets.mining_schema.rst │ ├── bluesearch.widgets.mining_widget.rst │ ├── bluesearch.widgets.rst │ └── bluesearch.widgets.search_widget.rst │ ├── entrypoint.rst │ ├── example.rst │ ├── faq.rst │ ├── instructions.rst │ ├── logo │ └── BlueBrainSearch_banner.jpg │ ├── server.rst │ └── whatsnew.rst ├── luigi.cfg ├── notebooks ├── STS_evaluation.ipynb ├── create_indices.ipynb └── demo_attribute_extraction.ipynb ├── pyproject.toml ├── requirements-data_and_models.txt ├── requirements-dev.txt ├── requirements.txt ├── screenshots ├── mining_widget_articles.png ├── mining_widget_text.png └── search_widget.png ├── setup.py ├── src └── bluesearch │ ├── __init__.py │ ├── _css │ ├── __init__.py │ ├── style.py │ └── stylesheet.css │ ├── database │ ├── __init__.py │ ├── article.py │ ├── cord_19.py │ ├── download.py │ ├── 
mesh.py │ ├── mining_cache.py │ ├── pdf.py │ ├── topic.py │ ├── topic_info.py │ └── topic_rule.py │ ├── embedding_models.py │ ├── entrypoint │ ├── __init__.py │ ├── _helper.py │ ├── create_database.py │ ├── database │ │ ├── __init__.py │ │ ├── add.py │ │ ├── add_es.py │ │ ├── convert_pdf.py │ │ ├── download.py │ │ ├── init.py │ │ ├── parent.py │ │ ├── parse.py │ │ ├── parse_mesh_rdf.py │ │ ├── run.py │ │ ├── schemas.py │ │ ├── topic_extract.py │ │ └── topic_filter.py │ ├── embedding_server.py │ ├── embeddings.py │ ├── mining_cache.py │ ├── mining_server.py │ └── search_server.py │ ├── k8s │ ├── __init__.py │ ├── connect.py │ └── create_indices.py │ ├── mining │ ├── __init__.py │ ├── attribute.py │ ├── entity.py │ ├── eval.py │ ├── pipeline.py │ └── relation.py │ ├── py.typed │ ├── search.py │ ├── server │ ├── __init__.py │ ├── embedding_server.py │ ├── invalid_usage_exception.py │ ├── mining_server.py │ └── search_server.py │ ├── sql.py │ ├── utils.py │ └── widgets │ ├── __init__.py │ ├── article_saver.py │ ├── mining_schema.py │ ├── mining_widget.py │ └── search_widget.py ├── tests ├── conftest.py ├── data │ ├── 1411.7903v4.xml │ ├── CORD19_samples │ │ ├── biorxiv_medrxiv │ │ │ └── biorxiv_medrxiv │ │ │ │ └── pdf_json │ │ │ │ ├── 9ae476404f7ef1ec1ede965f0b898f31a5bf5a81.json │ │ │ │ └── b52e0f732cefa36aae4d45ebc13208fba190b5af.json │ │ ├── comm_use_subset │ │ │ └── comm_use_subset │ │ │ │ ├── pdf_json │ │ │ │ └── 820acf55c4e52411482f6eb44360ffa35288b89a.json │ │ │ │ └── pmc_json │ │ │ │ └── PMC5878846.xml.json │ │ ├── custom_license │ │ │ └── custom_license │ │ │ │ ├── pdf_json │ │ │ │ ├── bd21184623ceed45f1cede4066b540ff330ccb63.json │ │ │ │ └── be602928156cf0ace9899c1c8569eb4f4ea4597b.json │ │ │ │ └── pmc_json │ │ │ │ ├── PMC3396214.xml.json │ │ │ │ └── PMC6863268.xml.json │ │ ├── metadata.csv │ │ └── noncomm_use_subset │ │ │ └── noncomm_use_subset │ │ │ ├── pdf_json │ │ │ └── 67a52569919632f4bf58782538ff24838ac7f26c.json │ │ │ └── pmc_json │ │ │ └── 
PMC3863901.xml.json │ ├── arxiv_api_response.xml │ ├── biorxiv.xml │ ├── cord19_v35 │ │ ├── document_parses │ │ │ ├── pdf_json │ │ │ │ ├── 16e82ce0e0c8a1b36497afc0d4392b4fe21eb174.json │ │ │ │ └── 5f267fa1ef3a65e239aa974329e935a4d93dafd2.json │ │ │ └── pmc_json │ │ │ │ ├── PMC7140272.xml.json │ │ │ │ ├── PMC7186928.xml.json │ │ │ │ └── PMC7223769.xml.json │ │ └── metadata.csv │ ├── efetchpubmed_response.txt │ ├── jats_article.xml │ ├── mining │ │ ├── eval │ │ │ ├── iob_punctuation_after.csv │ │ │ ├── iob_punctuation_before.csv │ │ │ └── ner_iob_sample.csv │ │ └── request │ │ │ └── request.csv │ ├── nlmcatalog_response.txt │ ├── pubmed_article.xml │ ├── pubmed_article_minimal.xml │ ├── pubmed_articles.xml │ └── pubmed_download_index.html ├── integration │ └── test_bbs_database.py └── unit │ ├── database │ ├── test_article.py │ ├── test_cord_19.py │ ├── test_download.py │ ├── test_mesh.py │ ├── test_mining_cache.py │ ├── test_pdf.py │ ├── test_topic.py │ ├── test_topic_info.py │ └── test_topic_rule.py │ ├── entrypoint │ ├── __init__.py │ ├── database │ │ ├── __init__.py │ │ ├── test_add.py │ │ ├── test_add_es.py │ │ ├── test_convert_pdf.py │ │ ├── test_download.py │ │ ├── test_init.py │ │ ├── test_parent.py │ │ ├── test_parse.py │ │ ├── test_parse_mesh_rdf.py │ │ ├── test_run.py │ │ ├── test_topic_extract.py │ │ └── test_topic_filter.py │ ├── test__helper.py │ ├── test_create_database.py │ ├── test_create_mining_cache.py │ ├── test_embedding_server.py │ ├── test_embeddings.py │ ├── test_entrypoint_installation.py │ ├── test_mining_server.py │ └── test_search_sever.py │ ├── k8s │ └── test_create_indices.py │ ├── mining │ ├── test_attribute.py │ ├── test_entity.py │ ├── test_eval.py │ ├── test_pipeline.py │ └── test_relation.py │ ├── server │ ├── __init__.py │ ├── test_embedding_server.py │ ├── test_mining_server.py │ └── test_search_server.py │ ├── test_embedding_models.py │ ├── test_fixtures.py │ ├── test_search.py │ ├── test_sql.py │ ├── test_utils.py │ └── widgets 
│ ├── test_article_saver.py │ ├── test_mining_schema.py │ ├── test_mining_widget.py │ └── test_search_widget.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | /.tox/ 19 | /assets/ 20 | /data/ 21 | /docs/ 22 | /htmlcov/ 23 | /notebooks/ 24 | /sandbox/ 25 | /tests/ 26 | *.egg-info/ 27 | *.egg 28 | .env* 29 | !.env*.example 30 | /.dvc/cache/ 31 | /.dvc/config.local 32 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | /config.local 19 | /tmp 20 | /cache 21 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | [core] 2 | remote = gpfs_ssh 3 | ['remote "gpfs_ssh"'] 4 | url = ssh://bbpv1.bbp.epfl.ch/gpfs/bbp.cscs.ch/data/project/proj115/dvc_remote_storage/ 5 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "rect", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "nominal", 12 | "sort": "ascending", 13 | "title": "" 14 | }, 15 | "y": { 16 | "field": "", 17 | "type": "nominal", 18 | "sort": "ascending", 19 | "title": "" 20 | }, 21 | "color": { 22 | "aggregate": "count", 23 | "type": "quantitative" 24 | }, 25 | "facet": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | 
"field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "point", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "quantitative", 12 | "title": "" 13 | }, 14 | "y": { 15 | "field": "", 16 | "type": "quantitative", 17 | "title": "", 18 | "scale": { 19 | "zero": false 20 | } 21 | }, 22 | "color": { 23 | "field": "rev", 24 | "type": "nominal" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | 5 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 6 | # 7 | # Copyright (C) 2020 Blue Brain Project, EPFL. 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU Lesser General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU Lesser General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU Lesser General Public License 20 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug Report" 3 | about: Submit report to help us reproduce and correct the bug 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🐛 Bug description 11 | 12 | A clear and concise description of what the bug is. 13 | 14 | 15 | ## To reproduce 16 | 17 | Steps to reproduce the behavior: 18 | 19 | 1. 20 | 2. 21 | 3. 22 | 23 | If you have a code sample, error messages, stack traces, please provide it here as well. 24 | 25 | 26 | ## Expected behavior 27 | 28 | A clear and concise description of what you expected to happen. 29 | 30 | 31 | 32 | ## Environment 33 | 34 | Please provide the following information. 
35 | 36 | - Blue Brain Search version (use `python -c "import bluesearch; print(bluesearch.__version__)"`): 37 | - OS (e.g., Linux): 38 | - How you installed Blue Brain Search (source, pip, ...): 39 | - Installation command you used (if compiling from source): 40 | - Python version (use `python -V`): 41 | - Any other relevant information: 42 | 43 | ## Additional context 44 | 45 | Add any other screenshot, context, or information about the problem here. 46 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Documentation" 3 | about: Report an issue related to the docs 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | A clear and concise description of what content in the docs is an issue. 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature Request" 3 | about: Submit a request for a new feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🚀 Feature 11 | 12 | A clear and concise description of the feature proposal. 13 | 14 | 15 | ## Motivation 16 | 17 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. 18 | 19 | 20 | ## Pitch 21 | 22 | A clear description of what you want to happen. 23 | 24 | 25 | ## Alternatives 26 | 27 | Think about any alternative solutions or features that could be used. 28 | Then, write here a clear list of alternatives. 29 | 30 | 31 | ## Additional context 32 | Add any other screenshot, context, or information about the problem here. 
33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/other-questions-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Other Questions / Help" 3 | about: Do you need help or have other questions? 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please refer to our docs first. 11 | 12 | If you have a question or help request that you could not find mentioned in our docs, write it here in a clear, concise, and actionable way. 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes #{issue-id-number}. 2 | 3 | ## Description 4 | 5 | Please provide here a summary of the changes introduced by this PR. 6 | 7 | ## How to test? 8 | 9 | Please provide here instructions on how to test the changes introduced by this PR. 10 | (if some changes cannot be tested by automated tests) 11 | 12 | ## Checklist 13 | 14 | - [ ] This PR refers to an issue from the [issue tracker](https://github.com/BlueBrain/Search/issues). 15 | (if it is not the case, please create an issue first). 16 | - [ ] Unit tests added. 17 | (if needed) 18 | - [ ] Documentation and `whatsnew.rst` updated. 19 | (if needed) 20 | - [ ] `setup.py` and `requirements.txt` updated with new dependencies. 21 | (if needed) 22 | - [ ] Type annotations added. 23 | (if a function is added or modified) 24 | - [ ] All CI tests pass. 25 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | ;Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | ; 3 | ;Copyright (C) 2020 Blue Brain Project, EPFL. 
4 | ; 5 | ;This program is free software: you can redistribute it and/or modify 6 | ;it under the terms of the GNU Lesser General Public License as published by 7 | ;the Free Software Foundation, either version 3 of the License, or 8 | ;(at your option) any later version. 9 | ; 10 | ;This program is distributed in the hope that it will be useful, 11 | ;but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | ;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | ;GNU Lesser General Public License for more details. 14 | ; 15 | ;You should have received a copy of the GNU Lesser General Public License 16 | ;along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | [mypy] 19 | ignore_missing_imports = True 20 | no_implicit_optional = True 21 | check_untyped_defs = True 22 | strict_equality = True 23 | warn_redundant_casts = True 24 | warn_unused_ignores = True 25 | show_error_codes = True 26 | plugins = sqlmypy 27 | exclude = benchmarks/conftest.py|data_and_models/pipelines/ner/transformers_vs_spacy/transformers/|data_and_models/pipelines/sentence_embedding/training_transformers/ 28 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | formats: [] 4 | 5 | sphinx: 6 | builder: html 7 | configuration: docs/conf.py 8 | 9 | build: 10 | image: "6.0" 11 | 12 | python: 13 | version: 3.7 14 | install: 15 | - method: pip 16 | path: . 
17 | extra_requirements: 18 | - dev 19 | system_packages: true 20 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | - [Francesco Casalegno](https://github.com/FrancescoCasalegno) @ Blue Brain Project, EPFL 2 | - [Emilie Delattre](https://github.com/EmilieDel) @ Blue Brain Project, EPFL 3 | - [Pierre-Alexandre Fonta](https://github.com/pafonta) @ Blue Brain Project, EPFL 4 | - [Jan Krepl](https://github.com/jankrepl) @ Blue Brain Project, EPFL 5 | - [Stanislav Schmidt](https://github.com/Stannislav) @ Blue Brain Project, EPFL 6 | - [Anıl Tuncel](https://github.com/anilbey) @ Blue Brain Project, EPFL -------------------------------------------------------------------------------- /benchmarks/conftest.py: -------------------------------------------------------------------------------- 1 | """Configuration of pytest benchmarks.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 
19 | 20 | import pytest 21 | 22 | 23 | def pytest_addoption(parser): 24 | parser.addoption("--embedding_server", default="", help="Embedding server URI") 25 | parser.addoption("--mining_server", default="", help="Mining server URI") 26 | parser.addoption("--mysql_server", default="", help="MySQL server URI") 27 | parser.addoption("--search_server", default="", help="Search server URI") 28 | 29 | 30 | @pytest.fixture(scope="session") 31 | def benchmark_parameters(request): 32 | return { 33 | "embedding_server": request.config.getoption("--embedding_server"), 34 | "mining_server": request.config.getoption("--mining_server"), 35 | "mysql_server": request.config.getoption("--mysql_server"), 36 | "search_server": request.config.getoption("--search_server"), 37 | } 38 | -------------------------------------------------------------------------------- /benchmarks/test_benchmark_insert.py: -------------------------------------------------------------------------------- 1 | """Benchmark INSERT operations through Pandas with and without transactions.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 
19 | 20 | import numpy as np 21 | import pandas as pd 22 | import pytest as pt 23 | import sqlalchemy 24 | 25 | PORT = 9731 26 | 27 | 28 | @pt.fixture 29 | def data(): 30 | rng = np.random.default_rng(1739) 31 | numbers = rng.integers(10, size=100000) 32 | return pd.DataFrame({"column": numbers}) 33 | 34 | 35 | @pt.fixture 36 | def engine(): 37 | return sqlalchemy.create_engine( 38 | f"mysql+pymysql://root:root@localhost:{PORT}/benchmarks" 39 | ) 40 | 41 | 42 | def insert_without_transactions(data, engine): 43 | data.to_sql("without", engine, if_exists="append", index=False) 44 | 45 | 46 | def insert_with_transactions(data, engine): 47 | with engine.begin() as con: 48 | data.to_sql("with", con, if_exists="append", index=False) 49 | 50 | 51 | def test_insert_without_transactions(benchmark, data, engine): 52 | benchmark(insert_without_transactions, data, engine) 53 | 54 | 55 | def test_insert_with_transactions(benchmark, data, engine): 56 | benchmark(insert_with_transactions, data, engine) 57 | -------------------------------------------------------------------------------- /data_and_models/annotations/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Description 21 | This directory contains collections of annotations that can be used 22 | for training or evaluating NLP models. 23 | The raw sentences, without annotations, can be found in the 24 | directory `raw_sentences/`. 25 | 26 | # Content 27 | ## `ner/` 28 | - Annotations collected with `prodigy` in order to train or evaluate NER models. 29 | 30 | ## `sentence_embedding/` 31 | - Annotations collected in order to train or evaluate sentence embedding models. 32 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 
2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | /annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl 19 | /annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl 20 | /annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl 21 | /annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl 22 | /annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl 23 | /annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl 24 | /annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl 25 | /annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl 26 | /annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl 27 | /annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl 28 | /annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl 29 | /annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl 30 | /annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl 31 | /annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl 32 | /annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl 33 | /rule_based_patterns.jsonl 34 | /annotations_cell_compartment.jsonl 35 | /annotations_cell_type.jsonl 36 | /annotations_chemical.jsonl 37 | /annotations_disease.jsonl 
38 | /annotations_drug.jsonl 39 | /annotations_organ.jsonl 40 | /annotations_organism.jsonl 41 | /annotations_pathway.jsonl 42 | /annotations_protein.jsonl 43 | /annotations_cell_compartment.dev.spacy 44 | /annotations_cell_compartment.train.spacy 45 | /annotations_cell_type.dev.spacy 46 | /annotations_cell_type.train.spacy 47 | /annotations_chemical.dev.spacy 48 | /annotations_chemical.train.spacy 49 | /annotations_disease.dev.spacy 50 | /annotations_disease.train.spacy 51 | /annotations_drug.dev.spacy 52 | /annotations_drug.train.spacy 53 | /annotations_organ.dev.spacy 54 | /annotations_organ.train.spacy 55 | /annotations_organism.dev.spacy 56 | /annotations_organism.train.spacy 57 | /annotations_pathway.dev.spacy 58 | /annotations_pathway.train.spacy 59 | /annotations_protein.dev.spacy 60 | /annotations_protein.train.spacy 61 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 563441b77b5c39063cda3fa0fb03803c 3 | path: annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 735cba4532a4c2c5e928399eafc1000f 3 | path: annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 117da032ef2e2792429dff88c06c90b7 3 | path: 
annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 6e248863604ce14861f38e7c9a8281bb 3 | path: annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 2459e49418599ff96ab9db60a29b757b 3 | path: annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: ceb6ea77d2a6a69962b88218f4f5a663 3 | path: annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 58e2ab00caf7a42e958755ec8d4bb999 3 | path: annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 17f2d3dec1ef70f75973ad07e533efe1 
3 | path: annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: c4919fd5fbc48a8b75019e18517d5842 3 | path: annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 6e371a67674af1fd7a1cc1243c107b4a 3 | path: annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 613308ed830d86284a1aee2747d911d0 3 | path: annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 4b6fff4b0091fa10ae7c88a4aeb42ae0 3 | path: annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: b26a0e08e2265b53f625a48f1e1da10d 3 | path: 
annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 36d574da83a1b52b61016b7418494272 3 | path: annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: ec5d6b86181b9e4cdefb2b2198d5ae4f 3 | path: annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/patterns/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | /patterns.jsonl 19 | /pathway_patterns.jsonl 20 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/patterns/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Description 21 | - These pattern files are sometimes used to train NER models to provide a first guess. 22 | - This is particularly necessary when no basis model can be found (e.g. SciSpaCy models) to provide good first 23 | guesses for the entity type of interest. 24 | 25 | # Content 26 | ## `patterns/patterns.jsonl` 27 | - Contains all entities that Emmanuelle identified in Ontology v3 (it then 28 | pre-annotates those entities in the prodigy GUI). 29 | 30 | ## `patterns/pathway_patterns.jsonl` 31 | - Contains a list of entities that Emmanuelle considers as a good starting point 32 | for the entity type PATHWAY. 33 | - The file was generated using `prodigy terms.teach`. 34 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/patterns/pathway_patterns.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 75f7ba9b965cef0aab2aaaa434fa9e2d 3 | path: pathway_patterns.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/patterns/patterns.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 01ea4545220f5b09a4d55cf873ff22cb 3 | path: patterns.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/ner/rule_based_patterns.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 044c1a326c2472aa4eb72c4c98a7400b 3 | size: 1709 4 | path: rule_based_patterns.jsonl 5 | 
-------------------------------------------------------------------------------- /data_and_models/annotations/sentence_embedding/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | /sentence_similarity_cord19.csv 19 | /cord19_v47_sentences_pre.txt 20 | -------------------------------------------------------------------------------- /data_and_models/annotations/sentence_embedding/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Description 21 | - Annotations collected in order to train or evaluate sentence embedding models. 22 | 23 | # Content 24 | 25 | ## `cord19_v47_sentences_pre.txt` 26 | - Unannotated file of sentences (one line per sentence) from CORD-19. 27 | - 20,510,932 total sentences. 28 | - Can be used to train unsupervised NLP models. 29 | 30 | ## `sentence_similarity_cord19.csv` 31 | - Sentence pairs with similarity scores annotated by Emmanuelle Logette. 
32 | - 40 sentence pairs in total: 33 | - 20 pairs (those with `sentence_id` starting with `A-`) are generically 34 | extracted from the CORD-19 dataset 35 | - 20 pairs (those with `sentence_id` starting with `B-`) are also extracted from 36 | the CORD-19 dataset but are focused on "COVID-19" and "glucose" topics. 37 | - The scoring system is the one used in Soğancıoğlu G. et al. "BIOSSES: a semantic sentence 38 | similarity estimation system for the biomedical domain." Bioinformatics 33.14 (2017): i49-i58. 39 | 40 | | Score | Comment | 41 | | --- | --- | 42 | | 0 | The two sentences are on different topics. | 43 | | 1 | The two sentences are not equivalent, but are on the same topic. | 44 | | 2 | The two sentences are not equivalent, but share some details. | 45 | | 3 | The two sentences are roughly equivalent, but some important information differs/missing. | 46 | | 4 | The two sentences are completely or mostly equivalent, as they mean the same thing. | 47 | -------------------------------------------------------------------------------- /data_and_models/annotations/sentence_embedding/cord19_v47_sentences_pre.txt.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: d0c68b698738f714df81eb5fb29236fe 3 | path: cord19_v47_sentences_pre.txt 4 | -------------------------------------------------------------------------------- /data_and_models/annotations/sentence_embedding/sentence_similarity_cord19.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 92a4235ce8d292ff382ce008c31da45c 3 | path: sentence_similarity_cord19.csv 4 | -------------------------------------------------------------------------------- /data_and_models/metrics/ner/cell_compartment.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6666666666666666, "entity_recall": 0.8571428571428571, "entity_f1-score": 0.75, "entity_support": 
42, "token_precision": 0.6857142857142857, "token_recall": 0.9411764705882353, "token_f1-score": 0.7933884297520661, "token_support": 51} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/cell_type.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6363636363636364, "entity_recall": 0.7857142857142857, "entity_f1-score": 0.7031963470319634, "entity_support": 98, "token_precision": 0.6475770925110133, "token_recall": 0.8855421686746988, "token_f1-score": 0.7480916030534351, "token_support": 166} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/chemical.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.5, "entity_recall": 0.48175182481751827, "entity_f1-score": 0.49070631970260226, "entity_support": 137, "token_precision": 0.5508982035928144, "token_recall": 0.5542168674698795, "token_f1-score": 0.5525525525525525, "token_support": 166} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/disease.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.693939393939394, "entity_recall": 0.720125786163522, "entity_f1-score": 0.7067901234567902, "entity_support": 318, "token_precision": 0.7504363001745201, "token_recall": 0.7948243992606284, "token_f1-score": 0.7719928186714542, "token_support": 541} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/drug.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6904761904761905, "entity_recall": 0.7631578947368421, "entity_f1-score": 0.725, "entity_support": 76, "token_precision": 0.7058823529411765, "token_recall": 0.7228915662650602, 
"token_f1-score": 0.7142857142857143, "token_support": 83} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/cell_compartment.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.8125, "entity_recall": 0.6341463414634146, "entity_f1-score": 0.7123287671232876, "entity_support": 41, "token_precision": 0.8297872340425532, "token_recall": 0.78, "token_f1-score": 0.8041237113402062, "token_support": 50} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/cell_type.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.5869565217391305, "entity_recall": 0.8350515463917526, "entity_f1-score": 0.6893617021276596, "entity_support": 97, "token_precision": 0.6512605042016807, "token_recall": 0.9393939393939394, "token_f1-score": 0.7692307692307693, "token_support": 165} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/chemical.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.4913294797687861, "entity_recall": 0.625, "entity_f1-score": 0.5501618122977346, "entity_support": 136, "token_precision": 0.483739837398374, "token_recall": 0.7212121212121212, "token_f1-score": 0.5790754257907542, "token_support": 165} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/condition.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.4928571428571429, "entity_recall": 0.6831683168316832, "entity_f1-score": 0.5726141078838174, "entity_support": 101, "token_precision": 0.4841628959276018, "token_recall": 0.7588652482269503, "token_f1-score": 0.5911602209944752, 
"token_support": 141} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/disease.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6478494623655914, "entity_recall": 0.8169491525423729, "entity_f1-score": 0.7226386806596702, "entity_support": 295, "token_precision": 0.7301829268292683, "token_recall": 0.9466403162055336, "token_f1-score": 0.8244406196213425, "token_support": 506} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/drug.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.34210526315789475, "entity_recall": 0.48148148148148145, "entity_f1-score": 0.4, "entity_support": 27, "token_precision": 0.30434782608695654, "token_recall": 0.4666666666666667, "token_f1-score": 0.3684210526315789, "token_support": 30} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/organ.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.8243243243243243, "entity_recall": 0.6853932584269663, "entity_f1-score": 0.7484662576687117, "entity_support": 89, "token_precision": 0.8125, "token_recall": 0.7289719626168224, "token_f1-score": 0.7684729064039408, "token_support": 107} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/organism.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.5610687022900763, "entity_recall": 0.7696335078534031, "entity_f1-score": 0.6490066225165563, "entity_support": 191, "token_precision": 0.6412776412776413, "token_recall": 0.8938356164383562, "token_f1-score": 0.7467811158798283, "token_support": 292} 
-------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/pathway.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.5575757575757576, "entity_recall": 0.6571428571428571, "entity_f1-score": 0.6032786885245902, "entity_support": 140, "token_precision": 0.5785714285714286, "token_recall": 0.7297297297297297, "token_f1-score": 0.6454183266932271, "token_support": 222} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/interrater/protein.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6317829457364341, "entity_recall": 0.6197718631178707, "entity_f1-score": 0.6257197696737045, "entity_support": 263, "token_precision": 0.6307339449541285, "token_recall": 0.7236842105263158, "token_f1-score": 0.6740196078431373, "token_support": 380} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/organ.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.4896551724137931, "entity_recall": 0.7634408602150538, "entity_f1-score": 0.5966386554621849, "entity_support": 93, "token_precision": 0.5112359550561798, "token_recall": 0.8125, "token_f1-score": 0.6275862068965518, "token_support": 112} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/organism.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6031746031746031, "entity_recall": 0.7307692307692307, "entity_f1-score": 0.6608695652173913, "entity_support": 208, "token_precision": 0.698005698005698, "token_recall": 0.7827476038338658, "token_f1-score": 0.7379518072289156, "token_support": 313} 
-------------------------------------------------------------------------------- /data_and_models/metrics/ner/pathway.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.6090225563909775, "entity_recall": 0.5510204081632653, "entity_f1-score": 0.5785714285714286, "entity_support": 147, "token_precision": 0.6831683168316832, "token_recall": 0.592274678111588, "token_f1-score": 0.6344827586206897, "token_support": 233} -------------------------------------------------------------------------------- /data_and_models/metrics/ner/protein.json: -------------------------------------------------------------------------------- 1 | {"entity_precision": 0.48739495798319327, "entity_recall": 0.6444444444444445, "entity_f1-score": 0.5550239234449761, "entity_support": 270, "token_precision": 0.5196078431372549, "token_recall": 0.8112244897959183, "token_f1-score": 0.6334661354581673, "token_support": 392} -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. 
If not, see . 17 | 18 | /biobert_nli_sts.csv 19 | /biobert_nli_sts.png 20 | /tf_idf.csv 21 | /tf_idf.png 22 | /count.csv 23 | /count.png 24 | /sbert.csv 25 | /sbert.png 26 | /sbiobert.csv 27 | /sbiobert.png 28 | /biobert_nli_sts_cord19_v1.csv 29 | /biobert_nli_sts_cord19_v1.png 30 | -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/biobert_nli_sts.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.4507476668114661, "pearson_r": 0.5885845851038655, "spearman_rho": 0.5780078495658358} -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/biobert_nli_sts_cord19_v1.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.5778437302402729, "pearson_r": 0.7254632318864527, "spearman_rho": 0.7167140825467725} -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/count.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.31182871376137494, "pearson_r": 0.4221325079692087, "spearman_rho": 0.4103474240867628} -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/sbert.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.28818293451880617, "pearson_r": 0.4174311065388488, "spearman_rho": 0.38012159475593804} -------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/sbiobert.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.40050038592100756, "pearson_r": 0.5297193358767457, "spearman_rho": 0.5147194625851121} 
-------------------------------------------------------------------------------- /data_and_models/metrics/sentence_embedding/tf_idf.json: -------------------------------------------------------------------------------- 1 | {"kendall_tau": 0.37685460667843884, "pearson_r": 0.4973378796506119, "spearman_rho": 0.48742817979126074} -------------------------------------------------------------------------------- /data_and_models/models/language_modeling/.gitignore: -------------------------------------------------------------------------------- 1 | /biobert_cord19_v1 2 | /biobert_cord19_v1__logs 3 | -------------------------------------------------------------------------------- /data_and_models/models/language_modeling/biobert_cord19_v1.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 8ce898d7b0d4920b9312768925a56802.dir 3 | size: 1300619338 4 | nfiles: 9 5 | path: biobert_cord19_v1 6 | -------------------------------------------------------------------------------- /data_and_models/models/ner/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | /model-cell_compartment 19 | /model-cell_type 20 | /model-chemical 21 | /model-disease 22 | /model-drug 23 | /model-organ 24 | /model-organism 25 | /model-pathway 26 | /model-protein 27 | -------------------------------------------------------------------------------- /data_and_models/models/ner_er/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | /model-cell_compartment 19 | /model-cell_type 20 | /model-chemical 21 | /model-disease 22 | /model-drug 23 | /model-organ 24 | /model-organism 25 | /model-pathway 26 | /model-protein 27 | -------------------------------------------------------------------------------- /data_and_models/models/sentence_embedding/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | /tf_idf 19 | /count 20 | /biobert_nli_sts_cord19_v1 21 | -------------------------------------------------------------------------------- /data_and_models/models/sentence_embedding/biobert_nli_sts_cord19_v1.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 9dccac418759a8c53f5da608dbd9f835.dir 3 | size: 433540587 4 | nfiles: 9 5 | path: biobert_nli_sts_cord19_v1 6 | -------------------------------------------------------------------------------- /data_and_models/pipelines/README.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Description 21 | - DVC pipelines to train and evaluate machine learning models. 22 | 23 | # Content 24 | 25 | ## `ner/` 26 | - DVC pipelines to train and evaluate models for Named Entity Recognition. 27 | 28 | ## `sentence_embedding/` 29 | - DVC pipelines to train and evaluate models for Sentence Embedding. 30 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM continuumio/miniconda3:4.9.2 19 | 20 | ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/' 21 | ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/' 22 | ENV http_proxy='http://bbpproxy.epfl.ch:80/' 23 | ENV https_proxy='http://bbpproxy.epfl.ch:80/' 24 | 25 | # Update conda, install additional system packages 26 | RUN true \ 27 | && conda update conda \ 28 | && apt-get update \ 29 | && apt-get install -y gcc g++ build-essential vim libfontconfig1 30 | RUN conda install -c carta mysqlclient 31 | 32 | # Install Blue Brain Search -- revision can be a branch, sha, or tag 33 | ARG BBS_REVISION=v0.2.0 34 | ADD . 
/src 35 | WORKDIR /src 36 | RUN git checkout $BBS_REVISION 37 | # remove ruamel-yaml: https://github.com/pypa/pip/issues/5247#issuecomment-381550610 38 | RUN rm -rf /opt/conda/lib/python3.8/site-packages/ruamel* 39 | RUN pip install -r requirements.txt 40 | RUN pip install -r requirements-data_and_models.txt 41 | RUN pip install $PWD[data_and_models] 42 | 43 | 44 | EXPOSE 8888 45 | 46 | RUN groupadd -g 999 docker 47 | RUN useradd --create-home --uid 1000 --gid docker bbsuser 48 | 49 | WORKDIR /bbs 50 | ENTRYPOINT ["/bin/bash"] 51 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/add_er.py: -------------------------------------------------------------------------------- 1 | """Append an entity ruler to a spacy pipeline.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
# Command-line interface: three mandatory string options.
parser = ArgumentParser()
for _flag, _help in (
    ("--model", "SpaCy model without an entity ruler."),
    ("--output_file", "File to which we save the enhanced spacy pipeline."),
    (
        "--patterns_file",
        "Path to the patterns file used for rule-based entity recognition.",
    ),
):
    parser.add_argument(_flag, required=True, type=str, help=_help)
args = parser.parse_args()


def main():
    """Append an entity ruler to a spaCy pipeline and write it to disk.

    The pipeline given by ``--model`` is loaded, the patterns read from
    ``--patterns_file`` are passed through ``global2model_patterns`` together
    with the upper-cased text that follows the last "-" of the ``--model``
    argument, and the result is registered on an ``entity_ruler`` component
    inserted after the ``ner`` component. The pipeline is then saved to
    ``--output_file``.
    """
    pipeline = spacy.load(args.model)

    print("Loading patterns")
    raw_patterns = JSONL.load_jsonl(pathlib.Path(args.patterns_file))
    # Everything after the last "-" of the model argument names the entity.
    etype = args.model.rpartition("-")[2].upper()
    model_patterns = global2model_patterns(raw_patterns, etype)

    ruler = pipeline.add_pipe(
        "entity_ruler",
        after="ner",
        config={"validate": True, "overwrite_ents": True},
    )
    ruler.add_patterns(model_patterns)

    print("Saving model with an entity ruler")
    pipeline.to_disk(args.output_file)


if __name__ == "__main__":
    main()
15 | eval: 16 | disease: 17 | etype_name: DISEASE 18 | cell_compartment: 19 | etype_name: CELL_COMPARTMENT 20 | drug: 21 | etype_name: DRUG 22 | organ: 23 | etype_name: ORGAN 24 | chemical: 25 | etype_name: CHEMICAL 26 | organism: 27 | etype_name: ORGANISM 28 | cell_type: 29 | etype_name: CELL_TYPE 30 | protein: 31 | etype_name: PROTEIN 32 | pathway: 33 | etype_name: PATHWAY 34 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/requirements.txt: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | seqeval 19 | datasets >= 1.1.3 20 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/spacy/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | df_test_pred.pkl 19 | pathway_metrics.json 20 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/spacy/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | 20 | DVC_BASE="../../../.." 
21 | test_data_1="$DVC_BASE/annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl" 22 | test_data_2="$DVC_BASE/annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl" 23 | 24 | 25 | python eval_spacy.py \ 26 | --annotation_files "$test_data_1,$test_data_2" \ 27 | --model "$DVC_BASE/models/ner/model-pathway" \ 28 | --output_file "pathway_metrics.json" \ 29 | --etype "PATHWAY" 30 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/transformers/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | checkpoints/ 19 | logs/ 20 | pathway_metrics.json 21 | pathway_metrics_token.json 22 | pathway_metrics_entity.json 23 | test_data.pkl 24 | train_data.pkl 25 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/transformers/0_prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
# Stop at the first failing step (and on unset variables) so that a failed
# conversion never leads to pickling or deleting stale/partial files.
set -euo pipefail

export LANG=C.UTF-8

train_data="../../../../annotations/ner/annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl"
test_data_1="../../../../annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl"
test_data_2="../../../../annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl"

# Step 1: convert the annotation files to intermediate text files.
python3 francesco_script.py --annotation-files "$train_data" -o train_data.txt --keep-punctuation --entity-type "PATHWAY"
python3 francesco_script.py --annotation-files "$test_data_1,$test_data_2" -o test_data.txt --keep-punctuation --entity-type "PATHWAY"

# Step 2: turn the text files into pickled data frames.
python3 create_pickle.py train_data.txt train_data.pkl
python3 create_pickle.py test_data.txt test_data.pkl

# Step 3: remove the intermediate text files.
rm train_data.txt
rm test_data.txt
exp_name="evaluate_transformers"

# Alternative options kept for reference:
# --model_name_or_path bert-base-uncased \
# --lr_scheduler_type "constant" \
DS="train_data.pkl"
DS_VAL="test_data.pkl"

# Extra command-line arguments given to this script are forwarded unchanged.
# "$@" is quoted so that arguments containing spaces or glob characters are
# passed through as single words instead of being re-split by the shell.
python3 1_run_transformers_ner.py \
  --model_name_or_path "dmis-lab/biobert-large-cased-v1.1" \
  --output_dir "checkpoints/$exp_name" \
  --do_train \
  --do_eval \
  --do_predict \
  --evaluation_strategy "steps" \
  --eval_steps 10 \
  --train_file "$DS" \
  --validation_file "$DS_VAL" \
  --test_file "$DS_VAL" \
  --num_train_epochs 50 \
  --learning_rate "1e-4" \
  --logging_strategy "epoch" \
  --logging_dir "logs/$exp_name" \
  "$@"
# --dataset_name conll2003 \
19 | 20 | python 2_eval_pred.py checkpoints/evaluate_transformers/test_predictions.txt test_data.pkl 21 | 22 | -------------------------------------------------------------------------------- /data_and_models/pipelines/ner/transformers_vs_spacy/transformers/3_compare_tokens.py: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
# The spaCy pipeline is only used by `lemma` below.
nlp = spacy.load("en_core_web_lg", disable=["vocab", "ner"])


def unroll_rows(df):
    """Turn every row of ``df`` into its own data frame and concatenate them.

    Each row is converted with ``row.to_dict()`` and handed to
    ``pd.DataFrame``. Presumably every cell holds a per-sentence list of equal
    length, so each list element ends up on its own row -- TODO confirm
    against the pickled data.
    """
    frames = []
    for _, row in df.iterrows():
        frames.append(pd.DataFrame(row.to_dict()))
    return pd.concat(frames)


def poor_venn(set1, set2):
    """Print the sizes of ``set1 - set2``, ``set1 & set2`` and ``set2 - set1``."""
    only_first = len(set1 - set2)
    shared = len(set1 & set2)
    only_second = len(set2 - set1)
    print(f"[ {only_first} | {shared} | {only_second} ]")


def lemma(word):
    """Return the lemma of the first spaCy token of the lower-cased ``word``."""
    doc = nlp(word.lower())
    return next(iter(doc)).lemma_


def _entity_lemmata(tokens, labels):
    """Return the lemmata of the unique tokens whose label is not "O"."""
    return {lemma(token) for token in set(tokens[labels != "O"])}


df_train = pd.read_pickle("train_data.pkl")
df_test = pd.read_pickle("test_data.pkl")

# One line of the predictions file holds the labels of one test sentence.
with open("checkpoints/evaluate_transformers/test_predictions.txt") as fp:
    predictions = []
    for line in fp:
        predictions.append(line.strip().split())
df_test["pred"] = predictions

df_train_flat = unroll_rows(df_train)
df_test_flat = unroll_rows(df_test)

train_entities = _entity_lemmata(df_train_flat.token, df_train_flat.entity_type)
test_entities = _entity_lemmata(df_test_flat.token, df_test_flat.entity_type)
pred_entities = _entity_lemmata(df_test_flat.token, df_test_flat.pred)

print("{train, test, pred} = Unique token lemmata in the corresponding sets with an entity type that is not 'O'")
print()

# Set differences, printed as sorted lists.
for title, left, right in (
    ("train - test", train_entities, test_entities),
    ("train - pred", train_entities, pred_entities),
    ("test - train", test_entities, train_entities),
    ("pred - train", pred_entities, train_entities),
):
    print(title)
    print(sorted(left - right))
    print()

print("len(train) =", len(train_entities))
print("len(test) =", len(test_entities))
print("len(pred) =", len(pred_entities))
print()

# Pairwise overlap summaries.
for title, left, right in (
    ("VENN: train vs. test", train_entities, test_entities),
    ("VENN: train vs. pred", train_entities, pred_entities),
    ("VENN: test vs. pred", test_entities, pred_entities),
):
    print(title)
    poor_venn(left, right)
print()

print("How many of the unseen tokens were predicted?")
seen = test_entities & train_entities
unseen = test_entities - train_entities
unseen_hits = len(unseen & pred_entities)
seen_hits = len(seen & pred_entities)
print(f"Out of {len(unseen)} unseen tokens {unseen_hits} were predicted")
print(f"Out of {len(seen)} seen tokens {seen_hits} were predicted")
def _read_sequences(lines):
    """Group "token label" lines into per-sentence token and label lists.

    Parameters
    ----------
    lines
        Iterable of text lines. A line consisting of a single newline ends
        the current sentence; every other line must contain exactly two
        space-separated fields, the token and its entity type.

    Returns
    -------
    sequences, targets
        Two parallel lists with one list of tokens and one list of entity
        types per sentence.
    """
    sequences: List[List[str]] = []
    targets: List[List[str]] = []
    sequence: List[str] = []
    target: List[str] = []
    last_line = None

    for line in lines:
        last_line = line
        if line == "\n":
            sequences.append(sequence)
            targets.append(target)
            sequence, target = [], []
            continue

        fields = line.split(" ")
        if len(fields) != 2:
            # Report and skip the malformed line. Appending anyway would
            # either raise NameError (first line) or silently duplicate the
            # previous token and label.
            print(f"Something went wrong: {line}")
            continue

        token, entity_type = fields
        sequence.append(token)
        target.append(entity_type.strip("\n"))

    # An input that does not end with a blank line still has its last
    # sentence flushed. An empty input yields no sentences at all.
    if last_line is not None and last_line != "\n":
        sequences.append(sequence)
        targets.append(target)

    return sequences, targets


def main(argv=None):
    """Convert a token/label text file into a pickled pandas data frame.

    Parameters
    ----------
    argv
        Command-line arguments (``input`` and ``output`` paths). When None,
        ``sys.argv[1:]`` is used by argparse.
    """
    parser = argparse.ArgumentParser()
    # nargs="?" makes the declared defaults effective; without it argparse
    # treats both positionals as required and never uses the defaults.
    parser.add_argument("input", nargs="?", default="dataset.txt")
    parser.add_argument("output", nargs="?", default="dataset.pkl")
    args = parser.parse_args(argv)

    input_path = pathlib.Path(args.input)
    with input_path.open("r", encoding="utf-8") as f:
        sequences, targets = _read_sequences(f)

    df = pd.DataFrame({"token": sequences, "entity_type": targets})
    df.to_pickle(args.output)


if __name__ == "__main__":
    main()
4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM nvcr.io/nvidia/pytorch:21.06-py3 19 | 20 | ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/' 21 | ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/' 22 | ENV http_proxy='http://bbpproxy.epfl.ch:80/' 23 | ENV https_proxy='http://bbpproxy.epfl.ch:80/' 24 | 25 | # Update conda, install additional system packages 26 | RUN true \ 27 | && conda update conda \ 28 | && apt-get update \ 29 | && apt-get install -y gcc g++ build-essential vim libfontconfig1 30 | RUN conda install -c carta mysqlclient 31 | 32 | # Install Blue Brain Search -- revision can be a branch, sha, or tag 33 | ARG BBS_REVISION=v0.2.0 34 | ADD . 
/src 35 | WORKDIR /src 36 | RUN git checkout $BBS_REVISION 37 | # remove ruamel-yaml: https://github.com/pypa/pip/issues/5247#issuecomment-381550610 38 | RUN rm -rf /opt/conda/lib/python3.8/site-packages/ruamel* 39 | RUN pip install -r requirements.txt 40 | RUN pip install -r requirements-data_and_models.txt 41 | RUN pip install $PWD[data_and_models] 42 | # Force the version of the script training_transformers/train.py 43 | RUN pip install transformers==3.4.0 44 | 45 | 46 | EXPOSE 8888 47 | 48 | RUN groupadd -g 999 docker 49 | RUN useradd --create-home --uid 1000 --gid docker bbsuser 50 | 51 | WORKDIR /bbs 52 | RUN rm -rf /bbs/tmp 53 | ENTRYPOINT ["/bin/bash"] 54 | -------------------------------------------------------------------------------- /data_and_models/pipelines/sentence_embedding/params.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | tf_idf: 3 | init_kwargs: 4 | lowercase: True 5 | count: 6 | init_kwargs: 7 | lowercase: True 8 | 9 | eval: 10 | biobert_nli_sts_cord19_v1: 11 | class: SentTransformer 12 | init_kwargs: 13 | model_name_or_path: ../../models/sentence_embedding/biobert_nli_sts_cord19_v1/ 14 | biobert_nli_sts: 15 | class: SentTransformer 16 | init_kwargs: 17 | model_name_or_path: clagator/biobert_v1.1_pubmed_nli_sts 18 | tf_idf: 19 | class: SklearnVectorizer 20 | init_kwargs: 21 | checkpoint_path: ../../models/sentence_embedding/tf_idf/model.pkl 22 | count: 23 | class: SklearnVectorizer 24 | init_kwargs: 25 | checkpoint_path: ../../models/sentence_embedding/count/model.pkl 26 | sbert: 27 | class: SentTransformer 28 | init_kwargs: 29 | model_name_or_path: bert-base-nli-mean-tokens 30 | sbiobert: 31 | class: SentTransformer 32 | init_kwargs: 33 | model_name_or_path: gsarti/biobert-nli 34 | -------------------------------------------------------------------------------- /data_and_models/pipelines/sentence_embedding/training_transformers/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | /biosses_sentences.txt 19 | /sentences-filtered_11-527-877.txt 20 | -------------------------------------------------------------------------------- /data_and_models/pipelines/sentence_embedding/training_transformers/biosses_sentences.txt.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: c48471cf689ad28c0cbc20f7fa18f4d8 3 | size: 31828 4 | path: biosses_sentences.txt 5 | -------------------------------------------------------------------------------- /data_and_models/pipelines/sentence_embedding/training_transformers/sentences-filtered_11-527-877.txt.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 96aaec3358dc184bcb7015b07c4c893a 3 | size: 1655753737 4 | path: sentences-filtered_11-527-877.txt 5 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/.gitignore: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused 
on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | /raw1_2020-06-10_cord19_TestSet.jsonl 19 | /raw2_2020-06-29_cord19_Disease.jsonl 20 | /raw3_2020-06-30_cord19_Disease.jsonl 21 | /raw4_2020-07-02_cord19_ChemicalOrganism.jsonl 22 | /raw5_2020-07-08_cord19_Drug_TestSet.jsonl 23 | /raw6_2020-07-08_cord19_CelltypeProtein.jsonl 24 | /raw7_2020-09-01_cord19v35_CellCompartment.jsonl 25 | /raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl 26 | /raw9_2020-09-02_cord19v35_Pathway.jsonl 27 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw1_2020-06-10_cord19_TestSet.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 03ead15d2a2e4b5d25ddd973ad886c5d 3 | path: raw1_2020-06-10_cord19_TestSet.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw2_2020-06-29_cord19_Disease.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: e63e82e9fcfef81a3c2e1d1ebfc5a02e 3 | path: raw2_2020-06-29_cord19_Disease.jsonl 4 | -------------------------------------------------------------------------------- 
/data_and_models/raw_sentences/raw3_2020-06-30_cord19_Disease.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 81d203dd8f0e3461cb44caa15cb0bab4 3 | path: raw3_2020-06-30_cord19_Disease.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw4_2020-07-02_cord19_ChemicalOrganism.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: cc8d780d8b58521d21adb04502c9c269 3 | path: raw4_2020-07-02_cord19_ChemicalOrganism.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw5_2020-07-08_cord19_Drug_TestSet.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 998e96d170fc05117d978a588d0f07a8 3 | path: raw5_2020-07-08_cord19_Drug_TestSet.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw6_2020-07-08_cord19_CelltypeProtein.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 50bbcec563fd2d566783b11785253cb4 3 | path: raw6_2020-07-08_cord19_CelltypeProtein.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw7_2020-09-01_cord19v35_CellCompartment.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 6263f835ab71cad54f6f55c830b7879e 3 | path: raw7_2020-09-01_cord19v35_CellCompartment.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 470209cd6fa7df1c7c8be0016d3926e7 3 | 
path: raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl 4 | -------------------------------------------------------------------------------- /data_and_models/raw_sentences/raw9_2020-09-02_cord19v35_Pathway.jsonl.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: efb28f1a3f082a908abc8adb587b4b0c 3 | path: raw9_2020-09-02_cord19v35_Pathway.jsonl 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | version: "3.5" 19 | services: 20 | 21 | base: 22 | build: 23 | context: . 24 | dockerfile: docker/base.Dockerfile 25 | args: 26 | - BBS_HTTP_PROXY 27 | - BBS_HTTPS_PROXY 28 | - BBS_http_proxy 29 | - BBS_https_proxy 30 | - BBS_USERS 31 | image: bbs-base 32 | container_name: bbs-base 33 | networks: 34 | - bbs_network 35 | 36 | search: 37 | build: 38 | context: . 
39 | dockerfile: docker/search.Dockerfile 40 | image: bbs_search 41 | container_name: bbs_search 42 | env_file: .env 43 | ports: 44 | - 8850:8080 45 | volumes: 46 | - /raid:/raid 47 | networks: 48 | - bbs_network 49 | 50 | embedding: 51 | build: 52 | context: . 53 | dockerfile: docker/embedding.Dockerfile 54 | image: bbs_embedding 55 | container_name: bbs_embedding 56 | env_file: .env 57 | ports: 58 | - 8851:8080 59 | volumes: 60 | - /raid:/raid 61 | networks: 62 | - bbs_network 63 | 64 | mining: 65 | build: 66 | context: . 67 | dockerfile: docker/mining.Dockerfile 68 | image: bbs_mining 69 | container_name: bbs_mining 70 | env_file: .env 71 | ports: 72 | - 8852:8080 73 | volumes: 74 | - /raid:/raid 75 | networks: 76 | - bbs_network 77 | 78 | networks: 79 | bbs_network: 80 | driver: bridge 81 | -------------------------------------------------------------------------------- /docker/corenlp.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | FROM ubuntu:latest 19 | 20 | LABEL maintainer="Stanislav Schmidt " 21 | LABEL version="1.0" 22 | LABEL description="CoreNLP Server" 23 | 24 | # ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/' 25 | # ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/' 26 | # ENV http_proxy='http://bbpproxy.epfl.ch:80/' 27 | # ENV https_proxy='http://bbpproxy.epfl.ch:80/' 28 | 29 | # Install Java (default-jre), unzip, and wget 30 | RUN apt-get update && apt-get install -y \ 31 | default-jre \ 32 | unzip \ 33 | wget 34 | 35 | # Download and install CoreNLP 4.0.0 (2020-04-19) 36 | # See https://stanfordnlp.github.io/CoreNLP/history.html 37 | # COPY corenlp_download.zip . 38 | RUN true \ 39 | && export CORENLP_VERSION=4.0.0 \ 40 | && URL=http://nlp.stanford.edu/software/stanford-corenlp-${CORENLP_VERSION}.zip \ 41 | && wget -q --show-progress --progress=bar:force -O corenlp_download.zip $URL 2>&1 \ 42 | && unzip -q -j corenlp_download.zip -d /corenlp \ 43 | && rm corenlp_download.zip 44 | 45 | 46 | # Add a user 47 | RUN useradd corenlpuser 48 | WORKDIR /corenlp 49 | USER corenlpuser 50 | 51 | # Expose a port 52 | EXPOSE 9000 53 | 54 | ENTRYPOINT exec java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -annotators "tokenize,ssplit,pos,depparse" 55 | 56 | -------------------------------------------------------------------------------- /docker/embedding.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM bbs-base 19 | 20 | USER root 21 | 22 | # Install the app 23 | ADD . /src 24 | WORKDIR /src 25 | RUN pip install . 26 | RUN pip install gunicorn 27 | 28 | # Set image version 29 | LABEL maintainer="BBP-EPFL Machine Learning team " 30 | LABEL description="REST API Server for Text Embeddings" 31 | 32 | # Add a user 33 | RUN useradd --create-home serveruser 34 | WORKDIR /home/serveruser 35 | USER serveruser 36 | 37 | # Run the entry point 38 | EXPOSE 8080 39 | ENTRYPOINT [\ 40 | "gunicorn", \ 41 | "--bind", "0.0.0.0:8080", \ 42 | "--workers", "1", \ 43 | "--timeout", "180", \ 44 | "bluesearch.entrypoint.embedding_server:get_embedding_app()"] 45 | -------------------------------------------------------------------------------- /docker/grobid_quantities.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 
14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM ubuntu:latest 19 | 20 | LABEL maintainer="Stanislav Schmidt " 21 | LABEL version="1.0" 22 | LABEL description="GROBID Quantities Server" 23 | 24 | # ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/' 25 | # ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/' 26 | # ENV http_proxy='http://bbpproxy.epfl.ch:80/' 27 | # ENV https_proxy='http://bbpproxy.epfl.ch:80/' 28 | 29 | 30 | # Install java, git, unzip and wget 31 | RUN apt-get update && apt-get install -y \ 32 | default-jre \ 33 | git \ 34 | unzip \ 35 | wget 36 | 37 | # Add a user 38 | RUN useradd --create-home grobiduser 39 | WORKDIR /home/grobiduser 40 | USER grobiduser 41 | 42 | # Download and install GROBID 43 | RUN true \ 44 | && git clone --depth=1 https://github.com/kermitt2/grobid.git grobid \ 45 | && cd grobid \ 46 | # && echo "systemProp.https.proxyHost=bbpproxy.epfl.ch" >> gradle.properties \ 47 | && ./gradlew clean install 48 | 49 | # Download and install GROBID Quantities 50 | RUN true \ 51 | && git clone --depth=1 https://github.com/kermitt2/grobid-quantities.git grobid/grobid-quantities \ 52 | && cd grobid/grobid-quantities/ \ 53 | # && echo "\nsystemProp.https.proxyHost=bbpproxy.epfl.ch" >> gradle.properties \ 54 | && ./gradlew copyModels \ 55 | && ./gradlew clean install 56 | 57 | # Expose a port and set working directory 58 | EXPOSE 8060 59 | WORKDIR /home/grobiduser/grobid/grobid-quantities 60 | 61 | ENTRYPOINT exec java -jar $(find build/libs -name "grobid-*onejar.jar") server resources/config/config.yml 62 | # ENTRYPOINT exec java -jar build/libs/grobid-quantities-0.6.1-SNAPSHOT-onejar.jar server resources/config/config.yml 63 | 64 | -------------------------------------------------------------------------------- /docker/mining.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining 
toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM bbs-base 19 | 20 | USER root 21 | 22 | # Install the app 23 | ADD . /src 24 | WORKDIR /src 25 | RUN pip install -e .[data_and_models] 26 | 27 | # Set image version 28 | LABEL maintainer="BBP-EPFL Machine Learning team " 29 | LABEL description="REST API Server for Test Mining" 30 | 31 | RUN chmod -R a+rwX /src 32 | 33 | # Run the entry point 34 | EXPOSE 8080 35 | ENV BBS_DATA_AND_MODELS_DIR="/src/data_and_models" 36 | ENTRYPOINT ["/src/docker/mining.sh"] 37 | -------------------------------------------------------------------------------- /docker/mining.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 
11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | 20 | source /src/docker/utils.sh 21 | # If $BBS_SSH_USERNAME is empty then this is a no-op 22 | dvc_configure_ssh_remote_authentication "$BBS_SSH_USERNAME" 23 | # Not usable in README as it works only when inside the `bbs_` containers. 24 | # If $BBS_DATA_AND_MODELS_DIR is empty then this will fail 25 | dvc_pull_models "$BBS_DATA_AND_MODELS_DIR" 26 | 27 | # Launch mining server 28 | pip install gunicorn 29 | gunicorn --bind 0.0.0.0:8080 --workers 1 --timeout 7200 'bluesearch.entrypoint.mining_server:get_mining_app()' 30 | -------------------------------------------------------------------------------- /docker/mining_cache.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | 18 | FROM bbs-base 19 | 20 | USER root 21 | 22 | # Install the app 23 | ADD . /src 24 | WORKDIR /src 25 | RUN pip install -e .[data_and_models] 26 | 27 | # Set image version 28 | LABEL maintainer="BBP-EPFL Machine Learning team " 29 | LABEL description="Creation of a Mining Cache for the Mining Server" 30 | 31 | 32 | RUN chmod -R a+rwX /src 33 | 34 | # Run the entry point 35 | ENV DATA_DIR="/src/data_and_models" 36 | ENTRYPOINT ["/src/docker/mining_cache.sh"] 37 | -------------------------------------------------------------------------------- /docker/mining_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | 20 | source /src/docker/utils.sh 21 | dvc_configure_ssh_remote_authentication "$BBS_SSH_USERNAME" 22 | dvc_pull_models "$DATA_DIR" 23 | 24 | # Launch mining cache creation, using arguments only if defined 25 | create_mining_cache \ 26 | $([ -n "$BBS_MINING_CACHE_TARGET_TABLE" ] && echo "--target-table-name $BBS_MINING_CACHE_TARGET_TABLE") \ 27 | $([ -n "$BBS_MINING_CACHE_PROCESSORS_PER_MODEL" ] && echo "--n-processes-per-model $BBS_MINING_CACHE_PROCESSORS_PER_MODEL") \ 28 | $([ -n "$BBS_MINING_CACHE_LOG_FILE" ] && echo "--log-file $BBS_MINING_CACHE_LOG_FILE") \ 29 | $([ -n "$BBS_MINING_CACHE_LOG_LEVEL" ] && echo "--log-level $BBS_MINING_CACHE_LOG_LEVEL") 30 | -------------------------------------------------------------------------------- /docker/mysql-make-backup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DB_NAME="$1" 4 | FILENAME="/backup/$(date +%Y%m%d-%H%M%S)-${DB_NAME}.sql" 5 | 6 | # Check MySQL credentials are in the environment 7 | if [ -z "$MYSQL_USER" ] || [ -z "$MYSQL_PWD" ] 8 | then 9 | echo "The variables MYSQL_USER and MYSQL_PWD need to be in the environment" 10 | exit 1 11 | fi 12 | 13 | # Check argument 14 | if [ -z "$DB_NAME" ] 15 | then 16 | echo "Usage: $(basename $0) DB-NAME" 17 | exit 1 18 | fi 19 | 20 | # Test credentials 21 | mysql -u "$MYSQL_USER" -e "quit" >& /dev/null 22 | if [ "$?" -ne "0" ] 23 | then 24 | echo "Access to database denied. Wrong credentials?" 25 | exit 1 26 | fi 27 | 28 | # Test if database exists 29 | mysql -u "$MYSQL_USER" -e "use $DB_NAME" >& /dev/null 30 | if [ "$?" -ne "0" ] 31 | then 32 | echo "Database ${DB_NAME} doesn't exist" 33 | exit 1 34 | fi 35 | 36 | # Make backup 37 | echo "Saving back-up to $FILENAME" 38 | /usr/bin/mysqldump -u "$MYSQL_USER" "$DB_NAME" -RE --triggers > "${FILENAME}" 39 | echo "Back-up finished." 
40 | 41 | -------------------------------------------------------------------------------- /docker/mysql.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM mysql:8 19 | 20 | # ENV HTTP_PROXY="http://bbpproxy.epfl.ch:80" 21 | # ENV HTTPS_PROXY="http://bbpproxy.epfl.ch:80" 22 | # ENV http_proxy="http://bbpproxy.epfl.ch:80" 23 | # ENV https_proxy="http://bbpproxy.epfl.ch:80" 24 | 25 | # Set timezone for correct timestamp, install useful packages 26 | ENV TZ="Europe/Zurich" 27 | RUN \ 28 | apt-get update &&\ 29 | apt-get install -y --no-install-recommends man vim less procps &&\ 30 | apt-get clean &&\ 31 | rm -r /var/lib/apt/lists/* 32 | 33 | # Limit incremental binary log to 7 days. 
This is a system variable and has to 34 | # go in the [mysqld] section, which is in docker.cnf 35 | # Accordingly it would make sense to do file dumps every 7 days 36 | RUN echo "binlog_expire_logs_seconds = 604800" >> /etc/mysql/conf.d/docker.cnf 37 | 38 | # Install the backup script 39 | COPY docker/mysql-make-backup /usr/local/bin/make-backup 40 | RUN chmod +x /usr/local/bin/make-backup 41 | 42 | # Pre-initialize the docker volume for the backup 43 | VOLUME ["/backup"] 44 | -------------------------------------------------------------------------------- /docker/search.Dockerfile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | FROM bbs-base 19 | 20 | USER root 21 | 22 | # Install the app 23 | ADD . /src 24 | WORKDIR /src 25 | RUN pip install . 26 | RUN pip install gunicorn 27 | 28 | # Set image version 29 | LABEL maintainer="BBP-EPFL Machine Learning team " 30 | LABEL description="REST API Server for Blue Brain Search" 31 | 32 | # Add a user 33 | RUN useradd --create-home serveruser 34 | WORKDIR /home/serveruser 35 | USER serveruser 36 | 37 | # Run the entry point 38 | # Note the "timeout" parameter. 
That's to let the server initialisation finish before 39 | # gunicorn decides that the worker is not responsive and restarts it again. 40 | # Might think about a better solution in the future... (initialize in a separate thread?) 41 | EXPOSE 8080 42 | ENTRYPOINT [\ 43 | "gunicorn", \ 44 | "--bind", "0.0.0.0:8080", \ 45 | "--workers", "1", \ 46 | "--timeout", "7200", \ 47 | "bluesearch.entrypoint.search_server:get_search_app()"] 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | 18 | 19 | # Minimal makefile for Sphinx documentation 20 | # 21 | # You can set these variables from the command line, and also 22 | # from the environment for the first two. 23 | SPHINXOPTS ?= 24 | SPHINXBUILD ?= sphinx-build 25 | SOURCEDIR = . 26 | BUILDDIR = _build 27 | 28 | # Put it first so that "make" without argument is like "make help". 
29 | help: 30 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 31 | 32 | .PHONY: help Makefile 33 | 34 | # Catch-all target: route all unknown targets to Sphinx using the new 35 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 36 | %: Makefile 37 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 38 | -------------------------------------------------------------------------------- /docs/_static/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/docs/_static/.keep -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | 20 | import os 21 | import sys 22 | 23 | import bluesearch 24 | 25 | sys.path.insert(0, os.path.abspath(".")) 26 | 27 | # -- Project information ----------------------------------------------------- 28 | 29 | project = "Blue Brain Search" 30 | author = "Blue Brain Project" 31 | version = bluesearch.__version__ 32 | 33 | # -- General configuration --------------------------------------------------- 34 | suppress_warnings = ["ref.ref"] # because of luigi.util.requires 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | "sphinx.ext.mathjax", 41 | "sphinx.ext.autodoc", 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.napoleon", 44 | "sphinx.ext.viewcode", 45 | ] 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ["_templates"] 49 | 50 | # List of patterns, relative to source directory, that match files and 51 | # directories to ignore when looking for source files. 52 | # This pattern also affects html_static_path and html_extra_path. 53 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 54 | 55 | # -- Options for HTML output ------------------------------------------------- 56 | 57 | # The theme to use for HTML and HTML Help pages. See the documentation for 58 | # a list of builtin themes. 59 | html_theme = "sphinx-bluebrain-theme" 60 | html_title = "Blue Brain Search" 61 | html_theme_options = { 62 | "metadata_distribution": "bluesearch", 63 | "repo_name": "bluesearch", 64 | "repo_url": "https://github.com/BlueBrain/Search", 65 | } 66 | # If true, the reST sources are included in the HTML build as _sources/name. 67 | html_copy_source = False 68 | # If true (and html_copy_source is true as well), links to the reST sources 69 | # will be added to the sidebar. 
70 | html_show_sourcelink = False 71 | # Add any paths that contain custom static files (such as style sheets) here, 72 | # relative to this directory. They are copied after the builtin static files, 73 | # so a file named "default.css" will overwrite the builtin "default.css". 74 | html_static_path = ["_static"] 75 | # A boolean that decides whether module names are prepended to all object names 76 | # (for object types where a “module” of some kind is defined), e.g. for 77 | # py:function directives. 78 | add_module_names = False 79 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | Copyright (C) 2020 Blue Brain Project, EPFL. 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU Lesser General Public License for more details. 11 | You should have received a copy of the GNU Lesser General Public License 12 | along with this program. If not, see . 13 | 14 | 15 | Welcome to Blue Brain Search's documentation! 16 | ============================================= 17 | 18 | .. toctree:: 19 | :maxdepth: 2 20 | :caption: Contents: 21 | 22 | source/instructions 23 | source/server 24 | source/entrypoint 25 | source/faq 26 | source/example 27 | 28 | .. toctree:: 29 | :maxdepth: 2 30 | :caption: API: 31 | 32 | source/api/bluesearch 33 | 34 | .. 
toctree:: 35 | :maxdepth: 1 36 | :caption: Release History 37 | 38 | source/whatsnew 39 | 40 | 41 | 42 | Indices and tables 43 | ================== 44 | 45 | * :ref:`genindex` 46 | * :ref:`modindex` 47 | * :ref:`search` 48 | -------------------------------------------------------------------------------- /docs/source/_substitutions.rst: -------------------------------------------------------------------------------- 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | Copyright (C) 2020 Blue Brain Project, EPFL. 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU Lesser General Public License for more details. 11 | You should have received a copy of the GNU Lesser General Public License 12 | along with this program. If not, see . 13 | 14 | .. 15 | This file defines rst substitutions. 16 | 17 | .. role:: raw-html(raw) 18 | :format: html 19 | 20 | .. role:: raw-latex(raw) 21 | :format: latex 22 | 23 | .. |Add| replace:: :raw-html:`Add` 24 | 25 | .. |Fix| replace:: :raw-html:`Fix` 26 | 27 | .. |Change| replace:: :raw-html:`Change` 28 | 29 | .. |Deprecate| replace:: :raw-html:`Deprecate` 30 | 31 | .. |Remove| replace:: :raw-html:`Remove` 32 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.article.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.article module 2 | ================================== 3 | 4 | .. 
automodule:: bluesearch.database.article 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.cord_19.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.cord\_19 module 2 | =================================== 3 | 4 | .. automodule:: bluesearch.database.cord_19 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.download.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.download module 2 | =================================== 3 | 4 | .. automodule:: bluesearch.database.download 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.mesh.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.mesh module 2 | =============================== 3 | 4 | .. automodule:: bluesearch.database.mesh 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.mining_cache.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.mining\_cache module 2 | ======================================== 3 | 4 | .. 
automodule:: bluesearch.database.mining_cache 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.pdf.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.pdf module 2 | ============================== 3 | 4 | .. automodule:: bluesearch.database.pdf 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database package 2 | =========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.database.article 11 | bluesearch.database.cord_19 12 | bluesearch.database.download 13 | bluesearch.database.mesh 14 | bluesearch.database.mining_cache 15 | bluesearch.database.pdf 16 | bluesearch.database.topic 17 | bluesearch.database.topic_info 18 | bluesearch.database.topic_rule 19 | 20 | Module contents 21 | --------------- 22 | 23 | .. automodule:: bluesearch.database 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.topic.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.topic module 2 | ================================ 3 | 4 | .. 
automodule:: bluesearch.database.topic 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.topic_info.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.topic\_info module 2 | ====================================== 3 | 4 | .. automodule:: bluesearch.database.topic_info 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.database.topic_rule.rst: -------------------------------------------------------------------------------- 1 | bluesearch.database.topic\_rule module 2 | ====================================== 3 | 4 | .. automodule:: bluesearch.database.topic_rule 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.embedding_models.rst: -------------------------------------------------------------------------------- 1 | bluesearch.embedding\_models module 2 | =================================== 3 | 4 | .. automodule:: bluesearch.embedding_models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.create_database.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.create\_database module 2 | ============================================= 3 | 4 | .. 
automodule:: bluesearch.entrypoint.create_database 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.add.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.add module 2 | ========================================= 3 | 4 | .. automodule:: bluesearch.entrypoint.database.add 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.add_es.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.add\_es module 2 | ============================================= 3 | 4 | .. automodule:: bluesearch.entrypoint.database.add_es 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.convert_pdf.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.convert\_pdf module 2 | ================================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.database.convert_pdf 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.download.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.download module 2 | ============================================== 3 | 4 | .. 
automodule:: bluesearch.entrypoint.database.download 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.init.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.init module 2 | ========================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.database.init 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.parent.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.parent module 2 | ============================================ 3 | 4 | .. automodule:: bluesearch.entrypoint.database.parent 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.parse.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.parse module 2 | =========================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.database.parse 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.parse_mesh_rdf.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.parse\_mesh\_rdf module 2 | ====================================================== 3 | 4 | .. 
automodule:: bluesearch.entrypoint.database.parse_mesh_rdf 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database package 2 | ====================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.entrypoint.database.add 11 | bluesearch.entrypoint.database.add_es 12 | bluesearch.entrypoint.database.convert_pdf 13 | bluesearch.entrypoint.database.download 14 | bluesearch.entrypoint.database.init 15 | bluesearch.entrypoint.database.parent 16 | bluesearch.entrypoint.database.parse 17 | bluesearch.entrypoint.database.parse_mesh_rdf 18 | bluesearch.entrypoint.database.run 19 | bluesearch.entrypoint.database.schemas 20 | bluesearch.entrypoint.database.topic_extract 21 | bluesearch.entrypoint.database.topic_filter 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: bluesearch.entrypoint.database 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.run.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.run module 2 | ========================================= 3 | 4 | .. automodule:: bluesearch.entrypoint.database.run 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.schemas.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.schemas module 2 | ============================================= 3 | 4 | .. 
automodule:: bluesearch.entrypoint.database.schemas 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.topic_extract.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.topic\_extract module 2 | ==================================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.database.topic_extract 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.database.topic_filter.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.database.topic\_filter module 2 | =================================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.database.topic_filter 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.embedding_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.embedding\_server module 2 | ============================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.embedding_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.embeddings.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.embeddings module 2 | ======================================= 3 | 4 | .. 
automodule:: bluesearch.entrypoint.embeddings 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.mining_cache.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.mining\_cache module 2 | ========================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.mining_cache 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.mining_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.mining\_server module 2 | =========================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.mining_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint package 2 | ============================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.entrypoint.database 11 | 12 | Submodules 13 | ---------- 14 | 15 | .. toctree:: 16 | :maxdepth: 4 17 | 18 | bluesearch.entrypoint.create_database 19 | bluesearch.entrypoint.embedding_server 20 | bluesearch.entrypoint.embeddings 21 | bluesearch.entrypoint.mining_cache 22 | bluesearch.entrypoint.mining_server 23 | bluesearch.entrypoint.search_server 24 | 25 | Module contents 26 | --------------- 27 | 28 | .. 
automodule:: bluesearch.entrypoint 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.entrypoint.search_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.entrypoint.search\_server module 2 | =========================================== 3 | 4 | .. automodule:: bluesearch.entrypoint.search_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.k8s.connect.rst: -------------------------------------------------------------------------------- 1 | bluesearch.k8s.connect module 2 | ============================= 3 | 4 | .. automodule:: bluesearch.k8s.connect 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.k8s.create_indices.rst: -------------------------------------------------------------------------------- 1 | bluesearch.k8s.create\_indices module 2 | ===================================== 3 | 4 | .. automodule:: bluesearch.k8s.create_indices 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.k8s.rst: -------------------------------------------------------------------------------- 1 | bluesearch.k8s package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.k8s.connect 11 | bluesearch.k8s.create_indices 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. 
automodule:: bluesearch.k8s 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.attribute.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining.attribute module 2 | ================================== 3 | 4 | .. automodule:: bluesearch.mining.attribute 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.entity.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining.entity module 2 | =============================== 3 | 4 | .. automodule:: bluesearch.mining.entity 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.eval.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining.eval module 2 | ============================= 3 | 4 | .. automodule:: bluesearch.mining.eval 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.pipeline.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining.pipeline module 2 | ================================= 3 | 4 | .. automodule:: bluesearch.mining.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.relation.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining.relation module 2 | ================================= 3 | 4 | .. 
automodule:: bluesearch.mining.relation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.mining.rst: -------------------------------------------------------------------------------- 1 | bluesearch.mining package 2 | ========================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.mining.attribute 11 | bluesearch.mining.entity 12 | bluesearch.mining.eval 13 | bluesearch.mining.pipeline 14 | bluesearch.mining.relation 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: bluesearch.mining 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.rst: -------------------------------------------------------------------------------- 1 | bluesearch package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.database 11 | bluesearch.entrypoint 12 | bluesearch.k8s 13 | bluesearch.mining 14 | bluesearch.server 15 | bluesearch.widgets 16 | 17 | Submodules 18 | ---------- 19 | 20 | .. toctree:: 21 | :maxdepth: 4 22 | 23 | bluesearch.embedding_models 24 | bluesearch.search 25 | bluesearch.sql 26 | bluesearch.utils 27 | 28 | Module contents 29 | --------------- 30 | 31 | .. automodule:: bluesearch 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.search.rst: -------------------------------------------------------------------------------- 1 | bluesearch.search module 2 | ======================== 3 | 4 | .. 
automodule:: bluesearch.search 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.server.embedding_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.server.embedding\_server module 2 | ========================================== 3 | 4 | .. automodule:: bluesearch.server.embedding_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.server.invalid_usage_exception.rst: -------------------------------------------------------------------------------- 1 | bluesearch.server.invalid\_usage\_exception module 2 | ================================================== 3 | 4 | .. automodule:: bluesearch.server.invalid_usage_exception 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.server.mining_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.server.mining\_server module 2 | ======================================= 3 | 4 | .. automodule:: bluesearch.server.mining_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.server package 2 | ========================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.server.embedding_server 11 | bluesearch.server.invalid_usage_exception 12 | bluesearch.server.mining_server 13 | bluesearch.server.search_server 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: bluesearch.server 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.server.search_server.rst: -------------------------------------------------------------------------------- 1 | bluesearch.server.search\_server module 2 | ======================================= 3 | 4 | .. automodule:: bluesearch.server.search_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.sql.rst: -------------------------------------------------------------------------------- 1 | bluesearch.sql module 2 | ===================== 3 | 4 | .. automodule:: bluesearch.sql 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.utils.rst: -------------------------------------------------------------------------------- 1 | bluesearch.utils module 2 | ======================= 3 | 4 | .. automodule:: bluesearch.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.widgets.article_saver.rst: -------------------------------------------------------------------------------- 1 | bluesearch.widgets.article\_saver module 2 | ======================================== 3 | 4 | .. automodule:: bluesearch.widgets.article_saver 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.widgets.mining_schema.rst: -------------------------------------------------------------------------------- 1 | bluesearch.widgets.mining\_schema module 2 | ======================================== 3 | 4 | .. 
automodule:: bluesearch.widgets.mining_schema 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.widgets.mining_widget.rst: -------------------------------------------------------------------------------- 1 | bluesearch.widgets.mining\_widget module 2 | ======================================== 3 | 4 | .. automodule:: bluesearch.widgets.mining_widget 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.widgets.rst: -------------------------------------------------------------------------------- 1 | bluesearch.widgets package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | bluesearch.widgets.article_saver 11 | bluesearch.widgets.mining_schema 12 | bluesearch.widgets.mining_widget 13 | bluesearch.widgets.search_widget 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: bluesearch.widgets 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/api/bluesearch.widgets.search_widget.rst: -------------------------------------------------------------------------------- 1 | bluesearch.widgets.search\_widget module 2 | ======================================== 3 | 4 | .. automodule:: bluesearch.widgets.search_widget 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/example.rst: -------------------------------------------------------------------------------- 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | Copyright (C) 2020 Blue Brain Project, EPFL. 
3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU Lesser General Public License for more details. 11 | You should have received a copy of the GNU Lesser General Public License 12 | along with this program. If not, see <https://www.gnu.org/licenses/>. 13 | 14 | Example section 15 | =============== 16 | 17 | .. testcode:: 18 | 19 | print('Hello world') 20 | 21 | .. testoutput:: 22 | 23 | Hello world 24 | -------------------------------------------------------------------------------- /docs/source/faq.rst: -------------------------------------------------------------------------------- 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | Copyright (C) 2020 Blue Brain Project, EPFL. 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU Lesser General Public License for more details. 11 | You should have received a copy of the GNU Lesser General Public License 12 | along with this program. If not, see <https://www.gnu.org/licenses/>. 13 | 14 | FAQ 15 | === 16 | 17 | This section describes how to handle common issues. 
18 | 19 | 20 | MySQL encoding issue 21 | --------------------- 22 | 23 | When interacting in Python with the MySQL database, using SQLAlchemy and the 24 | MySQL driver :code:`mysqldb`, one might run into the following error when 25 | retrieving columns with text: 26 | 27 | .. code-block:: text 28 | 29 | UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 239: 30 | character maps to <undefined> 31 | 32 | The solution is to append :code:`?charset=utf8mb4` to the database URL. 33 | 34 | So, if the database URL was: 35 | 36 | .. code-block:: python 37 | 38 | f"mysql+mysqldb://{username}:{password}@{host}:{port}/{database}" 39 | 40 | then the new URL would be: 41 | 42 | .. code-block:: python 43 | 44 | f"mysql+mysqldb://{username}:{password}@{host}:{port}/{database}?charset=utf8mb4" 45 | 46 | The database URL is what is passed as the first argument to create the engine: 47 | 48 | .. code-block:: python 49 | 50 | import sqlalchemy 51 | 52 | engine = sqlalchemy.create_engine(f"{dialect}+{driver}://{username}:{password}@{host}:{port}/{database}") 53 | 54 | 55 | DVC dataclasses issue 56 | ---------------------- 57 | 58 | When in a Python 3.7+ environment the package :code:`dataclasses` is installed, 59 | one might run into the following error when doing :code:`dvc pull`: 60 | 61 | .. code-block:: bash 62 | 63 | AttributeError: module 'typing' has no attribute '_ClassVar' 64 | 65 | The solution is to uninstall the package :code:`dataclasses`: 66 | 67 | .. code-block:: bash 68 | 69 | pip uninstall dataclasses 70 | 71 | 72 | DVC pull issue 73 | -------------- 74 | 75 | When launching mining_cache or mining_server entrypoints or even simply 76 | :code:`dvc pull`, one might run into the following error: 77 | 78 | .. code-block:: text 79 | 80 | WARNING: Some of the cache files do not exist neither locally nor on remote. 
81 | Missing cache files: 82 | 83 | In this case, the solution is to go to the :code:`.dvc` directory 84 | and remove the file called :code:`config.local`: 85 | 86 | .. code-block:: bash 87 | 88 | cd .dvc 89 | rm config.local 90 | 91 | Doing :code:`dvc pull` again should work fine after this. 92 | -------------------------------------------------------------------------------- /docs/source/instructions.rst: -------------------------------------------------------------------------------- 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | Copyright (C) 2020 Blue Brain Project, EPFL. 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU Lesser General Public License for more details. 11 | You should have received a copy of the GNU Lesser General Public License 12 | along with this program. If not, see <https://www.gnu.org/licenses/>. 13 | 14 | Instructions 15 | ============ 16 | 17 | Installation 18 | ------------ 19 | Before installation, please make sure you have a recent :code:`pip` installed (:code:`>=19.1`). 20 | 21 | Then you can easily install :code:`bluesearch` from PyPI: 22 | 23 | .. code-block:: bash 24 | 25 | pip install bluesearch[data_and_models] 26 | 27 | You can also build from source if you prefer: 28 | 29 | .. code-block:: bash 30 | 31 | pip install .[data_and_models] # use -e for editable install 32 | 33 | NB: The optional dependencies installed with the :code:`[data_and_models]` 34 | option are only necessary if you want to execute training or inference using the 35 | :code:`dvc` and the model and scripts contained under :code:`data_and_models/`. 
If this is not 36 | the case, you can skip the :code:`[data_and_models]` at the end of :code:`pip install`. 37 | 38 | 39 | Generating docs 40 | --------------- 41 | All the versions of our documentation, both stable and latest, 42 | `can be found on Read the Docs <https://blue-brain-search.readthedocs.io/>`_. 43 | 44 | 45 | To generate the documentation manually, we use :code:`sphinx` with a custom BBP theme. 46 | Make sure to install the :code:`bluesearch` package with :code:`dev` extras to get 47 | the necessary dependencies. 48 | 49 | .. code-block:: bash 50 | 51 | pip install -e .[dev] 52 | 53 | To generate autodoc directives one can run 54 | 55 | .. code-block:: bash 56 | 57 | cd docs 58 | sphinx-apidoc -o source/api/ -f -e ../src/bluesearch/ ../src/bluesearch/entrypoint/* 59 | 60 | Note that it only needs to be rerun when there are new subpackages/modules. 61 | 62 | To generate the documentation run 63 | 64 | .. code-block:: bash 65 | 66 | cd docs 67 | make clean && make html 68 | 69 | 70 | Finally, one can also run doctests 71 | 72 | .. code-block:: bash 73 | 74 | cd docs 75 | make doctest 76 | -------------------------------------------------------------------------------- /docs/source/logo/BlueBrainSearch_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/docs/source/logo/BlueBrainSearch_banner.jpg -------------------------------------------------------------------------------- /luigi.cfg: -------------------------------------------------------------------------------- 1 | ;Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | ; 3 | ;Copyright (C) 2020 Blue Brain Project, EPFL. 
4 | ; 5 | ;This program is free software: you can redistribute it and/or modify 6 | ;it under the terms of the GNU Lesser General Public License as published by 7 | ;the Free Software Foundation, either version 3 of the License, or 8 | ;(at your option) any later version. 9 | ; 10 | ;This program is distributed in the hope that it will be useful, 11 | ;but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | ;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | ;GNU Lesser General Public License for more details. 14 | ; 15 | ;You should have received a copy of the GNU Lesser General Public License 16 | ;along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | [core] 19 | autoload_range = true 20 | log_level = INFO 21 | local_scheduler = True 22 | 23 | [GlobalParams] 24 | source = pubmed 25 | 26 | [DownloadTask] 27 | from_month = 2021-12 28 | to_month = 2022-02 29 | output_dir = luigi-pipeline 30 | identifier = 31 | ; empty string is considered default value 32 | 33 | [TopicExtractTask] 34 | mesh_topic_db = luigi-pipeline/mesh_topic_db.json 35 | 36 | [TopicFilterTask] 37 | filter_config = luigi-pipeline/filter-config.jsonl 38 | 39 | [ConvertPDFTask] 40 | grobid_host = 0.0.0.0 41 | grobid_port = 8070 42 | 43 | [AddTask] 44 | db_url = luigi-pipeline/my-db.db 45 | db_type = sqlite 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | [build-system] 19 | requires = [ 20 | "pip>=9", 21 | "setuptools>=45", 22 | "setuptools_scm[toml]>=3.4", 23 | "wheel", 24 | ] 25 | # pip's default if the build-backend key is missing is "setuptools.build_meta:__legacy__" 26 | # Ref: https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#fallback-behaviour 27 | # Tox with tox.isolated_build = true needs this key to be defined explicitly. 28 | # Setuptools instructs setting build-backend without __legacy__, ref: 29 | # https://setuptools.pypa.io/en/latest/build_meta.html#how-to-use-it 30 | build-backend = "setuptools.build_meta" 31 | 32 | [tool.black] 33 | extend-exclude = """data_and_models/pipelines/ner/transformers_vs_spacy/transformers/ 34 | |data_and_models/pipelines/sentence_embedding/training_transformers/""" 35 | -------------------------------------------------------------------------------- /requirements-data_and_models.txt: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 
9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 17 | 18 | PyYAML==5.4.1 19 | dvc[ssh]==2.5.4 20 | matplotlib==3.4.2 21 | scipy==1.7.0 22 | spacy_lookups_data==1.0.2 23 | srsly==2.4.1 24 | transformers==4.6.1 25 | typer==0.3.2 26 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 
17 | 18 | Sphinx==4.1.1 19 | docker==5.0.0 20 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz 21 | pytest-benchmark==3.4.1 22 | pytest-cov==2.12.1 23 | pytest==6.2.4 24 | responses==0.19.0 25 | sphinx-bluebrain-theme==0.2.4 26 | tox==3.24.0 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 
17 | 18 | Flask==2.0.1 19 | SQLAlchemy[mysql,pymysql]==1.4.21 20 | boto3==1.20.16 21 | catalogue==2.0.4 22 | cryptography==3.4.7 23 | defusedxml==0.6.0 24 | google-cloud-storage==1.43.0 25 | h5py==3.3.0 26 | ipython==7.31.1 27 | ipywidgets==7.6.3 28 | jupyterlab==3.0.17 29 | langdetect==1.0.9 30 | luigi==3.0.3 31 | mashumaro==3.0 32 | numpy==1.21.0 33 | pandas==1.3.0 34 | pg8000==1.23.0 35 | python-dotenv==0.18.0 36 | requests==2.26.0 37 | scikit-learn==0.24.2 38 | sentence-transformers==2.0.0 39 | spacy==3.0.7 40 | spacy-transformers==1.0.3 41 | torch==1.9.0 42 | elasticsearch==8.3.3 -------------------------------------------------------------------------------- /screenshots/mining_widget_articles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/mining_widget_articles.png -------------------------------------------------------------------------------- /screenshots/mining_widget_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/mining_widget_text.png -------------------------------------------------------------------------------- /screenshots/search_widget.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/search_widget.png -------------------------------------------------------------------------------- /src/bluesearch/__init__.py: -------------------------------------------------------------------------------- 1 | """bluesearch: a Python package for text mining on scientific use cases.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 
6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | 20 | from bluesearch.version import __version__ # noqa 21 | -------------------------------------------------------------------------------- /src/bluesearch/_css/__init__.py: -------------------------------------------------------------------------------- 1 | """CSS styling utilities.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | -------------------------------------------------------------------------------- /src/bluesearch/_css/style.py: -------------------------------------------------------------------------------- 1 | """CSS styling utilities.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | 20 | import pkg_resources 21 | 22 | 23 | def get_css_style(): 24 | """Get content of CSS style sheet.""" 25 | css_file = pkg_resources.resource_filename(__name__, "stylesheet.css") 26 | with open(css_file, "r") as f: 27 | css_style = f.read() 28 | return css_style 29 | -------------------------------------------------------------------------------- /src/bluesearch/_css/stylesheet.css: -------------------------------------------------------------------------------- 1 | /* 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases. 3 | 4 | Copyright (C) 2020 Blue Brain Project, EPFL. 5 | 6 | This program is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU Lesser General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU Lesser General Public License for more details. 15 | 16 | You should have received a copy of the GNU Lesser General Public License 17 | along with this program. If not, see . 18 | */ 19 | 20 | /* search engine */ 21 | 22 | .article_title { 23 | font-size: 17px; 24 | color: #1A0DAB; 25 | } 26 | .paragraph { 27 | font-size: 13px; 28 | color: #222; 29 | } 30 | .paragraph_emph { 31 | font-weight: bold; 32 | color: #000; 33 | } 34 | .metadata { 35 | font-size: 13px; 36 | color: #006621; 37 | } 38 | 39 | /* success */ 40 | .bbs_success { 41 | color : #388E3B 42 | } 43 | 44 | /* warnings */ 45 | .bbs_warning { 46 | color: #DDB62C 47 | } 48 | 49 | /* errors */ 50 | .bbs_error { 51 | color: #E75C58 52 | } 53 | 54 | /* widgets buttons */ 55 | .bbs_button { 56 | background-color: #3c96f3; 57 | color: #FFF; 58 | font-size: 150%; 59 | transition-duration: 0.2s; 60 | } 61 | .bbs_button:hover { 62 | background-color: #3176d2; 63 | } 64 | 65 | .jupyter-button:active, .jupyter-button.mod-active { 66 | color: #FFF; 67 | background-color: #3c96f3; 68 | } 69 | .jupyter-button:hover { 70 | color: #FFF; 71 | background-color: #3176d2; 72 | } 73 | 74 | /* attribute extraction */ 75 | 76 | .number { 77 | display: inline-block; 78 | background: lightgreen; 79 | padding: 0.2em 0.5em; 80 | border-radius: 7px; 81 | } 82 | .unit { 83 | display: inline-block; 84 | background: pink; 85 | padding: 0.2em 0.5em; 86 | border-radius: 7px; 87 | } 88 | .quantityType { 89 | display: inline-block; 90 | background: yellow; 91 | font-variant:small-caps; 92 | padding: 0.2em 0.5em; 93 | border-radius: 7px; 94 | } 95 | .fixedWidth { 96 | width: 4px; 97 | text-align: justify; 98 | } 99 | -------------------------------------------------------------------------------- 
/src/bluesearch/database/__init__.py: -------------------------------------------------------------------------------- 1 | """Embedding and Mining Databases.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | -------------------------------------------------------------------------------- /src/bluesearch/database/pdf.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Blue Brain Project, EPFL. 2 | # 3 | # This program is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU Lesser General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU Lesser General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Lesser General Public License 14 | # along with this program. If not, see . 
15 | """Module for PDF conversion.""" 16 | import requests 17 | 18 | 19 | def grobid_is_alive(host: str, port: int) -> bool: 20 | """Test if the GROBID server is alive. 21 | 22 | This server API is documented here: 23 | https://grobid.readthedocs.io/en/latest/Grobid-service/#service-checks 24 | 25 | Parameters 26 | ---------- 27 | host 28 | Host of the GROBID server. 29 | port 30 | Port of the GROBID server. 31 | 32 | Returns 33 | ------- 34 | bool 35 | Whether the GROBID server is alive. 36 | """ 37 | try: 38 | response = requests.get(f"http://{host}:{port}/api/isalive") 39 | except requests.RequestException: 40 | return False 41 | 42 | if response.content == b"true": 43 | return True 44 | else: 45 | return False 46 | 47 | 48 | def grobid_pdf_to_tei_xml(pdf_content: bytes, host: str, port: int) -> str: 49 | """Convert PDF file to TEI XML using GROBID server. 50 | 51 | This function uses the GROBID API service to convert PDF to a TEI XML format. 52 | In order to setup GROBID server, follow the instructions from 53 | https://grobid.readthedocs.io/en/latest/Grobid-docker/. 54 | 55 | Parameters 56 | ---------- 57 | pdf_content 58 | PDF content 59 | host 60 | Host of the GROBID server. 61 | port 62 | Port of the GROBID server. 63 | 64 | Returns 65 | ------- 66 | str 67 | TEI XML parsing of the PDF content. 
68 | """ 69 | url = f"http://{host}:{port}/api/processFulltextDocument" 70 | files = {"input": pdf_content} 71 | headers = {"Accept": "application/xml"} 72 | timeout = 60 73 | 74 | response = requests.post( 75 | url=url, 76 | files=files, 77 | headers=headers, 78 | timeout=timeout, 79 | ) 80 | response.raise_for_status() 81 | return response.text 82 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage containing all the entry points.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/database/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for database creation.""" 2 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/database/init.py: -------------------------------------------------------------------------------- 1 | """Initialization of the database.""" 2 | import argparse 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: 9 | """Initialise the argument parser for the init subcommand. 10 | 11 | Parameters 12 | ---------- 13 | parser 14 | The argument parser to initialise. 15 | 16 | Returns 17 | ------- 18 | argparse.ArgumentParser 19 | The initialised argument parser. The same object as the `parser` 20 | argument. 21 | """ 22 | parser.description = "Initialize a database." 23 | 24 | parser.add_argument( 25 | "db_url", 26 | type=str, 27 | help=""" 28 | The location of the database depending on the database type. 29 | 30 | For MySQL and MariaDB the server URL should be provided, for SQLite the 31 | location of the database file. Generally, the scheme part of 32 | the URL should be omitted, e.g. for MySQL the URL should be 33 | of the form 'my_sql_server.ch:1234/my_database' and for SQLite 34 | of the form '/path/to/the/local/database.db'. 35 | """, 36 | ) 37 | parser.add_argument( 38 | "--db-type", 39 | default="sqlite", 40 | type=str, 41 | choices=("mariadb", "mysql", "postgres", "sqlite"), 42 | help="Type of the database.", 43 | ) 44 | return parser 45 | 46 | 47 | def run( 48 | *, 49 | db_url: str, 50 | db_type: str, 51 | ) -> int: 52 | """Initialize database. 53 | 54 | Parameter description and potential defaults are documented inside of the 55 | `init_parser` function. 
56 | """ 57 | logger.info("Importing dependencies") 58 | import sqlalchemy 59 | 60 | from bluesearch.entrypoint.database.schemas import schema_articles, schema_sentences 61 | 62 | if db_type == "sqlite": 63 | engine = sqlalchemy.create_engine(f"sqlite:///{db_url}") 64 | 65 | elif db_type in {"mariadb", "mysql"}: 66 | engine = sqlalchemy.create_engine(f"mysql+pymysql://{db_url}") 67 | 68 | elif db_type == "postgres": 69 | engine = sqlalchemy.create_engine(f"postgresql+pg8000://{db_url}") 70 | 71 | else: 72 | # This branch is never reached because of `choices` in `argparse` 73 | raise ValueError(f"Unrecognized database type {db_type}") # pragma: nocover 74 | 75 | metadata = sqlalchemy.MetaData() 76 | 77 | # Creation of the schema of the tables 78 | schema_articles(metadata) 79 | schema_sentences(metadata) 80 | 81 | # Construction 82 | with engine.begin() as connection: 83 | metadata.create_all(connection) 84 | 85 | logger.info("Initialization done") 86 | 87 | return 0 88 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/database/parse_mesh_rdf.py: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2022 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 
14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 17 | """CLI sub-command for parsing MeSH RDF files.""" 18 | from __future__ import annotations 19 | 20 | import argparse 21 | import gzip 22 | import json 23 | import logging 24 | import pathlib 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: 30 | """Initialise the argument parser for the parse-mesh-rdf subcommand. 31 | 32 | Parameters 33 | ---------- 34 | parser 35 | The argument parser to initialise. 36 | 37 | Returns 38 | ------- 39 | argparse.ArgumentParser 40 | The initialised argument parser. The same object as the `parser` 41 | argument. 42 | """ 43 | parser.description = "Parse a MeSH RDF file in N-Triples format." 44 | parser.add_argument( 45 | "mesh_nt_gz_file", 46 | type=pathlib.Path, 47 | help=""" 48 | Path to a "mesh*.nt.gz" file downloaded from 49 | https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/ 50 | """, 51 | ) 52 | parser.add_argument( 53 | "output_json_file", 54 | type=pathlib.Path, 55 | help=""" 56 | The output file for parsing results. The JSON file will contain a 57 | flat dictionary with MeSH tree names as keys and corresponding topic 58 | labels as values. 59 | """, 60 | ) 61 | return parser 62 | 63 | 64 | def run(*, mesh_nt_gz_file: pathlib.Path, output_json_file: pathlib.Path) -> int: 65 | """Parse a MeSH RDF file to extract the topic tree structure. 66 | 67 | See the description of the `init_parser` command for more information on 68 | the command and its parameters. 
69 | """ 70 | from bluesearch.database import mesh 71 | 72 | if not mesh_nt_gz_file.exists(): 73 | logger.error(f"The file {mesh_nt_gz_file} does not exist.") 74 | return 1 75 | if not mesh_nt_gz_file.is_file(): 76 | logger.error(f"The path {mesh_nt_gz_file} must be a file.") 77 | return 1 78 | 79 | logger.info(f"Parsing the MeSH file {mesh_nt_gz_file.resolve().as_uri()}") 80 | with gzip.open(mesh_nt_gz_file, "rt") as fh: 81 | tree_number_to_label = mesh.parse_tree_numbers(fh) 82 | 83 | logger.info(f"Saving results to {output_json_file.resolve().as_uri()}") 84 | with open(output_json_file, "w") as fh: 85 | json.dump(tree_number_to_label, fh) 86 | 87 | logger.info("Done") 88 | return 0 89 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/database/schemas.py: -------------------------------------------------------------------------------- 1 | """Module for defining SQL schemas.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | 20 | from sqlalchemy import ( 21 | Boolean, 22 | Column, 23 | Date, 24 | ForeignKey, 25 | Integer, 26 | MetaData, 27 | String, 28 | Table, 29 | Text, 30 | UniqueConstraint, 31 | ) 32 | 33 | 34 | def schema_articles(metadata: MetaData) -> None: 35 | """Add to the given 'metadata' the schema of the table 'articles'.""" 36 | Table( 37 | "articles", 38 | metadata, 39 | Column("article_id", String(32), primary_key=True), 40 | Column("doi", Text()), 41 | Column("pmc_id", Text()), 42 | Column("pubmed_id", Text()), 43 | Column("title", Text()), 44 | Column("authors", Text()), 45 | Column("abstract", Text()), 46 | Column("journal", Text()), 47 | Column("publish_time", Date()), 48 | Column("license", Text()), 49 | Column("is_english", Boolean()), 50 | ) 51 | 52 | 53 | def schema_sentences(metadata: MetaData) -> None: 54 | """Add to the given 'metadata' the schema of the table 'sentences'.""" 55 | Table( 56 | "sentences", 57 | metadata, 58 | Column("sentence_id", Integer(), primary_key=True, autoincrement=True), 59 | Column("section_name", Text()), 60 | Column("text", Text()), 61 | Column( 62 | "article_id", String(32), ForeignKey("articles.article_id"), nullable=False 63 | ), 64 | Column("paragraph_pos_in_article", Integer(), nullable=False), 65 | Column("sentence_pos_in_paragraph", Integer(), nullable=False), 66 | UniqueConstraint( 67 | "article_id", 68 | "paragraph_pos_in_article", 69 | "sentence_pos_in_paragraph", 70 | name="sentence_unique_identifier", 71 | ), 72 | Column("is_bad", Boolean(), server_default="0"), 73 | ) 74 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/embedding_server.py: -------------------------------------------------------------------------------- 1 | """Entrypoint for launching an embedding server.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 
6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | 20 | import logging 21 | import sys 22 | 23 | from bluesearch.embedding_models import get_embedding_model 24 | from bluesearch.entrypoint._helper import configure_logging, get_var, run_server 25 | 26 | 27 | def get_embedding_app(): 28 | """Construct the embedding flask app.""" 29 | from bluesearch.server.embedding_server import EmbeddingServer 30 | 31 | # Read configuration 32 | log_file = get_var("BBS_EMBEDDING_LOG_FILE", check_not_set=False) 33 | log_level = get_var("BBS_EMBEDDING_LOG_LEVEL", logging.INFO, var_type=int) 34 | 35 | # Configure logging 36 | configure_logging(log_file, log_level) 37 | logger = logging.getLogger(__name__) 38 | 39 | logger.info(" Configuration ".center(80, "-")) 40 | logger.info(f"log-file : {log_file}") 41 | logger.info(f"log-level : {log_level}") 42 | logger.info("-" * 80) 43 | 44 | # Load embedding models 45 | logger.info("Loading embedding models") 46 | supported_models = ["SBERT", "SBioBERT", "BioBERT NLI+STS"] 47 | embedding_models = { 48 | model_name: get_embedding_model(model_name) for model_name in supported_models 49 | } 50 | 51 | # Create Server app 52 | logger.info("Creating the server app") 53 | embedding_app = EmbeddingServer(embedding_models) 54 | 55 | return embedding_app 56 | 57 | 58 | def run_embedding_server(): 59 | """Run the embedding 
server.""" 60 | run_server(get_embedding_app, "embedding") 61 | 62 | 63 | if __name__ == "__main__": # pragma: no cover 64 | sys.exit(run_embedding_server()) 65 | -------------------------------------------------------------------------------- /src/bluesearch/entrypoint/search_server.py: -------------------------------------------------------------------------------- 1 | """The entrypoint script for the search server.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 
19 | 20 | import logging 21 | import pathlib 22 | import sys 23 | 24 | import sqlalchemy 25 | 26 | from bluesearch.entrypoint._helper import configure_logging, get_var, run_server 27 | 28 | 29 | def get_search_app(): 30 | """Construct the search flask app.""" 31 | from bluesearch.server.search_server import SearchServer 32 | from bluesearch.utils import H5 33 | 34 | # Read configuration 35 | log_file = get_var("BBS_SEARCH_LOG_FILE", check_not_set=False) 36 | log_level = get_var("BBS_SEARCH_LOG_LEVEL", logging.INFO, var_type=int) 37 | 38 | models_path = get_var("BBS_SEARCH_MODELS_PATH") 39 | embeddings_path = get_var("BBS_SEARCH_EMBEDDINGS_PATH") 40 | which_models = get_var("BBS_SEARCH_MODELS") 41 | 42 | mysql_url = get_var("BBS_SEARCH_DB_URL") 43 | mysql_user = get_var("BBS_SEARCH_MYSQL_USER") 44 | mysql_password = get_var("BBS_SEARCH_MYSQL_PASSWORD") 45 | 46 | # Configure logging 47 | configure_logging(log_file, log_level) 48 | logger = logging.getLogger(__name__) 49 | 50 | logger.info(" Configuration ".center(80, "-")) 51 | logger.info(f"log-file : {log_file}") 52 | logger.info(f"log-level : {log_level}") 53 | logger.info(f"models-path : {models_path}") 54 | logger.info(f"embeddings-path : {embeddings_path}") 55 | logger.info(f"which-models : {which_models}") 56 | logger.info(f"mysql_url : {mysql_url}") 57 | logger.info(f"mysql_user : {mysql_user}") 58 | logger.info(f"mysql_password : {mysql_password}") 59 | logger.info("-" * 80) 60 | 61 | # Initialize flask app 62 | logger.info("Creating the Flask app") 63 | models_path = pathlib.Path(models_path) 64 | embeddings_path = pathlib.Path(embeddings_path) 65 | engine_url = f"mysql://{mysql_user}:{mysql_password}@{mysql_url}" 66 | engine = sqlalchemy.create_engine(engine_url, pool_recycle=14400) 67 | models_list = [model.strip() for model in which_models.split(",")] 68 | indices = H5.find_populated_rows(embeddings_path, models_list[0]) 69 | 70 | server_app = SearchServer( 71 | models_path, embeddings_path, indices, 
engine, models_list 72 | ) 73 | return server_app 74 | 75 | 76 | def run_search_server(): 77 | """Run the search server.""" 78 | run_server(get_search_app, "search") 79 | 80 | 81 | if __name__ == "__main__": # pragma: no cover 82 | sys.exit(run_search_server()) 83 | -------------------------------------------------------------------------------- /src/bluesearch/k8s/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for Kubernetes related code.""" 2 | -------------------------------------------------------------------------------- /src/bluesearch/k8s/connect.py: -------------------------------------------------------------------------------- 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 2 | # 3 | # Copyright (C) 2020 Blue Brain Project, EPFL. 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with this program. If not, see . 
17 | """Connect to Elasticsearch (ES).""" 18 | import logging 19 | import os 20 | 21 | import urllib3 22 | from dotenv import load_dotenv 23 | from elasticsearch import Elasticsearch 24 | 25 | load_dotenv() 26 | urllib3.disable_warnings() 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def connect() -> Elasticsearch: 32 | """Return a client connected to ES.""" 33 | client = Elasticsearch( 34 | os.environ["ES_URL"], 35 | basic_auth=("elastic", os.environ["ES_PASS"]), 36 | verify_certs=False, 37 | ) 38 | 39 | if not client.ping(): 40 | raise RuntimeError(f"Cannot connect to ES: {os.environ['ES_URL']}") 41 | 42 | logger.info("Connected to ES") 43 | 44 | return client 45 | 46 | 47 | if __name__ == "__main__": 48 | connect() 49 | -------------------------------------------------------------------------------- /src/bluesearch/mining/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for text mining.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see https://www.gnu.org/licenses/. 
19 | -------------------------------------------------------------------------------- /src/bluesearch/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | -------------------------------------------------------------------------------- /src/bluesearch/server/__init__.py: -------------------------------------------------------------------------------- 1 | """Implementation of servers.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see https://www.gnu.org/licenses/. 19 | -------------------------------------------------------------------------------- /src/bluesearch/server/invalid_usage_exception.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions.""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 
class InvalidUsage(Exception):
    """An exception used in the REST API server.

    The code was largely copied from
    https://flask.palletsprojects.com/en/1.1.x/patterns/apierrors/

    Parameters
    ----------
    message
        Description of what was wrong with the request. It is stored in
        the ``message`` attribute and returned by ``to_dict``.
    status_code : int or None
        Status code stored in the ``status_code`` attribute. ``None``
        selects the default of 400.
    """

    def __init__(self, message, status_code=None):
        # Forward the message to ``Exception`` so that ``str(exc)`` and
        # tracebacks show it, and so that ``exc.args`` holds what is needed
        # to re-create the instance when it is unpickled.
        super().__init__(message)
        self.message = message
        self.status_code = 400 if status_code is None else status_code

    def to_dict(self):
        """Generate a dictionary.

        Returns
        -------
        dict
            A dictionary with the single key ``"message"``.
        """
        return {"message": self.message}
16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see . 19 | -------------------------------------------------------------------------------- /tests/data/cord19_v35/document_parses/pmc_json/PMC7186928.xml.json: -------------------------------------------------------------------------------- 1 | { 2 | "paper_id": "PMC7186928", 3 | "metadata": { 4 | "title": "Will we see protection or reinfection in COVID-19?", 5 | "authors": [ 6 | { 7 | "first": "Miyo", 8 | "middle": [], 9 | "last": "Ota", 10 | "suffix": "", 11 | "email": "sinai.immunology@gmail.com", 12 | "affiliation": {} 13 | } 14 | ] 15 | }, 16 | "body_text": [ 17 | { 18 | "text": "There is rising concern that patients who recover from COVID-19 may be at risk of reinfection. In this preprint, Bao et al. investigated acquired immunity to SARS-CoV-2 in rhesus macaques. Four rhesus monkeys were infected with SARS-CoV-2 and two were reinfected after confirmed recovery. After primary infection, viral replication was detected in the nose, pharynx, lungs and gut, with histopathological evidence of lung damage. Sera collected from recovered monkeys before reinfection exhibited neutralizing activity against SARS-CoV-2. Upon reinfection, viral replication was not detected in nasopharyngeal or anal swabs, and reinfected monkeys did not show any signs of COVID-19 disease recurrence. 
This suggests that immunity acquired following primary infection with SARS-CoV-2 may protect upon subsequent exposure to the virus.", 19 | "cite_spans": [], 20 | "section": "", 21 | "ref_spans": [] 22 | } 23 | ], 24 | "ref_entries": {}, 25 | "back_matter": [], 26 | "bib_entries": { 27 | "BIBREF0": { 28 | "title": "Reinfection could not occur in SARS-CoV-2 infected rhesus macaques", 29 | "authors": [ 30 | { 31 | "first": "L", 32 | "middle": [], 33 | "last": "Bao", 34 | "suffix": "" 35 | } 36 | ], 37 | "year": 2020, 38 | "venue": "bioRxiv", 39 | "volume": "", 40 | "issn": "", 41 | "pages": null, 42 | "other_ids": { 43 | "DOI": [ 44 | "10.1101/2020.03.13.990226" 45 | ] 46 | } 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /tests/data/mining/eval/iob_punctuation_after.csv: -------------------------------------------------------------------------------- 1 | text,class_ann1,class_ann2,class_ann3 2 | Potato,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE 3 | Solanum,B-VEGETABLE,I-VEGETABLE,B-VEGETABLE 4 | tuberosum,I-VEGETABLE,I-VEGETABLE,I-VEGETABLE 5 | is,O,O,O 6 | a,O,O,O 7 | vegetable,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE 8 | Cherry,B-FRUIT,B-FRUIT,B-FRUIT 9 | tomato,I-FRUIT,I-FRUIT,I-FRUIT 10 | is,O,O,O 11 | technically,O,O,O 12 | a,O,O,O 13 | fruit,B-FRUIT,B-VEGETABLE,B-FRUIT 14 | but,O,O,O 15 | few,O,O,O 16 | know,O,O,O 17 | that,O,O,O -------------------------------------------------------------------------------- /tests/data/mining/eval/iob_punctuation_before.csv: -------------------------------------------------------------------------------- 1 | text,class_ann1,class_ann2,class_ann3 2 | Potato,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE 3 | (,B-VEGETABLE,I-VEGETABLE,I-VEGETABLE 4 | Solanum,I-VEGETABLE,I-VEGETABLE,B-VEGETABLE 5 | tuberosum,I-VEGETABLE,I-VEGETABLE,I-VEGETABLE 6 | ),I-VEGETABLE,O,I-VEGETABLE 7 | is,O,O,O 8 | a,O,O,O 9 | """",B-VEGETABLE,O,B-VEGETABLE 10 | vegetable,I-VEGETABLE,B-VEGETABLE,I-VEGETABLE 11 | 
"""",I-VEGETABLE,I-VEGETABLE,B-FRUIT 12 | .,B-FRUIT,I-VEGETABLE,I-FRUIT 13 | """",I-FRUIT,B-FRUIT,I-FRUIT 14 | Cherry,I-FRUIT,I-FRUIT,I-FRUIT 15 | tomato,I-FRUIT,I-FRUIT,I-FRUIT 16 | """",O,I-FRUIT,O 17 | is,O,O,O 18 | technically,O,O,O 19 | a,O,O,O 20 | fruit,B-FRUIT,B-VEGETABLE,B-FRUIT 21 | ",",I-FRUIT,O,I-FRUIT 22 | but,O,O,O 23 | few,O,O,O 24 | know,O,O,O 25 | that,O,O,O 26 | .,O,O,O -------------------------------------------------------------------------------- /tests/data/mining/request/request.csv: -------------------------------------------------------------------------------- 1 | entity_type,property,property_type,property_value_type,ontology_source 2 | DISEASE,,,,UMLS 3 | CELL_TYPE,,,, 4 | CHEMICAL,,,,UMLS 5 | PROTEIN,,,,UMLS 6 | ORGAN,,,,UMLS 7 | DISEASE,,,,UMLS 8 | CHEMICAL,agonist_of,relation,PROTEIN,UMLS 9 | CHEMICAL,inhibitor_of,relation,PROTEIN,UMLS 10 | CHEMICAL,product_of,relation,PROTEIN,UMLS 11 | ORGANISM,,,,UMLS 12 | CHEMICAL,concentration,attribute,QuantitativeValue,UMLS 13 | DISEASE,is_hereditary,attribute,Boolean, 14 | -------------------------------------------------------------------------------- /tests/data/pubmed_article_minimal.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 123456 4 |
5 | 6 | 7 | 8 | 2020 9 | 10 | 11 | 12 | Article Title 13 | 14 | 012-34 15 | 16 | eng 17 | 18 | Journal Article 19 | MeSH Publication Type 20 | 21 |
22 | 23 | Medline TA 24 | 25 |
26 |
27 | -------------------------------------------------------------------------------- /tests/data/pubmed_articles.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 123456 7 |
8 | 9 | 10 | 11 | 2020 12 | 13 | 14 | 15 | Article Title 1 16 | 17 | 012-34 18 | 19 | eng 20 | 21 | Journal Article 22 | MeSH Publication Type 23 | 24 |
25 | 26 | Medline TA 1 27 | 28 |
29 |
30 | 31 | 32 | 789123 33 |
34 | 35 | 36 | 37 | 2021 38 | 39 | 40 | 41 | Article Title 2 42 | 43 | 567-89 44 | 45 | eng 46 | 47 | Journal Article 48 | MeSH Publication Type 49 | 50 |
51 | 52 | Medline TA 2 53 | 54 |
55 |
56 |
57 | -------------------------------------------------------------------------------- /tests/data/pubmed_download_index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /pubmed/updatefiles 5 | 6 | 7 |

Index of /pubmed/updatefiles

8 |
Name                     Last modified      Size  
Parent Directory - 9 | README.txt 2020-12-14 08:15 4.0K 10 | pubmed21n1063.xml.gz 2020-12-14 14:10 67M 11 | pubmed21n1063.xml.gz.md5 2020-12-14 14:10 60 12 | pubmed21n1063_stats.html 2020-12-14 14:10 585 13 | pubmed21n1064.xml.gz 2020-12-14 14:10 53M 14 | pubmed21n1064.xml.gz.md5 2020-12-14 14:10 60 15 | pubmed21n1064_stats.html 2020-12-14 14:10 582 16 | pubmed21n1065.xml.gz 2020-12-14 14:10 12M 17 | pubmed21n1065.xml.gz.md5 2020-12-14 14:10 60 18 | pubmed21n1065_stats.html 2020-12-14 14:10 571 19 | pubmed21n1066.xml.gz 2020-12-15 14:04 64M 20 | pubmed21n1066.xml.gz.md5 2020-12-15 14:04 60 21 | pubmed21n1066_stats.html 2020-12-15 14:04 584 22 | pubmed21n1067.xml.gz 2020-12-15 14:04 7.7M 23 | pubmed21n1067.xml.gz.md5 2020-12-15 14:04 60 24 | pubmed21n1067_stats.html 2020-12-15 14:04 571 25 | pubmed21n1068.xml.gz 2020-12-16 14:02 51M 26 | pubmed21n1068.xml.gz.md5 2020-12-16 14:02 60 27 | pubmed21n1068_stats.html 2020-12-16 14:02 583 28 | pubmed21n1069.xml.gz 2020-12-17 14:02 61M 29 | pubmed21n1069.xml.gz.md5 2020-12-17 14:02 60 30 | pubmed21n1069_stats.html 2020-12-17 14:02 582 31 | pubmed21n1070.xml.gz 2020-12-18 14:04 57M 32 |
@responses.activate
def test_conversion_pdf(monkeypatch):
    """Check that the body of the mocked reply is returned after one call."""
    # Register a fake full-text endpoint on the host and port used below.
    responses.add(
        responses.POST,
        "http://fake_host:8888/api/processFulltextDocument",
        body="body",
    )

    tei_xml = grobid_pdf_to_tei_xml(b"", host="fake_host", port=8888)

    assert tei_xml == "body"
    assert len(responses.calls) == 1


@pytest.mark.parametrize(
    "body,expected_result",
    [
        ("true", True),
        (requests.RequestException(), False),
        ("false", False),
        ("unknown", False),
    ],
)
@responses.activate
def test_grobid_is_alive(body, expected_result):
    """Check the liveness verdict for several mocked replies.

    Only the reply ``"true"`` is expected to count as alive; a request
    exception and any other body are expected to count as not alive.
    """
    host = "host"
    port = 12345
    # Any GET below this host and port receives the parametrized reply.
    any_path = re.compile(rf"http://{host}:{port}/.*")
    responses.add(responses.GET, any_path, body=body)

    is_alive = grobid_is_alive(host, port)

    assert is_alive is expected_result
def test_sqlite(tmpdir):
    """Check that ``init`` creates an SQLite database with the expected tables.

    The schema reflected from the created file is compared, table by
    table, with the one built by ``schema_articles`` and
    ``schema_sentences``.
    """
    db_path = pathlib.Path(str(tmpdir)) / "database.db"
    args_and_opts = ["init", str(db_path), "--db-type=sqlite"]

    # The file must be created by the command itself.
    assert not db_path.exists()
    main(args_and_opts)
    assert db_path.exists()

    # Reflect through an unbound MetaData and an explicit ``bind``: the
    # ``MetaData(engine)`` form is deprecated in SQLAlchemy 1.4 and no
    # longer exists in 2.0, while this form works on all of them.
    engine = sqlalchemy.create_engine(f"sqlite:///{db_path}")
    metadata = sqlalchemy.MetaData()
    metadata.reflect(bind=engine)
    tables = metadata.sorted_tables

    expected_metadata = sqlalchemy.MetaData()
    schema_articles(expected_metadata)
    schema_sentences(expected_metadata)
    expected_tables = expected_metadata.sorted_tables

    # Same number of tables first, so that ``zip`` cannot hide a missing one.
    assert len(tables) == len(expected_tables)

    for table, expected in zip(tables, expected_tables):
        assert table.compare(expected)
@pytest.mark.parametrize("command", ["add", "convert-pdf", "init", "parse"])
def test_commands_work(command):
    """Check that ``bbs_database <command> --help`` exits successfully."""
    help_invocation = ["bbs_database", command, "--help"]
    subprocess.run(help_invocation, check=True)


def test_setup_logging(caplog):
    """Check that ``_setup_logging`` changes only the bluesearch loggers.

    All loggers whose name starts with ``bluesearch`` must go from
    ``WARNING`` to ``DEBUG``; the effective level of every other logger
    must stay the same.
    """

    def effective_levels(loggers: dict[str, logging.Logger]) -> dict[str, int]:
        """Map each logger name to its current effective level."""
        levels = {}
        for name, logger in loggers.items():
            levels[name] = logger.getEffectiveLevel()
        return levels

    caplog.set_level(logging.WARNING, logger="bluesearch")

    # Split the registered loggers by name prefix; registry entries that
    # are not ``Logger`` instances are ignored.
    bluesearch_loggers = {}
    external_loggers = {}
    for name, candidate in logging.root.manager.loggerDict.items():
        if not isinstance(candidate, logging.Logger):
            continue
        if name.startswith("bluesearch"):
            bluesearch_loggers[name] = candidate
        else:
            external_loggers[name] = candidate

    bluesearch_before = effective_levels(bluesearch_loggers)
    external_before = effective_levels(external_loggers)

    _setup_logging(logging.DEBUG)

    bluesearch_after = effective_levels(bluesearch_loggers)
    external_after = effective_levels(external_loggers)

    assert set(bluesearch_before.values()) == {logging.WARNING}
    assert set(bluesearch_after.values()) == {logging.DEBUG}

    assert external_before == external_after
def test_parse_args_or_environment(monkeypatch):
    """Check how CLI arguments and environment variables are combined.

    Five situations are covered in turn: the option is given nowhere, only
    on the command line, only in the environment, in both places (the
    command line must win), and in neither place although an environment
    variable is declared for it (the call must exit with code 1).
    """
    environ_attr = "bluesearch.entrypoint._helper.os.environ"
    argv_value = "5"
    env_value = "6"

    parser = argparse.ArgumentParser()
    parser.add_argument("--normal-arg")
    parser.add_argument("--env-arg", default=argparse.SUPPRESS)

    def parse(cli_args: Sequence[str], names: dict[str, str]):
        """Run the function under test with the shared parser."""
        return parse_args_or_environment(parser, names, cli_args)

    # --env-arg not provided at all
    argv: Sequence[str] = []
    parsed = vars(parse(argv, {}))
    assert "normal_arg" in parsed
    assert "env_arg" not in parsed

    # --env-arg provided through the CLI
    args = parse(["--env-arg", argv_value], {})
    assert "normal_arg" in vars(args)
    assert "env_arg" in vars(args)
    assert args.env_arg == argv_value

    # --env-arg provided through the environment
    monkeypatch.setattr(environ_attr, {"ENV_ARG": env_value})
    args = parse([], {"env_arg": "ENV_ARG"})
    assert "normal_arg" in vars(args)
    assert "env_arg" in vars(args)
    assert args.env_arg == env_value

    # Check that CLI argument have precedence over environment variables
    monkeypatch.setattr(environ_attr, {"ENV_ARG": env_value})
    args = parse(["--env-arg", argv_value], {"env_arg": "ENV_ARG"})
    assert "normal_arg" in vars(args)
    assert "env_arg" in vars(args)
    assert args.env_arg == argv_value

    # Value not specified through the CLI, nor through environment
    monkeypatch.setattr(environ_attr, {})
    with pytest.raises(SystemExit) as exit_info:
        parse([], {"env_arg": "ENV_ARG"})
    assert exit_info.value.code == 1
def test_environment_reading(monkeypatch, tmpdir):
    """Check that ``get_embedding_app`` returns the patched server instance.

    The server class and the embedding model are replaced by mocks, so the
    test only verifies that the class is called with a single positional
    argument that is a dictionary.
    """
    logfile = pathlib.Path(str(tmpdir)) / "log.txt"
    logfile.touch()
    monkeypatch.setenv("BBS_EMBEDDING_LOG_FILE", str(logfile))

    server_instance = Mock(spec=EmbeddingServer)
    server_factory = Mock(return_value=server_instance)
    monkeypatch.setattr(
        "bluesearch.server.embedding_server.EmbeddingServer",
        server_factory,
    )

    # Mock all of our embedding models
    for model in ["SentTransformer"]:
        monkeypatch.setattr(f"bluesearch.embedding_models.{model}", Mock())

    embedding_app = get_embedding_app()

    assert embedding_app is server_instance

    positional, _ = server_factory.call_args
    assert len(positional) == 1
    assert isinstance(positional[0], dict)
@pytest.mark.parametrize(
    "entrypoint_name",
    [
        "bbs_database",
        "compute_embeddings",
        "create_database",
        "create_mining_cache",
        "embedding_server",
        "mining_server",
        "search_server",
    ],
)
def test_entrypoint(entrypoint_name):
    """Check that the entrypoint can be run with ``--help``.

    ``check=True`` makes the test fail when the command exits with a
    non-zero status.
    """
    help_command = [entrypoint_name, "--help"]
    subprocess.run(help_command, check=True)
@pytest.mark.parametrize(
    "db_type,sqlite_db_exists",
    [
        ("sqlite", True),
        ("sqlite", False),
        ("mysql", False),
        ("wrong", False),
    ],
)
def test_send_through(
    tmpdir, monkeypatch, db_type, sqlite_db_exists, entity_types, spacy_model_path
):
    """Check what ``get_mining_app`` passes on to the mining server class.

    The configuration is supplied through environment variables. An
    unsupported database type must raise ``ValueError``; otherwise the
    patched server class must be called once, with keyword arguments only.
    """
    workdir = pathlib.Path(str(tmpdir))
    logfile = workdir / "log.txt"
    db_path = workdir / "something.db"

    if sqlite_db_exists:
        db_path.parent.mkdir(exist_ok=True, parents=True)
        db_path.touch()

    environment = {
        "BBS_MINING_LOG_FILE": str(logfile),
        "BBS_MINING_DB_TYPE": db_type,
        "BBS_MINING_DB_URL": str(db_path),
        "BBS_MINING_MYSQL_USER": "some_user",
        "BBS_MINING_MYSQL_PASSWORD": "some_pwd",
        "BBS_DATA_AND_MODELS_DIR": str(spacy_model_path),
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    fake_sqlalchemy = Mock()
    server_instance = Mock()
    server_factory = Mock(return_value=server_instance)

    monkeypatch.setattr("bluesearch.server.mining_server.MiningServer", server_factory)
    monkeypatch.setattr(
        "bluesearch.entrypoint.mining_server.sqlalchemy", fake_sqlalchemy
    )

    # Unsupported database types are rejected before anything is built.
    if db_type not in {"mysql", "sqlite"}:
        with pytest.raises(ValueError):
            get_mining_app()
        return

    mining_app = get_mining_app()

    server_factory.assert_called_once()
    assert mining_app == server_instance

    args, kwargs = server_factory.call_args
    assert not args
    assert kwargs["connection"] == fake_sqlalchemy.create_engine.return_value

    models_libs = kwargs["models_libs"]
    assert "ee" in models_libs
    assert isinstance(models_libs["ee"], dict)
    assert len(models_libs["ee"]) == len(entity_types)
@pytest.mark.parametrize(
    ("embeddings_path", "models", "models_path"),
    [
        ("path_1", ["A", "B", "C"], "path_a"),
        ("path_2", ["X", "Y"], "path_b"),
    ],
)
def test_send_through(tmpdir, monkeypatch, embeddings_path, models, models_path):
    """Check what ``get_search_app`` passes on to the search server class.

    The configuration is supplied through environment variables, and the
    database layer, the ``H5`` helper and the server class are mocked. The
    test then inspects the positional arguments of the single call made to
    the server class.
    """
    logfile = pathlib.Path(str(tmpdir)) / "log.txt"

    environment = {
        "BBS_SEARCH_LOG_FILE": str(logfile),
        "BBS_SEARCH_MODELS_PATH": models_path,
        "BBS_SEARCH_EMBEDDINGS_PATH": embeddings_path,
        "BBS_SEARCH_MODELS": ",".join(models),
        "BBS_SEARCH_DB_URL": "some_url",
        "BBS_SEARCH_MYSQL_USER": "some_user",
        "BBS_SEARCH_MYSQL_PASSWORD": "some_pwd",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    fake_sqlalchemy = Mock()
    fake_h5 = Mock()
    fake_h5.find_populated_rows.return_value = np.arange(1, 11)
    server_instance = Mock(spec=SearchServer)
    server_factory = Mock(return_value=server_instance)

    monkeypatch.setattr(
        "bluesearch.entrypoint.search_server.sqlalchemy", fake_sqlalchemy
    )
    monkeypatch.setattr("bluesearch.utils.H5", fake_h5)
    monkeypatch.setattr("bluesearch.server.search_server.SearchServer", server_factory)

    server_app = get_search_app()

    # Checks
    server_factory.assert_called_once()
    fake_h5.find_populated_rows.assert_called_once()
    fake_sqlalchemy.create_engine.assert_called_once()

    assert server_app is server_instance

    args, _ = server_factory.call_args

    assert args[0] == pathlib.Path(models_path)
    assert args[1] == pathlib.Path(embeddings_path)
    np.testing.assert_array_equal(args[2], np.arange(1, 11))
    assert args[3] is fake_sqlalchemy.create_engine.return_value
    assert args[4] == models


# The test below shares these lines but belongs to a different file,
# tests/unit/k8s/test_create_indices.py.
def test_create_and_remove_index(get_es_client):
    """Add an index and remove it again; skip when no client is available."""
    if get_es_client is None:
        pytest.skip("Elastic search is not available")

    index_name = "test_index"
    add_index(get_es_client, index_name)
    remove_index(get_es_client, index_name)
19 | -------------------------------------------------------------------------------- /tests/unit/server/test_embedding_server.py: -------------------------------------------------------------------------------- 1 | """Tests covering embedding server""" 2 | 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases. 4 | # 5 | # Copyright (C) 2020 Blue Brain Project, EPFL. 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Lesser General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Lesser General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Lesser General Public License 18 | # along with this program. If not, see <https://www.gnu.org/licenses/>. 
@pytest.fixture(scope="session")
def embedding_client():
    """Yield a test client for an ``EmbeddingServer`` with one mocked model.

    The only registered model is called ``"sbiobert"``; its ``preprocess``
    and ``embed`` methods return fixed values.
    """
    fake_model = Mock()
    fake_model.preprocess.return_value = "This is a dummy sentence"
    fake_model.embed.return_value = np.ones((2,))

    app = EmbeddingServer(embedding_models={"sbiobert": fake_model})
    app.config["TESTING"] = True
    with app.test_client() as client:
        yield client


class TestEmbeddingServer:
    """Requests against the embedding server and their status codes."""

    def test_embedding_server_help(self, embedding_client):
        """The help endpoint answers 200 and reports the server name."""
        reply = embedding_client.post("/help")
        assert reply.status_code == 200
        assert reply.json["name"] == "EmbeddingServer"

    def test_embedding_server_welcome(self, embedding_client):
        """The root URL accepts GET (200) and rejects POST (405)."""
        reply = embedding_client.get("/")
        assert reply.status_code == 200
        reply = embedding_client.post("/")
        assert reply.status_code == 405

    def test_embedding_server_embed(self, embedding_client):
        """Valid embed requests answer 200, malformed ones answer 400."""
        # Each entry: URL, keyword arguments of ``post``, expected status.
        cases = [
            ("/v1/embed/json", {"json": {"model": "sbiobert", "text": "hello"}}, 200),
            ("/v1/embed/json", {"json": {"model": "sbiobert"}}, 400),
            ("/v1/embed/csv", {"json": {"model": "sbiobert", "text": "hello"}}, 200),
            (
                "/v1/embed/csv",
                {"json": {"model": "invalid_model", "text": "hello"}},
                400,
            ),
            ("/v1/embed/csv", {"data": "not json"}, 400),
            ("/v1/embed/invalid_format", {"data": "not json"}, 400),
        ]
        for url, payload, expected_status in cases:
            response = embedding_client.post(url, **payload)
            assert response.status_code == expected_status
from unittest.mock import Mock

import numpy as np
import pytest

from bluesearch.server.search_server import SearchServer
from bluesearch.utils import H5


@pytest.fixture
def search_client(
    monkeypatch, embeddings_h5_path, fake_sqlalchemy_engine, test_parameters
):
    """Yield a test client for a SearchServer that uses a mocked embedding model.

    ``get_embedding_model`` in the search server module is monkeypatched to
    return a ``Mock`` whose ``embed`` output is a vector of ones of length
    ``test_parameters["embedding_size"]``.
    """
    embedding_shape = (test_parameters["embedding_size"],)
    mocked_model = Mock()
    mocked_model.preprocess.return_value = "hello"
    mocked_model.embed.return_value = np.ones(embedding_shape)

    def fake_get_embedding_model(*args, **kwargs):
        # Whatever model is requested, hand back the same mock.
        return mocked_model

    monkeypatch.setattr(
        "bluesearch.server.search_server.get_embedding_model",
        fake_get_embedding_model,
    )

    populated_rows = H5.find_populated_rows(embeddings_h5_path, "SBioBERT")
    app = SearchServer(
        trained_models_path="",
        embeddings_h5_path=embeddings_h5_path,
        indices=populated_rows,
        connection=fake_sqlalchemy_engine,
        models=["SBioBERT"],
    )
    app.config["TESTING"] = True
    with app.test_client() as client:
        yield client


class TestSearchServer:
    """Endpoint checks for the search server."""

    def test_search_server(self, search_client):
        """Exercise the help endpoint, a valid query and a non-JSON query."""
        # Help request.
        help_reply = search_client.post("/help")
        assert help_reply.status_code == 200
        assert help_reply.json["name"] == "SearchServer"

        # Valid JSON request: exactly ``top_k`` results must come back.
        top_k = 3
        query = {"which_model": "SBioBERT", "k": top_k, "query_text": "hello"}
        search_reply = search_client.post("/", json=query)
        assert search_reply.status_code == 200
        payload = search_reply.json
        assert len(payload["sentence_ids"]) == top_k
        assert len(payload["similarities"]) == top_k

        # Non-JSON request: still status 200, but both result fields are None.
        bad_reply = search_client.post("/", data="data is not a json")
        assert bad_reply.status_code == 200
        payload = bad_reply.json
        assert payload["sentence_ids"] is None
        assert payload["similarities"] is None
# --------------------------------------------------------------------------------
# /tests/unit/test_fixtures.py:
# --------------------------------------------------------------------------------
"""Collection of tests that make sure that fixtures are set up correctly.

Notes
-----
The internals of fixtures might vary based on how conftest.py sets them up.
The goal of these tests is to run simple sanity checks rather than detailed
bookkeeping.
"""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
import pytest
import sqlalchemy
from sqlalchemy.exc import OperationalError, ProgrammingError


def test_database(fake_sqlalchemy_engine, backend_database):
    """Make sure the database tables are set up correctly.

    Every expected table must contain at least one row, every table except
    "articles" must expose at least one index, and querying a table that does
    not exist must raise the backend-specific SQLAlchemy error.
    """
    inspector = sqlalchemy.inspect(fake_sqlalchemy_engine)

    for table_name in ["articles", "sentences", "mining_cache"]:
        res = pd.read_sql("SELECT * FROM {}".format(table_name), fake_sqlalchemy_engine)

        if table_name != "articles":
            # MySQL reports 2 indexes for the sentences table (the article_id
            # index plus the UNIQUE constraint), while SQLite reports only 1
            # (the article_id index), hence the lower bound instead of equality.
            assert len(inspector.get_indexes(table_name)) >= 1

        assert len(res) > 0

    # SQLite reports a missing table as OperationalError; the other backend
    # reports it as ProgrammingError.
    expected_error = (
        OperationalError if backend_database == "sqlite" else ProgrammingError
    )
    # Use an explicit connection and ``text()``: ``Engine.execute`` with a raw
    # SQL string is deprecated in SQLAlchemy 1.4 and removed in 2.0.
    with pytest.raises(expected_error):
        with fake_sqlalchemy_engine.connect() as connection:
            connection.execute(sqlalchemy.text("SELECT * FROM fake_table")).all()


def test_h5(embeddings_h5_path):
    """Make sure the embeddings H5 path points to an existing file."""
    assert embeddings_h5_path.is_file()


def test_metadata(metadata_path):
    """Make sure the metadata CSV can be read and has at least one row."""
    df = pd.read_csv(str(metadata_path))

    assert len(df) > 0


def test_jsons(jsons_path):
    """Make sure at least one JSON file exists somewhere under ``jsons_path``."""
    n_json_files = len(list(jsons_path.rglob("*.json")))

    assert n_json_files > 0
# --------------------------------------------------------------------------------
# /tests/unit/widgets/test_mining_schema.py:
# --------------------------------------------------------------------------------
"""Tests covering the MiningSchema class."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import pytest

from bluesearch.widgets.mining_schema import MiningSchema


def test_add_entity():
    """Entities can be added one at a time; a duplicate only triggers a warning."""
    schema = MiningSchema()

    # Two distinct entity types must end up as two rows in the schema.
    chemical_properties = {
        "property_name": "isChiral",
        "property_type": "ATTRIBUTE",
        "property_value_type": "BOOLEAN",
        "ontology_source": "NCIT",
    }
    schema.add_entity("CHEMICAL", **chemical_properties)
    schema.add_entity("DRUG")
    assert len(schema.schema_df) == 2

    # Adding "DRUG" a second time must emit a UserWarning.
    with pytest.warns(UserWarning, match=r"already exists"):
        schema.add_entity("DRUG")


def test_df(mining_schema_df):
    """Check loading a schema from a dataframe and reading it back via ``df``."""
    # Duplicates are out of scope for this test, so remove them up front.
    expected_df = mining_schema_df.drop_duplicates(ignore_index=True)

    # Loading from a dataframe.
    schema = MiningSchema()
    schema.add_from_df(expected_df)
    # ``df`` must not hand out the internal ``schema_df`` object itself.
    assert schema.df is not schema.schema_df
    # Everything that was loaded must be present.
    assert schema.df.equals(expected_df)

    # A dataframe without the "entity_type" column is rejected.
    schema = MiningSchema()
    without_entity_type = expected_df.drop("entity_type", axis=1)
    with pytest.raises(ValueError, match=r"entity_type.* not found"):
        schema.add_from_df(without_entity_type)

    # An unknown column triggers a warning and is left out of the schema.
    with_extra_column = expected_df.drop_duplicates().copy()
    with_extra_column["unknown_column"] = list(range(len(with_extra_column)))
    schema = MiningSchema()
    with pytest.warns(UserWarning, match=r"column.* unknown_column"):
        schema.add_from_df(with_extra_column)
    assert schema.df.equals(expected_df)
# --------------------------------------------------------------------------------