├── .dockerignore
├── .dvc
    ├── .gitignore
    ├── config
    └── plots
    │   ├── confusion.json
    │   ├── default.json
    │   ├── scatter.json
    │   └── smooth.json
├── .dvcignore
├── .env.example
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug-report.md
    │   ├── documentation.md
    │   ├── feature-request.md
    │   └── other-questions-help.md
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   └── ci.yaml
├── .gitignore
├── .mypy.ini
├── .readthedocs.yml
├── AUTHORS.md
├── CONTRIBUTING.md
├── COPYING
├── COPYING.LESSER
├── README.md
├── benchmarks
    ├── conftest.py
    ├── test_benchmark_insert.py
    └── test_benchmark_servers.py
├── data_and_models
    ├── annotations
    │   ├── README.md
    │   ├── ner
    │   │   ├── .gitignore
    │   │   ├── README.md
    │   │   ├── analyze.py
    │   │   ├── annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl.dvc
    │   │   ├── annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl.dvc
    │   │   ├── annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl.dvc
    │   │   ├── annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl.dvc
    │   │   ├── annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl.dvc
    │   │   ├── annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl.dvc
    │   │   ├── annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl.dvc
    │   │   ├── annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl.dvc
    │   │   ├── annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl.dvc
    │   │   ├── annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl.dvc
    │   │   ├── annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl.dvc
    │   │   ├── annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl.dvc
    │   │   ├── annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl.dvc
    │   │   ├── annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl.dvc
    │   │   ├── annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl.dvc
    │   │   ├── patterns
    │   │   │   ├── .gitignore
    │   │   │   ├── README.md
    │   │   │   ├── pathway_patterns.jsonl.dvc
    │   │   │   └── patterns.jsonl.dvc
    │   │   └── rule_based_patterns.jsonl.dvc
    │   └── sentence_embedding
    │   │   ├── .gitignore
    │   │   ├── README.md
    │   │   ├── cord19_v47_sentences_pre.txt.dvc
    │   │   └── sentence_similarity_cord19.csv.dvc
    ├── metrics
    │   ├── ner
    │   │   ├── cell_compartment.json
    │   │   ├── cell_type.json
    │   │   ├── chemical.json
    │   │   ├── disease.json
    │   │   ├── drug.json
    │   │   ├── interrater
    │   │   │   ├── cell_compartment.json
    │   │   │   ├── cell_type.json
    │   │   │   ├── chemical.json
    │   │   │   ├── condition.json
    │   │   │   ├── disease.json
    │   │   │   ├── drug.json
    │   │   │   ├── organ.json
    │   │   │   ├── organism.json
    │   │   │   ├── pathway.json
    │   │   │   └── protein.json
    │   │   ├── organ.json
    │   │   ├── organism.json
    │   │   ├── pathway.json
    │   │   └── protein.json
    │   └── sentence_embedding
    │   │   ├── .gitignore
    │   │   ├── biobert_nli_sts.json
    │   │   ├── biobert_nli_sts_cord19_v1.json
    │   │   ├── count.json
    │   │   ├── sbert.json
    │   │   ├── sbiobert.json
    │   │   └── tf_idf.json
    ├── models
    │   ├── language_modeling
    │   │   ├── .gitignore
    │   │   └── biobert_cord19_v1.dvc
    │   ├── ner
    │   │   └── .gitignore
    │   ├── ner_er
    │   │   └── .gitignore
    │   └── sentence_embedding
    │   │   ├── .gitignore
    │   │   └── biobert_nli_sts_cord19_v1.dvc
    ├── pipelines
    │   ├── README.md
    │   ├── ner
    │   │   ├── Dockerfile
    │   │   ├── add_er.py
    │   │   ├── clean.py
    │   │   ├── config.cfg
    │   │   ├── dvc.lock
    │   │   ├── dvc.yaml
    │   │   ├── eval_ner.py
    │   │   ├── interrater.py
    │   │   ├── params.yaml
    │   │   ├── preprocess.py
    │   │   └── transformers_vs_spacy
    │   │   │   ├── requirements.txt
    │   │   │   ├── spacy
    │   │   │       ├── .gitignore
    │   │   │       ├── compare_tokens.py
    │   │   │       ├── eval.sh
    │   │   │       └── eval_spacy.py
    │   │   │   └── transformers
    │   │   │       ├── .gitignore
    │   │   │       ├── 0_prepare_data.sh
    │   │   │       ├── 1_run_transformers_ner.py
    │   │   │       ├── 1_run_transformers_ner.sh
    │   │   │       ├── 2_eval_pred.py
    │   │   │       ├── 2_eval_pred.sh
    │   │   │       ├── 3_compare_tokens.py
    │   │   │       ├── create_pickle.py
    │   │   │       ├── francesco_script.py
    │   │   │       └── our_bert_classifier.py
    │   ├── relation_extraction
    │   │   ├── README.md
    │   │   └── convert_chemprot_fmt.py
    │   └── sentence_embedding
    │   │   ├── .gitignore
    │   │   ├── Dockerfile
    │   │   ├── dvc.lock
    │   │   ├── dvc.yaml
    │   │   ├── eval_se.py
    │   │   ├── params.yaml
    │   │   ├── train.py
    │   │   └── training_transformers
    │   │       ├── .gitignore
    │   │       ├── biosses_sentences.txt.dvc
    │   │       ├── fine_tune.py
    │   │       ├── sentences-filtered_11-527-877.txt.dvc
    │   │       └── train.py
    └── raw_sentences
    │   ├── .gitignore
    │   ├── README.md
    │   ├── raw1_2020-06-10_cord19_TestSet.jsonl.dvc
    │   ├── raw2_2020-06-29_cord19_Disease.jsonl.dvc
    │   ├── raw3_2020-06-30_cord19_Disease.jsonl.dvc
    │   ├── raw4_2020-07-02_cord19_ChemicalOrganism.jsonl.dvc
    │   ├── raw5_2020-07-08_cord19_Drug_TestSet.jsonl.dvc
    │   ├── raw6_2020-07-08_cord19_CelltypeProtein.jsonl.dvc
    │   ├── raw7_2020-09-01_cord19v35_CellCompartment.jsonl.dvc
    │   ├── raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl.dvc
    │   └── raw9_2020-09-02_cord19v35_Pathway.jsonl.dvc
├── docker-compose.yml
├── docker
    ├── base.Dockerfile
    ├── corenlp.Dockerfile
    ├── embedding.Dockerfile
    ├── grobid_quantities.Dockerfile
    ├── mining.Dockerfile
    ├── mining.sh
    ├── mining_cache.Dockerfile
    ├── mining_cache.sh
    ├── mysql-make-backup
    ├── mysql.Dockerfile
    ├── search.Dockerfile
    └── utils.sh
├── docs
    ├── Makefile
    ├── _static
    │   └── .keep
    ├── conf.py
    ├── index.rst
    └── source
    │   ├── _substitutions.rst
    │   ├── api
    │       ├── bluesearch.database.article.rst
    │       ├── bluesearch.database.cord_19.rst
    │       ├── bluesearch.database.download.rst
    │       ├── bluesearch.database.mesh.rst
    │       ├── bluesearch.database.mining_cache.rst
    │       ├── bluesearch.database.pdf.rst
    │       ├── bluesearch.database.rst
    │       ├── bluesearch.database.topic.rst
    │       ├── bluesearch.database.topic_info.rst
    │       ├── bluesearch.database.topic_rule.rst
    │       ├── bluesearch.embedding_models.rst
    │       ├── bluesearch.entrypoint.create_database.rst
    │       ├── bluesearch.entrypoint.database.add.rst
    │       ├── bluesearch.entrypoint.database.add_es.rst
    │       ├── bluesearch.entrypoint.database.convert_pdf.rst
    │       ├── bluesearch.entrypoint.database.download.rst
    │       ├── bluesearch.entrypoint.database.init.rst
    │       ├── bluesearch.entrypoint.database.parent.rst
    │       ├── bluesearch.entrypoint.database.parse.rst
    │       ├── bluesearch.entrypoint.database.parse_mesh_rdf.rst
    │       ├── bluesearch.entrypoint.database.rst
    │       ├── bluesearch.entrypoint.database.run.rst
    │       ├── bluesearch.entrypoint.database.schemas.rst
    │       ├── bluesearch.entrypoint.database.topic_extract.rst
    │       ├── bluesearch.entrypoint.database.topic_filter.rst
    │       ├── bluesearch.entrypoint.embedding_server.rst
    │       ├── bluesearch.entrypoint.embeddings.rst
    │       ├── bluesearch.entrypoint.mining_cache.rst
    │       ├── bluesearch.entrypoint.mining_server.rst
    │       ├── bluesearch.entrypoint.rst
    │       ├── bluesearch.entrypoint.search_server.rst
    │       ├── bluesearch.k8s.connect.rst
    │       ├── bluesearch.k8s.create_indices.rst
    │       ├── bluesearch.k8s.rst
    │       ├── bluesearch.mining.attribute.rst
    │       ├── bluesearch.mining.entity.rst
    │       ├── bluesearch.mining.eval.rst
    │       ├── bluesearch.mining.pipeline.rst
    │       ├── bluesearch.mining.relation.rst
    │       ├── bluesearch.mining.rst
    │       ├── bluesearch.rst
    │       ├── bluesearch.search.rst
    │       ├── bluesearch.server.embedding_server.rst
    │       ├── bluesearch.server.invalid_usage_exception.rst
    │       ├── bluesearch.server.mining_server.rst
    │       ├── bluesearch.server.rst
    │       ├── bluesearch.server.search_server.rst
    │       ├── bluesearch.sql.rst
    │       ├── bluesearch.utils.rst
    │       ├── bluesearch.widgets.article_saver.rst
    │       ├── bluesearch.widgets.mining_schema.rst
    │       ├── bluesearch.widgets.mining_widget.rst
    │       ├── bluesearch.widgets.rst
    │       └── bluesearch.widgets.search_widget.rst
    │   ├── entrypoint.rst
    │   ├── example.rst
    │   ├── faq.rst
    │   ├── instructions.rst
    │   ├── logo
    │       └── BlueBrainSearch_banner.jpg
    │   ├── server.rst
    │   └── whatsnew.rst
├── luigi.cfg
├── notebooks
    ├── STS_evaluation.ipynb
    ├── create_indices.ipynb
    └── demo_attribute_extraction.ipynb
├── pyproject.toml
├── requirements-data_and_models.txt
├── requirements-dev.txt
├── requirements.txt
├── screenshots
    ├── mining_widget_articles.png
    ├── mining_widget_text.png
    └── search_widget.png
├── setup.py
├── src
    └── bluesearch
    │   ├── __init__.py
    │   ├── _css
    │       ├── __init__.py
    │       ├── style.py
    │       └── stylesheet.css
    │   ├── database
    │       ├── __init__.py
    │       ├── article.py
    │       ├── cord_19.py
    │       ├── download.py
    │       ├── mesh.py
    │       ├── mining_cache.py
    │       ├── pdf.py
    │       ├── topic.py
    │       ├── topic_info.py
    │       └── topic_rule.py
    │   ├── embedding_models.py
    │   ├── entrypoint
    │       ├── __init__.py
    │       ├── _helper.py
    │       ├── create_database.py
    │       ├── database
    │       │   ├── __init__.py
    │       │   ├── add.py
    │       │   ├── add_es.py
    │       │   ├── convert_pdf.py
    │       │   ├── download.py
    │       │   ├── init.py
    │       │   ├── parent.py
    │       │   ├── parse.py
    │       │   ├── parse_mesh_rdf.py
    │       │   ├── run.py
    │       │   ├── schemas.py
    │       │   ├── topic_extract.py
    │       │   └── topic_filter.py
    │       ├── embedding_server.py
    │       ├── embeddings.py
    │       ├── mining_cache.py
    │       ├── mining_server.py
    │       └── search_server.py
    │   ├── k8s
    │       ├── __init__.py
    │       ├── connect.py
    │       └── create_indices.py
    │   ├── mining
    │       ├── __init__.py
    │       ├── attribute.py
    │       ├── entity.py
    │       ├── eval.py
    │       ├── pipeline.py
    │       └── relation.py
    │   ├── py.typed
    │   ├── search.py
    │   ├── server
    │       ├── __init__.py
    │       ├── embedding_server.py
    │       ├── invalid_usage_exception.py
    │       ├── mining_server.py
    │       └── search_server.py
    │   ├── sql.py
    │   ├── utils.py
    │   └── widgets
    │       ├── __init__.py
    │       ├── article_saver.py
    │       ├── mining_schema.py
    │       ├── mining_widget.py
    │       └── search_widget.py
├── tests
    ├── conftest.py
    ├── data
    │   ├── 1411.7903v4.xml
    │   ├── CORD19_samples
    │   │   ├── biorxiv_medrxiv
    │   │   │   └── biorxiv_medrxiv
    │   │   │   │   └── pdf_json
    │   │   │   │       ├── 9ae476404f7ef1ec1ede965f0b898f31a5bf5a81.json
    │   │   │   │       └── b52e0f732cefa36aae4d45ebc13208fba190b5af.json
    │   │   ├── comm_use_subset
    │   │   │   └── comm_use_subset
    │   │   │   │   ├── pdf_json
    │   │   │   │       └── 820acf55c4e52411482f6eb44360ffa35288b89a.json
    │   │   │   │   └── pmc_json
    │   │   │   │       └── PMC5878846.xml.json
    │   │   ├── custom_license
    │   │   │   └── custom_license
    │   │   │   │   ├── pdf_json
    │   │   │   │       ├── bd21184623ceed45f1cede4066b540ff330ccb63.json
    │   │   │   │       └── be602928156cf0ace9899c1c8569eb4f4ea4597b.json
    │   │   │   │   └── pmc_json
    │   │   │   │       ├── PMC3396214.xml.json
    │   │   │   │       └── PMC6863268.xml.json
    │   │   ├── metadata.csv
    │   │   └── noncomm_use_subset
    │   │   │   └── noncomm_use_subset
    │   │   │       ├── pdf_json
    │   │   │           └── 67a52569919632f4bf58782538ff24838ac7f26c.json
    │   │   │       └── pmc_json
    │   │   │           └── PMC3863901.xml.json
    │   ├── arxiv_api_response.xml
    │   ├── biorxiv.xml
    │   ├── cord19_v35
    │   │   ├── document_parses
    │   │   │   ├── pdf_json
    │   │   │   │   ├── 16e82ce0e0c8a1b36497afc0d4392b4fe21eb174.json
    │   │   │   │   └── 5f267fa1ef3a65e239aa974329e935a4d93dafd2.json
    │   │   │   └── pmc_json
    │   │   │   │   ├── PMC7140272.xml.json
    │   │   │   │   ├── PMC7186928.xml.json
    │   │   │   │   └── PMC7223769.xml.json
    │   │   └── metadata.csv
    │   ├── efetchpubmed_response.txt
    │   ├── jats_article.xml
    │   ├── mining
    │   │   ├── eval
    │   │   │   ├── iob_punctuation_after.csv
    │   │   │   ├── iob_punctuation_before.csv
    │   │   │   └── ner_iob_sample.csv
    │   │   └── request
    │   │   │   └── request.csv
    │   ├── nlmcatalog_response.txt
    │   ├── pubmed_article.xml
    │   ├── pubmed_article_minimal.xml
    │   ├── pubmed_articles.xml
    │   └── pubmed_download_index.html
    ├── integration
    │   └── test_bbs_database.py
    └── unit
    │   ├── database
    │       ├── test_article.py
    │       ├── test_cord_19.py
    │       ├── test_download.py
    │       ├── test_mesh.py
    │       ├── test_mining_cache.py
    │       ├── test_pdf.py
    │       ├── test_topic.py
    │       ├── test_topic_info.py
    │       └── test_topic_rule.py
    │   ├── entrypoint
    │       ├── __init__.py
    │       ├── database
    │       │   ├── __init__.py
    │       │   ├── test_add.py
    │       │   ├── test_add_es.py
    │       │   ├── test_convert_pdf.py
    │       │   ├── test_download.py
    │       │   ├── test_init.py
    │       │   ├── test_parent.py
    │       │   ├── test_parse.py
    │       │   ├── test_parse_mesh_rdf.py
    │       │   ├── test_run.py
    │       │   ├── test_topic_extract.py
    │       │   └── test_topic_filter.py
    │       ├── test__helper.py
    │       ├── test_create_database.py
    │       ├── test_create_mining_cache.py
    │       ├── test_embedding_server.py
    │       ├── test_embeddings.py
    │       ├── test_entrypoint_installation.py
    │       ├── test_mining_server.py
    │       └── test_search_sever.py
    │   ├── k8s
    │       └── test_create_indices.py
    │   ├── mining
    │       ├── test_attribute.py
    │       ├── test_entity.py
    │       ├── test_eval.py
    │       ├── test_pipeline.py
    │       └── test_relation.py
    │   ├── server
    │       ├── __init__.py
    │       ├── test_embedding_server.py
    │       ├── test_mining_server.py
    │       └── test_search_server.py
    │   ├── test_embedding_models.py
    │   ├── test_fixtures.py
    │   ├── test_search.py
    │   ├── test_sql.py
    │   ├── test_utils.py
    │   └── widgets
    │       ├── test_article_saver.py
    │       ├── test_mining_schema.py
    │       ├── test_mining_widget.py
    │       └── test_search_widget.py
└── tox.ini


/.dockerignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /.tox/
19 | /assets/
20 | /data/
21 | /docs/
22 | /htmlcov/
23 | /notebooks/
24 | /sandbox/
25 | /tests/
26 | *.egg-info/
27 | *.egg
28 | .env*
29 | !.env*.example
30 | /.dvc/cache/
31 | /.dvc/config.local
32 | 


--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /config.local
19 | /tmp
20 | /cache
21 | 


--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
1 | [core]
2 |     remote = gpfs_ssh
3 | ['remote "gpfs_ssh"']
4 |     url = ssh://bbpv1.bbp.epfl.ch/gpfs/bbp.cscs.ch/data/project/proj115/dvc_remote_storage/
5 | 


--------------------------------------------------------------------------------
/.dvc/plots/confusion.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "rect",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "nominal",
12 |             "sort": "ascending",
13 |             "title": "<DVC_METRIC_X_LABEL>"
14 |         },
15 |         "y": {
16 |             "field": "<DVC_METRIC_Y>",
17 |             "type": "nominal",
18 |             "sort": "ascending",
19 |             "title": "<DVC_METRIC_Y_LABEL>"
20 |         },
21 |         "color": {
22 |             "aggregate": "count",
23 |             "type": "quantitative"
24 |         },
25 |         "facet": {
26 |             "field": "rev",
27 |             "type": "nominal"
28 |         }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/.dvc/plots/default.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/.dvc/plots/scatter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "point",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "quantitative",
12 |             "title": "<DVC_METRIC_X_LABEL>"
13 |         },
14 |         "y": {
15 |             "field": "<DVC_METRIC_Y>",
16 |             "type": "quantitative",
17 |             "title": "<DVC_METRIC_Y_LABEL>",
18 |             "scale": {
19 |                 "zero": false
20 |             }
21 |         },
22 |         "color": {
23 |             "field": "rev",
24 |             "type": "nominal"
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/.dvc/plots/smooth.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     },
29 |     "transform": [
30 |         {
31 |             "loess": "<DVC_METRIC_Y>",
32 |             "on": "<DVC_METRIC_X>",
33 |             "groupby": [
34 |                 "rev"
35 |             ],
36 |             "bandwidth": 0.3
37 |         }
38 |     ]
39 | }
40 | 


--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
 1 | # Add patterns of files dvc should ignore, which could improve
 2 | # the performance. Learn more at
 3 | # https://dvc.org/doc/user-guide/dvcignore
 4 | 
 5 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 6 | #
 7 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 8 | #
 9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU Lesser General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 | # GNU Lesser General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU Lesser General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F41B Bug Report"
 3 | about: Submit a report to help us reproduce and fix the bug
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## 🐛 Bug description
11 | 
12 | A clear and concise description of what the bug is.
13 | 
14 | 
15 | ## To reproduce
16 | 
17 | Steps to reproduce the behavior:
18 | 
19 | 1.
20 | 2.
21 | 3.
22 | 
23 | If you have a code sample, error messages, or stack traces, please provide them here as well.
24 | 
25 | 
26 | ## Expected behavior
27 | 
28 | A clear and concise description of what you expected to happen. 
29 | 
30 | 
31 | 
32 | ## Environment
33 | 
34 | Please provide the following information.
35 | 
36 |  - Blue Brain Search version (use `python -c "import bluesearch; print(bluesearch.__version__)"`):
37 |  - OS (e.g., Linux):
38 |  - How you installed Blue Brain Search (source, pip, ...):
39 |  - Installation command you used (if compiling from source):
40 |  - Python version (use `python -V`):
41 |  - Any other relevant information:
42 | 
43 | ## Additional context
44 | 
45 | Add any other screenshot, context, or information about the problem here.
46 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F4DA Documentation"
 3 | about: Report an issue related to the docs
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | A clear and concise description of which content in the docs is incorrect, unclear, or missing.
11 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F680 Feature Request"
 3 | about: Submit a request for a new feature
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## 🚀 Feature
11 | 
12 | A clear and concise description of the feature proposal.
13 | 
14 | 
15 | ## Motivation
16 | 
17 | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
18 | 
19 | 
20 | ## Pitch
21 | 
22 | A clear description of what you want to happen.
23 | 
24 | 
25 | ## Alternatives
26 | 
27 | Think about any alternative solutions or features that could be used.
28 | Then, write here a clear list of alternatives.
29 | 
30 | 
31 | ## Additional context
32 | Add any other screenshots, context, or information about the feature request here.
33 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/other-questions-help.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "❓ Other Questions / Help"
 3 | about: Do you need help or have other questions?
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | Please refer to our docs first.
11 | 
12 | If you have a question or need help with something that is not covered in our docs, write it here in a clear, concise, and actionable way.
13 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | Fixes #{issue-id-number}.
 2 | 
 3 | ## Description
 4 | 
 5 | Please provide here a summary of the changes introduced by this PR.
 6 | 
 7 | ## How to test?
 8 | 
 9 | Please provide here instructions on how to test the changes introduced by this PR.
10 | (if some changes cannot be tested by automated tests)
11 | 
12 | ## Checklist
13 | 
14 | - [ ] This PR refers to an issue from the [issue tracker](https://github.com/BlueBrain/Search/issues).
15 |   (if it is not the case, please create an issue first).
16 | - [ ] Unit tests added.
17 |   (if needed)
18 | - [ ] Documentation and `whatsnew.rst` updated.
19 |   (if needed)
20 | - [ ] `setup.py` and `requirements.txt` updated with new dependencies.
21 |   (if needed)
22 | - [ ] Type annotations added.
23 |   (if a function is added or modified)
24 | - [ ] All CI tests pass. 
25 | 


--------------------------------------------------------------------------------
/.mypy.ini:
--------------------------------------------------------------------------------
 1 | ;Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | ;
 3 | ;Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | ;
 5 | ;This program is free software: you can redistribute it and/or modify
 6 | ;it under the terms of the GNU Lesser General Public License as published by
 7 | ;the Free Software Foundation, either version 3 of the License, or
 8 | ;(at your option) any later version.
 9 | ;
10 | ;This program is distributed in the hope that it will be useful,
11 | ;but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | ;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | ;GNU Lesser General Public License for more details.
14 | ;
15 | ;You should have received a copy of the GNU Lesser General Public License
16 | ;along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | [mypy]
19 | ignore_missing_imports = True
20 | no_implicit_optional = True
21 | check_untyped_defs = True
22 | strict_equality = True
23 | warn_redundant_casts = True
24 | warn_unused_ignores = True
25 | show_error_codes = True
26 | plugins = sqlmypy
27 | exclude = benchmarks/conftest.py|data_and_models/pipelines/ner/transformers_vs_spacy/transformers/|data_and_models/pipelines/sentence_embedding/training_transformers/
28 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | formats: []
 4 | 
 5 | sphinx:
 6 |   builder: html
 7 |   configuration: docs/conf.py
 8 | 
 9 | build:
10 |     image: "6.0"
11 | 
12 | python:
13 |   version: 3.7
14 |   install:
15 |     - method: pip
16 |       path: .
17 |       extra_requirements:
18 |         - dev
19 |   system_packages: true
20 | 


--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | - [Francesco Casalegno](https://github.com/FrancescoCasalegno) @ Blue Brain Project, EPFL
2 | - [Emilie Delattre](https://github.com/EmilieDel) @ Blue Brain Project, EPFL
3 | - [Pierre-Alexandre Fonta](https://github.com/pafonta) @ Blue Brain Project, EPFL
4 | - [Jan Krepl](https://github.com/jankrepl) @ Blue Brain Project, EPFL
5 | - [Stanislav Schmidt](https://github.com/Stannislav) @ Blue Brain Project, EPFL
6 | - [Anıl Tuncel](https://github.com/anilbey) @ Blue Brain Project, EPFL


--------------------------------------------------------------------------------
/benchmarks/conftest.py:
--------------------------------------------------------------------------------
 1 | """Configuration of pytest benchmarks."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pytest
21 | 
22 | 
def pytest_addoption(parser):
    """Register the command-line options that carry the server URIs.

    Parameters
    ----------
    parser
        Pytest command-line parser to which the options are added.
    """
    # (option flag, help text) pairs; every option defaults to an empty string.
    server_options = (
        ("--embedding_server", "Embedding server URI"),
        ("--mining_server", "Mining server URI"),
        ("--mysql_server", "MySQL server URI"),
        ("--search_server", "Search server URI"),
    )
    for flag, description in server_options:
        parser.addoption(flag, default="", help=description)
28 | 
29 | 
@pytest.fixture(scope="session")
def benchmark_parameters(request):
    """Collect the server URIs given on the command line.

    Parameters
    ----------
    request
        Pytest request object; its ``config`` is queried for the options.

    Returns
    -------
    dict
        Maps each server name to the value of the matching ``--<name>``
        command-line option.
    """
    server_names = (
        "embedding_server",
        "mining_server",
        "mysql_server",
        "search_server",
    )
    config = request.config
    return {name: config.getoption(f"--{name}") for name in server_names}
38 | 


--------------------------------------------------------------------------------
/benchmarks/test_benchmark_insert.py:
--------------------------------------------------------------------------------
 1 | """Benchmark INSERT operations through Pandas with and without transactions."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import numpy as np
21 | import pandas as pd
22 | import pytest as pt
23 | import sqlalchemy
24 | 
25 | PORT = 9731  # Port on localhost of the MySQL server that the `engine` fixture connects to.
26 | 
27 | 
@pt.fixture
def data():
    """Build the data frame that the benchmarks insert.

    Returns
    -------
    pd.DataFrame
        A frame with a single column named ``column`` holding 100000 random
        integers. The generator is seeded with a fixed value, so every run
        produces the same rows.
    """
    n_rows = 100000
    generator = np.random.default_rng(1739)
    values = generator.integers(10, size=n_rows)
    frame = pd.DataFrame({"column": values})
    return frame
33 | 
34 | 
@pt.fixture
def engine():
    """Provide a SQLAlchemy engine for the local benchmark database.

    The engine targets the ``benchmarks`` MySQL database on
    ``localhost:PORT`` through the ``pymysql`` driver.

    Yields
    ------
    sqlalchemy.engine.Engine
        The engine handed to the benchmark. Its connection pool is disposed
        of once the test using the fixture has finished.
    """
    db_engine = sqlalchemy.create_engine(
        f"mysql+pymysql://root:root@localhost:{PORT}/benchmarks"
    )
    yield db_engine
    # A new engine is created for every test; close its pooled connections
    # so that they do not stay open until garbage collection.
    db_engine.dispose()
40 | 
41 | 
def insert_without_transactions(data, engine):
    """Append ``data`` to the ``without`` table, writing through the engine.

    No transaction is opened explicitly here: the engine itself is handed
    to ``to_sql``.

    Parameters
    ----------
    data
        Object exposing ``to_sql`` (the tests pass a ``pd.DataFrame``).
    engine
        Database engine given to ``to_sql`` as the connection.
    """
    table_name = "without"
    write_options = {"if_exists": "append", "index": False}
    data.to_sql(table_name, engine, **write_options)
44 | 
45 | 
def insert_with_transactions(data, engine):
    """Append ``data`` to the ``with`` table inside ``engine.begin()``.

    Parameters
    ----------
    data
        Object exposing ``to_sql`` (the tests pass a ``pd.DataFrame``).
    engine
        Database engine; the object produced by entering ``engine.begin()``
        is what ``to_sql`` writes through.
    """
    table_name = "with"
    transaction = engine.begin()
    with transaction as connection:
        data.to_sql(table_name, connection, if_exists="append", index=False)
49 | 
50 | 
def test_insert_without_transactions(benchmark, data, engine):
    """Benchmark inserting ``data`` without an explicit transaction."""
    target = insert_without_transactions
    benchmark(target, data, engine)
53 | 
54 | 
def test_insert_with_transactions(benchmark, data, engine):
    """Benchmark inserting ``data`` inside an explicit transaction."""
    target = insert_with_transactions
    benchmark(target, data, engine)
57 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases.
 3 | 
 4 | Copyright (C) 2020  Blue Brain Project, EPFL.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Lesser General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Lesser General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Lesser General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | -->
19 | 
20 | # Description
21 | This directory contains collections of annotations that can be used 
22 | for training or evaluating NLP models.
23 | The raw sentences, without annotations, can be found in the 
24 | directory `raw_sentences/`.
25 | 
26 | # Content
27 | ## `ner/`
28 | - Annotations collected with `prodigy` in order to train or evaluate NER models.
29 | 
30 | ## `sentence_embedding/`
31 | - Annotations collected in order to train or evaluate sentence embedding models. 
32 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl
19 | /annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl
20 | /annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl
21 | /annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl
22 | /annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl
23 | /annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl
24 | /annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl
25 | /annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl
26 | /annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl
27 | /annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl
28 | /annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl
29 | /annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl
30 | /annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl
31 | /annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl
32 | /annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl
33 | /rule_based_patterns.jsonl
34 | /annotations_cell_compartment.jsonl
35 | /annotations_cell_type.jsonl
36 | /annotations_chemical.jsonl
37 | /annotations_disease.jsonl
38 | /annotations_drug.jsonl
39 | /annotations_organ.jsonl
40 | /annotations_organism.jsonl
41 | /annotations_pathway.jsonl
42 | /annotations_protein.jsonl
43 | /annotations_cell_compartment.dev.spacy
44 | /annotations_cell_compartment.train.spacy
45 | /annotations_cell_type.dev.spacy
46 | /annotations_cell_type.train.spacy
47 | /annotations_chemical.dev.spacy
48 | /annotations_chemical.train.spacy
49 | /annotations_disease.dev.spacy
50 | /annotations_disease.train.spacy
51 | /annotations_drug.dev.spacy
52 | /annotations_drug.train.spacy
53 | /annotations_organ.dev.spacy
54 | /annotations_organ.train.spacy
55 | /annotations_organism.dev.spacy
56 | /annotations_organism.train.spacy
57 | /annotations_pathway.dev.spacy
58 | /annotations_pathway.train.spacy
59 | /annotations_protein.dev.spacy
60 | /annotations_protein.train.spacy
61 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 563441b77b5c39063cda3fa0fb03803c
3 |   path: annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 735cba4532a4c2c5e928399eafc1000f
3 |   path: annotations11_CharlotteLorin_2020-08-28_raw1_10EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 117da032ef2e2792429dff88c06c90b7
3 |   path: annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 6e248863604ce14861f38e7c9a8281bb
3 |   path: annotations13_CharlotteLorin_2020-09-02_raw7_10EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 2459e49418599ff96ab9db60a29b757b
3 |   path: annotations14_EmmanuelleLogette_2020-09-02_raw8_CellCompartmentDrugOrgan.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: ceb6ea77d2a6a69962b88218f4f5a663
3 |   path: annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 58e2ab00caf7a42e958755ec8d4bb999
3 |   path: annotations1_EmmanuelleLogette_2020-06-19_raw1_8FirstLabels.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 17f2d3dec1ef70f75973ad07e533efe1
3 |   path: annotations2_CharlotteLorin_2020-06-19_8FirstLabels.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: c4919fd5fbc48a8b75019e18517d5842
3 |   path: annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 6e371a67674af1fd7a1cc1243c107b4a
3 |   path: annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 613308ed830d86284a1aee2747d911d0
3 |   path: annotations5_EmmanuelleLogette_2020-06-30_raw2_Disease.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 4b6fff4b0091fa10ae7c88a4aeb42ae0
3 |   path: annotations6_EmmanuelleLogette_2020-07-07_raw4_TaxonChebi.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: b26a0e08e2265b53f625a48f1e1da10d
3 |   path: annotations7_EmmanuelleLogette_2020-07-06_raw1_9EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 36d574da83a1b52b61016b7418494272
3 |   path: annotations8_EmmanuelleLogette_2020-07-08_raw5_9EntityTypes.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: ec5d6b86181b9e4cdefb2b2198d5ae4f
3 |   path: annotations9_EmmanuelleLogette_2020-07-08_raw6_CelltypeProtein.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/patterns/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /patterns.jsonl
19 | /pathway_patterns.jsonl
20 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/patterns/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases.
 3 | 
 4 | Copyright (C) 2020  Blue Brain Project, EPFL.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Lesser General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Lesser General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Lesser General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | -->
19 | 
20 | # Description
21 | - These pattern files are sometimes used to train NER models to provide a first guess.
22 | - This is particularly necessary when no base model (e.g. a SciSpaCy model) can be found to provide good first
23 |  guesses for the entity type of interest.
24 | 
25 | # Content
26 | ## `patterns/patterns.jsonl`
27 | - Contains all entities that Emmanuelle identified in Ontology v3 (the file is then
28 | used to pre-annotate those entities in the prodigy GUI).
29 | 
30 | ## `patterns/pathway_patterns.jsonl`
31 | - Contains a list of entities that Emmanuelle considers as a good starting point
32 | for the entity type PATHWAY. 
33 | - The file was generated using `prodigy terms.teach`.
34 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/patterns/pathway_patterns.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 75f7ba9b965cef0aab2aaaa434fa9e2d
3 |   path: pathway_patterns.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/patterns/patterns.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 01ea4545220f5b09a4d55cf873ff22cb
3 |   path: patterns.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/ner/rule_based_patterns.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 044c1a326c2472aa4eb72c4c98a7400b
3 |   size: 1709
4 |   path: rule_based_patterns.jsonl
5 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/sentence_embedding/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /sentence_similarity_cord19.csv
19 | /cord19_v47_sentences_pre.txt
20 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/sentence_embedding/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases.
 3 | 
 4 | Copyright (C) 2020  Blue Brain Project, EPFL.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Lesser General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Lesser General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Lesser General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | -->
19 | 
20 | # Description
21 | - Annotations collected in order to train or evaluate sentence embedding models. 
22 | 
23 | # Content
24 | 
25 | ## `cord19_v47_sentences_pre.txt`
26 | - Unannotated file of sentences (one line per sentence) from CORD-19.
27 | - 20,510,932 total sentences.
28 | - Can be used to train unsupervised NLP models.
29 | 
30 | ## `sentence_similarity_cord19.csv`
31 | - Sentence pairs with similarity scores annotated by Emmanuelle Logette.
32 | - 40 sentence pairs in total:
33 |   - 20 pairs (those with `sentence_id` starting with `A-`) are generically
34 |   extracted from the CORD-19 dataset.
35 |   - 20 pairs (those with `sentence_id` starting with `B-`) are also extracted from
36 |   the CORD-19 dataset but are focused on the "COVID-19" and "glucose" topics.
37 | - The scoring system is the one used in Soğancıoğlu G. et al. "BIOSSES: a semantic sentence
38 |  similarity estimation system for the biomedical domain." Bioinformatics 33.14 (2017): i49-i58.
39 | 	
40 | | Score | Comment |
41 | | --- | --- |
42 | | 0 | The two sentences are on different topics. |
43 | | 1 | The two sentences are not equivalent, but are on the same topic. |
44 | | 2 | The two sentences are not equivalent, but share some details. |
45 | | 3 | The two sentences are roughly equivalent, but some important information differs/missing. |
46 | | 4 | The two sentences are completely or mostly equivalent, as they mean the same thing. |
47 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/sentence_embedding/cord19_v47_sentences_pre.txt.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: d0c68b698738f714df81eb5fb29236fe
3 |   path: cord19_v47_sentences_pre.txt
4 | 


--------------------------------------------------------------------------------
/data_and_models/annotations/sentence_embedding/sentence_similarity_cord19.csv.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 92a4235ce8d292ff382ce008c31da45c
3 |   path: sentence_similarity_cord19.csv
4 | 


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/cell_compartment.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6666666666666666, "entity_recall": 0.8571428571428571, "entity_f1-score": 0.75, "entity_support": 42, "token_precision": 0.6857142857142857, "token_recall": 0.9411764705882353, "token_f1-score": 0.7933884297520661, "token_support": 51}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/cell_type.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6363636363636364, "entity_recall": 0.7857142857142857, "entity_f1-score": 0.7031963470319634, "entity_support": 98, "token_precision": 0.6475770925110133, "token_recall": 0.8855421686746988, "token_f1-score": 0.7480916030534351, "token_support": 166}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/chemical.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.5, "entity_recall": 0.48175182481751827, "entity_f1-score": 0.49070631970260226, "entity_support": 137, "token_precision": 0.5508982035928144, "token_recall": 0.5542168674698795, "token_f1-score": 0.5525525525525525, "token_support": 166}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/disease.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.693939393939394, "entity_recall": 0.720125786163522, "entity_f1-score": 0.7067901234567902, "entity_support": 318, "token_precision": 0.7504363001745201, "token_recall": 0.7948243992606284, "token_f1-score": 0.7719928186714542, "token_support": 541}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/drug.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6904761904761905, "entity_recall": 0.7631578947368421, "entity_f1-score": 0.725, "entity_support": 76, "token_precision": 0.7058823529411765, "token_recall": 0.7228915662650602, "token_f1-score": 0.7142857142857143, "token_support": 83}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/cell_compartment.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.8125, "entity_recall": 0.6341463414634146, "entity_f1-score": 0.7123287671232876, "entity_support": 41, "token_precision": 0.8297872340425532, "token_recall": 0.78, "token_f1-score": 0.8041237113402062, "token_support": 50}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/cell_type.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.5869565217391305, "entity_recall": 0.8350515463917526, "entity_f1-score": 0.6893617021276596, "entity_support": 97, "token_precision": 0.6512605042016807, "token_recall": 0.9393939393939394, "token_f1-score": 0.7692307692307693, "token_support": 165}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/chemical.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.4913294797687861, "entity_recall": 0.625, "entity_f1-score": 0.5501618122977346, "entity_support": 136, "token_precision": 0.483739837398374, "token_recall": 0.7212121212121212, "token_f1-score": 0.5790754257907542, "token_support": 165}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/condition.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.4928571428571429, "entity_recall": 0.6831683168316832, "entity_f1-score": 0.5726141078838174, "entity_support": 101, "token_precision": 0.4841628959276018, "token_recall": 0.7588652482269503, "token_f1-score": 0.5911602209944752, "token_support": 141}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/disease.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6478494623655914, "entity_recall": 0.8169491525423729, "entity_f1-score": 0.7226386806596702, "entity_support": 295, "token_precision": 0.7301829268292683, "token_recall": 0.9466403162055336, "token_f1-score": 0.8244406196213425, "token_support": 506}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/drug.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.34210526315789475, "entity_recall": 0.48148148148148145, "entity_f1-score": 0.4, "entity_support": 27, "token_precision": 0.30434782608695654, "token_recall": 0.4666666666666667, "token_f1-score": 0.3684210526315789, "token_support": 30}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/organ.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.8243243243243243, "entity_recall": 0.6853932584269663, "entity_f1-score": 0.7484662576687117, "entity_support": 89, "token_precision": 0.8125, "token_recall": 0.7289719626168224, "token_f1-score": 0.7684729064039408, "token_support": 107}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/organism.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.5610687022900763, "entity_recall": 0.7696335078534031, "entity_f1-score": 0.6490066225165563, "entity_support": 191, "token_precision": 0.6412776412776413, "token_recall": 0.8938356164383562, "token_f1-score": 0.7467811158798283, "token_support": 292}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/pathway.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.5575757575757576, "entity_recall": 0.6571428571428571, "entity_f1-score": 0.6032786885245902, "entity_support": 140, "token_precision": 0.5785714285714286, "token_recall": 0.7297297297297297, "token_f1-score": 0.6454183266932271, "token_support": 222}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/interrater/protein.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6317829457364341, "entity_recall": 0.6197718631178707, "entity_f1-score": 0.6257197696737045, "entity_support": 263, "token_precision": 0.6307339449541285, "token_recall": 0.7236842105263158, "token_f1-score": 0.6740196078431373, "token_support": 380}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/organ.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.4896551724137931, "entity_recall": 0.7634408602150538, "entity_f1-score": 0.5966386554621849, "entity_support": 93, "token_precision": 0.5112359550561798, "token_recall": 0.8125, "token_f1-score": 0.6275862068965518, "token_support": 112}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/organism.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6031746031746031, "entity_recall": 0.7307692307692307, "entity_f1-score": 0.6608695652173913, "entity_support": 208, "token_precision": 0.698005698005698, "token_recall": 0.7827476038338658, "token_f1-score": 0.7379518072289156, "token_support": 313}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/pathway.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.6090225563909775, "entity_recall": 0.5510204081632653, "entity_f1-score": 0.5785714285714286, "entity_support": 147, "token_precision": 0.6831683168316832, "token_recall": 0.592274678111588, "token_f1-score": 0.6344827586206897, "token_support": 233}


--------------------------------------------------------------------------------
/data_and_models/metrics/ner/protein.json:
--------------------------------------------------------------------------------
1 | {"entity_precision": 0.48739495798319327, "entity_recall": 0.6444444444444445, "entity_f1-score": 0.5550239234449761, "entity_support": 270, "token_precision": 0.5196078431372549, "token_recall": 0.8112244897959183, "token_f1-score": 0.6334661354581673, "token_support": 392}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /biobert_nli_sts.csv
19 | /biobert_nli_sts.png
20 | /tf_idf.csv
21 | /tf_idf.png
22 | /count.csv
23 | /count.png
24 | /sbert.csv
25 | /sbert.png
26 | /sbiobert.csv
27 | /sbiobert.png
28 | /biobert_nli_sts_cord19_v1.csv
29 | /biobert_nli_sts_cord19_v1.png
30 | 


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/biobert_nli_sts.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.4507476668114661, "pearson_r": 0.5885845851038655, "spearman_rho": 0.5780078495658358}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/biobert_nli_sts_cord19_v1.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.5778437302402729, "pearson_r": 0.7254632318864527, "spearman_rho": 0.7167140825467725}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/count.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.31182871376137494, "pearson_r": 0.4221325079692087, "spearman_rho": 0.4103474240867628}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/sbert.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.28818293451880617, "pearson_r": 0.4174311065388488, "spearman_rho": 0.38012159475593804}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/sbiobert.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.40050038592100756, "pearson_r": 0.5297193358767457, "spearman_rho": 0.5147194625851121}


--------------------------------------------------------------------------------
/data_and_models/metrics/sentence_embedding/tf_idf.json:
--------------------------------------------------------------------------------
1 | {"kendall_tau": 0.37685460667843884, "pearson_r": 0.4973378796506119, "spearman_rho": 0.48742817979126074}


--------------------------------------------------------------------------------
/data_and_models/models/language_modeling/.gitignore:
--------------------------------------------------------------------------------
1 | /biobert_cord19_v1
2 | /biobert_cord19_v1__logs
3 | 


--------------------------------------------------------------------------------
/data_and_models/models/language_modeling/biobert_cord19_v1.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 8ce898d7b0d4920b9312768925a56802.dir
3 |   size: 1300619338
4 |   nfiles: 9
5 |   path: biobert_cord19_v1
6 | 


--------------------------------------------------------------------------------
/data_and_models/models/ner/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /model-cell_compartment
19 | /model-cell_type
20 | /model-chemical
21 | /model-disease
22 | /model-drug
23 | /model-organ
24 | /model-organism
25 | /model-pathway
26 | /model-protein
27 | 


--------------------------------------------------------------------------------
/data_and_models/models/ner_er/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /model-cell_compartment
19 | /model-cell_type
20 | /model-chemical
21 | /model-disease
22 | /model-drug
23 | /model-organ
24 | /model-organism
25 | /model-pathway
26 | /model-protein
27 | 


--------------------------------------------------------------------------------
/data_and_models/models/sentence_embedding/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /tf_idf
19 | /count
20 | /biobert_nli_sts_cord19_v1
21 | 


--------------------------------------------------------------------------------
/data_and_models/models/sentence_embedding/biobert_nli_sts_cord19_v1.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 9dccac418759a8c53f5da608dbd9f835.dir
3 |   size: 433540587
4 |   nfiles: 9
5 |   path: biobert_nli_sts_cord19_v1
6 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases.
 3 | 
 4 | Copyright (C) 2020  Blue Brain Project, EPFL.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Lesser General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Lesser General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Lesser General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | -->
19 | 
20 | # Description
21 | - DVC pipelines to train and evaluate machine learning models. 
22 | 
23 | # Content
24 |  
25 | ## `ner/`
26 | - DVC pipelines to train and evaluate models for Named Entity Recognition.
27 | 
28 | ## `sentence_embedding/`
29 | - DVC pipelines to train and evaluate models for Sentence Embedding.
30 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM continuumio/miniconda3:4.9.2
19 | 
20 | ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/'
21 | ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/'
22 | ENV http_proxy='http://bbpproxy.epfl.ch:80/'
23 | ENV https_proxy='http://bbpproxy.epfl.ch:80/'
24 | 
25 | # Update conda, install additional system packages
26 | RUN true \
27 | 	&& conda update conda \
28 | 	&& apt-get update \
29 | 	&& apt-get install -y gcc g++ build-essential vim libfontconfig1
30 | RUN conda install -c carta mysqlclient
31 | 
32 | # Install Blue Brain Search -- revision can be a branch, sha, or tag
33 | ARG BBS_REVISION=v0.2.0
34 | ADD . /src
35 | WORKDIR /src
36 | RUN git checkout $BBS_REVISION
37 | # remove ruamel-yaml: https://github.com/pypa/pip/issues/5247#issuecomment-381550610
38 | RUN rm -rf /opt/conda/lib/python3.8/site-packages/ruamel*
39 | RUN pip install -r requirements.txt
40 | RUN pip install -r requirements-data_and_models.txt
41 | RUN pip install $PWD[data_and_models]
42 | 
43 | 
44 | EXPOSE 8888
45 | 
46 | RUN groupadd -g 999 docker
47 | RUN useradd --create-home --uid 1000 --gid docker bbsuser
48 | 
49 | WORKDIR /bbs
50 | ENTRYPOINT ["/bin/bash"]
51 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/add_er.py:
--------------------------------------------------------------------------------
 1 | """Append an entity ruler to a spacy pipeline."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pathlib
21 | from argparse import ArgumentParser
22 | 
23 | import spacy
24 | 
25 | from bluesearch.mining.entity import global2model_patterns
26 | from bluesearch.utils import JSONL
27 | 
28 | parser = ArgumentParser()
29 | parser.add_argument(
30 |     "--model",
31 |     required=True,
32 |     type=str,
33 |     help="SpaCy model without an entity ruler.",
34 | )
35 | parser.add_argument(
36 |     "--output_file",
37 |     required=True,
38 |     type=str,
39 |     help="File to which we save the enhanced spacy pipeline.",
40 | )
41 | parser.add_argument(
42 |     "--patterns_file",
43 |     required=True,
44 |     type=str,
45 |     help="Path to the patterns file used for rule-based entity recognition.",
46 | )
47 | args = parser.parse_args()  # parsed at import time; main() reads this module-level namespace
48 | 
49 | 
50 | def main():
51 |     """Load a spaCy NER model, append a pattern-based entity ruler, and save it."""
52 |     # Load the spaCy pipeline given by --model
53 |     ner_model = spacy.load(args.model)
54 | 
55 |     print("Loading patterns")
56 |     path_patterns = pathlib.Path(args.patterns_file)
57 |     patterns = JSONL.load_jsonl(path_patterns)
58 |     _, _, entity_type = args.model.rpartition("-")  # part after the last "-"
59 |     modified_patterns = global2model_patterns(patterns, entity_type.upper())
60 |     er_config = {"validate": True, "overwrite_ents": True}
61 |     er = ner_model.add_pipe("entity_ruler", after="ner", config=er_config)
62 |     er.add_patterns(modified_patterns)
63 | 
64 |     print("Saving model with an entity ruler")
65 |     ner_model.to_disk(args.output_file)
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     main()
70 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/params.yaml:
--------------------------------------------------------------------------------
 1 | entities:
 2 |   - cell_compartment
 3 |   - cell_type
 4 |   - chemical
 5 |   - disease
 6 |   - drug
 7 |   - organ
 8 |   - organism
 9 |   - pathway
10 |   - protein
11 | train:
12 |   corpora:
13 |     dev_size: 0.1  # (float) Proportion. | (int) Number of samples.
14 |     shuffle_seed: 0  # (int) Shuffling applied before the split.
15 | eval:
16 |   disease:
17 |     etype_name: DISEASE
18 |   cell_compartment:
19 |     etype_name: CELL_COMPARTMENT
20 |   drug:
21 |     etype_name: DRUG
22 |   organ:
23 |     etype_name: ORGAN
24 |   chemical:
25 |     etype_name: CHEMICAL
26 |   organism:
27 |     etype_name: ORGANISM
28 |   cell_type:
29 |     etype_name: CELL_TYPE
30 |   protein:
31 |     etype_name: PROTEIN
32 |   pathway:
33 |     etype_name: PATHWAY
34 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | seqeval
19 | datasets >= 1.1.3
20 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/spacy/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | df_test_pred.pkl
19 | pathway_metrics.json
20 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/spacy/eval.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | DVC_BASE="../../../.."  # data_and_models/ as seen from this script's directory
21 | test_data_1="$DVC_BASE/annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl"
22 | test_data_2="$DVC_BASE/annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl"
23 | 
24 | # Evaluate the spaCy pathway model on both test annotation files
25 | python eval_spacy.py \
26 |   --annotation_files "$test_data_1,$test_data_2" \
27 |   --model "$DVC_BASE/models/ner/model-pathway" \
28 |   --output_file "pathway_metrics.json" \
29 |   --etype "PATHWAY"
30 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | checkpoints/
19 | logs/
20 | pathway_metrics.json
21 | pathway_metrics_token.json
22 | pathway_metrics_entity.json
23 | test_data.pkl
24 | train_data.pkl
25 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/0_prepare_data.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | export LANG=C.UTF-8
21 | 
22 | # Annotation files: one for training, two for testing
23 | train_data="../../../../annotations/ner/annotations15_EmmanuelleLogette_2020-09-22_raw9_Pathway.jsonl"
24 | test_data_1="../../../../annotations/ner/annotations10_EmmanuelleLogette_2020-08-28_raw1_raw5_10EntityTypes.jsonl"
25 | test_data_2="../../../../annotations/ner/annotations12_EmmanuelleLogette_2020-08-28_raw7_10EntityTypes.jsonl"
26 | # Write the PATHWAY annotations to intermediate text files
27 | python3 francesco_script.py --annotation-files "$train_data" -o train_data.txt --keep-punctuation --entity-type "PATHWAY"
28 | python3 francesco_script.py --annotation-files "$test_data_1,$test_data_2" -o test_data.txt --keep-punctuation --entity-type "PATHWAY"
29 | # Turn the text files into pickles
30 | python3 create_pickle.py train_data.txt train_data.pkl
31 | python3 create_pickle.py test_data.txt test_data.pkl
32 | # Remove the intermediate text files
33 | rm train_data.txt
34 | rm test_data.txt
35 | 
36 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/1_run_transformers_ner.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # This script is from huggingface/transformers v4.4.0.
 4 | # It is from https://github.com/huggingface/transformers/tree/v4.4.0/examples/token-classification/run.sh.
 5 | 
 6 | # Copyright 2020 The HuggingFace Team. All rights reserved.
 7 | #
 8 | # Licensed under the Apache License, Version 2.0 (the "License");
 9 | # you may not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | #     http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | 
20 | 
21 | exp_name="evaluate_transformers"
22 | # Extra command-line arguments are forwarded unchanged to the Python script.
23 | #  --model_name_or_path bert-base-uncased \
24 | #  --lr_scheduler_type "constant" \
25 | DS="train_data.pkl"
26 | DS_VAL="test_data.pkl"
27 | python3 1_run_transformers_ner.py \
28 |   --model_name_or_path "dmis-lab/biobert-large-cased-v1.1" \
29 |   --output_dir "checkpoints/$exp_name" \
30 |   --do_train \
31 |   --do_eval \
32 |   --do_predict \
33 |   --evaluation_strategy "steps" \
34 |   --eval_steps 10 \
35 |   --train_file "$DS" \
36 |   --validation_file "$DS_VAL" \
37 |   --test_file "$DS_VAL" \
38 |   --num_train_epochs 50 \
39 |   --learning_rate "1e-4" \
40 |   --logging_strategy "epoch" \
41 |   --logging_dir "logs/$exp_name" \
42 |   "$@"
43 |   # --dataset_name conll2003 \
44 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/2_eval_pred.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | python 2_eval_pred.py checkpoints/evaluate_transformers/test_predictions.txt test_data.pkl
21 | 
22 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/3_compare_tokens.py:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | import pandas as pd
19 | import spacy
20 | 
21 | 
22 | nlp = spacy.load("en_core_web_lg", disable=["vocab", "ner"])
23 | 
24 | 
25 | def unroll_rows(df):  # flatten: each row's cells (assumed equal-length lists) become one row per element
26 |     return pd.concat([pd.DataFrame(row.to_dict()) for i, row in df.iterrows()])
27 | 
28 | 
29 | def poor_venn(set1, set2):  # print sizes: only in set1 | in both | only in set2
30 |     print(f"[ {len(set1 - set2)} | {len(set1 & set2)} | {len(set2 - set1)} ]")
31 | 
32 | 
33 | def lemma(word):  # lemma of the first spaCy token of the lower-cased word
34 |     return next(iter(nlp(word.lower()))).lemma_
35 | 
36 | 
37 | df_train = pd.read_pickle("train_data.pkl")
38 | df_test = pd.read_pickle("test_data.pkl")
39 | with open("checkpoints/evaluate_transformers/test_predictions.txt") as fp:
40 |     df_test["pred"] = [line.strip().split() for line in fp]
41 | 
42 | 
43 | df_train_flat = unroll_rows(df_train)
44 | df_test_flat = unroll_rows(df_test)
45 | 
46 | train_entities = set(df_train_flat.token[df_train_flat.entity_type != "O"])
47 | test_entities = set(df_test_flat.token[df_test_flat.entity_type != "O"])
48 | pred_entities = set(df_test_flat.token[df_test_flat.pred != "O"])
49 | 
50 | train_entities = set(map(lemma, train_entities))
51 | test_entities = set(map(lemma, test_entities))
52 | pred_entities = set(map(lemma, pred_entities))
53 | 
54 | print("{train, test, pred} = Unique token lemmata in the corresponding sets with an entity type that is not 'O'")
55 | print()
56 | 
57 | print("train - test")
58 | print(sorted(train_entities - test_entities))
59 | print()
60 | 
61 | print("train - pred")
62 | print(sorted(train_entities - pred_entities))
63 | print()
64 | 
65 | print("test - train")
66 | print(sorted(test_entities - train_entities))
67 | print()
68 | 
69 | print("pred - train")
70 | print(sorted(pred_entities - train_entities))
71 | print()
72 | 
73 | print("len(train) =", len(train_entities))
74 | print("len(test) =", len(test_entities))
75 | print("len(pred) =", len(pred_entities))
76 | print()
77 | 
78 | print("VENN: train vs. test")
79 | poor_venn(train_entities, test_entities)
80 | print("VENN: train vs. pred")
81 | poor_venn(train_entities, pred_entities)
82 | print("VENN: test vs. pred")
83 | poor_venn(test_entities, pred_entities)
84 | print()
85 | 
86 | print("How many of the unseen tokens were predicted?")
87 | seen = test_entities & train_entities
88 | unseen = test_entities - train_entities
89 | print(f"Out of {len(unseen)} unseen tokens {len(unseen & pred_entities)} were predicted")
90 | print(f"Out of {len(seen)} seen tokens {len(seen & pred_entities)} were predicted")
91 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/ner/transformers_vs_spacy/transformers/create_pickle.py:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | import argparse
19 | import pathlib
20 | 
21 | from datasets import load_dataset
22 | import pandas as pd
23 | from typing import List
24 | 
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("input", default="dataset.txt")
27 | parser.add_argument("output", default="dataset.pkl")
28 | args = parser.parse_args()
29 | input_path = pathlib.Path(args.input)
30 | 
31 | sequences: List[List[str]]= []
32 | targets: List[List[str]] = []
33 | 
34 | with input_path.open("r", encoding="utf-8") as f:
35 |     sequence: List[str] = []
36 |     target: List[str] = []
37 | 
38 |     all_lines = list(f.readlines())
39 | 
40 |     # Make sure the last line is a new line
41 |     if all_lines[-1] != "\n":
42 |         all_lines.append("\n")
43 | 
44 |     for line in all_lines:
45 |         if line == "\n":
46 |             sequences.append(sequence[:])
47 |             targets.append(target[:])
48 | 
49 |             sequence.clear()
50 |             target.clear()
51 |             continue
52 |          
53 |         try:
54 |             token, entity_type = line.split(" ")        
55 |             entity_type = entity_type.strip("\n")
56 |         except:
57 |             print(f"Something went wrong: {line}")
58 | 
59 |         sequence.append(token)
60 |         target.append(entity_type)
61 | 
62 | df = pd.DataFrame({"token": sequences, "entity_type": targets})
63 | df.to_pickle(args.output)
64 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/.gitignore:
--------------------------------------------------------------------------------
1 | /datasets
2 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM nvcr.io/nvidia/pytorch:21.06-py3
19 | 
20 | ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/'
21 | ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/'
22 | ENV http_proxy='http://bbpproxy.epfl.ch:80/'
23 | ENV https_proxy='http://bbpproxy.epfl.ch:80/'
24 | 
25 | # Update conda, install additional system packages
26 | RUN true \
27 | 	&& conda update conda \
28 | 	&& apt-get update \
29 | 	&& apt-get install -y gcc g++ build-essential vim libfontconfig1
30 | RUN conda install -c carta mysqlclient
31 | 
32 | # Install BlueBrainSearch -- revision can be a branch, sha, or tag
33 | ARG BBS_REVISION=v0.2.0
34 | ADD . /src
35 | WORKDIR /src
36 | RUN git checkout $BBS_REVISION
37 | # remove ruamel-yaml: https://github.com/pypa/pip/issues/5247#issuecomment-381550610
38 | RUN rm -rf /opt/conda/lib/python3.8/site-packages/ruamel*
39 | RUN pip install -r requirements.txt
40 | RUN pip install -r requirements-data_and_models.txt
41 | RUN pip install $PWD[data_and_models]
42 | # Pin the transformers version required by the script training_transformers/train.py
43 | RUN pip install transformers==3.4.0
44 | 
45 | 
46 | EXPOSE 8888
47 | 
48 | RUN groupadd -g 999 docker
49 | RUN useradd --create-home --uid 1000 --gid docker bbsuser
50 | 
51 | WORKDIR /bbs
52 | RUN rm -rf /bbs/tmp
53 | ENTRYPOINT ["/bin/bash"]
54 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/params.yaml:
--------------------------------------------------------------------------------
 1 | train:
 2 |   tf_idf:
 3 |     init_kwargs:
 4 |       lowercase: True
 5 |   count:
 6 |     init_kwargs:
 7 |       lowercase: True
 8 | 
 9 | eval:
10 |   biobert_nli_sts_cord19_v1:
11 |     class: SentTransformer
12 |     init_kwargs:
13 |       model_name_or_path: ../../models/sentence_embedding/biobert_nli_sts_cord19_v1/
14 |   biobert_nli_sts:
15 |     class: SentTransformer
16 |     init_kwargs:
17 |       model_name_or_path: clagator/biobert_v1.1_pubmed_nli_sts
18 |   tf_idf:
19 |     class: SklearnVectorizer
20 |     init_kwargs:
21 |       checkpoint_path: ../../models/sentence_embedding/tf_idf/model.pkl
22 |   count:
23 |     class: SklearnVectorizer
24 |     init_kwargs:
25 |       checkpoint_path: ../../models/sentence_embedding/count/model.pkl
26 |   sbert:
27 |     class: SentTransformer
28 |     init_kwargs:
29 |       model_name_or_path: bert-base-nli-mean-tokens
30 |   sbiobert:
31 |     class: SentTransformer
32 |     init_kwargs:
33 |       model_name_or_path: gsarti/biobert-nli
34 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/training_transformers/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /biosses_sentences.txt
19 | /sentences-filtered_11-527-877.txt
20 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/training_transformers/biosses_sentences.txt.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: c48471cf689ad28c0cbc20f7fa18f4d8
3 |   size: 31828
4 |   path: biosses_sentences.txt
5 | 


--------------------------------------------------------------------------------
/data_and_models/pipelines/sentence_embedding/training_transformers/sentences-filtered_11-527-877.txt.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 96aaec3358dc184bcb7015b07c4c893a
3 |   size: 1655753737
4 |   path: sentences-filtered_11-527-877.txt
5 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/.gitignore:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | /raw1_2020-06-10_cord19_TestSet.jsonl
19 | /raw2_2020-06-29_cord19_Disease.jsonl
20 | /raw3_2020-06-30_cord19_Disease.jsonl
21 | /raw4_2020-07-02_cord19_ChemicalOrganism.jsonl
22 | /raw5_2020-07-08_cord19_Drug_TestSet.jsonl
23 | /raw6_2020-07-08_cord19_CelltypeProtein.jsonl
24 | /raw7_2020-09-01_cord19v35_CellCompartment.jsonl
25 | /raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl
26 | /raw9_2020-09-02_cord19v35_Pathway.jsonl
27 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw1_2020-06-10_cord19_TestSet.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 03ead15d2a2e4b5d25ddd973ad886c5d
3 |   path: raw1_2020-06-10_cord19_TestSet.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw2_2020-06-29_cord19_Disease.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: e63e82e9fcfef81a3c2e1d1ebfc5a02e
3 |   path: raw2_2020-06-29_cord19_Disease.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw3_2020-06-30_cord19_Disease.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 81d203dd8f0e3461cb44caa15cb0bab4
3 |   path: raw3_2020-06-30_cord19_Disease.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw4_2020-07-02_cord19_ChemicalOrganism.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: cc8d780d8b58521d21adb04502c9c269
3 |   path: raw4_2020-07-02_cord19_ChemicalOrganism.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw5_2020-07-08_cord19_Drug_TestSet.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 998e96d170fc05117d978a588d0f07a8
3 |   path: raw5_2020-07-08_cord19_Drug_TestSet.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw6_2020-07-08_cord19_CelltypeProtein.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 50bbcec563fd2d566783b11785253cb4
3 |   path: raw6_2020-07-08_cord19_CelltypeProtein.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw7_2020-09-01_cord19v35_CellCompartment.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 6263f835ab71cad54f6f55c830b7879e
3 |   path: raw7_2020-09-01_cord19v35_CellCompartment.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: 470209cd6fa7df1c7c8be0016d3926e7
3 |   path: raw8_2020-09-02_cord19v35_CellCompartmentDrugOrgan.jsonl
4 | 


--------------------------------------------------------------------------------
/data_and_models/raw_sentences/raw9_2020-09-02_cord19v35_Pathway.jsonl.dvc:
--------------------------------------------------------------------------------
1 | outs:
2 | - md5: efb28f1a3f082a908abc8adb587b4b0c
3 |   path: raw9_2020-09-02_cord19v35_Pathway.jsonl
4 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | version: "3.5"
19 | services:
20 | 
21 |   base:
22 |     build:
23 |       context: .
24 |       dockerfile: docker/base.Dockerfile
25 |       args:
26 |         - BBS_HTTP_PROXY
27 |         - BBS_HTTPS_PROXY
28 |         - BBS_http_proxy
29 |         - BBS_https_proxy
30 |         - BBS_USERS
31 |     image: bbs-base
32 |     container_name: bbs-base
33 |     networks:
34 |       - bbs_network
35 | 
36 |   search:
37 |     build:
38 |       context: .
39 |       dockerfile: docker/search.Dockerfile
40 |     image: bbs_search
41 |     container_name: bbs_search
42 |     env_file: .env
43 |     ports:
44 |       - 8850:8080
45 |     volumes:
46 |       - /raid:/raid
47 |     networks:
48 |       - bbs_network
49 | 
50 |   embedding:
51 |     build:
52 |       context: .
53 |       dockerfile: docker/embedding.Dockerfile
54 |     image: bbs_embedding
55 |     container_name: bbs_embedding
56 |     env_file: .env
57 |     ports:
58 |       - 8851:8080
59 |     volumes:
60 |       - /raid:/raid
61 |     networks:
62 |       - bbs_network
63 | 
64 |   mining:
65 |     build:
66 |       context: .
67 |       dockerfile: docker/mining.Dockerfile
68 |     image: bbs_mining
69 |     container_name: bbs_mining
70 |     env_file: .env
71 |     ports:
72 |       - 8852:8080
73 |     volumes:
74 |       - /raid:/raid
75 |     networks:
76 |       - bbs_network
77 | 
78 | networks:
79 |   bbs_network:
80 |     driver: bridge
81 | 


--------------------------------------------------------------------------------
/docker/corenlp.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM ubuntu:latest
19 | 
20 | LABEL maintainer="Stanislav Schmidt <stanislav.schmidt@epfl.ch>"
21 | LABEL version="1.0"
22 | LABEL description="CoreNLP Server"
23 | 
24 | # ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/'
25 | # ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/'
26 | # ENV http_proxy='http://bbpproxy.epfl.ch:80/'
27 | # ENV https_proxy='http://bbpproxy.epfl.ch:80/'
28 | 
29 | # Install java, unzip, and wget
30 | RUN apt-get update && apt-get install -y \
31 | 	default-jre \
32 | 	unzip \
33 | 	wget
34 | 
35 | # Download and install CoreNLP 4.0.0 (2020-04-19)
36 | # See https://stanfordnlp.github.io/CoreNLP/history.html
37 | # COPY corenlp_download.zip .
38 | RUN true \
39 | 	&& export CORENLP_VERSION=4.0.0 \
40 | 	&& URL=http://nlp.stanford.edu/software/stanford-corenlp-${CORENLP_VERSION}.zip \
41 | 	&& wget -q --show-progress --progress=bar:force -O corenlp_download.zip $URL 2>&1 \
42 | 	&& unzip -q -j corenlp_download.zip -d /corenlp \
43 | 	&& rm corenlp_download.zip
44 | 
45 | 
46 | # Add a user
47 | RUN useradd corenlpuser
48 | WORKDIR /corenlp
49 | USER corenlpuser
50 | 
51 | # Expose a port
52 | EXPOSE 9000
53 | 
54 | ENTRYPOINT exec java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -annotators "tokenize,ssplit,pos,depparse"
55 | 
56 | 


--------------------------------------------------------------------------------
/docker/embedding.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM bbs-base
19 | 
20 | USER root
21 | 
22 | # Install the app
23 | ADD . /src
24 | WORKDIR /src
25 | RUN pip install .
26 | RUN pip install gunicorn
27 | 
28 | # Set image metadata
29 | LABEL maintainer="BBP-EPFL Machine Learning team <bbp-ou-machinelearning@groupes.epfl.ch>"
30 | LABEL description="REST API Server for Text Embeddings"
31 | 
32 | # Add a user
33 | RUN useradd --create-home serveruser
34 | WORKDIR /home/serveruser
35 | USER serveruser
36 | 
37 | # Run the entry point
38 | EXPOSE 8080
39 | ENTRYPOINT [\
40 | "gunicorn", \
41 | "--bind", "0.0.0.0:8080", \
42 | "--workers", "1", \
43 | "--timeout", "180", \
44 | "bluesearch.entrypoint.embedding_server:get_embedding_app()"]
45 | 


--------------------------------------------------------------------------------
/docker/grobid_quantities.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM ubuntu:latest
19 | 
20 | LABEL maintainer="Stanislav Schmidt <stanislav.schmidt@epfl.ch>"
21 | LABEL version="1.0"
22 | LABEL description="GROBID Quantities Server"
23 | 
24 | # ENV HTTP_PROXY='http://bbpproxy.epfl.ch:80/'
25 | # ENV HTTPS_PROXY='http://bbpproxy.epfl.ch:80/'
26 | # ENV http_proxy='http://bbpproxy.epfl.ch:80/'
27 | # ENV https_proxy='http://bbpproxy.epfl.ch:80/'
28 | 
29 | 
30 | # Install java, git, unzip and wget
31 | RUN apt-get update && apt-get install -y \
32 | 	default-jre \
33 | 	git \
34 | 	unzip \
35 | 	wget
36 | 
37 | # Add a user
38 | RUN useradd --create-home grobiduser
39 | WORKDIR /home/grobiduser
40 | USER grobiduser
41 | 
42 | # Download and install GROBID
43 | RUN true \
44 | 	&& git clone --depth=1 https://github.com/kermitt2/grobid.git grobid \
45 | 	&& cd grobid \
46 | #	&& echo "systemProp.https.proxyHost=bbpproxy.epfl.ch" >> gradle.properties \
47 | 	&& ./gradlew clean install
48 | 
49 | # Download and install GROBID Quantities
50 | RUN true \
51 | 	&& git clone --depth=1 https://github.com/kermitt2/grobid-quantities.git grobid/grobid-quantities \
52 | 	&& cd grobid/grobid-quantities/ \
53 | #	&& echo "\nsystemProp.https.proxyHost=bbpproxy.epfl.ch" >> gradle.properties \
54 | 	&& ./gradlew copyModels \
55 | 	&& ./gradlew clean install
56 | 
57 | # Expose a port and set working directory
58 | EXPOSE 8060
59 | WORKDIR /home/grobiduser/grobid/grobid-quantities
60 | 
61 | ENTRYPOINT exec java -jar $(find build/libs -name "grobid-*onejar.jar") server resources/config/config.yml
62 | # ENTRYPOINT exec java -jar build/libs/grobid-quantities-0.6.1-SNAPSHOT-onejar.jar server resources/config/config.yml
63 | 
64 | 


--------------------------------------------------------------------------------
/docker/mining.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
FROM bbs-base

USER root

# Install the app in editable mode, including the "data_and_models" extras
ADD . /src
WORKDIR /src
RUN pip install -e .[data_and_models]

# Set image metadata
LABEL maintainer="BBP-EPFL Machine Learning team <bbp-ou-machinelearning@groupes.epfl.ch>"
LABEL description="REST API Server for Text Mining"

# Grant all users read/write access (and directory traversal) on the sources
RUN chmod -R a+rwX /src

# Run the entry point
EXPOSE 8080
ENV BBS_DATA_AND_MODELS_DIR="/src/data_and_models"
ENTRYPOINT ["/src/docker/mining.sh"]
37 | 


--------------------------------------------------------------------------------
/docker/mining.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
source /src/docker/utils.sh
# If $BBS_SSH_USERNAME is empty then this is a no-op
dvc_configure_ssh_remote_authentication "$BBS_SSH_USERNAME"
# Not usable in README as it works only when inside the `bbs_` containers.
# If $BBS_DATA_AND_MODELS_DIR is empty then this will fail
dvc_pull_models "$BBS_DATA_AND_MODELS_DIR"

# Launch mining server
pip install gunicorn
# Use exec so that gunicorn replaces this shell and directly receives the
# signals sent to the container (e.g. SIGTERM on "docker stop").
exec gunicorn --bind 0.0.0.0:8080 --workers 1 --timeout 7200 'bluesearch.entrypoint.mining_server:get_mining_app()'
30 | 


--------------------------------------------------------------------------------
/docker/mining_cache.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM bbs-base
19 | 
20 | USER root
21 | 
22 | # Install the app
23 | ADD . /src
24 | WORKDIR /src
25 | RUN pip install -e .[data_and_models]
26 | 
27 | # Set image metadata
28 | LABEL maintainer="BBP-EPFL Machine Learning team <bbp-ou-machinelearning@groupes.epfl.ch>"
29 | LABEL description="Creation of a Mining Cache for the Mining Server"
30 | 
31 | 
32 | RUN chmod -R a+rwX /src
33 | 
34 | # Run the entry point
35 | ENV DATA_DIR="/src/data_and_models"
36 | ENTRYPOINT ["/src/docker/mining_cache.sh"]
37 | 


--------------------------------------------------------------------------------
/docker/mining_cache.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
source /src/docker/utils.sh
dvc_configure_ssh_remote_authentication "$BBS_SSH_USERNAME"
dvc_pull_models "$DATA_DIR"

# Collect the optional arguments in an array, adding an option only if the
# corresponding environment variable is non-empty. Using an array keeps each
# value a single argument even if it contains whitespace or glob characters.
args=()
if [ -n "$BBS_MINING_CACHE_TARGET_TABLE" ]
then
    args+=(--target-table-name "$BBS_MINING_CACHE_TARGET_TABLE")
fi
if [ -n "$BBS_MINING_CACHE_PROCESSORS_PER_MODEL" ]
then
    args+=(--n-processes-per-model "$BBS_MINING_CACHE_PROCESSORS_PER_MODEL")
fi
if [ -n "$BBS_MINING_CACHE_LOG_FILE" ]
then
    args+=(--log-file "$BBS_MINING_CACHE_LOG_FILE")
fi
if [ -n "$BBS_MINING_CACHE_LOG_LEVEL" ]
then
    args+=(--log-level "$BBS_MINING_CACHE_LOG_LEVEL")
fi

# Launch mining cache creation
create_mining_cache "${args[@]}"
30 | 


--------------------------------------------------------------------------------
/docker/mysql-make-backup:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | DB_NAME="$1"
 4 | FILENAME="/backup/$(date +%Y%m%d-%H%M%S)-${DB_NAME}.sql"
 5 | 
 6 | # Check MySQL credentials are in the environment
 7 | if [ -z "$MYSQL_USER" ] || [ -z "$MYSQL_PWD" ]
 8 | then
 9 |     echo "The variables MYSQL_USER and MYSQL_PWD need to be in the environment"
10 |     exit 1
11 | fi
12 | 
13 | # Check argument
14 | if [ -z "$DB_NAME" ]
15 | then
16 |     echo "Usage: $(basename $0) DB-NAME"
17 |     exit 1
18 | fi
19 | 
20 | # Test credentials
21 | mysql -u "$MYSQL_USER" -e "quit" >& /dev/null
22 | if [ "$?" -ne "0" ]
23 | then
24 |     echo "Access to database denied. Wrong credentials?"
25 |     exit 1
26 | fi
27 | 
28 | # Test if database exists
29 | mysql -u "$MYSQL_USER" -e "use $DB_NAME" >& /dev/null
30 | if [ "$?" -ne "0" ]
31 | then
32 |     echo "Database ${DB_NAME} doesn't exist"
33 |     exit 1
34 | fi
35 | 
36 | # Make backup
37 | echo "Saving back-up to $FILENAME"
38 | /usr/bin/mysqldump -u "$MYSQL_USER" "$DB_NAME" -RE --triggers > "${FILENAME}"
39 | echo "Back-up finished."
40 | 
41 | 


--------------------------------------------------------------------------------
/docker/mysql.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM mysql:8
19 | 
20 | # ENV HTTP_PROXY="http://bbpproxy.epfl.ch:80"
21 | # ENV HTTPS_PROXY="http://bbpproxy.epfl.ch:80"
22 | # ENV http_proxy="http://bbpproxy.epfl.ch:80"
23 | # ENV https_proxy="http://bbpproxy.epfl.ch:80"
24 | 
25 | # Set timezone for correct timestamp, install useful packages
26 | ENV TZ="Europe/Zurich"
27 | RUN \
28 | apt-get update &&\
29 | apt-get install -y --no-install-recommends man vim less procps &&\
30 | apt-get clean &&\
31 | rm -r /var/lib/apt/lists/*
32 | 
33 | # Limit incremental binary log to 7 days. This is a system variable and has to
34 | # go in the [mysqld] section, which is in docker.cnf
35 | # Accordingly it would make sense to do file dumps every 7 days
36 | RUN echo "binlog_expire_logs_seconds = 604800" >> /etc/mysql/conf.d/docker.cnf
37 | 
38 | # Install the backup script
39 | COPY docker/mysql-make-backup /usr/local/bin/make-backup
40 | RUN chmod +x /usr/local/bin/make-backup
41 | 
42 | # Pre-initialize the docker volume for the backup
43 | VOLUME ["/backup"]
44 | 


--------------------------------------------------------------------------------
/docker/search.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | FROM bbs-base
19 | 
20 | USER root
21 | 
22 | # Install the app
23 | ADD . /src
24 | WORKDIR /src
25 | RUN pip install .
26 | RUN pip install gunicorn
27 | 
28 | # Set image metadata
29 | LABEL maintainer="BBP-EPFL Machine Learning team <bbp-ou-machinelearning@groupes.epfl.ch>"
30 | LABEL description="REST API Server for Blue Brain Search"
31 | 
32 | # Add a user
33 | RUN useradd --create-home serveruser
34 | WORKDIR /home/serveruser
35 | USER serveruser
36 | 
37 | # Run the entry point
38 | # Note the "timeout" parameter. It lets the server initialisation finish before
39 | # gunicorn decides that the worker is unresponsive and restarts it.
40 | # We might think about a better solution in the future (e.g. initialising in a separate thread).
41 | EXPOSE 8080
42 | ENTRYPOINT [\
43 | "gunicorn", \
44 | "--bind", "0.0.0.0:8080", \
45 | "--workers", "1", \
46 | "--timeout", "7200", \
47 | "bluesearch.entrypoint.search_server:get_search_app()"]
48 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | 
19 | # Minimal makefile for Sphinx documentation
20 | #
21 | # SPHINXOPTS and SPHINXBUILD (assigned with ?=) can be overridden from the command
22 | # line or the environment; SOURCEDIR and BUILDDIR only from the command line.
23 | SPHINXOPTS    ?=
24 | SPHINXBUILD   ?= sphinx-build
25 | SOURCEDIR     = .
26 | BUILDDIR      = _build
27 | 
28 | # Put it first so that "make" without argument is like "make help".
29 | help:
30 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
31 | 
32 | .PHONY: help Makefile
33 | 
34 | # Catch-all target: route all unknown targets to Sphinx using the new
35 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
36 | %: Makefile
37 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
38 | 


--------------------------------------------------------------------------------
/docs/_static/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/docs/_static/.keep


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import os
21 | import sys
22 | 
23 | import bluesearch
24 | 
25 | sys.path.insert(0, os.path.abspath("."))  # put the current working directory first on the import path
26 | 
27 | # -- Project information -----------------------------------------------------
28 | 
29 | project = "Blue Brain Search"
30 | author = "Blue Brain Project"
31 | version = bluesearch.__version__  # reuse the package's own version string
32 | 
33 | # -- General configuration ---------------------------------------------------
34 | suppress_warnings = ["ref.ref"]  # because of luigi.util.requires
35 | 
36 | # Add any Sphinx extension module names here, as strings. They can be
37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
38 | # ones.
39 | extensions = [
40 |     "sphinx.ext.mathjax",
41 |     "sphinx.ext.autodoc",
42 |     "sphinx.ext.doctest",
43 |     "sphinx.ext.napoleon",
44 |     "sphinx.ext.viewcode",
45 | ]
46 | 
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ["_templates"]
49 | 
50 | # List of patterns, relative to source directory, that match files and
51 | # directories to ignore when looking for source files.
52 | # This pattern also affects html_static_path and html_extra_path.
53 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
54 | 
55 | # -- Options for HTML output -------------------------------------------------
56 | 
57 | # The theme to use for HTML and HTML Help pages.  See the documentation for
58 | # a list of builtin themes.
59 | html_theme = "sphinx-bluebrain-theme"
60 | html_title = "Blue Brain Search"
61 | html_theme_options = {
62 |     "metadata_distribution": "bluesearch",
63 |     "repo_name": "bluesearch",
64 |     "repo_url": "https://github.com/BlueBrain/Search",
65 | }
66 | # If true, the reST sources are included in the HTML build as _sources/name.
67 | html_copy_source = False
68 | # If true (and html_copy_source is true as well), links to the reST sources
69 | # will be added to the sidebar.
70 | html_show_sourcelink = False
71 | # Add any paths that contain custom static files (such as style sheets) here,
72 | # relative to this directory. They are copied after the builtin static files,
73 | # so a file named "default.css" will overwrite the builtin "default.css".
74 | html_static_path = ["_static"]
75 | # A boolean that decides whether module names are prepended to all object names
76 | # (for object types where a "module" of some kind is defined), e.g. for
77 | # py:function directives.
78 | add_module_names = False
79 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 |    Copyright (C) 2020  Blue Brain Project, EPFL.
 3 |    This program is free software: you can redistribute it and/or modify
 4 |    it under the terms of the GNU Lesser General Public License as published by
 5 |    the Free Software Foundation, either version 3 of the License, or
 6 |    (at your option) any later version.
 7 |    This program is distributed in the hope that it will be useful,
 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 |    GNU Lesser General Public License for more details.
11 |    You should have received a copy of the GNU Lesser General Public License
12 |    along with this program. If not, see <https://www.gnu.org/licenses/>.
13 | 
14 | 
15 | Welcome to Blue Brain Search's documentation!
16 | =============================================
17 | 
18 | .. toctree::
19 |    :maxdepth: 2
20 |    :caption: Contents:
21 | 
22 |    source/instructions
23 |    source/server
24 |    source/entrypoint
25 |    source/faq
26 |    source/example
27 | 
28 | .. toctree::
29 |    :maxdepth: 2
30 |    :caption: API:
31 | 
32 |    source/api/bluesearch
33 | 
34 | .. toctree::
35 |    :maxdepth: 1
36 |    :caption: Release History
37 | 
38 |    source/whatsnew
39 | 
40 | 
41 | 
42 | Indices and tables
43 | ==================
44 | 
45 | * :ref:`genindex`
46 | * :ref:`modindex`
47 | * :ref:`search`
48 | 


--------------------------------------------------------------------------------
/docs/source/_substitutions.rst:
--------------------------------------------------------------------------------
 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 |    Copyright (C) 2020  Blue Brain Project, EPFL.
 3 |    This program is free software: you can redistribute it and/or modify
 4 |    it under the terms of the GNU Lesser General Public License as published by
 5 |    the Free Software Foundation, either version 3 of the License, or
 6 |    (at your option) any later version.
 7 |    This program is distributed in the hope that it will be useful,
 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 |    GNU Lesser General Public License for more details.
11 |    You should have received a copy of the GNU Lesser General Public License
12 |    along with this program. If not, see <https://www.gnu.org/licenses/>.
13 | 
14 | ..
15 |      This file defines rst substitutions.
16 | 
17 | .. role:: raw-html(raw)
18 |    :format: html
19 | 
20 | .. role:: raw-latex(raw)
21 |    :format: latex
22 | 
23 | .. |Add| replace:: :raw-html:`<span style="color:#FFF; background:#06C; padding: 0.2em 0.3em; margin: 0 0.25em;border-radius:0.25em;">Add</span>`
24 | 
25 | .. |Fix| replace:: :raw-html:`<span style="color:#FFF; background:#060; padding: 0.2em 0.3em; margin: 0 0.25em;border-radius:0.25em;">Fix</span>`
26 | 
27 | .. |Change| replace:: :raw-html:`<span style="color:#FFF; background:#60C; padding: 0.2em 0.3em; margin: 0 0.25em;border-radius:0.25em;">Change</span>`
28 | 
29 | .. |Deprecate| replace:: :raw-html:`<span style="color:#FFF; background:#C60; padding: 0.2em 0.3em; margin: 0 0.25em;border-radius: 0.25em;">Deprecate</span>`
30 | 
31 | .. |Remove| replace:: :raw-html:`<span style="color:#FFF; background:#C00; padding: 0.2em 0.3em; margin: 0 0.25em;border-radius: 0.25em;">Remove</span>`
32 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.article.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.article module
2 | ==================================
3 | 
4 | .. automodule:: bluesearch.database.article
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.cord_19.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.cord\_19 module
2 | ===================================
3 | 
4 | .. automodule:: bluesearch.database.cord_19
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.download.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.download module
2 | ===================================
3 | 
4 | .. automodule:: bluesearch.database.download
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.mesh.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.mesh module
2 | ===============================
3 | 
4 | .. automodule:: bluesearch.database.mesh
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.mining_cache.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.mining\_cache module
2 | ========================================
3 | 
4 | .. automodule:: bluesearch.database.mining_cache
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.pdf.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.pdf module
2 | ==============================
3 | 
4 | .. automodule:: bluesearch.database.pdf
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.database package
 2 | ===========================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.database.article
11 |    bluesearch.database.cord_19
12 |    bluesearch.database.download
13 |    bluesearch.database.mesh
14 |    bluesearch.database.mining_cache
15 |    bluesearch.database.pdf
16 |    bluesearch.database.topic
17 |    bluesearch.database.topic_info
18 |    bluesearch.database.topic_rule
19 | 
20 | Module contents
21 | ---------------
22 | 
23 | .. automodule:: bluesearch.database
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
27 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.topic.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.topic module
2 | ================================
3 | 
4 | .. automodule:: bluesearch.database.topic
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.topic_info.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.topic\_info module
2 | ======================================
3 | 
4 | .. automodule:: bluesearch.database.topic_info
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.database.topic_rule.rst:
--------------------------------------------------------------------------------
1 | bluesearch.database.topic\_rule module
2 | ======================================
3 | 
4 | .. automodule:: bluesearch.database.topic_rule
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.embedding_models.rst:
--------------------------------------------------------------------------------
1 | bluesearch.embedding\_models module
2 | ===================================
3 | 
4 | .. automodule:: bluesearch.embedding_models
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.create_database.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.create\_database module
2 | =============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.create_database
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.add.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.add module
2 | =========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.add
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.add_es.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.add\_es module
2 | =============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.add_es
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.convert_pdf.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.convert\_pdf module
2 | ==================================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.convert_pdf
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.download.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.download module
2 | ==============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.download
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.init.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.init module
2 | ==========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.init
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.parent.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.parent module
2 | ============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.parent
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.parse.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.parse module
2 | ===========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.parse
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.parse_mesh_rdf.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.parse\_mesh\_rdf module
2 | ======================================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.parse_mesh_rdf
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.entrypoint.database package
 2 | ======================================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.entrypoint.database.add
11 |    bluesearch.entrypoint.database.add_es
12 |    bluesearch.entrypoint.database.convert_pdf
13 |    bluesearch.entrypoint.database.download
14 |    bluesearch.entrypoint.database.init
15 |    bluesearch.entrypoint.database.parent
16 |    bluesearch.entrypoint.database.parse
17 |    bluesearch.entrypoint.database.parse_mesh_rdf
18 |    bluesearch.entrypoint.database.run
19 |    bluesearch.entrypoint.database.schemas
20 |    bluesearch.entrypoint.database.topic_extract
21 |    bluesearch.entrypoint.database.topic_filter
22 | 
23 | Module contents
24 | ---------------
25 | 
26 | .. automodule:: bluesearch.entrypoint.database
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.run.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.run module
2 | =========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.run
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.schemas.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.schemas module
2 | =============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.schemas
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.topic_extract.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.topic\_extract module
2 | ====================================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.topic_extract
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.database.topic_filter.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.database.topic\_filter module
2 | ===================================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.database.topic_filter
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.embedding_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.embedding\_server module
2 | ==============================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.embedding_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.embeddings.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.embeddings module
2 | =======================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.embeddings
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.mining_cache.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.mining\_cache module
2 | ==========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.mining_cache
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.mining_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.mining\_server module
2 | ===========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.mining_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.entrypoint package
 2 | =============================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.entrypoint.database
11 | 
12 | Submodules
13 | ----------
14 | 
15 | .. toctree::
16 |    :maxdepth: 4
17 | 
18 |    bluesearch.entrypoint.create_database
19 |    bluesearch.entrypoint.embedding_server
20 |    bluesearch.entrypoint.embeddings
21 |    bluesearch.entrypoint.mining_cache
22 |    bluesearch.entrypoint.mining_server
23 |    bluesearch.entrypoint.search_server
24 | 
25 | Module contents
26 | ---------------
27 | 
28 | .. automodule:: bluesearch.entrypoint
29 |    :members:
30 |    :undoc-members:
31 |    :show-inheritance:
32 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.entrypoint.search_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.entrypoint.search\_server module
2 | ===========================================
3 | 
4 | .. automodule:: bluesearch.entrypoint.search_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.k8s.connect.rst:
--------------------------------------------------------------------------------
1 | bluesearch.k8s.connect module
2 | =============================
3 | 
4 | .. automodule:: bluesearch.k8s.connect
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.k8s.create_indices.rst:
--------------------------------------------------------------------------------
1 | bluesearch.k8s.create\_indices module
2 | =====================================
3 | 
4 | .. automodule:: bluesearch.k8s.create_indices
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.k8s.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.k8s package
 2 | ======================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.k8s.connect
11 |    bluesearch.k8s.create_indices
12 | 
13 | Module contents
14 | ---------------
15 | 
16 | .. automodule:: bluesearch.k8s
17 |    :members:
18 |    :undoc-members:
19 |    :show-inheritance:
20 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.attribute.rst:
--------------------------------------------------------------------------------
1 | bluesearch.mining.attribute module
2 | ==================================
3 | 
4 | .. automodule:: bluesearch.mining.attribute
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.entity.rst:
--------------------------------------------------------------------------------
1 | bluesearch.mining.entity module
2 | ===============================
3 | 
4 | .. automodule:: bluesearch.mining.entity
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.eval.rst:
--------------------------------------------------------------------------------
1 | bluesearch.mining.eval module
2 | =============================
3 | 
4 | .. automodule:: bluesearch.mining.eval
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.pipeline.rst:
--------------------------------------------------------------------------------
1 | bluesearch.mining.pipeline module
2 | =================================
3 | 
4 | .. automodule:: bluesearch.mining.pipeline
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.relation.rst:
--------------------------------------------------------------------------------
1 | bluesearch.mining.relation module
2 | =================================
3 | 
4 | .. automodule:: bluesearch.mining.relation
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.mining.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.mining package
 2 | =========================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.mining.attribute
11 |    bluesearch.mining.entity
12 |    bluesearch.mining.eval
13 |    bluesearch.mining.pipeline
14 |    bluesearch.mining.relation
15 | 
16 | Module contents
17 | ---------------
18 | 
19 | .. automodule:: bluesearch.mining
20 |    :members:
21 |    :undoc-members:
22 |    :show-inheritance:
23 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.rst:
--------------------------------------------------------------------------------
 1 | bluesearch package
 2 | ==================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.database
11 |    bluesearch.entrypoint
12 |    bluesearch.k8s
13 |    bluesearch.mining
14 |    bluesearch.server
15 |    bluesearch.widgets
16 | 
17 | Submodules
18 | ----------
19 | 
20 | .. toctree::
21 |    :maxdepth: 4
22 | 
23 |    bluesearch.embedding_models
24 |    bluesearch.search
25 |    bluesearch.sql
26 |    bluesearch.utils
27 | 
28 | Module contents
29 | ---------------
30 | 
31 | .. automodule:: bluesearch
32 |    :members:
33 |    :undoc-members:
34 |    :show-inheritance:
35 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.search.rst:
--------------------------------------------------------------------------------
1 | bluesearch.search module
2 | ========================
3 | 
4 | .. automodule:: bluesearch.search
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.server.embedding_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.server.embedding\_server module
2 | ==========================================
3 | 
4 | .. automodule:: bluesearch.server.embedding_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.server.invalid_usage_exception.rst:
--------------------------------------------------------------------------------
1 | bluesearch.server.invalid\_usage\_exception module
2 | ==================================================
3 | 
4 | .. automodule:: bluesearch.server.invalid_usage_exception
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.server.mining_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.server.mining\_server module
2 | =======================================
3 | 
4 | .. automodule:: bluesearch.server.mining_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.server.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.server package
 2 | =========================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.server.embedding_server
11 |    bluesearch.server.invalid_usage_exception
12 |    bluesearch.server.mining_server
13 |    bluesearch.server.search_server
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: bluesearch.server
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.server.search_server.rst:
--------------------------------------------------------------------------------
1 | bluesearch.server.search\_server module
2 | =======================================
3 | 
4 | .. automodule:: bluesearch.server.search_server
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.sql.rst:
--------------------------------------------------------------------------------
1 | bluesearch.sql module
2 | =====================
3 | 
4 | .. automodule:: bluesearch.sql
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.utils.rst:
--------------------------------------------------------------------------------
1 | bluesearch.utils module
2 | =======================
3 | 
4 | .. automodule:: bluesearch.utils
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.widgets.article_saver.rst:
--------------------------------------------------------------------------------
1 | bluesearch.widgets.article\_saver module
2 | ========================================
3 | 
4 | .. automodule:: bluesearch.widgets.article_saver
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.widgets.mining_schema.rst:
--------------------------------------------------------------------------------
1 | bluesearch.widgets.mining\_schema module
2 | ========================================
3 | 
4 | .. automodule:: bluesearch.widgets.mining_schema
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.widgets.mining_widget.rst:
--------------------------------------------------------------------------------
1 | bluesearch.widgets.mining\_widget module
2 | ========================================
3 | 
4 | .. automodule:: bluesearch.widgets.mining_widget
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.widgets.rst:
--------------------------------------------------------------------------------
 1 | bluesearch.widgets package
 2 | ==========================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    bluesearch.widgets.article_saver
11 |    bluesearch.widgets.mining_schema
12 |    bluesearch.widgets.mining_widget
13 |    bluesearch.widgets.search_widget
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: bluesearch.widgets
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/api/bluesearch.widgets.search_widget.rst:
--------------------------------------------------------------------------------
1 | bluesearch.widgets.search\_widget module
2 | ========================================
3 | 
4 | .. automodule:: bluesearch.widgets.search_widget
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/example.rst:
--------------------------------------------------------------------------------
 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 |    Copyright (C) 2020  Blue Brain Project, EPFL.
 3 |    This program is free software: you can redistribute it and/or modify
 4 |    it under the terms of the GNU Lesser General Public License as published by
 5 |    the Free Software Foundation, either version 3 of the License, or
 6 |    (at your option) any later version.
 7 |    This program is distributed in the hope that it will be useful,
 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 |    GNU Lesser General Public License for more details.
11 |    You should have received a copy of the GNU Lesser General Public License
12 |    along with this program. If not, see <https://www.gnu.org/licenses/>.
13 | 
14 | Example section
15 | ===============
16 | 
17 | .. testcode::
18 | 
19 |     print('Hello world')
20 | 
21 | .. testoutput::
22 | 
23 |     Hello world
24 | 


--------------------------------------------------------------------------------
/docs/source/faq.rst:
--------------------------------------------------------------------------------
 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 |    Copyright (C) 2020  Blue Brain Project, EPFL.
 3 |    This program is free software: you can redistribute it and/or modify
 4 |    it under the terms of the GNU Lesser General Public License as published by
 5 |    the Free Software Foundation, either version 3 of the License, or
 6 |    (at your option) any later version.
 7 |    This program is distributed in the hope that it will be useful,
 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 |    GNU Lesser General Public License for more details.
11 |    You should have received a copy of the GNU Lesser General Public License
12 |    along with this program. If not, see <https://www.gnu.org/licenses/>.
13 | 
14 | FAQ
15 | ===
16 | 
17 | This section describes how to handle common issues.
18 | 
19 | 
20 | MySQL encoding issue
21 | ---------------------
22 | 
23 | When interacting in Python with the MySQL database, using SQLAlchemy and the
24 | MySQL driver :code:`mysqldb`, one might run into the following error when
25 | retrieving columns with text:
26 | 
27 | .. code-block:: text
28 | 
29 |     UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 239:
30 |     character maps to <undefined>
31 | 
32 | The solution is to append :code:`?charset=utf8mb4` to the database URL.
33 | 
34 | So, if the database URL was:
35 | 
36 | .. code-block:: python
37 | 
38 |     f"mysql+mysqldb://{username}:{password}@{host}:{port}/{database}"
39 | 
40 | then the new URL would be:
41 | 
42 | .. code-block:: python
43 | 
44 |     f"mysql+mysqldb://{username}:{password}@{host}:{port}/{database}?charset=utf8mb4"
45 | 
46 | The database URL is what is passed as a first argument to create the engine:
47 | 
48 | .. code-block:: python
49 | 
50 |     import sqlalchemy
51 | 
52 |     engine = sqlalchemy.create_engine(f"{dialect}+{driver}://{username}:{password}@{host}:{port}/{database}")
53 | 
54 | 
55 | DVC dataclasses issue
56 | ----------------------
57 | 
58 | When in a Python 3.7+ environment the package :code:`dataclasses` is installed,
59 | one might run into the following error when doing :code:`dvc pull`:
60 | 
61 | .. code-block:: text
62 | 
63 |     AttributeError: module 'typing' has no attribute '_ClassVar'
64 | 
65 | The solution is to uninstall the package :code:`dataclasses`:
66 | 
67 | .. code-block:: bash
68 | 
69 |     pip uninstall dataclasses
70 | 
71 | 
72 | DVC pull issue
73 | --------------
74 | 
75 | When launching mining_cache or mining_server entrypoints or even simply
76 | :code:`dvc pull`, one might run into the following error:
77 | 
78 | .. code-block:: text
79 | 
80 |     WARNING: Some of the cache files do not exist neither locally nor on remote.
81 |     Missing cache files:
82 | 
83 | In this case, the solution is to go to the :code:`.dvc` directory
84 | and remove the file called :code:`config.local`:
85 | 
86 | .. code-block:: bash
87 | 
88 |     cd .dvc
89 |     rm config.local
90 | 
91 | Doing :code:`dvc pull` again should work fine after this.
92 | 


--------------------------------------------------------------------------------
/docs/source/instructions.rst:
--------------------------------------------------------------------------------
 1 | .. Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 |    Copyright (C) 2020  Blue Brain Project, EPFL.
 3 |    This program is free software: you can redistribute it and/or modify
 4 |    it under the terms of the GNU Lesser General Public License as published by
 5 |    the Free Software Foundation, either version 3 of the License, or
 6 |    (at your option) any later version.
 7 |    This program is distributed in the hope that it will be useful,
 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 |    GNU Lesser General Public License for more details.
11 |    You should have received a copy of the GNU Lesser General Public License
12 |    along with this program. If not, see <https://www.gnu.org/licenses/>.
13 | 
14 | Instructions
15 | ============
16 | 
17 | Installation
18 | ------------
19 | Before installation, please make sure you have a recent :code:`pip` installed (:code:`>=19.1`).
20 | 
21 | Then you can easily install :code:`bluesearch` from PyPI:
22 | 
23 | .. code-block:: bash
24 | 
25 |    pip install bluesearch[data_and_models]
26 | 
27 | You can also build from source if you prefer:
28 | 
29 | .. code-block:: bash
30 | 
31 |     pip install .[data_and_models]  # use -e for editable install
32 | 
33 | NB: The optional dependencies installed with the :code:`[data_and_models]`
34 | option are only necessary if you want to execute training or inference using
35 | :code:`dvc` and the models and scripts contained under :code:`data_and_models/`. If this is not
36 | the case, you can skip the :code:`[data_and_models]` at the end of :code:`pip install`.
37 | 
38 | 
39 | Generating docs
40 | ---------------
41 | All the versions of our documentation, both stable and latest,
42 | `can be found on Read the Docs <https://blue-brain-search.readthedocs.io/en/stable/>`_.
43 | 
44 | 
45 | To generate the documentation manually, we use :code:`sphinx` with a custom BBP theme.
46 | Make sure to install the :code:`bluesearch` package with :code:`dev` extras to get
47 | the necessary dependencies.
48 | 
49 | .. code-block:: bash
50 | 
51 |     pip install -e .[dev]
52 | 
53 | To generate autodoc directives one can run
54 | 
55 | .. code-block:: bash
56 | 
57 |     cd docs
58 |     sphinx-apidoc -o source/api/ -f -e ../src/bluesearch/ ../src/bluesearch/entrypoint/*
59 | 
60 | Note that it only needs to be rerun when there are new subpackages/modules.
61 | 
62 | To generate the documentation run
63 | 
64 | .. code-block:: bash
65 | 
66 |     cd docs
67 |     make clean && make html
68 | 
69 | 
70 | Finally, one can also run doctests
71 | 
72 | .. code-block:: bash
73 | 
74 |     cd docs
75 |     make doctest
76 | 


--------------------------------------------------------------------------------
/docs/source/logo/BlueBrainSearch_banner.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/docs/source/logo/BlueBrainSearch_banner.jpg


--------------------------------------------------------------------------------
/luigi.cfg:
--------------------------------------------------------------------------------
 1 | ;Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | ;
 3 | ;Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | ;
 5 | ;This program is free software: you can redistribute it and/or modify
 6 | ;it under the terms of the GNU Lesser General Public License as published by
 7 | ;the Free Software Foundation, either version 3 of the License, or
 8 | ;(at your option) any later version.
 9 | ;
10 | ;This program is distributed in the hope that it will be useful,
11 | ;but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | ;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | ;GNU Lesser General Public License for more details.
14 | ;
15 | ;You should have received a copy of the GNU Lesser General Public License
16 | ;along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | [core]
19 | autoload_range = true
20 | log_level = INFO
21 | local_scheduler = True
22 | 
23 | [GlobalParams]
24 | source = pubmed
25 | 
26 | [DownloadTask]
27 | from_month = 2021-12
28 | to_month = 2022-02
29 | output_dir = luigi-pipeline
30 | identifier =
31 | ; empty string is considered default value
32 | 
33 | [TopicExtractTask]
34 | mesh_topic_db = luigi-pipeline/mesh_topic_db.json
35 | 
36 | [TopicFilterTask]
37 | filter_config = luigi-pipeline/filter-config.jsonl
38 | 
39 | [ConvertPDFTask]
40 | grobid_host = 0.0.0.0
41 | grobid_port = 8070
42 | 
43 | [AddTask]
44 | db_url = luigi-pipeline/my-db.db
45 | db_type = sqlite
46 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | [build-system]
19 | requires = [
20 |     "pip>=9",
21 |     "setuptools>=45",
22 |     "setuptools_scm[toml]>=3.4",
23 |     "wheel",
24 | ]
25 | # This is pip's default value if the build-backend key is missing
26 | # Ref: https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#fallback-behaviour
27 | # Tox with tox.isolated_build = true needs this key to be defined explicitly.
28 | # Setuptools instructs setting build-backend without __legacy__, ref:
29 | # https://setuptools.pypa.io/en/latest/build_meta.html#how-to-use-it
30 | build-backend = "setuptools.build_meta"
31 | 
32 | [tool.black]
33 | extend-exclude = """data_and_models/pipelines/ner/transformers_vs_spacy/transformers/
34 | |data_and_models/pipelines/sentence_embedding/training_transformers/"""
35 | 


--------------------------------------------------------------------------------
/requirements-data_and_models.txt:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | PyYAML==5.4.1
19 | dvc[ssh]==2.5.4
20 | matplotlib==3.4.2
21 | scipy==1.7.0
22 | spacy_lookups_data==1.0.2
23 | srsly==2.4.1
24 | transformers==4.6.1
25 | typer==0.3.2
26 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | Sphinx==4.1.1
19 | docker==5.0.0
20 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
21 | pytest-benchmark==3.4.1
22 | pytest-cov==2.12.1
23 | pytest==6.2.4
24 | responses==0.19.0
25 | sphinx-bluebrain-theme==0.2.4
26 | tox==3.24.0
27 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | Flask==2.0.1
19 | SQLAlchemy[mysql,pymysql]==1.4.21
20 | boto3==1.20.16
21 | catalogue==2.0.4
22 | cryptography==3.4.7
23 | defusedxml==0.6.0
24 | google-cloud-storage==1.43.0
25 | h5py==3.3.0
26 | ipython==7.31.1
27 | ipywidgets==7.6.3
28 | jupyterlab==3.0.17
29 | langdetect==1.0.9
30 | luigi==3.0.3
31 | mashumaro==3.0
32 | numpy==1.21.0
33 | pandas==1.3.0
34 | pg8000==1.23.0
35 | python-dotenv==0.18.0
36 | requests==2.26.0
37 | scikit-learn==0.24.2
38 | sentence-transformers==2.0.0
39 | spacy==3.0.7
40 | spacy-transformers==1.0.3
41 | torch==1.9.0
42 | elasticsearch==8.3.3


--------------------------------------------------------------------------------
/screenshots/mining_widget_articles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/mining_widget_articles.png


--------------------------------------------------------------------------------
/screenshots/mining_widget_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/mining_widget_text.png


--------------------------------------------------------------------------------
/screenshots/search_widget.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/screenshots/search_widget.png


--------------------------------------------------------------------------------
/src/bluesearch/__init__.py:
--------------------------------------------------------------------------------
 1 | """bluesearch: a Python package for text mining on scientific use cases."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | from bluesearch.version import __version__  # noqa
21 | 


--------------------------------------------------------------------------------
/src/bluesearch/_css/__init__.py:
--------------------------------------------------------------------------------
 1 | """CSS styling utilities."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/src/bluesearch/_css/style.py:
--------------------------------------------------------------------------------
 1 | """CSS styling utilities."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pkg_resources
21 | 
22 | 
def get_css_style():
    """Get content of CSS style sheet.

    Returns
    -------
    str
        The text of the ``stylesheet.css`` resource of this package.
    """
    css_file = pkg_resources.resource_filename(__name__, "stylesheet.css")
    # Decode explicitly as UTF-8 so that the result does not depend on the
    # default locale encoding of the platform the package is running on.
    with open(css_file, "r", encoding="utf-8") as f:
        return f.read()
29 | 


--------------------------------------------------------------------------------
/src/bluesearch/_css/stylesheet.css:
--------------------------------------------------------------------------------
 1 | /*
 2 | Blue Brain Search is a text mining toolbox focused on scientific use cases.
 3 | 
 4 | Copyright (C) 2020  Blue Brain Project, EPFL.
 5 | 
 6 | This program is free software: you can redistribute it and/or modify
 7 | it under the terms of the GNU Lesser General Public License as published by
 8 | the Free Software Foundation, either version 3 of the License, or
 9 | (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU Lesser General Public License for more details.
15 | 
16 | You should have received a copy of the GNU Lesser General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 | 
20 | /* search engine */
21 | 
22 | .article_title {
23 |     font-size: 17px;
24 |     color: #1A0DAB;
25 | }
26 | .paragraph {
27 |     font-size: 13px;
28 |     color: #222;
29 | }
30 | .paragraph_emph {
31 |     font-weight: bold;
32 |     color: #000;
33 | }
34 | .metadata {
35 |     font-size: 13px;
36 |     color: #006621;
37 | }
38 | 
39 | /* success */
40 | .bbs_success {
41 |     color : #388E3B
42 | }
43 | 
44 | /* warnings */
45 | .bbs_warning {
46 |     color: #DDB62C
47 | }
48 | 
49 | /* errors */
50 | .bbs_error {
51 |     color: #E75C58
52 | }
53 | 
54 | /* widgets buttons */
55 | .bbs_button {
56 |     background-color: #3c96f3;
57 |     color: #FFF;
58 |     font-size: 150%;
59 |     transition-duration: 0.2s;
60 | }
61 | .bbs_button:hover {
62 |     background-color: #3176d2;
63 | }
64 | 
65 | .jupyter-button:active, .jupyter-button.mod-active {
66 |     color: #FFF;
67 |     background-color: #3c96f3;
68 | }
69 | .jupyter-button:hover {
70 |     color: #FFF;
71 |     background-color: #3176d2;
72 | }
73 | 
74 | /* attribute extraction */
75 | 
76 | .number  {
77 |     display: inline-block;
78 |     background: lightgreen;
79 |     padding: 0.2em 0.5em;
80 |     border-radius: 7px;
81 | }
82 | .unit {
83 |     display: inline-block;
84 |     background: pink;
85 |     padding: 0.2em 0.5em;
86 |     border-radius: 7px;
87 | }
88 | .quantityType {
89 |     display: inline-block;
90 |     background: yellow;
91 |     font-variant:small-caps;
92 |     padding: 0.2em 0.5em;
93 |     border-radius: 7px;
94 | }
95 | .fixedWidth {
96 |     width: 4px;
97 |     text-align: justify;
98 | }
99 | 


--------------------------------------------------------------------------------
/src/bluesearch/database/__init__.py:
--------------------------------------------------------------------------------
 1 | """Embedding and Mining Databases."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/src/bluesearch/database/pdf.py:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify
 4 | # it under the terms of the GNU Lesser General Public License as published by
 5 | # the Free Software Foundation, either version 3 of the License, or
 6 | # (at your option) any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful,
 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | # GNU Lesser General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | """Module for PDF conversion."""
16 | import requests
17 | 
18 | 
def grobid_is_alive(host: str, port: int) -> bool:
    """Test if the GROBID server is alive.

    This server API is documented here:
    https://grobid.readthedocs.io/en/latest/Grobid-service/#service-checks

    Parameters
    ----------
    host
        Host of the GROBID server.
    port
        Port of the GROBID server.

    Returns
    -------
    bool
        Whether the GROBID server is alive. Any request failure, including
        a connection error or a timeout, is reported as ``False``.
    """
    # Without a timeout a health check against an unresponsive host could
    # block forever. A timeout raises a `requests.RequestException` subclass,
    # so it is handled by the same branch as any other request failure.
    timeout = 10

    try:
        response = requests.get(
            f"http://{host}:{port}/api/isalive",
            timeout=timeout,
        )
    except requests.RequestException:
        return False

    # The server is considered alive only if the body is exactly "true".
    return response.content == b"true"
46 | 
47 | 
def grobid_pdf_to_tei_xml(pdf_content: bytes, host: str, port: int) -> str:
    """Send a PDF to a GROBID server and return its TEI XML rendition.

    The PDF bytes are posted to the ``processFulltextDocument`` endpoint of
    the GROBID API. To set up a GROBID server, see
    https://grobid.readthedocs.io/en/latest/Grobid-docker/.

    Parameters
    ----------
    pdf_content
        Raw bytes of the PDF file.
    host
        Host of the GROBID server.
    port
        Port of the GROBID server.

    Returns
    -------
    str
        Text of the server response, i.e. the TEI XML parsing of the PDF.
    """
    grobid_reply = requests.post(
        url=f"http://{host}:{port}/api/processFulltextDocument",
        files={"input": pdf_content},
        headers={"Accept": "application/xml"},
        timeout=60,
    )
    # Let HTTP error statuses propagate to the caller as exceptions instead
    # of returning the body of an error response as if it were TEI XML.
    grobid_reply.raise_for_status()
    return grobid_reply.text
82 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/__init__.py:
--------------------------------------------------------------------------------
 1 | """Subpackage containing all the entry points."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/database/__init__.py:
--------------------------------------------------------------------------------
1 | """Subpackage for database creation."""
2 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/database/init.py:
--------------------------------------------------------------------------------
 1 | """Initialization of the database."""
 2 | import argparse
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
 9 |     """Initialise the argument parser for the init subcommand.
10 | 
11 |     Parameters
12 |     ----------
13 |     parser
14 |         The argument parser to initialise.
15 | 
16 |     Returns
17 |     -------
18 |     argparse.ArgumentParser
19 |         The initialised argument parser. The same object as the `parser`
20 |         argument.
21 |     """
22 |     parser.description = "Initialize a database."
23 | 
24 |     parser.add_argument(
25 |         "db_url",
26 |         type=str,
27 |         help="""
28 |         The location of the database depending on the database type.
29 | 
30 |         For MySQL and MariaDB the server URL should be provided, for SQLite the
31 |         location of the database file. Generally, the scheme part of
32 |         the URL should be omitted, e.g. for MySQL the URL should be
33 |         of the form 'my_sql_server.ch:1234/my_database' and for SQLite
34 |         of the form '/path/to/the/local/database.db'.
35 |         """,
36 |     )
37 |     parser.add_argument(
38 |         "--db-type",
39 |         default="sqlite",
40 |         type=str,
41 |         choices=("mariadb", "mysql", "postgres", "sqlite"),
42 |         help="Type of the database.",
43 |     )
44 |     return parser
45 | 
46 | 
def run(
    *,
    db_url: str,
    db_type: str,
) -> int:
    """Create the database tables.

    Builds an SQLAlchemy engine for the requested database type, registers
    the table schemas on a fresh `MetaData` object and creates them.

    The parameters and their defaults are described in the `init_parser`
    function.

    Returns
    -------
    int
        Always 0; failures surface as exceptions.

    Raises
    ------
    ValueError
        If `db_type` is not one of the supported database types.
    """
    logger.info("Importing dependencies")
    import sqlalchemy

    from bluesearch.entrypoint.database.schemas import schema_articles, schema_sentences

    # SQLAlchemy URL prefix (dialect + driver) for each supported database type
    url_prefixes = {
        "sqlite": "sqlite:///",
        "mariadb": "mysql+pymysql://",
        "mysql": "mysql+pymysql://",
        "postgres": "postgresql+pg8000://",
    }
    prefix = url_prefixes.get(db_type)
    if prefix is None:
        # This branch never reached because of `choices` in `argparse`
        raise ValueError(f"Unrecognized database type {db_type}")  # pragma: nocover

    engine = sqlalchemy.create_engine(f"{prefix}{db_url}")
    metadata = sqlalchemy.MetaData()

    # Register the schema of every table on the metadata object
    for add_schema in (schema_articles, schema_sentences):
        add_schema(metadata)

    # Create the tables inside a single transaction
    with engine.begin() as connection:
        metadata.create_all(connection)

    logger.info("Initialization done")
    return 0
88 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/database/parse_mesh_rdf.py:
--------------------------------------------------------------------------------
 1 | #  Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | #  Copyright (C) 2022 Blue Brain Project, EPFL.
 4 | #
 5 | #  This program is free software: you can redistribute it and/or modify
 6 | #  it under the terms of the GNU Lesser General Public License as published by
 7 | #  the Free Software Foundation, either version 3 of the License, or
 8 | #  (at your option) any later version.
 9 | #
10 | #  This program is distributed in the hope that it will be useful,
11 | #  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | #  GNU Lesser General Public License for more details.
14 | #
15 | #  You should have received a copy of the GNU Lesser General Public License
16 | #  along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | """CLI sub-command for parsing MeSH RDF files."""
18 | from __future__ import annotations
19 | 
20 | import argparse
21 | import gzip
22 | import json
23 | import logging
24 | import pathlib
25 | 
26 | logger = logging.getLogger(__name__)
27 | 
28 | 
def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Configure the given parser for the ``parse-mesh-rdf`` subcommand.

    Sets the parser description and registers two positional arguments,
    ``mesh_nt_gz_file`` and ``output_json_file``, both converted to
    `pathlib.Path`.

    Parameters
    ----------
    parser
        The argument parser to configure. It is modified in place.

    Returns
    -------
    argparse.ArgumentParser
        The very same object that was passed in as `parser`.
    """
    parser.description = "Parse a MeSH RDF file in N-Triples format."

    # (argument name, help text) for every positional path argument
    path_arguments = (
        (
            "mesh_nt_gz_file",
            """
        Path to a "mesh*.nt.gz" file downloaded from
        https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/
        """,
        ),
        (
            "output_json_file",
            """
        The output file for parsing results. The JSON file will contain a
        flat dictionary with MeSH tree names as keys and corresponding topic
        labels as values.
        """,
        ),
    )
    for arg_name, arg_help in path_arguments:
        parser.add_argument(arg_name, type=pathlib.Path, help=arg_help)

    return parser
62 | 
63 | 
def run(*, mesh_nt_gz_file: pathlib.Path, output_json_file: pathlib.Path) -> int:
    """Parse a MeSH RDF file to extract the topic tree structure.

    See the description of the `init_parser` command for more information on
    the command and its parameters.

    Parameters
    ----------
    mesh_nt_gz_file
        Path to the gzip-compressed N-Triples file to parse.
    output_json_file
        Path of the JSON file the parsing result is written to.

    Returns
    -------
    int
        0 on success, 1 if `mesh_nt_gz_file` does not exist or is not a
        regular file.
    """
    from bluesearch.database import mesh

    if not mesh_nt_gz_file.exists():
        logger.error(f"The file {mesh_nt_gz_file} does not exist.")
        return 1
    if not mesh_nt_gz_file.is_file():
        logger.error(f"The path {mesh_nt_gz_file} must be a file.")
        return 1

    logger.info(f"Parsing the MeSH file {mesh_nt_gz_file.resolve().as_uri()}")
    # N-Triples documents are UTF-8 encoded; be explicit so that decoding does
    # not depend on the locale of the machine running the command.
    with gzip.open(mesh_nt_gz_file, "rt", encoding="utf-8") as fh:
        tree_number_to_label = mesh.parse_tree_numbers(fh)

    logger.info(f"Saving results to {output_json_file.resolve().as_uri()}")
    with open(output_json_file, "w", encoding="utf-8") as fh:
        json.dump(tree_number_to_label, fh)

    logger.info("Done")
    return 0
89 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/database/schemas.py:
--------------------------------------------------------------------------------
 1 | """Module for defining SQL schemas."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | from sqlalchemy import (
21 |     Boolean,
22 |     Column,
23 |     Date,
24 |     ForeignKey,
25 |     Integer,
26 |     MetaData,
27 |     String,
28 |     Table,
29 |     Text,
30 |     UniqueConstraint,
31 | )
32 | 
33 | 
def schema_articles(metadata: MetaData) -> None:
    """Register the table 'articles' on the given 'metadata'.

    The table is keyed by the 32-character string column 'article_id'; all
    remaining columns are free text except 'publish_time' (a date) and
    'is_english' (a boolean).
    """
    # Text columns that directly follow the primary key, in table order
    leading_text_columns = (
        "doi",
        "pmc_id",
        "pubmed_id",
        "title",
        "authors",
        "abstract",
        "journal",
    )

    columns = [Column("article_id", String(32), primary_key=True)]
    columns.extend(Column(column_name, Text()) for column_name in leading_text_columns)
    columns.append(Column("publish_time", Date()))
    columns.append(Column("license", Text()))
    columns.append(Column("is_english", Boolean()))

    Table("articles", metadata, *columns)
51 | 
52 | 
def schema_sentences(metadata: MetaData) -> None:
    """Register the table 'sentences' on the given 'metadata'.

    Every row references a row of the table 'articles' through 'article_id',
    and the combination of article, paragraph position and sentence position
    is constrained to be unique.
    """
    # Columns and constraints, listed in the order they are given to `Table`
    table_items = (
        Column("sentence_id", Integer(), primary_key=True, autoincrement=True),
        Column("section_name", Text()),
        Column("text", Text()),
        Column(
            "article_id",
            String(32),
            ForeignKey("articles.article_id"),
            nullable=False,
        ),
        Column("paragraph_pos_in_article", Integer(), nullable=False),
        Column("sentence_pos_in_paragraph", Integer(), nullable=False),
        UniqueConstraint(
            "article_id",
            "paragraph_pos_in_article",
            "sentence_pos_in_paragraph",
            name="sentence_unique_identifier",
        ),
        Column("is_bad", Boolean(), server_default="0"),
    )
    Table("sentences", metadata, *table_items)
74 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/embedding_server.py:
--------------------------------------------------------------------------------
 1 | """Entrypoint for launching an embedding server."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import logging
21 | import sys
22 | 
23 | from bluesearch.embedding_models import get_embedding_model
24 | from bluesearch.entrypoint._helper import configure_logging, get_var, run_server
25 | 
26 | 
def get_embedding_app():
    """Build the embedding server application.

    The log file and log level are obtained through `get_var`, logging is
    configured, every supported embedding model is loaded and finally an
    `EmbeddingServer` wrapping those models is created.

    Returns
    -------
    EmbeddingServer
        The server application serving the loaded embedding models.
    """
    from bluesearch.server.embedding_server import EmbeddingServer

    # Configuration and logging set-up
    log_file = get_var("BBS_EMBEDDING_LOG_FILE", check_not_set=False)
    log_level = get_var("BBS_EMBEDDING_LOG_LEVEL", logging.INFO, var_type=int)
    configure_logging(log_file, log_level)
    logger = logging.getLogger(__name__)

    # Report the effective configuration
    config_report = (
        " Configuration ".center(80, "-"),
        f"log-file            : {log_file}",
        f"log-level           : {log_level}",
        "-" * 80,
    )
    for report_line in config_report:
        logger.info(report_line)

    # Load the models one after another, keyed by their name
    logger.info("Loading embedding models")
    embedding_models = {}
    for model_name in ("SBERT", "SBioBERT", "BioBERT NLI+STS"):
        embedding_models[model_name] = get_embedding_model(model_name)

    logger.info("Creating the server app")
    return EmbeddingServer(embedding_models)
56 | 
57 | 
def run_embedding_server():
    """Launch the embedding server.

    Delegates to `run_server`, handing it the `get_embedding_app` factory
    and the server name "embedding".
    """
    app_factory = get_embedding_app
    server_name = "embedding"
    run_server(app_factory, server_name)
61 | 
62 | 
# Script entry point: start the embedding server and hand the value returned
# by `run_embedding_server` to `sys.exit`.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(run_embedding_server())
65 | 


--------------------------------------------------------------------------------
/src/bluesearch/entrypoint/search_server.py:
--------------------------------------------------------------------------------
 1 | """The entrypoint script for the search server."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import logging
21 | import pathlib
22 | import sys
23 | 
24 | import sqlalchemy
25 | 
26 | from bluesearch.entrypoint._helper import configure_logging, get_var, run_server
27 | 
28 | 
def get_search_app():
    """Construct the search flask app.

    All settings (log file and level, model and embedding paths, the list
    of models and the MySQL connection details) are obtained through
    `get_var`. After configuring logging, an SQLAlchemy engine for the MySQL
    database is created and passed to `SearchServer` together with the
    paths, the populated embedding row indices and the list of models.

    Returns
    -------
    SearchServer
        The search server application.
    """
    from bluesearch.server.search_server import SearchServer
    from bluesearch.utils import H5

    # Read configuration
    log_file = get_var("BBS_SEARCH_LOG_FILE", check_not_set=False)
    log_level = get_var("BBS_SEARCH_LOG_LEVEL", logging.INFO, var_type=int)

    models_path = get_var("BBS_SEARCH_MODELS_PATH")
    embeddings_path = get_var("BBS_SEARCH_EMBEDDINGS_PATH")
    which_models = get_var("BBS_SEARCH_MODELS")

    mysql_url = get_var("BBS_SEARCH_DB_URL")
    mysql_user = get_var("BBS_SEARCH_MYSQL_USER")
    mysql_password = get_var("BBS_SEARCH_MYSQL_PASSWORD")

    # Configure logging
    configure_logging(log_file, log_level)
    logger = logging.getLogger(__name__)

    logger.info(" Configuration ".center(80, "-"))
    logger.info(f"log-file          : {log_file}")
    logger.info(f"log-level         : {log_level}")
    logger.info(f"models-path       : {models_path}")
    logger.info(f"embeddings-path   : {embeddings_path}")
    logger.info(f"which-models      : {which_models}")
    logger.info(f"mysql_url         : {mysql_url}")
    logger.info(f"mysql_user        : {mysql_user}")
    # Never write the credential itself to the log, only a fixed mask.
    logger.info("mysql_password    : ********")
    logger.info("-" * 80)

    # Initialize flask app
    logger.info("Creating the Flask app")
    models_path = pathlib.Path(models_path)
    embeddings_path = pathlib.Path(embeddings_path)
    # NOTE(review): user and password are inserted verbatim into the URL, so
    # special characters such as "@" or "/" must already be URL-escaped in
    # the configuration values.
    engine_url = f"mysql://{mysql_user}:{mysql_password}@{mysql_url}"
    engine = sqlalchemy.create_engine(engine_url, pool_recycle=14400)
    # The models are given as a comma-separated list; the populated rows are
    # looked up for the first model only.
    models_list = [model.strip() for model in which_models.split(",")]
    indices = H5.find_populated_rows(embeddings_path, models_list[0])

    server_app = SearchServer(
        models_path, embeddings_path, indices, engine, models_list
    )
    return server_app
74 | 
75 | 
def run_search_server():
    """Launch the search server.

    Delegates to `run_server`, handing it the `get_search_app` factory and
    the server name "search".
    """
    app_factory = get_search_app
    server_name = "search"
    run_server(app_factory, server_name)
79 | 
80 | 
# Script entry point: start the search server and hand the value returned by
# `run_search_server` to `sys.exit`.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(run_search_server())
83 | 


--------------------------------------------------------------------------------
/src/bluesearch/k8s/__init__.py:
--------------------------------------------------------------------------------
1 | """Subpackage for Kubernetes related code."""
2 | 


--------------------------------------------------------------------------------
/src/bluesearch/k8s/connect.py:
--------------------------------------------------------------------------------
 1 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 2 | #
 3 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 4 | #
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU Lesser General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | # GNU Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | """connects to ES."""
18 | import logging
19 | import os
20 | 
21 | import urllib3
22 | from dotenv import load_dotenv
23 | from elasticsearch import Elasticsearch
24 | 
# Import-time side effects:
# - load variables from a `.env` file via python-dotenv, presumably so that
#   ES_URL / ES_PASS can be provided there -- TODO confirm;
# - silence urllib3 warnings, presumably because the client below is created
#   with `verify_certs=False` -- TODO confirm.
load_dotenv()
urllib3.disable_warnings()

logger = logging.getLogger(__name__)
29 | 
30 | 
def connect() -> Elasticsearch:
    """Create an Elasticsearch client and check that the server responds.

    The server URL is read from the environment variable ``ES_URL`` and the
    password of the user "elastic" from ``ES_PASS``. Certificate
    verification is turned off.

    Returns
    -------
    Elasticsearch
        A client whose `ping` call succeeded.

    Raises
    ------
    KeyError
        If ``ES_URL`` or ``ES_PASS`` is not set in the environment.
    RuntimeError
        If the server does not answer the ping.
    """
    es_url = os.environ["ES_URL"]
    es_password = os.environ["ES_PASS"]

    client = Elasticsearch(
        es_url,
        basic_auth=("elastic", es_password),
        verify_certs=False,
    )

    if client.ping():
        logger.info("Connected to ES")
        return client

    raise RuntimeError(f"Cannot connect to ES: {es_url}")
45 | 
46 | 
# When run as a script, just try to connect; `connect` raises if the
# Elasticsearch server cannot be reached.
if __name__ == "__main__":
    connect()
49 | 


--------------------------------------------------------------------------------
/src/bluesearch/mining/__init__.py:
--------------------------------------------------------------------------------
 1 | """Subpackage for text mining."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/src/bluesearch/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.
2 | 


--------------------------------------------------------------------------------
/src/bluesearch/server/__init__.py:
--------------------------------------------------------------------------------
 1 | """Implementation of servers."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/src/bluesearch/server/invalid_usage_exception.py:
--------------------------------------------------------------------------------
 1 | """Custom exceptions."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
class InvalidUsage(Exception):
    """An exception used in the REST API server.

    The code was largely copied from
    https://flask.palletsprojects.com/en/1.1.x/patterns/apierrors/

    Parameters
    ----------
    message
        Description of the error; available as the `message` attribute and
        included in the output of `to_dict`.
    status_code
        Status code stored in the `status_code` attribute. If None, the
        value 400 is used.
    """

    def __init__(self, message, status_code=None):
        # Forward the message to the base class so that `str(exc)`,
        # `exc.args` and tracebacks show it instead of an empty string.
        super().__init__(message)
        self.message = message
        self.status_code = 400 if status_code is None else status_code

    def to_dict(self):
        """Generate a dictionary holding the error message.

        Returns
        -------
        dict
            A dictionary with the single key "message".
        """
        return {"message": self.message}
41 | 


--------------------------------------------------------------------------------
/src/bluesearch/widgets/__init__.py:
--------------------------------------------------------------------------------
 1 | """Various widgets related to the BBS."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/tests/data/cord19_v35/document_parses/pmc_json/PMC7186928.xml.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "paper_id": "PMC7186928",
 3 |     "metadata": {
 4 |         "title": "Will we see protection or reinfection in COVID-19?",
 5 |         "authors": [
 6 |             {
 7 |                 "first": "Miyo",
 8 |                 "middle": [],
 9 |                 "last": "Ota",
10 |                 "suffix": "",
11 |                 "email": "sinai.immunology@gmail.com",
12 |                 "affiliation": {}
13 |             }
14 |         ]
15 |     },
16 |     "body_text": [
17 |         {
18 |             "text": "There is rising concern that patients who recover from COVID-19 may be at risk of reinfection. In this preprint, Bao et al. investigated acquired immunity to SARS-CoV-2 in rhesus macaques. Four rhesus monkeys were infected with SARS-CoV-2 and two were reinfected after confirmed recovery. After primary infection, viral replication was detected in the nose, pharynx, lungs and gut, with histopathological evidence of lung damage. Sera collected from recovered monkeys before reinfection exhibited neutralizing activity against SARS-CoV-2. Upon reinfection, viral replication was not detected in nasopharyngeal or anal swabs, and reinfected monkeys did not show any signs of COVID-19 disease recurrence. This suggests that immunity acquired following primary infection with SARS-CoV-2 may protect upon subsequent exposure to the virus.",
19 |             "cite_spans": [],
20 |             "section": "",
21 |             "ref_spans": []
22 |         }
23 |     ],
24 |     "ref_entries": {},
25 |     "back_matter": [],
26 |     "bib_entries": {
27 |         "BIBREF0": {
28 |             "title": "Reinfection could not occur in SARS-CoV-2 infected rhesus macaques",
29 |             "authors": [
30 |                 {
31 |                     "first": "L",
32 |                     "middle": [],
33 |                     "last": "Bao",
34 |                     "suffix": ""
35 |                 }
36 |             ],
37 |             "year": 2020,
38 |             "venue": "bioRxiv",
39 |             "volume": "",
40 |             "issn": "",
41 |             "pages": null,
42 |             "other_ids": {
43 |                 "DOI": [
44 |                     "10.1101/2020.03.13.990226"
45 |                 ]
46 |             }
47 |         }
48 |     }
49 | }


--------------------------------------------------------------------------------
/tests/data/mining/eval/iob_punctuation_after.csv:
--------------------------------------------------------------------------------
 1 | text,class_ann1,class_ann2,class_ann3
 2 | Potato,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE
 3 | Solanum,B-VEGETABLE,I-VEGETABLE,B-VEGETABLE
 4 | tuberosum,I-VEGETABLE,I-VEGETABLE,I-VEGETABLE
 5 | is,O,O,O
 6 | a,O,O,O
 7 | vegetable,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE
 8 | Cherry,B-FRUIT,B-FRUIT,B-FRUIT
 9 | tomato,I-FRUIT,I-FRUIT,I-FRUIT
10 | is,O,O,O
11 | technically,O,O,O
12 | a,O,O,O
13 | fruit,B-FRUIT,B-VEGETABLE,B-FRUIT
14 | but,O,O,O
15 | few,O,O,O
16 | know,O,O,O
17 | that,O,O,O


--------------------------------------------------------------------------------
/tests/data/mining/eval/iob_punctuation_before.csv:
--------------------------------------------------------------------------------
 1 | text,class_ann1,class_ann2,class_ann3
 2 | Potato,B-VEGETABLE,B-VEGETABLE,B-VEGETABLE
 3 | (,B-VEGETABLE,I-VEGETABLE,I-VEGETABLE
 4 | Solanum,I-VEGETABLE,I-VEGETABLE,B-VEGETABLE
 5 | tuberosum,I-VEGETABLE,I-VEGETABLE,I-VEGETABLE
 6 | ),I-VEGETABLE,O,I-VEGETABLE
 7 | is,O,O,O
 8 | a,O,O,O
 9 | """",B-VEGETABLE,O,B-VEGETABLE
10 | vegetable,I-VEGETABLE,B-VEGETABLE,I-VEGETABLE
11 | """",I-VEGETABLE,I-VEGETABLE,B-FRUIT
12 | .,B-FRUIT,I-VEGETABLE,I-FRUIT
13 | """",I-FRUIT,B-FRUIT,I-FRUIT
14 | Cherry,I-FRUIT,I-FRUIT,I-FRUIT
15 | tomato,I-FRUIT,I-FRUIT,I-FRUIT
16 | """",O,I-FRUIT,O
17 | is,O,O,O
18 | technically,O,O,O
19 | a,O,O,O
20 | fruit,B-FRUIT,B-VEGETABLE,B-FRUIT
21 | ",",I-FRUIT,O,I-FRUIT
22 | but,O,O,O
23 | few,O,O,O
24 | know,O,O,O
25 | that,O,O,O
26 | .,O,O,O


--------------------------------------------------------------------------------
/tests/data/mining/request/request.csv:
--------------------------------------------------------------------------------
 1 | entity_type,property,property_type,property_value_type,ontology_source
 2 | DISEASE,,,,UMLS
 3 | CELL_TYPE,,,,
 4 | CHEMICAL,,,,UMLS
 5 | PROTEIN,,,,UMLS
 6 | ORGAN,,,,UMLS
 7 | DISEASE,,,,UMLS
 8 | CHEMICAL,agonist_of,relation,PROTEIN,UMLS
 9 | CHEMICAL,inhibitor_of,relation,PROTEIN,UMLS
10 | CHEMICAL,product_of,relation,PROTEIN,UMLS
11 | ORGANISM,,,,UMLS
12 | CHEMICAL,concentration,attribute,QuantitativeValue,UMLS
13 | DISEASE,is_hereditary,attribute,Boolean,
14 | 


--------------------------------------------------------------------------------
/tests/data/pubmed_article_minimal.xml:
--------------------------------------------------------------------------------
 1 | <PubmedArticle>
 2 |     <MedlineCitation Status="MEDLINE">
 3 |         <PMID Version="1">123456</PMID>
 4 |         <Article PubModel="Print">
 5 |             <Journal>
 6 |                 <JournalIssue CitedMedium="Print">
 7 |                     <PubDate>
 8 |                         <Year>2020</Year>
 9 |                     </PubDate>
10 |                 </JournalIssue>
11 |             </Journal>
12 |             <ArticleTitle>Article Title</ArticleTitle>
13 |             <Pagination>
14 |                 <MedlinePgn>012-34</MedlinePgn>
15 |             </Pagination>
16 |             <Language>eng</Language>
17 |             <PublicationTypeList>
18 |                 <PublicationType UI="D012345">Journal Article</PublicationType>
19 |                 <PublicationType UI="D678901">MeSH Publication Type</PublicationType>
20 |             </PublicationTypeList>
21 |         </Article>
22 |         <MedlineJournalInfo>
23 |             <MedlineTA>Medline TA</MedlineTA>
24 |         </MedlineJournalInfo>
25 |     </MedlineCitation>
26 | </PubmedArticle>
27 | 


--------------------------------------------------------------------------------
/tests/data/pubmed_articles.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!DOCTYPE PubmedArticleSet SYSTEM "http://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
 3 | <PubmedArticleSet>
 4 | <PubmedArticle>
 5 |     <MedlineCitation Status="MEDLINE">
 6 |         <PMID Version="1">123456</PMID>
 7 |         <Article PubModel="Print">
 8 |             <Journal>
 9 |                 <JournalIssue CitedMedium="Print">
10 |                     <PubDate>
11 |                         <Year>2020</Year>
12 |                     </PubDate>
13 |                 </JournalIssue>
14 |             </Journal>
15 |             <ArticleTitle>Article Title 1</ArticleTitle>
16 |             <Pagination>
17 |                 <MedlinePgn>012-34</MedlinePgn>
18 |             </Pagination>
19 |             <Language>eng</Language>
20 |             <PublicationTypeList>
21 |                 <PublicationType UI="D012345">Journal Article</PublicationType>
22 |                 <PublicationType UI="D678901">MeSH Publication Type</PublicationType>
23 |             </PublicationTypeList>
24 |         </Article>
25 |         <MedlineJournalInfo>
26 |             <MedlineTA>Medline TA 1</MedlineTA>
27 |         </MedlineJournalInfo>
28 |     </MedlineCitation>
29 | </PubmedArticle>
30 | <PubmedArticle>
31 |     <MedlineCitation Status="MEDLINE">
32 |         <PMID Version="1">789123</PMID>
33 |         <Article PubModel="Print">
34 |             <Journal>
35 |                 <JournalIssue CitedMedium="Print">
36 |                     <PubDate>
37 |                         <Year>2021</Year>
38 |                     </PubDate>
39 |                 </JournalIssue>
40 |             </Journal>
41 |             <ArticleTitle>Article Title 2</ArticleTitle>
42 |             <Pagination>
43 |                 <MedlinePgn>567-89</MedlinePgn>
44 |             </Pagination>
45 |             <Language>eng</Language>
46 |             <PublicationTypeList>
47 |                 <PublicationType UI="D012345">Journal Article</PublicationType>
48 |                 <PublicationType UI="D678901">MeSH Publication Type</PublicationType>
49 |             </PublicationTypeList>
50 |         </Article>
51 |         <MedlineJournalInfo>
52 |             <MedlineTA>Medline TA 2</MedlineTA>
53 |         </MedlineJournalInfo>
54 |     </MedlineCitation>
55 | </PubmedArticle>
56 | </PubmedArticleSet>
57 | 


--------------------------------------------------------------------------------
/tests/data/pubmed_download_index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 |  <head>
 4 |   <title>Index of /pubmed/updatefiles</title>
 5 |  </head>
 6 |  <body>
 7 | <h1>Index of /pubmed/updatefiles</h1>
 8 | <pre>Name                     Last modified      Size  <hr><a href="/pubmed/">Parent Directory</a>                              -   
 9 | <a href="README.txt">README.txt</a>               2020-12-14 08:15  4.0K  
10 | <a href="pubmed21n1063.xml.gz">pubmed21n1063.xml.gz</a>     2020-12-14 14:10   67M  
11 | <a href="pubmed21n1063.xml.gz.md5">pubmed21n1063.xml.gz.md5</a> 2020-12-14 14:10   60   
12 | <a href="pubmed21n1063_stats.html">pubmed21n1063_stats.html</a> 2020-12-14 14:10  585   
13 | <a href="pubmed21n1064.xml.gz">pubmed21n1064.xml.gz</a>     2020-12-14 14:10   53M  
14 | <a href="pubmed21n1064.xml.gz.md5">pubmed21n1064.xml.gz.md5</a> 2020-12-14 14:10   60   
15 | <a href="pubmed21n1064_stats.html">pubmed21n1064_stats.html</a> 2020-12-14 14:10  582   
16 | <a href="pubmed21n1065.xml.gz">pubmed21n1065.xml.gz</a>     2020-12-14 14:10   12M  
17 | <a href="pubmed21n1065.xml.gz.md5">pubmed21n1065.xml.gz.md5</a> 2020-12-14 14:10   60   
18 | <a href="pubmed21n1065_stats.html">pubmed21n1065_stats.html</a> 2020-12-14 14:10  571   
19 | <a href="pubmed21n1066.xml.gz">pubmed21n1066.xml.gz</a>     2020-12-15 14:04   64M  
20 | <a href="pubmed21n1066.xml.gz.md5">pubmed21n1066.xml.gz.md5</a> 2020-12-15 14:04   60   
21 | <a href="pubmed21n1066_stats.html">pubmed21n1066_stats.html</a> 2020-12-15 14:04  584   
22 | <a href="pubmed21n1067.xml.gz">pubmed21n1067.xml.gz</a>     2020-12-15 14:04  7.7M  
23 | <a href="pubmed21n1067.xml.gz.md5">pubmed21n1067.xml.gz.md5</a> 2020-12-15 14:04   60   
24 | <a href="pubmed21n1067_stats.html">pubmed21n1067_stats.html</a> 2020-12-15 14:04  571   
25 | <a href="pubmed21n1068.xml.gz">pubmed21n1068.xml.gz</a>     2020-12-16 14:02   51M  
26 | <a href="pubmed21n1068.xml.gz.md5">pubmed21n1068.xml.gz.md5</a> 2020-12-16 14:02   60   
27 | <a href="pubmed21n1068_stats.html">pubmed21n1068_stats.html</a> 2020-12-16 14:02  583   
28 | <a href="pubmed21n1069.xml.gz">pubmed21n1069.xml.gz</a>     2020-12-17 14:02   61M  
29 | <a href="pubmed21n1069.xml.gz.md5">pubmed21n1069.xml.gz.md5</a> 2020-12-17 14:02   60   
30 | <a href="pubmed21n1069_stats.html">pubmed21n1069_stats.html</a> 2020-12-17 14:02  582   
31 | <a href="pubmed21n1070.xml.gz">pubmed21n1070.xml.gz</a>     2020-12-18 14:04   57M  
32 | <hr></pre>
33 | </body></html>
34 | 


--------------------------------------------------------------------------------
/tests/unit/database/test_pdf.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import pytest
 4 | import requests
 5 | import responses
 6 | 
 7 | from bluesearch.database.pdf import grobid_is_alive, grobid_pdf_to_tei_xml
 8 | 
 9 | 
@responses.activate
def test_conversion_pdf(monkeypatch):
    """Check that the GROBID conversion helper returns the server's reply.

    The ``processFulltextDocument`` endpoint is faked with ``responses``, so
    no real server is contacted.
    """
    fake_reply = "body"
    endpoint = "http://fake_host:8888/api/processFulltextDocument"
    responses.add(responses.POST, endpoint, body=fake_reply)

    tei_xml = grobid_pdf_to_tei_xml(b"", host="fake_host", port=8888)

    assert tei_xml == fake_reply
    # Exactly one HTTP request must have been issued.
    assert len(responses.calls) == 1
23 | 
24 | 
@pytest.mark.parametrize(
    ("body", "expected_result"),
    [
        ("true", True),
        (requests.RequestException(), False),
        ("false", False),
        ("unknown", False),
    ],
)
@responses.activate
def test_grobid_is_alive(body, expected_result):
    """Check which faked GET replies make ``grobid_is_alive`` return ``True``.

    Parameters
    ----------
    body
        What the faked server answers with; one case passes an exception
        instance instead of a text body.
    expected_result
        The exact boolean ``grobid_is_alive`` must return for that reply.
    """
    host, port = "host", 12345
    # Any path on the fake server is answered with the same body.
    any_url_on_server = re.compile(rf"http://{host}:{port}/.*")
    responses.add(responses.GET, any_url_on_server, body=body)

    is_alive = grobid_is_alive(host, port)

    assert is_alive is expected_result
44 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/__init__.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests covering entrypoint functionalities."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/database/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlueBrain/Search/503fdf320a62ab2eb1f9a2a371600e4f1a38df62/tests/unit/entrypoint/database/__init__.py


--------------------------------------------------------------------------------
/tests/unit/entrypoint/database/test_init.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import sqlalchemy
 4 | 
 5 | from bluesearch.entrypoint.database.parent import main
 6 | from bluesearch.entrypoint.database.schemas import schema_articles, schema_sentences
 7 | 
 8 | 
 9 | def test_sqlite(tmpdir):
10 |     tmpdir = pathlib.Path(str(tmpdir))
11 |     db_path = tmpdir / "database.db"
12 | 
13 |     args_and_opts = [
14 |         "init",
15 |         str(db_path),
16 |         "--db-type=sqlite",
17 |     ]
18 | 
19 |     assert not db_path.exists()
20 | 
21 |     main(args_and_opts)
22 | 
23 |     assert db_path.exists()
24 | 
25 |     engine = sqlalchemy.create_engine(f"sqlite:///{db_path}")
26 |     metadata = sqlalchemy.MetaData(engine)
27 |     metadata.reflect(engine)
28 |     tables = metadata.sorted_tables
29 | 
30 |     expected_metadata = sqlalchemy.MetaData()
31 |     schema_articles(expected_metadata)
32 |     schema_sentences(expected_metadata)
33 |     expected_tables = expected_metadata.sorted_tables
34 | 
35 |     assert len(tables) == len(expected_tables)
36 | 
37 |     for table, expected in zip(tables, expected_tables):
38 |         assert table.compare(expected)
39 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/database/test_parent.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import logging
 4 | import subprocess
 5 | 
 6 | import pytest
 7 | 
 8 | from bluesearch.entrypoint.database.parent import _setup_logging
 9 | 
10 | 
@pytest.mark.parametrize("command", ["add", "convert-pdf", "init", "parse"])
def test_commands_work(command):
    """Check that ``bbs_database <command> --help`` exits successfully."""
    help_invocation = ["bbs_database", command, "--help"]
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(help_invocation, check=True)
14 | 
15 | 
def test_setup_logging(caplog):
    """Check that ``_setup_logging`` only changes the ``bluesearch`` loggers.

    All ``bluesearch`` loggers start at WARNING and must end up at DEBUG,
    while the effective level of every other registered logger must stay
    the same.
    """

    def effective_levels(loggers: dict[str, logging.Logger]) -> dict[str, int]:
        """Map each logger name to its effective logging level."""
        levels = {}
        for name, logger in loggers.items():
            levels[name] = logger.getEffectiveLevel()
        return levels

    caplog.set_level(logging.WARNING, logger="bluesearch")

    # Split the registered loggers into ours and everybody else's. Entries
    # that are not real ``Logger`` instances are ignored.
    ours = {}
    theirs = {}
    for name, entry in logging.root.manager.loggerDict.items():
        if not isinstance(entry, logging.Logger):
            continue
        if name.startswith("bluesearch"):
            ours[name] = entry
        else:
            theirs[name] = entry

    ours_before = effective_levels(ours)
    theirs_before = effective_levels(theirs)

    _setup_logging(logging.DEBUG)

    ours_after = effective_levels(ours)
    theirs_after = effective_levels(theirs)

    assert set(ours_before.values()) == {logging.WARNING}
    assert set(ours_after.values()) == {logging.DEBUG}

    assert theirs_before == theirs_after
47 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/test__helper.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import argparse
 4 | from typing import Sequence
 5 | 
 6 | import pytest
 7 | 
 8 | from bluesearch.entrypoint._helper import parse_args_or_environment
 9 | 
10 | 
def test_parse_args_or_environment(monkeypatch):
    """Check how CLI values and environment variables are combined.

    Five scenarios are exercised in sequence on the same parser: the
    argument is absent and not mapped, given on the CLI only, given through
    the environment only, given through both, and mapped to an environment
    variable that is not set.
    """
    cli_value, environ_value = "5", "6"
    environ_target = "bluesearch.entrypoint._helper.os.environ"

    parser = argparse.ArgumentParser()
    parser.add_argument("--normal-arg")
    parser.add_argument("--env-arg", default=argparse.SUPPRESS)

    # 1. --env-arg is neither passed nor mapped to an environment variable.
    parsed = vars(parse_args_or_environment(parser, {}, []))
    assert "normal_arg" in parsed
    assert "env_arg" not in parsed

    # 2. --env-arg is passed on the command line.
    namespace = parse_args_or_environment(parser, {}, ["--env-arg", cli_value])
    parsed = vars(namespace)
    assert "normal_arg" in parsed
    assert "env_arg" in parsed
    assert namespace.env_arg == cli_value

    # 3. --env-arg is only available through the environment.
    monkeypatch.setattr(environ_target, {"ENV_ARG": environ_value})
    namespace = parse_args_or_environment(parser, {"env_arg": "ENV_ARG"}, [])
    parsed = vars(namespace)
    assert "normal_arg" in parsed
    assert "env_arg" in parsed
    assert namespace.env_arg == environ_value

    # 4. CLI arguments take precedence over environment variables.
    monkeypatch.setattr(environ_target, {"ENV_ARG": environ_value})
    namespace = parse_args_or_environment(
        parser, {"env_arg": "ENV_ARG"}, ["--env-arg", cli_value]
    )
    parsed = vars(namespace)
    assert "normal_arg" in parsed
    assert "env_arg" in parsed
    assert namespace.env_arg == cli_value

    # 5. The value is given neither on the CLI nor in the environment.
    monkeypatch.setattr(environ_target, {})
    with pytest.raises(SystemExit) as exit_info:
        parse_args_or_environment(parser, {"env_arg": "ENV_ARG"}, [])
    assert exit_info.value.code == 1
71 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/test_embedding_server.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests focusing on the `embedding_server` entrypoint."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pathlib
21 | from unittest.mock import Mock
22 | 
23 | from bluesearch.entrypoint.embedding_server import get_embedding_app
24 | from bluesearch.server.embedding_server import EmbeddingServer
25 | 
26 | 
def test_environment_reading(monkeypatch, tmpdir):
    """Check that ``get_embedding_app`` builds the server from the environment.

    ``EmbeddingServer`` and the embedding model classes are replaced by
    mocks, so the test only verifies what is handed to the server
    constructor and that the constructed instance is returned.
    """
    # The log file is created up front and advertised via the environment.
    logfile = pathlib.Path(str(tmpdir)) / "log.txt"
    logfile.touch()
    monkeypatch.setenv("BBS_EMBEDDING_LOG_FILE", str(logfile))

    server_instance = Mock(spec=EmbeddingServer)
    server_class = Mock(return_value=server_instance)
    monkeypatch.setattr(
        "bluesearch.server.embedding_server.EmbeddingServer",
        server_class,
    )

    # Mock all of our embedding models.
    for model in ["SentTransformer"]:
        monkeypatch.setattr(f"bluesearch.embedding_models.{model}", Mock())

    embedding_app = get_embedding_app()

    assert embedding_app is server_instance

    # The server must have received a single positional argument: a dict.
    positional = server_class.call_args[0]
    assert len(positional) == 1
    assert isinstance(positional[0], dict)
56 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/test_entrypoint_installation.py:
--------------------------------------------------------------------------------
 1 | """Tests covering entrypoint installation."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import subprocess
21 | 
22 | import pytest
23 | 
24 | 
@pytest.mark.parametrize(
    "entrypoint_name",
    (
        "bbs_database",
        "compute_embeddings",
        "create_database",
        "create_mining_cache",
        "embedding_server",
        "mining_server",
        "search_server",
    ),
)
def test_entrypoint(entrypoint_name):
    """Check that ``<entrypoint_name> --help`` can be run and exits cleanly."""
    help_invocation = [entrypoint_name, "--help"]
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(help_invocation, check=True)
39 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/test_mining_server.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests focused on the `mining_server`."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pathlib
21 | from unittest.mock import Mock
22 | 
23 | import pytest
24 | 
25 | from bluesearch.entrypoint.mining_server import get_mining_app
26 | 
27 | 
@pytest.mark.parametrize(
    ("db_type", "sqlite_db_exists"),
    [
        ("sqlite", True),
        ("sqlite", False),
        ("mysql", False),
        ("wrong", False),
    ],
)
def test_send_through(
    tmpdir, monkeypatch, db_type, sqlite_db_exists, entity_types, spacy_model_path
):
    """Check that ``get_mining_app`` forwards its settings to ``MiningServer``.

    Configuration is supplied through ``BBS_MINING_*`` environment variables.
    ``MiningServer`` and ``sqlalchemy`` are mocked; an unsupported database
    type must raise ``ValueError``.
    """
    workdir = pathlib.Path(str(tmpdir))
    db_path = workdir / "something.db"

    if sqlite_db_exists:
        db_path.parent.mkdir(exist_ok=True, parents=True)
        db_path.touch()

    environment = {
        "BBS_MINING_LOG_FILE": str(workdir / "log.txt"),
        "BBS_MINING_DB_TYPE": db_type,
        "BBS_MINING_DB_URL": str(db_path),
        "BBS_MINING_MYSQL_USER": "some_user",
        "BBS_MINING_MYSQL_PASSWORD": "some_pwd",
        "BBS_DATA_AND_MODELS_DIR": str(spacy_model_path),
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    sqlalchemy_stub = Mock()
    server_instance = Mock()
    server_class = Mock(return_value=server_instance)

    monkeypatch.setattr("bluesearch.server.mining_server.MiningServer", server_class)
    monkeypatch.setattr(
        "bluesearch.entrypoint.mining_server.sqlalchemy", sqlalchemy_stub
    )

    # Unsupported backends are rejected before any server is built.
    if db_type not in {"mysql", "sqlite"}:
        with pytest.raises(ValueError):
            get_mining_app()
        return

    mining_app = get_mining_app()

    server_class.assert_called_once()
    assert mining_app == server_instance

    # The server is expected to be built from keyword arguments only.
    positional, keywords = server_class.call_args
    assert not positional
    assert keywords["connection"] == sqlalchemy_stub.create_engine.return_value

    models_libs = keywords["models_libs"]
    assert "ee" in models_libs
    assert isinstance(models_libs["ee"], dict)
    assert len(models_libs["ee"]) == len(entity_types)
81 | 


--------------------------------------------------------------------------------
/tests/unit/entrypoint/test_search_sever.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests focused on "search_server" entrypoint."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pathlib
21 | from unittest.mock import Mock
22 | 
23 | import numpy as np
24 | import pytest
25 | 
26 | from bluesearch.entrypoint.search_server import get_search_app
27 | from bluesearch.server.search_server import SearchServer
28 | 
29 | 
@pytest.mark.parametrize(
    "embeddings_path,models,models_path",
    [
        ("path_1", ["A", "B", "C"], "path_a"),
        ("path_2", ["X", "Y"], "path_b"),
    ],
)
def test_send_through(tmpdir, monkeypatch, embeddings_path, models, models_path):
    """Check that ``get_search_app`` forwards its settings to ``SearchServer``.

    Configuration is supplied through ``BBS_SEARCH_*`` environment variables.
    ``sqlalchemy``, ``H5`` and ``SearchServer`` are mocked, so only the
    constructor arguments and the returned instance are verified.
    """
    populated_rows = np.arange(1, 11)

    environment = {
        "BBS_SEARCH_LOG_FILE": str(pathlib.Path(str(tmpdir)) / "log.txt"),
        "BBS_SEARCH_MODELS_PATH": models_path,
        "BBS_SEARCH_EMBEDDINGS_PATH": embeddings_path,
        "BBS_SEARCH_MODELS": ",".join(models),
        "BBS_SEARCH_DB_URL": "some_url",
        "BBS_SEARCH_MYSQL_USER": "some_user",
        "BBS_SEARCH_MYSQL_PASSWORD": "some_pwd",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    sqlalchemy_stub = Mock()
    h5_stub = Mock()
    h5_stub.find_populated_rows.return_value = populated_rows
    server_instance = Mock(spec=SearchServer)
    server_class = Mock(return_value=server_instance)

    monkeypatch.setattr(
        "bluesearch.entrypoint.search_server.sqlalchemy", sqlalchemy_stub
    )
    monkeypatch.setattr("bluesearch.utils.H5", h5_stub)
    monkeypatch.setattr("bluesearch.server.search_server.SearchServer", server_class)

    server_app = get_search_app()

    # Each collaborator must have been used exactly once.
    server_class.assert_called_once()
    h5_stub.find_populated_rows.assert_called_once()
    sqlalchemy_stub.create_engine.assert_called_once()

    assert server_app is server_instance

    # Positional constructor arguments, in the order they were passed.
    positional = server_class.call_args[0]

    assert positional[0] == pathlib.Path(models_path)
    assert positional[1] == pathlib.Path(embeddings_path)
    np.testing.assert_array_equal(positional[2], np.arange(1, 11))
    assert positional[3] is sqlalchemy_stub.create_engine.return_value
    assert positional[4] == models
79 | 


--------------------------------------------------------------------------------
/tests/unit/k8s/test_create_indices.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from bluesearch.k8s.create_indices import add_index, remove_index
 4 | 
 5 | 
 6 | def test_create_and_remove_index(get_es_client):
 7 |     client = get_es_client
 8 | 
 9 |     if client is None:
10 |         pytest.skip("Elastic search is not available")
11 | 
12 |     index = "test_index"
13 | 
14 |     add_index(client, index)
15 |     remove_index(client, index)
16 | 


--------------------------------------------------------------------------------
/tests/unit/server/__init__.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests covering server functionalities."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 


--------------------------------------------------------------------------------
/tests/unit/server/test_embedding_server.py:
--------------------------------------------------------------------------------
 1 | """Tests covering embedding server"""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | from unittest.mock import Mock
21 | 
22 | import numpy as np
23 | import pytest
24 | 
25 | from bluesearch.server.embedding_server import EmbeddingServer
26 | 
27 | 
@pytest.fixture(scope="session")
def embedding_client():
    """Yield a test client for an ``EmbeddingServer`` backed by a mocked model.

    The only registered model is ``"sbiobert"``; its ``preprocess`` and
    ``embed`` methods return fixed values, so no real model is loaded.
    """
    fake_model = Mock()
    fake_model.preprocess.return_value = "This is a dummy sentence"
    fake_model.embed.return_value = np.ones((2,))

    app = EmbeddingServer(embedding_models={"sbiobert": fake_model})
    app.config["TESTING"] = True

    with app.test_client() as client:
        yield client
41 | 
42 | 
class TestEmbeddingServer:
    """HTTP-level checks of the embedding server routes."""

    def test_embedding_server_help(self, embedding_client):
        """The help route answers 200 and reports the server name."""
        reply = embedding_client.post("/help")
        assert reply.status_code == 200
        assert reply.json["name"] == "EmbeddingServer"

    def test_embedding_server_welcome(self, embedding_client):
        """The root route accepts GET and rejects POST with 405."""
        assert embedding_client.get("/").status_code == 200
        assert embedding_client.post("/").status_code == 405

    def test_embedding_server_embed(self, embedding_client):
        """The embed route answers 200 for valid requests and 400 otherwise."""
        # (route, keyword arguments for ``post``, expected status code)
        scenarios = [
            ("/v1/embed/json", {"json": {"model": "sbiobert", "text": "hello"}}, 200),
            # The "text" field is missing.
            ("/v1/embed/json", {"json": {"model": "sbiobert"}}, 400),
            ("/v1/embed/csv", {"json": {"model": "sbiobert", "text": "hello"}}, 200),
            # The requested model is not registered.
            (
                "/v1/embed/csv",
                {"json": {"model": "invalid_model", "text": "hello"}},
                400,
            ),
            # The payload is not JSON.
            ("/v1/embed/csv", {"data": "not json"}, 400),
            # The output format is not supported.
            ("/v1/embed/invalid_format", {"data": "not json"}, 400),
        ]

        for route, payload, expected_status in scenarios:
            reply = embedding_client.post(route, **payload)
            assert reply.status_code == expected_status
77 | 


--------------------------------------------------------------------------------
/tests/unit/server/test_search_server.py:
--------------------------------------------------------------------------------
 1 | """Tests covering the search server."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | from unittest.mock import Mock
21 | 
22 | import numpy as np
23 | import pytest
24 | 
25 | from bluesearch.server.search_server import SearchServer
26 | from bluesearch.utils import H5
27 | 
28 | 
@pytest.fixture
def search_client(
    monkeypatch, embeddings_h5_path, fake_sqlalchemy_engine, test_parameters
):
    """Yield a test client for a ``SearchServer`` using a mocked model.

    ``get_embedding_model`` is patched to always return a mock whose
    ``embed`` yields a vector of ones of length
    ``test_parameters["embedding_size"]``.
    """
    embedding_size = test_parameters["embedding_size"]

    model_stub = Mock()
    model_stub.preprocess.return_value = "hello"
    model_stub.embed.return_value = np.ones((embedding_size,))

    def return_model_stub(*args, **kwargs):
        """Ignore every argument and hand back the mocked model."""
        return model_stub

    monkeypatch.setattr(
        "bluesearch.server.search_server.get_embedding_model",
        return_model_stub,
    )

    populated_rows = H5.find_populated_rows(embeddings_h5_path, "SBioBERT")

    app = SearchServer(
        trained_models_path="",
        embeddings_h5_path=embeddings_h5_path,
        indices=populated_rows,
        connection=fake_sqlalchemy_engine,
        models=["SBioBERT"],
    )
    app.config["TESTING"] = True

    with app.test_client() as client:
        yield client
58 | 
59 | 
class TestSearchServer:
    """HTTP-level checks of the search server routes."""

    def test_search_server(self, search_client):
        """Exercise the help route, a valid query and a non-JSON query."""
        # Help request: the server reports its own name.
        help_reply = search_client.post("/help")
        assert help_reply.status_code == 200
        assert help_reply.json["name"] == "SearchServer"

        # Valid JSON request: exactly ``top_k`` results come back.
        top_k = 3
        query = {"which_model": "SBioBERT", "k": top_k, "query_text": "hello"}
        search_reply = search_client.post("/", json=query)
        assert search_reply.status_code == 200
        payload = search_reply.json
        assert len(payload["sentence_ids"]) == top_k
        assert len(payload["similarities"]) == top_k

        # Non-JSON request: still answered with 200, but with empty results.
        invalid_reply = search_client.post("/", data="data is not a json")
        assert invalid_reply.status_code == 200
        payload = invalid_reply.json
        assert payload["sentence_ids"] is None
        assert payload["similarities"] is None
82 | 


--------------------------------------------------------------------------------
/tests/unit/test_fixtures.py:
--------------------------------------------------------------------------------
 1 | """Collection of tests that make sure that fixtures are set up correctly.
 2 | 
 3 | Notes
 4 | -----
 5 | The internals of fixtures might vary based on how conftest.py sets them up.
 6 | The goal of these tests is to run simple sanity checks rather than detailed
 7 | bookkeeping.
 8 | """
 9 | 
10 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
11 | #
12 | # Copyright (C) 2020  Blue Brain Project, EPFL.
13 | #
14 | # This program is free software: you can redistribute it and/or modify
15 | # it under the terms of the GNU Lesser General Public License as published by
16 | # the Free Software Foundation, either version 3 of the License, or
17 | # (at your option) any later version.
18 | #
19 | # This program is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 | # GNU Lesser General Public License for more details.
23 | #
24 | # You should have received a copy of the GNU Lesser General Public License
25 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
26 | 
27 | import pandas as pd
28 | import pytest
29 | import sqlalchemy
30 | from sqlalchemy.exc import OperationalError, ProgrammingError
31 | 
32 | 
33 | def test_database(fake_sqlalchemy_engine, backend_database):
34 |     """Make sure database tables setup correctly."""
35 |     inspector = sqlalchemy.inspect(fake_sqlalchemy_engine)
36 | 
37 |     for table_name in ["articles", "sentences", "mining_cache"]:
38 |         res = pd.read_sql("SELECT * FROM {}".format(table_name), fake_sqlalchemy_engine)
39 | 
40 |         if table_name != "articles":
41 |             # Mysql consider that sentences table has 2 indexes (article_id one + UNIQUE
42 |             # constraint)
43 |             # sqlite will only consider 1 index for this table (article_id one)
44 |             assert len(inspector.get_indexes(table_name)) >= 1
45 | 
46 |         assert len(res) > 0
47 |     if backend_database == "sqlite":
48 |         with pytest.raises(OperationalError):
49 |             fake_sqlalchemy_engine.execute("SELECT * FROM fake_table").all()
50 |     else:
51 |         with pytest.raises(ProgrammingError):
52 |             fake_sqlalchemy_engine.execute("SELECT * FROM fake_table").all()
53 | 
54 | 
55 | def test_h5(embeddings_h5_path):
56 |     assert embeddings_h5_path.is_file()  # fixture must point at an existing file
57 | 
58 | 
59 | def test_metadata(metadata_path):
60 |     """Make sure all metadata csv is correct"""
61 |     df = pd.read_csv(str(metadata_path))
62 | 
63 |     assert len(df) > 0
64 | 
65 | 
66 | def test_jsons(jsons_path):
67 |     """Make sure all jsons are present."""
68 |     n_json_files = len(list(jsons_path.rglob("*.json")))
69 | 
70 |     assert n_json_files > 0
71 | 


--------------------------------------------------------------------------------
/tests/unit/widgets/test_mining_schema.py:
--------------------------------------------------------------------------------
 1 | """Tests covering the MiningSchema class."""
 2 | 
 3 | # Blue Brain Search is a text mining toolbox focused on scientific use cases.
 4 | #
 5 | # Copyright (C) 2020  Blue Brain Project, EPFL.
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU Lesser General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | import pytest
21 | 
22 | from bluesearch.widgets.mining_schema import MiningSchema
23 | 
24 | 
25 | def test_add_entity():
26 |     mining_schema = MiningSchema()
27 | 
28 |     # Test adding entities
29 |     mining_schema.add_entity(
30 |         "CHEMICAL",
31 |         property_name="isChiral",
32 |         property_type="ATTRIBUTE",
33 |         property_value_type="BOOLEAN",
34 |         ontology_source="NCIT",
35 |     )
36 |     mining_schema.add_entity("DRUG")
37 |     assert len(mining_schema.schema_df) == 2
38 | 
39 |     # Test warning upon adding a duplicate entity
40 |     with pytest.warns(UserWarning, match=r"already exists"):
41 |         mining_schema.add_entity("DRUG")
42 | 
43 | 
44 | def test_df(mining_schema_df):
45 |     # We won't be testing for duplicates in this test
46 |     mining_schema_df = mining_schema_df.drop_duplicates(ignore_index=True)
47 | 
48 |     # Test adding from a dataframe
49 |     mining_schema = MiningSchema()
50 |     mining_schema.add_from_df(mining_schema_df)
51 |     # Make sure a copy is returned
52 |     assert mining_schema.df is not mining_schema.schema_df
53 |     # Check that all data was added
54 |     assert mining_schema.df.equals(mining_schema_df)
55 | 
56 |     # Test missing entity_type
57 |     wrong_schema_df = mining_schema_df.drop("entity_type", axis=1)
58 |     mining_schema = MiningSchema()
59 |     with pytest.raises(ValueError, match=r"entity_type.* not found"):
60 |         mining_schema.add_from_df(wrong_schema_df)
61 | 
62 |     # Test ignoring unknown columns
63 |     schema_df_new = mining_schema_df.drop_duplicates().copy()
64 |     schema_df_new["unknown_column"] = list(range(len(schema_df_new)))
65 |     mining_schema = MiningSchema()
66 |     with pytest.warns(UserWarning, match=r"column.* unknown_column"):
67 |         mining_schema.add_from_df(schema_df_new)
68 |     # Check that all data was added and the unknown columns was ignored
69 |     assert mining_schema.df.equals(mining_schema_df)
70 | 


--------------------------------------------------------------------------------