├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ └── feature.md ├── copilot-instructions.md ├── pull_request_template.md └── workflows │ ├── delete-preview-docs.yml │ ├── documentation.yml │ ├── release.yml │ ├── test-build.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── changelog.md ├── contributing.md ├── demo ├── app.py └── requirements.txt ├── docs ├── advanced-tutorials │ ├── fastapi.md │ ├── index.md │ └── word-vectors.md ├── assets │ ├── fragments │ │ ├── aids-examples.md │ │ ├── alcohol-examples.md │ │ ├── cerebrovascular-accident-examples.md │ │ ├── ckd-examples.md │ │ ├── congestive-heart-failure-examples.md │ │ ├── connective-tissue-disease-examples.md │ │ ├── copd-examples.md │ │ ├── dementia-examples.md │ │ ├── diabetes-examples.md │ │ ├── hemiplegia-examples.md │ │ ├── leukemia-examples.md │ │ ├── liver-disease-examples.md │ │ ├── lymphoma-examples.md │ │ ├── myocardial-infarction-examples.md │ │ ├── peptic-ulcer-disease-examples.md │ │ ├── peripheral-vascular-disease-examples.md │ │ ├── solid-tumor-examples.md │ │ └── tobacco-examples.md │ ├── images │ │ ├── class_span_linker.png │ │ ├── edsnlp-ner.svg │ │ ├── hybrid-pipeline-example.png │ │ ├── model-parallelism.png │ │ ├── multiprocessing.png │ │ ├── sharing-components.png │ │ ├── synonym_span_linker.png │ │ └── transformer-windowing.svg │ ├── logo │ │ ├── aphp-blue.svg │ │ ├── aphp-white.svg │ │ └── edsnlp.svg │ ├── overrides │ │ ├── main.html │ │ └── partials │ │ │ └── comments.html │ ├── stylesheets │ │ ├── cards.css │ │ ├── extra.css │ │ ├── giscus_dark.css │ │ └── giscus_light.css │ ├── templates │ │ └── python │ │ │ └── material │ │ │ ├── class.html │ │ │ ├── docstring.html │ │ │ ├── docstring │ │ │ ├── examples.html │ │ │ └── parameters.html │ │ │ └── function.html │ └── termynal │ │ ├── termynal.css │ │ └── termynal.js ├── concepts │ ├── inference.md │ ├── pipeline.md │ └── torch-component.md ├── data │ ├── 
conll.md │ ├── converters.md │ ├── index.md │ ├── json.md │ ├── overview.png │ ├── pandas.md │ ├── parquet.md │ ├── polars.md │ ├── spark.md │ └── standoff.md ├── index.md ├── pipes │ ├── architecture.md │ ├── core │ │ ├── contextual-matcher.md │ │ ├── endlines.md │ │ ├── index.md │ │ ├── matcher.md │ │ ├── normalizer.md │ │ ├── resources │ │ │ ├── alignment.svg │ │ │ └── span-alignment.svg │ │ ├── sentences.md │ │ └── terminology.md │ ├── index.md │ ├── misc │ │ ├── consultation-dates.md │ │ ├── dates.md │ │ ├── index.md │ │ ├── quantities.md │ │ ├── reason.md │ │ ├── sections.md │ │ ├── split.md │ │ └── tables.md │ ├── ner │ │ ├── adicap.md │ │ ├── behaviors │ │ │ ├── alcohol.md │ │ │ ├── index.md │ │ │ └── tobacco.md │ │ ├── cim10.md │ │ ├── covid.md │ │ ├── disorders │ │ │ ├── aids.md │ │ │ ├── cerebrovascular-accident.md │ │ │ ├── ckd.md │ │ │ ├── congestive-heart-failure.md │ │ │ ├── connective-tissue-disease.md │ │ │ ├── copd.md │ │ │ ├── dementia.md │ │ │ ├── diabetes.md │ │ │ ├── hemiplegia.md │ │ │ ├── index.md │ │ │ ├── leukemia.md │ │ │ ├── liver-disease.md │ │ │ ├── lymphoma.md │ │ │ ├── myocardial-infarction.md │ │ │ ├── peptic-ulcer-disease.md │ │ │ ├── peripheral-vascular-disease.md │ │ │ └── solid-tumor.md │ │ ├── drugs.md │ │ ├── index.md │ │ ├── scores │ │ │ ├── charlson.md │ │ │ ├── elston-ellis.md │ │ │ ├── emergency-ccmu.md │ │ │ ├── emergency-gemsa.md │ │ │ ├── emergency-priority.md │ │ │ ├── index.md │ │ │ └── sofa.md │ │ ├── suicide_attempt.md │ │ ├── tnm.md │ │ └── umls.md │ ├── qualifiers │ │ ├── family.md │ │ ├── history.md │ │ ├── hypothesis.md │ │ ├── index.md │ │ ├── negation.md │ │ └── reported-speech.md │ └── trainable │ │ ├── biaffine-dependency-parser.md │ │ ├── embeddings │ │ ├── span_pooler.md │ │ ├── text_cnn.md │ │ └── transformer.md │ │ ├── extractive-qa.md │ │ ├── index.md │ │ ├── ner.md │ │ ├── span-classifier.md │ │ └── span-linker.md ├── references.bib ├── resources │ └── sections.svg ├── scripts │ ├── autorefs │ │ ├── 
LICENSE │ │ └── plugin.py │ ├── bibtex.py │ ├── cards.py │ ├── clickable_snippets.py │ ├── griffe_ext.py │ └── plugin.py ├── tokenizers.md ├── tutorials │ ├── aggregating-results.md │ ├── detecting-dates.md │ ├── endlines.md │ ├── index.md │ ├── make-a-training-script.md │ ├── matching-a-terminology.md │ ├── multiple-texts.md │ ├── qualifying-entities.md │ ├── reason.md │ ├── spacy101.md │ ├── training.md │ ├── tuning.md │ └── visualization.md └── utilities │ ├── connectors │ ├── brat.md │ ├── labeltool.md │ ├── omop.md │ └── overview.md │ ├── evaluation.md │ ├── index.md │ ├── matchers.md │ ├── regex.md │ └── tests │ ├── blocs.md │ ├── examples.md │ └── index.md ├── edsnlp ├── __init__.py ├── conjugator.py ├── connectors │ ├── __init__.py │ ├── brat.py │ ├── labeltool.py │ └── omop.py ├── core │ ├── __init__.py │ ├── pipeline.py │ ├── registries.py │ ├── stream.py │ └── torch_component.py ├── data │ ├── __init__.py │ ├── base.py │ ├── brat.py │ ├── conll.py │ ├── converters.py │ ├── json.py │ ├── pandas.py │ ├── parquet.py │ ├── polars.py │ ├── spark.py │ └── standoff.py ├── evaluate.py ├── extensions.py ├── language.py ├── matchers │ ├── __init__.py │ ├── phrase.pxd │ ├── phrase.pyx │ ├── regex.py │ ├── simstring.py │ └── utils │ │ ├── __init__.py │ │ ├── offset.py │ │ └── text.py ├── metrics │ ├── __init__.py │ ├── dep_parsing.py │ ├── ner.py │ └── span_attributes.py ├── package.py ├── patch_spacy.py ├── pipes │ ├── __init__.py │ ├── base.py │ ├── core │ │ ├── __init__.py │ │ ├── contextual_matcher │ │ │ ├── __init__.py │ │ │ ├── contextual_matcher.py │ │ │ ├── factory.py │ │ │ └── models.py │ │ ├── endlines │ │ │ ├── __init__.py │ │ │ ├── endlines.py │ │ │ ├── factory.py │ │ │ ├── functional.py │ │ │ └── model.py │ │ ├── matcher │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── matcher.py │ │ ├── normalizer │ │ │ ├── __init__.py │ │ │ ├── accents │ │ │ │ ├── __init__.py │ │ │ │ ├── accents.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── 
factory.py │ │ │ ├── normalizer.py │ │ │ ├── pollution │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── pollution.py │ │ │ ├── quotes │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── quotes.py │ │ │ ├── remove_lowercase │ │ │ │ ├── __init__.py │ │ │ │ └── factory.py │ │ │ └── spaces │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ └── spaces.py │ │ ├── sentences │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── fast_sentences.pxd │ │ │ ├── fast_sentences.pyx │ │ │ ├── sentences.py │ │ │ └── terms.py │ │ └── terminology │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── terminology.py │ ├── misc │ │ ├── __init__.py │ │ ├── consultation_dates │ │ │ ├── __init__.py │ │ │ ├── consultation_dates.py │ │ │ ├── factory.py │ │ │ └── patterns.py │ │ ├── dates │ │ │ ├── __init__.py │ │ │ ├── dates.py │ │ │ ├── factory.py │ │ │ ├── models.py │ │ │ └── patterns │ │ │ │ ├── __init__.py │ │ │ │ ├── absolute.py │ │ │ │ ├── atomic │ │ │ │ ├── __init__.py │ │ │ │ ├── days.py │ │ │ │ ├── delimiters.py │ │ │ │ ├── directions.py │ │ │ │ ├── modes.py │ │ │ │ ├── months.py │ │ │ │ ├── numbers.py │ │ │ │ ├── time.py │ │ │ │ ├── units.py │ │ │ │ └── years.py │ │ │ │ ├── current.py │ │ │ │ ├── duration.py │ │ │ │ ├── false_positive.py │ │ │ │ └── relative.py │ │ ├── quantities │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── quantities.py │ │ ├── reason │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── reason.py │ │ ├── sections │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── sections.py │ │ ├── split │ │ │ ├── __init__.py │ │ │ └── split.py │ │ └── tables │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── tables.py │ ├── ner │ │ ├── __init__.py │ │ ├── adicap │ │ │ ├── __init__.py │ │ │ ├── adicap.py │ │ │ ├── factory.py │ │ │ ├── models.py │ │ │ └── patterns.py │ │ ├── behaviors │ │ │ ├── __init__.py │ │ │ ├── 
alcohol │ │ │ │ ├── __init__.py │ │ │ │ ├── alcohol.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ └── tobacco │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── tobacco.py │ │ ├── cim10 │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── patterns.py │ │ ├── covid │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── patterns.py │ │ ├── disorders │ │ │ ├── __init__.py │ │ │ ├── aids │ │ │ │ ├── __init__.py │ │ │ │ ├── aids.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── base.py │ │ │ ├── cerebrovascular_accident │ │ │ │ ├── __init__.py │ │ │ │ ├── cerebrovascular_accident.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── ckd │ │ │ │ ├── __init__.py │ │ │ │ ├── ckd.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── congestive_heart_failure │ │ │ │ ├── __init__.py │ │ │ │ ├── congestive_heart_failure.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── connective_tissue_disease │ │ │ │ ├── __init__.py │ │ │ │ ├── connective_tissue_disease.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── copd │ │ │ │ ├── __init__.py │ │ │ │ ├── copd.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── dementia │ │ │ │ ├── __init__.py │ │ │ │ ├── dementia.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── diabetes │ │ │ │ ├── __init__.py │ │ │ │ ├── diabetes.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── hemiplegia │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── hemiplegia.py │ │ │ │ └── patterns.py │ │ │ ├── leukemia │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── leukemia.py │ │ │ │ └── patterns.py │ │ │ ├── liver_disease │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── liver_disease.py │ │ │ │ └── patterns.py │ │ │ ├── lymphoma │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── lymphoma.py │ │ │ │ └── patterns.py │ │ │ ├── myocardial_infarction │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── myocardial_infarction.py 
│ │ │ │ └── patterns.py │ │ │ ├── peptic_ulcer_disease │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── peptic_ulcer_disease.py │ │ │ ├── peripheral_vascular_disease │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── peripheral_vascular_disease.py │ │ │ ├── solid_tumor │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── solid_tumor.py │ │ │ └── terms.py │ │ ├── drugs │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── patterns.py │ │ ├── scores │ │ │ ├── __init__.py │ │ │ ├── base_score.py │ │ │ ├── charlson │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── elston_ellis │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ └── patterns.py │ │ │ ├── emergency │ │ │ │ ├── __init__.py │ │ │ │ ├── ccmu │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── factory.py │ │ │ │ │ └── patterns.py │ │ │ │ ├── gemsa │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── factory.py │ │ │ │ │ └── patterns.py │ │ │ │ └── priority │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── factory.py │ │ │ │ │ └── patterns.py │ │ │ ├── factory.py │ │ │ └── sofa │ │ │ │ ├── __init__.py │ │ │ │ ├── factory.py │ │ │ │ ├── patterns.py │ │ │ │ └── sofa.py │ │ ├── suicide_attempt │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── suicide_attempt.py │ │ ├── tnm │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── model.py │ │ │ ├── patterns.py │ │ │ └── tnm.py │ │ └── umls │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── patterns.py │ ├── qualifiers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── family │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── family.py │ │ │ └── patterns.py │ │ ├── history │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── history.py │ │ │ └── patterns.py │ │ ├── hypothesis │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── hypothesis.py │ │ │ └── patterns.py │ │ ├── negation │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── negation.py │ │ │ └── 
patterns.py │ │ └── reported_speech │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ ├── patterns.py │ │ │ └── reported_speech.py │ ├── terminations.py │ └── trainable │ │ ├── __init__.py │ │ ├── biaffine_dep_parser │ │ ├── __init__.py │ │ ├── biaffine_dep_parser.py │ │ └── factory.py │ │ ├── embeddings │ │ ├── __init__.py │ │ ├── span_pooler │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── span_pooler.py │ │ ├── text_cnn │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── text_cnn.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── factory.py │ │ │ └── transformer.py │ │ └── typing.py │ │ ├── extractive_qa │ │ ├── __init__.py │ │ ├── extractive_qa.py │ │ └── factory.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── crf.py │ │ ├── metric.py │ │ └── text_cnn.py │ │ ├── ner_crf │ │ ├── __init__.py │ │ ├── factory.py │ │ └── ner_crf.py │ │ ├── span_classifier │ │ ├── __init__.py │ │ ├── factory.py │ │ └── span_classifier.py │ │ └── span_linker │ │ ├── __init__.py │ │ ├── factory.py │ │ └── span_linker.py ├── processing │ ├── __init__.py │ ├── deprecated_pipe.py │ ├── multiprocessing.py │ ├── simple.py │ └── spark.py ├── reducers.py ├── resources │ ├── AVC.csv.gz │ ├── adicap.json.gz │ ├── cim10.csv.gz │ ├── drugs.json │ └── verbs.csv.gz ├── train.py ├── training │ ├── __init__.py │ ├── optimizer.py │ └── trainer.py ├── tune.py ├── utils │ ├── __init__.py │ ├── batching.py │ ├── bindings.py │ ├── collections.py │ ├── deprecation.py │ ├── doc_to_text.py │ ├── examples.py │ ├── extensions.py │ ├── file_system.py │ ├── filter.py │ ├── inclusion.py │ ├── lazy_module.py │ ├── numbers.py │ ├── regex.py │ ├── resources.py │ ├── span_getters.py │ ├── spark_dtypes.py │ ├── stream_sentinels.py │ ├── torch.py │ └── typing.py └── viz │ └── __init__.py ├── mkdocs.yml ├── notebooks ├── README.md ├── connectors │ ├── context.py │ └── omop.md ├── context.py ├── dates │ ├── context.py │ ├── prototype.md │ └── user-guide.md ├── endlines │ └── endlines-example.md ├── example.txt ├── 
normalizer │ ├── context.py │ ├── profiling.md │ └── prototype.md ├── pipeline.md ├── premier-pipeline.md ├── sections │ ├── annotated_sections.csv │ ├── context.py │ ├── section-dataset.md │ ├── sections.xlsx │ └── testing.md ├── sentences │ ├── context.py │ └── sentences.md ├── tnm │ └── prototype.md ├── tokenizer │ ├── context.py │ └── tokenizer.md └── utilities │ ├── brat.md │ └── context.py ├── pyproject.toml ├── scripts ├── adicap.py ├── cim10.py ├── conjugate_verbs.py ├── context.py └── serve.py ├── setup.py └── tests ├── conftest.py ├── connectors ├── test_labeltool.py └── test_omop.py ├── data ├── test_conll.py ├── test_converters.py ├── test_json.py ├── test_pandas.py ├── test_parquet.py ├── test_polars.py ├── test_spark.py ├── test_standoff.py └── test_stream.py ├── extract_docs_code.py ├── helpers.py ├── matchers ├── test_phrase.py ├── test_regex.py └── test_simstring.py ├── pipelines ├── core │ ├── test_contextual_matcher.py │ ├── test_endlines.py │ ├── test_matcher.py │ ├── test_normalisation.py │ ├── test_sentences.py │ └── test_terminology.py ├── misc │ ├── test_consultation_date.py │ ├── test_consultation_date_town.py │ ├── test_dates.py │ ├── test_quantities.py │ ├── test_reason.py │ ├── test_sections.py │ ├── test_split.py │ └── test_tables.py ├── ner │ ├── disorders │ │ ├── AIDS.py │ │ ├── CKD.py │ │ ├── COPD.py │ │ ├── alcohol.py │ │ ├── cerebrovascular_accident.py │ │ ├── congestive_heart_failure.py │ │ ├── connective_tissue_disease.py │ │ ├── dementia.py │ │ ├── diabetes.py │ │ ├── hemiplegia.py │ │ ├── leukemia.py │ │ ├── liver_disease.py │ │ ├── lymphoma.py │ │ ├── myocardial_infarction.py │ │ ├── peptic_ulcer_disease.py │ │ ├── peripheral_vascular_disease.py │ │ ├── solid_tumor.py │ │ ├── test_all.py │ │ └── tobacco.py │ ├── test_adicap.py │ ├── test_adicap_decoder.py │ ├── test_cim10.py │ ├── test_covid.py │ ├── test_drugs.py │ ├── test_score.py │ ├── test_suicide_attempt.py │ ├── test_tnm.py │ ├── test_umls.py │ └── 
test_value_extension.py ├── qualifiers │ ├── conftest.py │ ├── test_family.py │ ├── test_history.py │ ├── test_hypothesis.py │ ├── test_negation.py │ └── test_reported_speech.py ├── test_pipelines.py └── trainable │ ├── test_extractive_qa.py │ ├── test_ner.py │ ├── test_span_linker.py │ ├── test_span_qualifier.py │ └── test_transformer.py ├── processing ├── mp_simple_pipe.py ├── test_backends.py └── test_processing.py ├── readme.md ├── resources ├── brat_data │ └── subfolder │ │ ├── doc-1.ann │ │ ├── doc-1.txt │ │ ├── doc-2.txt │ │ └── doc-3.txt ├── docs.jsonl └── docs.parquet ├── test_conjugator.py ├── test_docs.py ├── test_entrypoints.py ├── test_language.py ├── test_pipeline.py ├── test_reducers.py ├── test_scorers.py ├── test_span_args.py ├── training ├── dataset.jsonl ├── dataset │ ├── annotation.conf │ ├── sample-1.ann │ ├── sample-1.txt │ ├── sample-2.ann │ └── sample-2.txt ├── dep_parser_config.yml ├── ner_qlf_diff_bert_config.yml ├── ner_qlf_same_bert_config.yml ├── qlf_config.yml ├── rhapsodie_sample.conllu ├── test_optimizer.py └── test_train.py ├── tuning ├── config.cfg ├── config.yml ├── test_checkpoints │ ├── single_phase_gpu_hour │ │ └── study_.pkl │ ├── single_phase_n_trials │ │ └── study_.pkl │ ├── two_phase_gpu_hour │ │ ├── config.yml │ │ ├── results_summary.txt │ │ └── study_.pkl │ └── two_phase_n_trials │ │ ├── config.yml │ │ ├── results_summary.txt │ │ └── study_.pkl ├── test_end_to_end.py ├── test_tuning.py └── test_update_config.py └── utils ├── test_batching.py ├── test_bindings.py ├── test_collections.py ├── test_examples.py ├── test_filter.py ├── test_package.py ├── test_span_getters.py ├── test_spark_dtypes.py └── test_typing.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,build 4 | per-file-ignores = __init__.py:F401 5 | ignore = W503, E203 6 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Bug Report" 3 | about: Use this template if you came across a bug or unexpected behaviour differing from the docs. 4 | --- 5 | 6 | 7 | 8 | ## Description 9 | 10 | 11 | 12 | ## How to reproduce the bug 13 | 14 | 15 | 16 | ```python 17 | import spacy 18 | 19 | nlp = spacy.blank("fr") 20 | nlp.add_pipe("eds.normalizer") 21 | 22 | # ... 23 | ``` 24 | 25 | ## Your Environment 26 | 27 | 28 | 29 | - Operating System: 30 | - Python Version Used: 31 | - spaCy Version Used: 32 | - EDS-NLP Version Used: 33 | - Environment Information: 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Feature request" 3 | about: Use this template if you'd like EDS-NLP to add a new feature. 4 | title: "Feature request: [feature]" 5 | --- 6 | 7 | ## Feature type 8 | 9 | 10 | 11 | ## Description 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/copilot-instructions.md: -------------------------------------------------------------------------------- 1 | First, when starting to develop, install the project with 2 | 3 | ```bash 4 | pip install -e ".[dev]" 5 | pre-commit install 6 | ``` 7 | 8 | Then, when fixing an issue, add a new test to reproduce it. If the issue concerns an existing 9 | component, add the test to the corresponding test file, or create a new test file (only when needed, this should not be the most common scenario). 10 | 11 | Create a new branch (or checkout the auto-created branch for the issue). 12 | 13 | Then update the codebase to fix the issue, and run the new test to check that everything is working as expected. 
14 | 15 | Update the changelog.md file with a concise explanation of the change/fix/new feature. 16 | 17 | Before commiting, stash, checkout master and pull to ensure you have the latest version of master, then checkout the branch you were working on and rebase it on top of master. 18 | If the rebase has changed something to the codebase, rerun the edited tests to ensure everything is still working as expected. 19 | 20 | Finally, run git log to look at the commit messages and get an idea of what the commit messages should look like (concise, neutral, conventional commits messages). 21 | 22 | ```bash 23 | git log --no-pager 24 | ``` 25 | 26 | Finally commit the changes. 27 | 28 | !!! note 29 | 30 | Whenever you run a command, ensure that you do it without making it prompt the user for input (ie, use --no-edit in git rebase, --no-pager, --yes, etc. when possible). 31 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | 7 | ## Checklist 8 | 9 | 10 | 11 | - [ ] If this PR is a bug fix, the bug is documented in the test suite. 12 | - [ ] Changes were documented in the changelog (pending section). 13 | - [ ] If necessary, changes were made to the documentation (eg new pipeline). 
14 | -------------------------------------------------------------------------------- /.github/workflows/delete-preview-docs.yml: -------------------------------------------------------------------------------- 1 | name: Delete preview docs 2 | 3 | on: 4 | workflow_dispatch: 5 | delete: 6 | 7 | jobs: 8 | delete: 9 | name: Delete Vercel Project 10 | if: github.event.ref_type == 'branch' 11 | runs-on: ubuntu-latest 12 | steps: 13 | - run: | 14 | # Set up Vercel 15 | npm install --global vercel@latest 16 | # Pull Vercel environment 17 | vercel pull --yes --environment=preview --token=${{ secrets.VERCEL_TOKEN }} 18 | # Delete vercel project linked to this branch 19 | vercel remove edsnlp-${{ github.event.ref }} --yes --token=${{ secrets.VERCEL_TOKEN }} 20 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: [master, dev] 7 | 8 | env: 9 | BRANCH_NAME: ${{ github.head_ref || github.ref_name }} 10 | # UV_INDEX_STRATEGY: "unsafe-first-match" 11 | # UV_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" 12 | PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" 13 | 14 | jobs: 15 | Documentation: 16 | runs-on: ubuntu-22.04 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.9" 23 | cache: 'pip' 24 | 25 | - run: echo WEEK=$(date +%V) >>$GITHUB_ENV 26 | shell: bash 27 | 28 | # - uses: hynek/setup-cached-uv@v1 29 | # with: 30 | # cache-suffix: -docs-${{ matrix.python-version }}-${{ env.WEEK }} 31 | 32 | - name: Install dependencies 33 | run: | 34 | pip install '.[docs]' 35 | # uv venv 36 | # uv pip install '.[docs]' 37 | 38 | - name: Set up Git 39 | run: | 40 | git config user.name ${{ github.actor }} 41 | git config user.email ${{ github.actor 
}}@users.noreply.github.com 42 | 43 | - name: Build documentation 44 | run: | 45 | git fetch origin gh-pages 46 | mike delete $BRANCH_NAME 47 | mike deploy --push $BRANCH_NAME 48 | # source .venv/bin/activate 49 | -------------------------------------------------------------------------------- /.github/workflows/test-build.yml: -------------------------------------------------------------------------------- 1 | # This tries to build packages, and tests the packages. 2 | # It runs on every push to branches following the pattern v*.*.*. 3 | # It makes sure that everything will run when the version is released. 4 | 5 | name: Test Build 6 | 7 | 8 | on: 9 | workflow_dispatch: 10 | pull_request: 11 | branches: 12 | - v*.*.* 13 | - build-* 14 | 15 | jobs: 16 | build_wheels: 17 | name: Build wheels on ${{ matrix.os }} 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | matrix: 21 | os: [ubuntu-22.04, windows-latest, macos-latest] 22 | 23 | steps: 24 | - uses: actions/checkout@v2 25 | 26 | - name: Build wheels 27 | # 2.4 is too low (can't build for macos, 2.16 is too high (OpenSSL issues) 28 | uses: pypa/cibuildwheel@v2.16.5 29 | env: 30 | CIBW_ARCHS_MACOS: "x86_64 arm64" 31 | CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 32 | 33 | 34 | build_sdist: 35 | name: Build source distribution 36 | runs-on: ubuntu-22.04 37 | steps: 38 | - uses: actions/checkout@v2 39 | 40 | - name: Build sdist 41 | run: pipx run build --sdist 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | 4 | # Distribution / packaging 5 | init 6 | .Python 7 | env/ 8 | venv/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | site/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | *.cpp 25 | *.so 26 | *.c 27 | public/ 28 | 29 | # 
Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .coverage.* 34 | .cache 35 | nosetests.xml 36 | coverage.xml 37 | *,cover 38 | .hypothesis/ 39 | .pytest_cache/ 40 | 41 | # Documentation 42 | _build/ 43 | 44 | # Notebooks 45 | .ipynb_checkpoints/ 46 | *.ipynb 47 | 48 | # Data 49 | *.csv 50 | *.pickle 51 | *.txt 52 | *.xls 53 | *.xlsx 54 | *.tar.gz 55 | *.tsv 56 | *.ann 57 | 58 | # Editors 59 | .idea 60 | .vscode 61 | 62 | # Files 63 | .DS_Store 64 | 65 | # Environment 66 | .venv 67 | 68 | # Test resources 69 | !tests/resources/**/* 70 | 71 | # Generated docs 72 | docs/reference 73 | docs/changelog.md 74 | docs/contributing.md 75 | .vercel 76 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | exclude: | 9 | (?x)^( 10 | tests/resources/.*| 11 | edsnlp/resources/.* 12 | )$ 13 | - id: no-commit-to-branch 14 | - id: end-of-file-fixer 15 | - id: check-yaml 16 | args: ["--unsafe"] 17 | - id: check-toml 18 | - id: check-json 19 | - id: check-symlinks 20 | - id: check-added-large-files 21 | - id: detect-private-key 22 | # ruff 23 | - repo: https://github.com/charliermarsh/ruff-pre-commit 24 | # Ruff version. 
25 | rev: 'v0.9.6' 26 | hooks: 27 | - id: ruff 28 | args: ['--config', 'pyproject.toml', '--fix', '--show-fixes'] 29 | - id: ruff-format 30 | args: ['--config', 'pyproject.toml', '--diff'] 31 | - id: ruff-format 32 | args: ['--config', 'pyproject.toml'] 33 | - repo: https://github.com/asottile/blacken-docs 34 | rev: v1.10.0 35 | hooks: 36 | - id: blacken-docs 37 | additional_dependencies: [black==20.8b1] 38 | exclude: notebooks/ 39 | - repo: https://github.com/econchick/interrogate 40 | rev: 237be78 41 | hooks: 42 | - id: interrogate 43 | args: ["--config=pyproject.toml"] 44 | pass_filenames: false 45 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: >- 6 | EDS-NLP: efficient information extraction from 7 | French clinical notes 8 | message: If you use EDS-NLP, please cite us as below. 
9 | type: software 10 | authors: 11 | - given-names: Perceval 12 | family-names: Wajsburt 13 | affiliation: Assistance Publique – Hôpitaux de Paris 14 | - given-names: Thomas 15 | family-names: Petit-Jean 16 | affiliation: Assistance Publique – Hôpitaux de Paris 17 | - given-names: Basile 18 | family-names: Dura 19 | orcid: "https://orcid.org/0000-0002-8315-4050" 20 | affiliation: Assistance Publique – Hôpitaux de Paris 21 | - given-names: Ariel 22 | family-names: Cohen 23 | orcid: "https://orcid.org/0000-0002-2550-9773" 24 | affiliation: Assistance Publique – Hôpitaux de Paris 25 | - given-names: Charline 26 | family-names: Jean 27 | affiliation: Assistance Publique – Hôpitaux de Paris 28 | - given-names: Romain 29 | family-names: Bey 30 | affiliation: Assistance Publique – Hôpitaux de Paris 31 | repository-code: "https://github.com/aphp/edsnlp" 32 | url: "http://aphp.github.io/edsnlp" 33 | abstract: >- 34 | EDS-NLP provides a set of spaCy components that are 35 | used to extract information from clinical notes 36 | written in French. 37 | keywords: 38 | - NLP 39 | - clinical 40 | license: BSD-3-Clause 41 | year: 2022 42 | doi: 10.5281/zenodo.6424993 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Assistance Publique - Hôpitaux de Paris 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. 
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .ONESHELL: 3 | SHELL:=/bin/bash 4 | 5 | .PHONY: create-env install documentation test 6 | 7 | default: 8 | @echo "Call a specific subcommand: create-env,install,documentation,test" 9 | 10 | .venv: 11 | python -m venv .venv 12 | 13 | create-env: .venv 14 | 15 | install : .venv 16 | . .venv/bin/activate 17 | pip install -r '.[dev,setup]'.txt 18 | python scripts/conjugate_verbs.py 19 | pip install -e . 20 | pre-commit install 21 | 22 | documentation: .venv 23 | . .venv/bin/activate 24 | pip install -e '.[docs]' 25 | mkdocs serve 26 | 27 | test: .venv 28 | . 
.venv/bin/activate 29 | python -m pytest 30 | -------------------------------------------------------------------------------- /demo/requirements.txt: -------------------------------------------------------------------------------- 1 | pydantic-core==2.14.4 2 | git+https://github.com/aphp/edsnlp.git 3 | streamlit 4 | -------------------------------------------------------------------------------- /docs/advanced-tutorials/index.md: -------------------------------------------------------------------------------- 1 | # Advanced use cases 2 | 3 | In this section, we review a few advanced use cases: 4 | 5 | - Adding pre-computed word vectors to spaCy 6 | - Deploying your spaCy pipeline as an API 7 | - Creating your own component 8 | -------------------------------------------------------------------------------- /docs/assets/fragments/aids-examples.md: -------------------------------------------------------------------------------- 1 | === "SIDA" 2 | ```python 3 | text = "Patient atteint du VIH au stade SIDA." 4 | doc = nlp(text) 5 | spans = doc.spans["aids"] 6 | 7 | spans 8 | # Out: [VIH au stade SIDA] 9 | ``` 10 | 11 | 12 | 13 | === "VIH" 14 | ```python 15 | text = "Patient atteint du VIH." 
16 | doc = nlp(text) 17 | spans = doc.spans["aids"] 18 | 19 | spans 20 | # Out: [] 21 | ``` 22 | 23 | 24 | 25 | === "Coinfection" 26 | ```python 27 | text = "Il y a un VIH avec coinfection pneumocystose" 28 | doc = nlp(text) 29 | spans = doc.spans["aids"] 30 | 31 | spans 32 | # Out: [VIH] 33 | 34 | span = spans[0] 35 | 36 | span._.assigned 37 | # Out: {'opportunist': [coinfection, pneumocystose]} 38 | ``` 39 | 40 | 41 | 42 | === "VIH stade SIDA" 43 | ```python 44 | text = "Présence d'un VIH stade C" 45 | doc = nlp(text) 46 | spans = doc.spans["aids"] 47 | 48 | spans 49 | # Out: [VIH] 50 | 51 | span = spans[0] 52 | 53 | span._.assigned 54 | # Out: {'stage': [C]} 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/assets/fragments/congestive-heart-failure-examples.md: -------------------------------------------------------------------------------- 1 | 2 | === "1" 3 | ```python 4 | text = "Présence d'un oedème pulmonaire" 5 | doc = nlp(text) 6 | spans = doc.spans["congestive_heart_failure"] 7 | 8 | spans 9 | # Out: [oedème pulmonaire] 10 | ``` 11 | 12 | === "2" 13 | ```python 14 | text = "Le patient est équipé d'un pace-maker" 15 | doc = nlp(text) 16 | spans = doc.spans["congestive_heart_failure"] 17 | 18 | spans 19 | # Out: [pace-maker] 20 | ``` 21 | 22 | === "3" 23 | ```python 24 | text = "Un cardiopathie non décompensée" 25 | doc = nlp(text) 26 | spans = doc.spans["congestive_heart_failure"] 27 | 28 | spans 29 | # Out: [] 30 | ``` 31 | 32 | === "4" 33 | ```python 34 | text = "Insuffisance cardiaque" 35 | doc = nlp(text) 36 | spans = doc.spans["congestive_heart_failure"] 37 | 38 | spans 39 | # Out: [Insuffisance cardiaque] 40 | ``` 41 | 42 | === "5" 43 | ```python 44 | text = "Insuffisance cardiaque minime" 45 | doc = nlp(text) 46 | spans = doc.spans["congestive_heart_failure"] 47 | 48 | spans 49 | # Out: [] 50 | ``` 51 | -------------------------------------------------------------------------------- 
/docs/assets/fragments/connective-tissue-disease-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Présence d'une sclérodermie." 4 | doc = nlp(text) 5 | spans = doc.spans["connective_tissue_disease"] 6 | 7 | spans 8 | # Out: [sclérodermie] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Patient atteint d'un lupus." 16 | doc = nlp(text) 17 | spans = doc.spans["connective_tissue_disease"] 18 | 19 | spans 20 | # Out: [lupus] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Présence d'anticoagulants lupiques," 28 | doc = nlp(text) 29 | spans = doc.spans["connective_tissue_disease"] 30 | 31 | spans 32 | # Out: [] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "Il y a une MICI." 40 | doc = nlp(text) 41 | spans = doc.spans["connective_tissue_disease"] 42 | 43 | spans 44 | # Out: [MICI] 45 | ``` 46 | 47 | 48 | 49 | === "5" 50 | ```python 51 | text = "Syndrome de Raynaud" 52 | doc = nlp(text) 53 | spans = doc.spans["connective_tissue_disease"] 54 | 55 | spans 56 | # Out: [Raynaud] 57 | ``` 58 | -------------------------------------------------------------------------------- /docs/assets/fragments/copd-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Une fibrose interstitielle diffuse idiopathique" 4 | doc = nlp(text) 5 | spans = doc.spans["copd"] 6 | 7 | spans 8 | # Out: [fibrose interstitielle diffuse idiopathique] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Patient atteint de pneumoconiose" 16 | doc = nlp(text) 17 | spans = doc.spans["copd"] 18 | 19 | spans 20 | # Out: [pneumoconiose] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Présence d'une HTAP." 
28 | doc = nlp(text) 29 | spans = doc.spans["copd"] 30 | 31 | spans 32 | # Out: [HTAP] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "On voit une hypertension pulmonaire minime" 40 | doc = nlp(text) 41 | spans = doc.spans["copd"] 42 | 43 | spans 44 | # Out: [] 45 | ``` 46 | 47 | 48 | 49 | === "5" 50 | ```python 51 | text = "La patiente a été mis sous oxygénorequérance" 52 | doc = nlp(text) 53 | spans = doc.spans["copd"] 54 | 55 | spans 56 | # Out: [] 57 | ``` 58 | 59 | 60 | 61 | === "6" 62 | ```python 63 | text = "La patiente est sous oxygénorequérance au long cours" 64 | doc = nlp(text) 65 | spans = doc.spans["copd"] 66 | 67 | spans 68 | # Out: [oxygénorequérance au long cours] 69 | 70 | span = spans[0] 71 | 72 | span._.assigned 73 | # Out: {'long': [long cours]} 74 | ``` 75 | -------------------------------------------------------------------------------- /docs/assets/fragments/dementia-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "D'importants déficits cognitifs" 4 | doc = nlp(text) 5 | spans = doc.spans["dementia"] 6 | 7 | spans 8 | # Out: [déficits cognitifs] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Patient atteint de démence" 16 | doc = nlp(text) 17 | spans = doc.spans["dementia"] 18 | 19 | spans 20 | # Out: [démence] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "On retrouve des anti-SLA" 28 | doc = nlp(text) 29 | spans = doc.spans["dementia"] 30 | 31 | spans 32 | # Out: [] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "Une maladie de Charcot" 40 | doc = nlp(text) 41 | spans = doc.spans["dementia"] 42 | 43 | spans 44 | # Out: [maladie de Charcot] 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/assets/fragments/hemiplegia-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | 
text = "Patient hémiplégique" 4 | doc = nlp(text) 5 | spans = doc.spans["hemiplegia"] 6 | 7 | spans 8 | # Out: [hémiplégique] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Paralysie des membres inférieurs" 16 | doc = nlp(text) 17 | spans = doc.spans["hemiplegia"] 18 | 19 | spans 20 | # Out: [Paralysie des membres] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Patient en LIS" 28 | doc = nlp(text) 29 | spans = doc.spans["hemiplegia"] 30 | 31 | spans 32 | # Out: [LIS] 33 | ``` 34 | -------------------------------------------------------------------------------- /docs/assets/fragments/leukemia-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Sydrome myéloprolifératif" 4 | doc = nlp(text) 5 | spans = doc.spans["leukemia"] 6 | 7 | spans 8 | # Out: [myéloprolifératif] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Sydrome myéloprolifératif bénin" 16 | doc = nlp(text) 17 | spans = doc.spans["leukemia"] 18 | 19 | spans 20 | # Out: [] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Patient atteint d'une LAM" 28 | doc = nlp(text) 29 | spans = doc.spans["leukemia"] 30 | 31 | spans 32 | # Out: [LAM] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "Une maladie de Vaquez" 40 | doc = nlp(text) 41 | spans = doc.spans["leukemia"] 42 | 43 | spans 44 | # Out: [Vaquez] 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/assets/fragments/liver-disease-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Il y a une fibrose hépatique" 4 | doc = nlp(text) 5 | spans = doc.spans["liver_disease"] 6 | 7 | spans 8 | # Out: [fibrose hépatique] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Une hépatite B chronique" 16 | doc = nlp(text) 17 | spans = doc.spans["liver_disease"] 18 | 19 | 
spans 20 | # Out: [hépatite B chronique] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Le patient consulte pour une cirrhose" 28 | doc = nlp(text) 29 | spans = doc.spans["liver_disease"] 30 | 31 | spans 32 | # Out: [cirrhose] 33 | 34 | span = spans[0] 35 | 36 | span._.detailed_status 37 | # Out: MODERATE_TO_SEVERE 38 | ``` 39 | 40 | 41 | 42 | === "4" 43 | ```python 44 | text = "Greffe hépatique." 45 | doc = nlp(text) 46 | spans = doc.spans["liver_disease"] 47 | 48 | spans 49 | # Out: [Greffe hépatique] 50 | 51 | span = spans[0] 52 | 53 | span._.detailed_status 54 | # Out: MODERATE_TO_SEVERE 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/assets/fragments/lymphoma-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Un lymphome de Hodgkin." 4 | doc = nlp(text) 5 | spans = doc.spans["lymphoma"] 6 | 7 | spans 8 | # Out: [lymphome de Hodgkin] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Atteint d'un Waldenstörm" 16 | doc = nlp(text) 17 | spans = doc.spans["lymphoma"] 18 | 19 | spans 20 | # Out: [Waldenstörm] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Un LAGC" 28 | doc = nlp(text) 29 | spans = doc.spans["lymphoma"] 30 | 31 | spans 32 | # Out: [LAGC] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "anti LAGC: 10^4/mL" 40 | doc = nlp(text) 41 | spans = doc.spans["lymphoma"] 42 | 43 | spans 44 | # Out: [] 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/assets/fragments/myocardial-infarction-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Une cardiopathie ischémique" 4 | doc = nlp(text) 5 | spans = doc.spans["myocardial_infarction"] 6 | 7 | spans 8 | # Out: [cardiopathie ischémique] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 
| text = "Une cardiopathie non-ischémique" 16 | doc = nlp(text) 17 | spans = doc.spans["myocardial_infarction"] 18 | 19 | spans 20 | # Out: [] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "Présence d'un stent sur la marginale" 28 | doc = nlp(text) 29 | spans = doc.spans["myocardial_infarction"] 30 | 31 | spans 32 | # Out: [stent sur la marginale] 33 | 34 | span = spans[0] 35 | 36 | span._.assigned 37 | # Out: {'heart_localized': [marginale]} 38 | ``` 39 | 40 | 41 | 42 | === "4" 43 | ```python 44 | text = "Présence d'un stent périphérique" 45 | doc = nlp(text) 46 | spans = doc.spans["myocardial_infarction"] 47 | 48 | spans 49 | # Out: [] 50 | ``` 51 | 52 | 53 | 54 | === "5" 55 | ```python 56 | text = "infarctus du myocarde" 57 | doc = nlp(text) 58 | spans = doc.spans["myocardial_infarction"] 59 | 60 | spans 61 | # Out: [infarctus du myocarde] 62 | 63 | span = spans[0] 64 | 65 | span._.assigned 66 | # Out: {'heart_localized': [myocarde]} 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/assets/fragments/peptic-ulcer-disease-examples.md: -------------------------------------------------------------------------------- 1 | === "1" 2 | ```python 3 | text = "Beaucoup d'ulcères gastriques" 4 | doc = nlp(text) 5 | spans = doc.spans["peptic_ulcer_disease"] 6 | 7 | spans 8 | # Out: [ulcères gastriques] 9 | ``` 10 | 11 | 12 | 13 | === "2" 14 | ```python 15 | text = "Présence d'UGD" 16 | doc = nlp(text) 17 | spans = doc.spans["peptic_ulcer_disease"] 18 | 19 | spans 20 | # Out: [UGD] 21 | ``` 22 | 23 | 24 | 25 | === "3" 26 | ```python 27 | text = "La patient à des ulcères" 28 | doc = nlp(text) 29 | spans = doc.spans["peptic_ulcer_disease"] 30 | 31 | spans 32 | # Out: [] 33 | ``` 34 | 35 | 36 | 37 | === "4" 38 | ```python 39 | text = "Au niveau gastrique: blabla blabla blabla blabla blabla quelques ulcères" 40 | doc = nlp(text) 41 | spans = doc.spans["peptic_ulcer_disease"] 42 | 43 | spans 44 | # Out: 
[gastrique: blabla blabla blabla blabla blabla quelques ulcères] 45 | 46 | span = spans[0] 47 | 48 | span._.assigned 49 | # Out: {'is_peptic': [gastrique]} 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/assets/images/class_span_linker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/class_span_linker.png -------------------------------------------------------------------------------- /docs/assets/images/hybrid-pipeline-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/hybrid-pipeline-example.png -------------------------------------------------------------------------------- /docs/assets/images/model-parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/model-parallelism.png -------------------------------------------------------------------------------- /docs/assets/images/multiprocessing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/multiprocessing.png -------------------------------------------------------------------------------- /docs/assets/images/sharing-components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/sharing-components.png -------------------------------------------------------------------------------- /docs/assets/images/synonym_span_linker.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/assets/images/synonym_span_linker.png -------------------------------------------------------------------------------- /docs/assets/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block announce %} 4 | Check out the new Model Training tutorial ! 5 | {% endblock %} 6 | -------------------------------------------------------------------------------- /docs/assets/templates/python/material/docstring/examples.html: -------------------------------------------------------------------------------- 1 | {{ "# Examples\n"|convert_markdown(heading_level, html_id) }} 2 | {% for section_type, sub_section in section.value %} 3 | {% if section_type.value == "text" %} 4 | {{ sub_section|convert_markdown(heading_level, html_id) }} 5 | {% elif section_type.value == "examples" %} 6 | {{ sub_section|convert_markdown(heading_level, html_id) }} 7 | {% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /docs/data/json.md: -------------------------------------------------------------------------------- 1 | # JSON 2 | 3 | ??? abstract "TLDR" 4 | 5 | ```{ .python .no-check } 6 | import edsnlp 7 | 8 | stream = edsnlp.data.read_json(path, converter="omop") 9 | stream = stream.map_pipeline(nlp) 10 | res = stream.to_json(path, converter="omop") 11 | # or equivalently 12 | edsnlp.data.to_json(stream, path, converter="omop") 13 | ``` 14 | 15 | We provide methods to read and write documents (raw or annotated) from and to json files. 16 | 17 | As an example, imagine that we have the following document that uses the OMOP schema 18 | 19 | ```{ title="data.jsonl" } 20 | { "note_id": 0, "note_text": "Le patient ...", "note_datetime": "2021-10-23", "entities": [...] 
} 21 | { "note_id": 1, "note_text": "Autre doc ...", "note_datetime": "2022-12-24", "entities": [] } 22 | ... 23 | ``` 24 | 25 | You could also have multiple `.json` files in a directory, the reader will read them all. 26 | 27 | ## Reading JSON files {: #edsnlp.data.json.read_json } 28 | 29 | ::: edsnlp.data.json.read_json 30 | options: 31 | heading_level: 3 32 | show_source: false 33 | show_toc: false 34 | show_bases: false 35 | 36 | ## Writing JSON files {: #edsnlp.data.json.write_json } 37 | 38 | ::: edsnlp.data.json.write_json 39 | options: 40 | heading_level: 3 41 | show_source: false 42 | show_toc: false 43 | show_bases: false 44 | -------------------------------------------------------------------------------- /docs/data/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/docs/data/overview.png -------------------------------------------------------------------------------- /docs/data/polars.md: -------------------------------------------------------------------------------- 1 | # Polars 2 | 3 | ??? abstract "TLDR" 4 | 5 | ```{ .python .no-check } 6 | import edsnlp 7 | 8 | stream = edsnlp.data.from_polars(df, converter="omop") 9 | stream = stream.map_pipeline(nlp) 10 | res = stream.to_polars(converter="omop") 11 | # or equivalently 12 | edsnlp.data.to_polars(stream, converter="omop") 13 | ``` 14 | 15 | We provide methods to read and write documents (raw or annotated) from and to Polars DataFrames. 16 | 17 | As an example, imagine that we have the following OMOP dataframe (we'll name it `note_df`) 18 | 19 | | note_id | note_text | note_datetime | 20 | |--------:|:----------------------------------------------|:--------------| 21 | | 0 | Le patient est admis pour une pneumopathie... 
| 2021-10-23 | 22 | 23 | ## Reading from a Polars Dataframe {: #edsnlp.data.polars.from_polars } 24 | 25 | ::: edsnlp.data.polars.from_polars 26 | options: 27 | heading_level: 3 28 | show_source: false 29 | show_toc: false 30 | show_bases: false 31 | 32 | 33 | ## Writing to a Polars DataFrame {: #edsnlp.data.polars.to_polars } 34 | 35 | ::: edsnlp.data.polars.to_polars 36 | options: 37 | heading_level: 3 38 | show_source: false 39 | show_toc: false 40 | show_bases: false 41 | -------------------------------------------------------------------------------- /docs/data/standoff.md: -------------------------------------------------------------------------------- 1 | # BRAT and Standoff 2 | 3 | ??? abstract "TLDR" 4 | 5 | ```{ .python .no-check } 6 | import edsnlp 7 | 8 | stream = edsnlp.data.read_standoff(path) 9 | stream = stream.map_pipeline(nlp) 10 | res = stream.write_standoff(path) 11 | # or equivalently 12 | edsnlp.data.write_standoff(stream, path) 13 | ``` 14 | 15 | You can easily integrate [BRAT](https://brat.nlplab.org/) into your project by using EDS-NLP's BRAT reader and writer. 16 | 17 | BRAT annotations are in the [standoff format](https://brat.nlplab.org/standoff.html). Consider the following document: 18 | 19 | ```{ title="doc.txt" } 20 | Le patient est admis pour une pneumopathie au coronavirus. 21 | On lui prescrit du paracétamol. 
22 | ``` 23 | 24 | Brat annotations are stored in a separate file formatted as follows: 25 | 26 | ```{ title="doc.ann" } 27 | T1 Patient 4 11 patient 28 | T2 Disease 31 58 pneumopathie au coronavirus 29 | T3 Drug 79 90 paracétamol 30 | ``` 31 | 32 | ## Reading Standoff files {: #edsnlp.data.standoff.read_standoff } 33 | 34 | ::: edsnlp.data.standoff.read_standoff 35 | options: 36 | heading_level: 3 37 | show_source: false 38 | show_toc: false 39 | show_bases: false 40 | 41 | ## Writing Standoff files {: #edsnlp.data.standoff.write_standoff } 42 | 43 | ::: edsnlp.data.standoff.write_standoff 44 | options: 45 | heading_level: 3 46 | show_source: false 47 | show_toc: false 48 | show_bases: false 49 | -------------------------------------------------------------------------------- /docs/pipes/core/endlines.md: -------------------------------------------------------------------------------- 1 | # Endlines {: #edsnlp.pipes.core.endlines.factory.create_component } 2 | 3 | ::: edsnlp.pipes.core.endlines.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/core/index.md: -------------------------------------------------------------------------------- 1 | # Core Components 2 | 3 | This section deals with "core" functionalities offered by EDS-NLP: 4 | 5 | - Generic matchers against regular expressions and list of terms 6 | - Text cleaning 7 | - Sentence boundaries detection 8 | 9 | ## Available components 10 | 11 | 12 | 13 | | Component | Description | 14 | |-------------------------|-------------------------------------------------| 15 | | `eds.normalizer` | Non-destructive input text normalisation | 16 | | `eds.sentences` | Better sentence boundary detection | 17 | | `eds.matcher` | A simple yet powerful entity extractor | 18 | | `eds.terminology` | A simple yet powerful terminology matcher | 19 | | 
`eds.contextual_matcher` | A conditional entity extractor | 20 | | `eds.endlines` | An unsupervised model to classify each end line | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/pipes/core/matcher.md: -------------------------------------------------------------------------------- 1 | # Matcher {: #edsnlp.pipes.core.matcher.factory.create_component } 2 | 3 | ::: edsnlp.pipes.core.matcher.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/core/sentences.md: -------------------------------------------------------------------------------- 1 | # Sentences {: #edsnlp.pipes.core.sentences.factory.create_component } 2 | 3 | ::: edsnlp.pipes.core.sentences.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/core/terminology.md: -------------------------------------------------------------------------------- 1 | # Terminology {: #edsnlp.pipes.core.terminology.factory.create_component } 2 | 3 | ::: edsnlp.pipes.core.terminology.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/misc/consultation-dates.md: -------------------------------------------------------------------------------- 1 | # Consultation dates {: #edsnlp.pipes.misc.consultation_dates.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.consultation_dates.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | 
-------------------------------------------------------------------------------- /docs/pipes/misc/dates.md: -------------------------------------------------------------------------------- 1 | # Dates {: #edsnlp.pipes.misc.dates.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.dates.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/misc/index.md: -------------------------------------------------------------------------------- 1 | # Miscellaneous 2 | 3 | This section regroups components that extract information that can be used by other components, but have little medical value in itself. 4 | 5 | For instance, the date detection and normalisation pipeline falls in this category. 6 | 7 | ## Available components 8 | 9 | 10 | 11 | | Component | Description | 12 | |--------------------------|---------------------------------------------| 13 | | `eds.dates` | Date extraction and normalisation | 14 | | `eds.consultation_dates` | Identify consultation dates | 15 | | `eds.quantities` | Quantity extraction and normalisation | 16 | | `eds.sections` | Section detection | 17 | | `eds.reason` | Rule-based hospitalisation reason detection | 18 | | `eds.tables` | Tables detection | 19 | | `eds.split` | Doc splitting | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/pipes/misc/quantities.md: -------------------------------------------------------------------------------- 1 | # Quantities {: #edsnlp.pipes.misc.quantities.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.quantities.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/misc/reason.md: 
-------------------------------------------------------------------------------- 1 | # Reasons {: #edsnlp.pipes.misc.reason.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.reason.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/misc/sections.md: -------------------------------------------------------------------------------- 1 | # Sections {: #edsnlp.pipes.misc.sections.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.sections.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/misc/split.md: -------------------------------------------------------------------------------- 1 | # Split {: #edsnlp.pipes.misc.split.split.Split } 2 | 3 | ::: edsnlp.pipes.misc.split.split.Split 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | skip_parameters: ["nlp", "name"] 10 | -------------------------------------------------------------------------------- /docs/pipes/misc/tables.md: -------------------------------------------------------------------------------- 1 | # Tables {: #edsnlp.pipes.misc.tables.factory.create_component } 2 | 3 | ::: edsnlp.pipes.misc.tables.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/adicap.md: -------------------------------------------------------------------------------- 1 | # Adicap {: #edsnlp.pipes.ner.adicap.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.adicap.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: 
false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/behaviors/alcohol.md: -------------------------------------------------------------------------------- 1 | # Alcohol consumption {: #edsnlp.pipes.ner.behaviors.alcohol.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.behaviors.alcohol.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/ner/behaviors/tobacco.md: -------------------------------------------------------------------------------- 1 | # Tobacco consumption {: #edsnlp.pipes.ner.behaviors.tobacco.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.behaviors.tobacco.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/ner/cim10.md: -------------------------------------------------------------------------------- 1 | # CIM10 {: #edsnlp.pipes.ner.cim10.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.cim10.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/covid.md: -------------------------------------------------------------------------------- 1 | # COVID {: #edsnlp.pipes.ner.covid.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.covid.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/aids.md: -------------------------------------------------------------------------------- 
1 | # AIDS {: #edsnlp.pipes.ner.disorders.aids.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.aids.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/cerebrovascular-accident.md: -------------------------------------------------------------------------------- 1 | # Cerebrovascular accident {: #edsnlp.pipes.ner.disorders.cerebrovascular_accident.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.cerebrovascular_accident.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/ckd.md: -------------------------------------------------------------------------------- 1 | # CKD {: #edsnlp.pipes.ner.disorders.ckd.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.ckd.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/congestive-heart-failure.md: -------------------------------------------------------------------------------- 1 | # Congestive heart failure {: #edsnlp.pipes.ner.disorders.congestive_heart_failure.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.congestive_heart_failure.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/connective-tissue-disease.md: -------------------------------------------------------------------------------- 1 | # Connective 
tissue disease {: #edsnlp.pipes.ner.disorders.connective_tissue_disease.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.connective_tissue_disease.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/copd.md: -------------------------------------------------------------------------------- 1 | # COPD {: #edsnlp.pipes.ner.disorders.copd.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.copd.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/dementia.md: -------------------------------------------------------------------------------- 1 | # Dementia {: #edsnlp.pipes.ner.disorders.dementia.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.dementia.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/diabetes.md: -------------------------------------------------------------------------------- 1 | # Diabetes {: #edsnlp.pipes.ner.disorders.diabetes.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.diabetes.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/hemiplegia.md: -------------------------------------------------------------------------------- 1 | # Hemiplegia {: #edsnlp.pipes.ner.disorders.hemiplegia.factory.create_component } 2 | 3 | ::: 
edsnlp.pipes.ner.disorders.hemiplegia.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/leukemia.md: -------------------------------------------------------------------------------- 1 | # Leukemia {: #edsnlp.pipes.ner.disorders.leukemia.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.leukemia.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/liver-disease.md: -------------------------------------------------------------------------------- 1 | # Liver disease {: #edsnlp.pipes.ner.disorders.liver_disease.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.liver_disease.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/lymphoma.md: -------------------------------------------------------------------------------- 1 | # Lymphoma {: #edsnlp.pipes.ner.disorders.lymphoma.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.lymphoma.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/myocardial-infarction.md: -------------------------------------------------------------------------------- 1 | # Myocardial infarction {: #edsnlp.pipes.ner.disorders.myocardial_infarction.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.myocardial_infarction.factory.create_component 
4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/peptic-ulcer-disease.md: -------------------------------------------------------------------------------- 1 | # Peptic ulcer disease {: #edsnlp.pipes.ner.disorders.peptic_ulcer_disease.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.peptic_ulcer_disease.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/peripheral-vascular-disease.md: -------------------------------------------------------------------------------- 1 | # Peripheral vascular disease {: #edsnlp.pipes.ner.disorders.peripheral_vascular_disease.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.peripheral_vascular_disease.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/disorders/solid-tumor.md: -------------------------------------------------------------------------------- 1 | # Solid tumor {: #edsnlp.pipes.ner.disorders.solid_tumor.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.disorders.solid_tumor.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/drugs.md: -------------------------------------------------------------------------------- 1 | # Drugs {: #edsnlp.pipes.ner.drugs.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.drugs.factory.create_component 4 | options: 5 | heading_level: 2 6 | 
show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/scores/charlson.md: -------------------------------------------------------------------------------- 1 | # Charlson {: #edsnlp.pipes.ner.scores.charlson.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.charlson.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/scores/elston-ellis.md: -------------------------------------------------------------------------------- 1 | # Elston-Ellis {: #edsnlp.pipes.ner.scores.elston_ellis.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.elston_ellis.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/scores/emergency-ccmu.md: -------------------------------------------------------------------------------- 1 | # Emergency CCMU {: #edsnlp.pipes.ner.scores.emergency.ccmu.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.emergency.ccmu.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/scores/emergency-gemsa.md: -------------------------------------------------------------------------------- 1 | # Emergency GEMSA {: #edsnlp.pipes.ner.scores.emergency.gemsa.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.emergency.gemsa.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | 
-------------------------------------------------------------------------------- /docs/pipes/ner/scores/emergency-priority.md: -------------------------------------------------------------------------------- 1 | # Emergency Priority {: #edsnlp.pipes.ner.scores.emergency.priority.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.emergency.priority.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/scores/sofa.md: -------------------------------------------------------------------------------- 1 | # SOFA {: #edsnlp.pipes.ner.scores.sofa.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.scores.sofa.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/suicide_attempt.md: -------------------------------------------------------------------------------- 1 | # Suicide Attempt {: #edsnlp.pipes.ner.suicide_attempt.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.suicide_attempt.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: true 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/tnm.md: -------------------------------------------------------------------------------- 1 | # TNM {: #edsnlp.pipes.ner.tnm.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.tnm.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/ner/umls.md: 
-------------------------------------------------------------------------------- 1 | # UMLS {: #edsnlp.pipes.ner.umls.factory.create_component } 2 | 3 | ::: edsnlp.pipes.ner.umls.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/qualifiers/family.md: -------------------------------------------------------------------------------- 1 | # Family Context {: #edsnlp.pipes.qualifiers.family.factory.create_component } 2 | 3 | ::: edsnlp.pipes.qualifiers.family.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/qualifiers/history.md: -------------------------------------------------------------------------------- 1 | # Medical History {: #edsnlp.pipes.qualifiers.history.factory.create_component } 2 | 3 | ::: edsnlp.pipes.qualifiers.history.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/qualifiers/hypothesis.md: -------------------------------------------------------------------------------- 1 | # Hypothesis {: #edsnlp.pipes.qualifiers.hypothesis.factory.create_component } 2 | 3 | ::: edsnlp.pipes.qualifiers.hypothesis.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/qualifiers/negation.md: -------------------------------------------------------------------------------- 1 | # Negation {: #edsnlp.pipes.qualifiers.negation.factory.create_component } 2 | 3 | ::: 
edsnlp.pipes.qualifiers.negation.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/qualifiers/reported-speech.md: -------------------------------------------------------------------------------- 1 | # Reported Speech {: #edsnlp.pipes.qualifiers.reported_speech.factory.create_component } 2 | 3 | ::: edsnlp.pipes.qualifiers.reported_speech.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/biaffine-dependency-parser.md: -------------------------------------------------------------------------------- 1 | # Trainable Biaffine Dependency Parser {: #edsnlp.pipes.trainable.biaffine_dep_parser.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.biaffine_dep_parser.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/embeddings/span_pooler.md: -------------------------------------------------------------------------------- 1 | # Span Pooler {: #edsnlp.pipes.trainable.embeddings.span_pooler.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.embeddings.span_pooler.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/embeddings/text_cnn.md: -------------------------------------------------------------------------------- 1 | # Text CNN {: #edsnlp.pipes.trainable.embeddings.text_cnn.factory.create_component } 2 | 3 | ::: 
edsnlp.pipes.trainable.embeddings.text_cnn.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/embeddings/transformer.md: -------------------------------------------------------------------------------- 1 | # Transformer {: #edsnlp.pipes.trainable.embeddings.transformer.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.embeddings.transformer.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/extractive-qa.md: -------------------------------------------------------------------------------- 1 | # Extractive Question Answering {: #edsnlp.pipes.trainable.extractive_qa.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.extractive_qa.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/ner.md: -------------------------------------------------------------------------------- 1 | # Trainable NER {: #edsnlp.pipes.trainable.ner_crf.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.ner_crf.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/span-classifier.md: -------------------------------------------------------------------------------- 1 | # Trainable Span Classifier {: #edsnlp.pipes.trainable.span_classifier.factory.create_component } 2 | 3 | ::: 
edsnlp.pipes.trainable.span_classifier.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/pipes/trainable/span-linker.md: -------------------------------------------------------------------------------- 1 | # Trainable Span Linker {: #edsnlp.pipes.trainable.span_linker.factory.create_component } 2 | 3 | ::: edsnlp.pipes.trainable.span_linker.factory.create_component 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | only_class_level: true 9 | -------------------------------------------------------------------------------- /docs/scripts/autorefs/LICENSE: -------------------------------------------------------------------------------- 1 | ISC License 2 | 3 | Copyright (c) 2019, Oleh Prypin 4 | Copyright (c) 2019, Timothée Mazzucotelli 5 | 6 | Permission to use, copy, modify, and/or distribute this software for any 7 | purpose with or without fee is hereby granted, provided that the above 8 | copyright notice and this permission notice appear in all copies. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 | -------------------------------------------------------------------------------- /docs/utilities/connectors/brat.md: -------------------------------------------------------------------------------- 1 | # BRAT Connector 2 | 3 | BRAT is currently the only supported in-text annotation editor at EDS. 
BRAT annotations are in the [standoff format](https://brat.nlplab.org/standoff.html). Consider the following document: 4 | 5 | ``` 6 | Le patient est admis pour une pneumopathie au coronavirus. 7 | On lui prescrit du paracétamol. 8 | ``` 9 | 10 | It could be annotated as follows : 11 | 12 | ``` 13 | T1 Patient 4 11 patient 14 | T2 Disease 31 58 pneumopathie au coronavirus 15 | T3 Drug 79 90 paracétamol 16 | ``` 17 | 18 | The point of the BRAT connector is to go from the standoff annotation format to an annotated spaCy document : 19 | 20 | ```{ .python .no-check } 21 | import edsnlp 22 | from edsnlp.connectors.brat import BratConnector 23 | 24 | # Instantiate the connector 25 | brat = BratConnector("path/to/brat") 26 | 27 | # Instantiate the spacy pipeline 28 | nlp = edsnlp.blank("eds") 29 | 30 | # Convert all BRAT files to a list of documents 31 | docs = brat.brat2docs(nlp) 32 | doc = docs[0] 33 | 34 | doc.ents 35 | # Out: [patient, pneumopathie au coronavirus, paracétamol] 36 | 37 | doc.ents[0].label_ 38 | # Out: Patient 39 | ``` 40 | 41 | The connector can also go the other way around, enabling pre-annotations and an ersatz of active learning. 42 | -------------------------------------------------------------------------------- /docs/utilities/connectors/labeltool.md: -------------------------------------------------------------------------------- 1 | # LabelTool Connector 2 | 3 | LabelTool is an in-house module enabling rapid annotation of pre-extracted entities. 4 | 5 | We provide a ready-to-use function that converts a list of annotated spaCy documents into a `pandas` DataFrame that is readable to LabelTool. 
6 | 7 | ```python 8 | import edsnlp, edsnlp.pipes as eds 9 | 10 | from edsnlp.connectors.labeltool import docs2labeltool 11 | 12 | corpus = [ 13 | "Ceci est un document médical.", 14 | "Le patient n'est pas malade.", 15 | ] 16 | 17 | # Instantiate the spacy pipeline 18 | nlp = edsnlp.blank("fr") 19 | nlp.add_pipe(eds.sentences()) 20 | nlp.add_pipe(eds.matcher(terms=dict(medical="médical", malade="malade"))) 21 | nlp.add_pipe(eds.negation()) 22 | 23 | # Convert all BRAT files to a list of documents 24 | docs = nlp.pipe(corpus) 25 | 26 | df = docs2labeltool(docs, extensions=["negation"]) 27 | ``` 28 | 29 | The results: 30 | 31 | | note_id | note_text | start | end | label | lexical_variant | negation | 32 | | ------- | ----------------------------- | ----- | --- | ------- | --------------- | -------- | 33 | | 0 | Ceci est un document médical. | 21 | 28 | medical | médical | False | 34 | | 1 | Le patient n'est pas malade. | 21 | 27 | malade | malade | True | 35 | -------------------------------------------------------------------------------- /docs/utilities/connectors/overview.md: -------------------------------------------------------------------------------- 1 | # Overview of connectors 2 | 3 | EDS-NLP provides a series of connectors apt to convert back and forth from different formats into spaCy representation. 4 | 5 | We provide the following connectors: 6 | 7 | - [BRAT](./brat.md) 8 | - [OMOP](./omop.md) 9 | 10 | -------------------------------------------------------------------------------- /docs/utilities/evaluation.md: -------------------------------------------------------------------------------- 1 | # Pipeline evaluation 2 | -------------------------------------------------------------------------------- /docs/utilities/index.md: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | EDS-NLP provides a few utilities to deploy pipelines, process RegExps, etc. 
4 | -------------------------------------------------------------------------------- /docs/utilities/regex.md: -------------------------------------------------------------------------------- 1 | # Work with RegExp 2 | -------------------------------------------------------------------------------- /docs/utilities/tests/blocs.md: -------------------------------------------------------------------------------- 1 | # Testing Code Blocs 2 | 3 | We created a utility that scans through the documentation, extracts code blocs and executes them to check that everything is indeed functional. 4 | 5 | There is more! Whenever the utility comes across an example (denoted by `# Out: `, see example below), an `assert` statement is dynamically added to the snippet to check that the output matches. 6 | 7 | For instance: 8 | 9 | ```python 10 | a = 1 11 | 12 | a 13 | # Out: 1 14 | ``` 15 | 16 | Is transformed into: 17 | 18 | ```python 19 | a = 1 20 | 21 | v = a 22 | assert repr(v) == "1" 23 | ``` 24 | 25 | We can disable code checking for a specific code bloc by adding a `.no-check` class to the code bloc: 26 | 27 | ````md 28 | ```python { .no-check } 29 | test = undeclared_function(42) 30 | ``` 31 | ```` 32 | 33 | Visit the source code of [test_docs.py](https://github.com/aphp/edsnlp/blob/master/tests/test_docs.py) for more information. 34 | -------------------------------------------------------------------------------- /docs/utilities/tests/examples.md: -------------------------------------------------------------------------------- 1 | # Creating Examples 2 | 3 | Testing a NER/qualifier pipeline can be a hassle. We created a utility to simplify that process. 4 | 5 | Using the [`parse_example`][edsnlp.utils.examples.parse_example] method, you can define a full example in a human-readable way: 6 | 7 | ```python 8 | from edsnlp.utils.examples import parse_example 9 | 10 | example = "Absence d'image osseuse d'allure évolutive." 
11 | 12 | text, entities = parse_example(example) 13 | 14 | text 15 | # Out: "Absence d'image osseuse d'allure évolutive." 16 | 17 | entities 18 | # Out: [Entity(start_char=10, end_char=42, modifiers=[Modifier(key='negated', value=True)])] 19 | ``` 20 | 21 | Entities are defined using the `` tag. You can encode complexe information by adding keys into the tag (see example above). The `parse_example` method strips the text of the tags, and outputs a list of `Entity` objects that contain: 22 | 23 | - the character indices of the entity ; 24 | - custom user-defined "modifiers". 25 | 26 | See the [dedicated reference page][edsnlp.utils.examples.parse_example] for more information. 27 | -------------------------------------------------------------------------------- /docs/utilities/tests/index.md: -------------------------------------------------------------------------------- 1 | # Tests Utilities 2 | 3 | We provide a few testing utilities that simplify the process of: 4 | 5 | - creating testing examples for NLP pipelines; 6 | - testing documentation code blocs. 
7 | -------------------------------------------------------------------------------- /edsnlp/connectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .brat import BratConnector 2 | from .omop import OmopConnector 3 | -------------------------------------------------------------------------------- /edsnlp/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .registries import registry 2 | from .pipeline import PipelineProtocol 3 | -------------------------------------------------------------------------------- /edsnlp/data/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | from edsnlp.utils.lazy_module import lazify 3 | 4 | lazify() 5 | 6 | if TYPE_CHECKING: 7 | from .base import from_iterable, to_iterable 8 | from .standoff import read_standoff, write_standoff 9 | from .brat import read_brat, write_brat 10 | from .conll import read_conll 11 | from .json import read_json, write_json 12 | from .parquet import read_parquet, write_parquet 13 | from .spark import from_spark, to_spark 14 | from .pandas import from_pandas, to_pandas 15 | from .polars import from_polars, to_polars 16 | from .converters import get_dict2doc_converter, get_doc2dict_converter 17 | -------------------------------------------------------------------------------- /edsnlp/data/brat.py: -------------------------------------------------------------------------------- 1 | from edsnlp.data.standoff import ( 2 | dump_standoff_file, 3 | parse_standoff_file, 4 | read_standoff, 5 | write_standoff, 6 | ) 7 | 8 | load_from_brat = parse_standoff_file 9 | export_to_brat = dump_standoff_file 10 | 11 | read_brat = read_standoff 12 | write_brat = write_standoff 13 | -------------------------------------------------------------------------------- /edsnlp/extensions.py: 
-------------------------------------------------------------------------------- 1 | import warnings 2 | from datetime import date, datetime 3 | 4 | from dateutil.parser import parse as parse_date 5 | from spacy.tokens import Doc 6 | 7 | if not Doc.has_extension("note_id"): 8 | Doc.set_extension("note_id", default=None) 9 | 10 | 11 | def set_note_datetime(doc, dt): 12 | try: 13 | if type(dt) is datetime: 14 | pass 15 | elif isinstance(dt, str): 16 | dt = parse_date(dt) 17 | elif isinstance(dt, (int, float)): 18 | dt = datetime.fromtimestamp(dt) 19 | elif isinstance(dt, date): 20 | dt = datetime(dt.year, dt.month, dt.day) 21 | elif dt is None: 22 | pass 23 | key = doc._._get_key("note_datetime") 24 | doc.doc.user_data[key] = dt 25 | return 26 | except Exception: 27 | pass 28 | 29 | warnings.warn(f"Cannot cast {dt} as a note datetime", UserWarning) 30 | 31 | 32 | def get_note_datetime(doc): 33 | key = doc._._get_key("note_datetime") 34 | return doc.user_data.get(key, None) 35 | 36 | 37 | if not Doc.has_extension("note_datetime"): 38 | Doc.set_extension( 39 | "note_datetime", 40 | getter=get_note_datetime, 41 | setter=set_note_datetime, 42 | ) 43 | 44 | if not Doc.has_extension("birth_datetime"): 45 | Doc.set_extension("birth_datetime", default=None) 46 | -------------------------------------------------------------------------------- /edsnlp/matchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/matchers/__init__.py -------------------------------------------------------------------------------- /edsnlp/matchers/phrase.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | from spacy.matcher.phrasematcher cimport PhraseMatcher 3 | from spacy.structs cimport SpanC 4 | from spacy.tokens.doc cimport Doc 5 | from spacy.tokens.span cimport Span 6 | from 
spacy.typedefs cimport attr_t 7 | 8 | 9 | cdef class EDSPhraseMatcher(PhraseMatcher): 10 | cdef attr_t space_hash 11 | cdef attr_t excluded_hash 12 | 13 | cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil 14 | -------------------------------------------------------------------------------- /edsnlp/matchers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | ListOrStr = Union[List[str], str] 4 | DictOrPattern = Union[str, List[str], Dict[str, Union[str, List[str]]]] 5 | Patterns = Dict[str, DictOrPattern] 6 | 7 | 8 | def normalize_token_attr(attr): 9 | if attr.startswith("doc.") or attr.startswith("span."): 10 | return None 11 | attr = attr.replace("token.", "") 12 | lower = attr.replace("_", "").lower() 13 | return "text" if lower == "orth" else lower 14 | 15 | 16 | ATTRIBUTES = { 17 | "LOWER": "lower_", 18 | "TEXT": "text", 19 | "NORM": "norm_", 20 | "SHAPE": "shape_", 21 | } 22 | 23 | from .offset import alignment # noqa: E402, F401 24 | from .text import get_text # noqa: E402, F401 25 | -------------------------------------------------------------------------------- /edsnlp/matchers/utils/offset.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.doc_to_text import get_char_offsets as alignment # noqa: E402, F401 2 | -------------------------------------------------------------------------------- /edsnlp/matchers/utils/text.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.doc_to_text import get_text # noqa: E402, F401 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/contextual_matcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .contextual_matcher import ContextualMatcher 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/contextual_matcher/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | from edsnlp.pipes.core.contextual_matcher import ContextualMatcher 3 | 4 | DEFAULT_CONFIG = dict( 5 | assign_as_span=False, 6 | alignment_mode="expand", 7 | attr="NORM", 8 | regex_flags=0, 9 | ignore_excluded=False, 10 | ignore_space_tokens=False, 11 | include_assigned=False, 12 | label_name=None, 13 | label=None, 14 | span_setter={"ents": True}, 15 | ) 16 | 17 | create_component = registry.factory.register( 18 | "eds.contextual_matcher", 19 | deprecated=["eds.contextual-matcher", "contextual-matcher"], 20 | )(ContextualMatcher) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/endlines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/endlines/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/endlines/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .endlines import EndLinesMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | model_path=None, 7 | ) 8 | 9 | create_component = registry.factory.register( 10 | "eds.endlines", 11 | assigns=["doc.ents", "doc.spans"], 12 | deprecated=["spaces"], 13 
| )(EndLinesMatcher) 14 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/endlines/functional.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def get_dir_path(file): 8 | path_file = os.path.dirname(os.path.realpath(file)) 9 | return path_file 10 | 11 | 12 | def build_path(file, relative_path): 13 | """ 14 | Function to build an absolut path. 15 | 16 | Parameters 17 | ---------- 18 | file: main file from where we are calling. It could be __file__ 19 | relative_path: str, 20 | relative path from the main file to the desired output 21 | 22 | Returns 23 | ------- 24 | path: absolute path 25 | """ 26 | dir_path = get_dir_path(file) 27 | path = os.path.abspath(os.path.join(dir_path, relative_path)) 28 | return path 29 | 30 | 31 | def _convert_series_to_array(s: pd.Series) -> np.ndarray: 32 | """Converts pandas series of n elements to an array of shape (n,1). 
33 | 34 | Parameters 35 | ---------- 36 | s : pd.Series 37 | 38 | Returns 39 | ------- 40 | np.ndarray 41 | """ 42 | X = s.to_numpy().reshape(-1, 1).astype("O") # .astype(np.int64) 43 | return X 44 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/matcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .matcher import GenericMatcher 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/matcher/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .matcher import GenericMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | terms=None, 7 | regex=None, 8 | attr="TEXT", 9 | ignore_excluded=False, 10 | ignore_space_tokens=False, 11 | term_matcher="exact", 12 | term_matcher_config={}, 13 | span_setter={"ents": True}, 14 | ) 15 | 16 | create_component = registry.factory.register( 17 | "eds.matcher", 18 | assigns=["doc.ents", "doc.spans"], 19 | deprecated=["matcher"], 20 | )(GenericMatcher) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Token 2 | 3 | if not Token.has_extension("excluded"): 4 | Token.set_extension("excluded", default=False) 5 | 6 | 7 | def excluded_or_space_getter(t): 8 | return t.is_space or t.tag_ == "EXCLUDED" 9 | 10 | 11 | if not Token.has_extension("excluded_or_space"): 12 | Token.set_extension( 13 | "excluded_or_space", 14 | getter=excluded_or_space_getter, 15 | ) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/accents/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/normalizer/accents/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/accents/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from . import patterns 4 | from .accents import AccentsConverter 5 | 6 | DEFAULT_CONFIG = dict( 7 | accents=patterns.accents, 8 | ) 9 | 10 | create_component = registry.factory.register( 11 | "eds.accents", 12 | assigns=["token.norm"], 13 | deprecated=["accents"], 14 | )(AccentsConverter) 15 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/accents/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | # Accentuated characters 4 | accents: List[Tuple[str, str]] = [ 5 | ("ç", "c"), 6 | ("àáâä", "a"), 7 | ("èéêë", "e"), 8 | ("ìíîï", "i"), 9 | ("òóôö", "o"), 10 | ("ùúûü", "u"), 11 | ] 12 | # Add uppercase 13 | accents += [(k.upper(), v.upper()) for k, v in accents] 14 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/pollution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/normalizer/pollution/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/pollution/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .patterns import default_enabled 4 | from .pollution import PollutionTagger 5 | 6 | DEFAULT_CONFIG = dict( 7 | pollution=default_enabled, 8 | ) 9 | 10 | 
create_component = registry.factory.register( 11 | "eds.pollution", 12 | assigns=["doc.spans"], 13 | deprecated=["pollution"], 14 | )(PollutionTagger) 15 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/quotes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/normalizer/quotes/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/quotes/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .patterns import quotes_and_apostrophes 4 | from .quotes import QuotesConverter 5 | 6 | DEFAULT_CONFIG = dict( 7 | quotes=quotes_and_apostrophes, 8 | ) 9 | 10 | create_component = registry.factory.register( 11 | "eds.quotes", 12 | assigns=["token.norm"], 13 | deprecated=["quotes"], 14 | )(QuotesConverter) 15 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/quotes/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | # Source : https://util.unicode.org/UnicodeJsps/character.jsp?a=02EE 4 | quotes: List[str] = [ 5 | """, 6 | "〃", 7 | "ײ", 8 | "᳓", 9 | "″", 10 | "״", 11 | "‶", 12 | "˶", 13 | "ʺ", 14 | "“", 15 | "”", 16 | "˝", 17 | "‟", 18 | ] 19 | 20 | # Source : https://util.unicode.org/UnicodeJsps/character.jsp?a=0027 21 | apostrophes: List[str] = [ 22 | "`", 23 | "΄", 24 | "'", 25 | "ˈ", 26 | "ˊ", 27 | "ᑊ", 28 | "ˋ", 29 | "ꞌ", 30 | "ᛌ", 31 | "𖽒", 32 | "𖽑", 33 | "‘", 34 | "’", 35 | "י", 36 | "՚", 37 | "‛", 38 | "՝", 39 | "`", 40 | "`", 41 | "′", 42 | "׳", 43 | "´", 44 | "ʹ", 45 | "˴", 46 | "ߴ", 47 | "‵", 48 | "ߵ", 49 | "ʹ", 50 | "ʻ", 51 | "ʼ", 52 | "´", 53 | "᾽", 54 | "ʽ", 
55 | "῾", 56 | "ʾ", 57 | "᾿", 58 | ] 59 | 60 | quotes_and_apostrophes: List[Tuple[str, str]] = [ 61 | ("".join(quotes), '"'), 62 | ("".join(apostrophes), "'"), 63 | ] 64 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/remove_lowercase/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import create_component 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/remove_lowercase/factory.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Doc 2 | 3 | from edsnlp.core import PipelineProtocol, registry 4 | 5 | 6 | def remove_lowercase(doc: Doc): 7 | """ 8 | Add case on the `NORM` custom attribute. Should always be applied first. 9 | 10 | Parameters 11 | ---------- 12 | doc : Doc 13 | The spaCy `Doc` object. 14 | 15 | Returns 16 | ------- 17 | Doc 18 | The document, with case put back in `NORM`. 19 | """ 20 | 21 | for token in doc: 22 | token.norm_ = token.text 23 | 24 | return doc 25 | 26 | 27 | @registry.factory.register( 28 | "eds.remove_lowercase", 29 | assigns=["token.norm"], 30 | deprecated=[ 31 | "remove-lowercase", 32 | "eds.remove-lowercase", 33 | ], 34 | ) 35 | def create_component( 36 | nlp: PipelineProtocol, 37 | name: str, 38 | ): 39 | """ 40 | Add case on the `NORM` custom attribute. Should always be applied first. 41 | 42 | Parameters 43 | ---------- 44 | nlp : PipelineProtocol 45 | The pipeline object. 46 | name : str 47 | The name of the component. 
48 | """ 49 | return remove_lowercase # pragma: no cover 50 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/spaces/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/normalizer/spaces/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/spaces/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .spaces import SpacesTagger 4 | 5 | DEFAULT_CONFIG = dict(newline=True) 6 | 7 | create_component = registry.factory.register( 8 | "eds.spaces", 9 | assigns=["token.tag"], 10 | deprecated=["spaces"], 11 | )(SpacesTagger) 12 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/normalizer/spaces/spaces.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from spacy.tokens import Doc 4 | 5 | from edsnlp.core import PipelineProtocol 6 | from edsnlp.pipes.base import BaseComponent 7 | 8 | 9 | class SpacesTagger(BaseComponent): 10 | """ 11 | We assign "SPACE" to `token.tag` to be used by optimized components 12 | such as the EDSPhraseMatcher 13 | 14 | Parameters 15 | ---------- 16 | nlp : Optional[PipelineProtocol] 17 | The pipeline object. 18 | name : Optional[str] 19 | The component name. 20 | newline : bool 21 | Whether to update the newline tokens too 22 | """ 23 | 24 | def __init__( 25 | self, 26 | nlp: Optional[PipelineProtocol] = None, 27 | name: Optional[str] = "spaces", 28 | *, 29 | newline: bool = True, 30 | ): 31 | super().__init__(nlp, name) 32 | self.newline = newline 33 | 34 | def __call__(self, doc: Doc) -> Doc: 35 | """ 36 | Apply the component to the doc. 
37 | 38 | Parameters 39 | ---------- 40 | doc: Doc 41 | 42 | Returns 43 | ------- 44 | doc: Doc 45 | """ 46 | space_hash = doc.vocab.strings["SPACE"] 47 | for token in doc: 48 | if len(token.text.strip()) == 0: 49 | token.tag = space_hash 50 | 51 | return doc 52 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/sentences/__init__.py: -------------------------------------------------------------------------------- 1 | from .sentences import SentenceSegmenter 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/sentences/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp import registry 2 | 3 | from .sentences import SentenceSegmenter 4 | 5 | create_component = registry.factory.register( 6 | "eds.sentences", 7 | assigns=["token.is_sent_start"], 8 | deprecated=["sentences"], 9 | )(SentenceSegmenter) 10 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/sentences/fast_sentences.pxd: -------------------------------------------------------------------------------- 1 | from libcpp cimport bool 2 | from libcpp.set cimport set 3 | from spacy.tokens.doc cimport Doc 4 | from spacy.typedefs cimport attr_t 5 | 6 | cdef class SentenceSegmenter(object): 7 | cdef str name 8 | 9 | cdef class FastSentenceSegmenter(object): 10 | cdef bool ignore_excluded 11 | cdef attr_t newline_hash 12 | cdef attr_t excluded_hash 13 | cdef attr_t endline_hash 14 | cdef set[attr_t] punct_chars_hash 15 | cdef set[attr_t] capitalized_shapes_hash 16 | cdef bool check_capitalized 17 | cdef int min_newline_count 18 | 19 | cdef void process(self, Doc doc) nogil 20 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/sentences/terms.py: -------------------------------------------------------------------------------- 1 | # Default punctuation defined 
for the sentencizer : https://spacy.io/api/sentencizer 2 | punctuation = { 3 | "!", 4 | ".", 5 | "?", 6 | "܂", 7 | "‼", 8 | "‽", 9 | "⁇", 10 | "⁈", 11 | "⁉", 12 | "﹖", 13 | "﹗", 14 | "!", 15 | ".", 16 | "?", 17 | } 18 | -------------------------------------------------------------------------------- /edsnlp/pipes/core/terminology/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/core/terminology/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/core/terminology/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .terminology import TerminologyMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | terms=None, 7 | regex=None, 8 | attr="TEXT", 9 | ignore_excluded=False, 10 | ignore_space_tokens=False, 11 | term_matcher="exact", 12 | term_matcher_config=None, 13 | span_setter={"ents": True}, 14 | ) 15 | 16 | create_component = registry.factory.register( 17 | "eds.terminology", 18 | assigns=["doc.ents", "doc.spans"], 19 | deprecated=["terminology"], 20 | )(TerminologyMatcher) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/misc/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/misc/consultation_dates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/misc/consultation_dates/__init__.py 
-------------------------------------------------------------------------------- /edsnlp/pipes/misc/consultation_dates/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .consultation_dates import ConsultationDatesMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | consultation_mention=True, 7 | town_mention=False, 8 | document_date_mention=False, 9 | attr="NORM", 10 | ignore_excluded=False, 11 | ignore_spacy_tokens=False, 12 | label="consultation_date", 13 | span_setter={"ents": True, "consultation_dates": True}, 14 | ) 15 | 16 | create_component = registry.factory.register( 17 | "eds.consultation_dates", 18 | assigns=["doc.spans", "doc.ents"], 19 | deprecated=["consultation_dates"], 20 | )(ConsultationDatesMatcher) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/consultation_dates/patterns.py: -------------------------------------------------------------------------------- 1 | consultation_mention = [ 2 | "rendez-vous pris", 3 | r"consultation", 4 | r"consultation.{1,8}examen", 5 | r"\bcs\b", 6 | "examen clinique", 7 | r"de compte rendu", 8 | r"date de l'examen", 9 | r"examen realise le", 10 | "date de la visite", 11 | ] 12 | 13 | town_mention = [ 14 | "paris", 15 | "kremlin.bicetre", 16 | "creteil", 17 | "boulogne.billancourt", 18 | "villejuif", 19 | "clamart", 20 | "bobigny", 21 | "clichy", 22 | "ivry.sur.seine", 23 | "issy.les.moulineaux", 24 | "draveil", 25 | "limeil", 26 | "champcueil", 27 | "roche.guyon", 28 | "bondy", 29 | "colombes", 30 | "hendaye", 31 | "berck.sur.mer", 32 | "labruyere", 33 | "garches", 34 | "sevran", 35 | "hyeres", 36 | ] 37 | 38 | document_date_mention = [ 39 | "imprime le", 40 | r"signe electroniquement", 41 | "signe le", 42 | "saisi le", 43 | "dicte le", 44 | "tape le", 45 | "date de reference", 46 | r"date\s*:", 47 | "dactylographie le", 48 | "date du rapport", 49 | ] 50 | 
-------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/__init__.py: -------------------------------------------------------------------------------- 1 | from .dates import DatesMatcher 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .dates import DatesMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | absolute=None, 7 | relative=None, 8 | duration=None, 9 | false_positive=None, 10 | on_ents_only=False, 11 | span_getter=None, 12 | merge_mode="intersect", 13 | detect_periods=False, 14 | detect_time=True, 15 | period_proximity_threshold=3, 16 | as_ents=False, 17 | attr="LOWER", 18 | date_label="date", 19 | duration_label="duration", 20 | period_label="period", 21 | span_setter={ 22 | "dates": ["date"], 23 | "durations": ["duration"], 24 | "periods": ["period"], 25 | }, 26 | ) 27 | 28 | create_component = registry.factory.register( 29 | "eds.dates", 30 | assigns=["doc.spans", "doc.ents"], 31 | deprecated=["dates"], 32 | )(DatesMatcher) 33 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/__init__.py: -------------------------------------------------------------------------------- 1 | from .absolute import absolute_pattern, absolute_pattern_with_time 2 | from .current import current_pattern 3 | from .duration import duration_pattern 4 | from .false_positive import false_positive_pattern 5 | from .relative import relative_pattern 6 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/misc/dates/patterns/atomic/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/delimiters.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | raw_delimiters = [r"\/", r"[-−]"] 4 | delimiters = raw_delimiters + [r"\.", r"[^\S]+"] 5 | 6 | raw_delimiter_pattern = make_pattern(raw_delimiters) 7 | raw_delimiter_with_spaces_pattern = make_pattern(raw_delimiters + [r"[^\S]+"]) 8 | delimiter_pattern = make_pattern(delimiters) 9 | 10 | ante_num_pattern = ( 11 | f"(?depuis|depuis\s+le|il\s+y\s+a|à)", 5 | r"(?Pdans)", 6 | ] 7 | 8 | following_directions = [ 9 | r"(?Pprochaine?s?|suivante?s?|plus\s+tard)", 10 | r"(?Pderni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)", 11 | ] 12 | 13 | preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True) 14 | following_direction_pattern = make_pattern(following_directions, with_breaks=True) 15 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/modes.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | modes = [ 4 | r"(?Pdepuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)", 5 | r"(?Pjusqu'[àa]u?|au)", 6 | ] 7 | 8 | mode_pattern = make_pattern(modes, with_breaks=True) 9 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/months.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | letter_months = [ 4 | r"(?Pjanvier|janv\.?)", 5 | r"(?Pf[ée]vrier|f[ée]v\.?)", 6 | r"(?Pmars|mar\.?)", 7 | r"(?Pavril|avr\.?)", 8 | r"(?Pmai)", 9 | 
r"(?Pjuin)", 10 | r"(?Pjuillet|juill?\.?)", 11 | r"(?Pao[uû]t)", 12 | r"(?Pseptembre|sept?\.?)", 13 | r"(?Poctobre|oct\.?)", 14 | r"(?Pnovembre|nov\.?)", 15 | r"(?Pd[ée]cembre|d[ée]c\.?)", 16 | ] 17 | 18 | 19 | letter_month_pattern = make_pattern(letter_months, with_breaks=True) 20 | 21 | numeric_month_pattern = r"(?{numeric_month_pattern})" 25 | lz_numeric_month_pattern = f"(?P{lz_numeric_month_pattern})" 26 | month_pattern = f"({letter_month_pattern}|{numeric_month_pattern})" 27 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/time.py: -------------------------------------------------------------------------------- 1 | hour_pattern = r"(?0?[0-9]|1\d|2[0-3])(?!\d)" 2 | lz_hour_pattern = r"(?0[1-9]|[12]\d|3[01])(?!\d)" 3 | 4 | minute_pattern = r"(?0?[1-9]|[1-5]\d)(?!\d)" 5 | lz_minute_pattern = r"(?0[0-9]|[1-5]\d)(?!\d)" 6 | 7 | second_pattern = r"(?0?[1-9]|[1-5]\d)(?!\d)" 8 | lz_second_pattern = r"(?0[0-9]|[1-5]\d)(?!\d)" 9 | 10 | # The time pattern is always optional 11 | time_pattern = ( 12 | r"(\s.{,3}" 13 | + f"{hour_pattern}[h:]({lz_minute_pattern})?" 14 | + f"((:|m|min){lz_second_pattern})?" 15 | + ")?" 
16 | ) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/units.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | units = [ 4 | r"(?Pans?|ann[ée]es?)", 5 | r"(?Psemestres?)", 6 | r"(?Ptrimestres?)", 7 | r"(?Pmois)", 8 | r"(?Psemaines?)", 9 | r"(?Pjours?|journ[ée]es?)", 10 | r"(?Ph|heures?)", 11 | r"(?Pmin|minutes?)", 12 | r"(?Psec|secondes?|s)", 13 | ] 14 | 15 | unit_pattern = make_pattern(units, with_breaks=True) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/atomic/years.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from typing import List 3 | 4 | from edsnlp.utils.regex import make_pattern 5 | 6 | year_patterns: List[str] = [ 7 | r"19\d\d", 8 | ] + [str(year) for year in range(2000, date.today().year + 2)] 9 | 10 | full_year_pattern = make_pattern(year_patterns, name="year") 11 | year_pattern = make_pattern(year_patterns + [r"\d\d"], name="year") 12 | 13 | full_year_pattern = r"(?cette\s+ann[ée]e)(?![-\s]l[àa])", 7 | r"(?Pce\s+jour|aujourd['\s]?hui)", 8 | r"(?Pcette\s+semaine|ces\sjours[-\s]ci)", 9 | r"(?Pce\smois([-\s]ci)?)", 10 | ] 11 | 12 | current_pattern = make_pattern(current_patterns, with_breaks=True) 13 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/duration.py: -------------------------------------------------------------------------------- 1 | from .atomic import numbers, units 2 | 3 | cue_pattern = r"(pendant|durant|pdt)" 4 | 5 | duration_pattern = [ 6 | cue_pattern + r".{,3}" + numbers.number_pattern + r"\s*" + units.unit_pattern 7 | ] 8 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/dates/patterns/false_positive.py: 
-------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | from .atomic.delimiters import delimiters 4 | 5 | # Pagination 6 | page_patterns = [r"\d\/\d"] 7 | 8 | # Phone numbers 9 | phone_patterns = [r"(\d\d" + delimiter + r"){3,}\d\d" for delimiter in delimiters] 10 | 11 | false_positive_pattern = make_pattern(page_patterns + phone_patterns) 12 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/quantities/__init__.py: -------------------------------------------------------------------------------- 1 | from edsnlp.pipes.misc.quantities.quantities import QuantitiesMatcher 2 | from edsnlp.pipes.misc.quantities.patterns import * 3 | 4 | from . import factory 5 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/quantities/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from . 
import patterns 4 | from .quantities import QuantitiesMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | quantities=list(patterns.common_quantities.keys()), # noqa: E501 8 | units_config=patterns.units_config, 9 | number_terms=patterns.number_terms, 10 | number_regex=patterns.number_regex, 11 | stopwords=patterns.stopwords, 12 | unit_divisors=patterns.unit_divisors, 13 | ignore_excluded=True, 14 | compose_units=True, 15 | attr="NORM", 16 | extract_ranges=False, 17 | range_patterns=patterns.range_patterns, 18 | after_snippet_limit=6, 19 | before_snippet_limit=10, 20 | span_getter=None, 21 | merge_mode="intersect", 22 | as_ents=False, 23 | span_setter=None, 24 | ) 25 | 26 | create_component = registry.factory.register( 27 | "eds.quantities", 28 | assigns=["doc.spans", "doc.ents"], 29 | deprecated=["eds.measures", "eds.measurements"], 30 | )(QuantitiesMatcher) 31 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/reason/__init__.py: -------------------------------------------------------------------------------- 1 | from .patterns import reasons 2 | from .reason import ReasonMatcher 3 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/reason/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .reason import ReasonMatcher 4 | 5 | DEFAULT_CONFIG = dict( 6 | reasons=None, 7 | attr="TEXT", 8 | use_sections=False, 9 | ignore_excluded=False, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.reason", 14 | assigns=["doc.spans", "doc.ents"], 15 | deprecated=["reason"], 16 | )(ReasonMatcher) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/reason/patterns.py: -------------------------------------------------------------------------------- 1 | reasons = dict( 2 | reasons=[ 3 | r"(?i)motif de l.?hospitalisation : .+", 4 
| r"(?i)hospitalis[ée].?.*(pour|. cause|suite [àa]).+", 5 | ( 6 | r"(?i)(consulte|prise en charge" 7 | r"(?!\set\svous\sassurer\sun\straitement\sadapté)).*pour.+" 8 | ), 9 | r"(?i)motif\sd.hospitalisation\s:.+", 10 | r"(?i)au total\s?\:?\s?\n?.+", 11 | r"(?i)motif\sde\sla\sconsultation", 12 | r"(?i)motif\sd.admission", 13 | r"(?i)conclusion\smedicale", 14 | ] 15 | ) 16 | 17 | sections_reason = ["motif", "conclusion"] 18 | 19 | section_exclude = [ 20 | "antécédents", 21 | "antécédents familiaux", 22 | "histoire de la maladie", 23 | ] 24 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/sections/__init__.py: -------------------------------------------------------------------------------- 1 | from .sections import SectionsMatcher 2 | 3 | Sections = SectionsMatcher 4 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/sections/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .patterns import sections 4 | from .sections import SectionsMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | sections=sections, 8 | add_patterns=True, 9 | attr="NORM", 10 | ignore_excluded=True, 11 | ) 12 | 13 | create_component = registry.factory.register( 14 | "eds.sections", 15 | assigns=["doc.spans", "doc.ents"], 16 | deprecated=["sections"], 17 | )(SectionsMatcher) 18 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/split/__init__.py: -------------------------------------------------------------------------------- 1 | from .split import Split 2 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/tables/__init__.py: -------------------------------------------------------------------------------- 1 | from .tables import TablesMatcher 2 | 
-------------------------------------------------------------------------------- /edsnlp/pipes/misc/tables/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | from edsnlp.pipes.misc.tables import TablesMatcher 3 | 4 | DEFAULT_CONFIG = dict( 5 | tables_pattern=None, 6 | sep_pattern=None, 7 | attr="TEXT", 8 | ignore_excluded=True, 9 | ) 10 | 11 | create_component = registry.factory.register( 12 | "eds.tables", 13 | assigns=["doc.spans", "doc.ents"], 14 | deprecated=["tables"], 15 | )(TablesMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/misc/tables/patterns.py: -------------------------------------------------------------------------------- 1 | sep = ["¦", "|"] 2 | regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n){{{n},}}"] 3 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/adicap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/adicap/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/adicap/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .adicap import AdicapMatcher 4 | from .patterns import adicap_prefix, base_code 5 | 6 | DEFAULT_CONFIG = dict( 7 | pattern=base_code, 8 | prefix=adicap_prefix, 9 | window=500, 10 | attr="TEXT", 11 | label="adicap", 12 
| span_setter={"ents": True, "adicap": True}, 13 | ) 14 | 15 | create_component = registry.factory.register( 16 | "eds.adicap", 17 | assigns=["doc.ents", "doc.spans"], 18 | )(AdicapMatcher) 19 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/adicap/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pydantic 4 | 5 | 6 | class AdicapCode(pydantic.BaseModel): 7 | code: str 8 | sampling_mode: Optional[str] = None 9 | technic: Optional[str] = None 10 | organ: Optional[str] = None 11 | pathology: Optional[str] = None 12 | pathology_type: Optional[str] = None 13 | behaviour_type: Optional[str] = None 14 | 15 | def norm(self) -> str: 16 | return self.code 17 | 18 | def __str__(self): 19 | return self.norm() 20 | 21 | if pydantic.VERSION < "2": 22 | model_dump = pydantic.BaseModel.dict 23 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/adicap/patterns.py: -------------------------------------------------------------------------------- 1 | """ 2 | Source : 3 | https://esante.gouv.fr/sites/default/files/media_entity/documents/cgts_sem_adicap_fiche-detaillee.pdf 4 | """ 5 | 6 | 7 | # d1_4 = r"[A-Z]{4}" 8 | d1_4 = r"[A-Z]\.?[A-Z]\.?[A-Z]{2}\.?" 
9 | d5_8_v1 = r"\d{4}" 10 | d5_8_v2 = r"\d{4}|[A-Z][0-9A-Z][A-Z][0-9]" 11 | d5_8_v3 = r"[0-9A-Z][0-9][09A-Z][0-9]" 12 | d5_8_v4 = r"0[A-Z][0-9]{2}" 13 | 14 | 15 | adicap_prefix = r"(?i)(codification|adicap)" 16 | base_code = ( 17 | r"(" 18 | + d1_4 19 | + r"(?:" 20 | + d5_8_v1 21 | + r"|" 22 | + d5_8_v2 23 | + r"|" 24 | + d5_8_v3 25 | + r"|" 26 | + d5_8_v4 27 | + r"))" 28 | ) 29 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/behaviors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/behaviors/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/behaviors/alcohol/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/behaviors/alcohol/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/behaviors/alcohol/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .alcohol import AlcoholMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="alcohol", 9 | span_setter={"ents": True, "alcohol": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.alcohol", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(AlcoholMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/behaviors/alcohol/patterns.py: -------------------------------------------------------------------------------- 1 | default_pattern = dict( 2 | source="alcohol", 3 | regex=[ 4 | r"\balco[ol]", 5 | r"\bethyl", 6 | r"(? 
Dict[str, List[str]]: 9 | df = pd.read_csv(BASE_DIR / "resources" / "cim10.csv.gz") 10 | 11 | df["code_pattern"] = df["code"] 12 | df["code_point"] = df["code"].str[:2] + "." + df["code"].str[2:] 13 | df["code_space"] = df["code"].str[0] + " " + df["code"].str[1:] 14 | df["code_space_point"] = ( 15 | df["code"].str[0] + " " + df["code"].str[1] + "." + df["code"].str[2:] 16 | ) 17 | 18 | df = pd.concat( 19 | [ 20 | df[["code", "short"]].rename(columns={"short": "patterns"}), 21 | df[["code", "long"]].rename(columns={"long": "patterns"}), 22 | df[["code", "code_pattern"]].rename(columns={"code_pattern": "patterns"}), 23 | df[["code", "code_point"]].rename(columns={"code_point": "patterns"}), 24 | df[["code", "code_space"]].rename(columns={"code_space": "patterns"}), 25 | df[["code", "code_space_point"]].rename( 26 | columns={"code_space_point": "patterns"} 27 | ), 28 | ] 29 | ) 30 | 31 | patterns = df.groupby("code")["patterns"].agg(list).to_dict() 32 | 33 | return patterns 34 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/covid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/covid/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/covid/patterns.py: -------------------------------------------------------------------------------- 1 | from edsnlp.utils.regex import make_pattern 2 | 3 | covid = [ 4 | r"covid([-\s]?19)?", 5 | r"sars[-\s]?cov[-\s]?2", 6 | r"corona[-\s]?virus", 7 | ] 8 | 9 | diseases = [r"pneumopathies?", r"infections?"] 10 | 11 | patterns = [r"(" + make_pattern(diseases) + r"\s[àa]u?\s)?" 
+ make_pattern(covid)] 12 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/aids/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/aids/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/aids/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp import registry 2 | 3 | from .aids import AIDSMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="aids", 9 | span_setter={"ents": True, "aids": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.aids", 14 | assigns=["doc.ents", "doc.spans"], 15 | deprecated=["eds.AIDS"], 16 | )(AIDSMatcher) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/aids/patterns.py: -------------------------------------------------------------------------------- 1 | aids = dict( 2 | source="aids", 3 | regex=[ 4 | r"(vih.{1,5}stade.{1,5})?\bsida\b", 5 | ], 6 | regex_attr="NORM", 7 | ) 8 | 9 | hiv = dict( 10 | source="hiv", 11 | regex=[ 12 | r"\bhiv\b", 13 | r"\bvih\b", 14 | ], 15 | exclude=dict( 16 | regex=["serologie", "prelevement"], 17 | window=(-20, 20), 18 | limit_to_sentence=False, 19 | ), 20 | assign=[ 21 | dict( 22 | name="opportunist", 23 | regex=r"(" 24 | + r"|".join( 25 | [ 26 | r"kapo[sz]i", 27 | r"toxoplasmose", 28 
| r"meningo.?encephalite.toxo", 29 | r"pneumocystose", 30 | r"\bpep\b", 31 | r"pneumocystis", 32 | r"cryptococcose", 33 | r"cytomégalovirus", 34 | r"myobact", 35 | r"opportunist", 36 | r"co.?infect", 37 | ] 38 | ) 39 | + ")" 40 | + r"(?!.{0,20}(?:non|0))", 41 | window=(-10, 30), 42 | limit_to_sentence=False, 43 | ), 44 | dict( 45 | name="stage", 46 | regex=r"stade.{0,5}\b(b|c)\b", 47 | window=10, 48 | ), 49 | ], 50 | regex_attr="NORM", 51 | ) 52 | 53 | default_patterns = [ 54 | aids, 55 | hiv, 56 | ] 57 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/cerebrovascular_accident/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/cerebrovascular_accident/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/cerebrovascular_accident/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .cerebrovascular_accident import CerebrovascularAccidentMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="cerebrovascular_accident", 9 | span_setter={"ents": True, "cerebrovascular_accident": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.cerebrovascular_accident", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(CerebrovascularAccidentMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/ckd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/ckd/__init__.py 
-------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/ckd/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp import registry 2 | 3 | from .ckd import CKDMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="ckd", 9 | span_setter={"ents": True, "ckd": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.ckd", 14 | assigns=["doc.ents", "doc.spans"], 15 | deprecated=["eds.CKD"], 16 | )(CKDMatcher) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/congestive_heart_failure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/congestive_heart_failure/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/congestive_heart_failure/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .congestive_heart_failure import CongestiveHeartFailureMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="congestive_heart_failure", 9 | span_setter={"ents": True, "congestive_heart_failure": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.congestive_heart_failure", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(CongestiveHeartFailureMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/connective_tissue_disease/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/connective_tissue_disease/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/connective_tissue_disease/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .connective_tissue_disease import ConnectiveTissueDiseaseMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="connective_tissue_disease", 9 | span_setter={"ents": True, "connective_tissue_disease": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.connective_tissue_disease", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(ConnectiveTissueDiseaseMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/copd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/copd/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/copd/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp import registry 2 | 3 | from .copd import COPDMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="copd", 9 | span_setter={"ents": True, "copd": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.copd", 14 | assigns=["doc.ents", "doc.spans"], 15 | deprecated=["eds.COPD"], 16 | )(COPDMatcher) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/dementia/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/dementia/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/dementia/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .dementia import DementiaMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="dementia", 9 | span_setter={"ents": True, "dementia": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.dementia", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(DementiaMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/diabetes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/diabetes/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/diabetes/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .diabetes import DiabetesMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="diabetes", 9 | span_setter={"ents": True, "diabetes": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.diabetes", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(DiabetesMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/hemiplegia/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/hemiplegia/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/hemiplegia/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .hemiplegia import HemiplegiaMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="hemiplegia", 9 | span_setter={"ents": True, "hemiplegia": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.hemiplegia", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(HemiplegiaMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/hemiplegia/patterns.py: -------------------------------------------------------------------------------- 1 | main_pattern = dict( 2 | source="main", 3 | regex=[ 4 | r"hemiplegi", 5 | r"tetraplegi", 6 | r"quadriplegi", 7 | r"paraplegi", 8 | r"neuropathie.{1,25}motrice.{1,30}type [5V]", 9 | r"charcot.?marie.?tooth", 10 | r"locked.?in", 11 | r"syndrome.{1,5}(enfermement|verrouillage)|(desafferen)", 12 | r"paralysie.{1,10}hemicorps", 13 | r"paralysie.{1,10}jambe", 14 | r"paralysie.{1,10}membre", 15 | r"paralysie.{1,10}cote", 16 | r"paralysie.{1,5}cerebrale.{1,5}spastique", 17 | ], 18 | regex_attr="NORM", 19 | ) 20 | 21 | acronym = dict( 22 | source="acronym", 23 | regex=[ 24 | r"\bLIS\b", 25 | r"\bNMSH\b", 26 | ], 27 | regex_attr="TEXT", 28 | ) 29 | 30 | default_patterns = [ 31 | main_pattern, 32 | acronym, 33 | ] 34 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/leukemia/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/leukemia/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/leukemia/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .leukemia import LeukemiaMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="leukemia", 9 | span_setter={"ents": True, "leukemia": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.leukemia", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(LeukemiaMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/liver_disease/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/liver_disease/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/liver_disease/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .liver_disease import LiverDiseaseMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="liver_disease", 9 | span_setter={"ents": True, "liver_disease": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.liver_disease", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(LiverDiseaseMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/lymphoma/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/lymphoma/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/lymphoma/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .lymphoma import LymphomaMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="lymphoma", 9 | span_setter={"ents": True, "lymphoma": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.lymphoma", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(LymphomaMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/myocardial_infarction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/myocardial_infarction/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/myocardial_infarction/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .myocardial_infarction import MyocardialInfarctionMatcher 4 | from .patterns import default_patterns 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="myocardial_infarction", 9 | span_setter={"ents": True, "myocardial_infarction": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.myocardial_infarction", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(MyocardialInfarctionMatcher) 16 | 
-------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/myocardial_infarction/patterns.py: -------------------------------------------------------------------------------- 1 | from ..terms import HEART 2 | 3 | main_pattern = dict( 4 | source="main", 5 | regex=[ 6 | r"coronaropathie", 7 | r"angor.{1,5}instable", 8 | r"cardiopathie(?!.{0,20}non).{0,20}(ischem|arteriosc)", 9 | r"cardio.?myopathie(?!.{0,20}non).{0,20}(ischem|arteriosc)", 10 | r"ischemi.{1,15}myocard", 11 | r"syndrome.{1,5}corona.{1,10}aigu", 12 | r"syndrome.{1,5}corona.{1,10}st", 13 | r"pontage.{1,5}mammaire", 14 | ], 15 | regex_attr="NORM", 16 | ) 17 | 18 | with_localization = dict( 19 | source="with_localization", 20 | regex=[ 21 | r"\bstent", 22 | r"endoprothese", 23 | r"pontage", 24 | r"anevr[iy]sme", 25 | "infarctus", 26 | r"angioplasti", 27 | ], 28 | assign=[ 29 | dict( 30 | name="heart_localized", 31 | regex="(" + r"|".join(HEART) + ")", 32 | window=(-10, 10), 33 | ), 34 | ], 35 | regex_attr="NORM", 36 | ) 37 | 38 | acronym = dict( 39 | source="acronym", 40 | regex=[ 41 | r"\bidm\b", 42 | r"\bsca\b", 43 | r"\batl\b", 44 | ], 45 | regex_attr="NORM", 46 | assign=dict( 47 | name="segment", 48 | regex=r"st([+-])", 49 | window=2, 50 | ), 51 | ) 52 | 53 | 54 | default_patterns = [ 55 | main_pattern, 56 | with_localization, 57 | acronym, 58 | ] 59 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/peptic_ulcer_disease/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/peptic_ulcer_disease/factory.py: -------------------------------------------------------------------------------- 1 | from 
edsnlp.core import registry 2 | 3 | from .patterns import default_patterns 4 | from .peptic_ulcer_disease import PepticUlcerDiseaseMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="peptic_ulcer_disease", 9 | span_setter={"ents": True, "peptic_ulcer_disease": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.peptic_ulcer_disease", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(PepticUlcerDiseaseMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/peptic_ulcer_disease/patterns.py: -------------------------------------------------------------------------------- 1 | main_pattern = dict( 2 | source="main", 3 | regex=[ 4 | r"ulcere.{1,10}gastr", 5 | r"ulcere.{1,10}duoden", 6 | r"ulcere.{1,10}antra", 7 | r"ulcere.{1,10}pept", 8 | r"ulcere.{1,10}estomac", 9 | r"ulcere.{1,10}curling", 10 | r"ulcere.{1,10}bulb", 11 | r"(œ|oe)sophagites.{1,5}pepti.{1,10}ulcer", 12 | r"gastrite.{1,20}ulcer", 13 | r"antrite.{1,5}ulcer", 14 | ], 15 | regex_attr="NORM", 16 | ) 17 | 18 | acronym = dict( 19 | source="acronym", 20 | regex=[ 21 | r"\bUGD\b", 22 | ], 23 | regex_attr="TEXT", 24 | ) 25 | 26 | generic = dict( 27 | source="generic", 28 | regex=r"ulcere", 29 | regex_attr="NORM", 30 | assign=dict( 31 | name="is_peptic", 32 | regex=r"\b(gastr|digest)", 33 | window=(-20, 20), 34 | limit_to_sentence=False, 35 | ), 36 | ) 37 | 38 | default_patterns = [ 39 | main_pattern, 40 | acronym, 41 | generic, 42 | ] 43 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/peripheral_vascular_disease/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/__init__.py 
-------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/peripheral_vascular_disease/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp import registry 2 | 3 | from .patterns import default_patterns 4 | from .peripheral_vascular_disease import PeripheralVascularDiseaseMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | label="peripheral_vascular_disease", 9 | span_setter={"ents": True, "peripheral_vascular_disease": True}, 10 | ) 11 | 12 | create_component = registry.factory.register( 13 | "eds.peripheral_vascular_disease", 14 | assigns=["doc.ents", "doc.spans"], 15 | )(PeripheralVascularDiseaseMatcher) 16 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/solid_tumor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/disorders/solid_tumor/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/solid_tumor/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .patterns import default_patterns 4 | from .solid_tumor import SolidTumorMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | patterns=default_patterns, 8 | use_tnm=False, 9 | label="solid_tumor", 10 | span_setter={"ents": True, "solid_tumor": True}, 11 | ) 12 | 13 | create_component = registry.factory.register( 14 | "eds.solid_tumor", 15 | assigns=["doc.ents", "doc.spans"], 16 | )(SolidTumorMatcher) 17 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/disorders/terms.py: -------------------------------------------------------------------------------- 1 | HEART = [ 2 | r"cardi", 3 | 
r"coronari", 4 | r"coronair", 5 | r"\bcd\b", 6 | r"\biva\d?\b", 7 | r"\bivp\d?\b", 8 | r"\bivg\d?\b", 9 | r"\bivd\d?\b", 10 | r"intra.?va\b", 11 | r"intra.?vp\b", 12 | r"intra.?vg\b", 13 | r"intra.?vd\b", 14 | r"circonflexe", 15 | r"\bcx\b", 16 | r"marginale", 17 | r"\bmg\b", 18 | r"\bdiago", 19 | r"\brvp\b", 20 | r"myocard", 21 | "apical", 22 | "septal", 23 | "ventricul", 24 | "coeur", 25 | "cœur", 26 | "auriculaire", 27 | "parietal", 28 | "septum", 29 | ] 30 | 31 | BRAIN = [ 32 | r"cerveau", 33 | r"cereb", 34 | r"cran", 35 | r"v4", 36 | r"m1", 37 | r"aica", 38 | r"\bpica", 39 | r"basilaire", 40 | r"polygone de willis", 41 | r"cercle de willis", 42 | r"sylvien", 43 | r"arachnoi", 44 | r"meninge", 45 | r"dura(?:l|ux)", 46 | r"puncti", 47 | r"front", 48 | r"tempo", 49 | r"occipi", 50 | r"parieta", 51 | # r"segment", Too generic 52 | ] 53 | 54 | PERIPHERAL = [ 55 | "pied", 56 | "main", 57 | r"\bmi\b", 58 | r"\bmig\b", 59 | r"\bmid\b", 60 | "membre", 61 | "jambe", 62 | "bras", 63 | "doigt", 64 | "digital", 65 | "orteil", 66 | ] 67 | 68 | ASYMPTOMATIC = [ 69 | r"asympto", 70 | r"sans.decompens", 71 | r"non.decompens", 72 | ] 73 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/drugs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/drugs/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/drugs/patterns.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict, List 3 | 4 | from edsnlp import BASE_DIR 5 | 6 | drugs_file = BASE_DIR / "resources" / "drugs.json" 7 | 8 | 9 | def get_patterns() -> Dict[str, List[str]]: 10 | with open(drugs_file, "r") as f: 11 | return json.load(f) 12 | 
-------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/__init__.py: -------------------------------------------------------------------------------- 1 | from edsnlp.pipes.ner.scores.base_score import SimpleScoreMatcher 2 | 3 | Score = SimpleScoreMatcher 4 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/charlson/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/charlson/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/charlson/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import spacy 4 | 5 | regex = [r"charlson"] 6 | 7 | value_extract = r"^.*?[\n\W]*?(\d+)" 8 | 9 | score_normalization_str = "score_normalization.charlson" 10 | 11 | 12 | @spacy.registry.misc(score_normalization_str) 13 | def score_normalization(extracted_score: Union[str, None]): 14 | """ 15 | Charlson score normalization. 16 | If available, returns the integer value of the Charlson score. 
17 | """ 18 | score_range = list(range(0, 30)) 19 | try: 20 | if (extracted_score is not None) and (int(extracted_score) in score_range): 21 | return int(extracted_score) 22 | except ValueError: 23 | return None 24 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/elston_ellis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/elston_ellis/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/elston_ellis/patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Union 3 | 4 | import spacy 5 | 6 | regex = [r"[Ee]lston (& |et |and )?[Ee]llis", r"\b[Ee]{2}\b"] 7 | 8 | pattern1 = r"[^\d\(\)]*[0-3]" 9 | pattern2 = r".{0,2}[\+,]" 10 | value_extract = rf"(?s).(\({pattern1}{pattern2}{pattern1}{pattern2}{pattern1}\))" 11 | 12 | score_normalization_str = "score_normalization.elstonellis" 13 | 14 | 15 | @spacy.registry.misc(score_normalization_str) 16 | def score_normalization(extracted_score: Union[str, None]): 17 | """ 18 | Elston and Ellis score normalization. 19 | If available, returns the integer value of the Elston and Ellis score. 
20 | """ 21 | try: 22 | x = 0 23 | for i in re.findall(r"[0-3]", extracted_score): 24 | x += int(i) 25 | 26 | if x <= 5: 27 | return 1 28 | 29 | elif x <= 7: 30 | return 2 31 | 32 | else: 33 | return 3 34 | 35 | except ValueError: 36 | return None 37 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/emergency/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/ccmu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/emergency/ccmu/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/ccmu/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import spacy 4 | 5 | regex = [r"\bccmu\b"] 6 | 7 | value_extract = r"^.*?[\n\W]*?(\d+)" 8 | 9 | score_normalization_str = "score_normalization.ccmu" 10 | 11 | 12 | @spacy.registry.misc(score_normalization_str) 13 | def score_normalization(extracted_score: Union[str, None]): 14 | """ 15 | CCMU score normalization. 16 | If available, returns the integer value of the CCMU score. 
17 | """ 18 | score_range = [1, 2, 3, 4, 5] 19 | if (extracted_score is not None) and (int(extracted_score) in score_range): 20 | return int(extracted_score) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/gemsa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/emergency/gemsa/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/gemsa/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import spacy 4 | 5 | regex = [r"\bgemsa\b"] 6 | 7 | value_extract = r"^.*?[\n\W]*?(\d+)" 8 | 9 | score_normalization_str = "score_normalization.gemsa" 10 | 11 | 12 | @spacy.registry.misc(score_normalization_str) 13 | def score_normalization(extracted_score: Union[str, None]): 14 | """ 15 | GEMSA score normalization. 16 | If available, returns the integer value of the GEMSA score. 
17 | """ 18 | score_range = [1, 2, 3, 4, 5, 6] 19 | if (extracted_score is not None) and (int(extracted_score) in score_range): 20 | return int(extracted_score) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/priority/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/emergency/priority/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/emergency/priority/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import spacy 4 | 5 | regex = [r"\bpriorite\b"] 6 | 7 | value_extract = r"^.*?[\n\W]*?(\d+)" 8 | 9 | score_normalization_str = "score_normalization.priority" 10 | 11 | 12 | @spacy.registry.misc(score_normalization_str) 13 | def score_normalization(extracted_score: Union[str, None]): 14 | """ 15 | Priority score normalization. 16 | If available, returns the integer value of the priority score. 
17 | """ 18 | score_range = list(range(0, 6)) 19 | if (extracted_score is not None) and (int(extracted_score) in score_range): 20 | return int(extracted_score) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | from edsnlp.pipes.ner.scores.base_score import SimpleScoreMatcher 3 | 4 | DEFAULT_CONFIG = dict( 5 | regex=None, 6 | attr="NORM", 7 | value_extract=None, 8 | score_normalization=None, 9 | window=7, 10 | ignore_excluded=False, 11 | ignore_space_tokens=False, 12 | flags=0, 13 | span_setter={"ents": True}, 14 | ) 15 | 16 | create_component = registry.factory.register( 17 | "eds.score", 18 | assigns=["doc.ents", "doc.spans"], 19 | deprecated=["score"], 20 | )(SimpleScoreMatcher) 21 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/sofa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/scores/sofa/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/scores/sofa/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import spacy 4 | 5 | regex = [r"\bsofa\b"] 6 | 7 | digits = r"[^\d]*(\d*)" 8 | 9 | value_extract = [ 10 | dict( 11 | name="method_max", 12 | regex=r"(max)", 13 | reduce_mode="keep_first", 14 | ), 15 | dict( 16 | name="method_24h", 17 | regex=r"(24h)", 18 | reduce_mode="keep_first", 19 | ), 20 | dict( 21 | name="method_adm", 22 | regex=r"(admission)", 23 | reduce_mode="keep_first", 24 | ), 25 | dict( 26 | name="value", 27 | regex=r"^.*?[\n\W]*?(\d+)(?![h0-9])", 28 | ), 29 | ] 30 | 31 | score_normalization_str = 
"score_normalization.sofa" 32 | 33 | 34 | @spacy.registry.misc(score_normalization_str) 35 | def score_normalization(extracted_score: Union[str, None]): 36 | """ 37 | Sofa score normalization. 38 | If available, returns the integer value of the SOFA score. 39 | """ 40 | score_range = list(range(0, 30)) 41 | if (extracted_score is not None) and (int(extracted_score) in score_range): 42 | return int(extracted_score) 43 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/suicide_attempt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/suicide_attempt/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/suicide_attempt/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | from edsnlp.pipes.ner.suicide_attempt.suicide_attempt import SuicideAttemptMatcher 3 | 4 | create_component = registry.factory.register( 5 | "eds.suicide_attempt", 6 | assigns=["doc.ents", "doc.spans"], 7 | )(SuicideAttemptMatcher) 8 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/tnm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/tnm/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/ner/tnm/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .patterns import tnm_pattern 4 | from .tnm import TNMMatcher 5 | 6 | DEFAULT_CONFIG = dict( 7 | pattern=tnm_pattern, 8 | attr="TEXT", 9 | label="tnm", 10 | span_setter={"ents": True, 
"tnm": True}, 11 | ) 12 | 13 | create_component = registry.factory.register( 14 | "eds.tnm", 15 | assigns=["doc.ents", "doc.spans"], 16 | deprecated=["eds.TNM"], 17 | )(TNMMatcher) 18 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/tnm/patterns.py: -------------------------------------------------------------------------------- 1 | prefix_pattern = r"(?P[cpPyraum]p?)" 2 | tumour_pattern = r"T\s?(?P([0-4o]|is))?(?P[abcdx]|mi)?" 3 | tumour_pattern += r"(?:\((?P[^()]{1,10})\))?" 4 | node_pattern = r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}N\s?(?P[0-3o]|x)" 5 | node_pattern += ( 6 | r"(?P[abcdx]|mi)?(?:\((?P[^()]{1,10})\))?)" 7 | ) 8 | 9 | metastasis_pattern = ( 10 | r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P([01o]|x))x?)" # noqa: E501 11 | ) 12 | resection_completeness = r"(\s{,2}\/?\s{,2}R\s?(?P[012]))" 13 | 14 | version_pattern = ( 15 | r"\(?(?Puicc|accj|tnm|UICC|ACCJ|TNM)" 16 | r"\s+([éeE]ditions|[éeE]d\.?)?\s{,2}?" 17 | r"(?P\d{4}|\d{2})\)?" 18 | ) 19 | 20 | spacer = r"(.|\n){1,5}" 21 | 22 | tnm_pattern = f"(?<={version_pattern}{spacer})?" 23 | tnm_pattern += prefix_pattern + r"\s{,2}?" + f"({tumour_pattern})" 24 | tnm_pattern += r"(\s{,2}" + f"{node_pattern})?" 25 | tnm_pattern += r"(\s{,2}" + f"{metastasis_pattern})?" 26 | tnm_pattern += r"(\s{,2}" + f"{resection_completeness})?" 27 | tnm_pattern += f"({spacer}{version_pattern})?" 
28 | tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)" 29 | -------------------------------------------------------------------------------- /edsnlp/pipes/ner/umls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/ner/umls/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/qualifiers/__init__.py -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/family/__init__.py: -------------------------------------------------------------------------------- 1 | from .family import FamilyContextQualifier 2 | 3 | FamilyContext = FamilyContextQualifier 4 | -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/family/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .family import FamilyContextQualifier 4 | 5 | DEFAULT_CONFIG = dict( 6 | attr="NORM", 7 | family=None, 8 | termination=None, 9 | use_sections=True, 10 | span_getter=None, 11 | on_ents_only=True, 12 | explain=False, 13 | ) 14 | 15 | create_component = registry.factory.register( 16 | "eds.family", 17 | assigns=["span._.family"], 18 | deprecated=["family"], 19 | )(FamilyContextQualifier) 20 | -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/family/patterns.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | family: List[str] = [ 4 | "aïeul", 5 | "aïeux", 6 | "antécédent familial", 7 | "antécédents 
familiaux", 8 | "arrière-grand-mère", 9 | "arrière-grand-père", 10 | "arrière-grands-parents", 11 | "cousin", 12 | "cousine", 13 | "cousines", 14 | "cousins", 15 | "enfant", 16 | "enfants", 17 | "épouse", 18 | "époux", 19 | "familial", 20 | "familiale", 21 | "familiales", 22 | "familiaux", 23 | "famille", 24 | "fiancé", 25 | "fiancée", 26 | "fils", 27 | "fille", 28 | "filles", 29 | "frère", 30 | "frères", 31 | "grand-mère", 32 | "grand-père", 33 | "grands-parents", 34 | "maman", 35 | "mari", 36 | "mère", 37 | "oncle", 38 | "papa", 39 | "parent", 40 | "parents", 41 | "père", 42 | "soeur", 43 | "sœur", 44 | "sœurs", 45 | "soeurs", 46 | "tante", 47 | "neveu", 48 | "neveux", 49 | "nièce", 50 | "nièces", 51 | ] 52 | -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/history/__init__.py: -------------------------------------------------------------------------------- 1 | from .history import HistoryQualifier 2 | 3 | History = HistoryQualifier 4 | -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/history/factory.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import registry 2 | 3 | from .history import HistoryQualifier 4 | 5 | DEFAULT_CONFIG = dict( 6 | history=None, 7 | termination=None, 8 | use_sections=False, 9 | use_dates=False, 10 | attr="NORM", 11 | history_limit=14, 12 | closest_dates_only=True, 13 | exclude_birthdate=True, 14 | span_getter=None, 15 | on_ents_only=True, 16 | explain=False, 17 | ) 18 | 19 | create_component = registry.factory.register( 20 | "eds.history", 21 | assigns=["span._.history"], 22 | deprecated=[ 23 | "history", 24 | "antecedents", 25 | "eds.antecedents", 26 | ], 27 | )(HistoryQualifier) 28 | -------------------------------------------------------------------------------- /edsnlp/pipes/qualifiers/history/patterns.py: 
from edsnlp.core import registry

from .hypothesis import HypothesisQualifier

# Defaults applied when the component is instantiated through the factory;
# ``None`` values presumably fall back to the bundled pattern lists —
# TODO confirm against HypothesisQualifier.
DEFAULT_CONFIG = dict(
    pseudo=None,
    preceding=None,
    following=None,
    verbs_eds=None,
    verbs_hyp=None,
    termination=None,
    attr="NORM",  # match on the normalised text by default
    span_getter=None,
    on_ents_only=True,
    within_ents=False,
    explain=False,
)

# Register the qualifier under "eds.hypothesis"; the bare "hypothesis"
# name is kept as a deprecated alias.
create_component = registry.factory.register(
    "eds.hypothesis",
    assigns=["span._.hypothesis"],
    deprecated=["hypothesis"],
)(HypothesisQualifier)
from edsnlp.core import registry

from .reported_speech import ReportedSpeechQualifier

# Defaults applied when the component is instantiated through the factory;
# ``None`` values presumably fall back to the bundled pattern lists —
# TODO confirm against ReportedSpeechQualifier.
DEFAULT_CONFIG = dict(
    pseudo=None,
    preceding=None,
    following=None,
    quotation=None,
    verbs=None,
    attr="NORM",  # match on the normalised text by default
    span_getter=None,
    on_ents_only=True,
    within_ents=False,
    explain=False,
)

# Register under "eds.reported_speech"; the two legacy names are kept as
# deprecated aliases.
create_component = registry.factory.register(
    "eds.reported_speech",
    assigns=["span._.reported_speech"],
    deprecated=[
        "reported_speech",
        "rspeech",
    ],
)(ReportedSpeechQualifier)
"expliquer", 16 | "faire remarquer", 17 | "indiquer", 18 | "informer", 19 | "insinuer", 20 | "insister", 21 | "jurer", 22 | "nier", 23 | "nier", 24 | "noter", 25 | "objecter", 26 | "observer", 27 | "parler", 28 | "promettre", 29 | "préciser", 30 | "prétendre", 31 | "prévenir", 32 | "raconter", 33 | "rappeler", 34 | "rapporter", 35 | "reconnaître", 36 | "réfuter", 37 | "répliquer", 38 | "répondre", 39 | "répéter", 40 | "révéler", 41 | "se plaindre", 42 | "souhaiter", 43 | "souligner", 44 | "supplier", 45 | "verbaliser", 46 | "vouloir", 47 | "vouloir", 48 | ] 49 | 50 | following: List[str] = [r"d'après le patient", r"d'après la patiente"] 51 | 52 | preceding: List[str] = [ 53 | r"pas de critique de", 54 | r"crainte de", 55 | r"menace de", 56 | r"insiste sur le fait que", 57 | r"d'après le patient", 58 | r"d'après la patiente", 59 | r"peur de", 60 | ] 61 | quotation: str = r"(\".+\")|(\«.+\»)" 62 | -------------------------------------------------------------------------------- /edsnlp/pipes/terminations.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | termination: List[str] = [ 4 | "et", 5 | "bien que", 6 | "même si", 7 | "mais", 8 | "or", 9 | "alors que", 10 | "sauf", 11 | "cependant", 12 | "pourtant", 13 | "cause de", 14 | "source de", 15 | "hormis", 16 | "car", 17 | "parce que", 18 | "pourtant", 19 | "puisque", 20 | "ni", 21 | "en raison de", 22 | "qui", 23 | "que", 24 | "ainsi que", 25 | "avec", 26 | "toutefois", 27 | "en dehors", 28 | "dans le cadre", 29 | "du fait", 30 | ".", 31 | ",", 32 | ";", 33 | "...", 34 | "…", 35 | "(", 36 | ")", 37 | '"', 38 | ] 39 | -------------------------------------------------------------------------------- /edsnlp/pipes/trainable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/edsnlp/pipes/trainable/__init__.py 
from edsnlp import registry

from .biaffine_dep_parser import TrainableBiaffineDependencyParser

# Register the trainable biaffine dependency parser under
# "eds.biaffine_dep_parser". ``assigns`` documents that the component
# writes token-level syntactic heads and dependency labels.
create_component = registry.factory.register(
    "eds.biaffine_dep_parser",
    assigns=["token.head", "token.dep"],
)(TrainableBiaffineDependencyParser)
from edsnlp import registry

from .transformer import Transformer

# Register the transformer embedding component under "eds.transformer".
# ``assigns`` is empty: the component sets no Doc/Span attributes here;
# presumably it only produces embeddings consumed by downstream trainable
# pipes — confirm against the Transformer class.
create_component = registry.factory.register(
    "eds.transformer",
    assigns=[],
    deprecated=[],
)(Transformer)
from typing import TYPE_CHECKING

from edsnlp import registry

from .extractive_qa import TrainableExtractiveQA

# Register the trainable extractive question-answering component under
# "eds.extractive_qa".
create_component = registry.factory.register(
    "eds.extractive_qa",
    assigns=[],
    deprecated=[],
)(TrainableExtractiveQA)

# At type-checking time, alias the factory to the class itself so IDEs and
# type checkers resolve the factory call signature to the component
# constructor; at runtime ``create_component`` stays the registered factory.
if TYPE_CHECKING:
    create_component = TrainableExtractiveQA
from typing import TYPE_CHECKING

from edsnlp.utils.lazy_module import lazify

# Turn this package into a lazy module: submodules/backends are imported on
# first attribute access rather than eagerly at import time.
lazify()

# Static imports for type checkers and IDEs only; at runtime these names are
# resolved lazily by ``lazify()`` above.
if TYPE_CHECKING:
    from .deprecated_pipe import pipe  # DEPRECATED
    from .spark import execute_spark_backend
    from .simple import execute_simple_backend
    from .multiprocessing import execute_multiprocessing_backend
from confit import Cli

from edsnlp.training.trainer import *  # noqa: F403
from edsnlp.training.trainer import registry, train

# Command-line entry point for training: exposes ``train`` as the "train"
# sub-command, with configuration resolved through the edsnlp registry.
app = Cli(pretty_exceptions_show_locals=False)
train_command = app.command(name="train", registry=registry)(train)

if __name__ == "__main__":
    app()
import warnings
from typing import Any, Union

from confit import VisibleDeprecationWarning
from spacy.tokens import Doc, Span, Token


def deprecated_extension(name: str, new_name: str) -> None:
    """Warn that the extension ``name`` was renamed to ``new_name``."""
    warnings.warn(
        f'The extension "{name}" is deprecated and will be '
        "removed in a future version. "
        f'Please use "{new_name}" instead.',
        VisibleDeprecationWarning,
    )


class deprecated_getter_factory:
    """
    Getter for a deprecated spaCy extension.

    When the old extension is read, emits a deprecation warning and
    forwards the access to the replacement extension.
    """

    def __init__(self, name: str, new_name: str):
        self.name = name
        self.new_name = new_name

    def __call__(self, toklike: Union[Token, Span, Doc]) -> Any:
        # Build the fully-qualified extension names for the warning message.
        owner = type(toklike).__name__
        deprecated_extension(
            f"{owner}._.{self.name}",
            f"{owner}._.{self.new_name}",
        )
        # Delegate to the new extension's value.
        return getattr(toklike._, self.new_name)
15 | """ 16 | 17 | def _getattr(obj, attr): 18 | return None if obj is None else getattr(obj, attr, *args) 19 | 20 | return functools.reduce(_getattr, [obj] + attr.split(".")) 21 | -------------------------------------------------------------------------------- /edsnlp/utils/inclusion.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Span 2 | 3 | 4 | def check_inclusion(span: Span, start: int, end: int) -> bool: 5 | """ 6 | Checks whether the span overlaps the boundaries. 7 | 8 | Parameters 9 | ---------- 10 | span : Span 11 | Span to check. 12 | start : int 13 | Start of the boundary 14 | end : int 15 | End of the boundary 16 | 17 | Returns 18 | ------- 19 | bool 20 | Whether the span overlaps the boundaries. 21 | """ 22 | 23 | if span.start >= end or span.end <= start: 24 | return False 25 | return True 26 | 27 | 28 | def check_sent_inclusion(span: Span, start: int, end: int) -> bool: 29 | """ 30 | Checks whether the span overlaps the boundaries. 31 | 32 | Parameters 33 | ---------- 34 | span : Span 35 | Span to check. 36 | start : int 37 | Start of the boundary 38 | end : int 39 | End of the boundary 40 | 41 | Returns 42 | ------- 43 | bool 44 | Whether the span overlaps the boundaries. 
45 | """ 46 | if span.sent.start >= end or span.sent.end <= start: 47 | return False 48 | return True 49 | -------------------------------------------------------------------------------- /edsnlp/utils/numbers.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from spacy.tokens import Span 4 | 5 | from edsnlp.matchers.utils import get_text 6 | 7 | DIGITS_VALUE = list(range(11)) 8 | DIGITS_STR = [ 9 | ["zero"], 10 | ["un", "une", "i"], 11 | ["deux", "ii"], 12 | ["trois", "iii"], 13 | ["quatre", "iv"], 14 | ["cinq", "v"], 15 | ["six", "vi"], 16 | ["sept", "vii"], 17 | ["huit", "viii"], 18 | ["neuf", "ix"], 19 | ["dix", "x"], 20 | ] 21 | 22 | DIGITS_MAPPINGS = { 23 | string: digit for digit, strings in enumerate(DIGITS_STR) for string in strings 24 | } 25 | 26 | 27 | def parse_digit(s: Union[str, Span], **kwargs): 28 | if isinstance(s, Span): 29 | string = get_text( 30 | s, 31 | attr=kwargs.get("attr", "TEXT"), 32 | ignore_excluded=kwargs.get("ignore_excluded", True), 33 | ) 34 | else: 35 | string = s 36 | string = string.lower().strip() 37 | try: 38 | return int(string) 39 | except ValueError: 40 | parsed = DIGITS_MAPPINGS.get(string, None) 41 | return parsed 42 | -------------------------------------------------------------------------------- /edsnlp/utils/stream_sentinels.py: -------------------------------------------------------------------------------- 1 | class StreamSentinel: 2 | pass 3 | 4 | 5 | class FragmentEndSentinel(StreamSentinel): 6 | kind = "fragment" 7 | 8 | def __init__(self, name: str): 9 | self.name = name 10 | 11 | 12 | class DatasetEndSentinel(StreamSentinel): 13 | # Singleton is important since the DatasetEndSentinel object may be passed to 14 | # other processes, i.e. pickled, depickled, while it should 15 | # always be the same object. 
16 | kind = "dataset" 17 | instance = None 18 | 19 | def __new__(cls, *args, **kwargs): 20 | if cls.instance is None: 21 | cls.instance = super().__new__(cls) 22 | return cls.instance 23 | 24 | 25 | DATASET_END_SENTINEL = DatasetEndSentinel() 26 | -------------------------------------------------------------------------------- /edsnlp/viz/__init__.py: -------------------------------------------------------------------------------- 1 | from .quick_examples import QuickExample 2 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | Check out the pipeline notebook to experiment with baseline components written in spaCy. 4 | -------------------------------------------------------------------------------- /notebooks/connectors/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 5 | sys.path.insert(0, REPO_PATH) 6 | -------------------------------------------------------------------------------- /notebooks/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | REPO_PATH = os.path.abspath(os.path.join(os.path.dirname("__file__"), "..")) 5 | sys.path.insert(0, REPO_PATH) 6 | -------------------------------------------------------------------------------- /notebooks/dates/context.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) 5 | sys.path.insert(0, REPO_PATH) 6 | -------------------------------------------------------------------------------- /notebooks/example.txt: 
import os
import sys

# Make the repository root (two levels up from notebooks/sections/)
# importable so a source checkout of edsnlp can be imported from the
# notebook.
# Fix: the original called os.path.dirname("__file__") with the string
# literal "__file__", which returns "" and therefore resolved the path
# relative to the current working directory instead of this file.
REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, REPO_PATH)
import os
import sys

# Make the repository root (two levels up from notebooks/tokenizer/)
# importable so a source checkout of edsnlp can be imported from the
# notebook.
# Fix: the original called os.path.dirname("__file__") with the string
# literal "__file__", which returns "" and therefore resolved the path
# relative to the current working directory instead of this file.
REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, REPO_PATH)
-------------------------------------------------------------------------------- 1 | """ 2 | Process CIM10 patterns. 3 | 4 | !!! warning "Watch out for the encoding" 5 | 6 | We had to convert the CIM-10 file from windows-1252 to utf-8. 7 | 8 | Source: https://www.atih.sante.fr/plateformes-de-transmission-et-logiciels/logiciels-espace-de-telechargement/id_lot/456 9 | """ # noqa 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | import typer 15 | 16 | 17 | def run( 18 | raw: Path = typer.Argument(..., help="Path to the raw file"), 19 | output: Path = typer.Option( 20 | "edsnlp/resources/cim10.csv.gz", help="Path to the output CSV table." 21 | ), 22 | ) -> None: 23 | """ 24 | Convenience script to automatically process the CIM10 terminology 25 | into a processable file. 26 | """ 27 | 28 | df = pd.read_csv(raw, sep="|", header=None) 29 | 30 | typer.echo(f"Processing {len(df)} French ICD codes...") 31 | 32 | df.columns = ["code", "type", "ssr", "psy", "short", "long"] 33 | for column in ["code", "short", "long"]: 34 | df[column] = df[column].str.strip() 35 | 36 | typer.echo(f"Saving to {output}") 37 | 38 | df.to_csv(output, index=False) 39 | 40 | typer.echo("Done !") 41 | 42 | 43 | if __name__ == "__main__": 44 | typer.run(run) 45 | -------------------------------------------------------------------------------- /scripts/conjugate_verbs.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | 4 | import context # noqa 5 | import typer 6 | 7 | from edsnlp.conjugator import conjugate 8 | from edsnlp.pipelines.qualifiers.hypothesis.patterns import verbs_eds, verbs_hyp 9 | from edsnlp.pipelines.qualifiers.negation.patterns import verbs as neg_verbs 10 | from edsnlp.pipelines.qualifiers.reported_speech.patterns import verbs as rspeech_verbs 11 | 12 | warnings.filterwarnings("ignore") 13 | 14 | 15 | def conjugate_verbs( 16 | output_path: Path = typer.Argument( 17 | 
"edsnlp/resources/verbs.csv.gz", help="Path to the output CSV table." 18 | ) 19 | ) -> None: 20 | """ 21 | Convenience script to automatically conjugate a set of verbs, 22 | using mlconjug3 library. 23 | """ 24 | 25 | all_verbs = set(neg_verbs + rspeech_verbs + verbs_eds + verbs_hyp) 26 | 27 | typer.echo(f"Conjugating {len(all_verbs)} verbs...") 28 | 29 | df = conjugate(list(all_verbs)) 30 | 31 | typer.echo(f"Saving to {output_path}") 32 | 33 | output_path.parent.mkdir(exist_ok=True, parents=True) 34 | df.to_csv(output_path, index=False) 35 | 36 | typer.echo("Done !") 37 | 38 | 39 | if __name__ == "__main__": 40 | typer.run(conjugate_verbs) 41 | -------------------------------------------------------------------------------- /scripts/context.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.insert(0, str(Path(__file__).parent.parent)) 5 | -------------------------------------------------------------------------------- /tests/connectors/test_labeltool.py: -------------------------------------------------------------------------------- 1 | from edsnlp.connectors.labeltool import docs2labeltool 2 | 3 | texts = [ 4 | "Le patient est malade", 5 | "Le patient n'est pas malade", 6 | "Le patient est peut-être malade", 7 | "Le patient dit qu'il est malade", 8 | ] 9 | 10 | 11 | def test_docs2labeltool(nlp): 12 | 13 | modifiers = ["negated", "hypothesis", "reported_speech"] 14 | 15 | docs = list(nlp.pipe(texts)) 16 | 17 | df = docs2labeltool(docs, extensions=modifiers) 18 | assert len(df) 19 | 20 | df = docs2labeltool(docs) 21 | assert len(df) 22 | -------------------------------------------------------------------------------- /tests/data/test_conll.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | from pathlib import Path 3 | 4 | import pytest 5 | from typing_extensions import Literal 6 | 7 | import edsnlp 8 
| 9 | 10 | @pytest.mark.parametrize("num_cpu_workers", [0, 2]) 11 | @pytest.mark.parametrize("shuffle", ["dataset"]) 12 | def test_read_shuffle_loop( 13 | num_cpu_workers: int, 14 | shuffle: Literal["dataset", "fragment"], 15 | ): 16 | input_file = ( 17 | Path(__file__).parent.parent.resolve() / "training" / "rhapsodie_sample.conllu" 18 | ) 19 | notes = edsnlp.data.read_conll( 20 | input_file, 21 | shuffle=shuffle, 22 | seed=42, 23 | loop=True, 24 | ).set_processing(num_cpu_workers=num_cpu_workers) 25 | notes = list(islice(notes, 6)) 26 | assert len(notes) == 6 27 | # 32 ce ce PRON _ Gender=Masc|Number=Sing|Person=3|PronType=Dem 30 obl:arg _ _ # noqa: E501 28 | word_attrs = { 29 | "text": "ce", 30 | "lemma_": "ce", 31 | "pos_": "PRON", 32 | "dep_": "obl:arg", 33 | "morph": "Gender=Masc|Number=Sing|Person=3|PronType=Dem", 34 | "head": "profité", 35 | } 36 | word = notes[0][31] 37 | for attr, val in word_attrs.items(): 38 | assert str(getattr(word, attr)) == val 39 | -------------------------------------------------------------------------------- /tests/data/test_spark.py: -------------------------------------------------------------------------------- 1 | import edsnlp 2 | 3 | 4 | def test_read_write(blank_nlp, text, df_notes_pyspark): 5 | # line below is just to mix params to avoid running too many tests 6 | shuffle = "dataset" if blank_nlp.lang == "eds" else False 7 | 8 | reader = edsnlp.data.from_spark( 9 | df_notes_pyspark, 10 | converter="omop", 11 | nlp=blank_nlp, 12 | shuffle=shuffle, 13 | ).set_processing(backend="simple") 14 | doc = list(reader)[0] 15 | assert doc.text == text 16 | 17 | blank_nlp.add_pipe("eds.matcher", config={"terms": {"douleur": ["douleurs"]}}) 18 | blank_nlp.add_pipe("eds.negation") 19 | docs = blank_nlp.pipe(reader) 20 | 21 | writer = edsnlp.data.to_spark( 22 | docs, 23 | converter="omop", 24 | span_attributes=["negation"], 25 | span_getter=["ents"], 26 | ) 27 | res = writer.toPandas().to_dict(orient="records") 28 | assert len(res) == 
20 29 | assert sum(len(r["entities"]) for r in res) == 20 30 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | import edsnlp 4 | 5 | 6 | def make_nlp(lang): 7 | if lang == "eds": 8 | model = spacy.blank("eds") 9 | else: 10 | model = edsnlp.blank("fr") 11 | 12 | model.add_pipe("eds.normalizer") 13 | 14 | model.add_pipe("eds.sentences") 15 | model.add_pipe("eds.sections") 16 | 17 | model.add_pipe( 18 | "eds.matcher", 19 | config=dict( 20 | terms=dict(patient="patient"), 21 | attr="NORM", 22 | ignore_excluded=True, 23 | ), 24 | ) 25 | model.add_pipe( 26 | "eds.matcher", 27 | name="matcher2", 28 | config=dict( 29 | regex=dict(anomalie=r"anomalie"), 30 | ), 31 | ) 32 | 33 | model.add_pipe("eds.hypothesis") 34 | model.add_pipe("eds.negation") 35 | model.add_pipe("eds.family") 36 | model.add_pipe("eds.history") 37 | model.add_pipe("eds.reported_speech") 38 | 39 | model.add_pipe("eds.dates") 40 | model.add_pipe("eds.quantities") 41 | 42 | return model 43 | -------------------------------------------------------------------------------- /tests/pipelines/core/test_terminology.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from edsnlp.core import PipelineProtocol 4 | from edsnlp.utils.examples import parse_example 5 | 6 | example = "1g de doliprane" 7 | 8 | 9 | @pytest.mark.parametrize("term_matcher", ["exact", "simstring"]) 10 | def test_terminology(blank_nlp: PipelineProtocol, term_matcher: str): 11 | blank_nlp.add_pipe( 12 | "eds.terminology", 13 | config=dict( 14 | label="drugs", 15 | terms=dict(paracetamol=["doliprane", "tylenol", "paracetamol"]), 16 | attr="NORM", 17 | term_matcher=term_matcher, 18 | ), 19 | ) 20 | 21 | text, entities = parse_example(example) 22 | 23 | doc = blank_nlp(text) 24 | 25 | assert len(entities) == len(doc.ents) 26 | 27 | for 
ent, entity in zip(doc.ents, entities): 28 | assert ent.text == text[entity.start_char : entity.end_char] 29 | assert ent.kb_id_ == entity.modifiers[0].value 30 | -------------------------------------------------------------------------------- /tests/pipelines/misc/test_consultation_date_town.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/pipelines/misc/test_consultation_date_town.py -------------------------------------------------------------------------------- /tests/pipelines/misc/test_reason.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from pytest import mark 3 | 4 | text = """COMPTE RENDU D'HOSPITALISATION du 11/07/2018 au 12/07/2018 5 | MOTIF D'HOSPITALISATION 6 | Monsieur Dupont Jean Michel, de sexe masculin, âgée de 39 ans, née le 23/11/1978, 7 | a été hospitalisé du 11/08/2019 au 17/08/2019 pour une quinte de toux. 
8 | 9 | ANTÉCÉDENTS 10 | Antécédents médicaux : 11 | Premier épisode: il a été hospitalisé pour asthme en mai 2018.""" 12 | 13 | 14 | @mark.parametrize("use_sections", [True, False]) 15 | def test_reason(lang, use_sections): 16 | nlp = spacy.blank(lang) 17 | # Extraction d'entités nommées 18 | nlp.add_pipe( 19 | "eds.matcher", 20 | config=dict( 21 | terms=dict( 22 | respiratoire=[ 23 | "asthmatique", 24 | "asthme", 25 | "toux", 26 | ] 27 | ) 28 | ), 29 | ) 30 | nlp.add_pipe("eds.normalizer") 31 | nlp.add_pipe("eds.reason", config=dict(use_sections=use_sections)) 32 | nlp.remove_pipe("eds.reason") 33 | nlp.add_pipe("eds.sections") 34 | nlp.add_pipe("eds.reason", config=dict(use_sections=use_sections)) 35 | 36 | doc = nlp(text) 37 | reason = doc.spans["reasons"][0] 38 | entities = reason._.ents_reason 39 | 40 | assert entities[0].label_ == "respiratoire" 41 | assert reason._.is_reason 42 | assert doc.ents[1]._.is_reason is not use_sections 43 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/AIDS.py: -------------------------------------------------------------------------------- 1 | results_aids = dict( 2 | has_match=[ 3 | True, 4 | False, 5 | True, 6 | True, 7 | ], 8 | detailled_status=[ 9 | None, 10 | None, 11 | None, 12 | None, 13 | ], 14 | assign=None, 15 | texts=[ 16 | "Patient atteint du VIH au stade SIDA.", 17 | "Patient atteint du VIH.", 18 | "Il y a un VIH avec coinfection pneumocystose", 19 | "Présence d'un VIH stade C", 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/CKD.py: -------------------------------------------------------------------------------- 1 | results_ckd = dict( 2 | has_match=[ 3 | True, 4 | False, 5 | True, 6 | True, 7 | False, 8 | True, 9 | False, 10 | True, 11 | True, 12 | True, 13 | False, 14 | ], 15 | detailled_status=None, 16 | assign=8 * [None] + [{"stage": "IV"}, {"dfg": 30}, None], 17 
| texts=[ 18 | "Patient atteint d'une glomérulopathie.", 19 | "Patient atteint d'une tubulopathie aigüe.", 20 | "Patient transplanté rénal", 21 | "Présence d'une insuffisance rénale aigüe sur chronique", 22 | "Le patient a été dialysé", # ponctuelle 23 | "Le patient est dialysé chaque lundi", # chronique 24 | "Présence d'une IRC", # severity non mentionned 25 | "Présence d'une IRC sévère", 26 | "Présence d'une IRC de classe IV", 27 | "Présence d'une IRC avec DFG à 30", # severe 28 | "Présence d'une maladie rénale avec DFG à 110", # no renal failure 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/COPD.py: -------------------------------------------------------------------------------- 1 | results_copd = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | True, 6 | False, 7 | False, 8 | True, 9 | ], 10 | detailled_status=None, 11 | assign=None, 12 | texts=[ 13 | "Une fibrose interstitielle diffuse idiopathique", 14 | "Patient atteint de pneumoconiose", 15 | "Présence d'une HTAP.", 16 | "On voit une hypertension pulmonaire minime", 17 | "La patiente a été mis sous oxygénorequérance", # Ponctual: not extracted 18 | "La patiente est sous oxygénorequérance au long cours", 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/alcohol.py: -------------------------------------------------------------------------------- 1 | results_alcohol = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | False, 7 | True, 8 | True, 9 | True, 10 | True, 11 | True, 12 | ], 13 | detailled_status=[ 14 | None, 15 | None, 16 | None, 17 | None, 18 | "ABSTINENCE", 19 | None, 20 | None, 21 | "ABSTINENCE", 22 | None, 23 | ], 24 | negation=[ 25 | None, 26 | None, 27 | None, 28 | None, 29 | None, 30 | None, 31 | True, 32 | None, 33 | True, 34 | ], 35 | assign=None, 36 | texts=[ 37 | "Patient alcoolique.", 38 | "OH chronique.", 39 | "Prise 
d'alcool occasionnelle", 40 | "Application d'un pansement alcoolisé", 41 | "Alcoolisme sevré", 42 | "Alcoolisme non sevré", 43 | "Alcool: 0", 44 | "Le patient est en cours de sevrage éthylotabagique", 45 | "Patient alcoolique: non.", 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/cerebrovascular_accident.py: -------------------------------------------------------------------------------- 1 | results_cerebrovascular_accident = dict( 2 | has_match=[ 3 | False, 4 | True, 5 | True, 6 | False, 7 | True, 8 | True, 9 | True, 10 | ], 11 | detailled_status=None, 12 | assign=None, 13 | texts=[ 14 | "Patient hospitalisé à AVC.", 15 | "Hospitalisation pour un AVC.", 16 | "Saignement intracranien", 17 | "Thrombose périphérique", 18 | "Thrombose sylvienne", 19 | "Infarctus cérébral", 20 | "Soigné via un thrombolyse", 21 | ], 22 | ) 23 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/congestive_heart_failure.py: -------------------------------------------------------------------------------- 1 | results_congestive_heart_failure = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | False, 8 | ], 9 | detailled_status=None, 10 | assign=None, 11 | texts=[ 12 | "Présence d'un oedème pulmonaire", 13 | "Le patient est équipé d'un pace-maker", 14 | "Un cardiopathie non décompensée", # no decompensation 15 | "Insuffisance cardiaque", 16 | "Insuffisance cardiaque minime", # minimal severity 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/connective_tissue_disease.py: -------------------------------------------------------------------------------- 1 | results_connective_tissue_disease = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | True, 8 | ], 9 | detailled_status=None, 10 | assign=None, 11 | texts=[ 12 | "Présence d'une 
sclérodermie.", 13 | "Patient atteint d'un lupus.", 14 | "Présence d'anticoagulants lupiques,", 15 | "Il y a une MICI.", 16 | "Syndrome de Raynaud", 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/dementia.py: -------------------------------------------------------------------------------- 1 | results_dementia = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | ], 8 | detailled_status=None, 9 | assign=None, 10 | texts=[ 11 | "D'importants déficits cognitifs", 12 | "Patient atteint de démence", 13 | "On retrouve des anti-SLA", # antibody 14 | "Une maladie de Charcot", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/diabetes.py: -------------------------------------------------------------------------------- 1 | results_diabetes = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | True, 6 | False, 7 | True, 8 | True, 9 | True, 10 | ], 11 | detailled_status=[ 12 | "WITHOUT_COMPLICATION", 13 | "WITHOUT_COMPLICATION", 14 | "WITHOUT_COMPLICATION", 15 | None, 16 | "WITH_COMPLICATION", 17 | "WITH_COMPLICATION", 18 | "WITH_COMPLICATION", 19 | ], 20 | assign=None, 21 | texts=[ 22 | "Présence d'un DT2", 23 | "Présence d'un DNID", 24 | "Patient diabétique", 25 | "Un diabète insipide", 26 | "Atteinte neurologique d'origine diabétique", 27 | "Une rétinopathie diabétique", 28 | "Il y a un mal perforant plantaire", 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/hemiplegia.py: -------------------------------------------------------------------------------- 1 | results_hemiplegia = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | True, 6 | ], 7 | detailled_status=None, 8 | assign=None, 9 | texts=[ 10 | "Patient hémiplégique", 11 | "Paralysie des membres inférieurs", 12 | "Patient en LIS", # locked-in syndrom 13 | ], 14 | ) 15 
| -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/leukemia.py: -------------------------------------------------------------------------------- 1 | results_leukemia = dict( 2 | has_match=[ 3 | True, 4 | False, 5 | True, 6 | True, 7 | ], 8 | detailled_status=None, 9 | assign=None, 10 | texts=[ 11 | "Sydrome myéloprolifératif", 12 | "Sydrome myéloprolifératif bénin", 13 | "Patient atteint d'une LAM", 14 | "Une maladie de Vaquez", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/liver_disease.py: -------------------------------------------------------------------------------- 1 | results_liver_disease = dict( 2 | has_match=4 * [True], 3 | detailled_status=[ 4 | "MILD", 5 | "MILD", 6 | "MODERATE_TO_SEVERE", 7 | "MODERATE_TO_SEVERE", 8 | ], 9 | assign=None, 10 | texts=[ 11 | "Il y a une fibrose hépatique", 12 | "Une hépatite B chronique", 13 | "Le patient consulte pour une cirrhose", 14 | "Greffe hépatique.", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/lymphoma.py: -------------------------------------------------------------------------------- 1 | results_lymphoma = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | True, 6 | False, 7 | ], 8 | detailled_status=None, 9 | assign=None, 10 | texts=[ 11 | "Un lymphome de Hodgkin.", 12 | "Atteint d'un Waldenstörm", 13 | "Un LAGC", 14 | "anti LAGC: 10^4/mL", # Dosage 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/myocardial_infarction.py: -------------------------------------------------------------------------------- 1 | results_myocardial_infarction = dict( 2 | has_match=[ 3 | True, 4 | False, 5 | True, 6 | False, 7 | True, 8 | ], 9 | detailled_status=None, 10 | assign=None, 11 | texts=[ 12 | "Une cardiopathie 
ischémique", 13 | "Une cardiopathie non-ischémique", 14 | "Présence d'un stent sur la marginale", 15 | "Présence d'un stent périphérique", 16 | "infarctus du myocarde", 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/peptic_ulcer_disease.py: -------------------------------------------------------------------------------- 1 | results_peptic_ulcer_disease = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | ], 8 | detailled_status=None, 9 | assign=None, 10 | texts=[ 11 | "Beaucoup d'ulcères gastriques", 12 | "Présence d'UGD", 13 | "La patient à des ulcères", 14 | "Au niveau gastrique: " + 5 * "blabla " + "quelques ulcères", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/peripheral_vascular_disease.py: -------------------------------------------------------------------------------- 1 | results_peripheral_vascular_disease = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | False, 8 | False, 9 | True, 10 | False, 11 | True, 12 | True, 13 | False, 14 | True, 15 | False, 16 | ], 17 | detailled_status=None, 18 | assign=None, 19 | texts=[ 20 | "Un AOMI", 21 | "Présence d'un infarctus rénal", 22 | "Une angiopathie cérébrale", 23 | "Une angiopathie", 24 | "Une thrombose cérébrale", 25 | "Une thrombose des veines superficielles", 26 | "Une thrombose", 27 | "Effectuer un bilan pre-trombose", 28 | "Une ischémie des MI est remarquée.", 29 | "Plusieurs cas d'EP", 30 | "Effectuer des cures d'EP", 31 | "Le patient est hypertendu", # Echange plasmatique 32 | "Une hypertension portale", 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/solid_tumor.py: -------------------------------------------------------------------------------- 1 | results_solid_tumor = dict( 2 | has_match=[True, True, False, True, 
True, True, True, True, True], 3 | detailled_status=[ 4 | "LOCALIZED", 5 | "LOCALIZED", 6 | None, 7 | "METASTASIS", 8 | "METASTASIS", 9 | "LOCALIZED", 10 | "METASTASIS", 11 | "METASTASIS", 12 | "METASTASIS", 13 | "METASTASIS", 14 | ], 15 | assign=None, 16 | texts=[ 17 | "Présence d'un carcinome intra-hépatique.", 18 | "Patient avec un K sein.", 19 | "Il y a une tumeur bénigne", 20 | "Tumeur métastasée", 21 | "Cancer du poumon au stade 4", 22 | "Cancer du poumon au stade 2", 23 | "Présence de nombreuses lésions secondaires", 24 | "Patient avec fracture abcddd secondaire. Cancer de", 25 | "Patient avec lesions non ciblées", 26 | "TNM: pTx N1 M1", 27 | ], 28 | ) 29 | 30 | solid_tumor_config = dict(use_patterns_metastasis_ct_scan=True, use_tnm=True) 31 | -------------------------------------------------------------------------------- /tests/pipelines/ner/disorders/tobacco.py: -------------------------------------------------------------------------------- 1 | results_tobacco = dict( 2 | has_match=[ 3 | True, 4 | True, 5 | False, 6 | True, 7 | True, 8 | True, 9 | True, 10 | True, 11 | True, 12 | ], 13 | detailled_status=[ 14 | None, 15 | None, 16 | None, 17 | "ABSTINENCE", 18 | None, 19 | None, 20 | "ABSTINENCE", 21 | None, 22 | None, 23 | ], 24 | negation=[ 25 | None, 26 | None, 27 | None, 28 | None, 29 | True, 30 | True, 31 | None, 32 | True, 33 | True, 34 | ], 35 | assign=[{"PA": 15}] + 8 * [None], 36 | texts=[ 37 | "Tabagisme évalué à 15 PA", 38 | "Patient tabagique", 39 | "Tabagisme festif", 40 | "On a un tabagisme ancien", 41 | "Tabac: 0", 42 | "Tabagisme passif", 43 | "Tabac: sevré depuis 5 ans", 44 | "Le patient ne fume aucun truc.", 45 | "Le patient fume 0 PA.", 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/pipelines/ner/test_cim10.py: -------------------------------------------------------------------------------- 1 | from edsnlp.core import PipelineProtocol 2 | from edsnlp.utils.examples import 
parse_example 3 | 4 | examples = [ 5 | "Patient admis pour fièvres typhoïde et paratyphoïde", 6 | "Patient admis pour C2.21", 7 | ] 8 | 9 | 10 | def test_cim10(blank_nlp: PipelineProtocol): 11 | blank_nlp.add_pipe("eds.cim10") 12 | 13 | for text, entities in map(parse_example, examples): 14 | doc = blank_nlp(text) 15 | 16 | assert len(doc.ents) == len(entities) 17 | 18 | for ent, entity in zip(doc.ents, entities): 19 | assert ent.text == text[entity.start_char : entity.end_char] 20 | assert ent.kb_id_ == entity.modifiers[0].value 21 | -------------------------------------------------------------------------------- /tests/pipelines/ner/test_covid.py: -------------------------------------------------------------------------------- 1 | def test_covid(blank_nlp): 2 | examples = [ 3 | ("Patient admis pour coronavirus", "coronavirus"), 4 | ("Patient admis pour pneumopathie à coronavirus", "pneumopathie à coronavirus"), 5 | ] 6 | 7 | blank_nlp.add_pipe("eds.covid") 8 | 9 | for example, text in examples: 10 | doc = blank_nlp(example) 11 | 12 | covid = doc.ents[0] 13 | assert covid.text == text 14 | -------------------------------------------------------------------------------- /tests/pipelines/ner/test_drugs.py: -------------------------------------------------------------------------------- 1 | def test_drugs(blank_nlp): 2 | blank_nlp.add_pipe("eds.normalizer") 3 | blank_nlp.add_pipe("eds.drugs") 4 | 5 | text = "Traitement habituel: Kardégic, cardensiel (bisoprolol), glucophage, lasilix" 6 | doc = blank_nlp(text) 7 | drugs_expected = [ 8 | ("Kardégic", "B01AC06"), 9 | ("cardensiel", "C07AB07"), 10 | ("bisoprolol", "C07AB07"), 11 | ("glucophage", "A10BA02"), 12 | ("lasilix", "C03CA01"), 13 | ] 14 | drugs_detected = [(x.text, x.kb_id_) for x in doc.ents] 15 | assert drugs_detected == drugs_expected 16 | -------------------------------------------------------------------------------- /tests/pipelines/ner/test_tnm.py: 
-------------------------------------------------------------------------------- 1 | from edsnlp.utils.examples import parse_example 2 | 3 | examples = [ 4 | "TNM: aTxN1M0", 5 | "TNM: p Tx N1M 0", 6 | "TNM: p Tx N1M 0 (UICC 20)", 7 | "TNM: aTxN1M0 (UICC 68)", 8 | "TNM: aTxN1 R2", 9 | "TNM: pT2c N0 R0 (TNM 2010)", 10 | "TNM: aTx / N1 / M0", 11 | "TNM: pT2 N1mi", 12 | "TNM: pT1(m)N1 M0", 13 | "TNM: pT1bN0(sn)", 14 | "TNM: pT1 pN1 M0\n \n ", 15 | "TNM: aTxN1M0 ", 16 | "TNM: cT3N0M0 \n \n", 17 | "TNM: PT", 18 | "TNM: p T \n", 19 | "TNM: a T \n", 20 | "TNM: pT \n \n0", 21 | ] 22 | 23 | 24 | def test_scores(blank_nlp): 25 | blank_nlp.add_pipe("eds.tnm") 26 | 27 | for example in examples: 28 | text, entities = parse_example(example=example) 29 | 30 | doc = blank_nlp(text) 31 | 32 | assert len(entities) == len(doc.ents) 33 | 34 | for entity, ent in zip(entities, doc.ents): 35 | norm = entity.modifiers[0].value 36 | assert ent.text == text[entity.start_char : entity.end_char] 37 | assert norm == ent._.value.norm() 38 | -------------------------------------------------------------------------------- /tests/pipelines/qualifiers/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | 4 | @fixture(params=[True, False]) 5 | def blank_nlp(blank_nlp, request, lang): 6 | if request.param: 7 | blank_nlp.add_pipe("normalizer") 8 | return blank_nlp 9 | -------------------------------------------------------------------------------- /tests/pipelines/test_pipelines.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import edsnlp 4 | 5 | 6 | def test_pipelines(doc): 7 | assert len(doc.ents) == 3 8 | patient, _, anomalie = doc.ents 9 | 10 | assert not patient._.negation 11 | assert anomalie._.negation 12 | 13 | assert not doc[0]._.history 14 | 15 | 16 | def test_import_all(): 17 | import edsnlp.pipes 18 | 19 | for name in dir(edsnlp.pipes): 20 | if not 
name.startswith("_") and "endlines" not in name: 21 | try: 22 | getattr(edsnlp.pipes, name) 23 | except (ImportError, AttributeError) as e: 24 | if "torch" in str(e): 25 | pass 26 | 27 | 28 | def test_non_existing_pipe(): 29 | with pytest.raises(AttributeError) as e: 30 | getattr(edsnlp.pipes, "non_existing_pipe") 31 | 32 | assert str(e.value) == "module edsnlp.pipes has no attribute non_existing_pipe" 33 | -------------------------------------------------------------------------------- /tests/readme.md: -------------------------------------------------------------------------------- 1 | # Testing the algorithm 2 | 3 | Various tests for the components of the spaCy pipeline. 4 | 5 | We decided to design tests entity-wise, meaning that we only check the validity 6 | of the computed modality on a set of entities. This design choice is motivated by 7 | the fact that : 8 | 9 | 1. That's what we actually care about. We want our pipeline to detect negation, 10 | family context, patient history and hypothesis relative to a given entity. 11 | 12 | 2. Deciding on the span of an annotation (negated, hypothesis, etc) is tricky. 13 | Consider the example : `"Le patient n'est pas malade."`. Should the negated span 14 | correspond to `["est", "malade"]`, `["malade"]`, `["n'", "est", "pas", "malade", "."]` ? 15 | 16 | 3. Depending on the design of the algorithm, the span might be off, even though it 17 | can correctly assign polarity to a given entity (but considered that the punctuation 18 | was negated as well). 19 | By relaxing the need to infer the correct span, we avoid giving an unfair disadvantage 20 | to an otherwise great algorithm. 
21 | -------------------------------------------------------------------------------- /tests/resources/brat_data/subfolder/doc-1.ann: -------------------------------------------------------------------------------- 1 | R1 lieu Arg1:T8 Arg2:T9 2 | T1 sosy 30 38 douleurs 3 | T2 localisation 39 57 dans le bras droit 4 | T3 anatomie 47 57 bras droit 5 | T4 pathologie 75 83;85 98 problème de locomotion 6 | A1 assertion T4 absent 7 | A9 bool flag 0 T4 8 | T5 pathologie 114 117 AVC 9 | A2 etat T5 passé 10 | A3 assertion T5 non-associé 11 | T6 pathologie 159 164 rhume 12 | A4 etat T6 présent 13 | A5 assertion T6 hypothétique 14 | T7 pathologie 291 296 rhume 15 | A6 etat T7 présent 16 | A7 assertion T7 hypothétique 17 | T8 sosy 306 314 Douleurs 18 | T9 localisation 315 333 dans le bras droit 19 | T10 anatomie 323 333 bras droit 20 | T11 sosy 378 386 anomalie 21 | #1 AnnotatorNotes T7 Repetition 22 | R2 lieu Arg1:T1 Arg2:T2 23 | A8 assertion T11 absent 24 | E1 MyArg1:T3 MyArg2:T1 25 | E2 MyArg1:T1 MyArg2:E1 26 | T12 test label 0 378 386 anomalie 27 | #1 AnnotatorNotes T1 C0030193 28 | -------------------------------------------------------------------------------- /tests/resources/brat_data/subfolder/doc-1.txt: -------------------------------------------------------------------------------- 1 | Le patient est admis pour des douleurs dans le bras droit, mais n'a pas de problème 2 | de locomotion. 3 | Historique d'AVC dans la famille. pourrait être un cas de rhume. 4 | NBNbWbWbNbWbNBNbNbWbWbNBNbWbNbNbWbNBNbWbNbNBWbWbNbNbNBWbNbWbNbWBNbNbWbNbNBNbWbWbNbWBNbNbWbNBNbWbWbNb 5 | Pourrait être un cas de rhume. 6 | Motif : 7 | Douleurs dans le bras droit. 8 | ANTÉCÉDENTS 9 | Le patient est déjà venu 10 | Pas d'anomalie détectée. 
11 | -------------------------------------------------------------------------------- /tests/resources/brat_data/subfolder/doc-2.txt: -------------------------------------------------------------------------------- 1 | Small text 2 | -------------------------------------------------------------------------------- /tests/resources/brat_data/subfolder/doc-3.txt: -------------------------------------------------------------------------------- 1 | Another small text 2 | -------------------------------------------------------------------------------- /tests/resources/docs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/resources/docs.parquet -------------------------------------------------------------------------------- /tests/test_entrypoints.py: -------------------------------------------------------------------------------- 1 | import catalogue 2 | import pytest 3 | 4 | try: 5 | from importlib.metadata import entry_points 6 | except ImportError: 7 | from importlib_metadata import entry_points 8 | 9 | try: 10 | import torch.nn 11 | except ImportError: 12 | torch = None 13 | 14 | if torch is None: 15 | pytest.skip("torch not installed", allow_module_level=True) 16 | 17 | 18 | def test_entrypoints(): 19 | ep = entry_points() 20 | namespaces = ep.groups if hasattr(ep, "groups") else ep.keys() 21 | for ns in namespaces: 22 | if ns.startswith("spacy_") or ns.startswith("edsnlp_"): 23 | reg = catalogue.Registry(ns.split("_"), entry_points=True) 24 | reg.get_all() 25 | -------------------------------------------------------------------------------- /tests/test_span_args.py: -------------------------------------------------------------------------------- 1 | from confit import validate_arguments 2 | 3 | from edsnlp.pipes.base import ( 4 | SpanGetterArg, 5 | SpanSetterArg, 6 | validate_span_getter, 7 | validate_span_setter, 8 | ) 9 | 10 | 11 | 
def test_span_getter(): 12 | assert validate_span_getter("ents") == {"ents": True} 13 | assert validate_span_getter(["ents"]) == {"ents": True} 14 | assert validate_span_getter(["ents", "group"]) == {"ents": True, "group": True} 15 | assert validate_span_getter({"grp": True}) == {"grp": True} 16 | assert validate_span_getter({"grp": ["a", "b", "c"]}) == {"grp": ["a", "b", "c"]} 17 | 18 | 19 | def test_span_setter(): 20 | assert validate_span_setter("ents") == {"ents": True} 21 | assert validate_span_setter(["ents"]) == {"ents": True} 22 | assert validate_span_setter(["ents", "group"]) == {"ents": True, "group": True} 23 | assert validate_span_setter({"grp": True}) == {"grp": True} 24 | assert validate_span_setter({"grp": ["a", "b", "c"]}) == {"grp": ["a", "b", "c"]} 25 | 26 | 27 | def test_validate_args(): 28 | @validate_arguments 29 | def my_func(span_getter: SpanGetterArg, span_setter: SpanSetterArg): 30 | return span_getter, span_setter 31 | 32 | assert my_func("ents", "ents") == ({"ents": True}, {"ents": True}) 33 | -------------------------------------------------------------------------------- /tests/training/dataset.jsonl: -------------------------------------------------------------------------------- 1 | {"note_id": "1", "note_text": "Pas de cancer chez le patient ou sa famille.\nOn trouve un nodule superieur centimétrique droit évocateur de fibroanédome.", "entities": [{"start": 7, "end": 13, "label": "sosy", "negation": true}, {"start": 58, "end": 64, "label": "sosy", "negation": false}, {"start": 75, "end": 88, "label": "measure", "unit": "cm"}, {"start": 108, "end": 120, "label": "sosy", "negation": false}]} 2 | {"note_id": "2", "note_text": "La patiente a un gros rhume, sans fièvre ou douleur thoracique. 
Elle fait 30 kg.", "entities": [{"start": 22, "end": 27, "label": "sosy", "negation": false}, {"start": 34, "end": 40, "label": "sosy", "negation": true}, {"start": 44, "end": 62, "label": "sosy", "negation": true}, {"start": 74, "end": 79, "label": "measure", "unit": "kg"}]} 3 | -------------------------------------------------------------------------------- /tests/training/dataset/annotation.conf: -------------------------------------------------------------------------------- 1 | [entities] 2 | 3 | sosy 4 | measure 5 | 6 | [attributes] 7 | 8 | negation Arg:sosy 9 | unit Arg:measure, Value:cm|kg 10 | 11 | [relations] 12 | 13 | [events] 14 | -------------------------------------------------------------------------------- /tests/training/dataset/sample-1.ann: -------------------------------------------------------------------------------- 1 | T1 sosy 7 13 cancer 2 | A1 negation T1 3 | T2 sosy 58 64 nodule 4 | T3 measure 75 88 centimétrique 5 | T4 sosy 108 120 fibroanédome 6 | A2 unit T3 cm 7 | -------------------------------------------------------------------------------- /tests/training/dataset/sample-1.txt: -------------------------------------------------------------------------------- 1 | Pas de cancer chez le patient ou sa famille. 2 | On trouve un nodule superieur centimétrique droit évocateur de fibroanédome. 3 | -------------------------------------------------------------------------------- /tests/training/dataset/sample-2.ann: -------------------------------------------------------------------------------- 1 | T1 sosy 22 27 rhume 2 | T2 sosy 34 40 fièvre 3 | T3 sosy 44 62 douleur thoracique 4 | A1 negation T2 5 | A2 negation T3 6 | T4 measure 74 79 30 kg 7 | A3 unit T4 kg 8 | -------------------------------------------------------------------------------- /tests/training/dataset/sample-2.txt: -------------------------------------------------------------------------------- 1 | La patiente a un gros rhume, sans fièvre ou douleur thoracique. 
Elle fait 30 kg. 2 | -------------------------------------------------------------------------------- /tests/training/dep_parser_config.yml: -------------------------------------------------------------------------------- 1 | # 🤖 PIPELINE DEFINITION 2 | nlp: 3 | "@core": pipeline 4 | 5 | lang: fr 6 | 7 | components: 8 | parser: 9 | '@factory': eds.biaffine_dep_parser 10 | hidden_size: 64 11 | decoding_mode: greedy 12 | dropout_p: 0. 13 | use_attrs: ['pos_'] 14 | 15 | embedding: 16 | '@factory': eds.transformer 17 | model: hf-internal-testing/tiny-bert 18 | window: 512 19 | stride: 256 20 | 21 | # 📈 SCORERS 22 | scorer: 23 | speed: false 24 | dep: 25 | '@metrics': "eds.dep_parsing" 26 | 27 | # 🎛️ OPTIMIZER 28 | optimizer: 29 | optim: adamw 30 | module: ${ nlp } 31 | total_steps: ${ train.max_steps } 32 | groups: 33 | ".*": 34 | lr: 1e-3 35 | 36 | # 📚 DATA 37 | train_data: 38 | data: 39 | "@readers": conll 40 | path: ./rhapsodie_sample.conllu 41 | shuffle: dataset 42 | batch_size: 1 docs 43 | pipe_names: [ "parser" ] 44 | 45 | val_data: 46 | "@readers": conll 47 | path: ./rhapsodie_sample.conllu 48 | 49 | # 🚀 TRAIN SCRIPT OPTIONS 50 | train: 51 | nlp: ${ nlp } 52 | train_data: ${ train_data } 53 | val_data: ${ val_data } 54 | max_steps: 20 55 | validation_interval: 10 56 | max_grad_norm: 5.0 57 | scorer: ${ scorer } 58 | num_workers: 0 59 | optimizer: ${ optimizer } 60 | grad_dev_policy: "clip_mean" 61 | log_weight_grads: true 62 | -------------------------------------------------------------------------------- /tests/tuning/config.cfg: -------------------------------------------------------------------------------- 1 | # My usefull comment 2 | [train] 3 | param1 = 1 4 | -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/single_phase_gpu_hour/study_.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/tuning/test_checkpoints/single_phase_gpu_hour/study_.pkl -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/single_phase_n_trials/study_.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/tuning/test_checkpoints/single_phase_n_trials/study_.pkl -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/two_phase_gpu_hour/results_summary.txt: -------------------------------------------------------------------------------- 1 | Study Summary 2 | ================== 3 | Best trial: 2 4 | 5 | Value: 0.7674011016524788 6 | 7 | Params: 8 | start_value: 0.00017235427021406453 9 | warmup_rate: 0.1 10 | 11 | Importances: 12 | start_value: 0.7 13 | warmup_rate: 0.3 14 | -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/two_phase_gpu_hour/study_.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/tuning/test_checkpoints/two_phase_gpu_hour/study_.pkl -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/two_phase_n_trials/results_summary.txt: -------------------------------------------------------------------------------- 1 | Study Summary 2 | ================== 3 | Best trial: 2 4 | 5 | Value: 0.7674011016524788 6 | 7 | Params: 8 | start_value: 0.00017235427021406453 9 | warmup_rate: 0.1 10 | 11 | Importances: 12 | start_value: 0.7 13 | warmup_rate: 0.3 14 | -------------------------------------------------------------------------------- /tests/tuning/test_checkpoints/two_phase_n_trials/study_.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edsnlp/8e9ed84f56e6af741023e8b3a9de38ba93912953/tests/tuning/test_checkpoints/two_phase_n_trials/study_.pkl -------------------------------------------------------------------------------- /tests/utils/test_bindings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from confit import validate_arguments 3 | from confit.errors import ConfitValidationError 4 | 5 | from edsnlp.utils.bindings import BINDING_GETTERS, BINDING_SETTERS, AttributesArg 6 | 7 | 8 | def test_qualifier_validation(): 9 | @validate_arguments 10 | def fn(arg: AttributesArg): 11 | return arg 12 | 13 | assert fn("_.negated") == {"_.negated": True} 14 | assert fn(["_.negated", "_.event"]) == {"_.negated": True, "_.event": True} 15 | assert fn({"_.negated": True, "_.event": "DATE"}) == { 16 | "_.negated": True, 17 | "_.event": ["DATE"], 18 | } 19 | 20 | callback = lambda x: x # noqa: E731 21 | 22 | assert fn(callback) is callback 23 | 24 | with pytest.raises(ConfitValidationError): 25 | fn(1) 26 | 27 | with pytest.raises(ConfitValidationError): 28 | fn({"_.negated": 1}) 29 | 30 | 31 | def test_bindings(): 32 | class custom: 33 | def __init__(self, value): 34 | self.value = value 35 | 36 | obj = custom([custom(1), custom(2)]) 37 | assert BINDING_GETTERS["value[0].value"](obj) == 1 38 | assert BINDING_GETTERS[("value[0].value", 1)](obj) is True 39 | BINDING_SETTERS[("value[1].value", 3)](obj) 40 | assert obj.value[1].value == 3 41 | -------------------------------------------------------------------------------- /tests/utils/test_filter.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Doc, Span 2 | 3 | from edsnlp.utils.filter import filter_spans 4 | 5 | 6 | def test_filter_spans(doc: Doc): 7 | spans = [ 8 | doc[0:3], 9 | doc[0:4], 10 | doc[1:2], 11 | doc[0:2], 12 | 
doc[0:3], 13 | ] 14 | 15 | filtered = filter_spans(spans) 16 | 17 | assert len(filtered) == 1 18 | assert len(filtered[0]) == 4 19 | 20 | 21 | def test_filter_spans_strict_nesting(doc: Doc): 22 | spans = [ 23 | doc[0:5], 24 | doc[1:4], 25 | ] 26 | 27 | filtered = filter_spans(spans) 28 | 29 | assert len(filtered) == 1 30 | assert len(filtered[0]) == 5 31 | 32 | 33 | def test_label_to_remove(doc: Doc): 34 | 35 | spans = [ 36 | Span(doc, 0, 5, label="test"), 37 | Span(doc, 6, 10, label="test"), 38 | Span(doc, 6, 10, label="remove"), 39 | ] 40 | 41 | filtered = filter_spans(spans, label_to_remove="remove") 42 | 43 | assert len(filtered) == 2 44 | 45 | spans = [ 46 | Span(doc, 6, 10, label="remove"), 47 | Span(doc, 0, 5, label="test"), 48 | Span(doc, 6, 10, label="test"), 49 | ] 50 | 51 | filtered = filter_spans(spans, label_to_remove="remove") 52 | 53 | assert len(filtered) == 1 54 | -------------------------------------------------------------------------------- /tests/utils/test_typing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from confit import validate_arguments 3 | from confit.errors import ConfitValidationError 4 | 5 | from edsnlp.utils.typing import AsList 6 | 7 | 8 | def test_as_list(): 9 | @validate_arguments 10 | def func(a: AsList[int]): 11 | return a 12 | 13 | assert func("1") == [1] 14 | 15 | with pytest.raises(ConfitValidationError) as e: 16 | func("a") 17 | 18 | assert ( 19 | "1 validation error for test_typing.test_as_list..func()\n" "-> a.0\n" 20 | ) in str(e.value) 21 | --------------------------------------------------------------------------------