├── docs ├── .gitinclude ├── Gemfile ├── scispacy-logo.png ├── scispacy-logo-square.png ├── _config.yml ├── index.md └── example.svg ├── tests ├── __init__.py ├── fixtures │ ├── umls_META │ │ ├── MRDEF.RRF │ │ ├── MRSTY.RRF │ │ ├── MRCONSO.RRF │ │ └── MRFILES.RRF │ ├── test_umls_tree.tsv │ ├── ner_test.tsv │ ├── med_mentions.txt │ └── umls_test_fixture.json ├── custom_tests │ ├── data_fixtures │ │ ├── test.pmids │ │ └── raw │ │ │ ├── 9170401.txt │ │ │ └── 9171236.txt │ ├── test_whitespace.py │ ├── test_all_model.py │ └── test_custom_tokenizer.py ├── test_util.py ├── test_per_class_scorer.py ├── test_umls_semantic_type_tree.py ├── test_hyponym_detector.py ├── test_umls_utils.py ├── test_candidate_generation.py ├── test_file_cache.py ├── test_linking.py ├── test_data_util.py ├── conftest.py └── test_abbreviation_detection.py ├── pytest.ini ├── scispacy ├── __init__.py ├── umls_linking.py ├── version.py ├── consts.py ├── train_utils.py ├── util.py ├── custom_sentence_segmenter.py ├── base_project_code.py ├── per_class_scorer.py ├── linking_utils.py ├── umls_semantic_type_tree.py ├── hyponym_detector.py ├── file_cache.py ├── custom_tokenizer.py ├── linking.py ├── umls_utils.py ├── abbreviation.py └── data_util.py ├── scripts ├── mypy.sh ├── create_linker.py ├── evaluate_ner.py ├── convert_freqs.py ├── count_word_frequencies.py └── export_umls_json.py ├── MANIFEST.in ├── data ├── meta_large.json ├── meta_small.json ├── meta_medium.json ├── craft_ner.json ├── bc5cdr_ner.json ├── jnlpba_ner.json ├── bionlp13cg_ner.json └── meta_scibert.json ├── requirements.in ├── .github └── workflows │ ├── main.yml │ └── publish.yml ├── .flake8 ├── Dockerfile ├── RELEASE.md ├── .gitignore ├── setup.py ├── configs ├── base_ner.cfg ├── base_specialized_ner.cfg ├── base_ner_scibert.cfg ├── base_parser_tagger.cfg └── base_parser_tagger_scibert.cfg └── evaluation └── sentence_splitting_evaluation.py /docs/.gitinclude: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests/ 3 | -------------------------------------------------------------------------------- /scispacy/__init__.py: -------------------------------------------------------------------------------- 1 | from scispacy.version import VERSION as __version__ 2 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | 2 | source 'https://rubygems.org' 3 | 4 | gem "github-pages", group: :jekyll_plugins 5 | 6 | -------------------------------------------------------------------------------- /docs/scispacy-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/scispacy/main/docs/scispacy-logo.png -------------------------------------------------------------------------------- /scripts/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Run type checking over the python code. 
3 | 4 | mypy scispacy --ignore-missing-imports 5 | -------------------------------------------------------------------------------- /docs/scispacy-logo-square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/scispacy/main/docs/scispacy-logo-square.png -------------------------------------------------------------------------------- /scispacy/umls_linking.py: -------------------------------------------------------------------------------- 1 | # Kept for backward compatibility. 2 | from scispacy.linking import EntityLinker as UmlsEntityLinker # noqa: F401 3 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRDEF.RRF: -------------------------------------------------------------------------------- 1 | C0000039|A0016515|AT38152019||MSH|Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes.|N|| -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal 2 | description: SpaCy models for biomedical text processing 3 | show_downloads: true 4 | logo: /scispacy-logo-square.png 5 | -------------------------------------------------------------------------------- /scispacy/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = "0" 2 | _MINOR = "4" 3 | _REVISION = "0" 4 | 5 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) 6 | VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION) 7 | -------------------------------------------------------------------------------- /tests/fixtures/test_umls_tree.tsv: -------------------------------------------------------------------------------- 1 | Event T051 1 2 | Activity T052 2 3 | Behavior T053 3 4 | Social Behavior T054 4 5 | Individual Behavior T055 4 6 | Daily or Recreational Activity T056 3 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | 3 | include LICENSE 4 | include README.md 5 | include requirements.in 6 | recursive-include data * 7 | recursive-exclude proto_model * 8 | recursive-exclude scispacy/models * 9 | recursive-exclude * __pycache__ 10 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRSTY.RRF: -------------------------------------------------------------------------------- 1 | C0000005|T116|A1.4.1.2.1.7|Amino Acid, Peptide, or Protein|AT17648347|256| 2 | C0000039|T109|A1.4.1.2.1|Organic Chemical|AT45562015|256| 3 | C0000039|T121|A1.4.1.1.1|Pharmacologic Substance|AT17567371|256| -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/test.pmids: -------------------------------------------------------------------------------- 1 | 9170401 2 | 9170401 3 | 9170401 4 | 9170401 5 | 9170401 6 | 9170401 7 | 9170401 8 | 9170401 9 | 9170401 10 | 9170401 11 | 9170401 12 | 9170401 13 | 9170401 14 | 9171236 15 | 9171236 16 | 9171236 17 | 9171236 18 | 9171236 19 | 9171236 20 | 9171236 21 | 9171236 22 | 9171236 23 | 9171236 24 | -------------------------------------------------------------------------------- /data/meta_large.json: --------------------------------------------------------------------------------
1 | { 2 | "lang":"en", 3 | "name":"core_sci_lg", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/meta_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"core_sci_sm", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRCONSO.RRF: -------------------------------------------------------------------------------- 1 | C0000005|ENG|P|L0000005|PF|S0007492|Y|A26634265||M0019694|D012711|MSH|PEP|D012711|(131)I-Macroaggregated Albumin|0|N|256| 2 | C0000005|ENG|S|L0270109|PF|S0007491|Y|A26634266||M0019694|D012711|MSH|ET|D012711|(131)I-MAA|0|N|256| 3 | C0000039|ENG|P|L0000039|PF|S0007564|N|A0016515||M0023172|D015060|MSH|MH|D015060|1,2-Dipalmitoylphosphatidylcholine|0|N|256| -------------------------------------------------------------------------------- /data/meta_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"core_sci_md", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/craft_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_craft_md", 4 | "sources": ["CRAFT", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/bc5cdr_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_bc5cdr_md", 4 | "sources": ["BC5CDR", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/jnlpba_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_jnlpba_md", 4 | "sources": ["JNLPBA", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | 
"url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/bionlp13cg_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_bionlp13cg_md", 4 | "sources": ["BIONLP13CG", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /scispacy/consts.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | ABBREVIATIONS: List[str] = [ 4 | "sec.", 5 | "secs.", 6 | "Sec.", 7 | "Secs.", 8 | "fig.", 9 | "figs.", 10 | "Fig.", 11 | "Figs.", 12 | "eq.", 13 | "eqs.", 14 | "Eq.", 15 | "Eqs.", 16 | "no.", 17 | "nos.", 18 | "No.", 19 | "Nos.", 20 | "al.", 21 | "gen.", 22 | "sp.", 23 | "nov.", 24 | ] 25 | -------------------------------------------------------------------------------- /data/meta_scibert.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "core_sci_scibert", 4 | "sources": [ 5 | "OntoNotes 5", 6 | "Common Crawl", 7 | "GENIA 1.0" 8 | ], 9 | "description": "Spacy Models for Biomedical Text.", 10 | "author": "Allen Institute for Artificial Intelligence", 11 | "email": "ai2-info@allenai.org", 12 | "url": "https://allenai.github.io/SciSpaCy/", 13 | "license": "CC BY-SA 3.0", 14 | "requirements": [ 15 | "spacy-transformers" 16 | ] 17 | } -------------------------------------------------------------------------------- /tests/fixtures/ner_test.tsv: -------------------------------------------------------------------------------- 1 | Intraocular O 2 | pressure O 3 | in O 4 | genetically B-SO 5 | distinct O 6 | mice B-Taxon 7 | : O 8 | an O 9 | update O 10 | and O 11 | strain O 12 | survey O 13 | 14 | Abstract O 15 | 16 | Background O 17 | 18 | Little O 19 | is O 20 | known O 21 | about O 22 | genetic B-SO 23 | factors O 24 | affecting O 25 | intraocular O 26 | pressure O 27 | ( O 28 | IOP O 29 | ) O 30 | in O 31 | mice B-Taxon 32 | and O 33 | other O 34 | mammals B-Taxon 35 | . O 36 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import spacy 4 | 5 | from scispacy.util import WhitespaceTokenizer 6 | 7 | class TestUtil(unittest.TestCase): 8 | 9 | def setUp(self): 10 | super().setUp() 11 | 12 | self.nlp = spacy.load("en_core_web_sm") 13 | 14 | def test_whitespace_tokenizer(self): 15 | 16 | self.nlp.tokenizer = WhitespaceTokenizer(self.nlp.vocab) 17 | text = "don't split this contraction." 18 | doc = self.nlp(text) 19 | 20 | assert [t.text for t in doc] == text.split(" ") 21 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | numpy 2 | spacy>=3.0.0,<3.1.0 3 | spacy-lookups-data 4 | pandas 5 | requests>=2.0.0,<3.0.0 6 | conllu 7 | 8 | # Candidate generation and entity linking 9 | joblib 10 | nmslib>=1.7.3.6 11 | scikit-learn>=0.20.3 12 | 13 | # Required for testing. 
14 | pytest 15 | pytest-cov 16 | flake8 17 | # black currently pinned because of a dependency issue with spacy, typer, and click 18 | black<=21.12b0 19 | mypy 20 | types-requests 21 | 22 | # Required for releases. 23 | twine 24 | 25 | # required for the tests to run, or to use the custom sentence splitter 26 | pysbd 27 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v1 15 | - name: Build and test with Docker 16 | run: | 17 | docker build --tag scispacy . 18 | docker run --rm scispacy pytest tests/ 19 | docker run --rm scispacy flake8 scispacy 20 | docker run --rm scispacy black scispacy --check --line-length 88 21 | docker run --rm scispacy bash scripts/mypy.sh 22 | docker run --rm scispacy pytest tests/ --cov scispacy --cov-fail-under=20 23 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 115 3 | 4 | ignore = 5 | # these rules don't play well with black 6 | E203 # whitespace before : 7 | W503 # line break before binary operator 8 | W504 # line break after binary operator 9 | 10 | exclude = 11 | build/** 12 | docs/** 13 | 14 | per-file-ignores = 15 | # __init__.py files are allowed to have unused imports and lines-too-long 16 | scispacy/__init__.py:F401 17 | scispacy/**/__init__.py:F401,E501 18 | 19 | # scripts don't have to respect 20 | # E501: line length 21 | # E402: imports not at top of file (because we mess with sys.path) 22 | scripts/**:E501,E402 23 | -------------------------------------------------------------------------------- /scripts/create_linker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from scispacy.candidate_generation import create_tfidf_ann_index 5 | from scispacy.linking_utils import KnowledgeBase 6 | 7 | 8 | def main(kb_path: str, output_path: str): 9 | 10 | os.makedirs(output_path, exist_ok=True) 11 | kb = KnowledgeBase(kb_path) 12 | create_tfidf_ann_index(output_path, kb) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | '--kb_path', 19 | help="Path to the KB file." 20 | ) 21 | parser.add_argument( 22 | '--output_path', 23 | help="Path to the output directory." 24 | ) 25 | 26 | args = parser.parse_args() 27 | main(args.kb_path, args.output_path) 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-buster 2 | 3 | # install base packages 4 | RUN apt-get clean \ 5 | && apt-get update --fix-missing \ 6 | && apt-get install -y \ 7 | git \ 8 | curl \ 9 | gcc \ 10 | g++ \ 11 | build-essential \ 12 | wget \ 13 | awscli 14 | 15 | WORKDIR /work 16 | 17 | # install python packages 18 | COPY requirements.in . 
19 | 20 | RUN pip install -r requirements.in 21 | RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz 22 | RUN python -m spacy download en_core_web_sm 23 | RUN python -m spacy download en_core_web_md 24 | 25 | # add the code as the final step so that when we modify the code 26 | # we don't bust the cached layers holding the dependencies and 27 | # system packages. 28 | COPY scispacy/ scispacy/ 29 | COPY scripts/ scripts/ 30 | COPY tests/ tests/ 31 | COPY .flake8 .flake8 32 | 33 | CMD [ "/bin/bash" ] 34 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow publishes the scispacy package (not the scispacy models) to pypi. 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Publish Package 5 | 6 | on: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | deploy: 12 | 13 | if: github.repository == 'allenai/scispacy' 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.7' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Build and publish 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | twine upload -u scispacy -p ${{ secrets.PYPI_PASSWORD }} dist/* 30 | -------------------------------------------------------------------------------- /scispacy/train_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import tqdm 4 | from spacy.language import Language 5 | 6 | from scispacy.per_class_scorer import PerClassScorer 7 | 8 | 9 | def evaluate_ner( 10 | nlp: Language, eval_data, dump_path: str = None, verbose: bool = False 11 | ) -> PerClassScorer: 12 | 13 | scorer = PerClassScorer() 14 | print("Evaluating %d rows" % len(eval_data)) 15 | for i, (text, gold_spans) in enumerate(tqdm.tqdm(eval_data)): 16 | 17 | # parse dev data with trained model 18 | doc = nlp(text) 19 | predicted_spans = [ 20 | (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents 21 | ] 22 | scorer(predicted_spans, gold_spans["entities"]) 23 | 24 | if i % 1000 == 0 and i > 0: 25 | for name, metric in scorer.get_metric().items(): 26 | print(f"{name}: {metric}") 27 | 28 | metrics = scorer.get_metric() 29 | if dump_path is not None: 30 | json.dump(metrics, open(dump_path, "a+")) 31 | for name, metric in metrics.items(): 32 | if "overall" in name or "untyped" in name or verbose: 33 | print(f"{name}: \t\t {metric}") 34 | 35 | return metrics 36 | -------------------------------------------------------------------------------- /scispacy/util.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.language import Language 3 | from spacy.tokens import Doc 4 | 5 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 6 | from scispacy.custom_tokenizer import combined_rule_tokenizer 7 | 8 | 9 | def save_model(nlp: Language, output_path: str): 10 | nlp.to_disk(output_path) 11 | 12 | 13 | def create_combined_rule_model() -> Language: 14 | nlp = spacy.load("en_core_web_sm") 15 | nlp.tokenizer = combined_rule_tokenizer(nlp) 16 | 
nlp.add_pipe("pysbd_sentencizer", first=True) 17 | return nlp 18 | 19 | 20 | class WhitespaceTokenizer: 21 | """ 22 | Spacy doesn't assume that text is tokenised. Sometimes this 23 | is annoying, like when you have gold data which is pre-tokenised, 24 | but Spacy's tokenisation doesn't match the gold. This can be used 25 | as follows: 26 | nlp = spacy.load("en_core_web_md") 27 | # hack to replace tokenizer with a whitespace tokenizer 28 | nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) 29 | ... use nlp("here is some text") as normal. 30 | """ 31 | 32 | def __init__(self, vocab): 33 | self.vocab = vocab 34 | 35 | def __call__(self, text): 36 | words = text.split(" ") 37 | # All tokens 'own' a subsequent space character in 38 | # this tokenizer. This is a technicality and probably 39 | # not that interesting. 40 | spaces = [True] * len(words) 41 | return Doc(self.vocab, words=words, spaces=spaces) 42 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | 2 | ### Creating a release 3 | 4 | Scispacy has two components: 5 | 6 | - The scispacy pip package 7 | - The scispacy models 8 | 9 | The scispacy pip package is published automatically using the `.github/workflows/publish.yml` github action. It happens whenever a release is published (with an associated tag) in the github releases UI. 10 | 11 | In order to create a new release, the following should happen: 12 | 13 | #### Updating `scispacy/version.py` 14 | Update the version in version.py. 15 | 16 | #### Training new models 17 | 18 | For the release, new models should be trained using the `scripts/pipeline.sh` and `scripts/ner_pipeline.sh` scripts, for the small, medium and large models, and specialized NER models. Remember to export the `ONTONOTES_PATH` and `ONTONOTES_PERCENT` environment variables to mix in the ontonotes training data. 19 | 20 | ``` 21 | bash scripts/pipeline.sh small 22 | bash scripts/pipeline.sh medium 23 | bash scripts/pipeline.sh large 24 | bash scripts/ner_pipeline.sh 25 | ``` 26 | 27 | These should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g. in the readme and in the docs) should be updated. You can find all these places using `git grep `. 28 | 29 | #### Merge a PR with the above changes 30 | Merge a PR with the above changes, and publish a release with a tag corresponding to the commit from the merged PR. This should trigger the publish github action, which will create the `scispacy` package and publish it to pypi. 31 | 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | *.vscode 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/raw/9170401.txt: -------------------------------------------------------------------------------- 1 | Induction of cytokine expression in leukocytes by binding of thrombin-stimulated platelets. 2 | BACKGROUND: Activated platelets tether and activate myeloid leukocytes. 3 | To investigate the potential relevance of this mechanism in acute myocardial infarction (AMI), we examined cytokine induction by leukocyte-platelet adhesion and the occurrence of leukocyte-platelet conjugates in patients with AMI. 4 | METHODS AND RESULTS: We obtained peripheral venous blood samples in 20 patients with AMI before and daily for 5 days after direct percutaneous transluminal coronary angioplasty (PTCA) and in 20 patients undergoing elective PTCA. 5 | Throughout the study period, CD41 immunofluorescence of leukocytes (flow cytometry) revealed increased leukocyte-platelet adhesion in patients with AMI compared with control patients (mean +/- SE of fluorescence [channels] before PTCA: 77 +/- 16 versus 35 +/- 9; P = .003). 6 | In vitro, thrombin-stimulated fixed platelets bound to neutrophils and monocytes. 7 | Within 2 hours, this resulted in increased mRNA for interleukin (IL),1 beta, IL-8, and monocyte chemoattractant protein (MCP)-1 in unfractionated leukocytes. 8 | After 4 hours, IL-1 beta and IL-8 concentration of the cell-free supernatant had increased by 268 +/- 36% and 210 +/- 7%, respectively, and cellular MCP-1 content had increased by 170 +/- 8%. 9 | Addition of activated platelets to adherent monocytes had a similar effect and was associated with nuclear factor-kappa B activation. 10 | Inhibition of binding by anti-P selectin antibodies reduced the effect of activated platelets on cytokine production. 11 | CONCLUSIONS: In patients with AMI, leukocyte-platelet adhesion is increased. 12 | Binding of activated platelets induces IL-1 beta, IL-8, and MCP-1 in leukocytes. 13 | Our findings suggest that leukocyte-platelet adhesion contributes to the regulation of inflammatory responses in AMI. 
14 | -------------------------------------------------------------------------------- /tests/custom_tests/test_whitespace.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Test that tokens are created correctly for whitespace.""" 3 | 4 | 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | 9 | import spacy 10 | from spacy.language import Language as SpacyModelType 11 | 12 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 13 | 14 | 15 | class TestWhitespace: 16 | nlp = spacy.load("en_core_sci_sm") 17 | 18 | @pytest.mark.parametrize("text", ["lorem ipsum"]) 19 | def test_tokenizer_splits_single_space(self, text): 20 | tokens = self.nlp(text) 21 | assert len(tokens) == 2 22 | 23 | @pytest.mark.parametrize("text", ["lorem  ipsum"]) 24 | def test_tokenizer_splits_double_space(self, text): 25 | tokens = self.nlp(text) 26 | assert len(tokens) == 3 27 | assert tokens[1].text == " " 28 | 29 | @pytest.mark.parametrize("text", ["lorem ipsum  "]) 30 | def test_tokenizer_handles_double_trailing_ws(self, text): 31 | tokens = self.nlp(text) 32 | assert repr(tokens.text_with_ws) == repr(text) 33 | 34 | @pytest.mark.parametrize("text", ["lorem\nipsum"]) 35 | def test_tokenizer_splits_newline(self, text): 36 | tokens = self.nlp(text) 37 | assert len(tokens) == 3 38 | assert tokens[1].text == "\n" 39 | 40 | @pytest.mark.parametrize("text", ["lorem \nipsum"]) 41 | def test_tokenizer_splits_newline_space(self, text): 42 | tokens = self.nlp(text) 43 | assert len(tokens) == 3 44 | 45 | @pytest.mark.parametrize("text", ["lorem  \nipsum"]) 46 | def test_tokenizer_splits_newline_double_space(self, text): 47 | tokens = self.nlp(text) 48 | assert len(tokens) == 3 49 | 50 | @pytest.mark.parametrize("text", ["lorem \n ipsum"]) 51 | def test_tokenizer_splits_newline_space_wrap(self, text): 52 | tokens = self.nlp(text) 53 | assert len(tokens) == 3 54 | -------------------------------------------------------------------------------- /tests/test_per_class_scorer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | 5 | from scispacy.per_class_scorer import PerClassScorer 6 | 7 | class TestPerClassScorer(unittest.TestCase): 8 | 9 | def test_per_class_scorer_counts_correctly(self): 10 | 11 | scorer = PerClassScorer() 12 | 13 | predicted = [(1, 3, "PER"), (10, 12, "LOC")] 14 | gold = [(1, 3, "PER"), (10, 12, "ORG")] 15 | original_gold = [x for x in gold] 16 | original_predicted = [x for x in predicted] 17 | 18 | scorer(predicted, gold) 19 | 20 | correct_metrics = {'precision-PER': 1.0, 21 | 'recall-PER': 1.0, 22 | 'f1-measure-PER': 1.0, 23 | 'precision-LOC': 0.0, 24 | 'recall-LOC': 0.0, 25 | 'f1-measure-LOC': 0.0, 26 | 'precision-untyped': 1.0, 27 | 'recall-untyped': 1.0, 28 | 'f1-measure-untyped': 1.0, 29 | 'precision-ORG': 0.0, 30 | 'recall-ORG': 0.0, 31 | 'f1-measure-ORG': 0.0, 32 | 'precision-overall': 0.5, 33 | 'recall-overall': 0.5, 34 | 'f1-measure-overall': 0.5} 35 | metrics = scorer.get_metric() 36 | assert set(metrics.keys()) == set(correct_metrics.keys()) 37 | for metric, value in metrics.items(): 38 | self.assertAlmostEqual(value, correct_metrics[metric]) 39 | 40 | scorer.get_metric(reset=True) 41 | 42 | # Check input is not modified. 43 | assert gold == original_gold 44 | assert predicted == original_predicted 45 | # Check resetting.
46 | assert scorer._true_positives == {} 47 | assert scorer._false_positives == {} 48 | assert scorer._false_negatives == {} 49 | -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/raw/9171236.txt: -------------------------------------------------------------------------------- 1 | Defective survival and activation of thymocytes in transgenic mice expressing a catalytically inactive form of Ca2+/calmodulin-dependent protein kinase IV. 2 | We have generated transgenic mice that express a catalytically inactive form of Ca2+/calmodulin-dependent protein kinase IV (CaMKIV) specifically in thymic T cells. 3 | The presence of this protein results in a markedly reduced thymic cellularity, although the distribution of the remaining cells is normal based on evaluation of the CD4 and CD8 cell surface antigens that are used to gauge T cell development. 4 | Isolated thymic T cells from the transgenic mice also show a dramatically decreased survival rate when evaluated in culture under conditions that do not favor activation. 5 | When challenged with an activating stimulus such as alpha-CD3 or a combination of phorbol ester plus ionophore, the cells are severely compromised in their ability to produce the cytokine interleukin-2 (IL-2). 6 | Reduction of IL-2 production is secondary to the inability to phosphorylate the cAMP response element binding protein, CREB, and induce expression of the immediate early genes such as Fos B that are required to transactivate the IL-2 promoter. 7 | Because transgene expression was regulated by the proximal promoter of the murine lck gene and this promoter is inactivated in T cells that exit the thymus, the mutant hCaMKIV is not present in peripheral T cells. 8 | Consequently, T lymphocytes present in the spleen can be activated normally in response to either stimulus mentioned above, demonstrating that the effects of the inactive CaMKIV on activation are reversible. 9 | Our results suggest that CaMKIV may represent a physiologically relevant CREB kinase in T cells and that the enzyme is also required to ensure normal expansion of T cells in the thymus. 10 | Whereas the pathway responsible for this latter role is yet to be elucidated, it is unlikely to include CREB phosphorylation. 
11 | -------------------------------------------------------------------------------- /tests/test_umls_semantic_type_tree.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | 5 | from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv 6 | 7 | class TestUmlsSemanticTypeTree(unittest.TestCase): 8 | 9 | def setUp(self): 10 | super().setUp() 11 | self.tree = construct_umls_tree_from_tsv("tests/fixtures/test_umls_tree.tsv") 12 | 13 | def test_tree_can_be_read_from_file(self): 14 | 15 | correct_names = ["Activity", "Behavior", "Social Behavior", "Individual Behavior", 16 | "Daily or Recreational Activity", "Event"] 17 | correct_ids = ['T052', 'T053', 'T054', 'T055', 'T056', 'T051'] 18 | for node, name, umls_id in zip(self.tree.flat_nodes, correct_names, correct_ids): 19 | assert node.full_name == name 20 | assert node.type_id == umls_id 21 | 22 | def test_tree_can_collapse_nodes(self): 23 | new_mapping = self.tree.get_collapsed_type_id_map_at_level(2) 24 | assert new_mapping == {'T052': 'T052', 25 | 'T053': 'T052', 26 | 'T054': 'T052', 27 | 'T055': 'T052', 28 | 'T056': 'T052', 29 | 'T051': 'T051'} 30 | assert ["T052"] == [node.type_id for node in self.tree.get_nodes_at_depth(2)] 31 | 32 | def test_get_parent_root(self): 33 | root_node = self.tree.get_node_from_id("T051") 34 | parent = self.tree.get_parent(root_node) 35 | assert parent is None 36 | 37 | def test_get_parent(self): 38 | level_1_node = self.tree.get_node_from_id("T052") 39 | level_1_node_parent = self.tree.get_parent(level_1_node) 40 | assert level_1_node_parent.type_id == "T051" 41 | 42 | leaf_node = self.tree.get_node_from_id("T055") 43 | leaf_node_parent = self.tree.get_parent(leaf_node) 44 | assert leaf_node_parent.type_id == "T053" 45 | 46 | -------------------------------------------------------------------------------- /scripts/evaluate_ner.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import argparse 4 | import spacy 5 | import importlib.util 6 | 7 | from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv 8 | from scispacy.train_utils import evaluate_ner 9 | 10 | 11 | def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str]): 12 | if code is not None: 13 | # need to import code before loading a spacy model; "custom_code" is an arbitrary module name 14 | spec = importlib.util.spec_from_file_location("custom_code", str(code)) 15 | module = importlib.util.module_from_spec(spec) 16 | spec.loader.exec_module(module) 17 | 18 | nlp = spacy.load(model_path) 19 | if dataset.startswith("medmentions"): 20 | train_data, dev_data, test_data = read_full_med_mentions(med_mentions_folder_path, None, False) 21 | data_split = dataset.split("-")[1] 22 | if data_split == "train": 23 | data = train_data 24 | elif data_split == "dev": 25 | data = dev_data 26 | elif data_split == "test": 27 | data = test_data 28 | else: 29 | raise Exception(f"Unrecognized split {data_split}") 30 | else: 31 | data = read_ner_from_tsv(dataset) 32 | 33 | evaluate_ner(nlp, data, dump_path=output_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--model_path", type=str, help="Path to model to evaluate") 39 | parser.add_argument("--dataset", type=str, help="medmentions-<split> (train, dev, or test), or a tsv file to evaluate") 40 | parser.add_argument("--output_path", type=str, help="Path to write results to") 41 | parser.add_argument("--code", type=str,
default=None, help="Path to code to import before loading spacy model") 42 | parser.add_argument("--med_mentions_folder_path", type=str, default=None, help="Path to the med mentions folder") 43 | 44 | args = parser.parse_args() 45 | main(args.model_path, args.dataset, args.output_path, args.code, args.med_mentions_folder_path) -------------------------------------------------------------------------------- /tests/test_hyponym_detector.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=no-self-use,invalid-name 2 | import unittest 3 | import spacy 4 | 5 | from scispacy.hyponym_detector import HyponymDetector 6 | 7 | 8 | class TestHyponymDetector(unittest.TestCase): 9 | def setUp(self): 10 | super().setUp() 11 | self.nlp = spacy.load("en_core_sci_sm") 12 | self.detector = HyponymDetector(self.nlp, extended=True) 13 | self.nlp.add_pipe("hyponym_detector", config={"extended": True}, last=True) 14 | 15 | def test_sentences(self): 16 | text = ( 17 | "Recognizing that the preferred habitats for the species " 18 | "are in the valleys, systematic planting of keystone plant " 19 | "species such as fig trees (Ficus) creates the best microhabitats." 20 | ) 21 | doc = self.nlp(text) 22 | fig_trees = doc[21:23] 23 | keystone_plant_species = doc[16:19] 24 | assert doc._.hearst_patterns == [("such_as", keystone_plant_species, fig_trees)] 25 | 26 | doc = self.nlp("SARS, or other coronaviruses, are bad.") 27 | assert doc._.hearst_patterns == [("other", doc[4:5], doc[0:1])] 28 | doc = self.nlp("Coronaviruses, including SARS and MERS, are bad.") 29 | assert doc._.hearst_patterns == [ 30 | ("include", doc[0:1], doc[3:4]), 31 | ("include", doc[0:1], doc[5:6]), 32 | ] 33 | 34 | def test_find_noun_compound_head(self): 35 | 36 | doc = self.nlp("The potassium channel is good.") 37 | 38 | head = self.detector.find_noun_compound_head(doc[1]) 39 | assert head == doc[2] 40 | 41 | doc = self.nlp("Planting of large plants.") 42 | head = self.detector.find_noun_compound_head(doc[3]) 43 | # Planting is a noun, but not a compound with 'plants'. 44 | assert head != doc[0] 45 | assert head == doc[3] 46 | 47 | def test_expand_noun_phrase(self): 48 | doc = self.nlp("Keystone plant habitats are good.") 49 | chunk = self.detector.expand_to_noun_compound(doc[1], doc) 50 | assert chunk == doc[0:3] 51 | -------------------------------------------------------------------------------- /scispacy/custom_sentence_segmenter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pysbd 4 | 5 | from spacy.tokens import Doc 6 | from spacy.language import Language 7 | from pysbd.utils import TextSpan 8 | 9 | from scispacy.consts import ABBREVIATIONS 10 | 11 | 12 | @Language.component("pysbd_sentencizer") 13 | def pysbd_sentencizer(doc: Doc) -> Doc: 14 | """Adds sentence boundaries to a Doc. 15 | Intended to be used as a pipe in a spaCy pipeline. 16 | Uses https://github.com/nipunsadvilkar/pySBD to get proper sentence and 17 | respective char_spans 18 | 19 | Handle special cases: 20 | New lines cannot be end of sentence tokens. 21 | New lines that separate sentences will be added to the 22 | beginning of the next sentence. 
23 | 24 | @param doc: the spaCy document to be annotated with sentence boundaries 25 | """ 26 | segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True) 27 | sents_char_spans: List[TextSpan] = segmenter.segment(doc.text) 28 | 29 | char_spans = [ 30 | doc.char_span( 31 | sent_span.start, 32 | # strip off trailing spaces when creating spans to accommodate spacy 33 | sent_span.end - (len(sent_span.sent) - len(sent_span.sent.rstrip(" "))), 34 | ) 35 | for sent_span in sents_char_spans 36 | ] 37 | start_token_char_offsets = [span[0].idx for span in char_spans if span is not None] 38 | for token in doc: 39 | prev_token = token.nbor(-1) if token.i != 0 else None 40 | if token.idx in start_token_char_offsets: 41 | if prev_token and ( 42 | prev_token.text in ABBREVIATIONS 43 | # Glom new lines at the beginning of the text onto the following sentence 44 | or (prev_token.i == 0 and all(c == "\n" for c in prev_token.text)) 45 | ): 46 | token.is_sent_start = False 47 | else: 48 | token.is_sent_start = True 49 | # check if the previous token contains at least 2 newline chars 50 | elif prev_token and prev_token.i != 0 and prev_token.text.count("\n") >= 2: 51 | token.is_sent_start = True 52 | else: 53 | token.is_sent_start = False 54 | return doc 55 | -------------------------------------------------------------------------------- /tests/test_umls_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scispacy import umls_utils 4 | 5 | class TestUtil(unittest.TestCase): 6 | 7 | expected_concepts = [ 8 | {'concept_id': 'C0000005', 'canonical_name': '(131)I-Macroaggregated Albumin', 9 | 'types': ['T116'], 'aliases': ['(131)I-MAA']}, 10 | {'concept_id': 'C0000039', 'aliases': ['1,2-Dipalmitoylphosphatidylcholine'], 11 | 'types': ['T109', 'T121'], 'definition': 12 | 'Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes.'} 13 | ] 14 | 15 | def test_read_umls_concepts(self): 16 | meta_path = 'tests/fixtures/umls_META' 17 | concept_details = {} 18 | umls_utils.read_umls_concepts(meta_path, concept_details) 19 | assert len(self.expected_concepts) == len(concept_details) 20 | 21 | for expected_concept in self.expected_concepts: 22 | assert expected_concept['concept_id'] in concept_details 23 | concept = concept_details[expected_concept['concept_id']] 24 | if 'canonical_name' in expected_concept: 25 | assert concept['canonical_name'] == expected_concept['canonical_name'] 26 | assert concept['aliases'] == expected_concept['aliases'] 27 | 28 | def test_read_umls_types(self): 29 | meta_path = 'tests/fixtures/umls_META' 30 | concept_details = {} 31 | umls_utils.read_umls_concepts(meta_path, concept_details) 32 | umls_utils.read_umls_types(meta_path, concept_details) 33 | for expected_concept in self.expected_concepts: 34 | concept = concept_details[expected_concept['concept_id']] 35 | assert concept['types'] == expected_concept['types'] 36 | 37 | def test_read_umls_definitions(self): 38 | meta_path = 'tests/fixtures/umls_META' 39 | concept_details = {} 40 | umls_utils.read_umls_concepts(meta_path, concept_details) 41 | umls_utils.read_umls_definitions(meta_path, concept_details) 42 | for expected_concept in self.expected_concepts: 43 | concept = concept_details[expected_concept['concept_id']] 44 | if 'definition' in expected_concept: 45 | assert concept['definition'] == expected_concept['definition'] 46 | -------------------------------------------------------------------------------- /setup.py:
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | """ 5 | Instructions for creating a release of the scispacy library. 6 | 7 | 1. Make sure your working directory is clean. 8 | 2. Make sure that you have changed the versions in "scispacy/version.py". 9 | 3. Create the distribution by running "python setup.py sdist" in the root of the repository. 10 | 4. Check you can install the new distribution in a clean environment. 11 | 5. Upload the distribution to pypi by running "twine upload -u -p ". 12 | This step will ask you for a username and password - the username is "scispacy" you can 13 | get the password from LastPass. 14 | """ 15 | 16 | VERSION = {} 17 | # version.py defines VERSION and VERSION_SHORT variables. 18 | # We use exec here to read it so that we don't import scispacy 19 | # whilst setting up the package. 20 | with open("scispacy/version.py", "r") as version_file: 21 | exec(version_file.read(), VERSION) 22 | 23 | setup( 24 | name="scispacy", 25 | version=VERSION["VERSION"], 26 | url="https://allenai.github.io/SciSpaCy/", 27 | author="Allen Institute for Artificial Intelligence", 28 | author_email="ai2-info@allenai.org", 29 | description="A full SpaCy pipeline and models for scientific/biomedical documents.", 30 | long_description=open("README.md").read(), 31 | long_description_content_type="text/markdown", 32 | keywords=["bioinformatics nlp spacy SpaCy biomedical"], 33 | classifiers=[ 34 | "Intended Audience :: Science/Research", 35 | "Development Status :: 3 - Alpha", 36 | "License :: OSI Approved :: Apache Software License", 37 | "Programming Language :: Python :: 3.6", 38 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 39 | "Topic :: Scientific/Engineering :: Bio-Informatics", 40 | ], 41 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 42 | license="Apache", 43 | install_requires=[ 44 | "spacy>=3.0.0,<3.1.0", 45 | "requests>=2.0.0,<3.0.0", 46 | "conllu", 47 | "numpy", 48 | "joblib", 49 | "nmslib>=1.7.3.6", 50 | "scikit-learn>=0.20.3", 51 | "pysbd", 52 | ], 53 | tests_require=["pytest", "pytest-cov", "flake8", "black", "mypy"], 54 | python_requires=">=3.6.0", 55 | ) 56 | -------------------------------------------------------------------------------- /tests/test_candidate_generation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | 4 | from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index, MentionCandidate 5 | from scispacy.umls_utils import UmlsKnowledgeBase 6 | 7 | 8 | class TestCandidateGeneration(unittest.TestCase): 9 | 10 | def test_create_index(self): 11 | 12 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 13 | with tempfile.TemporaryDirectory() as dir_name: 14 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 15 | 16 | assert len(umls_concept_aliases) == 93 17 | assert len(ann_index) == 93 # Number of deduplicated aliases + canonical ids 18 | tfidf_params = tfidf_vectorizer.get_params() 19 | 20 | assert tfidf_params["analyzer"] == "char_wb" 21 | assert tfidf_params["min_df"] == 10 22 | assert tfidf_params["ngram_range"] == (3, 3) 23 | 24 | def test_candidate_generation(self): 25 | 26 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 27 | with tempfile.TemporaryDirectory() as dir_name: 28 | umls_concept_aliases, 
tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 29 | 30 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 31 | results = candidate_generator(['(131)I-Macroaggregated Albumin'], 10) 32 | 33 | canonical_ids = [x.concept_id for x in results[0]] 34 | assert canonical_ids == ['C0000005', 'C0000015', 'C0000074', 'C0000102', 'C0000103'] 35 | 36 | # The mention was an exact match, so should have a distance of zero to a concept: 37 | assert results[0][0] == MentionCandidate(concept_id='C0000005', 38 | aliases=['(131)I-Macroaggregated Albumin'], 39 | similarities=[1.0]) 40 | 41 | # Test we don't crash with zero vectors 42 | results = candidate_generator(['ZZZZ'], 10) 43 | assert results == [[]] 44 | 45 | def test_empty_list(self): 46 | 47 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 48 | with tempfile.TemporaryDirectory() as dir_name: 49 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 50 | 51 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 52 | results = candidate_generator([], 10) 53 | 54 | assert results == [] 55 | -------------------------------------------------------------------------------- /scripts/convert_freqs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import json 4 | from ast import literal_eval 5 | from tqdm import tqdm 6 | from preshed.counter import PreshCounter 7 | from spacy.util import ensure_path 8 | from scispacy.file_cache import cached_path 9 | 10 | 11 | def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): 12 | print("Counting frequencies...") 13 | counts = PreshCounter() 14 | total = 0 15 | with freqs_loc.open() as f: 16 | for i, line in tqdm(enumerate(f)): 17 | freq, doc_freq, key = line.rstrip().split("\t", 2) 18 | freq = int(freq) 19 | counts.inc(i + 1, freq) 20 | total += freq 21 | counts.smooth() 22 | log_total = math.log(total) 23 | probs = {} 24 | with freqs_loc.open() as f: 25 | for line in tqdm(f): 26 | freq, doc_freq, key = line.rstrip().split("\t", 2) 27 | doc_freq = int(doc_freq) 28 | freq = int(freq) 29 | if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: 30 | try: 31 | word = literal_eval(key) 32 | except SyntaxError: 33 | # Take odd strings literally. 
34 | word = literal_eval("'%s'" % key) 35 | smooth_count = counts.smoother(int(freq)) 36 | probs[word] = math.log(smooth_count) - log_total 37 | oov_prob = math.log(counts.smoother(0)) - log_total 38 | return probs, oov_prob 39 | 40 | 41 | def main(input_path: str, output_path: str, min_word_frequency: int): 42 | if input_path is not None: 43 | input_path = cached_path(input_path) 44 | input_path = ensure_path(input_path) 45 | 46 | probs, oov_prob = ( 47 | read_freqs(input_path, min_freq=min_word_frequency) 48 | if input_path is not None 49 | else ({}, -20) 50 | ) 51 | 52 | with open(output_path, "w") as _jsonl_file: 53 | _jsonl_file.write( 54 | json.dumps({"lang": "en", "settings": {"oov_prob": -20.502029418945312}}) 55 | ) 56 | _jsonl_file.write("\n") 57 | 58 | for word, prob in probs.items(): 59 | _jsonl_file.write(json.dumps({"orth": word, "prob": prob})) 60 | _jsonl_file.write("\n") 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument( 66 | "--input_path", type=str, default=None, help="Path to the freqs file" 67 | ) 68 | parser.add_argument( 69 | "--output_path", type=str, help="Output path for the jsonl file" 70 | ) 71 | parser.add_argument( 72 | "--min_word_frequency", 73 | type=int, 74 | default=50, 75 | help="Minimum word frequency for inclusion", 76 | ) 77 | 78 | args = parser.parse_args() 79 | main(args.input_path, args.output_path, args.min_word_frequency) -------------------------------------------------------------------------------- /tests/test_file_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import json 4 | import unittest 5 | import shutil 6 | 7 | import pytest 8 | 9 | from scispacy.file_cache import filename_to_url, url_to_filename 10 | 11 | class TestFileUtils(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.TEST_DIR = "/tmp/scispacy" 15 | os.makedirs(self.TEST_DIR, exist_ok=True) 16 | 17 | def tearDown(self): 18 | shutil.rmtree(self.TEST_DIR) 19 | 20 | def test_url_to_filename(self): 21 | for url in ['http://allenai.org', 'http://cool.org', 22 | 'https://www.google.com', 'http://pytorch.org', 23 | 'https://s3-us-west-2.amazonaws.com/cool' + '/long' * 20 + '/url']: 24 | filename = url_to_filename(url) 25 | assert "http" not in filename 26 | with pytest.raises(FileNotFoundError): 27 | filename_to_url(filename, cache_dir=self.TEST_DIR) 28 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 29 | with pytest.raises(FileNotFoundError): 30 | filename_to_url(filename, cache_dir=self.TEST_DIR) 31 | json.dump({'url': url, 'etag': None}, 32 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 33 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 34 | assert back_to_url == url 35 | assert etag is None 36 | 37 | def test_url_to_filename_with_etags(self): 38 | for url in ['http://allenai.org', 'http://cool.org', 39 | 'https://www.google.com', 'http://pytorch.org']: 40 | filename = url_to_filename(url, etag="mytag") 41 | assert "http" not in filename 42 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 43 | json.dump({'url': url, 'etag': 'mytag'}, 44 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 45 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 46 | assert back_to_url == url 47 | assert etag == "mytag" 48 | baseurl = 'http://allenai.org/' 49 | assert url_to_filename(baseurl + '1') != url_to_filename(baseurl, etag='1') 50 | 51 | def 
test_url_to_filename_with_etags_eliminates_quotes(self): 52 | for url in ['http://allenai.org', 'http://cool.org', 53 | 'https://www.google.com', 'http://pytorch.org']: 54 | filename = url_to_filename(url, etag='"mytag"') 55 | assert "http" not in filename 56 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 57 | json.dump({'url': url, 'etag': 'mytag'}, 58 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 59 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 60 | assert back_to_url == url 61 | assert etag == "mytag" 62 | -------------------------------------------------------------------------------- /scripts/count_word_frequencies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from typing import List, Tuple 4 | import os 5 | import io 6 | import sys 7 | import tempfile 8 | import shutil 9 | from collections import Counter 10 | from pathlib import Path 11 | from multiprocessing import Pool 12 | 13 | import plac 14 | 15 | import spacy.util 16 | from spacy.language import Language 17 | 18 | sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir)))) 19 | 20 | from scispacy.custom_tokenizer import combined_rule_tokenizer 21 | 22 | def count_frequencies(language_class: Language, input_path: Path): 23 | """ 24 | Given a file containing single documents per line 25 | (for scispacy, these are Pubmed abstracts), split the text 26 | using a science specific tokenizer and compute word and 27 | document frequencies for all words. 28 | """ 29 | print(f"Processing {input_path}.") 30 | tokenizer = combined_rule_tokenizer(language_class()) 31 | counts = Counter() 32 | doc_counts = Counter() 33 | for line in open(input_path, "r"): 34 | words = [t.text for t in tokenizer(line)] 35 | counts.update(words) 36 | doc_counts.update(set(words)) 37 | 38 | return counts, doc_counts 39 | 40 | def parallelize(func, iterator, n_jobs): 41 | pool = Pool(processes=n_jobs) 42 | counts = pool.starmap(func, iterator) 43 | return counts 44 | 45 | def merge_counts(frequencies: List[Tuple[Counter, Counter]], output_path: str): 46 | """ 47 | Merge a number of frequency counts generated from `count_frequencies` 48 | into a single file, written to `output_path`. 
49 | """ 50 | counts = Counter() 51 | doc_counts = Counter() 52 | for word_count, doc_count in frequencies: 53 | counts.update(word_count) 54 | doc_counts.update(doc_count) 55 | with io.open(output_path, 'w+', encoding='utf8') as file_: 56 | for word, count in counts.most_common(): 57 | if not word.isspace(): 58 | file_.write(f"{count}\t{doc_counts[word]}\t{repr(word)}\n") 59 | 60 | 61 | @plac.annotations( 62 | raw_dir=("Location of input file list", "positional", None, Path), 63 | output_dir=("Location for output file", "positional", None, Path), 64 | n_jobs=("Number of workers", "option", "n", int)) 65 | def main(raw_dir: Path, output_dir: Path, n_jobs=2): 66 | 67 | language_class = spacy.util.get_lang_class("en") 68 | tasks = [] 69 | freqs_dir = Path(tempfile.mkdtemp(prefix="scispacy_freqs")) 70 | for input_path in [os.path.join(raw_dir, filename) 71 | for filename in os.listdir(raw_dir)]: 72 | input_path = Path(input_path.strip()) 73 | if not input_path: 74 | continue 75 | tasks.append((language_class, input_path)) 76 | 77 | if tasks: 78 | counts = parallelize(count_frequencies, tasks, n_jobs) 79 | 80 | print("Merge") 81 | merge_counts(counts, output_dir) 82 | shutil.rmtree(freqs_dir) 83 | 84 | if __name__ == '__main__': 85 | plac.call(main) 86 | -------------------------------------------------------------------------------- /tests/custom_tests/test_all_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import spacy 4 | from spacy.vocab import Vocab 5 | import shutil 6 | import pytest 7 | 8 | 9 | def test_custom_segmentation(combined_all_model_fixture): 10 | text = "Induction of cytokine expression in leukocytes by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 11 | doc = combined_all_model_fixture(text) 12 | sents = list(doc.sents) 13 | assert len(sents) == 2 14 | expected_tokens = [ 15 | "Induction", 16 | "of", 17 | "cytokine", 18 | "expression", 19 | "in", 20 | "leukocytes", 21 | "by", 22 | "binding", 23 | "of", 24 | "thrombin-stimulated", 25 | "platelets", 26 | ".", 27 | "BACKGROUND", 28 | ":", 29 | "Activated", 30 | "platelets", 31 | "tether", 32 | "and", 33 | "activate", 34 | "myeloid", 35 | "leukocytes", 36 | ".", 37 | ] 38 | actual_tokens = [t.text for t in doc] 39 | assert expected_tokens == actual_tokens 40 | assert doc.has_annotation("DEP") 41 | assert doc[0].dep_ == "ROOT" 42 | assert doc[0].tag_ == "NN" 43 | 44 | def test_full_pipe_serializable(combined_all_model_fixture): 45 | text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 
46 | doc = [doc for doc in combined_all_model_fixture.pipe([text, text], n_process = 2)][0] 47 | # If we got here, it means both that the model is serializable and that there is an abbreviation that would break if it weren't 48 | assert len(doc._.abbreviations) > 0 49 | abbrev = doc._.abbreviations[0] 50 | assert abbrev["short_text"] == "CEIL" 51 | assert abbrev["long_text"] == "cytokine expression in leukocytes" 52 | assert doc[abbrev["short_start"] : abbrev["short_end"]].text == abbrev["short_text"] 53 | assert doc[abbrev["long_start"] : abbrev["long_end"]].text == abbrev["long_text"] 54 | 55 | def test_full_pipe_not_serializable(combined_all_model_fixture_non_serializable_abbrev): 56 | text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 57 | # This line requires the pipeline to be serializable, so the test should fail here 58 | doc = combined_all_model_fixture_non_serializable_abbrev(text) 59 | with pytest.raises(TypeError): 60 | doc.to_bytes() 61 | 62 | # Below is the test version to be used once we move to spacy v3.1.0 or higher 63 | # def test_full_pipe_not_serializable(combined_all_model_fixture_non_serializable_abbrev): 64 | # text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 65 | # # This line requires the pipeline to be serializable (because it uses 2 processes), so the test should fail here 66 | # with pytest.raises(TypeError): 67 | # list(combined_all_model_fixture_non_serializable_abbrev.pipe([text, text], n_process = 2)) -------------------------------------------------------------------------------- /tests/test_linking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | 4 | import spacy 5 | 6 | from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index 7 | from scispacy.linking import EntityLinker 8 | from scispacy.umls_utils import UmlsKnowledgeBase 9 | from scispacy.abbreviation import AbbreviationDetector 10 | 11 | 12 | class TestLinker(unittest.TestCase): 13 | def setUp(self): 14 | super().setUp() 15 | self.nlp = spacy.load("en_core_web_sm") 16 | 17 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json", "tests/fixtures/test_umls_tree.tsv") 18 | with tempfile.TemporaryDirectory() as dir_name: 19 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 20 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 21 | 22 | self.linker = EntityLinker(candidate_generator=candidate_generator, filter_for_definitions=False) 23 | 24 | def test_naive_entity_linking(self): 25 | text = "There was a lot of Dipalmitoylphosphatidylcholine." 26 | doc = self.nlp(text) 27 | 28 | # Check that the linker returns nothing if we set the filter_for_definitions flag 29 | # and set the threshold very high for entities without definitions. 30 | self.linker.filter_for_definitions = True 31 | self.linker.no_definition_threshold = 3.0 32 | doc = self.linker(doc) 33 | assert doc.ents[0]._.kb_ents == [] 34 | 35 | # Check that the linker returns only high confidence entities if we 36 | # set the threshold to something more reasonable.
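# (filter_for_definitions is still on here, so — as the assertions above and
# below show — candidates without a KB definition only survive when their
# score clears no_definition_threshold.)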
37 | self.linker.no_definition_threshold = 0.95 38 | doc = self.linker(doc) 39 | assert doc.ents[0]._.kb_ents == [("C0000039", 1.0)] 40 | 41 | self.linker.filter_for_definitions = False 42 | self.linker.threshold = 0.45 43 | doc = self.linker(doc) 44 | # Without the filter_for_definitions filter, we get 2 entities for 45 | # the first mention. 46 | assert len(doc.ents[0]._.kb_ents) == 2 47 | 48 | id_with_score = doc.ents[0]._.kb_ents[0] 49 | assert id_with_score == ("C0000039", 1.0) 50 | umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]] 51 | assert umls_entity.concept_id == "C0000039" 52 | assert umls_entity.types == ["T109", "T121"] 53 | 54 | def test_linker_resolves_abbreviations(self): 55 | 56 | self.nlp.add_pipe("abbreviation_detector") 57 | # replace the abbreviation with "CNN" so spacy recognizes it as an entity 58 | # and also prefix the term with "CNN" so that the abbreviation detector passes 59 | text = "CNN1-Methyl-4-phenylpyridinium (CNN) is an abbreviation which doesn't exist in the baby index." 60 | doc = self.nlp(text) 61 | doc = self.linker(doc) 62 | 63 | id_with_score = doc.ents[0]._.kb_ents[0] 64 | assert id_with_score == ("C0000098", 0.9819725155830383) 65 | umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]] 66 | assert umls_entity.concept_id == "C0000098" 67 | 68 | def test_linker_has_types(self): 69 | # Just checking that the type tree is accessible from the linker 70 | assert len(self.linker.kb.semantic_type_tree.flat_nodes) == 6 71 | -------------------------------------------------------------------------------- /configs/base_ner.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | vocab_path = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser","ner"] 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 15 | disabled = [] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [components] 21 | 22 | [components.attribute_ruler] 23 | source = "en_core_web_sm" 24 | 25 | [components.lemmatizer] 26 | source = "en_core_web_sm" 27 | 28 | [components.ner] 29 | factory = "ner" 30 | moves = null 31 | update_with_oracle_cut_size = 100 32 | 33 | [components.ner.model] 34 | @architectures = "spacy.TransitionBasedParser.v1" 35 | state_type = "ner" 36 | extra_state_tokens = false 37 | hidden_width = 64 38 | maxout_pieces = 2 39 | use_upper = true 40 | nO = null 41 | 42 | [components.ner.model.tok2vec] 43 | @architectures = "spacy.Tok2Vec.v1" 44 | 45 | [components.ner.model.tok2vec.embed] 46 | @architectures = "spacy.MultiHashEmbed.v1" 47 | width = 96 48 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 49 | rows = [5000, 2500, 2500, 2500] 50 | include_static_vectors = true 51 | 52 | [components.ner.model.tok2vec.encode] 53 | @architectures = "spacy.MaxoutWindowEncoder.v1" 54 | width = 96 55 | depth = 4 56 | window_size = 1 57 | maxout_pieces = 3 58 | 59 | [components.parser] 60 | source = ${paths.parser_tagger_path} 61 | 62 | [components.tagger] 63 | source = ${paths.parser_tagger_path} 64 | 65 | [components.tok2vec] 66 | source = ${paths.parser_tagger_path} 67 | 68 | [corpora] 69 | 70 | [corpora.dev] 71 | @readers = "med_mentions_reader" 72 | directory_path = "assets/" 73 | split = "dev" 74 | 75 | [corpora.train] 76 | @readers = "med_mentions_reader" 77 | directory_path = "assets/" 78 |
split = "train" 79 | 80 | [training] 81 | dev_corpus = "corpora.dev" 82 | train_corpus = "corpora.train" 83 | seed = ${system.seed} 84 | gpu_allocator = ${system.gpu_allocator} 85 | dropout = 0.2 86 | accumulate_gradient = 1 87 | patience = 0 88 | max_epochs = 7 89 | max_steps = 0 90 | eval_frequency = 500 91 | frozen_components = ["tok2vec", "parser", "tagger", "attribute_ruler", "lemmatizer"] 92 | before_to_disk = null 93 | 94 | [training.batcher] 95 | @batchers = "spacy.batch_by_sequence.v1" 96 | get_length = null 97 | 98 | [training.batcher.size] 99 | @schedules = "compounding.v1" 100 | start = 1 101 | stop = 32 102 | compound = 1.001 103 | t = 0.0 104 | 105 | [training.logger] 106 | @loggers = "spacy.ConsoleLogger.v1" 107 | progress_bar = true 108 | 109 | [training.optimizer] 110 | @optimizers = "Adam.v1" 111 | beta1 = 0.9 112 | beta2 = 0.999 113 | L2_is_weight_decay = true 114 | L2 = 0.01 115 | grad_clip = 1.0 116 | use_averages = false 117 | eps = 0.00000001 118 | learn_rate = 0.001 119 | 120 | [training.score_weights] 121 | dep_las_per_type = null 122 | sents_p = null 123 | sents_r = null 124 | ents_per_type = null 125 | tag_acc = null 126 | dep_uas = null 127 | dep_las = null 128 | sents_f = null 129 | ents_f = 1.0 130 | ents_p = 0.0 131 | ents_r = 0.0 132 | 133 | [pretraining] 134 | 135 | [initialize] 136 | vectors = ${paths.vectors} 137 | init_tok2vec = ${paths.init_tok2vec} 138 | vocab_data = ${paths.vocab_path} 139 | lookups = null 140 | 141 | [initialize.components] 142 | 143 | [initialize.tokenizer] 144 | 145 | [initialize.before_init] 146 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_specialized_ner.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | dev_path = null 6 | train_path = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = null 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser","ner"] 16 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 17 | disabled = [] 18 | before_creation = null 19 | after_creation = null 20 | after_pipeline_creation = null 21 | 22 | [components] 23 | 24 | [components.attribute_ruler] 25 | source = "en_core_web_sm" 26 | 27 | [components.lemmatizer] 28 | source = "en_core_web_sm" 29 | 30 | [components.ner] 31 | factory = "ner" 32 | moves = null 33 | update_with_oracle_cut_size = 100 34 | 35 | [components.ner.model] 36 | @architectures = "spacy.TransitionBasedParser.v1" 37 | state_type = "ner" 38 | extra_state_tokens = false 39 | hidden_width = 64 40 | maxout_pieces = 2 41 | use_upper = true 42 | nO = null 43 | 44 | [components.ner.model.tok2vec] 45 | @architectures = "spacy.Tok2Vec.v1" 46 | 47 | [components.ner.model.tok2vec.embed] 48 | @architectures = "spacy.MultiHashEmbed.v1" 49 | width = 96 50 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 51 | rows = [5000, 2500, 2500, 2500] 52 | include_static_vectors = true 53 | 54 | [components.ner.model.tok2vec.encode] 55 | @architectures = "spacy.MaxoutWindowEncoder.v1" 56 | width = 96 57 | depth = 4 58 | window_size = 1 59 | maxout_pieces = 3 60 | 61 | [components.parser] 62 | source = ${paths.parser_tagger_path} 63 | 64 | [components.tagger] 65 | source = ${paths.parser_tagger_path} 66 | 67 | [components.tok2vec] 68 | source = ${paths.parser_tagger_path} 69 | 70 | [corpora] 71 | 72 | [corpora.dev] 73 | 
@readers = "specialized_ner_reader" 74 | file_path = ${paths.dev_path} 75 | 76 | [corpora.train] 77 | @readers = "specialized_ner_reader" 78 | file_path = ${paths.train_path} 79 | 80 | [training] 81 | dev_corpus = "corpora.dev" 82 | train_corpus = "corpora.train" 83 | seed = ${system.seed} 84 | gpu_allocator = ${system.gpu_allocator} 85 | dropout = 0.2 86 | accumulate_gradient = 1 87 | patience = 0 88 | max_epochs = 7 89 | max_steps = 0 90 | eval_frequency = 500 91 | frozen_components = ["tok2vec", "parser", "tagger", "attribute_ruler", "lemmatizer"] 92 | before_to_disk = null 93 | 94 | [training.batcher] 95 | @batchers = "spacy.batch_by_sequence.v1" 96 | get_length = null 97 | 98 | [training.batcher.size] 99 | @schedules = "compounding.v1" 100 | start = 1 101 | stop = 32 102 | compound = 1.001 103 | t = 0.0 104 | 105 | [training.logger] 106 | @loggers = "spacy.ConsoleLogger.v1" 107 | progress_bar = true 108 | 109 | [training.optimizer] 110 | @optimizers = "Adam.v1" 111 | beta1 = 0.9 112 | beta2 = 0.999 113 | L2_is_weight_decay = true 114 | L2 = 0.01 115 | grad_clip = 1.0 116 | use_averages = false 117 | eps = 0.00000001 118 | learn_rate = 0.001 119 | 120 | [training.score_weights] 121 | dep_las_per_type = null 122 | sents_p = null 123 | sents_r = null 124 | ents_per_type = null 125 | tag_acc = null 126 | dep_uas = null 127 | dep_las = null 128 | sents_f = null 129 | ents_f = 1.0 130 | ents_p = 0.0 131 | ents_r = 0.0 132 | 133 | [pretraining] 134 | 135 | [initialize] 136 | vectors = ${paths.vectors} 137 | init_tok2vec = ${paths.init_tok2vec} 138 | vocab_data = ${paths.vocab_path} 139 | lookups = null 140 | 141 | [initialize.components] 142 | 143 | [initialize.tokenizer] 144 | 145 | [initialize.before_init] 146 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_ner_scibert.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | vocab_path = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["transformer", "tagger","attribute_ruler","lemmatizer","parser","ner"] 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 15 | disabled = [] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [components] 21 | 22 | [components.attribute_ruler] 23 | source = "en_core_web_sm" 24 | 25 | [components.lemmatizer] 26 | source = "en_core_web_sm" 27 | 28 | [components.ner] 29 | factory = "ner" 30 | moves = null 31 | update_with_oracle_cut_size = 100 32 | 33 | [components.ner.model] 34 | @architectures = "spacy.TransitionBasedParser.v1" 35 | state_type = "ner" 36 | extra_state_tokens = false 37 | hidden_width = 64 38 | maxout_pieces = 2 39 | use_upper = true 40 | nO = null 41 | 42 | [components.ner.model.tok2vec] 43 | @architectures = "spacy.Tok2Vec.v1" 44 | 45 | [components.ner.model.tok2vec.embed] 46 | @architectures = "spacy.MultiHashEmbed.v1" 47 | width = 96 48 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 49 | rows = [5000, 2500, 2500, 2500] 50 | include_static_vectors = false 51 | 52 | [components.ner.model.tok2vec.encode] 53 | @architectures = "spacy.MaxoutWindowEncoder.v1" 54 | width = 96 55 | depth = 4 56 | window_size = 1 57 | maxout_pieces = 3 58 | 59 | [components.parser] 60 | source = ${paths.parser_tagger_path} 61 | 62 | [components.tagger] 63 | source = ${paths.parser_tagger_path} 64 | 65 | 
[components.transformer] 66 | source = ${paths.parser_tagger_path} 67 | 68 | 69 | [corpora] 70 | 71 | [corpora.dev] 72 | @readers = "med_mentions_reader" 73 | directory_path = "assets/" 74 | split = "dev" 75 | 76 | [corpora.train] 77 | @readers = "med_mentions_reader" 78 | directory_path = "assets/" 79 | split = "train" 80 | 81 | [training] 82 | dev_corpus = "corpora.dev" 83 | train_corpus = "corpora.train" 84 | seed = ${system.seed} 85 | gpu_allocator = ${system.gpu_allocator} 86 | dropout = 0.2 87 | accumulate_gradient = 1 88 | patience = 0 89 | max_epochs = 7 90 | max_steps = 0 91 | eval_frequency = 500 92 | frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] 93 | before_to_disk = null 94 | 95 | [training.batcher] 96 | @batchers = "spacy.batch_by_sequence.v1" 97 | get_length = null 98 | 99 | [training.batcher.size] 100 | @schedules = "compounding.v1" 101 | start = 1 102 | stop = 32 103 | compound = 1.001 104 | t = 0.0 105 | 106 | [training.logger] 107 | @loggers = "spacy.ConsoleLogger.v1" 108 | progress_bar = true 109 | 110 | [training.optimizer] 111 | @optimizers = "Adam.v1" 112 | beta1 = 0.9 113 | beta2 = 0.999 114 | L2_is_weight_decay = true 115 | L2 = 0.01 116 | grad_clip = 1.0 117 | use_averages = false 118 | eps = 0.00000001 119 | learn_rate = 0.001 120 | 121 | [training.score_weights] 122 | dep_las_per_type = null 123 | sents_p = null 124 | sents_r = null 125 | ents_per_type = null 126 | tag_acc = null 127 | dep_uas = null 128 | dep_las = null 129 | sents_f = null 130 | ents_f = 1.0 131 | ents_p = 0.0 132 | ents_r = 0.0 133 | 134 | [pretraining] 135 | 136 | [initialize] 137 | vectors = ${paths.vectors} 138 | init_tok2vec = ${paths.init_tok2vec} 139 | vocab_data = ${paths.vocab_path} 140 | lookups = null 141 | 142 | [initialize.components] 143 | 144 | [initialize.tokenizer] 145 | 146 | [initialize.before_init] 147 | @callbacks = "replace_tokenizer" 148 | -------------------------------------------------------------------------------- /evaluation/sentence_splitting_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import json 5 | 6 | import spacy 7 | 8 | sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir)))) 9 | from scispacy.custom_sentence_segmenter import combined_rule_sentence_segmenter 10 | from scispacy.custom_tokenizer import remove_new_lines, combined_rule_tokenizer 11 | 12 | def evaluate_sentence_splitting(model_path: str, 13 | data_directory: str, 14 | rule_segmenter: bool = False, 15 | custom_tokenizer: bool = False, 16 | citation_data_path: str = None): 17 | 18 | model = spacy.load(model_path) 19 | if rule_segmenter: 20 | model.add_pipe(combined_rule_sentence_segmenter, first=True) 21 | if custom_tokenizer: 22 | model.tokenizer = combined_rule_tokenizer(model) 23 | 24 | total_correct = 0 25 | total = 0 26 | total_abstracts = 0 27 | perfect = 0 28 | for abstract_name in os.listdir(data_directory): 29 | 30 | abstract_sentences = [x.strip() for x in 31 | open(os.path.join(data_directory, abstract_name), "r")] 32 | 33 | full_abstract = " ".join(abstract_sentences) 34 | 35 | doc = model(full_abstract) 36 | 37 | sentences = [x.text for x in doc.sents] 38 | 39 | correct = [] 40 | for sentence in sentences: 41 | if sentence in abstract_sentences: 42 | correct.append(1) 43 | else: 44 | correct.append(0) 45 | 46 | total += len(correct) 47 | total_correct += sum(correct) 48 | perfect += all(correct) 49 | 
total_abstracts += 1 50 | 51 | print(f"Sentence splitting performance for {model_path} :\n") 52 | 53 | print(f"Sentence level accuracy: {total_correct} of {total}, {total_correct / total}. ") 54 | print(f"Abstract level accuracy: {perfect} of {total_abstracts}, {perfect / total_abstracts}. ") 55 | 56 | if citation_data_path is None: 57 | return 58 | 59 | skipped = 0 60 | citation_total = 0 61 | citation_correct = 0 62 | for line in open(citation_data_path, "r"): 63 | 64 | sentence = remove_new_lines(json.loads(line)["string"]) 65 | 66 | # Skip sentence if it doesn't look roughly like a sentence, 67 | # or it is > 2 std deviations above the mean length. 68 | if not sentence[0].isupper() or sentence[-1] != "." or len(sentence) > 450: 69 | skipped += 1 70 | continue 71 | 72 | sentences = list(model(sentence).sents) 73 | 74 | if len(sentences) == 1: 75 | citation_correct += 1 76 | citation_total += 1 77 | print(f"Citation handling performance for {model_path}, skipped {skipped} examples :\n") 78 | print(f"Citation level accuracy: {citation_correct} of {citation_total}, {citation_correct / citation_total}. ") 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser() 83 | 84 | parser.add_argument( 85 | '--data', 86 | help="Path to the directory containing the raw data." 87 | ) 88 | parser.add_argument( 89 | '--model_path', 90 | default=None, 91 | help="Path to the spacy model to load" 92 | ) 93 | parser.add_argument( 94 | '--rule_segmenter', 95 | default=False, 96 | action="store_true", 97 | help="Whether to use the rule based segmenter" 98 | ) 99 | parser.add_argument( 100 | '--custom_tokenizer', 101 | default=False, 102 | action="store_true", 103 | help="Whether to use the custom rule based tokenizer" 104 | ) 105 | parser.add_argument( 106 | '--citation_data', 107 | default=None, 108 | help="Path to the jsonl file containing the citation contexts."
109 | ) 110 | 111 | args = parser.parse_args() 112 | evaluate_sentence_splitting(args.model_path, args.data, args.rule_segmenter, args.custom_tokenizer, args.citation_data) 113 | -------------------------------------------------------------------------------- /scispacy/base_project_code.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, Iterable, Iterator 2 | from pathlib import Path 3 | 4 | import random 5 | import itertools 6 | import spacy 7 | import warnings 8 | from spacy.training import Corpus, Example 9 | from spacy.language import Language 10 | 11 | from scispacy.custom_tokenizer import combined_rule_tokenizer 12 | from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv 13 | 14 | 15 | def iter_sample(iterable: Iterable, sample_percent: float) -> Iterator: 16 | for item in iterable: 17 | if len(item.reference) == 0: 18 | continue 19 | coin_flip = random.uniform(0, 1) 20 | if coin_flip < sample_percent: 21 | yield item 22 | 23 | 24 | @spacy.registry.callbacks("replace_tokenizer") 25 | def replace_tokenizer_callback() -> Callable[[Language], Language]: 26 | def replace_tokenizer(nlp: Language) -> Language: 27 | nlp.tokenizer = combined_rule_tokenizer(nlp) 28 | return nlp 29 | 30 | return replace_tokenizer 31 | 32 | 33 | @spacy.registry.readers("parser_tagger_data") 34 | def parser_tagger_data( 35 | path: Path, 36 | mixin_data_path: Optional[Path], 37 | mixin_data_percent: float, 38 | gold_preproc: bool, 39 | max_length: int = 0, 40 | limit: int = 0, 41 | augmenter: Optional[Callable] = None, 42 | seed: int = 0, 43 | ) -> Callable[[Language], Iterator[Example]]: 44 | random.seed(seed) 45 | main_corpus = Corpus( 46 | path, 47 | gold_preproc=gold_preproc, 48 | max_length=max_length, 49 | limit=limit, 50 | augmenter=augmenter, 51 | ) 52 | if mixin_data_path is not None: 53 | mixin_corpus = Corpus( 54 | mixin_data_path, 55 | gold_preproc=gold_preproc, 56 | max_length=max_length, 57 | limit=limit, 58 | augmenter=augmenter, 59 | ) 60 | 61 | def mixed_corpus(nlp: Language) -> Iterator[Example]: 62 | if mixin_data_path is not None: 63 | main_examples = main_corpus(nlp) 64 | mixin_examples = iter_sample(mixin_corpus(nlp), mixin_data_percent) 65 | return itertools.chain(main_examples, mixin_examples) 66 | else: 67 | return main_corpus(nlp) 68 | 69 | return mixed_corpus 70 | 71 | 72 | @spacy.registry.readers("med_mentions_reader") 73 | def med_mentions_reader( 74 | directory_path: str, split: str 75 | ) -> Callable[[Language], Iterator[Example]]: 76 | train, dev, test = read_full_med_mentions( 77 | directory_path, label_mapping=None, span_only=True, spacy_format=True 78 | ) 79 | 80 | def corpus(nlp: Language) -> Iterator[Example]: 81 | if split == "train": 82 | original_examples = train 83 | elif split == "dev": 84 | original_examples = dev 85 | elif split == "test": 86 | original_examples = test 87 | else: 88 | raise Exception(f"Unexpected split {split}") 89 | 90 | for original_example in original_examples: 91 | doc = nlp.make_doc(original_example[0]) 92 | with warnings.catch_warnings(): 93 | warnings.simplefilter("ignore", category=UserWarning) 94 | spacy_example = Example.from_dict(doc, original_example[1]) 95 | yield spacy_example 96 | 97 | return corpus 98 | 99 | 100 | @spacy.registry.readers("specialized_ner_reader") 101 | def specialized_ner_reader(file_path: str): 102 | original_examples = read_ner_from_tsv(file_path) 103 | 104 | def corpus(nlp: Language): 105 | for original_example in 
original_examples: 106 | doc = nlp.make_doc(original_example[0]) 107 | with warnings.catch_warnings(): 108 | warnings.simplefilter("ignore", category=UserWarning) 109 | spacy_example = Example.from_dict(doc, original_example[1]) 110 | yield spacy_example 111 | 112 | return corpus 113 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRFILES.RRF: -------------------------------------------------------------------------------- 1 | AMBIGLUI.RRF|Ambiguous term identifiers|LUI,CUI|2|537613|10302364| 2 | AMBIGSUI.RRF|Ambiguous string identifiers|SUI,CUI|2|389894|7513995| 3 | CHANGE/DELETEDCUI.RRF|Deleted concepts|PCUI,PSTR|2|10628|159420| 4 | CHANGE/DELETEDLUI.RRF|Deleted terms|PLUI,PSTR|2|0|0| 5 | CHANGE/DELETEDSUI.RRF|Deleted strings|PSUI,LAT,PSTR|3|0|0| 6 | CHANGE/MERGEDCUI.RRF|Merged concepts|PCUI,CUI|2|2188|41572| 7 | CHANGE/MERGEDLUI.RRF|Merged terms|PLUI,LUI|2|0|0| 8 | MRAUI.RRF|AUI History|AUI1,CUI1,VER,REL,RELA,MAPREASON,AUI2,CUI2,MAPIN|9|510022|27523978| 9 | MRCOLS.RRF|Attribute Relation|COL,DES,REF,MIN,AV,MAX,FIL,DTY|8|329|22758| 10 | MRCONSO.RRF|Concept names and sources|CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF|18|21385114|2719518477| 11 | MRCUI.RRF|CUI History|CUI1,VER,REL,RELA,MAPREASON,CUI2,MAPIN|7|2364533|72915366| 12 | MRDEF.RRF|Definitions|CUI,AUI,ATUI,SATUI,SAB,DEF,SUPPRESS,CVF|8|501039|123372655| 13 | MRDOC.RRF|Typed key value metadata map|DOCKEY,VALUE,TYPE,EXPL|4|5984|372164| 14 | MRFILES.RRF|Relation Relation|FIL,DES,FMT,CLS,RWS,BTS|6|50|4066| 15 | MRHIER.RRF|Computable hierarchies|CUI,AUI,CXN,PAUI,SAB,RELA,PTR,HCD,CVF|9|35400003|5248953255| 16 | MRHIST.RRF|Source-asserted history|CUI,SOURCEUI,SAB,SVER,CHANGETYPE,CHANGEKEY,CHANGEVAL,REASON,CVF|9|0|0| 17 | MRMAP.RRF|Mappings|MAPSETCUI,MAPSETSAB,MAPSUBSETID,MAPRANK,MAPID,MAPSID,FROMID,FROMSID,FROMEXPR,FROMTYPE,FROMRULE,FROMRES,REL,RELA,TOID,TOSID,TOEXPR,TOTYPE,TORULE,TORES,MAPRULE,MAPRES,MAPTYPE,MAPATN,MAPATV,CVF|26|1762213|245100223| 18 | MRRANK.RRF|Concept Name Ranking|RANK,SAB,TTY,SUPPRESS|4|1285|23476| 19 | MRREL.RRF|Related Concepts|CUI1,AUI1,STYPE1,REL,CUI2,AUI2,STYPE2,RELA,RUI,SRUI,SAB,SL,RG,DIR,SUPPRESS,CVF|16|104563668|9493099961| 20 | MRSAB.RRF|Source Metadata|VCUI,RCUI,VSAB,RSAB,SON,SF,SVER,VSTART,VEND,IMETA,RMETA,SLC,SCC,SRL,TFR,CFR,CXTY,TTYL,ATNL,LAT,CENC,CURVER,SABIN,SSN,SCIT|25|426|300118| 21 | MRSAT.RRF|Simple Concept, Term and String Attributes|CUI,LUI,SUI,METAUI,STYPE,CODE,ATUI,SATUI,ATN,SAB,ATV,SUPPRESS,CVF|13|108724175|10527137405| 22 | MRSMAP.RRF|Simple Mappings|MAPSETCUI,MAPSETSAB,MAPID,MAPSID,FROMEXPR,FROMTYPE,REL,RELA,TOEXPR,TOTYPE,CVF|11|694235|49081262| 23 | MRSTY.RRF|Semantic Types|CUI,TUI,STN,STY,ATUI,CVF|6|6875332|381224365| 24 | MRXNS_ENG.RRF|Normalized String Index|LAT,NSTR,CUI,LUI,SUI|5|22712182|1601710160| 25 | MRXNW_ENG.RRF|Normalized Word Index|LAT,NWD,CUI,LUI,SUI|5|74522948|3007627692| 26 | MRXW_BAQ.RRF|Basque Word Index|LAT,WD,CUI,LUI,SUI|5|5338|214412| 27 | MRXW_CHI.RRF|Chinese Word Index|LAT,WD,CUI,LUI,SUI|5|955316|43214100| 28 | MRXW_CZE.RRF|Czech Word Index|LAT,WD,CUI,LUI,SUI|5|422737|17840374| 29 | MRXW_DAN.RRF|Danish Word Index|LAT,WD,CUI,LUI,SUI|5|4932|194228| 30 | MRXW_DUT.RRF|Dutch Word Index|LAT,WD,CUI,LUI,SUI|5|1527345|63929947| 31 | MRXW_ENG.RRF|English Word Index|LAT,WD,CUI,LUI,SUI|5|72596340|2905871994| 32 | MRXW_EST.RRF|Estonian Word Index|LAT,WD,CUI,LUI,SUI|5|462812|18354498| 33 | MRXW_FIN.RRF|Finnish Word Index|LAT,WD,CUI,LUI,SUI|5|44994|1961908| 34 | MRXW_FRE.RRF|French 
Word Index|LAT,WD,CUI,LUI,SUI|5|3828339|160031629| 35 | MRXW_GER.RRF|German Word Index|LAT,WD,CUI,LUI,SUI|5|734623|30352992| 36 | MRXW_GRE.RRF|Greek Word Index|LAT,WD,CUI,LUI,SUI|5|42802|2067288| 37 | MRXW_HEB.RRF|Hebrew Word Index|LAT,WD,CUI,LUI,SUI|5|3234|126524| 38 | MRXW_HUN.RRF|Hungarian Word Index|LAT,WD,CUI,LUI,SUI|5|266703|11550790| 39 | MRXW_ITA.RRF|Italian Word Index|LAT,WD,CUI,LUI,SUI|5|1603999|64493060| 40 | MRXW_JPN.RRF|Japanese Word Index|LAT,WD,CUI,LUI,SUI|5|272472|16110056| 41 | MRXW_KOR.RRF|Korean Word Index|LAT,WD,CUI,LUI,SUI|5|460476|20307369| 42 | MRXW_LAV.RRF|Latvian Word Index|LAT,WD,CUI,LUI,SUI|5|3167|132408| 43 | MRXW_NOR.RRF|Norwegian Word Index|LAT,WD,CUI,LUI,SUI|5|118417|5222066| 44 | MRXW_POL.RRF|Polish Word Index|LAT,WD,CUI,LUI,SUI|5|115252|4912030| 45 | MRXW_POR.RRF|Portuguese Word Index|LAT,WD,CUI,LUI,SUI|5|2857618|114458322| 46 | MRXW_RUS.RRF|Russian Word Index|LAT,WD,CUI,LUI,SUI|5|1275708|58319365| 47 | MRXW_SCR.RRF|Croatian Word Index|LAT,WD,CUI,LUI,SUI|5|23572|996307| 48 | MRXW_SPA.RRF|Spanish Word Index|LAT,WD,CUI,LUI,SUI|5|9575118|388553905| 49 | MRXW_SWE.RRF|Swedish Word Index|LAT,WD,CUI,LUI,SUI|5|53244|2296506| 50 | MRXW_TUR.RRF|Turkish Word Index|LAT,WD,CUI,LUI,SUI|5|833756|33848218| 51 | -------------------------------------------------------------------------------- /configs/base_parser_tagger.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | genia_train = "project_data/genia_train.spacy" 3 | genia_dev = "project_data/genia_dev.spacy" 4 | onto_train = "project_data/train" 5 | vectors = null 6 | init_tok2vec = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = null 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser"] 16 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 17 | disabled = [] 18 | before_creation = null 19 | after_creation = null 20 | after_pipeline_creation = null 21 | 22 | [components] 23 | 24 | [components.attribute_ruler] 25 | source = "en_core_web_sm" 26 | 27 | [components.lemmatizer] 28 | source = "en_core_web_sm" 29 | 30 | [components.parser] 31 | factory = "parser" 32 | learn_tokens = false 33 | min_action_freq = 30 34 | moves = null 35 | update_with_oracle_cut_size = 100 36 | 37 | [components.parser.model] 38 | @architectures = "spacy.TransitionBasedParser.v1" 39 | state_type = "parser" 40 | extra_state_tokens = false 41 | hidden_width = 128 42 | maxout_pieces = 3 43 | use_upper = true 44 | nO = null 45 | 46 | [components.parser.model.tok2vec] 47 | @architectures = "spacy.Tok2VecListener.v1" 48 | width = ${components.tok2vec.model.encode.width} 49 | upstream = "*" 50 | 51 | [components.tagger] 52 | factory = "tagger" 53 | 54 | [components.tagger.model] 55 | @architectures = "spacy.Tagger.v1" 56 | nO = null 57 | 58 | [components.tagger.model.tok2vec] 59 | @architectures = "spacy.Tok2VecListener.v1" 60 | width = ${components.tok2vec.model.encode.width} 61 | upstream = "*" 62 | 63 | [components.tok2vec] 64 | factory = "tok2vec" 65 | 66 | [components.tok2vec.model] 67 | @architectures = "spacy.Tok2Vec.v1" 68 | 69 | [components.tok2vec.model.embed] 70 | @architectures = "spacy.MultiHashEmbed.v1" 71 | width = ${components.tok2vec.model.encode.width} 72 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 73 | rows = [5000, 2500, 2500, 2500] 74 | include_static_vectors = true 75 | 76 | [components.tok2vec.model.encode] 77 | @architectures = "spacy.MaxoutWindowEncoder.v1" 78 | width = 96 79 | depth = 4 80 
| window_size = 1 81 | maxout_pieces = 3 82 | 83 | [corpora] 84 | 85 | [corpora.dev] 86 | @readers = "spacy.Corpus.v1" 87 | path = ${paths.genia_dev} 88 | max_length = 0 89 | gold_preproc = false 90 | limit = 0 91 | augmenter = null 92 | 93 | [corpora.train] 94 | @readers = "parser_tagger_data" 95 | path = ${paths.genia_train} 96 | mixin_data_path = ${paths.onto_train} 97 | mixin_data_percent = 0.2 98 | max_length = 2000 99 | gold_preproc = false 100 | limit = 0 101 | augmenter = null 102 | seed = ${system.seed} 103 | 104 | [training] 105 | dev_corpus = "corpora.dev" 106 | train_corpus = "corpora.train" 107 | seed = ${system.seed} 108 | gpu_allocator = ${system.gpu_allocator} 109 | dropout = 0.2 110 | accumulate_gradient = 1 111 | patience = 0 112 | max_epochs = 20 113 | max_steps = 0 114 | eval_frequency = 2300 115 | frozen_components = ["attribute_ruler", "lemmatizer"] 116 | before_to_disk = null 117 | 118 | [training.batcher] 119 | @batchers = "spacy.batch_by_sequence.v1" 120 | get_length = null 121 | 122 | [training.batcher.size] 123 | @schedules = "compounding.v1" 124 | start = 1 125 | stop = 16 126 | compound = 1.001 127 | t = 0.0 128 | 129 | [training.logger] 130 | @loggers = "spacy.ConsoleLogger.v1" 131 | progress_bar = true 132 | 133 | [training.optimizer] 134 | @optimizers = "Adam.v1" 135 | beta1 = 0.9 136 | beta2 = 0.999 137 | L2_is_weight_decay = true 138 | L2 = 0.01 139 | grad_clip = 1.0 140 | use_averages = false 141 | eps = 0.00000001 142 | learn_rate = 0.001 143 | 144 | [training.score_weights] 145 | dep_las_per_type = null 146 | sents_p = null 147 | sents_r = null 148 | ents_per_type = null 149 | tag_acc = 0.33 150 | dep_uas = 0.33 151 | dep_las = 0.33 152 | sents_f = 0.0 153 | ents_f = 0.0 154 | ents_p = 0.0 155 | ents_r = 0.0 156 | 157 | [pretraining] 158 | 159 | [initialize] 160 | vectors = ${paths.vectors} 161 | init_tok2vec = ${paths.init_tok2vec} 162 | vocab_data = ${paths.vocab_path} 163 | lookups = null 164 | 165 | [initialize.components] 166 | 167 | [initialize.tokenizer] 168 | 169 | [initialize.before_init] 170 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_parser_tagger_scibert.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | genia_train = "project_data/genia_train.spacy" 3 | genia_dev = "project_data/genia_dev.spacy" 4 | onto_train = "project_data/train" 5 | vectors = null 6 | init_tok2vec = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = "pytorch" 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"] 16 | batch_size = 256 17 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 18 | disabled = [] 19 | before_creation = null 20 | after_creation = null 21 | after_pipeline_creation = null 22 | 23 | [components] 24 | 25 | [components.attribute_ruler] 26 | source = "en_core_web_sm" 27 | 28 | [components.lemmatizer] 29 | source = "en_core_web_sm" 30 | 31 | [components.parser] 32 | factory = "parser" 33 | learn_tokens = false 34 | min_action_freq = 30 35 | moves = null 36 | update_with_oracle_cut_size = 100 37 | 38 | [components.parser.model] 39 | @architectures = "spacy.TransitionBasedParser.v1" 40 | state_type = "parser" 41 | extra_state_tokens = false 42 | hidden_width = 128 43 | maxout_pieces = 3 44 | use_upper = true 45 | nO = null 46 | 47 | [components.parser.model.tok2vec] 48 | @architectures = "spacy-transformers.TransformerListener.v1" 49 
| grad_factor = 1.0 50 | pooling = {"@layers":"reduce_mean.v1"} 51 | upstream = "*" 52 | 53 | [components.tagger] 54 | factory = "tagger" 55 | 56 | [components.tagger.model] 57 | @architectures = "spacy.Tagger.v1" 58 | nO = null 59 | 60 | [components.tagger.model.tok2vec] 61 | @architectures = "spacy-transformers.TransformerListener.v1" 62 | grad_factor = 1.0 63 | pooling = {"@layers":"reduce_mean.v1"} 64 | upstream = "*" 65 | 66 | [components.transformer] 67 | factory = "transformer" 68 | max_batch_items = 4096 69 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 70 | 71 | [components.transformer.model] 72 | @architectures = "spacy-transformers.TransformerModel.v1" 73 | name = "allenai/scibert_scivocab_uncased" 74 | tokenizer_config = {"use_fast": true} 75 | 76 | [components.transformer.model.get_spans] 77 | @span_getters = "spacy-transformers.strided_spans.v1" 78 | window = 128 79 | stride = 96 80 | 81 | 82 | [corpora] 83 | 84 | [corpora.dev] 85 | @readers = "spacy.Corpus.v1" 86 | path = ${paths.genia_dev} 87 | max_length = 0 88 | gold_preproc = false 89 | limit = 0 90 | augmenter = null 91 | 92 | [corpora.train] 93 | @readers = "parser_tagger_data" 94 | path = ${paths.genia_train} 95 | mixin_data_path = ${paths.onto_train} 96 | mixin_data_percent = 0.2 97 | max_length = 2000 98 | gold_preproc = false 99 | limit = 0 100 | augmenter = null 101 | seed = ${system.seed} 102 | 103 | [training] 104 | dev_corpus = "corpora.dev" 105 | train_corpus = "corpora.train" 106 | seed = ${system.seed} 107 | gpu_allocator = ${system.gpu_allocator} 108 | dropout = 0.2 109 | accumulate_gradient = 1 110 | patience = 0 111 | max_epochs = 8 112 | max_steps = 0 113 | eval_frequency = 2300 114 | frozen_components = ["attribute_ruler", "lemmatizer"] 115 | before_to_disk = null 116 | 117 | [training.batcher] 118 | @batchers = "spacy.batch_by_sequence.v1" 119 | get_length = null 120 | 121 | [training.batcher.size] 122 | @schedules = "compounding.v1" 123 | start = 16 124 | stop = 64 125 | compound = 1.001 126 | t = 0.0 127 | 128 | [training.logger] 129 | @loggers = "spacy.ConsoleLogger.v1" 130 | progress_bar = true 131 | 132 | [training.optimizer] 133 | @optimizers = "Adam.v1" 134 | beta1 = 0.9 135 | beta2 = 0.999 136 | L2_is_weight_decay = true 137 | L2 = 0.01 138 | grad_clip = 1.0 139 | use_averages = false 140 | eps = 0.00000001 141 | learn_rate = 0.00005 142 | 143 | 144 | [training.score_weights] 145 | dep_las_per_type = null 146 | sents_p = null 147 | sents_r = null 148 | ents_per_type = null 149 | tag_acc = 0.33 150 | dep_uas = 0.33 151 | dep_las = 0.33 152 | sents_f = 0.0 153 | ents_f = 0.0 154 | ents_p = 0.0 155 | ents_r = 0.0 156 | 157 | [pretraining] 158 | 159 | [initialize] 160 | vectors = ${paths.vectors} 161 | init_tok2vec = ${paths.init_tok2vec} 162 | vocab_data = ${paths.vocab_path} 163 | lookups = null 164 | 165 | [initialize.components] 166 | 167 | [initialize.tokenizer] 168 | 169 | [initialize.before_init] 170 | @callbacks = "replace_tokenizer" 171 | -------------------------------------------------------------------------------- /tests/test_data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import shutil 4 | 5 | 6 | from scispacy.data_util import read_full_med_mentions, med_mentions_example_iterator, remove_overlapping_entities 7 | from scispacy.data_util import read_ner_from_tsv 8 | 9 | class TestDataUtil(unittest.TestCase): 10 | def setUp(self): 11 | super().setUp() 12 
| self.TEST_DIR = "/tmp/scispacy" 13 | os.makedirs(self.TEST_DIR, exist_ok=True) 14 | 15 | self.med_mentions = "tests/fixtures/med_mentions.txt" 16 | self.ner_tsv = "tests/fixtures/ner_test.tsv" 17 | 18 | def tearDown(self): 19 | shutil.rmtree(self.TEST_DIR) 20 | 21 | def test_example_iterator(self): 22 | iterator = med_mentions_example_iterator(self.med_mentions) 23 | for example in iterator: 24 | assert example.text == example.title + " " + example.abstract 25 | 26 | for entity in example.entities: 27 | assert entity.start < entity.end 28 | assert entity.start < len(example.text) 29 | assert entity.end < len(example.text) 30 | assert entity.mention_text == example.text[entity.start: entity.end] 31 | 32 | def test_remove_overlaps(self): 33 | test_entities = [(0, 5, 'ENTITY'), (6, 10, 'ENTITY')] 34 | result = remove_overlapping_entities(test_entities) 35 | assert result == [(0, 5, 'ENTITY'), (6, 10, 'ENTITY')] 36 | 37 | test_entities = [(0, 5, 'ENTITY'), (5, 10, 'ENTITY')] 38 | result = remove_overlapping_entities(test_entities) 39 | assert result == [(0, 5, 'ENTITY'), (5, 10, 'ENTITY')] 40 | 41 | test_entities = [(0, 5, 'ENTITY'), (4, 10, 'ENTITY')] 42 | result = remove_overlapping_entities(test_entities) 43 | assert result == [(4, 10, 'ENTITY')] 44 | 45 | test_entities = [(0, 5, 'ENTITY'), (0, 5, 'ENTITY')] 46 | result = remove_overlapping_entities(test_entities) 47 | assert result == [(0, 5, 'ENTITY')] 48 | 49 | test_entities = [(0, 5, 'ENTITY'), (4, 11, 'ENTITY'), (6, 20, 'ENTITY')] 50 | result = remove_overlapping_entities(test_entities) 51 | assert result == [(0, 5, 'ENTITY'), (6, 20, 'ENTITY')] 52 | 53 | test_entities = [(0, 5, 'ENTITY'), (4, 7, 'ENTITY'), (10, 20, 'ENTITY')] 54 | result = remove_overlapping_entities(test_entities) 55 | assert result == [(0, 5, 'ENTITY'), (10, 20, 'ENTITY')] 56 | 57 | test_entities = [(1368, 1374, 'ENTITY'), (1368, 1376, 'ENTITY')] 58 | result = remove_overlapping_entities(test_entities) 59 | assert result == [(1368, 1376, 'ENTITY')] 60 | 61 | test_entities = [(12, 33, 'ENTITY'), (769, 779, 'ENTITY'), (769, 787, 'ENTITY'), (806, 811, 'ENTITY')] 62 | result = remove_overlapping_entities(test_entities) 63 | assert result == [(12, 33, 'ENTITY'), (769, 787, 'ENTITY'), (806, 811, 'ENTITY')] 64 | 65 | test_entities = [(189, 209, 'ENTITY'), 66 | (317, 362, 'ENTITY'), 67 | (345, 354, 'ENTITY'), 68 | (364, 368, 'ENTITY')] 69 | result = remove_overlapping_entities(test_entities) 70 | assert result == [(189, 209, 'ENTITY'), (317, 362, 'ENTITY'), (364, 368, 'ENTITY')] 71 | 72 | test_entities = [(445, 502, 'ENTITY'), 73 | (461, 473, 'ENTITY'), 74 | (474, 489, 'ENTITY')] 75 | result = remove_overlapping_entities(test_entities) 76 | assert result == [(445, 502, 'ENTITY')] 77 | 78 | def test_read_ner_from_tsv(self): 79 | 80 | data = read_ner_from_tsv(self.ner_tsv) 81 | assert len(data) == 4 82 | example = data[0] 83 | assert example[0] == 'Intraocular pressure in genetically distinct mice : an update and strain survey' 84 | assert example[1] == {'entities': [(24, 35, 'SO'), (45, 49, 'TAXON')]} 85 | example = data[1] 86 | assert example[0] == 'Abstract' 87 | assert example[1] == {'entities': []} 88 | example = data[2] 89 | assert example[0] == 'Background' 90 | assert example[1] == {'entities': []} 91 | example = data[3] 92 | assert example[0] == 'Little is known about genetic factors affecting intraocular pressure ( IOP ) in mice and other mammals .' 
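# The entity tuples are (start, end, label) character offsets into the sentence
# above, e.g. (22, 29, 'SO') covers "genetic" and (80, 84, 'TAXON') covers "mice".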
93 | assert example[1] == {'entities': [(22, 29, 'SO'), (80, 84, 'TAXON'), (95, 102, 'TAXON')]} 94 | -------------------------------------------------------------------------------- /scispacy/per_class_scorer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Set 2 | from collections import defaultdict 3 | import copy 4 | 5 | 6 | class PerClassScorer: 7 | def __init__(self): 8 | # These will hold per label span counts. 9 | self._true_positives: Dict[str, int] = defaultdict(int) 10 | self._false_positives: Dict[str, int] = defaultdict(int) 11 | self._false_negatives: Dict[str, int] = defaultdict(int) 12 | 13 | def __call__( 14 | self, 15 | predicted_spans: List[Tuple[int, int, str]], 16 | gold_spans: List[Tuple[int, int, str]], 17 | ) -> None: 18 | 19 | gold_spans = copy.copy(gold_spans) 20 | predicted_spans = copy.copy(predicted_spans) 21 | untyped_gold_spans = {(x[0], x[1]) for x in gold_spans} 22 | untyped_predicted_spans = {(x[0], x[1]) for x in predicted_spans} 23 | 24 | for untyped_span, span in zip(untyped_predicted_spans, predicted_spans): 25 | if span in gold_spans: 26 | self._true_positives[span[2]] += 1 27 | gold_spans.remove(span) 28 | else: 29 | self._false_positives[span[2]] += 1 30 | 31 | if untyped_span in untyped_gold_spans: 32 | self._true_positives["untyped"] += 1 33 | untyped_gold_spans.remove(untyped_span) 34 | else: 35 | self._false_positives["untyped"] += 1 36 | # These spans weren't predicted. 37 | for span in gold_spans: 38 | self._false_negatives[span[2]] += 1 39 | for untyped_span in untyped_gold_spans: 40 | self._false_negatives["untyped"] += 1 41 | 42 | def get_metric(self, reset: bool = False): 43 | """ 44 | Returns 45 | ------- 46 | A Dict per label containing following the span based metrics: 47 | precision : float 48 | recall : float 49 | f1-measure : float 50 | Additionally, an ``overall`` key is included, which provides the precision, 51 | recall and f1-measure for all spans. 52 | """ 53 | all_tags: Set[str] = set() 54 | all_tags.update(self._true_positives.keys()) 55 | all_tags.update(self._false_positives.keys()) 56 | all_tags.update(self._false_negatives.keys()) 57 | all_metrics = {} 58 | for tag in all_tags: 59 | precision, recall, f1_measure = self._compute_metrics( 60 | self._true_positives[tag], 61 | self._false_positives[tag], 62 | self._false_negatives[tag], 63 | ) 64 | precision_key = "precision" + "-" + tag 65 | recall_key = "recall" + "-" + tag 66 | f1_key = "f1-measure" + "-" + tag 67 | all_metrics[precision_key] = precision 68 | all_metrics[recall_key] = recall 69 | all_metrics[f1_key] = f1_measure 70 | 71 | # Compute the precision, recall and f1 for all spans jointly. 
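# This is a micro-average: the per-label counts are pooled, and the label-agnostic
# "untyped" bucket is left out so that each span is only counted once.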
72 | sum_true_positives = sum( 73 | {v for k, v in self._true_positives.items() if k != "untyped"} 74 | ) 75 | sum_false_positives = sum( 76 | {v for k, v in self._false_positives.items() if k != "untyped"} 77 | ) 78 | sum_false_negatives = sum( 79 | {v for k, v in self._false_negatives.items() if k != "untyped"} 80 | ) 81 | precision, recall, f1_measure = self._compute_metrics( 82 | sum_true_positives, sum_false_positives, sum_false_negatives 83 | ) 84 | all_metrics["precision-overall"] = precision 85 | all_metrics["recall-overall"] = recall 86 | all_metrics["f1-measure-overall"] = f1_measure 87 | if reset: 88 | self.reset() 89 | return all_metrics 90 | 91 | @staticmethod 92 | def _compute_metrics( 93 | true_positives: int, false_positives: int, false_negatives: int 94 | ): 95 | precision = float(true_positives) / float( 96 | true_positives + false_positives + 1e-13 97 | ) 98 | recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) 99 | f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) 100 | return precision, recall, f1_measure 101 | 102 | def reset(self): 103 | self._true_positives = defaultdict(int) 104 | self._false_positives = defaultdict(int) 105 | self._false_negatives = defaultdict(int) 106 | -------------------------------------------------------------------------------- /scispacy/linking_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, NamedTuple, Optional, Set 2 | import json 3 | from collections import defaultdict 4 | 5 | from scispacy.file_cache import cached_path 6 | from scispacy.umls_semantic_type_tree import ( 7 | UmlsSemanticTypeTree, 8 | construct_umls_tree_from_tsv, 9 | ) 10 | 11 | 12 | class Entity(NamedTuple): 13 | 14 | concept_id: str 15 | canonical_name: str 16 | aliases: List[str] 17 | types: List[str] = [] 18 | definition: Optional[str] = None 19 | 20 | def __repr__(self): 21 | 22 | rep = "" 23 | num_aliases = len(self.aliases) 24 | rep = rep + f"CUI: {self.concept_id}, Name: {self.canonical_name}\n" 25 | rep = rep + f"Definition: {self.definition}\n" 26 | rep = rep + f"TUI(s): {', '.join(self.types)}\n" 27 | if num_aliases > 10: 28 | rep = ( 29 | rep 30 | + f"Aliases (abbreviated, total: {num_aliases}): \n\t {', '.join(self.aliases[:10])}" 31 | ) 32 | else: 33 | rep = ( 34 | rep + f"Aliases: (total: {num_aliases}): \n\t {', '.join(self.aliases)}" 35 | ) 36 | return rep 37 | 38 | 39 | DEFAULT_UMLS_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl" # noqa 40 | DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv" 41 | 42 | 43 | class KnowledgeBase: 44 | """ 45 | A class representing two commonly needed views of a Knowledge Base: 46 | 1. A mapping from concept_id to an Entity NamedTuple with more information. 47 | 2. A mapping from aliases to the sets of concept ids for which they are aliases. 48 | 49 | Parameters 50 | ---------- 51 | file_path: str, required. 52 | The file path to the json/jsonl representation of the KB to load. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | file_path: str = None, 58 | ): 59 | if file_path is None: 60 | raise ValueError( 61 | "Do not use the default arguments to KnowledgeBase. " 62 | "Instead, use a subclass (e.g UmlsKnowledgeBase) or pass a path to a kb." 
63 | ) 64 | if file_path.endswith("jsonl"): 65 | raw = (json.loads(line) for line in open(cached_path(file_path))) 66 | else: 67 | raw = json.load(open(cached_path(file_path))) 68 | 69 | alias_to_cuis: Dict[str, Set[str]] = defaultdict(set) 70 | self.cui_to_entity: Dict[str, Entity] = {} 71 | 72 | for concept in raw: 73 | unique_aliases = set(concept["aliases"]) 74 | unique_aliases.add(concept["canonical_name"]) 75 | for alias in unique_aliases: 76 | alias_to_cuis[alias].add(concept["concept_id"]) 77 | self.cui_to_entity[concept["concept_id"]] = Entity(**concept) 78 | 79 | self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis} 80 | 81 | 82 | class UmlsKnowledgeBase(KnowledgeBase): 83 | def __init__( 84 | self, 85 | file_path: str = DEFAULT_UMLS_PATH, 86 | types_file_path: str = DEFAULT_UMLS_TYPES_PATH, 87 | ): 88 | 89 | super().__init__(file_path) 90 | 91 | self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv( 92 | types_file_path 93 | ) 94 | 95 | 96 | class Mesh(KnowledgeBase): 97 | def __init__( 98 | self, 99 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/mesh_2020.jsonl", # noqa 100 | ): 101 | super().__init__(file_path) 102 | 103 | 104 | class GeneOntology(KnowledgeBase): 105 | def __init__( 106 | self, 107 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_go.jsonl", # noqa 108 | ): 109 | super().__init__(file_path) 110 | 111 | 112 | class HumanPhenotypeOntology(KnowledgeBase): 113 | def __init__( 114 | self, 115 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_hpo.jsonl", # noqa 116 | ): 117 | super().__init__(file_path) 118 | 119 | 120 | class RxNorm(KnowledgeBase): 121 | def __init__( 122 | self, 123 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_rxnorm.jsonl", # noqa 124 | ): 125 | super().__init__(file_path) 126 | -------------------------------------------------------------------------------- /tests/custom_tests/test_custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | TEST_CASES = [("using a bag-of-words model", ["using", "a", "bag-of-words", "model"]), 4 | ("activators of cAMP- and cGMP-dependent protein", ["activators", "of", "cAMP-", "and", "cGMP-dependent", "protein"]), 5 | ("phorbol 12-myristate 13-acetate, caused almost", ["phorbol", "12-myristate", "13-acetate", ",", "caused", "almost"]), 6 | pytest.param("let C(j) denote", ["let", "C(j)", "denote"], marks=pytest.mark.xfail), 7 | pytest.param("let (C(j)) denote", ["let", "(", "C(j)", ")", "denote"], marks=pytest.mark.xfail), 8 | pytest.param("let C{j} denote", ["let", "C{j}", "denote"], marks=pytest.mark.xfail), 9 | pytest.param("for the camera(s) and manipulator(s)", ["for", "the", "camera(s)", "and", "manipulator(s)"], marks=pytest.mark.xfail), 10 | ("the (TRAP)-positive genes", ["the", "(TRAP)-positive", "genes"]), 11 | ("the {TRAP}-positive genes", ["the", "{TRAP}-positive", "genes"]), 12 | ("for [Ca2+]i protein", ["for", "[Ca2+]i", "protein"]), 13 | pytest.param("for pyrilamine[3H] protein", ["for", "pyrilamine[3H]", "protein"], marks=pytest.mark.xfail), 14 | ("this is (normal) parens", ["this", "is", "(", "normal", ")", "parens"]), 15 | ("this is [normal] brackets", ["this", "is", "[", "normal", "]", "brackets"]), 16 | ("this is {normal} braces", ["this", "is", "{", "normal", "}", "braces"]), 17 | ("in the 
lan-\nguage of the", ["in", "the", "language", "of", "the"]), 18 | ("in the lan-\n\nguage of the", ["in", "the", "language", "of", "the"]), 19 | ("in the lan- \nguage of the", ["in", "the", "language", "of", "the"]), 20 | ("in the lan- \n\nguage of the", ["in", "the", "language", "of", "the"]), 21 | ("a 28× 28 image", ["a", "28", "×", "28", "image"]), 22 | ("a 28×28 image", ["a", "28", "×", "28", "image"]), 23 | ("a 28 × 28 image", ["a", "28", "×", "28", "image"]), 24 | ("the neurons’ activation", ["the", "neurons", "’", "activation"]), 25 | ("the neurons' activation", ["the", "neurons", "'", "activation"]), 26 | pytest.param("H3G 1Y6", ["H3G", "1Y6"], marks=pytest.mark.xfail), 27 | ("HFG 1Y6", ["HFG", "1Y6"]), 28 | pytest.param("H3g 1Y6", ["H3g", "1Y6"], marks=pytest.mark.xfail), 29 | pytest.param("h3g 1Y6", ["h3g", "1Y6"], marks=pytest.mark.xfail), 30 | pytest.param("h36g 1Y6", ["h36g", "1Y6"], marks=pytest.mark.xfail), 31 | ("h3gh 1Y6", ["h3gh", "1Y6"]), 32 | ("h3g3 1Y6", ["h3g3", "1Y6"]), 33 | ("3g", ["3", "g"]), 34 | ("(3g)", ["(", "3", "g", ")"]), 35 | ("This can be seen in Figure 1D. Therefore", ["This", "can", "be", "seen", "in", "Figure", "1D", ".", "Therefore"]), 36 | ("This can be seen in Figure 1d. Therefore", ["This", "can", "be", "seen", "in", "Figure", "1d", ".", "Therefore"]), 37 | ("This is a sentence.", ["This", "is", "a", "sentence", "."]), 38 | ("result of 1.345 is good", ["result", "of", "1.345", "is", "good"]), 39 | ("This sentence ends with a single 1.", ["This", "sentence", "ends", "with", "a", "single", "1", "."]), 40 | ("This sentence ends with a single 1. This is the next sentence.", ["This", "sentence", "ends", "with", "a", "single", "1", ".", "This", "is", "the", "next", "sentence", "."]), 41 | ("sec. secs. Sec. Secs. fig. figs. Fig. Figs. eq. eqs. Eq. Eqs. no. nos. No. Nos. al.", ["sec.", "secs.", "Sec.", "Secs.", "fig.", "figs.", "Fig.", "Figs.", "eq.", "eqs.", "Eq.", "Eqs.", "no.", "nos.", "No.", "Nos.", "al."]), 42 | ("in the Gq/G11 protein", ["in", "the", "Gq/G11", "protein"]), 43 | ("in the G1/G11 protein", ["in", "the", "G1/G11", "protein"]), 44 | ("in the G1/11 protein", ["in", "the", "G1/11", "protein"]), 45 | ("in the Gq/11 protein", ["in", "the", "Gq/11", "protein"]), 46 | ("This is a sentence.This is another.", ["This", "is", "a", "sentence", ".", "This", "is", "another", "."]), 47 | ("This number 1.456 should not be tokenized.", ["This", "number", "1.456", "should", "not", "be", "tokenized", "."]), 48 | ] 49 | 50 | @pytest.mark.parametrize('text,expected_tokens', TEST_CASES) 51 | def test_custom_tokenization(en_with_combined_rule_tokenizer_fixture, remove_new_lines_fixture, text, expected_tokens): 52 | text = remove_new_lines_fixture(text) 53 | doc = en_with_combined_rule_tokenizer_fixture(text) 54 | tokens = [t.text for t in doc] 55 | assert tokens == expected_tokens -------------------------------------------------------------------------------- /scispacy/umls_semantic_type_tree.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List, Dict, Deque, Any, Optional 2 | from collections import deque 3 | 4 | from scispacy.file_cache import cached_path 5 | 6 | 7 | class SemanticTypeNode(NamedTuple): 8 | 9 | type_id: str 10 | full_name: str 11 | children: List[Any] # Mypy does not support nested types yet :( 12 | level: int 13 | 14 | 15 | class UmlsSemanticTypeTree: 16 | """ 17 | A utility class for manipulating the UMLS Semantic Type Hierarchy. 
18 | Designed to be constructed from a TSV file using `construct_umls_tree_from_tsv`. 19 | """ 20 | 21 | def __init__(self, root: SemanticTypeNode) -> None: 22 | children = self.get_children(root) 23 | children.append(root) 24 | # We'll store the nodes as a flattened list too, because 25 | # we don't just care about the leaves of the tree - sometimes 26 | # we'll need efficient access to intermediate nodes, and the tree 27 | # is tiny anyway. 28 | self.flat_nodes: List[SemanticTypeNode] = children 29 | self.type_id_to_node = {node.type_id: node for node in self.flat_nodes} 30 | self.depth = max([node.level for node in self.flat_nodes]) 31 | 32 | def get_node_from_id(self, type_id: str) -> SemanticTypeNode: 33 | return self.type_id_to_node[type_id] 34 | 35 | def get_canonical_name(self, type_id: str) -> str: 36 | return self.type_id_to_node[type_id].full_name 37 | 38 | def get_nodes_at_depth(self, level: int) -> List[SemanticTypeNode]: 39 | """ 40 | Returns nodes at a particular depth in the tree. 41 | """ 42 | return [node for node in self.flat_nodes if node.level == level] 43 | 44 | def get_children(self, node: SemanticTypeNode) -> List[SemanticTypeNode]: 45 | """ 46 | Recursively build up a flat list of all a node's children. 47 | """ 48 | children = [] 49 | for child in node.children: 50 | children.append(child) 51 | children.extend(self.get_children(child)) 52 | return children 53 | 54 | def get_parent(self, node: SemanticTypeNode) -> Optional[SemanticTypeNode]: 55 | """ 56 | Returns the parent of the input node, returning None if the input node is the root of the tree 57 | """ 58 | current_depth = node.level 59 | possible_parents = self.get_nodes_at_depth(current_depth - 1) 60 | 61 | for possible_parent in possible_parents: 62 | for child in possible_parent.children: 63 | if child.type_id == node.type_id: 64 | return possible_parent 65 | 66 | # If there are no parents, we are at the root and return None 67 | return None 68 | 69 | def get_collapsed_type_id_map_at_level(self, level: int) -> Dict[str, str]: 70 | """ 71 | Constructs a label mapping from the original tree labels to a tree of a fixed depth, 72 | collapsing labels greater than the depth specified to the closest parent which is 73 | still present in the new fixed depth tree. This is effectively mapping to a _coarser_ 74 | label space. 
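For example, with the small tree shown in `construct_umls_tree_from_tsv` below,
calling this with level=3 maps T054 ("Social Behavior") and T055 ("Individual Behavior")
to their parent T053 ("Behavior"), and maps every other type id to itself.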
75 | """ 76 | new_type_id_map: Dict[str, str] = {k: k for k in self.type_id_to_node.keys()} 77 | for node in self.get_nodes_at_depth(level): 78 | for child in self.get_children(node): 79 | new_type_id_map[child.type_id] = node.type_id 80 | return new_type_id_map 81 | 82 | 83 | def construct_umls_tree_from_tsv(filepath: str) -> UmlsSemanticTypeTree: 84 | 85 | """ 86 | Reads in a tsv file which is formatted as a depth first traversal of 87 | a hierarchy tree, where nodes are of the format: 88 | 89 | Name TAB UMLS Semantic Type TAB Tree Depth 90 | 91 | Event T051 1 92 | Activity T052 2 93 | Behavior T053 3 94 | Social Behavior T054 4 95 | Individual Behavior T055 4 96 | Daily or Recreational Activity T056 3 97 | """ 98 | 99 | node_stack: Deque[SemanticTypeNode] = deque() 100 | for line in open(cached_path(filepath), "r"): 101 | name, type_id, level = line.split("\t") 102 | name = name.strip() 103 | int_level = int(level.strip()) 104 | node = SemanticTypeNode(type_id, name, [], int_level) 105 | 106 | node_stack.append(node) 107 | 108 | def attach_children(node: SemanticTypeNode, stack: Deque[SemanticTypeNode]): 109 | while stack and stack[0].level > node.level: 110 | popped = stack.popleft() 111 | attach_children(popped, stack) 112 | node.children.append(popped) 113 | 114 | first = node_stack.popleft() 115 | attach_children(first, node_stack) 116 | 117 | return UmlsSemanticTypeTree(first) 118 | -------------------------------------------------------------------------------- /scripts/export_umls_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Convert a umls release to a jsonl file of concepts. 4 | 5 | """ 6 | import json 7 | import argparse 8 | from scispacy import umls_utils 9 | 10 | def main(meta_path: str, output_path: str, source: str = None): 11 | 12 | concept_details = {} # dictionary of concept_id -> { 13 | # 'concept_id': str, 14 | # 'canonical_name': str 15 | # 'aliases': List[str] 16 | # 'types': List[str] 17 | # 'definition': str 18 | # } 19 | 20 | print('Reading concepts ... ') 21 | umls_utils.read_umls_concepts(meta_path, concept_details, source) 22 | 23 | print('Reading types ... ') 24 | umls_utils.read_umls_types(meta_path, concept_details) 25 | 26 | print('Reading definitions ... 
') 27 | umls_utils.read_umls_definitions(meta_path, concept_details) 28 | 29 | without_canonical_name_count = 0 30 | without_aliases_count = 0 31 | with_one_alias_count = 0 32 | with_more_than_one_alias_count = 0 33 | without_type_count = 0 34 | with_one_type_count = 0 35 | with_more_than_one_type_count = 0 36 | without_definition_count = 0 37 | with_definition_pref_source_count = 0 38 | with_definition_other_sources_count = 0 39 | for concept in concept_details.values(): 40 | without_canonical_name_count += 1 if 'canonical_name' not in concept else 0 41 | without_aliases_count += 1 if len(concept['aliases']) == 0 else 0 42 | with_one_alias_count += 1 if len(concept['aliases']) == 1 else 0 43 | with_more_than_one_alias_count += 1 if len(concept['aliases']) > 1 else 0 44 | without_type_count += 1 if len(concept['types']) == 0 else 0 45 | with_one_type_count += 1 if len(concept['types']) == 1 else 0 46 | with_more_than_one_type_count += 1 if len(concept['types']) > 1 else 0 47 | without_definition_count += 1 if 'definition' not in concept else 0 48 | with_definition_pref_source_count += 1 if concept.get('is_from_preferred_source') == 'Y' else 0 49 | with_definition_other_sources_count += 1 if concept.get('is_from_preferred_source') == 'N' else 0 50 | 51 | print(f'Number of concepts: {len(concept_details)}') 52 | print(f'Number of concepts without canonical name (one of the aliases will be used instead): ' 53 | f'{without_canonical_name_count}') 54 | print(f'Number of concepts with no aliases: {without_aliases_count}') 55 | print(f'Number of concepts with 1 alias: {with_one_alias_count}') 56 | print(f'Number of concepts with > 1 alias: {with_more_than_one_alias_count}') 57 | print(f'Number of concepts with no type: {without_type_count}') 58 | print(f'Number of concepts with 1 type: {with_one_type_count}') 59 | print(f'Number of concepts with > 1 type: {with_more_than_one_type_count}') 60 | print(f'Number of concepts with no definition: {without_definition_count}') 61 | print(f'Number of concepts with definition from preferred sources: {with_definition_pref_source_count}') 62 | print(f'Number of concepts with definition from other sources: {with_definition_other_sources_count}') 63 | 64 | print('Deleting unused fields and choosing a canonical name from aliases ... ') 65 | for concept in concept_details.values(): 66 | 67 | # Some concepts have many duplicate aliases. Here we remove them. 68 | concept["aliases"] = list(set(concept["aliases"])) 69 | 70 | # if a concept doesn't have a canonical name, use the first alias instead 71 | if 'canonical_name' not in concept: 72 | aliases = concept['aliases'] 73 | concept['canonical_name'] = aliases[0] 74 | del aliases[0] 75 | 76 | # deleting `is_from_preferred_source` 77 | if 'is_from_preferred_source' in concept: 78 | del concept['is_from_preferred_source'] 79 | 80 | print('Exporting to a jsonl file {} ...'.format(output_path)) 81 | with open(output_path, 'w') as fout: 82 | 83 | for value in concept_details.values(): 84 | fout.write(json.dumps(value) + "\n") 85 | print('DONE.') 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument( 91 | '--meta_path', 92 | help="Path to the META directory of a UMLS release." 93 | ) 94 | parser.add_argument( 95 | '--output_path', 96 | help="Path to the output jsonl file" 97 | ) 98 | parser.add_argument( 99 | '--source', 100 | type=str, 101 | default=None, 102 | help="Whether to filter for only a single UMLS source."
103 | ) 104 | args = parser.parse_args() 105 | main(args.meta_path, args.output_path, args.source) 106 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Optional 2 | import os 3 | 4 | import pytest 5 | import spacy 6 | from spacy.language import Language as SpacyModelType 7 | from spacy.cli.download import download as spacy_download 8 | 9 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 10 | from scispacy.custom_tokenizer import combined_rule_tokenizer, combined_rule_prefixes, remove_new_lines 11 | from scispacy.abbreviation import AbbreviationDetector 12 | 13 | LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {} 14 | 15 | 16 | def get_spacy_model( 17 | spacy_model_name: str, 18 | pos_tags: bool, 19 | parse: bool, 20 | ner: bool, 21 | with_custom_tokenizer: bool = False, 22 | with_sentence_segmenter: bool = False, 23 | with_serializable_abbreviation_detector: Optional[bool] = None, 24 | ) -> SpacyModelType: 25 | """ 26 | In order to avoid loading spacy models repeatedly, 27 | we'll save references to them, keyed by the options 28 | we used to create the spacy model, so any particular 29 | configuration only gets loaded once. 30 | """ 31 | options = (spacy_model_name, pos_tags, parse, ner, with_custom_tokenizer, with_sentence_segmenter, with_serializable_abbreviation_detector) 32 | if options not in LOADED_SPACY_MODELS: 33 | disable = ["vectors", "textcat"] 34 | if not pos_tags: 35 | disable.append("tagger") 36 | if not parse: 37 | disable.append("parser") 38 | if not ner: 39 | disable.append("ner") 40 | try: 41 | spacy_model = spacy.load(spacy_model_name, disable=disable) 42 | except OSError: 43 | print(f"Spacy models '{spacy_model_name}' not found. 
Downloading and installing.") 44 | spacy_download(spacy_model_name) 45 | spacy_model = spacy.load(spacy_model_name, disable=disable) 46 | 47 | if with_custom_tokenizer: 48 | spacy_model.tokenizer = combined_rule_tokenizer(spacy_model) 49 | if with_sentence_segmenter: 50 | spacy_model.add_pipe("pysbd_sentencizer", first=True) 51 | if with_serializable_abbreviation_detector is not None: 52 | spacy_model.add_pipe("abbreviation_detector", config={"make_serializable": with_serializable_abbreviation_detector}) 53 | 54 | LOADED_SPACY_MODELS[options] = spacy_model 55 | return LOADED_SPACY_MODELS[options] 56 | 57 | 58 | @pytest.fixture() 59 | def combined_rule_tokenizer_fixture(): 60 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 61 | tokenizer = combined_rule_tokenizer(nlp) 62 | return tokenizer 63 | 64 | 65 | @pytest.fixture() 66 | def en_with_combined_rule_tokenizer_fixture(): 67 | nlp = get_spacy_model("en_core_web_sm", True, True, True, with_custom_tokenizer=True) 68 | return nlp 69 | 70 | 71 | @pytest.fixture() 72 | def en_with_combined_rule_tokenizer_and_segmenter_fixture(): 73 | nlp = get_spacy_model("en_core_web_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=True) 74 | return nlp 75 | 76 | 77 | @pytest.fixture() 78 | def test_data_fixtures_path(): 79 | return os.path.join("tests", "custom_tests", "data_fixtures") 80 | 81 | 82 | @pytest.fixture() 83 | def test_raw_path(): 84 | return os.path.join("tests", "custom_tests", "data_fixtures", "raw") 85 | 86 | 87 | @pytest.fixture() 88 | def test_pmids_path(): 89 | return os.path.join("tests", "custom_tests", "data_fixtures", "test.pmids") 90 | 91 | 92 | @pytest.fixture() 93 | def test_conll_path(): 94 | return os.path.join("tests", "custom_tests", "data_fixtures", "test.conllu") 95 | 96 | 97 | @pytest.fixture() 98 | def test_model_dir(): 99 | return os.path.join("tests", "custom_tests", "data_fixtures", "tmp_model_dir") 100 | 101 | 102 | @pytest.fixture() 103 | def combined_all_model_fixture(): 104 | nlp = get_spacy_model("en_core_sci_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=False, with_serializable_abbreviation_detector=True) 105 | return nlp 106 | 107 | @pytest.fixture() 108 | def combined_all_model_fixture_non_serializable_abbrev(): 109 | nlp = get_spacy_model("en_core_sci_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=False, with_serializable_abbreviation_detector=False) 110 | return nlp 111 | 112 | @pytest.fixture() 113 | def combined_rule_prefixes_fixture(): 114 | return combined_rule_prefixes() 115 | 116 | 117 | @pytest.fixture() 118 | def remove_new_lines_fixture(): 119 | return remove_new_lines 120 | 121 | 122 | @pytest.fixture() 123 | def default_en_tokenizer_fixture(): 124 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 125 | return nlp.tokenizer 126 | 127 | 128 | @pytest.fixture() 129 | def default_en_model_fixture(): 130 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 131 | return nlp 132 | -------------------------------------------------------------------------------- /scispacy/hyponym_detector.py: -------------------------------------------------------------------------------- 1 | from spacy.matcher import Matcher 2 | from spacy.tokens import Token, Doc 3 | from spacy.language import Language 4 | 5 | from scispacy.hearst_patterns import BASE_PATTERNS, EXTENDED_PATTERNS 6 | 7 | 8 | @Language.factory("hyponym_detector") 9 | class HyponymDetector: 10 | """ 11 | A spaCy pipe for detecting hyponyms using 
Hearst patterns. 12 | This class sets the following attributes: 13 | 14 | - `Doc._.hearst_patterns`: A List[Tuple[str, Span, Span]] corresponding to 15 | the matching predicate, extracted general term and specific term 16 | that matched a Hearst pattern. 17 | 18 | Parts of the implementation taken from 19 | https://github.com/mmichelsonIF/hearst_patterns_python/blob/master/hearstPatterns/hearstPatterns.py 20 | and 21 | https://github.com/Fourthought/CNDPipeline/blob/master/cndlib/hpspacy.py 22 | 23 | The pipe can be used with an instantiated spacy model like so: 24 | ``` 25 | # add the hyponym detector 26 | nlp.add_pipe('hyponym_detector', config={'extended': True}, last=True) 27 | ``` 28 | Parameters 29 | ---------- 30 | 31 | nlp: `Language`, a required argument for spacy to use this as a factory 32 | name: `str`, a required argument for spacy to use this as a factory 33 | extended: `bool`, whether to use the extended Hearst patterns or not 34 | """ 35 | 36 | def __init__( 37 | self, nlp: Language, name: str = "hyponym_detector", extended: bool = False 38 | ): 39 | 40 | self.nlp = nlp 41 | 42 | self.patterns = BASE_PATTERNS 43 | if extended: 44 | self.patterns.extend(EXTENDED_PATTERNS) 45 | 46 | self.matcher = Matcher(self.nlp.vocab) 47 | 48 | Doc.set_extension("hearst_patterns", default=[], force=True) 49 | 50 | self.first = set() 51 | self.last = set() 52 | 53 | # add patterns to matcher 54 | for pattern in self.patterns: 55 | self.matcher.add(pattern["label"], [pattern["pattern"]]) 56 | 57 | # gather list of predicates where the hypernym appears first 58 | if pattern["position"] == "first": 59 | self.first.add(pattern["label"]) 60 | 61 | # gather list of predicates where the hypernym appears last 62 | if pattern["position"] == "last": 63 | self.last.add(pattern["label"]) 64 | 65 | def expand_to_noun_compound(self, token: Token, doc: Doc): 66 | """ 67 | Expand a token to its noun phrase based 68 | on a simple POS tag heuristic. 69 | """ 70 | 71 | start = token.i 72 | while True: 73 | if start - 1 < 0: 74 | break 75 | previous_token = doc[start - 1] 76 | if previous_token.pos_ in {"PROPN", "NOUN", "PRON"}: 77 | start -= 1 78 | else: 79 | break 80 | 81 | end = token.i + 1 82 | while True: 83 | if end >= len(doc): 84 | break 85 | next_token = doc[end] 86 | if next_token.pos_ in {"PROPN", "NOUN", "PRON"}: 87 | end += 1 88 | else: 89 | break 90 | 91 | return doc[start:end] 92 | 93 | def find_noun_compound_head(self, token: Token): 94 | 95 | while token.head.pos_ in {"PROPN", "NOUN", "PRON"} and token.dep_ == "compound": 96 | token = token.head 97 | return token 98 | 99 | def __call__(self, doc: Doc): 100 | """ 101 | Runs the matcher on the Doc object and sets token and 102 | doc level attributes for hypernym and hyponym relations. 103 | """ 104 | # Find matches in doc 105 | matches = self.matcher(doc) 106 | 107 | # If no matches are found, return the doc unchanged 108 | if not matches: 109 | return doc 110 | 111 | for match_id, start, end in matches: 112 | predicate = self.nlp.vocab.strings[match_id] 113 | 114 | # if the predicate is one where the hypernym appears last; otherwise it appears first 115 | if predicate in self.last: 116 | hypernym = doc[end - 1] 117 | hyponym = doc[start] 118 | else: 119 | # An inelegant way to deal with the "such_NOUN_as" pattern, 120 | # since the first token is not the hypernym. 
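                    # For example, in "such authors as Herrick and Shakespeare" the match
                    # starts on "such", so we step one token forward and treat "authors" as
                    # the hypernym; the hyponym is still taken from the end of the match.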
121 | if doc[start].lemma_ == "such": 122 | start += 1 123 | hypernym = doc[start] 124 | hyponym = doc[end - 1] 125 | 126 | hypernym = self.find_noun_compound_head(hypernym) 127 | hyponym = self.find_noun_compound_head(hyponym) 128 | 129 | # For the document level, we expand to contain noun phrases. 130 | hypernym_extended = self.expand_to_noun_compound(hypernym, doc) 131 | hyponym_extended = self.expand_to_noun_compound(hyponym, doc) 132 | 133 | doc._.hearst_patterns.append( 134 | (predicate, hypernym_extended, hyponym_extended) 135 | ) 136 | 137 | for token in hyponym.conjuncts: 138 | 139 | token_extended = self.expand_to_noun_compound(token, doc) 140 | if token != hypernym and token is not None: 141 | doc._.hearst_patterns.append( 142 | (predicate, hypernym_extended, token_extended) 143 | ) 144 | 145 | return doc 146 | -------------------------------------------------------------------------------- /scispacy/file_cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | """ 4 | 5 | import os 6 | import shutil 7 | import tempfile 8 | import json 9 | from urllib.parse import urlparse 10 | from pathlib import Path 11 | from typing import Tuple, Union, IO 12 | from hashlib import sha256 13 | 14 | import requests 15 | 16 | CACHE_ROOT = Path(os.getenv("SCISPACY_CACHE", str(Path.home() / ".scispacy"))) 17 | DATASET_CACHE = str(CACHE_ROOT / "datasets") 18 | 19 | 20 | def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str: 21 | """ 22 | Given something that might be a URL (or might be a local path), 23 | determine which. If it's a URL, download the file and cache it, and 24 | return the path to the cached file. If it's already a local path, 25 | make sure the file exists and then return the path. 26 | """ 27 | if cache_dir is None: 28 | cache_dir = DATASET_CACHE 29 | if isinstance(url_or_filename, Path): 30 | url_or_filename = str(url_or_filename) 31 | 32 | parsed = urlparse(url_or_filename) 33 | 34 | if parsed.scheme in ("http", "https"): 35 | # URL, so get it from the cache (downloading if necessary) 36 | return get_from_cache(url_or_filename, cache_dir) 37 | elif os.path.exists(url_or_filename): 38 | # File, and it exists. 39 | return url_or_filename 40 | elif parsed.scheme == "": 41 | # File, but it doesn't exist. 42 | raise FileNotFoundError("file {} not found".format(url_or_filename)) 43 | else: 44 | # Something unknown 45 | raise ValueError( 46 | "unable to parse {} as a URL or as a local path".format(url_or_filename) 47 | ) 48 | 49 | 50 | def url_to_filename(url: str, etag: str = None) -> str: 51 | """ 52 | Convert `url` into a hashed filename in a repeatable way. 53 | If `etag` is specified, append its hash to the url's, delimited 54 | by a period. 55 | """ 56 | 57 | last_part = url.split("/")[-1] 58 | url_bytes = url.encode("utf-8") 59 | url_hash = sha256(url_bytes) 60 | filename = url_hash.hexdigest() 61 | 62 | if etag: 63 | etag_bytes = etag.encode("utf-8") 64 | etag_hash = sha256(etag_bytes) 65 | filename += "." + etag_hash.hexdigest() 66 | 67 | filename += "." + last_part 68 | return filename 69 | 70 | 71 | def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]: 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. 
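    (The url and etag are read from the ``<cached filename>.json`` sidecar file that ``get_from_cache`` writes next to each downloaded file.)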
75 | """ 76 | if cache_dir is None: 77 | cache_dir = DATASET_CACHE 78 | 79 | cache_path = os.path.join(cache_dir, filename) 80 | if not os.path.exists(cache_path): 81 | raise FileNotFoundError("file {} not found".format(cache_path)) 82 | 83 | meta_path = cache_path + ".json" 84 | if not os.path.exists(meta_path): 85 | raise FileNotFoundError("file {} not found".format(meta_path)) 86 | 87 | with open(meta_path) as meta_file: 88 | metadata = json.load(meta_file) 89 | url = metadata["url"] 90 | etag = metadata["etag"] 91 | 92 | return url, etag 93 | 94 | 95 | def http_get(url: str, temp_file: IO) -> None: 96 | req = requests.get(url, stream=True) 97 | for chunk in req.iter_content(chunk_size=1024): 98 | if chunk: # filter out keep-alive new chunks 99 | temp_file.write(chunk) 100 | 101 | 102 | def get_from_cache(url: str, cache_dir: str = None) -> str: 103 | """ 104 | Given a URL, look for the corresponding dataset in the local cache. 105 | If it's not there, download it. Then return the path to the cached file. 106 | """ 107 | if cache_dir is None: 108 | cache_dir = DATASET_CACHE 109 | 110 | os.makedirs(cache_dir, exist_ok=True) 111 | 112 | response = requests.head(url, allow_redirects=True) 113 | if response.status_code != 200: 114 | raise IOError( 115 | "HEAD request failed for url {} with status code {}".format( 116 | url, response.status_code 117 | ) 118 | ) 119 | etag = response.headers.get("ETag") 120 | 121 | filename = url_to_filename(url, etag) 122 | 123 | # get cache path to put the file 124 | cache_path = os.path.join(cache_dir, filename) 125 | 126 | if not os.path.exists(cache_path): 127 | # Download to temporary file, then copy to cache dir once finished. 128 | # Otherwise you get corrupt cache entries if the download gets interrupted. 129 | with tempfile.NamedTemporaryFile() as temp_file: # type: IO 130 | print(f"{url} not found in cache, downloading to {temp_file.name}") 131 | 132 | # GET file object 133 | http_get(url, temp_file) 134 | 135 | # we are copying the file before closing it, so flush to avoid truncation 136 | temp_file.flush() 137 | # shutil.copyfileobj() starts at the current position, so go to the start 138 | temp_file.seek(0) 139 | 140 | print( 141 | f"Finished download, copying {temp_file.name} to cache at {cache_path}" 142 | ) 143 | with open(cache_path, "wb") as cache_file: 144 | shutil.copyfileobj(temp_file, cache_file) 145 | 146 | meta = {"url": url, "etag": etag} 147 | meta_path = cache_path + ".json" 148 | with open(meta_path, "w") as meta_file: 149 | json.dump(meta, meta_file) 150 | 151 | return cache_path 152 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | **scispaCy is a Python package containing [spaCy](https://spacy.io/) models for processing _biomedical_, _scientific_ or _clinical_ text.** 6 | 7 | 8 | ## Interactive Demo 9 | Just looking to test out the models on your data? Check out our [demo](https://scispacy.apps.allenai.org). 10 | 11 | ## Installing 12 | ```python 13 | pip install scispacy 14 | pip install 15 | ``` 16 | ## Models 17 | 18 | | Model | Description | Install URL 19 | |:---------------|:------------------|:----------| 20 | | en_core_sci_sm | A full spaCy pipeline for biomedical data. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| 21 | | en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| 22 | | en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| 23 | | en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| 24 | | en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| 25 | | en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| 26 | | en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| 27 | | en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| 28 | 29 | 30 | 31 | 32 | ### Performance 33 | 34 | Our models achieve performance within 3% of published state of the art dependency parsers and within 0.4% accuracy of state of the art biomedical POS taggers. 35 | 36 | | model | UAS | LAS | POS | Mentions (F1) | Web UAS | 37 | |:---------------|:----|:------|:------|:---|:---| 38 | | en_core_sci_sm | 89.54| 87.62 | 98.32 | 68.15 | 87.62 | 39 | | en_core_sci_md | 89.61| 87.77 | 98.56 | 69.64 | 88.05 | 40 | | en_core_sci_lg | 89.63| 87.81 | 98.56 | 69.61 | 88.08 | 41 | | en_core_sci_scibert | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | 42 | 43 | 44 | | model | F1 | Entity Types| 45 | |:---------------|:-----|:--------| 46 | | en_ner_craft_md | 76.11|GGP, SO, TAXON, CHEBI, GO, CL| 47 | | en_ner_jnlpba_md | 71.62| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | 48 | | en_ner_bc5cdr_md | 84.49| DISEASE, CHEMICAL| 49 | | en_ner_bionlp13cg_md | 77.75| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | 50 | 51 | 52 | ### Example Usage 53 | 54 | ```python 55 | import scispacy 56 | import spacy 57 | 58 | nlp = spacy.load("en_core_sci_sm") 59 | text = """ 60 | Myeloid derived suppressor cells (MDSC) are immature 61 | myeloid cells with immunosuppressive activity. 62 | They accumulate in tumor-bearing mice and humans 63 | with different types of cancer, including hepatocellular 64 | carcinoma (HCC). 65 | """ 66 | doc = nlp(text) 67 | 68 | print(list(doc.sents)) 69 | >>> ["Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity.", 70 | "They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC)."] 71 | 72 | # Examine the entities extracted by the mention detector. 
73 | # Note that they don't have types like in SpaCy, and they 74 | # are more general (e.g including verbs) - these are any 75 | # spans which might be an entity in UMLS, a large 76 | # biomedical database. 77 | print(doc.ents) 78 | >>> (Myeloid derived suppressor cells, 79 | MDSC, 80 | immature, 81 | myeloid cells, 82 | immunosuppressive activity, 83 | accumulate, 84 | tumor-bearing mice, 85 | humans, 86 | cancer, 87 | hepatocellular carcinoma, 88 | HCC) 89 | 90 | # We can also visualise dependency parses 91 | # (This renders automatically inside a jupyter notebook!): 92 | from spacy import displacy 93 | displacy.render(next(doc.sents), style='dep', jupyter=True) 94 | 95 | # See below for the generated SVG. 96 | # Zoom your browser in a bit! 97 | 98 | ``` 99 | 100 | ![Branching](./example.svg) 101 | 102 | ### Data Sources 103 | 104 | scispaCy models are trained on data from a variety of sources. In particular, 105 | we use: 106 | 107 | * **[The GENIA 1.0 Treebank](https://nlp.stanford.edu/~mcclosky/biomedical.html)**, converted to basic Universal Dependencies using the [Stanford Dependency Converter](https://nlp.stanford.edu/software/stanford-dependencies.shtml). 108 | We have made this [dataset available along with the original raw data](https://github.com/allenai/genia-dependency-trees). 109 | * **[word2vec word vectors](http://bio.nlplab.org/#word-vectors)** trained on the Pubmed Central Open Access Subset. 110 | * **[The MedMentions Entity Linking dataset](https://github.com/chanzuckerberg/MedMentions)**, used for training a mention detector. 111 | * **[Ontonotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19)** to make the parser and tagger more robust to non-biomedical text. Unfortunately this is not publically available. 112 | -------------------------------------------------------------------------------- /scispacy/custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from spacy.lang import char_classes 4 | from spacy.symbols import ORTH 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex 7 | from spacy.language import Language 8 | 9 | from scispacy.consts import ABBREVIATIONS 10 | 11 | 12 | def remove_new_lines(text: str) -> str: 13 | """Used to preprocess away new lines in the middle of words. This function 14 | is intended to be called on a raw string before it is passed through a 15 | spaCy pipeline 16 | 17 | @param text: a string of text to be processed 18 | """ 19 | text = text.replace("-\n\n", "") 20 | text = text.replace("- \n\n", "") 21 | text = text.replace("-\n", "") 22 | text = text.replace("- \n", "") 23 | return text 24 | 25 | 26 | def combined_rule_prefixes() -> List[str]: 27 | """Helper function that returns the prefix pattern for the tokenizer. 28 | It is a helper function to accomodate spacy tests that only test 29 | prefixes. 
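    A minimal sketch of how these prefix patterns are typically consumed (it mirrors the ``compile_prefix_regex`` import at the top of this module; the variable names are illustrative only):

        prefix_re = compile_prefix_regex(combined_rule_prefixes())
        # prefix_re.search is what a spaCy Tokenizer takes as its prefix_search argument.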
30 | """ 31 | # add lookahead assertions for brackets (may not work properly for unbalanced brackets) 32 | prefix_punct = char_classes.PUNCT.replace("|", " ") 33 | prefix_punct = prefix_punct.replace(r"\(", r"\((?![^\(\s]+\)\S+)") 34 | prefix_punct = prefix_punct.replace(r"\[", r"\[(?![^\[\s]+\]\S+)") 35 | prefix_punct = prefix_punct.replace(r"\{", r"\{(?![^\{\s]+\}\S+)") 36 | 37 | prefixes = ( 38 | ["§", "%", "=", r"\+"] 39 | + char_classes.split_chars(prefix_punct) 40 | + char_classes.LIST_ELLIPSES 41 | + char_classes.LIST_QUOTES 42 | + char_classes.LIST_CURRENCY 43 | + char_classes.LIST_ICONS 44 | ) 45 | return prefixes 46 | 47 | 48 | def combined_rule_tokenizer(nlp: Language) -> Tokenizer: 49 | """Creates a custom tokenizer on top of spaCy's default tokenizer. The 50 | intended use of this function is to replace the tokenizer in a spaCy 51 | pipeline like so: 52 | 53 | nlp = spacy.load("some_spacy_model") 54 | nlp.tokenizer = combined_rule_tokenizer(nlp) 55 | 56 | @param nlp: a loaded spaCy model 57 | """ 58 | # remove the first hyphen to prevent tokenization of the normal hyphen 59 | hyphens = char_classes.HYPHENS.replace("-|", "", 1) 60 | 61 | infixes = ( 62 | char_classes.LIST_ELLIPSES 63 | + char_classes.LIST_ICONS 64 | + [ 65 | r"×", # added this special x character to tokenize it separately 66 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 67 | r"(?<=[{al}])\.(?=[{au}])".format( 68 | al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER 69 | ), 70 | r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA), 71 | r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format( 72 | a=char_classes.ALPHA, h=hyphens 73 | ), 74 | # removed / to prevent tokenization of / 75 | r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA), 76 | ] 77 | ) 78 | 79 | prefixes = combined_rule_prefixes() 80 | 81 | # add the last apostrophe 82 | quotes = char_classes.LIST_QUOTES.copy() + ["’"] 83 | 84 | # add lookbehind assertions for brackets (may not work properly for unbalanced brackets) 85 | suffix_punct = char_classes.PUNCT.replace("|", " ") 86 | # These lookbehinds are commented out because they are variable width lookbehinds, and as of spacy 2.1, 87 | # spacy uses the re package instead of the regex package. The re package does not support variable width 88 | # lookbehinds. Hacking spacy internals to allow us to use the regex package is doable, but would require 89 | # creating our own instance of the language class, with our own Tokenizer class, with the from_bytes method 90 | # using the regex package instead of the re package 91 | # suffix_punct = suffix_punct.replace(r"\)", r"(? Doc: 99 | mention_strings = [] 100 | if self.resolve_abbreviations and Doc.has_extension("abbreviations"): 101 | # TODO: This is possibly sub-optimal - we might 102 | # prefer to look up both the long and short forms. 
103 | for ent in doc.ents: 104 | if isinstance(ent._.long_form, Span): 105 | # Long form 106 | mention_strings.append(ent._.long_form.text) 107 | elif isinstance(ent._.long_form, str): 108 | # Long form 109 | mention_strings.append(ent._.long_form) 110 | else: 111 | # no abbreviations case 112 | mention_strings.append(ent.text) 113 | else: 114 | mention_strings = [x.text for x in doc.ents] 115 | 116 | batch_candidates = self.candidate_generator(mention_strings, self.k) 117 | 118 | for mention, candidates in zip(doc.ents, batch_candidates): 119 | predicted = [] 120 | for cand in candidates: 121 | score = max(cand.similarities) 122 | if ( 123 | self.filter_for_definitions 124 | and self.kb.cui_to_entity[cand.concept_id].definition is None 125 | and score < self.no_definition_threshold 126 | ): 127 | continue 128 | if score > self.threshold: 129 | predicted.append((cand.concept_id, score)) 130 | sorted_predicted = sorted(predicted, reverse=True, key=lambda x: x[1]) 131 | mention._.umls_ents = sorted_predicted[: self.max_entities_per_mention] 132 | mention._.kb_ents = sorted_predicted[: self.max_entities_per_mention] 133 | 134 | return doc 135 | -------------------------------------------------------------------------------- /scispacy/umls_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | # TODO(Mark): Remove in scispacy v1.0, for backward compatibility only. 4 | from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase # noqa 5 | 6 | # preferred definition sources (from S2) 7 | DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"} 8 | 9 | 10 | def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: 11 | """ 12 | Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names) 13 | for the given file 14 | 15 | MRFILES.RRF file format: pipe-separated values 16 | Useful columns: 17 | column 0: name of one of the files in the META directory 18 | column 2: column names of that file 19 | 20 | Args: 21 | meta_path: path to the META directory of a UMLS release 22 | filename: name of the file to get its column headers 23 | Returns: 24 | a list of column names 25 | """ 26 | file_descriptors = f"{meta_path}/MRFILES.RRF" # to get column names 27 | with open(file_descriptors) as fin: 28 | for line in fin: 29 | splits = line.split("|") 30 | found_filename = splits[0] 31 | column_names = (splits[2] + ",").split( 32 | "," 33 | ) # ugly hack because all files end with an empty column 34 | if found_filename in filename: 35 | return column_names 36 | assert False, f"Couldn't find column names for file {filename}" 37 | return None 38 | 39 | 40 | def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None): 41 | """ 42 | Read the concepts file MRCONSO.RRF from a UMLS release and store it in 43 | concept_details dictionary. Each concept is represented with 44 | - concept_id 45 | - canonical_name 46 | - aliases 47 | - types 48 | - definition 49 | This function fills the first three. If a canonical name is not found, it is left empty. 50 | 51 | MRCONSO.RRF file format: pipe-separated values 52 | Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT 53 | 54 | Args: 55 | meta_path: path to the META directory of a UMLS release 56 | concept_details: a dictionary to be filled with concept information 57 | source: An optional source identifier, used as a filter to extract only a 58 | specific source from UMLS. 
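            (For example, passing ``source="SNOMEDCT_US"`` would keep only MRCONSO rows whose SAB column is that vocabulary.)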
59 | """ 60 | concepts_filename = "MRCONSO.RRF" 61 | headers = read_umls_file_headers(meta_path, concepts_filename) 62 | with open(f"{meta_path}/{concepts_filename}") as fin: 63 | for line in fin: 64 | splits = line.strip().split("|") 65 | assert len(headers) == len(splits), (headers, splits) 66 | concept = dict(zip(headers, splits)) 67 | if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N": 68 | continue # Keep English non-suppressed concepts only 69 | 70 | if source is not None: 71 | if concept["SAB"] != source: 72 | continue 73 | 74 | concept_id = concept["CUI"] 75 | if concept_id not in concept_details: # a new concept 76 | # add it to the dictionary with an empty list of aliases and types 77 | concept_details[concept_id] = { 78 | "concept_id": concept_id, 79 | "aliases": [], 80 | "types": [], 81 | } 82 | 83 | concept_name = concept["STR"] 84 | # this condition is copied from S2. It checks if the concept name is canonical or not 85 | is_canonical = ( 86 | concept["ISPREF"] == "Y" 87 | and concept["TS"] == "P" 88 | and concept["STT"] == "PF" 89 | ) 90 | 91 | if not is_canonical or "canonical_name" in concept_details[concept_id]: 92 | # not a canonical name or a canonical name already found 93 | concept_details[concept_id]["aliases"].append( 94 | concept_name 95 | ) # add it as an alias 96 | else: 97 | concept_details[concept_id][ 98 | "canonical_name" 99 | ] = concept_name # set as canonical name 100 | 101 | 102 | def read_umls_types(meta_path: str, concept_details: Dict): 103 | """ 104 | Read the types file MRSTY.RRF from a UMLS release and store it in 105 | concept_details dictionary. This function adds the `types` field 106 | to the information of each concept 107 | 108 | MRSTY.RRF file format: a pipe-separated values 109 | Useful columns: CUI, TUI 110 | 111 | Args: 112 | meta_path: path to the META directory of an UMLS release 113 | concept_details: a dictionary to be filled with concept informations 114 | """ 115 | types_filename = "MRSTY.RRF" 116 | headers = read_umls_file_headers(meta_path, types_filename) 117 | with open(f"{meta_path}/{types_filename}") as fin: 118 | for line in fin: 119 | splits = line.strip().split("|") 120 | assert len(headers) == len(splits) 121 | concept_type = dict(zip(headers, splits)) 122 | 123 | concept = concept_details.get(concept_type["CUI"]) 124 | if ( 125 | concept is not None 126 | ): # a small number of types are for concepts that don't exist 127 | concept["types"].append(concept_type["TUI"]) 128 | 129 | 130 | def read_umls_definitions(meta_path: str, concept_details: Dict): 131 | """ 132 | Read the types file MRDEF.RRF from a UMLS release and store it in 133 | concept_details dictionary. 
This function adds the `definition` field 134 | to the information of each concept 135 | 136 | MRDEF.RRF file format: a pipe-separated values 137 | Useful columns: CUI, SAB, SUPPRESS, DEF 138 | 139 | Args: 140 | meta_path: path to the META directory of an UMLS release 141 | concept_details: a dictionary to be filled with concept informations 142 | """ 143 | definitions_filename = "MRDEF.RRF" 144 | headers = read_umls_file_headers(meta_path, definitions_filename) 145 | with open(f"{meta_path}/{definitions_filename}") as fin: 146 | headers = read_umls_file_headers(meta_path, definitions_filename) 147 | for line in fin: 148 | splits = line.strip().split("|") 149 | assert len(headers) == len(splits) 150 | definition = dict(zip(headers, splits)) 151 | 152 | if definition["SUPPRESS"] != "N": 153 | continue 154 | is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED 155 | concept = concept_details.get(definition["CUI"]) 156 | if ( 157 | concept is None 158 | ): # a small number of definitions are for concepts that don't exist 159 | continue 160 | 161 | if ( 162 | "definition" not in concept 163 | or is_from_preferred_source 164 | and concept["is_from_preferred_source"] == "N" 165 | ): 166 | concept["definition"] = definition["DEF"] 167 | concept["is_from_preferred_source"] = ( 168 | "Y" if is_from_preferred_source else "N" 169 | ) 170 | -------------------------------------------------------------------------------- /docs/example.svg: -------------------------------------------------------------------------------- 1 | MyeloidADJderivedVERBsuppressorNOUNcells (NOUNMDSC)NOUNareVERBimmatureADJmyeloidADJcellsNOUNwithADPimmunosuppressiveADJactivity.NOUNamodamodcompoundnsubjapposcopamodamodcaseamodnmod -------------------------------------------------------------------------------- /tests/test_abbreviation_detection.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import spacy 3 | import pytest 4 | 5 | from scispacy.abbreviation import ( 6 | AbbreviationDetector, 7 | find_abbreviation, 8 | filter_matches, 9 | ) 10 | 11 | 12 | class TestAbbreviationDetector(unittest.TestCase): 13 | def setUp(self): 14 | super().setUp() 15 | self.nlp = spacy.load("en_core_web_sm") 16 | self.detector = AbbreviationDetector(self.nlp) 17 | self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \ 18 | inherited motor neuron disease caused by the expansion \ 19 | of a polyglutamine tract within the androgen receptor (AR). \ 20 | SBMA can be caused by this easily." 21 | 22 | def test_find_abbreviation(self): 23 | # Basic case 24 | doc = self.nlp("abbreviation (abbrn)") 25 | long = doc[0:1] 26 | short = doc[2:3] 27 | _, long_form = find_abbreviation(long, short) 28 | assert long_form.text == "abbreviation" 29 | 30 | # Hypenation and numbers within abbreviation 31 | doc = self.nlp("abbreviation (ab-b9rn)") 32 | long = doc[0:1] 33 | short = doc[2:3] 34 | _, long_form = find_abbreviation(long, short) 35 | assert long_form.text == "abbreviation" 36 | 37 | # No match 38 | doc = self.nlp("abbreviation (aeb-b9rn)") 39 | long = doc[0:1] 40 | short = doc[2:3] 41 | _, long_form = find_abbreviation(long, short) 42 | assert long_form is None 43 | 44 | # First letter must match start of word. 45 | doc = self.nlp("aaaabbreviation (ab-b9rn)") 46 | long = doc[0:1] 47 | short = doc[2:3] 48 | _, long_form = find_abbreviation(long, short) 49 | assert long_form.text == "aaaabbreviation" 50 | 51 | # Matching is greedy for first letter (are is not included). 
52 | doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)") 53 | long = doc[0:5] 54 | short = doc[6:7] 55 | _, long_form = find_abbreviation(long, short) 56 | assert long_form.text == "aaaabbreviation" 57 | 58 | def test_filter_matches(self): 59 | doc = self.nlp(self.text) 60 | matches = self.detector.matcher(doc) 61 | matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches] 62 | filtered = filter_matches(matches_no_brackets, doc) 63 | 64 | assert len(filtered) == 2 65 | long, short = filtered[0] 66 | assert long.text_with_ws == "Spinal and bulbar muscular atrophy " 67 | assert short.text == "SBMA" 68 | long, short = filtered[1] 69 | assert long.text_with_ws == "within the androgen receptor " 70 | assert short.text == "AR" 71 | 72 | def test_abbreviation_detection(self): 73 | # Attribute should be registered. 74 | doc = self.nlp(self.text) 75 | assert doc._.abbreviations == [] 76 | doc2 = self.detector(doc) 77 | assert len(doc2._.abbreviations) == 3 78 | 79 | correct = set() 80 | span = doc[33:34] 81 | span._.long_form = doc[0:5] 82 | correct.add(span) 83 | span = doc[6:7] 84 | span._.long_form = doc[0:5] 85 | correct.add(span) 86 | span = doc[29:30] 87 | span._.long_form = doc[26:28] 88 | correct.add(span) 89 | correct_long = {x._.long_form for x in correct} 90 | 91 | assert set(doc2._.abbreviations) == correct 92 | assert {x._.long_form for x in doc2._.abbreviations} == correct_long 93 | 94 | def test_find(self): 95 | doc = self.nlp(self.text) 96 | long, shorts = self.detector.find(doc[6:7], doc) 97 | assert long.text_with_ws == "Spinal and bulbar muscular atrophy " 98 | assert len(shorts) == 2 99 | assert {x.text_with_ws for x in shorts} == {"SBMA", "SBMA "} 100 | 101 | long, shorts = self.detector.find(doc[7:13], doc) 102 | assert shorts == set() 103 | 104 | def test_issue_158(self): 105 | text = ( 106 | "The PVO observations showed that the total transterminator flux " 107 | "was 23% of that at solar maximum and that the largest reductions in the " 108 | "number of ions transported antisunward occurred at the highest altitudes " 109 | "(Spenner et al., 1995)." 110 | ) 111 | doc = self.nlp(text) 112 | doc2 = self.detector(doc) 113 | assert len(doc2._.abbreviations) == 0 114 | 115 | def test_issue_192(self): 116 | # test for () pattern 117 | text = "blah SBMA (Spinal and bulbar muscular atrophy)" 118 | doc = self.nlp(text) 119 | doc2 = self.detector(doc) 120 | 121 | assert len(doc2._.abbreviations) == 1 122 | assert doc2._.abbreviations[0] == doc[1:2] 123 | assert doc2._.abbreviations[0]._.long_form == doc[3:8] 124 | 125 | def test_issue_161(self): 126 | # test some troublesome cases in the abbreviation detector 127 | text = "H2)]+(14)s.t. 
(1), (4).Similarly" 128 | print(f"Text: {text}") 129 | doc = self.nlp(text) 130 | doc2 = self.detector(doc) 131 | assert len(doc2._.abbreviations) == 0 132 | 133 | text = ".(21)In (21), λ" 134 | doc = self.nlp(text) 135 | doc2 = self.detector(doc) 136 | assert len(doc2._.abbreviations) == 0 137 | 138 | text = "map expX (·) : R" 139 | doc = self.nlp(text) 140 | doc2 = self.detector(doc) 141 | assert len(doc2._.abbreviations) == 0 142 | 143 | text = "0,(3)with the following data: (3-i) (q̄" 144 | doc = self.nlp(text) 145 | doc2 = self.detector(doc) 146 | assert len(doc2._.abbreviations) == 0 147 | 148 | text = "Φg(h),ThΦg(v) ) , (h, v)" 149 | doc = self.nlp(text) 150 | doc2 = self.detector(doc) 151 | assert len(doc2._.abbreviations) == 0 152 | 153 | text = "dimension;(S-iii) The optimal control problem obtained in (S-ii) is con-verted" 154 | doc = self.nlp(text) 155 | doc2 = self.detector(doc) 156 | assert len(doc2._.abbreviations) == 0 157 | 158 | text = "z), πut (z)) )" 159 | doc = self.nlp(text) 160 | doc2 = self.detector(doc) 161 | assert len(doc2._.abbreviations) == 0 162 | 163 | text = "repositories he/she already worked with or from previous collaborators. Nevertheless, 88% of the first action of users to a repository (repository discovery) is" 164 | doc = self.nlp(text) 165 | doc2 = self.detector(doc) 166 | assert len(doc2._.abbreviations) == 0 167 | 168 | def test_empty_span(self): 169 | text = "(19, 9, 4) Hadamard Designs and Their Residual Designs" 170 | doc = self.nlp(text) 171 | doc2 = self.detector(doc) 172 | assert len(doc2._.abbreviations) == 0 173 | 174 | def test_space_issue(self): 175 | text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture." 176 | doc = self.nlp(text) 177 | doc2 = self.detector(doc) 178 | assert len(doc2._.abbreviations) == 1 179 | assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT" 180 | 181 | def test_multiple_spaces(self): 182 | text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture." 183 | doc = self.nlp(text) 184 | doc2 = self.detector(doc) 185 | assert len(doc2._.abbreviations) == 1 186 | assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT" 187 | 188 | @pytest.mark.xfail 189 | def test_difficult_cases(self): 190 | # Don't see an obvious way of solving these. They require something more semantic to distinguish 191 | text = "is equivalent to (iv) of Theorem" 192 | doc = self.nlp(text) 193 | doc2 = self.detector(doc) 194 | assert len(doc2._.abbreviations) == 0 195 | 196 | text = "or to fork.Users work more on their repositories (owners) than on" 197 | doc = self.nlp(text) 198 | doc2 = self.detector(doc) 199 | assert len(doc2._.abbreviations) == 0 200 | -------------------------------------------------------------------------------- /tests/fixtures/med_mentions.txt: -------------------------------------------------------------------------------- 1 | 25763772|t|DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection in cystic fibrosis 2 | 25763772|a|Pseudomonas aeruginosa (Pa) infection in cystic fibrosis (CF) patients is associated with worse long-term pulmonary disease and shorter survival, and chronic Pa infection (CPA) is associated with reduced lung function, faster rate of lung decline, increased rates of exacerbations and shorter survival. 
By using exome sequencing and extreme phenotype design, it was recently shown that isoforms of dynactin 4 (DCTN4) may influence Pa infection in CF, leading to worse respiratory disease. The purpose of this study was to investigate the role of DCTN4 missense variants on Pa infection incidence, age at first Pa infection and chronic Pa infection incidence in a cohort of adult CF patients from a single centre. Polymerase chain reaction and direct sequencing were used to screen DNA samples for DCTN4 variants. A total of 121 adult CF patients from the Cochin Hospital CF centre have been included, all of them carrying two CFTR defects: 103 developed at least 1 pulmonary infection with Pa, and 68 patients of them had CPA. DCTN4 variants were identified in 24% (29/121) CF patients with Pa infection and in only 17% (3/18) CF patients with no Pa infection. Of the patients with CPA, 29% (20/68) had DCTN4 missense variants vs 23% (8/35) in patients without CPA. Interestingly, p.Tyr263Cys tend to be more frequently observed in CF patients with CPA than in patients without CPA (4/68 vs 0/35), and DCTN4 missense variants tend to be more frequent in male CF patients with CPA bearing two class II mutations than in male CF patients without CPA bearing two class II mutations (P = 0.06). Our observations reinforce that DCTN4 missense variants, especially p.Tyr263Cys, may be involved in the pathogenesis of CPA in male CF. 3 | 25763772 0 5 DCTN4 T103 UMLS:C4308010 4 | 25763772 23 63 chronic Pseudomonas aeruginosa infection T038 UMLS:C0854135 5 | 25763772 67 82 cystic fibrosis T038 UMLS:C0010674 6 | 25763772 83 120 Pseudomonas aeruginosa (Pa) infection T038 UMLS:C0854135 7 | 25763772 124 139 cystic fibrosis T038 UMLS:C0010674 8 | 25763772 141 143 CF T038 UMLS:C0010674 9 | 25763772 189 206 pulmonary disease T038 UMLS:C0024115 10 | 25763772 233 253 chronic Pa infection T038 UMLS:C0854135 11 | 25763772 255 258 CPA T038 UMLS:C0854135 12 | 25763772 302 329 faster rate of lung decline T033 UMLS:C3160731 13 | 25763772 350 363 exacerbations T033 UMLS:C4086268 14 | 25763772 395 411 exome sequencing T062 UMLS:C3640077 15 | 25763772 469 477 isoforms T103 UMLS:C0597298 16 | 25763772 481 491 dynactin 4 T103 UMLS:C4308010 17 | 25763772 493 498 DCTN4 T103 UMLS:C4308010 18 | 25763772 514 526 Pa infection T038 UMLS:C0854135 19 | 25763772 530 532 CF T038 UMLS:C0010674 20 | 25763772 551 570 respiratory disease T038 UMLS:C0035204 21 | 25763772 592 597 study T062 UMLS:C2603343 22 | 25763772 629 634 DCTN4 T103 UMLS:C4308010 23 | 25763772 644 652 variants T103 UMLS:C0597298 24 | 25763772 656 668 Pa infection T038 UMLS:C0854135 25 | 25763772 693 705 Pa infection T038 UMLS:C0854135 26 | 25763772 710 730 chronic Pa infection T038 UMLS:C0854135 27 | 25763772 746 752 cohort T098 UMLS:C0599755 28 | 25763772 762 764 CF T038 UMLS:C0010674 29 | 25763772 788 794 centre T092 UMLS:C0475309 30 | 25763772 796 821 Polymerase chain reaction T062 UMLS:C0032520 31 | 25763772 826 843 direct sequencing T062 UMLS:C3899368 32 | 25763772 864 875 DNA samples T017 UMLS:C0444245 33 | 25763772 880 885 DCTN4 T103 UMLS:C4308010 34 | 25763772 886 894 variants T103 UMLS:C0597298 35 | 25763772 917 919 CF T038 UMLS:C0010674 36 | 25763772 938 963 Cochin Hospital CF centre T092 UMLS:C0019994 37 | 25763772 1009 1013 CFTR T017 UMLS:C1413365 38 | 25763772 1048 1067 pulmonary infection T038 UMLS:C0876973 39 | 25763772 1073 1075 Pa T007 UMLS:C0033809 40 | 25763772 1105 1108 CPA T038 UMLS:C0854135 41 | 25763772 1110 1115 DCTN4 T103 UMLS:C4308010 42 | 25763772 1116 1124 
variants T103 UMLS:C0597298 43 | 25763772 1157 1159 CF T038 UMLS:C0010674 44 | 25763772 1174 1186 Pa infection T038 UMLS:C0854135 45 | 25763772 1210 1212 CF T038 UMLS:C0010674 46 | 25763772 1230 1242 Pa infection T038 UMLS:C0854135 47 | 25763772 1265 1268 CPA T038 UMLS:C0854135 48 | 25763772 1286 1291 DCTN4 T103 UMLS:C4308010 49 | 25763772 1301 1309 variants T103 UMLS:C0597298 50 | 25763772 1344 1347 CPA T038 UMLS:C0854135 51 | 25763772 1364 1375 p.Tyr263Cys T103 UMLS:C0597298 52 | 25763772 1415 1417 CF T038 UMLS:C0010674 53 | 25763772 1432 1435 CPA T038 UMLS:C0854135 54 | 25763772 1461 1464 CPA T038 UMLS:C0854135 55 | 25763772 1485 1490 DCTN4 T103 UMLS:C4308010 56 | 25763772 1500 1508 variants T103 UMLS:C0597298 57 | 25763772 1542 1544 CF T038 UMLS:C0010674 58 | 25763772 1559 1562 CPA T038 UMLS:C0854135 59 | 25763772 1575 1593 class II mutations T038 UMLS:C0026882 60 | 25763772 1607 1609 CF T038 UMLS:C0010674 61 | 25763772 1627 1630 CPA T038 UMLS:C0854135 62 | 25763772 1643 1661 class II mutations T038 UMLS:C0026882 63 | 25763772 1706 1711 DCTN4 T103 UMLS:C4308010 64 | 25763772 1721 1729 variants T103 UMLS:C0597298 65 | 25763772 1742 1753 p.Tyr263Cys T103 UMLS:C0597298 66 | 25763772 1778 1790 pathogenesis T038 UMLS:C0699748 67 | 25763772 1794 1797 CPA T038 UMLS:C0854135 68 | 25763772 1806 1808 CF T038 UMLS:C0010674 69 | 70 | 25847295|t|Nonylphenol diethoxylate inhibits apoptosis induced in PC12 cells 71 | 25847295|a|Nonylphenol and short-chain nonylphenol ethoxylates such as NP2 EO are present in aquatic environment as wastewater contaminants, and their toxic effects on aquatic species have been reported. Apoptosis has been shown to be induced by serum deprivation or copper treatment. To understand the toxicity of nonylphenol diethoxylate, we investigated the effects of NP2 EO on apoptosis induced by serum deprivation and copper by using PC12 cell system. Nonylphenol diethoxylate itself showed no toxicity and recovered cell viability from apoptosis. In addition, nonylphenol diethoxylate decreased DNA fragmentation caused by apoptosis in PC12 cells. This phenomenon was confirmed after treating apoptotic PC12 cells with nonylphenol diethoxylate, whereas the cytochrome c release into the cytosol decreased as compared to that in apoptotic cells not treated with nonylphenol diethoxylate s. Furthermore, Bax contents in apoptotic cells were reduced after exposure to nonylphenol diethoxylate. Thus, nonylphenol diethoxylate has the opposite effect on apoptosis in PC12 cells compared to nonylphenol, which enhances apoptosis induced by serum deprivation. The difference in structure of the two compounds is hypothesized to be responsible for this phenomenon. These results indicated that nonylphenol diethoxylate has capability to affect cell differentiation and development and has potentially harmful effect on organisms because of its unexpected impact on apoptosis. © 2015 Wiley Periodicals, Inc. Environ Toxicol 31: 1389-1398, 2016. 
72 | 25847295 34 43 apoptosis T038 UMLS:C0162638 73 | 25847295 55 65 PC12 cells T017 UMLS:C0085262 74 | 25847295 137 144 present T033 UMLS:C0150312 75 | 25847295 206 219 toxic effects T037 UMLS:C0600688 76 | 25847295 259 268 Apoptosis T038 UMLS:C0162638 77 | 25847295 301 306 serum T031 UMLS:C0229671 78 | 25847295 322 328 copper T103 UMLS:C0009968 79 | 25847295 437 446 apoptosis T038 UMLS:C0162638 80 | 25847295 458 463 serum T031 UMLS:C0229671 81 | 25847295 480 486 copper T103 UMLS:C0009968 82 | 25847295 496 512 PC12 cell system T017 UMLS:C0085262 83 | 25847295 579 593 cell viability T038 UMLS:C0007620 84 | 25847295 599 608 apoptosis T038 UMLS:C0162638 85 | 25847295 658 675 DNA fragmentation T038 UMLS:C0376669 86 | 25847295 686 695 apoptosis T038 UMLS:C0162638 87 | 25847295 699 709 PC12 cells T017 UMLS:C0085262 88 | 25847295 766 776 PC12 cells T017 UMLS:C0085262 89 | 25847295 820 832 cytochrome c T103 UMLS:C0010749 90 | 25847295 850 857 cytosol T017 UMLS:C1383501 91 | 25847295 891 906 apoptotic cells T017 UMLS:C0007634 92 | 25847295 965 968 Bax T103 UMLS:C0219474 93 | 25847295 981 996 apoptotic cells T017 UMLS:C0007634 94 | 25847295 1112 1121 apoptosis T038 UMLS:C0162638 95 | 25847295 1125 1135 PC12 cells T017 UMLS:C0085262 96 | 25847295 1176 1185 apoptosis T038 UMLS:C0162638 97 | 25847295 1197 1202 serum T031 UMLS:C0229671 98 | 25847295 1234 1243 structure T082 UMLS:C0678594 99 | 25847295 1255 1264 compounds T103 UMLS:C0220806 100 | 25847295 1326 1333 results T033 UMLS:C2825142 101 | 25847295 1399 1419 cell differentiation T038 UMLS:C0007589 102 | 25847295 1424 1435 development T038 UMLS:C0243107 103 | 25847295 1456 1470 harmful effect T037 UMLS:C0600688 104 | 25847295 1520 1529 apoptosis T038 UMLS:C0162638 105 | 106 | 26316050|t|Prevascularized silicon membranes for the enhancement of transport to implanted medical devices 107 | 26316050|a|Recent advances in drug delivery and sensing devices for in situ applications are limited by the diffusion -limiting foreign body response of fibrous encapsulation. In this study, we fabricated prevascularized synthetic device ports to help mitigate this limitation. Membranes with rectilinear arrays of square pores with widths ranging from 40 to 200 μm were created using materials (50 μm thick double-sided polished silicon) and processes (photolithography and directed reactive ion etching) common in the manufacturing of microfabricated sensors. Vascular endothelial cells responded to membrane geometry by either forming vascular tubes that extended through the pore or completely filling membrane pores after 4 days in culture. Although tube formation began to predominate overgrowth around 75 μm and continued to increase at even larger pore sizes, tubes formed at these large pore sizes were not completely round and had relatively thin walls. Thus, the optimum range of pore size for prevascularization of these membranes was estimated to be 75-100 μm. This study lays the foundation for creating a prevascularized port that can be used to reduce fibrous encapsulation and thus enhance diffusion to implanted medical devices and sensors. © 2015 Wiley Periodicals, Inc. J Biomed Mater Res Part B: Appl Biomater, 104B: 1602-1609, 2016. 
108 | 26316050 16 23 silicon T103 UMLS:C0037114 109 | 26316050 70 95 implanted medical devices T033 UMLS:C2828363 110 | 26316050 115 128 drug delivery T074 UMLS:C0085104 111 | 26316050 153 160 in situ T082 UMLS:C0444498 112 | 26316050 161 173 applications T058 UMLS:C0185125 113 | 26316050 213 234 foreign body response T033 UMLS:C1708386 114 | 26316050 400 406 square T082 UMLS:C0205120 115 | 26316050 506 522 polished silicon T103 UMLS:C0037114 116 | 26316050 647 673 Vascular endothelial cells T017 UMLS:C1257792 117 | 26316050 723 737 vascular tubes T017 UMLS:C0005847 118 | 26316050 743 751 extended T082 UMLS:C0231449 119 | 26316050 876 886 overgrowth T033 UMLS:C1849265 120 | 26316050 1012 1017 round T082 UMLS:C0332490 121 | 26316050 1042 1047 walls T082 UMLS:C0442069 122 | 26316050 1164 1169 study T062 UMLS:C2603343 123 | 26316050 1305 1330 implanted medical devices T033 UMLS:C2828363 124 | -------------------------------------------------------------------------------- /scispacy/abbreviation.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Optional, Set, Dict 2 | from collections import defaultdict 3 | from spacy.tokens import Span, Doc 4 | from spacy.matcher import Matcher 5 | from spacy.language import Language 6 | 7 | 8 | def find_abbreviation( 9 | long_form_candidate: Span, short_form_candidate: Span 10 | ) -> Tuple[Span, Optional[Span]]: 11 | """ 12 | Implements the abbreviation detection algorithm in "A simple algorithm 13 | for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003). 14 | 15 | The algorithm works by enumerating the characters in the short form of the abbreviation, 16 | checking that they can be matched against characters in a candidate text for the long form 17 | in order, as well as requiring that the first letter of the abbreviated form matches the 18 | _beginning_ letter of a word. 19 | 20 | Parameters 21 | ---------- 22 | long_form_candidate: Span, required. 23 | The spaCy span for the long form candidate of the definition. 24 | short_form_candidate: Span, required. 25 | The spaCy span for the abbreviation candidate. 26 | 27 | Returns 28 | ------- 29 | A Tuple[Span, Optional[Span]], representing the short form abbreviation and the 30 | span corresponding to the long form expansion, or None if a match is not found. 31 | """ 32 | long_form = " ".join([x.text for x in long_form_candidate]) 33 | short_form = " ".join([x.text for x in short_form_candidate]) 34 | 35 | long_index = len(long_form) - 1 36 | short_index = len(short_form) - 1 37 | 38 | while short_index >= 0: 39 | current_char = short_form[short_index].lower() 40 | # We don't check non alpha-numeric characters. 41 | if not current_char.isalnum(): 42 | short_index -= 1 43 | continue 44 | 45 | # Does the character match at this position? ... 46 | while ( 47 | (long_index >= 0 and long_form[long_index].lower() != current_char) 48 | or 49 | # .... or if we are checking the first character of the abbreviation, we enforce 50 | # to be the _starting_ character of a span. 51 | ( 52 | short_index == 0 53 | and long_index > 0 54 | and long_form[long_index - 1].isalnum() 55 | ) 56 | ): 57 | long_index -= 1 58 | 59 | if long_index < 0: 60 | return short_form_candidate, None 61 | 62 | long_index -= 1 63 | short_index -= 1 64 | 65 | # The last subtraction will either take us on to a whitespace character, or 66 | # off the front of the string (i.e. long_index == -1). 
Either way, we want to add
67 |     # one to get back to the start character of the long form
68 |     long_index += 1
69 | 
70 |     # Now we know the character index of the start of the character span,
71 |     # here we just translate that to the first token beginning after that
72 |     # value, so we can return a spaCy span instead.
73 |     word_lengths = 0
74 |     starting_index = None
75 |     for i, word in enumerate(long_form_candidate):
76 |         # need to add 1 for the space characters
77 |         word_lengths += len(word.text_with_ws)
78 |         if word_lengths > long_index:
79 |             starting_index = i
80 |             break
81 | 
82 |     return short_form_candidate, long_form_candidate[starting_index:]
83 | 
84 | 
85 | def filter_matches(
86 |     matcher_output: List[Tuple[int, int, int]], doc: Doc
87 | ) -> List[Tuple[Span, Span]]:
88 |     # Filter into two cases:
89 |     # 1. <short form> ( <long form> )
90 |     # 2. <long form> ( <short form> ) [this case is most common].
91 |     candidates = []
92 |     for match in matcher_output:
93 |         start = match[1]
94 |         end = match[2]
95 |         # Ignore spans with more than 8 words in them, and spans at the start of the doc
96 |         if end - start > 8 or start == 1:
97 |             continue
98 |         if end - start > 3:
99 |             # Long form is inside the parens.
100 |             # Take one word before.
101 |             short_form_candidate = doc[start - 2 : start - 1]
102 |             long_form_candidate = doc[start:end]
103 |         else:
104 |             # Normal case.
105 |             # Short form is inside the parens.
106 |             short_form_candidate = doc[start:end]
107 | 
108 |             # Sum character lengths of contents of parens.
109 |             abbreviation_length = sum([len(x) for x in short_form_candidate])
110 |             max_words = min(abbreviation_length + 5, abbreviation_length * 2)
111 |             # Look up to max_words backwards
112 |             long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
113 | 
114 |         # Add the candidate pair if the short form passes the filters.
115 |         if short_form_filter(short_form_candidate):
116 |             candidates.append((long_form_candidate, short_form_candidate))
117 | 
118 |     return candidates
119 | 
120 | 
121 | def short_form_filter(span: Span) -> bool:
122 |     # Each word is between 2 and 9 characters long
123 |     if not all([2 <= len(x) < 10 for x in span]):
124 |         return False
125 | 
126 |     # At least 50% of the short form should be alpha
127 |     if (sum([c.isalpha() for c in span.text]) / len(span.text)) < 0.5:
128 |         return False
129 | 
130 |     # The first character of the short form should be alpha
131 |     if not span.text[0].isalpha():
132 |         return False
133 |     return True
134 | 
135 | 
136 | @Language.factory("abbreviation_detector")
137 | class AbbreviationDetector:
138 |     """
139 |     Detects abbreviations using the algorithm in "A simple algorithm for identifying
140 |     abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).
141 | 
142 |     This class sets the `._.abbreviations` attribute on spaCy Doc.
143 | 
144 |     The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
145 |     attribute set to the long form definition of the abbreviation.
146 | 
147 |     Note that this class does not replace the spans, or merge them.
148 | 
149 |     Parameters
150 |     ----------
151 | 
152 |     nlp: `Language`, a required argument for spacy to use this as a factory
153 |     name: `str`, a required argument for spacy to use this as a factory
154 |     make_serializable: `bool`, a required argument for whether we want to use the serializable
155 |         or non-serializable version.
156 | """ 157 | 158 | def __init__( 159 | self, 160 | nlp: Language, 161 | name: str = "abbreviation_detector", 162 | make_serializable: bool = False, 163 | ) -> None: 164 | Doc.set_extension("abbreviations", default=[], force=True) 165 | Span.set_extension("long_form", default=None, force=True) 166 | 167 | self.matcher = Matcher(nlp.vocab) 168 | self.matcher.add("parenthesis", [[{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]]) 169 | self.make_serializable = make_serializable 170 | self.global_matcher = Matcher(nlp.vocab) 171 | 172 | def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]: 173 | """ 174 | Functional version of calling the matcher for a single span. 175 | This method is helpful if you already have an abbreviation which 176 | you want to find a definition for. 177 | """ 178 | dummy_matches = [(-1, int(span.start), int(span.end))] 179 | filtered = filter_matches(dummy_matches, doc) 180 | abbreviations = self.find_matches_for(filtered, doc) 181 | 182 | if not abbreviations: 183 | return span, set() 184 | else: 185 | return abbreviations[0] 186 | 187 | def __call__(self, doc: Doc) -> Doc: 188 | matches = self.matcher(doc) 189 | matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches] 190 | filtered = filter_matches(matches_no_brackets, doc) 191 | occurences = self.find_matches_for(filtered, doc) 192 | 193 | for (long_form, short_forms) in occurences: 194 | for short in short_forms: 195 | short._.long_form = long_form 196 | doc._.abbreviations.append(short) 197 | if self.make_serializable: 198 | abbreviations = doc._.abbreviations 199 | doc._.abbreviations = [ 200 | self.make_short_form_serializable(abbreviation) 201 | for abbreviation in abbreviations 202 | ] 203 | return doc 204 | 205 | def find_matches_for( 206 | self, filtered: List[Tuple[Span, Span]], doc: Doc 207 | ) -> List[Tuple[Span, Set[Span]]]: 208 | rules = {} 209 | all_occurences: Dict[Span, Set[Span]] = defaultdict(set) 210 | already_seen_long: Set[str] = set() 211 | already_seen_short: Set[str] = set() 212 | for (long_candidate, short_candidate) in filtered: 213 | short, long = find_abbreviation(long_candidate, short_candidate) 214 | # We need the long and short form definitions to be unique, because we need 215 | # to store them so we can look them up later. This is a bit of a 216 | # pathalogical case also, as it would mean an abbreviation had been 217 | # defined twice in a document. There's not much we can do about this, 218 | # but at least the case which is discarded will be picked up below by 219 | # the global matcher. So it's likely that things will work out ok most of the time. 220 | new_long = long.text not in already_seen_long if long else False 221 | new_short = short.text not in already_seen_short 222 | if long is not None and new_long and new_short: 223 | already_seen_long.add(long.text) 224 | already_seen_short.add(short.text) 225 | all_occurences[long].add(short) 226 | rules[long.text] = long 227 | # Add a rule to a matcher to find exactly this substring. 228 | self.global_matcher.add(long.text, [[{"ORTH": x.text} for x in short]]) 229 | to_remove = set() 230 | global_matches = self.global_matcher(doc) 231 | for match, start, end in global_matches: 232 | string_key = self.global_matcher.vocab.strings[match] 233 | to_remove.add(string_key) 234 | all_occurences[rules[string_key]].add(doc[start:end]) 235 | for key in to_remove: 236 | # Clean up the global matcher. 
237 | self.global_matcher.remove(key) 238 | 239 | return list((k, v) for k, v in all_occurences.items()) 240 | 241 | def make_short_form_serializable(self, abbreviation: Span): 242 | """ 243 | Converts the abbreviations into a short form that is serializable to enable multiprocessing 244 | 245 | Parameters 246 | ---------- 247 | abbreviation: Span 248 | The abbreviation span identified by the detector 249 | """ 250 | long_form = abbreviation._.long_form 251 | abbreviation._.long_form = long_form.text 252 | serializable_abbr = { 253 | "short_text": abbreviation.text, 254 | "short_start": abbreviation.start, 255 | "short_end": abbreviation.end, 256 | "long_text": long_form.text, 257 | "long_start": long_form.start, 258 | "long_end": long_form.end, 259 | } 260 | return serializable_abbr 261 | -------------------------------------------------------------------------------- /tests/fixtures/umls_test_fixture.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "concept_id": "C0000015", 4 | "aliases": [], 5 | "types": [ 6 | "T116", 7 | "T121", 8 | "T130" 9 | ], 10 | "canonical_name": "(132)I-Macro Albin" 11 | }, 12 | { 13 | "concept_id": "C0000005", 14 | "aliases": [ 15 | "(131)I-MAA" 16 | ], 17 | "types": [ 18 | "T116", 19 | "T121", 20 | "T130" 21 | ], 22 | "canonical_name": "(131)I-Macroaggregated Albumin" 23 | }, 24 | { 25 | "concept_id": "C0000039", 26 | "aliases": [ 27 | "1,2-Dipalmitoylphosphatidylcholine", 28 | "1,2-Dipalmitoylphosphatidylcholine", 29 | "1,2 Dipalmitoylphosphatidylcholine", 30 | "1,2-Dihexadecyl-sn-Glycerophosphocholine", 31 | "1,2-Dihexadecyl-sn-Glycerophosphocholine", 32 | "1,2 Dihexadecyl sn Glycerophosphocholine", 33 | "1,2-Dipalmitoyl-Glycerophosphocholine", 34 | "1,2-Dipalmitoyl-Glycerophosphocholine", 35 | "1,2 Dipalmitoyl Glycerophosphocholine", 36 | "Dipalmitoylphosphatidylcholine", 37 | "Dipalmitoylphosphatidylcholine", 38 | "Dipalmitoylphosphatidylcholine", 39 | "Dipalmitoylphosphatidylcholine", 40 | "Dipalmitoylphosphatidylcholine", 41 | "Dipalmitoylglycerophosphocholine", 42 | "Dipalmitoylglycerophosphocholine", 43 | "Dipalmitoyllecithin", 44 | "Dipalmitoyllecithin", 45 | "3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide", 46 | "3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide", 47 | "Dipalmitoyl Phosphatidylcholine", 48 | "Dipalmitoyl Phosphatidylcholine", 49 | "Phosphatidylcholine, Dipalmitoyl", 50 | "1,2-Dipalmitoylphosphatidylcholine [Chemical/Ingredient]" 51 | ], 52 | "types": [ 53 | "T109", 54 | "T121" 55 | ], 56 | "canonical_name": "1,2-Dipalmitoylphosphatidylcholine" 57 | }, 58 | { 59 | "concept_id": "C0000052", 60 | "aliases": [ 61 | "1,4-alpha-Glucan Branching Enzyme", 62 | "1,4-alpha-Glucan Branching Enzyme", 63 | "1,4-alpha-Glucan branching enzyme", 64 | "1,4-alpha-Glucan branching enzyme", 65 | "1,4-Alpha glucan branching enzyme", 66 | "1,4-Alpha glucan branching enzyme", 67 | "1,4 alpha Glucan Branching Enzyme", 68 | "Branching Enzyme, 1,4-alpha-Glucan", 69 | "Enzyme, 1,4-alpha-Glucan Branching", 70 | "Branching Enzyme", 71 | "Branching Enzyme", 72 | "Branching enzyme", 73 | "Branching enzyme", 74 | "Enzyme, Branching", 75 | "Branching Glycosyltransferase", 76 | "Branching Glycosyltransferase", 77 | "Glycosyltransferase, Branching", 78 | "Starch Branching Enzyme", 79 | "Starch Branching Enzyme", 80 | "Branching Enzyme, Starch", 81 | "Enzyme, Starch 
Branching", 82 | "1,4-alpha-D-Glucan:1,4-alpha-D-glucan 6-alpha-D-(1,4-alpha-D-glucano)-transferase", 83 | "1,4-alpha-D-Glucan:1,4-alpha-D-glucan 6-alpha-D-(1,4-alpha-D-glucano)-transferase", 84 | "Amylo-(1,4,6)-transglycosylase", 85 | "Amylo-(1,4,6)-transglycosylase", 86 | "alpha-Glucan-branching glycosyltransferase", 87 | "Amylo (1-4 to 1-6)-transglucosidase", 88 | "1,4-alpha-Glucan branching enzyme (substance)", 89 | "1,4-alpha-Glucan Branching Enzyme [Chemical/Ingredient]" 90 | ], 91 | "types": [ 92 | "T116", 93 | "T126" 94 | ], 95 | "canonical_name": "1,4-alpha-Glucan Branching Enzyme", 96 | "definition": "In glycogen or amylopectin synthesis, the enzyme that catalyzes the transfer of a segment of a 1,4-alpha-glucan chain to a primary hydroxy group in a similar glucan chain. EC 2.4.1.18." 97 | }, 98 | { 99 | "concept_id": "C0000074", 100 | "aliases": [ 101 | "1 Alkyl 2 Acylphosphatidates" 102 | ], 103 | "types": [ 104 | "T109" 105 | ], 106 | "canonical_name": "1-Alkyl-2-Acylphosphatidates" 107 | }, 108 | { 109 | "concept_id": "C0000084", 110 | "aliases": [ 111 | "1-Carboxyglutamic Acid", 112 | "1 Carboxyglutamic Acid", 113 | "gamma-Carboxyglutamic Acid", 114 | "gamma-Carboxyglutamic Acid", 115 | "gamma Carboxyglutamic Acid", 116 | "3-Amino-1,1,3-propanetricarboxylic Acid", 117 | "3-Amino-1,1,3-propanetricarboxylic Acid", 118 | "1,1,3-Propanetricarboxylic acid, 3-amino-", 119 | "1,1,3-Propanetricarboxylic acid, 3-amino-", 120 | "1-Carboxyglutamic Acid [Chemical/Ingredient]" 121 | ], 122 | "types": [ 123 | "T116", 124 | "T123" 125 | ], 126 | "canonical_name": "1-Carboxyglutamic Acid", 127 | "definition": "Found in various tissues, particularly in four blood-clotting proteins including prothrombin, in kidney protein, in bone protein, and in the protein present in various ectopic calcifications." 
128 | }, 129 | { 130 | "concept_id": "C0000096", 131 | "aliases": [ 132 | "1-Methyl-3-isobutylxanthine", 133 | "1 Methyl 3 isobutylxanthine", 134 | "3-Isobutyl-1-methylxanthine", 135 | "3-Isobutyl-1-methylxanthine", 136 | "3 Isobutyl 1 methylxanthine", 137 | "IBMX", 138 | "IBMX", 139 | "Isobutyltheophylline", 140 | "Isobutyltheophylline", 141 | "1H-Purine-2,6-dione, 3,7-dihydro-1-methyl-3-(2-methylpropyl)-", 142 | "1H-Purine-2,6-dione, 3,7-dihydro-1-methyl-3-(2-methylpropyl)-", 143 | "1-Methyl-3-isobutylxanthine [Chemical/Ingredient]" 144 | ], 145 | "types": [ 146 | "T109", 147 | "T121" 148 | ], 149 | "canonical_name": "1-Methyl-3-isobutylxanthine", 150 | "definition": "A potent cyclic nucleotide phosphodiesterase inhibitor; due to this action, the compound increases cyclic AMP and cyclic GMP in tissue and thereby activates CYCLIC NUCLEOTIDE-REGULATED PROTEIN KINASES" 151 | }, 152 | { 153 | "concept_id": "C0000097", 154 | "aliases": [ 155 | "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 156 | "MPTP", 157 | "MPTP", 158 | "MPTP", 159 | "mptp", 160 | "N-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 161 | "N-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 162 | "Pyridine, 1,2,3,6-tetrahydro-1-methyl-4-phenyl-", 163 | "Pyridine, 1,2,3,6-tetrahydro-1-methyl-4-phenyl-", 164 | "Methylphenyltetrahydropyridine", 165 | "Methylphenyltetrahydropyridine", 166 | "methylphenyltetrahydropyridine", 167 | "Methylphenyltetrahydropyridine (substance)", 168 | "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine [Chemical/Ingredient]", 169 | "1-Methyl-4-Phenyl-1,2,3,6-Tetrahydropyridine (MPTP)" 170 | ], 171 | "types": [ 172 | "T109", 173 | "T131" 174 | ], 175 | "canonical_name": "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 176 | "definition": "A dopaminergic neurotoxic compound which produces irreversible clinical, chemical, and pathological alterations that mimic those found in Parkinson disease." 177 | }, 178 | { 179 | "concept_id": "C0000098", 180 | "aliases": [ 181 | "1-Methyl-4-phenylpyridinium", 182 | "1 Methyl 4 phenylpyridinium", 183 | "1-Methyl-4-phenylpyridinium Ion", 184 | "1-Methyl-4-phenylpyridinium Ion", 185 | "1 Methyl 4 phenylpyridinium Ion", 186 | "Cyperquat", 187 | "Cyperquat", 188 | "CNN", 189 | "N-Methyl-4-phenylpyridine", 190 | "N-Methyl-4-phenylpyridine", 191 | "N Methyl 4 phenylpyridine", 192 | "1-Methyl-4-phenylpyridine", 193 | "1-Methyl-4-phenylpyridine", 194 | "1 Methyl 4 phenylpyridine", 195 | "N-Methyl-4-phenylpyridinium", 196 | "N-Methyl-4-phenylpyridinium", 197 | "Pyridinium, 1-methyl-4-phenyl-", 198 | "Pyridinium, 1-methyl-4-phenyl-", 199 | "1-Methyl-4-phenylpyridinium [Chemical/Ingredient]" 200 | ], 201 | "types": [ 202 | "T109", 203 | "T131" 204 | ], 205 | "canonical_name": "1-Methyl-4-phenylpyridinium", 206 | "definition": "An active neurotoxic metabolite of 1-METHYL-4-PHENYL-1,2,3,6-TETRAHYDROPYRIDINE. The compound reduces dopamine levels, inhibits the biosynthesis of catecholamines, depletes cardiac norepinephrine and inactivates tyrosine hydroxylase. These and other toxic effects lead to cessation of oxidative phosphorylation, ATP depletion, and cell death. The compound, which is related to PARAQUAT, has also been used as an herbicide." 
207 | }, 208 | { 209 | "concept_id": "C0000102", 210 | "aliases": [ 211 | "1-Naphthylamine", 212 | "1-naphthylamine", 213 | "1 Naphthylamine", 214 | "alpha-Naphthylamine", 215 | "alpha-Naphthylamine", 216 | "alpha-naphthylamine", 217 | "alpha Naphthylamine", 218 | "Naphthalidine", 219 | "Naphthalidine", 220 | "8-Aminonaphthalene", 221 | "8-Aminonaphthalene", 222 | "8 Aminonaphthalene", 223 | "1-Aminonaphthalene", 224 | "1-Aminonaphthalene", 225 | "1 Aminonaphthalene", 226 | "1-Naphthalenamine", 227 | "1-Naphthalenamine", 228 | "1-Naththylamine", 229 | "1-Naththylamine", 230 | "a- Naphthylamine", 231 | "a- Naphthylamine", 232 | "a-Naphthylamine", 233 | "1-Naththylamine (substance)", 234 | "1-Naphthylamine [Chemical/Ingredient]" 235 | ], 236 | "types": [ 237 | "T109", 238 | "T131" 239 | ], 240 | "canonical_name": "1-Naphthylamine", 241 | "definition": "A suspected industrial carcinogen (and listed as such by OSHA). Its N-hydroxy metabolite is strongly carcinogenic and mutagenic." 242 | }, 243 | { 244 | "concept_id": "C0000103", 245 | "aliases": [ 246 | "1-Naphthylisothiocyanate", 247 | "1 Naphthylisothiocyanate", 248 | "alpha-Naphthylisothiocyanate", 249 | "alpha-Naphthylisothiocyanate", 250 | "alpha Naphthylisothiocyanate", 251 | "Naphthalene, 1-isothiocyanato-", 252 | "Naphthalene, 1-isothiocyanato-", 253 | "1-Naphthylisothiocyanate [Chemical/Ingredient]" 254 | ], 255 | "types": [ 256 | "T109", 257 | "T130", 258 | "T131" 259 | ], 260 | "canonical_name": "1-Naphthylisothiocyanate", 261 | "definition": "A tool for the study of liver damage which causes bile stasis and hyperbilirubinemia acutely and bile duct hyperplasia and biliary cirrhosis chronically, with changes in hepatocyte function. It may cause skin and kidney damage." 262 | } 263 | ] -------------------------------------------------------------------------------- /scispacy/data_util.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List, Iterator, Dict, Tuple 2 | import tarfile 3 | import atexit 4 | import os 5 | import shutil 6 | import tempfile 7 | 8 | from scispacy.file_cache import cached_path 9 | 10 | 11 | class MedMentionEntity(NamedTuple): 12 | start: int 13 | end: int 14 | mention_text: str 15 | mention_type: str 16 | umls_id: str 17 | 18 | 19 | class MedMentionExample(NamedTuple): 20 | title: str 21 | abstract: str 22 | text: str 23 | pubmed_id: str 24 | entities: List[MedMentionEntity] 25 | 26 | 27 | def process_example(lines: List[str]) -> MedMentionExample: 28 | """ 29 | Processes the text lines of a file corresponding to a single MedMention abstract, 30 | extracts the title, abstract, pubmed id and entities. The lines of the file should 31 | have the following format: 32 | PMID | t | Title text 33 | PMID | a | Abstract text 34 | PMID TAB StartIndex TAB EndIndex TAB MentionTextSegment TAB SemanticTypeID TAB EntityID 35 | ... 
36 | """ 37 | pubmed_id, _, title = [x.strip() for x in lines[0].split("|", maxsplit=2)] 38 | _, _, abstract = [x.strip() for x in lines[1].split("|", maxsplit=2)] 39 | 40 | entities = [] 41 | for entity_line in lines[2:]: 42 | _, start, end, mention, mention_type, umls_id = entity_line.split("\t") 43 | mention_type = mention_type.split(",")[0] 44 | entities.append( 45 | MedMentionEntity(int(start), int(end), mention, mention_type, umls_id) 46 | ) 47 | return MedMentionExample( 48 | title, abstract, title + " " + abstract, pubmed_id, entities 49 | ) 50 | 51 | 52 | def med_mentions_example_iterator(filename: str) -> Iterator[MedMentionExample]: 53 | """ 54 | Iterates over a Med Mentions file, yielding examples. 55 | """ 56 | with open(filename, "r", encoding="utf-8") as med_mentions_file: 57 | lines = [] 58 | for line in med_mentions_file: 59 | line = line.strip() 60 | if line: 61 | lines.append(line) 62 | else: 63 | yield process_example(lines) 64 | lines = [] 65 | # Pick up stragglers 66 | if lines: 67 | yield process_example(lines) 68 | 69 | 70 | def select_subset_of_overlapping_chain( 71 | chain: List[Tuple[int, int, str]] 72 | ) -> List[Tuple[int, int, str]]: 73 | """ 74 | Select the subset of entities in an overlapping chain to return by greedily choosing the 75 | longest entity in the chain until there are no entities remaining 76 | """ 77 | sorted_chain = sorted(chain, key=lambda x: x[1] - x[0], reverse=True) 78 | selections_from_chain: List[Tuple[int, int, str]] = [] 79 | chain_index = 0 80 | # dump the current chain by greedily keeping the longest entity that doesn't overlap 81 | while chain_index < len(sorted_chain): 82 | entity = sorted_chain[chain_index] 83 | match_found = False 84 | for already_selected_entity in selections_from_chain: 85 | max_start = max(entity[0], already_selected_entity[0]) 86 | min_end = min(entity[1], already_selected_entity[1]) 87 | if len(range(max_start, min_end)) > 0: 88 | match_found = True 89 | break 90 | 91 | if not match_found: 92 | selections_from_chain.append(entity) 93 | 94 | chain_index += 1 95 | 96 | return selections_from_chain 97 | 98 | 99 | def remove_overlapping_entities( 100 | sorted_spacy_format_entities: List[Tuple[int, int, str]] 101 | ) -> List[Tuple[int, int, str]]: 102 | """ 103 | Removes overlapping entities from the entity set, by greedilytaking the longest 104 | entity from each overlapping chain. The input list of entities should be sorted 105 | and follow the spacy format. 
106 | """ 107 | spacy_format_entities_without_overlap = [] 108 | current_overlapping_chain: List[Tuple[int, int, str]] = [] 109 | current_overlapping_chain_start = 0 110 | current_overlapping_chain_end = 0 111 | for i, current_entity in enumerate(sorted_spacy_format_entities): 112 | current_entity = sorted_spacy_format_entities[i] 113 | current_entity_start = current_entity[0] 114 | current_entity_end = current_entity[1] 115 | 116 | if len(current_overlapping_chain) == 0: 117 | current_overlapping_chain.append(current_entity) 118 | current_overlapping_chain_start = current_entity_start 119 | current_overlapping_chain_end = current_entity_end 120 | else: 121 | min_end = min(current_entity_end, current_overlapping_chain_end) 122 | max_start = max(current_entity_start, current_overlapping_chain_start) 123 | if min_end - max_start > 0: 124 | current_overlapping_chain.append(current_entity) 125 | current_overlapping_chain_start = min( 126 | current_entity_start, current_overlapping_chain_start 127 | ) 128 | current_overlapping_chain_end = max( 129 | current_entity_end, current_overlapping_chain_end 130 | ) 131 | else: 132 | selections_from_chain = select_subset_of_overlapping_chain( 133 | current_overlapping_chain 134 | ) 135 | 136 | current_overlapping_chain = [] 137 | spacy_format_entities_without_overlap.extend(selections_from_chain) 138 | current_overlapping_chain.append(current_entity) 139 | current_overlapping_chain_start = current_entity_start 140 | current_overlapping_chain_end = current_entity_end 141 | 142 | spacy_format_entities_without_overlap.extend( 143 | select_subset_of_overlapping_chain(current_overlapping_chain) 144 | ) 145 | 146 | return sorted(spacy_format_entities_without_overlap, key=lambda x: x[0]) 147 | 148 | 149 | def read_full_med_mentions( 150 | directory_path: str, 151 | label_mapping: Dict[str, str] = None, 152 | span_only: bool = False, 153 | spacy_format: bool = True, 154 | ): 155 | def _cleanup_dir(dir_path: str): 156 | if os.path.exists(dir_path): 157 | shutil.rmtree(dir_path) 158 | 159 | resolved_directory_path = cached_path(directory_path) 160 | if "tar.gz" in directory_path: 161 | # Extract dataset to temp dir 162 | tempdir = tempfile.mkdtemp() 163 | print( 164 | f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}" 165 | ) 166 | with tarfile.open(resolved_directory_path, "r:gz") as archive: 167 | archive.extractall(tempdir) 168 | # Postpone cleanup until exit in case the unarchived 169 | # contents are needed outside this function. 
170 | atexit.register(_cleanup_dir, tempdir) 171 | 172 | resolved_directory_path = tempdir 173 | 174 | expected_names = [ 175 | "corpus_pubtator.txt", 176 | "corpus_pubtator_pmids_all.txt", 177 | "corpus_pubtator_pmids_dev.txt", 178 | "corpus_pubtator_pmids_test.txt", 179 | "corpus_pubtator_pmids_trng.txt", 180 | ] 181 | 182 | corpus = os.path.join(resolved_directory_path, expected_names[0]) 183 | examples = med_mentions_example_iterator(corpus) 184 | 185 | train_ids = { 186 | x.strip() 187 | for x in open(os.path.join(resolved_directory_path, expected_names[4])) 188 | } 189 | dev_ids = { 190 | x.strip() 191 | for x in open(os.path.join(resolved_directory_path, expected_names[2])) 192 | } 193 | test_ids = { 194 | x.strip() 195 | for x in open(os.path.join(resolved_directory_path, expected_names[3])) 196 | } 197 | 198 | train_examples = [] 199 | dev_examples = [] 200 | test_examples = [] 201 | 202 | def label_function(label): 203 | if span_only: 204 | return "ENTITY" 205 | if label_mapping is None: 206 | return label 207 | else: 208 | return label_mapping[label] 209 | 210 | for example in examples: 211 | spacy_format_entities = [ 212 | (x.start, x.end, label_function(x.mention_type)) for x in example.entities 213 | ] 214 | spacy_format_entities = remove_overlapping_entities( 215 | sorted(spacy_format_entities, key=lambda x: x[0]) 216 | ) 217 | spacy_example = (example.text, {"entities": spacy_format_entities}) 218 | if example.pubmed_id in train_ids: 219 | train_examples.append(spacy_example if spacy_format else example) 220 | 221 | elif example.pubmed_id in dev_ids: 222 | dev_examples.append(spacy_example if spacy_format else example) 223 | 224 | elif example.pubmed_id in test_ids: 225 | test_examples.append(spacy_example if spacy_format else example) 226 | 227 | return train_examples, dev_examples, test_examples 228 | 229 | 230 | SpacyNerExample = Tuple[str, Dict[str, List[Tuple[int, int, str]]]] 231 | 232 | 233 | def _handle_sentence(examples: List[Tuple[str, str]]) -> SpacyNerExample: 234 | """ 235 | Processes a single sentence by building it up as a space separated string 236 | with its corresponding typed entity spans. 237 | """ 238 | start_index = -1 239 | current_index = 0 240 | in_entity = False 241 | entity_type: str = "" 242 | sent = "" 243 | entities: List[Tuple[int, int, str]] = [] 244 | for word, entity in examples: 245 | sent += word 246 | sent += " " 247 | if entity != "O": 248 | if in_entity: 249 | pass 250 | else: 251 | start_index = current_index 252 | in_entity = True 253 | entity_type = entity[2:].upper() 254 | else: 255 | if in_entity: 256 | end_index = current_index - 1 257 | entities.append((start_index, end_index, entity_type.replace("-", "_"))) 258 | in_entity = False 259 | entity_type = "" 260 | start_index = -1 261 | current_index += len(word) + 1 262 | if in_entity: 263 | end_index = current_index - 1 264 | entities.append((start_index, end_index, entity_type)) 265 | 266 | # Remove last space. 267 | sent = sent[:-1] 268 | return (sent, {"entities": entities}) 269 | 270 | 271 | def read_ner_from_tsv(filename: str) -> List[SpacyNerExample]: 272 | """ 273 | Reads BIO formatted NER data from a TSV file, such as the 274 | NER data found here: 275 | https://github.com/cambridgeltl/MTL-Bioinformatics-2016 276 | 277 | Data is expected to be 2 tab seperated tokens per line, with 278 | sentences denoted by empty lines. Sentences read by this 279 | function will be already tokenized, but returned as a string, 280 | as this is the format required by SpaCy. 
Consider using the 281 | WhitespaceTokenizer(scispacy/util.py) to split this data 282 | with a SpaCy model. 283 | 284 | Parameters 285 | ---------- 286 | filename : str 287 | The path to the tsv data. 288 | 289 | Returns 290 | ------- 291 | spacy_format_data : List[SpacyNerExample] 292 | The BIO tagged NER examples. 293 | """ 294 | spacy_format_data = [] 295 | examples: List[Tuple[str, str]] = [] 296 | for line in open(cached_path(filename)): 297 | line = line.strip() 298 | if line.startswith("-DOCSTART-"): 299 | continue 300 | # We have reached the end of a sentence. 301 | if not line: 302 | if not examples: 303 | continue 304 | spacy_format_data.append(_handle_sentence(examples)) 305 | examples = [] 306 | else: 307 | word, entity = line.split("\t") 308 | examples.append((word, entity)) 309 | if examples: 310 | spacy_format_data.append(_handle_sentence(examples)) 311 | 312 | return spacy_format_data 313 | --------------------------------------------------------------------------------