├── docs ├── .gitinclude ├── Gemfile ├── scispacy-logo.png ├── scispacy-logo-square.png ├── _config.yml ├── index.md └── example.svg ├── tests ├── __init__.py ├── fixtures │ ├── umls_META │ │ ├── MRDEF.RRF │ │ ├── MRSTY.RRF │ │ ├── MRCONSO.RRF │ │ └── MRFILES.RRF │ ├── test_umls_tree.tsv │ ├── ner_test.tsv │ ├── med_mentions.txt │ └── umls_test_fixture.json ├── custom_tests │ ├── data_fixtures │ │ ├── test.pmids │ │ └── raw │ │ │ ├── 9170401.txt │ │ │ └── 9171236.txt │ ├── test_whitespace.py │ ├── test_all_model.py │ └── test_custom_tokenizer.py ├── test_util.py ├── test_per_class_scorer.py ├── test_umls_semantic_type_tree.py ├── test_hyponym_detector.py ├── test_umls_utils.py ├── test_candidate_generation.py ├── test_file_cache.py ├── test_linking.py ├── test_data_util.py ├── conftest.py └── test_abbreviation_detection.py ├── pytest.ini ├── scispacy ├── __init__.py ├── umls_linking.py ├── version.py ├── consts.py ├── train_utils.py ├── util.py ├── custom_sentence_segmenter.py ├── base_project_code.py ├── per_class_scorer.py ├── linking_utils.py ├── umls_semantic_type_tree.py ├── hyponym_detector.py ├── file_cache.py ├── custom_tokenizer.py ├── linking.py ├── umls_utils.py ├── abbreviation.py └── data_util.py ├── scripts ├── mypy.sh ├── create_linker.py ├── evaluate_ner.py ├── convert_freqs.py ├── count_word_frequencies.py └── export_umls_json.py ├── MANIFEST.in ├── data ├── meta_large.json ├── meta_small.json ├── meta_medium.json ├── craft_ner.json ├── bc5cdr_ner.json ├── jnlpba_ner.json ├── bionlp13cg_ner.json └── meta_scibert.json ├── requirements.in ├── .github └── workflows │ ├── main.yml │ └── publish.yml ├── .flake8 ├── Dockerfile ├── RELEASE.md ├── .gitignore ├── setup.py ├── configs ├── base_ner.cfg ├── base_specialized_ner.cfg ├── base_ner_scibert.cfg ├── base_parser_tagger.cfg └── base_parser_tagger_scibert.cfg └── evaluation └── sentence_splitting_evaluation.py /docs/.gitinclude: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests/ 3 | -------------------------------------------------------------------------------- /scispacy/__init__.py: -------------------------------------------------------------------------------- 1 | from scispacy.version import VERSION as __version__ 2 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | 2 | source 'https://rubygems.org' 3 | 4 | gem "github-pages", group: :jekyll_plugins 5 | 6 | -------------------------------------------------------------------------------- /docs/scispacy-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/scispacy/main/docs/scispacy-logo.png -------------------------------------------------------------------------------- /scripts/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Run type checking over the python code. 
3 | 4 | mypy scispacy --ignore-missing-imports 5 | -------------------------------------------------------------------------------- /docs/scispacy-logo-square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/scispacy/main/docs/scispacy-logo-square.png -------------------------------------------------------------------------------- /scispacy/umls_linking.py: -------------------------------------------------------------------------------- 1 | # Kept for backward compatibility. 2 | from scispacy.linking import EntityLinker as UmlsEntityLinker # noqa: F401 3 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRDEF.RRF: -------------------------------------------------------------------------------- 1 | C0000039|A0016515|AT38152019||MSH|Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes.|N|| -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal 2 | description: SpaCy models for biomedical text processing 3 | show_downloads: true 4 | logo: /scispacy-logo-square.png 5 | -------------------------------------------------------------------------------- /scispacy/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = "0" 2 | _MINOR = "4" 3 | _REVISION = "0" 4 | 5 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) 6 | VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION) 7 | -------------------------------------------------------------------------------- /tests/fixtures/test_umls_tree.tsv: -------------------------------------------------------------------------------- 1 | Event T051 1 2 | Activity T052 2 3 | Behavior T053 3 4 | Social Behavior T054 4 5 | Individual Behavior T055 4 6 | Daily or Recreational Activity T056 3 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | 3 | include LICENSE 4 | include README.md 5 | include requirements.in 6 | recursive-include data * 7 | recursive-exclude proto_model * 8 | recursive-exclude scispacy/models * 9 | recursive-exclude * __pycache__ 10 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRSTY.RRF: -------------------------------------------------------------------------------- 1 | C0000005|T116|A1.4.1.2.1.7|Amino Acid, Peptide, or Protein|AT17648347|256| 2 | C0000039|T109|A1.4.1.2.1|Organic Chemical|AT45562015|256| 3 | C0000039|T121|A1.4.1.1.1|Pharmacologic Substance|AT17567371|256| -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/test.pmids: -------------------------------------------------------------------------------- 1 | 9170401 2 | 9170401 3 | 9170401 4 | 9170401 5 | 9170401 6 | 9170401 7 | 9170401 8 | 9170401 9 | 9170401 10 | 9170401 11 | 9170401 12 | 9170401 13 | 9170401 14 | 9171236 15 | 9171236 16 | 9171236 17 | 9171236 18 | 9171236 19 | 9171236 20 | 9171236 21 | 9171236 22 | 9171236 23 | 9171236 24 | -------------------------------------------------------------------------------- /data/meta_large.json: --------------------------------------------------------------------------------
1 | { 2 | "lang":"en", 3 | "name":"core_sci_lg", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/meta_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"core_sci_sm", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRCONSO.RRF: -------------------------------------------------------------------------------- 1 | C0000005|ENG|P|L0000005|PF|S0007492|Y|A26634265||M0019694|D012711|MSH|PEP|D012711|(131)I-Macroaggregated Albumin|0|N|256| 2 | C0000005|ENG|S|L0270109|PF|S0007491|Y|A26634266||M0019694|D012711|MSH|ET|D012711|(131)I-MAA|0|N|256| 3 | C0000039|ENG|P|L0000039|PF|S0007564|N|A0016515||M0023172|D015060|MSH|MH|D015060|1,2-Dipalmitoylphosphatidylcholine|0|N|256| -------------------------------------------------------------------------------- /data/meta_medium.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"core_sci_md", 4 | "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/craft_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_craft_md", 4 | "sources": ["CRAFT", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/bc5cdr_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_bc5cdr_md", 4 | "sources": ["BC5CDR", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/jnlpba_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_jnlpba_md", 4 | "sources": ["JNLPBA", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | 
"url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /data/bionlp13cg_ner.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"en", 3 | "name":"ner_bionlp13cg_md", 4 | "sources": ["BIONLP13CG", "OntoNotes 5", "Common Crawl", "GENIA 1.0"], 5 | "description":"Spacy Models for Biomedical Text.", 6 | "author":"Allen Institute for Artificial Intelligence", 7 | "email": "ai2-info@allenai.org", 8 | "url":"https://allenai.github.io/SciSpaCy/", 9 | "license":"CC BY-SA 3.0" 10 | } 11 | -------------------------------------------------------------------------------- /scispacy/consts.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | ABBREVIATIONS: List[str] = [ 4 | "sec.", 5 | "secs.", 6 | "Sec.", 7 | "Secs.", 8 | "fig.", 9 | "figs.", 10 | "Fig.", 11 | "Figs.", 12 | "eq.", 13 | "eqs.", 14 | "Eq.", 15 | "Eqs.", 16 | "no.", 17 | "nos.", 18 | "No.", 19 | "Nos.", 20 | "al.", 21 | "gen.", 22 | "sp.", 23 | "nov.", 24 | ] 25 | -------------------------------------------------------------------------------- /data/meta_scibert.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "en", 3 | "name": "core_sci_scibert", 4 | "sources": [ 5 | "OntoNotes 5", 6 | "Common Crawl", 7 | "GENIA 1.0" 8 | ], 9 | "description": "Spacy Models for Biomedical Text.", 10 | "author": "Allen Institute for Artificial Intelligence", 11 | "email": "ai2-info@allenai.org", 12 | "url": "https://allenai.github.io/SciSpaCy/", 13 | "license": "CC BY-SA 3.0", 14 | "requirements": [ 15 | "spacy-transformers" 16 | ] 17 | } -------------------------------------------------------------------------------- /tests/fixtures/ner_test.tsv: -------------------------------------------------------------------------------- 1 | Intraocular O 2 | pressure O 3 | in O 4 | genetically B-SO 5 | distinct O 6 | mice B-Taxon 7 | : O 8 | an O 9 | update O 10 | and O 11 | strain O 12 | survey O 13 | 14 | Abstract O 15 | 16 | Background O 17 | 18 | Little O 19 | is O 20 | known O 21 | about O 22 | genetic B-SO 23 | factors O 24 | affecting O 25 | intraocular O 26 | pressure O 27 | ( O 28 | IOP O 29 | ) O 30 | in O 31 | mice B-Taxon 32 | and O 33 | other O 34 | mammals B-Taxon 35 | . O 36 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import spacy 4 | 5 | from scispacy.util import WhitespaceTokenizer 6 | 7 | class TestUtil(unittest.TestCase): 8 | 9 | def setUp(self): 10 | super().setUp() 11 | 12 | self.nlp = spacy.load("en_core_web_sm") 13 | 14 | def test_whitespace_tokenizer(self): 15 | 16 | self.nlp.tokenizer = WhitespaceTokenizer(self.nlp.vocab) 17 | text = "don't split this contraction." 18 | doc = self.nlp(text) 19 | 20 | assert [t.text for t in doc] == text.split(" ") 21 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | numpy 2 | spacy>=3.0.0,<3.1.0 3 | spacy-lookups-data 4 | pandas 5 | requests>=2.0.0,<3.0.0 6 | conllu 7 | 8 | # Candidate generation and entity linking 9 | joblib 10 | nmslib>=1.7.3.6 11 | scikit-learn>=0.20.3 12 | 13 | # Required for testing. 
14 | pytest 15 | pytest-cov 16 | flake8 17 | # black currently pinned because of a dependency issue with spacy, typer, and click 18 | black<=21.12b0 19 | mypy 20 | types-requests 21 | 22 | # Required for releases. 23 | twine 24 | 25 | # required for the tests to run, or to use the custom sentence splitter 26 | pysbd 27 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v1 15 | - name: Build and test with Docker 16 | run: | 17 | docker build --tag scispacy . 18 | docker run --rm scispacy pytest tests/ 19 | docker run --rm scispacy flake8 scispacy 20 | docker run --rm scispacy black scispacy --check --line-length 88 21 | docker run --rm scispacy bash scripts/mypy.sh 22 | docker run --rm scispacy pytest tests/ --cov scispacy --cov-fail-under=20 23 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 115 3 | 4 | ignore = 5 | # these rules don't play well with black 6 | E203 # whitespace before : 7 | W503 # line break before binary operator 8 | W504 # line break after binary operator 9 | 10 | exclude = 11 | build/** 12 | docs/** 13 | 14 | per-file-ignores = 15 | # __init__.py files are allowed to have unused imports and lines-too-long 16 | scispacy/__init__.py:F401 17 | scispacy/**/__init__.py:F401,E501 18 | 19 | # scripts don't have to respect 20 | # E501: line length 21 | # E402: imports not at top of file (because we mess with sys.path) 22 | scripts/**:E501,E402 23 | -------------------------------------------------------------------------------- /scripts/create_linker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from scispacy.candidate_generation import create_tfidf_ann_index 5 | from scispacy.linking_utils import KnowledgeBase 6 | 7 | 8 | def main(kb_path: str, output_path: str): 9 | 10 | os.makedirs(output_path, exist_ok=True) 11 | kb = KnowledgeBase(kb_path) 12 | create_tfidf_ann_index(output_path, kb) 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | '--kb_path', 19 | help="Path to the KB file." 20 | ) 21 | parser.add_argument( 22 | '--output_path', 23 | help="Path to the output directory." 24 | ) 25 | 26 | args = parser.parse_args() 27 | main(args.kb_path, args.output_path) 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-buster 2 | 3 | # install base packages 4 | RUN apt-get clean \ 5 | && apt-get update --fix-missing \ 6 | && apt-get install -y \ 7 | git \ 8 | curl \ 9 | gcc \ 10 | g++ \ 11 | build-essential \ 12 | wget \ 13 | awscli 14 | 15 | WORKDIR /work 16 | 17 | # install python packages 18 | COPY requirements.in . 
19 | 20 | RUN pip install -r requirements.in 21 | RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz 22 | RUN python -m spacy download en_core_web_sm 23 | RUN python -m spacy download en_core_web_md 24 | 25 | # add the code as the final step so that when we modify the code 26 | # we don't bust the cached layers holding the dependencies and 27 | # system packages. 28 | COPY scispacy/ scispacy/ 29 | COPY scripts/ scripts/ 30 | COPY tests/ tests/ 31 | COPY .flake8 .flake8 32 | 33 | CMD [ "/bin/bash" ] 34 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow publishes the scispacy package (not the scispacy models) to pypi. 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Publish Package 5 | 6 | on: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | deploy: 12 | 13 | if: github.repository == 'allenai/scispacy' 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.7' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Build and publish 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | twine upload -u scispacy -p ${{ secrets.PYPI_PASSWORD }} dist/* 30 | -------------------------------------------------------------------------------- /scispacy/train_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import tqdm 4 | from spacy.language import Language 5 | 6 | from scispacy.per_class_scorer import PerClassScorer 7 | 8 | 9 | def evaluate_ner( 10 | nlp: Language, eval_data, dump_path: str = None, verbose: bool = False 11 | ) -> PerClassScorer: 12 | 13 | scorer = PerClassScorer() 14 | print("Evaluating %d rows" % len(eval_data)) 15 | for i, (text, gold_spans) in enumerate(tqdm.tqdm(eval_data)): 16 | 17 | # parse dev data with trained model 18 | doc = nlp(text) 19 | predicted_spans = [ 20 | (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents 21 | ] 22 | scorer(predicted_spans, gold_spans["entities"]) 23 | 24 | if i % 1000 == 0 and i > 0: 25 | for name, metric in scorer.get_metric().items(): 26 | print(f"{name}: {metric}") 27 | 28 | metrics = scorer.get_metric() 29 | if dump_path is not None: 30 | json.dump(metrics, open(dump_path, "a+")) 31 | for name, metric in metrics.items(): 32 | if "overall" in name or "untyped" in name or verbose: 33 | print(f"{name}: \t\t {metric}") 34 | 35 | return metrics 36 | -------------------------------------------------------------------------------- /scispacy/util.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.language import Language 3 | from spacy.tokens import Doc 4 | 5 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 6 | from scispacy.custom_tokenizer import combined_rule_tokenizer 7 | 8 | 9 | def save_model(nlp: Language, output_path: str): 10 | nlp.to_disk(output_path) 11 | 12 | 13 | def create_combined_rule_model() -> Language: 14 | nlp = spacy.load("en_core_web_sm") 15 | nlp.tokenizer = combined_rule_tokenizer(nlp) 16 | 
nlp.add_pipe("pysbd_sentencizer", first=True) 17 | return nlp 18 | 19 | 20 | class WhitespaceTokenizer: 21 | """ 22 | Spacy doesn't assume that text is tokenised. Sometimes this 23 | is annoying, like when you have gold data which is pre-tokenised, 24 | but Spacy's tokenisation doesn't match the gold. This can be used 25 | as follows: 26 | nlp = spacy.load("en_core_web_md") 27 | # hack to replace tokenizer with a whitespace tokenizer 28 | nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) 29 | ... use nlp("here is some text") as normal. 30 | """ 31 | 32 | def __init__(self, vocab): 33 | self.vocab = vocab 34 | 35 | def __call__(self, text): 36 | words = text.split(" ") 37 | # All tokens 'own' a subsequent space character in 38 | # this tokenizer. This is a technicality and probably 39 | # not that interesting. 40 | spaces = [True] * len(words) 41 | return Doc(self.vocab, words=words, spaces=spaces) 42 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | 2 | ### Creating a release 3 | 4 | Scispacy has two components: 5 | 6 | - The scispacy pip package 7 | - The scispacy models 8 | 9 | The scispacy pip package is published automatically using the `.github/workflows/publish.yml` github action. It happens whenever a release is published (with an associated tag) in the github releases UI. 10 | 11 | In order to create a new release, the following should happen: 12 | 13 | #### Updating `scispacy/version.py` 14 | Update the version in version.py. 15 | 16 | #### Training new models 17 | 18 | For the release, new models should be trained using the `scripts/pipeline.sh` and `scripts/ner_pipeline.sh` scripts, for the small, medium and large models, and specialized NER models. Remember to export the `ONTONOTES_PATH` and `ONTONOTES_PERCENT` environment variables to mix in the ontonotes training data. 19 | 20 | ``` 21 | bash scripts/pipeline.sh small 22 | bash scripts/pipeline.sh medium 23 | bash scripts/pipeline.sh large 24 | bash scripts/ner_pipeline.sh 25 | ``` 26 | 27 | These should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g. in the readme and in the docs) should be updated. You can find all these places using `git grep `. 28 | 29 | #### Merge a PR with the above changes 30 | Merge a PR with the above changes, and publish a release with a tag corresponding to the commit from the merged PR. This should trigger the publish github action, which will create the `scispacy` package and publish it to pypi. 31 | 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | *.vscode 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/raw/9170401.txt: -------------------------------------------------------------------------------- 1 | Induction of cytokine expression in leukocytes by binding of thrombin-stimulated platelets. 2 | BACKGROUND: Activated platelets tether and activate myeloid leukocytes. 3 | To investigate the potential relevance of this mechanism in acute myocardial infarction (AMI), we examined cytokine induction by leukocyte-platelet adhesion and the occurrence of leukocyte-platelet conjugates in patients with AMI. 4 | METHODS AND RESULTS: We obtained peripheral venous blood samples in 20 patients with AMI before and daily for 5 days after direct percutaneous transluminal coronary angioplasty (PTCA) and in 20 patients undergoing elective PTCA. 5 | Throughout the study period, CD41 immunofluorescence of leukocytes (flow cytometry) revealed increased leukocyte-platelet adhesion in patients with AMI compared with control patients (mean +/- SE of fluorescence [channels] before PTCA: 77 +/- 16 versus 35 +/- 9; P = .003). 6 | In vitro, thrombin-stimulated fixed platelets bound to neutrophils and monocytes. 7 | Within 2 hours, this resulted in increased mRNA for interleukin (IL),1 beta, IL-8, and monocyte chemoattractant protein (MCP)-1 in unfractionated leukocytes. 8 | After 4 hours, IL-1 beta and IL-8 concentration of the cell-free supernatant had increased by 268 +/- 36% and 210 +/- 7%, respectively, and cellular MCP-1 content had increased by 170 +/- 8%. 9 | Addition of activated platelets to adherent monocytes had a similar effect and was associated with nuclear factor-kappa B activation. 10 | Inhibition of binding by anti-P selectin antibodies reduced the effect of activated platelets on cytokine production. 11 | CONCLUSIONS: In patients with AMI, leukocyte-platelet adhesion is increased. 12 | Binding of activated platelets induces IL-1 beta, IL-8, and MCP-1 in leukocytes. 13 | Our findings suggest that leukocyte-platelet adhesion contributes to the regulation of inflammatory responses in AMI. 
14 | -------------------------------------------------------------------------------- /tests/custom_tests/test_whitespace.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Test that tokens are created correctly for whitespace.""" 3 | 4 | 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | 9 | import spacy 10 | from spacy.language import Language as SpacyModelType 11 | 12 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 13 | 14 | 15 | class TestWhitespace: 16 | nlp = spacy.load("en_core_sci_sm") 17 | 18 | @pytest.mark.parametrize("text", ["lorem ipsum"]) 19 | def test_tokenizer_splits_single_space(self, text): 20 | tokens = self.nlp(text) 21 | assert len(tokens) == 2 22 | 23 | @pytest.mark.parametrize("text", ["lorem  ipsum"]) 24 | def test_tokenizer_splits_double_space(self, text): 25 | tokens = self.nlp(text) 26 | assert len(tokens) == 3 27 | assert tokens[1].text == " " 28 | 29 | @pytest.mark.parametrize("text", ["lorem ipsum  "]) 30 | def test_tokenizer_handles_double_trailing_ws(self, text): 31 | tokens = self.nlp(text) 32 | assert repr(tokens.text_with_ws) == repr(text) 33 | 34 | @pytest.mark.parametrize("text", ["lorem\nipsum"]) 35 | def test_tokenizer_splits_newline(self, text): 36 | tokens = self.nlp(text) 37 | assert len(tokens) == 3 38 | assert tokens[1].text == "\n" 39 | 40 | @pytest.mark.parametrize("text", ["lorem \nipsum"]) 41 | def test_tokenizer_splits_newline_space(self, text): 42 | tokens = self.nlp(text) 43 | assert len(tokens) == 3 44 | 45 | @pytest.mark.parametrize("text", ["lorem  \nipsum"]) 46 | def test_tokenizer_splits_newline_double_space(self, text): 47 | tokens = self.nlp(text) 48 | assert len(tokens) == 3 49 | 50 | @pytest.mark.parametrize("text", ["lorem \n ipsum"]) 51 | def test_tokenizer_splits_newline_space_wrap(self, text): 52 | tokens = self.nlp(text) 53 | assert len(tokens) == 3 54 | -------------------------------------------------------------------------------- /tests/test_per_class_scorer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | 5 | from scispacy.per_class_scorer import PerClassScorer 6 | 7 | class TestPerClassScorer(unittest.TestCase): 8 | 9 | def test_per_class_scorer_counts_correctly(self): 10 | 11 | scorer = PerClassScorer() 12 | 13 | predicted = [(1, 3, "PER"), (10, 12, "LOC")] 14 | gold = [(1, 3, "PER"), (10, 12, "ORG")] 15 | original_gold = [x for x in gold] 16 | original_predicted = [x for x in predicted] 17 | 18 | scorer(predicted, gold) 19 | 20 | correct_metrics = {'precision-PER': 1.0, 21 | 'recall-PER': 1.0, 22 | 'f1-measure-PER': 1.0, 23 | 'precision-LOC': 0.0, 24 | 'recall-LOC': 0.0, 25 | 'f1-measure-LOC': 0.0, 26 | 'precision-untyped': 1.0, 27 | 'recall-untyped': 1.0, 28 | 'f1-measure-untyped': 1.0, 29 | 'precision-ORG': 0.0, 30 | 'recall-ORG': 0.0, 31 | 'f1-measure-ORG': 0.0, 32 | 'precision-overall': 0.5, 33 | 'recall-overall': 0.5, 34 | 'f1-measure-overall': 0.5} 35 | metrics = scorer.get_metric() 36 | assert set(metrics.keys()) == set(correct_metrics.keys()) 37 | for metric, value in metrics.items(): 38 | self.assertAlmostEqual(value, correct_metrics[metric]) 39 | 40 | scorer.get_metric(reset=True) 41 | 42 | # Check input is not modified. 43 | assert gold == original_gold 44 | assert predicted == original_predicted 45 | # Check resetting.
46 | assert scorer._true_positives == {} 47 | assert scorer._false_positives == {} 48 | assert scorer._false_negatives == {} 49 | -------------------------------------------------------------------------------- /tests/custom_tests/data_fixtures/raw/9171236.txt: -------------------------------------------------------------------------------- 1 | Defective survival and activation of thymocytes in transgenic mice expressing a catalytically inactive form of Ca2+/calmodulin-dependent protein kinase IV. 2 | We have generated transgenic mice that express a catalytically inactive form of Ca2+/calmodulin-dependent protein kinase IV (CaMKIV) specifically in thymic T cells. 3 | The presence of this protein results in a markedly reduced thymic cellularity, although the distribution of the remaining cells is normal based on evaluation of the CD4 and CD8 cell surface antigens that are used to gauge T cell development. 4 | Isolated thymic T cells from the transgenic mice also show a dramatically decreased survival rate when evaluated in culture under conditions that do not favor activation. 5 | When challenged with an activating stimulus such as alpha-CD3 or a combination of phorbol ester plus ionophore, the cells are severely compromised in their ability to produce the cytokine interleukin-2 (IL-2). 6 | Reduction of IL-2 production is secondary to the inability to phosphorylate the cAMP response element binding protein, CREB, and induce expression of the immediate early genes such as Fos B that are required to transactivate the IL-2 promoter. 7 | Because transgene expression was regulated by the proximal promoter of the murine lck gene and this promoter is inactivated in T cells that exit the thymus, the mutant hCaMKIV is not present in peripheral T cells. 8 | Consequently, T lymphocytes present in the spleen can be activated normally in response to either stimulus mentioned above, demonstrating that the effects of the inactive CaMKIV on activation are reversible. 9 | Our results suggest that CaMKIV may represent a physiologically relevant CREB kinase in T cells and that the enzyme is also required to ensure normal expansion of T cells in the thymus. 10 | Whereas the pathway responsible for this latter role is yet to be elucidated, it is unlikely to include CREB phosphorylation. 
11 | -------------------------------------------------------------------------------- /tests/test_umls_semantic_type_tree.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import unittest 4 | 5 | from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv 6 | 7 | class TestUmlsSemanticTypeTree(unittest.TestCase): 8 | 9 | def setUp(self): 10 | super().setUp() 11 | self.tree = construct_umls_tree_from_tsv("tests/fixtures/test_umls_tree.tsv") 12 | 13 | def test_tree_can_be_read_from_file(self): 14 | 15 | correct_names = ["Activity", "Behavior", "Social Behavior", "Individual Behavior", 16 | "Daily or Recreational Activity", "Event"] 17 | correct_ids = ['T052', 'T053', 'T054', 'T055', 'T056', 'T051'] 18 | for node, name, umls_id in zip(self.tree.flat_nodes, correct_names, correct_ids): 19 | assert node.full_name == name 20 | assert node.type_id == umls_id 21 | 22 | def test_tree_can_collapse_nodes(self): 23 | new_mapping = self.tree.get_collapsed_type_id_map_at_level(2) 24 | assert new_mapping == {'T052': 'T052', 25 | 'T053': 'T052', 26 | 'T054': 'T052', 27 | 'T055': 'T052', 28 | 'T056': 'T052', 29 | 'T051': 'T051'} 30 | assert ["T052"] == [node.type_id for node in self.tree.get_nodes_at_depth(2)] 31 | 32 | def test_get_parent_root(self): 33 | root_node = self.tree.get_node_from_id("T051") 34 | parent = self.tree.get_parent(root_node) 35 | assert parent is None 36 | 37 | def test_get_parent(self): 38 | level_1_node = self.tree.get_node_from_id("T052") 39 | level_1_node_parent = self.tree.get_parent(level_1_node) 40 | assert level_1_node_parent.type_id == "T051" 41 | 42 | leaf_node = self.tree.get_node_from_id("T055") 43 | leaf_node_parent = self.tree.get_parent(leaf_node) 44 | assert leaf_node_parent.type_id == "T053" 45 | 46 | -------------------------------------------------------------------------------- /scripts/evaluate_ner.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import argparse 4 | import spacy 5 | import importlib.util 6 | 7 | from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv 8 | from scispacy.train_utils import evaluate_ner 9 | 10 | 11 | def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str]): 12 | if code is not None: 13 | # need to import code before loading a spacy model; "custom_code" is an arbitrary module name 14 | spec = importlib.util.spec_from_file_location("custom_code", str(code)) 15 | module = importlib.util.module_from_spec(spec) 16 | spec.loader.exec_module(module) 17 | 18 | nlp = spacy.load(model_path) 19 | if dataset.startswith("medmentions"): 20 | train_data, dev_data, test_data = read_full_med_mentions(med_mentions_folder_path, None, False) 21 | data_split = dataset.split("-")[1] 22 | if data_split == "train": 23 | data = train_data 24 | elif data_split == "dev": 25 | data = dev_data 26 | elif data_split == "test": 27 | data = test_data 28 | else: 29 | raise Exception(f"Unrecognized split {data_split}") 30 | else: 31 | data = read_ner_from_tsv(dataset) 32 | 33 | evaluate_ner(nlp, data, dump_path=output_path) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--model_path", type=str, help="Path to model to evaluate") 39 | parser.add_argument("--dataset", type=str, help="medmentions-<split> (train, dev, or test), or a tsv file to evaluate") 40 | parser.add_argument("--output_path", type=str, help="Path to write results to") 41 | parser.add_argument("--code", type=str,
default=None, help="Path to code to import before loading spacy model") 42 | parser.add_argument("--med_mentions_folder_path", type=str, default=None, help="Path to the med mentions folder") 43 | 44 | args = parser.parse_args() 45 | main(args.model_path, args.dataset, args.output_path, args.code, args.med_mentions_folder_path) -------------------------------------------------------------------------------- /tests/test_hyponym_detector.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=no-self-use,invalid-name 2 | import unittest 3 | import spacy 4 | 5 | from scispacy.hyponym_detector import HyponymDetector 6 | 7 | 8 | class TestHyponymDetector(unittest.TestCase): 9 | def setUp(self): 10 | super().setUp() 11 | self.nlp = spacy.load("en_core_sci_sm") 12 | self.detector = HyponymDetector(self.nlp, extended=True) 13 | self.nlp.add_pipe("hyponym_detector", config={"extended": True}, last=True) 14 | 15 | def test_sentences(self): 16 | text = ( 17 | "Recognizing that the preferred habitats for the species " 18 | "are in the valleys, systematic planting of keystone plant " 19 | "species such as fig trees (Ficus) creates the best microhabitats." 20 | ) 21 | doc = self.nlp(text) 22 | fig_trees = doc[21:23] 23 | keystone_plant_species = doc[16:19] 24 | assert doc._.hearst_patterns == [("such_as", keystone_plant_species, fig_trees)] 25 | 26 | doc = self.nlp("SARS, or other coronaviruses, are bad.") 27 | assert doc._.hearst_patterns == [("other", doc[4:5], doc[0:1])] 28 | doc = self.nlp("Coronaviruses, including SARS and MERS, are bad.") 29 | assert doc._.hearst_patterns == [ 30 | ("include", doc[0:1], doc[3:4]), 31 | ("include", doc[0:1], doc[5:6]), 32 | ] 33 | 34 | def test_find_noun_compound_head(self): 35 | 36 | doc = self.nlp("The potassium channel is good.") 37 | 38 | head = self.detector.find_noun_compound_head(doc[1]) 39 | assert head == doc[2] 40 | 41 | doc = self.nlp("Planting of large plants.") 42 | head = self.detector.find_noun_compound_head(doc[3]) 43 | # Planting is a noun, but not a compound with 'plants'. 44 | assert head != doc[0] 45 | assert head == doc[3] 46 | 47 | def test_expand_noun_phrase(self): 48 | doc = self.nlp("Keystone plant habitats are good.") 49 | chunk = self.detector.expand_to_noun_compound(doc[1], doc) 50 | assert chunk == doc[0:3] 51 | -------------------------------------------------------------------------------- /scispacy/custom_sentence_segmenter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pysbd 4 | 5 | from spacy.tokens import Doc 6 | from spacy.language import Language 7 | from pysbd.utils import TextSpan 8 | 9 | from scispacy.consts import ABBREVIATIONS 10 | 11 | 12 | @Language.component("pysbd_sentencizer") 13 | def pysbd_sentencizer(doc: Doc) -> Doc: 14 | """Adds sentence boundaries to a Doc. 15 | Intended to be used as a pipe in a spaCy pipeline. 16 | Uses https://github.com/nipunsadvilkar/pySBD to get proper sentence and 17 | respective char_spans 18 | 19 | Handle special cases: 20 | New lines cannot be end of sentence tokens. 21 | New lines that separate sentences will be added to the 22 | beginning of the next sentence. 
23 | 24 | @param doc: the spaCy document to be annotated with sentence boundaries 25 | """ 26 | segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True) 27 | sents_char_spans: List[TextSpan] = segmenter.segment(doc.text) 28 | 29 | char_spans = [ 30 | doc.char_span( 31 | sent_span.start, 32 | # strip off trailing spaces when creating spans to accommodate spacy 33 | sent_span.end - (len(sent_span.sent) - len(sent_span.sent.rstrip(" "))), 34 | ) 35 | for sent_span in sents_char_spans 36 | ] 37 | start_token_char_offsets = [span[0].idx for span in char_spans if span is not None] 38 | for token in doc: 39 | prev_token = token.nbor(-1) if token.i != 0 else None 40 | if token.idx in start_token_char_offsets: 41 | if prev_token and ( 42 | prev_token.text in ABBREVIATIONS 43 | # Glom new lines at the beginning of the text onto the following sentence 44 | or (prev_token.i == 0 and all(c == "\n" for c in prev_token.text)) 45 | ): 46 | token.is_sent_start = False 47 | else: 48 | token.is_sent_start = True 49 | # check if the previous token contains at least 2 newline chars 50 | elif prev_token and prev_token.i != 0 and prev_token.text.count("\n") >= 2: 51 | token.is_sent_start = True 52 | else: 53 | token.is_sent_start = False 54 | return doc 55 | -------------------------------------------------------------------------------- /tests/test_umls_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scispacy import umls_utils 4 | 5 | class TestUtil(unittest.TestCase): 6 | 7 | expected_concepts = [ 8 | {'concept_id': 'C0000005', 'canonical_name': '(131)I-Macroaggregated Albumin', 9 | 'types': ['T116'], 'aliases': ['(131)I-MAA']}, 10 | {'concept_id': 'C0000039', 'aliases': ['1,2-Dipalmitoylphosphatidylcholine'], 11 | 'types': ['T109', 'T121'], 'definition': 12 | 'Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes.'} 13 | ] 14 | 15 | def test_read_umls_concepts(self): 16 | meta_path = 'tests/fixtures/umls_META' 17 | concept_details = {} 18 | umls_utils.read_umls_concepts(meta_path, concept_details) 19 | assert len(self.expected_concepts) == len(concept_details) 20 | 21 | for expected_concept in self.expected_concepts: 22 | assert expected_concept['concept_id'] in concept_details 23 | concept = concept_details[expected_concept['concept_id']] 24 | if 'canonical_name' in expected_concept: 25 | assert concept['canonical_name'] == expected_concept['canonical_name'] 26 | assert concept['aliases'] == expected_concept['aliases'] 27 | 28 | def test_read_umls_types(self): 29 | meta_path = 'tests/fixtures/umls_META' 30 | concept_details = {} 31 | umls_utils.read_umls_concepts(meta_path, concept_details) 32 | umls_utils.read_umls_types(meta_path, concept_details) 33 | for expected_concept in self.expected_concepts: 34 | concept = concept_details[expected_concept['concept_id']] 35 | assert concept['types'] == expected_concept['types'] 36 | 37 | def test_read_umls_definitions(self): 38 | meta_path = 'tests/fixtures/umls_META' 39 | concept_details = {} 40 | umls_utils.read_umls_concepts(meta_path, concept_details) 41 | umls_utils.read_umls_definitions(meta_path, concept_details) 42 | for expected_concept in self.expected_concepts: 43 | concept = concept_details[expected_concept['concept_id']] 44 | if 'definition' in expected_concept: 45 | assert concept['definition'] == expected_concept['definition'] 46 | -------------------------------------------------------------------------------- /setup.py:
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | """ 5 | Instructions for creating a release of the scispacy library. 6 | 7 | 1. Make sure your working directory is clean. 8 | 2. Make sure that you have changed the versions in "scispacy/version.py". 9 | 3. Create the distribution by running "python setup.py sdist" in the root of the repository. 10 | 4. Check you can install the new distribution in a clean environment. 11 | 5. Upload the distribution to pypi by running "twine upload -u -p ". 12 | This step will ask you for a username and password - the username is "scispacy" you can 13 | get the password from LastPass. 14 | """ 15 | 16 | VERSION = {} 17 | # version.py defines VERSION and VERSION_SHORT variables. 18 | # We use exec here to read it so that we don't import scispacy 19 | # whilst setting up the package. 20 | with open("scispacy/version.py", "r") as version_file: 21 | exec(version_file.read(), VERSION) 22 | 23 | setup( 24 | name="scispacy", 25 | version=VERSION["VERSION"], 26 | url="https://allenai.github.io/SciSpaCy/", 27 | author="Allen Institute for Artificial Intelligence", 28 | author_email="ai2-info@allenai.org", 29 | description="A full SpaCy pipeline and models for scientific/biomedical documents.", 30 | long_description=open("README.md").read(), 31 | long_description_content_type="text/markdown", 32 | keywords=["bioinformatics nlp spacy SpaCy biomedical"], 33 | classifiers=[ 34 | "Intended Audience :: Science/Research", 35 | "Development Status :: 3 - Alpha", 36 | "License :: OSI Approved :: Apache Software License", 37 | "Programming Language :: Python :: 3.6", 38 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 39 | "Topic :: Scientific/Engineering :: Bio-Informatics", 40 | ], 41 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 42 | license="Apache", 43 | install_requires=[ 44 | "spacy>=3.0.0,<3.1.0", 45 | "requests>=2.0.0,<3.0.0", 46 | "conllu", 47 | "numpy", 48 | "joblib", 49 | "nmslib>=1.7.3.6", 50 | "scikit-learn>=0.20.3", 51 | "pysbd", 52 | ], 53 | tests_require=["pytest", "pytest-cov", "flake8", "black", "mypy"], 54 | python_requires=">=3.6.0", 55 | ) 56 | -------------------------------------------------------------------------------- /tests/test_candidate_generation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | 4 | from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index, MentionCandidate 5 | from scispacy.umls_utils import UmlsKnowledgeBase 6 | 7 | 8 | class TestCandidateGeneration(unittest.TestCase): 9 | 10 | def test_create_index(self): 11 | 12 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 13 | with tempfile.TemporaryDirectory() as dir_name: 14 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 15 | 16 | assert len(umls_concept_aliases) == 93 17 | assert len(ann_index) == 93 # Number of deduplicated aliases + canonical ids 18 | tfidf_params = tfidf_vectorizer.get_params() 19 | 20 | assert tfidf_params["analyzer"] == "char_wb" 21 | assert tfidf_params["min_df"] == 10 22 | assert tfidf_params["ngram_range"] == (3, 3) 23 | 24 | def test_candidate_generation(self): 25 | 26 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 27 | with tempfile.TemporaryDirectory() as dir_name: 28 | umls_concept_aliases, 
tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 29 | 30 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 31 | results = candidate_generator(['(131)I-Macroaggregated Albumin'], 10) 32 | 33 | canonical_ids = [x.concept_id for x in results[0]] 34 | assert canonical_ids == ['C0000005', 'C0000015', 'C0000074', 'C0000102', 'C0000103'] 35 | 36 | # The mention was an exact match, so should have a distance of zero to a concept: 37 | assert results[0][0] == MentionCandidate(concept_id='C0000005', 38 | aliases=['(131)I-Macroaggregated Albumin'], 39 | similarities=[1.0]) 40 | 41 | # Test we don't crash with zero vectors 42 | results = candidate_generator(['ZZZZ'], 10) 43 | assert results == [[]] 44 | 45 | def test_empty_list(self): 46 | 47 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json") 48 | with tempfile.TemporaryDirectory() as dir_name: 49 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 50 | 51 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 52 | results = candidate_generator([], 10) 53 | 54 | assert results == [] 55 | -------------------------------------------------------------------------------- /scripts/convert_freqs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import json 4 | from ast import literal_eval 5 | from tqdm import tqdm 6 | from preshed.counter import PreshCounter 7 | from spacy.util import ensure_path 8 | from scispacy.file_cache import cached_path 9 | 10 | 11 | def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): 12 | print("Counting frequencies...") 13 | counts = PreshCounter() 14 | total = 0 15 | with freqs_loc.open() as f: 16 | for i, line in tqdm(enumerate(f)): 17 | freq, doc_freq, key = line.rstrip().split("\t", 2) 18 | freq = int(freq) 19 | counts.inc(i + 1, freq) 20 | total += freq 21 | counts.smooth() 22 | log_total = math.log(total) 23 | probs = {} 24 | with freqs_loc.open() as f: 25 | for line in tqdm(f): 26 | freq, doc_freq, key = line.rstrip().split("\t", 2) 27 | doc_freq = int(doc_freq) 28 | freq = int(freq) 29 | if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: 30 | try: 31 | word = literal_eval(key) 32 | except SyntaxError: 33 | # Take odd strings literally. 
34 | word = literal_eval("'%s'" % key) 35 | smooth_count = counts.smoother(int(freq)) 36 | probs[word] = math.log(smooth_count) - log_total 37 | oov_prob = math.log(counts.smoother(0)) - log_total 38 | return probs, oov_prob 39 | 40 | 41 | def main(input_path: str, output_path: str, min_word_frequency: int): 42 | if input_path is not None: 43 | input_path = cached_path(input_path) 44 | input_path = ensure_path(input_path) 45 | 46 | probs, oov_prob = ( 47 | read_freqs(input_path, min_freq=min_word_frequency) 48 | if input_path is not None 49 | else ({}, -20) 50 | ) 51 | 52 | with open(output_path, "w") as _jsonl_file: 53 | _jsonl_file.write( 54 | json.dumps({"lang": "en", "settings": {"oov_prob": -20.502029418945312}}) 55 | ) 56 | _jsonl_file.write("\n") 57 | 58 | for word, prob in probs.items(): 59 | _jsonl_file.write(json.dumps({"orth": word, "prob": prob})) 60 | _jsonl_file.write("\n") 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument( 66 | "--input_path", type=str, default=None, help="Path to the freqs file" 67 | ) 68 | parser.add_argument( 69 | "--output_path", type=str, help="Output path for the jsonl file" 70 | ) 71 | parser.add_argument( 72 | "--min_word_frequency", 73 | type=int, 74 | default=50, 75 | help="Minimum word frequency for inclusion", 76 | ) 77 | 78 | args = parser.parse_args() 79 | main(args.input_path, args.output_path, args.min_word_frequency) -------------------------------------------------------------------------------- /tests/test_file_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import json 4 | import unittest 5 | import shutil 6 | 7 | import pytest 8 | 9 | from scispacy.file_cache import filename_to_url, url_to_filename 10 | 11 | class TestFileUtils(unittest.TestCase): 12 | def setUp(self): 13 | super().setUp() 14 | self.TEST_DIR = "/tmp/scispacy" 15 | os.makedirs(self.TEST_DIR, exist_ok=True) 16 | 17 | def tearDown(self): 18 | shutil.rmtree(self.TEST_DIR) 19 | 20 | def test_url_to_filename(self): 21 | for url in ['http://allenai.org', 'http://cool.org', 22 | 'https://www.google.com', 'http://pytorch.org', 23 | 'https://s3-us-west-2.amazonaws.com/cool' + '/long' * 20 + '/url']: 24 | filename = url_to_filename(url) 25 | assert "http" not in filename 26 | with pytest.raises(FileNotFoundError): 27 | filename_to_url(filename, cache_dir=self.TEST_DIR) 28 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 29 | with pytest.raises(FileNotFoundError): 30 | filename_to_url(filename, cache_dir=self.TEST_DIR) 31 | json.dump({'url': url, 'etag': None}, 32 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 33 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 34 | assert back_to_url == url 35 | assert etag is None 36 | 37 | def test_url_to_filename_with_etags(self): 38 | for url in ['http://allenai.org', 'http://cool.org', 39 | 'https://www.google.com', 'http://pytorch.org']: 40 | filename = url_to_filename(url, etag="mytag") 41 | assert "http" not in filename 42 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 43 | json.dump({'url': url, 'etag': 'mytag'}, 44 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 45 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 46 | assert back_to_url == url 47 | assert etag == "mytag" 48 | baseurl = 'http://allenai.org/' 49 | assert url_to_filename(baseurl + '1') != url_to_filename(baseurl, etag='1') 50 | 51 | def 
test_url_to_filename_with_etags_eliminates_quotes(self): 52 | for url in ['http://allenai.org', 'http://cool.org', 53 | 'https://www.google.com', 'http://pytorch.org']: 54 | filename = url_to_filename(url, etag='"mytag"') 55 | assert "http" not in filename 56 | pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch() 57 | json.dump({'url': url, 'etag': 'mytag'}, 58 | open(os.path.join(self.TEST_DIR, filename + '.json'), 'w')) 59 | back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR) 60 | assert back_to_url == url 61 | assert etag == "mytag" 62 | -------------------------------------------------------------------------------- /scripts/count_word_frequencies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from typing import List, Tuple 4 | import os 5 | import io 6 | import sys 7 | import tempfile 8 | import shutil 9 | from collections import Counter 10 | from pathlib import Path 11 | from multiprocessing import Pool 12 | 13 | import plac 14 | 15 | import spacy.util 16 | from spacy.language import Language 17 | 18 | sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir)))) 19 | 20 | from scispacy.custom_tokenizer import combined_rule_tokenizer 21 | 22 | def count_frequencies(language_class: Language, input_path: Path): 23 | """ 24 | Given a file containing single documents per line 25 | (for scispacy, these are Pubmed abstracts), split the text 26 | using a science specific tokenizer and compute word and 27 | document frequencies for all words. 28 | """ 29 | print(f"Processing {input_path}.") 30 | tokenizer = combined_rule_tokenizer(language_class()) 31 | counts = Counter() 32 | doc_counts = Counter() 33 | for line in open(input_path, "r"): 34 | words = [t.text for t in tokenizer(line)] 35 | counts.update(words) 36 | doc_counts.update(set(words)) 37 | 38 | return counts, doc_counts 39 | 40 | def parallelize(func, iterator, n_jobs): 41 | pool = Pool(processes=n_jobs) 42 | counts = pool.starmap(func, iterator) 43 | return counts 44 | 45 | def merge_counts(frequencies: List[Tuple[Counter, Counter]], output_path: str): 46 | """ 47 | Merge a number of frequency counts generated from `count_frequencies` 48 | into a single file, written to `output_path`. 
49 | """ 50 | counts = Counter() 51 | doc_counts = Counter() 52 | for word_count, doc_count in frequencies: 53 | counts.update(word_count) 54 | doc_counts.update(doc_count) 55 | with io.open(output_path, 'w+', encoding='utf8') as file_: 56 | for word, count in counts.most_common(): 57 | if not word.isspace(): 58 | file_.write(f"{count}\t{doc_counts[word]}\t{repr(word)}\n") 59 | 60 | 61 | @plac.annotations( 62 | raw_dir=("Location of input file list", "positional", None, Path), 63 | output_dir=("Location for output file", "positional", None, Path), 64 | n_jobs=("Number of workers", "option", "n", int)) 65 | def main(raw_dir: Path, output_dir: Path, n_jobs=2): 66 | 67 | language_class = spacy.util.get_lang_class("en") 68 | tasks = [] 69 | freqs_dir = Path(tempfile.mkdtemp(prefix="scispacy_freqs")) 70 | for input_path in [os.path.join(raw_dir, filename) 71 | for filename in os.listdir(raw_dir)]: 72 | input_path = Path(input_path.strip()) 73 | if not input_path: 74 | continue 75 | tasks.append((language_class, input_path)) 76 | 77 | if tasks: 78 | counts = parallelize(count_frequencies, tasks, n_jobs) 79 | 80 | print("Merge") 81 | merge_counts(counts, output_dir) 82 | shutil.rmtree(freqs_dir) 83 | 84 | if __name__ == '__main__': 85 | plac.call(main) 86 | -------------------------------------------------------------------------------- /tests/custom_tests/test_all_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import spacy 4 | from spacy.vocab import Vocab 5 | import shutil 6 | import pytest 7 | 8 | 9 | def test_custom_segmentation(combined_all_model_fixture): 10 | text = "Induction of cytokine expression in leukocytes by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 11 | doc = combined_all_model_fixture(text) 12 | sents = list(doc.sents) 13 | assert len(sents) == 2 14 | expected_tokens = [ 15 | "Induction", 16 | "of", 17 | "cytokine", 18 | "expression", 19 | "in", 20 | "leukocytes", 21 | "by", 22 | "binding", 23 | "of", 24 | "thrombin-stimulated", 25 | "platelets", 26 | ".", 27 | "BACKGROUND", 28 | ":", 29 | "Activated", 30 | "platelets", 31 | "tether", 32 | "and", 33 | "activate", 34 | "myeloid", 35 | "leukocytes", 36 | ".", 37 | ] 38 | actual_tokens = [t.text for t in doc] 39 | assert expected_tokens == actual_tokens 40 | assert doc.has_annotation("DEP") 41 | assert doc[0].dep_ == "ROOT" 42 | assert doc[0].tag_ == "NN" 43 | 44 | def test_full_pipe_serializable(combined_all_model_fixture): 45 | text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 
46 | doc = [doc for doc in combined_all_model_fixture.pipe([text, text], n_process = 2)][0] 47 | # If we got here, it means both that the model is serializable and that there is an abbreviation that would break if it weren't 48 | assert len(doc._.abbreviations) > 0 49 | abbrev = doc._.abbreviations[0] 50 | assert abbrev["short_text"] == "CEIL" 51 | assert abbrev["long_text"] == "cytokine expression in leukocytes" 52 | assert doc[abbrev["short_start"] : abbrev["short_end"]].text == abbrev["short_text"] 53 | assert doc[abbrev["long_start"] : abbrev["long_end"]].text == abbrev["long_text"] 54 | 55 | def test_full_pipe_not_serializable(combined_all_model_fixture_non_serializable_abbrev): 56 | text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 57 | # This line requires the pipeline to be serializable, so the test should fail here 58 | doc = combined_all_model_fixture_non_serializable_abbrev(text) 59 | with pytest.raises(TypeError): 60 | doc.to_bytes() 61 | 62 | # Below is the test version to be used once we move to spacy v3.1.0 or higher 63 | # def test_full_pipe_not_serializable(combined_all_model_fixture_non_serializable_abbrev): 64 | # text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes." 65 | # # This line requires the pipeline to be serializable (because it uses 2 processes), so the test should fail here 66 | # with pytest.raises(TypeError): 67 | # list(combined_all_model_fixture_non_serializable_abbrev.pipe([text, text], n_process = 2)) -------------------------------------------------------------------------------- /tests/test_linking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | 4 | import spacy 5 | 6 | from scispacy.candidate_generation import CandidateGenerator, create_tfidf_ann_index 7 | from scispacy.linking import EntityLinker 8 | from scispacy.umls_utils import UmlsKnowledgeBase 9 | from scispacy.abbreviation import AbbreviationDetector 10 | 11 | 12 | class TestLinker(unittest.TestCase): 13 | def setUp(self): 14 | super().setUp() 15 | self.nlp = spacy.load("en_core_web_sm") 16 | 17 | umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json", "tests/fixtures/test_umls_tree.tsv") 18 | with tempfile.TemporaryDirectory() as dir_name: 19 | umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture) 20 | candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture) 21 | 22 | self.linker = EntityLinker(candidate_generator=candidate_generator, filter_for_definitions=False) 23 | 24 | def test_naive_entity_linking(self): 25 | text = "There was a lot of Dipalmitoylphosphatidylcholine." 26 | doc = self.nlp(text) 27 | 28 | # Check that the linker returns nothing if we set the filter_for_definitions flag 29 | # and set the threshold very high for entities without definitions. 30 | self.linker.filter_for_definitions = True 31 | self.linker.no_definition_threshold = 3.0 32 | doc = self.linker(doc) 33 | assert doc.ents[0]._.kb_ents == [] 34 | 35 | # Check that the linker returns only high confidence entities if we 36 | # set the threshold to something more reasonable.
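# (filter_for_definitions is still on here, so — as the assertions above and
# below show — candidates without a KB definition only survive when their
# score clears no_definition_threshold.)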
37 | self.linker.no_definition_threshold = 0.95 38 | doc = self.linker(doc) 39 | assert doc.ents[0]._.kb_ents == [("C0000039", 1.0)] 40 | 41 | self.linker.filter_for_definitions = False 42 | self.linker.threshold = 0.45 43 | doc = self.linker(doc) 44 | # Without the filter_for_definitions filter, we get 2 entities for 45 | # the first mention. 46 | assert len(doc.ents[0]._.kb_ents) == 2 47 | 48 | id_with_score = doc.ents[0]._.kb_ents[0] 49 | assert id_with_score == ("C0000039", 1.0) 50 | umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]] 51 | assert umls_entity.concept_id == "C0000039" 52 | assert umls_entity.types == ["T109", "T121"] 53 | 54 | def test_linker_resolves_abbreviations(self): 55 | 56 | self.nlp.add_pipe("abbreviation_detector") 57 | # replace the abbreviation with "CNN" so spacy recognizes it as an entity 58 | # and also prefix the term with "CNN" so that the abbreviation detector passes 59 | text = "CNN1-Methyl-4-phenylpyridinium (CNN) is an abbreviation which doesn't exist in the baby index." 60 | doc = self.nlp(text) 61 | doc = self.linker(doc) 62 | 63 | id_with_score = doc.ents[0]._.kb_ents[0] 64 | assert id_with_score == ("C0000098", 0.9819725155830383) 65 | umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]] 66 | assert umls_entity.concept_id == "C0000098" 67 | 68 | def test_linker_has_types(self): 69 | # Just checking that the type tree is accessible from the linker 70 | assert len(self.linker.kb.semantic_type_tree.flat_nodes) == 6 71 | -------------------------------------------------------------------------------- /configs/base_ner.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | vocab_path = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser","ner"] 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 15 | disabled = [] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [components] 21 | 22 | [components.attribute_ruler] 23 | source = "en_core_web_sm" 24 | 25 | [components.lemmatizer] 26 | source = "en_core_web_sm" 27 | 28 | [components.ner] 29 | factory = "ner" 30 | moves = null 31 | update_with_oracle_cut_size = 100 32 | 33 | [components.ner.model] 34 | @architectures = "spacy.TransitionBasedParser.v1" 35 | state_type = "ner" 36 | extra_state_tokens = false 37 | hidden_width = 64 38 | maxout_pieces = 2 39 | use_upper = true 40 | nO = null 41 | 42 | [components.ner.model.tok2vec] 43 | @architectures = "spacy.Tok2Vec.v1" 44 | 45 | [components.ner.model.tok2vec.embed] 46 | @architectures = "spacy.MultiHashEmbed.v1" 47 | width = 96 48 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 49 | rows = [5000, 2500, 2500, 2500] 50 | include_static_vectors = true 51 | 52 | [components.ner.model.tok2vec.encode] 53 | @architectures = "spacy.MaxoutWindowEncoder.v1" 54 | width = 96 55 | depth = 4 56 | window_size = 1 57 | maxout_pieces = 3 58 | 59 | [components.parser] 60 | source = ${paths.parser_tagger_path} 61 | 62 | [components.tagger] 63 | source = ${paths.parser_tagger_path} 64 | 65 | [components.tok2vec] 66 | source = ${paths.parser_tagger_path} 67 | 68 | [corpora] 69 | 70 | [corpora.dev] 71 | @readers = "med_mentions_reader" 72 | directory_path = "assets/" 73 | split = "dev" 74 | 75 | [corpora.train] 76 | @readers = "med_mentions_reader" 77 | directory_path = "assets/" 78 |
split = "train" 79 | 80 | [training] 81 | dev_corpus = "corpora.dev" 82 | train_corpus = "corpora.train" 83 | seed = ${system.seed} 84 | gpu_allocator = ${system.gpu_allocator} 85 | dropout = 0.2 86 | accumulate_gradient = 1 87 | patience = 0 88 | max_epochs = 7 89 | max_steps = 0 90 | eval_frequency = 500 91 | frozen_components = ["tok2vec", "parser", "tagger", "attribute_ruler", "lemmatizer"] 92 | before_to_disk = null 93 | 94 | [training.batcher] 95 | @batchers = "spacy.batch_by_sequence.v1" 96 | get_length = null 97 | 98 | [training.batcher.size] 99 | @schedules = "compounding.v1" 100 | start = 1 101 | stop = 32 102 | compound = 1.001 103 | t = 0.0 104 | 105 | [training.logger] 106 | @loggers = "spacy.ConsoleLogger.v1" 107 | progress_bar = true 108 | 109 | [training.optimizer] 110 | @optimizers = "Adam.v1" 111 | beta1 = 0.9 112 | beta2 = 0.999 113 | L2_is_weight_decay = true 114 | L2 = 0.01 115 | grad_clip = 1.0 116 | use_averages = false 117 | eps = 0.00000001 118 | learn_rate = 0.001 119 | 120 | [training.score_weights] 121 | dep_las_per_type = null 122 | sents_p = null 123 | sents_r = null 124 | ents_per_type = null 125 | tag_acc = null 126 | dep_uas = null 127 | dep_las = null 128 | sents_f = null 129 | ents_f = 1.0 130 | ents_p = 0.0 131 | ents_r = 0.0 132 | 133 | [pretraining] 134 | 135 | [initialize] 136 | vectors = ${paths.vectors} 137 | init_tok2vec = ${paths.init_tok2vec} 138 | vocab_data = ${paths.vocab_path} 139 | lookups = null 140 | 141 | [initialize.components] 142 | 143 | [initialize.tokenizer] 144 | 145 | [initialize.before_init] 146 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_specialized_ner.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | dev_path = null 6 | train_path = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = null 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser","ner"] 16 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 17 | disabled = [] 18 | before_creation = null 19 | after_creation = null 20 | after_pipeline_creation = null 21 | 22 | [components] 23 | 24 | [components.attribute_ruler] 25 | source = "en_core_web_sm" 26 | 27 | [components.lemmatizer] 28 | source = "en_core_web_sm" 29 | 30 | [components.ner] 31 | factory = "ner" 32 | moves = null 33 | update_with_oracle_cut_size = 100 34 | 35 | [components.ner.model] 36 | @architectures = "spacy.TransitionBasedParser.v1" 37 | state_type = "ner" 38 | extra_state_tokens = false 39 | hidden_width = 64 40 | maxout_pieces = 2 41 | use_upper = true 42 | nO = null 43 | 44 | [components.ner.model.tok2vec] 45 | @architectures = "spacy.Tok2Vec.v1" 46 | 47 | [components.ner.model.tok2vec.embed] 48 | @architectures = "spacy.MultiHashEmbed.v1" 49 | width = 96 50 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 51 | rows = [5000, 2500, 2500, 2500] 52 | include_static_vectors = true 53 | 54 | [components.ner.model.tok2vec.encode] 55 | @architectures = "spacy.MaxoutWindowEncoder.v1" 56 | width = 96 57 | depth = 4 58 | window_size = 1 59 | maxout_pieces = 3 60 | 61 | [components.parser] 62 | source = ${paths.parser_tagger_path} 63 | 64 | [components.tagger] 65 | source = ${paths.parser_tagger_path} 66 | 67 | [components.tok2vec] 68 | source = ${paths.parser_tagger_path} 69 | 70 | [corpora] 71 | 72 | [corpora.dev] 73 | 
@readers = "specialized_ner_reader" 74 | file_path = ${paths.dev_path} 75 | 76 | [corpora.train] 77 | @readers = "specialized_ner_reader" 78 | file_path = ${paths.train_path} 79 | 80 | [training] 81 | dev_corpus = "corpora.dev" 82 | train_corpus = "corpora.train" 83 | seed = ${system.seed} 84 | gpu_allocator = ${system.gpu_allocator} 85 | dropout = 0.2 86 | accumulate_gradient = 1 87 | patience = 0 88 | max_epochs = 7 89 | max_steps = 0 90 | eval_frequency = 500 91 | frozen_components = ["tok2vec", "parser", "tagger", "attribute_ruler", "lemmatizer"] 92 | before_to_disk = null 93 | 94 | [training.batcher] 95 | @batchers = "spacy.batch_by_sequence.v1" 96 | get_length = null 97 | 98 | [training.batcher.size] 99 | @schedules = "compounding.v1" 100 | start = 1 101 | stop = 32 102 | compound = 1.001 103 | t = 0.0 104 | 105 | [training.logger] 106 | @loggers = "spacy.ConsoleLogger.v1" 107 | progress_bar = true 108 | 109 | [training.optimizer] 110 | @optimizers = "Adam.v1" 111 | beta1 = 0.9 112 | beta2 = 0.999 113 | L2_is_weight_decay = true 114 | L2 = 0.01 115 | grad_clip = 1.0 116 | use_averages = false 117 | eps = 0.00000001 118 | learn_rate = 0.001 119 | 120 | [training.score_weights] 121 | dep_las_per_type = null 122 | sents_p = null 123 | sents_r = null 124 | ents_per_type = null 125 | tag_acc = null 126 | dep_uas = null 127 | dep_las = null 128 | sents_f = null 129 | ents_f = 1.0 130 | ents_p = 0.0 131 | ents_r = 0.0 132 | 133 | [pretraining] 134 | 135 | [initialize] 136 | vectors = ${paths.vectors} 137 | init_tok2vec = ${paths.init_tok2vec} 138 | vocab_data = ${paths.vocab_path} 139 | lookups = null 140 | 141 | [initialize.components] 142 | 143 | [initialize.tokenizer] 144 | 145 | [initialize.before_init] 146 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_ner_scibert.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | vectors = null 3 | init_tok2vec = null 4 | parser_tagger_path = null 5 | vocab_path = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["transformer", "tagger","attribute_ruler","lemmatizer","parser","ner"] 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 15 | disabled = [] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [components] 21 | 22 | [components.attribute_ruler] 23 | source = "en_core_web_sm" 24 | 25 | [components.lemmatizer] 26 | source = "en_core_web_sm" 27 | 28 | [components.ner] 29 | factory = "ner" 30 | moves = null 31 | update_with_oracle_cut_size = 100 32 | 33 | [components.ner.model] 34 | @architectures = "spacy.TransitionBasedParser.v1" 35 | state_type = "ner" 36 | extra_state_tokens = false 37 | hidden_width = 64 38 | maxout_pieces = 2 39 | use_upper = true 40 | nO = null 41 | 42 | [components.ner.model.tok2vec] 43 | @architectures = "spacy.Tok2Vec.v1" 44 | 45 | [components.ner.model.tok2vec.embed] 46 | @architectures = "spacy.MultiHashEmbed.v1" 47 | width = 96 48 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 49 | rows = [5000, 2500, 2500, 2500] 50 | include_static_vectors = false 51 | 52 | [components.ner.model.tok2vec.encode] 53 | @architectures = "spacy.MaxoutWindowEncoder.v1" 54 | width = 96 55 | depth = 4 56 | window_size = 1 57 | maxout_pieces = 3 58 | 59 | [components.parser] 60 | source = ${paths.parser_tagger_path} 61 | 62 | [components.tagger] 63 | source = ${paths.parser_tagger_path} 64 | 65 | 
[components.transformer] 66 | source = ${paths.parser_tagger_path} 67 | 68 | 69 | [corpora] 70 | 71 | [corpora.dev] 72 | @readers = "med_mentions_reader" 73 | directory_path = "assets/" 74 | split = "dev" 75 | 76 | [corpora.train] 77 | @readers = "med_mentions_reader" 78 | directory_path = "assets/" 79 | split = "train" 80 | 81 | [training] 82 | dev_corpus = "corpora.dev" 83 | train_corpus = "corpora.train" 84 | seed = ${system.seed} 85 | gpu_allocator = ${system.gpu_allocator} 86 | dropout = 0.2 87 | accumulate_gradient = 1 88 | patience = 0 89 | max_epochs = 7 90 | max_steps = 0 91 | eval_frequency = 500 92 | frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] 93 | before_to_disk = null 94 | 95 | [training.batcher] 96 | @batchers = "spacy.batch_by_sequence.v1" 97 | get_length = null 98 | 99 | [training.batcher.size] 100 | @schedules = "compounding.v1" 101 | start = 1 102 | stop = 32 103 | compound = 1.001 104 | t = 0.0 105 | 106 | [training.logger] 107 | @loggers = "spacy.ConsoleLogger.v1" 108 | progress_bar = true 109 | 110 | [training.optimizer] 111 | @optimizers = "Adam.v1" 112 | beta1 = 0.9 113 | beta2 = 0.999 114 | L2_is_weight_decay = true 115 | L2 = 0.01 116 | grad_clip = 1.0 117 | use_averages = false 118 | eps = 0.00000001 119 | learn_rate = 0.001 120 | 121 | [training.score_weights] 122 | dep_las_per_type = null 123 | sents_p = null 124 | sents_r = null 125 | ents_per_type = null 126 | tag_acc = null 127 | dep_uas = null 128 | dep_las = null 129 | sents_f = null 130 | ents_f = 1.0 131 | ents_p = 0.0 132 | ents_r = 0.0 133 | 134 | [pretraining] 135 | 136 | [initialize] 137 | vectors = ${paths.vectors} 138 | init_tok2vec = ${paths.init_tok2vec} 139 | vocab_data = ${paths.vocab_path} 140 | lookups = null 141 | 142 | [initialize.components] 143 | 144 | [initialize.tokenizer] 145 | 146 | [initialize.before_init] 147 | @callbacks = "replace_tokenizer" 148 | -------------------------------------------------------------------------------- /evaluation/sentence_splitting_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import json 5 | 6 | import spacy 7 | 8 | sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.join(__file__, os.pardir)))) 9 | from scispacy.custom_sentence_segmenter import combined_rule_sentence_segmenter 10 | from scispacy.custom_tokenizer import remove_new_lines, combined_rule_tokenizer 11 | 12 | def evaluate_sentence_splitting(model_path: str, 13 | data_directory: str, 14 | rule_segmenter: bool = False, 15 | custom_tokenizer: bool = False, 16 | citation_data_path: str = None): 17 | 18 | model = spacy.load(model_path) 19 | if rule_segmenter: 20 | model.add_pipe(combined_rule_sentence_segmenter, first=True) 21 | if custom_tokenizer: 22 | model.tokenizer = combined_rule_tokenizer(model) 23 | 24 | total_correct = 0 25 | total = 0 26 | total_abstracts = 0 27 | perfect = 0 28 | for abstract_name in os.listdir(data_directory): 29 | 30 | abstract_sentences = [x.strip() for x in 31 | open(os.path.join(data_directory, abstract_name), "r")] 32 | 33 | full_abstract = " ".join(abstract_sentences) 34 | 35 | doc = model(full_abstract) 36 | 37 | sentences = [x.text for x in doc.sents] 38 | 39 | correct = [] 40 | for sentence in sentences: 41 | if sentence in abstract_sentences: 42 | correct.append(1) 43 | else: 44 | correct.append(0) 45 | 46 | total += len(correct) 47 | total_correct += sum(correct) 48 | perfect += all(correct) 49 | 
total_abstracts += 1 50 | 51 | print(f"Sentence splitting performance for {model_path} :\n") 52 | 53 | print(f"Sentence level accuracy: {total_correct} of {total}, {total_correct / total}. ") 54 | print(f"Abstract level accuracy: {perfect} of {total_abstracts}, {perfect / total_abstracts}. ") 55 | 56 | if citation_data_path is None: 57 | return 58 | 59 | skipped = 0 60 | citation_total = 0 61 | citation_correct = 0 62 | for line in open(citation_data_path, "r"): 63 | 64 | sentence = remove_new_lines(json.loads(line)["string"]) 65 | 66 | # Skip sentence if it doesn't look roughly like a sentence, 67 | # or it is > 2 std deviations above the mean length. 68 | if not sentence[0].isupper() or sentence[-1] != "." or len(sentence) > 450: 69 | skipped += 1 70 | continue 71 | 72 | sentences = list(model(sentence).sents) 73 | 74 | if len(sentences) == 1: 75 | citation_correct += 1 76 | citation_total += 1 77 | print(f"Citation handling performance for {model_path}, skipped {skipped} examples :\n") 78 | print(f"Citation level accuracy: {citation_correct} of {citation_total}, {citation_correct / citation_total}. ") 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser() 83 | 84 | parser.add_argument( 85 | '--data', 86 | help="Path to the directory containing the raw data." 87 | ) 88 | parser.add_argument( 89 | '--model_path', 90 | default=None, 91 | help="Path to the spacy model to load" 92 | ) 93 | parser.add_argument( 94 | '--rule_segmenter', 95 | default=False, 96 | action="store_true", 97 | help="Whether to use the rule based segmenter" 98 | ) 99 | parser.add_argument( 100 | '--custom_tokenizer', 101 | default=False, 102 | action="store_true", 103 | help="Whether to use the custom rule based tokenizer" 104 | ) 105 | parser.add_argument( 106 | '--citation_data', 107 | default=None, 108 | help="Path to the jsonl file containing the citation contexts."
109 | ) 110 | 111 | args = parser.parse_args() 112 | evaluate_sentence_splitting(args.model_path, args.data, args.rule_segmenter, args.custom_tokenizer, args.citation_data) 113 | -------------------------------------------------------------------------------- /scispacy/base_project_code.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, Iterable, Iterator 2 | from pathlib import Path 3 | 4 | import random 5 | import itertools 6 | import spacy 7 | import warnings 8 | from spacy.training import Corpus, Example 9 | from spacy.language import Language 10 | 11 | from scispacy.custom_tokenizer import combined_rule_tokenizer 12 | from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv 13 | 14 | 15 | def iter_sample(iterable: Iterable, sample_percent: float) -> Iterator: 16 | for item in iterable: 17 | if len(item.reference) == 0: 18 | continue 19 | coin_flip = random.uniform(0, 1) 20 | if coin_flip < sample_percent: 21 | yield item 22 | 23 | 24 | @spacy.registry.callbacks("replace_tokenizer") 25 | def replace_tokenizer_callback() -> Callable[[Language], Language]: 26 | def replace_tokenizer(nlp: Language) -> Language: 27 | nlp.tokenizer = combined_rule_tokenizer(nlp) 28 | return nlp 29 | 30 | return replace_tokenizer 31 | 32 | 33 | @spacy.registry.readers("parser_tagger_data") 34 | def parser_tagger_data( 35 | path: Path, 36 | mixin_data_path: Optional[Path], 37 | mixin_data_percent: float, 38 | gold_preproc: bool, 39 | max_length: int = 0, 40 | limit: int = 0, 41 | augmenter: Optional[Callable] = None, 42 | seed: int = 0, 43 | ) -> Callable[[Language], Iterator[Example]]: 44 | random.seed(seed) 45 | main_corpus = Corpus( 46 | path, 47 | gold_preproc=gold_preproc, 48 | max_length=max_length, 49 | limit=limit, 50 | augmenter=augmenter, 51 | ) 52 | if mixin_data_path is not None: 53 | mixin_corpus = Corpus( 54 | mixin_data_path, 55 | gold_preproc=gold_preproc, 56 | max_length=max_length, 57 | limit=limit, 58 | augmenter=augmenter, 59 | ) 60 | 61 | def mixed_corpus(nlp: Language) -> Iterator[Example]: 62 | if mixin_data_path is not None: 63 | main_examples = main_corpus(nlp) 64 | mixin_examples = iter_sample(mixin_corpus(nlp), mixin_data_percent) 65 | return itertools.chain(main_examples, mixin_examples) 66 | else: 67 | return main_corpus(nlp) 68 | 69 | return mixed_corpus 70 | 71 | 72 | @spacy.registry.readers("med_mentions_reader") 73 | def med_mentions_reader( 74 | directory_path: str, split: str 75 | ) -> Callable[[Language], Iterator[Example]]: 76 | train, dev, test = read_full_med_mentions( 77 | directory_path, label_mapping=None, span_only=True, spacy_format=True 78 | ) 79 | 80 | def corpus(nlp: Language) -> Iterator[Example]: 81 | if split == "train": 82 | original_examples = train 83 | elif split == "dev": 84 | original_examples = dev 85 | elif split == "test": 86 | original_examples = test 87 | else: 88 | raise Exception(f"Unexpected split {split}") 89 | 90 | for original_example in original_examples: 91 | doc = nlp.make_doc(original_example[0]) 92 | with warnings.catch_warnings(): 93 | warnings.simplefilter("ignore", category=UserWarning) 94 | spacy_example = Example.from_dict(doc, original_example[1]) 95 | yield spacy_example 96 | 97 | return corpus 98 | 99 | 100 | @spacy.registry.readers("specialized_ner_reader") 101 | def specialized_ner_reader(file_path: str): 102 | original_examples = read_ner_from_tsv(file_path) 103 | 104 | def corpus(nlp: Language): 105 | for original_example in 
original_examples: 106 | doc = nlp.make_doc(original_example[0]) 107 | with warnings.catch_warnings(): 108 | warnings.simplefilter("ignore", category=UserWarning) 109 | spacy_example = Example.from_dict(doc, original_example[1]) 110 | yield spacy_example 111 | 112 | return corpus 113 | -------------------------------------------------------------------------------- /tests/fixtures/umls_META/MRFILES.RRF: -------------------------------------------------------------------------------- 1 | AMBIGLUI.RRF|Ambiguous term identifiers|LUI,CUI|2|537613|10302364| 2 | AMBIGSUI.RRF|Ambiguous string identifiers|SUI,CUI|2|389894|7513995| 3 | CHANGE/DELETEDCUI.RRF|Deleted concepts|PCUI,PSTR|2|10628|159420| 4 | CHANGE/DELETEDLUI.RRF|Deleted terms|PLUI,PSTR|2|0|0| 5 | CHANGE/DELETEDSUI.RRF|Deleted strings|PSUI,LAT,PSTR|3|0|0| 6 | CHANGE/MERGEDCUI.RRF|Merged concepts|PCUI,CUI|2|2188|41572| 7 | CHANGE/MERGEDLUI.RRF|Merged terms|PLUI,LUI|2|0|0| 8 | MRAUI.RRF|AUI History|AUI1,CUI1,VER,REL,RELA,MAPREASON,AUI2,CUI2,MAPIN|9|510022|27523978| 9 | MRCOLS.RRF|Attribute Relation|COL,DES,REF,MIN,AV,MAX,FIL,DTY|8|329|22758| 10 | MRCONSO.RRF|Concept names and sources|CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF|18|21385114|2719518477| 11 | MRCUI.RRF|CUI History|CUI1,VER,REL,RELA,MAPREASON,CUI2,MAPIN|7|2364533|72915366| 12 | MRDEF.RRF|Definitions|CUI,AUI,ATUI,SATUI,SAB,DEF,SUPPRESS,CVF|8|501039|123372655| 13 | MRDOC.RRF|Typed key value metadata map|DOCKEY,VALUE,TYPE,EXPL|4|5984|372164| 14 | MRFILES.RRF|Relation Relation|FIL,DES,FMT,CLS,RWS,BTS|6|50|4066| 15 | MRHIER.RRF|Computable hierarchies|CUI,AUI,CXN,PAUI,SAB,RELA,PTR,HCD,CVF|9|35400003|5248953255| 16 | MRHIST.RRF|Source-asserted history|CUI,SOURCEUI,SAB,SVER,CHANGETYPE,CHANGEKEY,CHANGEVAL,REASON,CVF|9|0|0| 17 | MRMAP.RRF|Mappings|MAPSETCUI,MAPSETSAB,MAPSUBSETID,MAPRANK,MAPID,MAPSID,FROMID,FROMSID,FROMEXPR,FROMTYPE,FROMRULE,FROMRES,REL,RELA,TOID,TOSID,TOEXPR,TOTYPE,TORULE,TORES,MAPRULE,MAPRES,MAPTYPE,MAPATN,MAPATV,CVF|26|1762213|245100223| 18 | MRRANK.RRF|Concept Name Ranking|RANK,SAB,TTY,SUPPRESS|4|1285|23476| 19 | MRREL.RRF|Related Concepts|CUI1,AUI1,STYPE1,REL,CUI2,AUI2,STYPE2,RELA,RUI,SRUI,SAB,SL,RG,DIR,SUPPRESS,CVF|16|104563668|9493099961| 20 | MRSAB.RRF|Source Metadata|VCUI,RCUI,VSAB,RSAB,SON,SF,SVER,VSTART,VEND,IMETA,RMETA,SLC,SCC,SRL,TFR,CFR,CXTY,TTYL,ATNL,LAT,CENC,CURVER,SABIN,SSN,SCIT|25|426|300118| 21 | MRSAT.RRF|Simple Concept, Term and String Attributes|CUI,LUI,SUI,METAUI,STYPE,CODE,ATUI,SATUI,ATN,SAB,ATV,SUPPRESS,CVF|13|108724175|10527137405| 22 | MRSMAP.RRF|Simple Mappings|MAPSETCUI,MAPSETSAB,MAPID,MAPSID,FROMEXPR,FROMTYPE,REL,RELA,TOEXPR,TOTYPE,CVF|11|694235|49081262| 23 | MRSTY.RRF|Semantic Types|CUI,TUI,STN,STY,ATUI,CVF|6|6875332|381224365| 24 | MRXNS_ENG.RRF|Normalized String Index|LAT,NSTR,CUI,LUI,SUI|5|22712182|1601710160| 25 | MRXNW_ENG.RRF|Normalized Word Index|LAT,NWD,CUI,LUI,SUI|5|74522948|3007627692| 26 | MRXW_BAQ.RRF|Basque Word Index|LAT,WD,CUI,LUI,SUI|5|5338|214412| 27 | MRXW_CHI.RRF|Chinese Word Index|LAT,WD,CUI,LUI,SUI|5|955316|43214100| 28 | MRXW_CZE.RRF|Czech Word Index|LAT,WD,CUI,LUI,SUI|5|422737|17840374| 29 | MRXW_DAN.RRF|Danish Word Index|LAT,WD,CUI,LUI,SUI|5|4932|194228| 30 | MRXW_DUT.RRF|Dutch Word Index|LAT,WD,CUI,LUI,SUI|5|1527345|63929947| 31 | MRXW_ENG.RRF|English Word Index|LAT,WD,CUI,LUI,SUI|5|72596340|2905871994| 32 | MRXW_EST.RRF|Estonian Word Index|LAT,WD,CUI,LUI,SUI|5|462812|18354498| 33 | MRXW_FIN.RRF|Finnish Word Index|LAT,WD,CUI,LUI,SUI|5|44994|1961908| 34 | MRXW_FRE.RRF|French 
Word Index|LAT,WD,CUI,LUI,SUI|5|3828339|160031629| 35 | MRXW_GER.RRF|German Word Index|LAT,WD,CUI,LUI,SUI|5|734623|30352992| 36 | MRXW_GRE.RRF|Greek Word Index|LAT,WD,CUI,LUI,SUI|5|42802|2067288| 37 | MRXW_HEB.RRF|Hebrew Word Index|LAT,WD,CUI,LUI,SUI|5|3234|126524| 38 | MRXW_HUN.RRF|Hungarian Word Index|LAT,WD,CUI,LUI,SUI|5|266703|11550790| 39 | MRXW_ITA.RRF|Italian Word Index|LAT,WD,CUI,LUI,SUI|5|1603999|64493060| 40 | MRXW_JPN.RRF|Japanese Word Index|LAT,WD,CUI,LUI,SUI|5|272472|16110056| 41 | MRXW_KOR.RRF|Korean Word Index|LAT,WD,CUI,LUI,SUI|5|460476|20307369| 42 | MRXW_LAV.RRF|Latvian Word Index|LAT,WD,CUI,LUI,SUI|5|3167|132408| 43 | MRXW_NOR.RRF|Norwegian Word Index|LAT,WD,CUI,LUI,SUI|5|118417|5222066| 44 | MRXW_POL.RRF|Polish Word Index|LAT,WD,CUI,LUI,SUI|5|115252|4912030| 45 | MRXW_POR.RRF|Portuguese Word Index|LAT,WD,CUI,LUI,SUI|5|2857618|114458322| 46 | MRXW_RUS.RRF|Russian Word Index|LAT,WD,CUI,LUI,SUI|5|1275708|58319365| 47 | MRXW_SCR.RRF|Croatian Word Index|LAT,WD,CUI,LUI,SUI|5|23572|996307| 48 | MRXW_SPA.RRF|Spanish Word Index|LAT,WD,CUI,LUI,SUI|5|9575118|388553905| 49 | MRXW_SWE.RRF|Swedish Word Index|LAT,WD,CUI,LUI,SUI|5|53244|2296506| 50 | MRXW_TUR.RRF|Turkish Word Index|LAT,WD,CUI,LUI,SUI|5|833756|33848218| 51 | -------------------------------------------------------------------------------- /configs/base_parser_tagger.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | genia_train = "project_data/genia_train.spacy" 3 | genia_dev = "project_data/genia_dev.spacy" 4 | onto_train = "project_data/train" 5 | vectors = null 6 | init_tok2vec = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = null 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["tok2vec","tagger","attribute_ruler","lemmatizer","parser"] 16 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 17 | disabled = [] 18 | before_creation = null 19 | after_creation = null 20 | after_pipeline_creation = null 21 | 22 | [components] 23 | 24 | [components.attribute_ruler] 25 | source = "en_core_web_sm" 26 | 27 | [components.lemmatizer] 28 | source = "en_core_web_sm" 29 | 30 | [components.parser] 31 | factory = "parser" 32 | learn_tokens = false 33 | min_action_freq = 30 34 | moves = null 35 | update_with_oracle_cut_size = 100 36 | 37 | [components.parser.model] 38 | @architectures = "spacy.TransitionBasedParser.v1" 39 | state_type = "parser" 40 | extra_state_tokens = false 41 | hidden_width = 128 42 | maxout_pieces = 3 43 | use_upper = true 44 | nO = null 45 | 46 | [components.parser.model.tok2vec] 47 | @architectures = "spacy.Tok2VecListener.v1" 48 | width = ${components.tok2vec.model.encode.width} 49 | upstream = "*" 50 | 51 | [components.tagger] 52 | factory = "tagger" 53 | 54 | [components.tagger.model] 55 | @architectures = "spacy.Tagger.v1" 56 | nO = null 57 | 58 | [components.tagger.model.tok2vec] 59 | @architectures = "spacy.Tok2VecListener.v1" 60 | width = ${components.tok2vec.model.encode.width} 61 | upstream = "*" 62 | 63 | [components.tok2vec] 64 | factory = "tok2vec" 65 | 66 | [components.tok2vec.model] 67 | @architectures = "spacy.Tok2Vec.v1" 68 | 69 | [components.tok2vec.model.embed] 70 | @architectures = "spacy.MultiHashEmbed.v1" 71 | width = ${components.tok2vec.model.encode.width} 72 | attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] 73 | rows = [5000, 2500, 2500, 2500] 74 | include_static_vectors = true 75 | 76 | [components.tok2vec.model.encode] 77 | @architectures = "spacy.MaxoutWindowEncoder.v1" 78 | width = 96 79 | depth = 4 80 
| window_size = 1 81 | maxout_pieces = 3 82 | 83 | [corpora] 84 | 85 | [corpora.dev] 86 | @readers = "spacy.Corpus.v1" 87 | path = ${paths.genia_dev} 88 | max_length = 0 89 | gold_preproc = false 90 | limit = 0 91 | augmenter = null 92 | 93 | [corpora.train] 94 | @readers = "parser_tagger_data" 95 | path = ${paths.genia_train} 96 | mixin_data_path = ${paths.onto_train} 97 | mixin_data_percent = 0.2 98 | max_length = 2000 99 | gold_preproc = false 100 | limit = 0 101 | augmenter = null 102 | seed = ${system.seed} 103 | 104 | [training] 105 | dev_corpus = "corpora.dev" 106 | train_corpus = "corpora.train" 107 | seed = ${system.seed} 108 | gpu_allocator = ${system.gpu_allocator} 109 | dropout = 0.2 110 | accumulate_gradient = 1 111 | patience = 0 112 | max_epochs = 20 113 | max_steps = 0 114 | eval_frequency = 2300 115 | frozen_components = ["attribute_ruler", "lemmatizer"] 116 | before_to_disk = null 117 | 118 | [training.batcher] 119 | @batchers = "spacy.batch_by_sequence.v1" 120 | get_length = null 121 | 122 | [training.batcher.size] 123 | @schedules = "compounding.v1" 124 | start = 1 125 | stop = 16 126 | compound = 1.001 127 | t = 0.0 128 | 129 | [training.logger] 130 | @loggers = "spacy.ConsoleLogger.v1" 131 | progress_bar = true 132 | 133 | [training.optimizer] 134 | @optimizers = "Adam.v1" 135 | beta1 = 0.9 136 | beta2 = 0.999 137 | L2_is_weight_decay = true 138 | L2 = 0.01 139 | grad_clip = 1.0 140 | use_averages = false 141 | eps = 0.00000001 142 | learn_rate = 0.001 143 | 144 | [training.score_weights] 145 | dep_las_per_type = null 146 | sents_p = null 147 | sents_r = null 148 | ents_per_type = null 149 | tag_acc = 0.33 150 | dep_uas = 0.33 151 | dep_las = 0.33 152 | sents_f = 0.0 153 | ents_f = 0.0 154 | ents_p = 0.0 155 | ents_r = 0.0 156 | 157 | [pretraining] 158 | 159 | [initialize] 160 | vectors = ${paths.vectors} 161 | init_tok2vec = ${paths.init_tok2vec} 162 | vocab_data = ${paths.vocab_path} 163 | lookups = null 164 | 165 | [initialize.components] 166 | 167 | [initialize.tokenizer] 168 | 169 | [initialize.before_init] 170 | @callbacks = "replace_tokenizer" -------------------------------------------------------------------------------- /configs/base_parser_tagger_scibert.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | genia_train = "project_data/genia_train.spacy" 3 | genia_dev = "project_data/genia_dev.spacy" 4 | onto_train = "project_data/train" 5 | vectors = null 6 | init_tok2vec = null 7 | vocab_path = null 8 | 9 | [system] 10 | gpu_allocator = "pytorch" 11 | seed = 0 12 | 13 | [nlp] 14 | lang = "en" 15 | pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"] 16 | batch_size = 256 17 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 18 | disabled = [] 19 | before_creation = null 20 | after_creation = null 21 | after_pipeline_creation = null 22 | 23 | [components] 24 | 25 | [components.attribute_ruler] 26 | source = "en_core_web_sm" 27 | 28 | [components.lemmatizer] 29 | source = "en_core_web_sm" 30 | 31 | [components.parser] 32 | factory = "parser" 33 | learn_tokens = false 34 | min_action_freq = 30 35 | moves = null 36 | update_with_oracle_cut_size = 100 37 | 38 | [components.parser.model] 39 | @architectures = "spacy.TransitionBasedParser.v1" 40 | state_type = "parser" 41 | extra_state_tokens = false 42 | hidden_width = 128 43 | maxout_pieces = 3 44 | use_upper = true 45 | nO = null 46 | 47 | [components.parser.model.tok2vec] 48 | @architectures = "spacy-transformers.TransformerListener.v1" 49 
| grad_factor = 1.0 50 | pooling = {"@layers":"reduce_mean.v1"} 51 | upstream = "*" 52 | 53 | [components.tagger] 54 | factory = "tagger" 55 | 56 | [components.tagger.model] 57 | @architectures = "spacy.Tagger.v1" 58 | nO = null 59 | 60 | [components.tagger.model.tok2vec] 61 | @architectures = "spacy-transformers.TransformerListener.v1" 62 | grad_factor = 1.0 63 | pooling = {"@layers":"reduce_mean.v1"} 64 | upstream = "*" 65 | 66 | [components.transformer] 67 | factory = "transformer" 68 | max_batch_items = 4096 69 | set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} 70 | 71 | [components.transformer.model] 72 | @architectures = "spacy-transformers.TransformerModel.v1" 73 | name = "allenai/scibert_scivocab_uncased" 74 | tokenizer_config = {"use_fast": true} 75 | 76 | [components.transformer.model.get_spans] 77 | @span_getters = "spacy-transformers.strided_spans.v1" 78 | window = 128 79 | stride = 96 80 | 81 | 82 | [corpora] 83 | 84 | [corpora.dev] 85 | @readers = "spacy.Corpus.v1" 86 | path = ${paths.genia_dev} 87 | max_length = 0 88 | gold_preproc = false 89 | limit = 0 90 | augmenter = null 91 | 92 | [corpora.train] 93 | @readers = "parser_tagger_data" 94 | path = ${paths.genia_train} 95 | mixin_data_path = ${paths.onto_train} 96 | mixin_data_percent = 0.2 97 | max_length = 2000 98 | gold_preproc = false 99 | limit = 0 100 | augmenter = null 101 | seed = ${system.seed} 102 | 103 | [training] 104 | dev_corpus = "corpora.dev" 105 | train_corpus = "corpora.train" 106 | seed = ${system.seed} 107 | gpu_allocator = ${system.gpu_allocator} 108 | dropout = 0.2 109 | accumulate_gradient = 1 110 | patience = 0 111 | max_epochs = 8 112 | max_steps = 0 113 | eval_frequency = 2300 114 | frozen_components = ["attribute_ruler", "lemmatizer"] 115 | before_to_disk = null 116 | 117 | [training.batcher] 118 | @batchers = "spacy.batch_by_sequence.v1" 119 | get_length = null 120 | 121 | [training.batcher.size] 122 | @schedules = "compounding.v1" 123 | start = 16 124 | stop = 64 125 | compound = 1.001 126 | t = 0.0 127 | 128 | [training.logger] 129 | @loggers = "spacy.ConsoleLogger.v1" 130 | progress_bar = true 131 | 132 | [training.optimizer] 133 | @optimizers = "Adam.v1" 134 | beta1 = 0.9 135 | beta2 = 0.999 136 | L2_is_weight_decay = true 137 | L2 = 0.01 138 | grad_clip = 1.0 139 | use_averages = false 140 | eps = 0.00000001 141 | learn_rate = 0.00005 142 | 143 | 144 | [training.score_weights] 145 | dep_las_per_type = null 146 | sents_p = null 147 | sents_r = null 148 | ents_per_type = null 149 | tag_acc = 0.33 150 | dep_uas = 0.33 151 | dep_las = 0.33 152 | sents_f = 0.0 153 | ents_f = 0.0 154 | ents_p = 0.0 155 | ents_r = 0.0 156 | 157 | [pretraining] 158 | 159 | [initialize] 160 | vectors = ${paths.vectors} 161 | init_tok2vec = ${paths.init_tok2vec} 162 | vocab_data = ${paths.vocab_path} 163 | lookups = null 164 | 165 | [initialize.components] 166 | 167 | [initialize.tokenizer] 168 | 169 | [initialize.before_init] 170 | @callbacks = "replace_tokenizer" 171 | -------------------------------------------------------------------------------- /tests/test_data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import shutil 4 | 5 | 6 | from scispacy.data_util import read_full_med_mentions, med_mentions_example_iterator, remove_overlapping_entities 7 | from scispacy.data_util import read_ner_from_tsv 8 | 9 | class TestDataUtil(unittest.TestCase): 10 | def setUp(self): 11 | super().setUp() 12 
| self.TEST_DIR = "/tmp/scispacy" 13 | os.makedirs(self.TEST_DIR, exist_ok=True) 14 | 15 | self.med_mentions = "tests/fixtures/med_mentions.txt" 16 | self.ner_tsv = "tests/fixtures/ner_test.tsv" 17 | 18 | def tearDown(self): 19 | shutil.rmtree(self.TEST_DIR) 20 | 21 | def test_example_iterator(self): 22 | iterator = med_mentions_example_iterator(self.med_mentions) 23 | for example in iterator: 24 | assert example.text == example.title + " " + example.abstract 25 | 26 | for entity in example.entities: 27 | assert entity.start < entity.end 28 | assert entity.start < len(example.text) 29 | assert entity.end < len(example.text) 30 | assert entity.mention_text == example.text[entity.start: entity.end] 31 | 32 | def test_remove_overlaps(self): 33 | test_entities = [(0, 5, 'ENTITY'), (6, 10, 'ENTITY')] 34 | result = remove_overlapping_entities(test_entities) 35 | assert result == [(0, 5, 'ENTITY'), (6, 10, 'ENTITY')] 36 | 37 | test_entities = [(0, 5, 'ENTITY'), (5, 10, 'ENTITY')] 38 | result = remove_overlapping_entities(test_entities) 39 | assert result == [(0, 5, 'ENTITY'), (5, 10, 'ENTITY')] 40 | 41 | test_entities = [(0, 5, 'ENTITY'), (4, 10, 'ENTITY')] 42 | result = remove_overlapping_entities(test_entities) 43 | assert result == [(4, 10, 'ENTITY')] 44 | 45 | test_entities = [(0, 5, 'ENTITY'), (0, 5, 'ENTITY')] 46 | result = remove_overlapping_entities(test_entities) 47 | assert result == [(0, 5, 'ENTITY')] 48 | 49 | test_entities = [(0, 5, 'ENTITY'), (4, 11, 'ENTITY'), (6, 20, 'ENTITY')] 50 | result = remove_overlapping_entities(test_entities) 51 | assert result == [(0, 5, 'ENTITY'), (6, 20, 'ENTITY')] 52 | 53 | test_entities = [(0, 5, 'ENTITY'), (4, 7, 'ENTITY'), (10, 20, 'ENTITY')] 54 | result = remove_overlapping_entities(test_entities) 55 | assert result == [(0, 5, 'ENTITY'), (10, 20, 'ENTITY')] 56 | 57 | test_entities = [(1368, 1374, 'ENTITY'), (1368, 1376, 'ENTITY')] 58 | result = remove_overlapping_entities(test_entities) 59 | assert result == [(1368, 1376, 'ENTITY')] 60 | 61 | test_entities = [(12, 33, 'ENTITY'), (769, 779, 'ENTITY'), (769, 787, 'ENTITY'), (806, 811, 'ENTITY')] 62 | result = remove_overlapping_entities(test_entities) 63 | assert result == [(12, 33, 'ENTITY'), (769, 787, 'ENTITY'), (806, 811, 'ENTITY')] 64 | 65 | test_entities = [(189, 209, 'ENTITY'), 66 | (317, 362, 'ENTITY'), 67 | (345, 354, 'ENTITY'), 68 | (364, 368, 'ENTITY')] 69 | result = remove_overlapping_entities(test_entities) 70 | assert result == [(189, 209, 'ENTITY'), (317, 362, 'ENTITY'), (364, 368, 'ENTITY')] 71 | 72 | test_entities = [(445, 502, 'ENTITY'), 73 | (461, 473, 'ENTITY'), 74 | (474, 489, 'ENTITY')] 75 | result = remove_overlapping_entities(test_entities) 76 | assert result == [(445, 502, 'ENTITY')] 77 | 78 | def test_read_ner_from_tsv(self): 79 | 80 | data = read_ner_from_tsv(self.ner_tsv) 81 | assert len(data) == 4 82 | example = data[0] 83 | assert example[0] == 'Intraocular pressure in genetically distinct mice : an update and strain survey' 84 | assert example[1] == {'entities': [(24, 35, 'SO'), (45, 49, 'TAXON')]} 85 | example = data[1] 86 | assert example[0] == 'Abstract' 87 | assert example[1] == {'entities': []} 88 | example = data[2] 89 | assert example[0] == 'Background' 90 | assert example[1] == {'entities': []} 91 | example = data[3] 92 | assert example[0] == 'Little is known about genetic factors affecting intraocular pressure ( IOP ) in mice and other mammals .' 
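# The entity tuples are (start, end, label) character offsets into the sentence
# above, e.g. (22, 29, 'SO') covers "genetic" and (80, 84, 'TAXON') covers "mice".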
93 | assert example[1] == {'entities': [(22, 29, 'SO'), (80, 84, 'TAXON'), (95, 102, 'TAXON')]} 94 | -------------------------------------------------------------------------------- /scispacy/per_class_scorer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Set 2 | from collections import defaultdict 3 | import copy 4 | 5 | 6 | class PerClassScorer: 7 | def __init__(self): 8 | # These will hold per label span counts. 9 | self._true_positives: Dict[str, int] = defaultdict(int) 10 | self._false_positives: Dict[str, int] = defaultdict(int) 11 | self._false_negatives: Dict[str, int] = defaultdict(int) 12 | 13 | def __call__( 14 | self, 15 | predicted_spans: List[Tuple[int, int, str]], 16 | gold_spans: List[Tuple[int, int, str]], 17 | ) -> None: 18 | 19 | gold_spans = copy.copy(gold_spans) 20 | predicted_spans = copy.copy(predicted_spans) 21 | untyped_gold_spans = {(x[0], x[1]) for x in gold_spans} 22 | untyped_predicted_spans = {(x[0], x[1]) for x in predicted_spans} 23 | 24 | for untyped_span, span in zip(untyped_predicted_spans, predicted_spans): 25 | if span in gold_spans: 26 | self._true_positives[span[2]] += 1 27 | gold_spans.remove(span) 28 | else: 29 | self._false_positives[span[2]] += 1 30 | 31 | if untyped_span in untyped_gold_spans: 32 | self._true_positives["untyped"] += 1 33 | untyped_gold_spans.remove(untyped_span) 34 | else: 35 | self._false_positives["untyped"] += 1 36 | # These spans weren't predicted. 37 | for span in gold_spans: 38 | self._false_negatives[span[2]] += 1 39 | for untyped_span in untyped_gold_spans: 40 | self._false_negatives["untyped"] += 1 41 | 42 | def get_metric(self, reset: bool = False): 43 | """ 44 | Returns 45 | ------- 46 | A Dict per label containing following the span based metrics: 47 | precision : float 48 | recall : float 49 | f1-measure : float 50 | Additionally, an ``overall`` key is included, which provides the precision, 51 | recall and f1-measure for all spans. 52 | """ 53 | all_tags: Set[str] = set() 54 | all_tags.update(self._true_positives.keys()) 55 | all_tags.update(self._false_positives.keys()) 56 | all_tags.update(self._false_negatives.keys()) 57 | all_metrics = {} 58 | for tag in all_tags: 59 | precision, recall, f1_measure = self._compute_metrics( 60 | self._true_positives[tag], 61 | self._false_positives[tag], 62 | self._false_negatives[tag], 63 | ) 64 | precision_key = "precision" + "-" + tag 65 | recall_key = "recall" + "-" + tag 66 | f1_key = "f1-measure" + "-" + tag 67 | all_metrics[precision_key] = precision 68 | all_metrics[recall_key] = recall 69 | all_metrics[f1_key] = f1_measure 70 | 71 | # Compute the precision, recall and f1 for all spans jointly. 
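# This is a micro-average: the per-label counts are pooled, and the label-agnostic
# "untyped" bucket is left out so that each span is only counted once.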
72 | sum_true_positives = sum( 73 | {v for k, v in self._true_positives.items() if k != "untyped"} 74 | ) 75 | sum_false_positives = sum( 76 | {v for k, v in self._false_positives.items() if k != "untyped"} 77 | ) 78 | sum_false_negatives = sum( 79 | {v for k, v in self._false_negatives.items() if k != "untyped"} 80 | ) 81 | precision, recall, f1_measure = self._compute_metrics( 82 | sum_true_positives, sum_false_positives, sum_false_negatives 83 | ) 84 | all_metrics["precision-overall"] = precision 85 | all_metrics["recall-overall"] = recall 86 | all_metrics["f1-measure-overall"] = f1_measure 87 | if reset: 88 | self.reset() 89 | return all_metrics 90 | 91 | @staticmethod 92 | def _compute_metrics( 93 | true_positives: int, false_positives: int, false_negatives: int 94 | ): 95 | precision = float(true_positives) / float( 96 | true_positives + false_positives + 1e-13 97 | ) 98 | recall = float(true_positives) / float(true_positives + false_negatives + 1e-13) 99 | f1_measure = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) 100 | return precision, recall, f1_measure 101 | 102 | def reset(self): 103 | self._true_positives = defaultdict(int) 104 | self._false_positives = defaultdict(int) 105 | self._false_negatives = defaultdict(int) 106 | -------------------------------------------------------------------------------- /scispacy/linking_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, NamedTuple, Optional, Set 2 | import json 3 | from collections import defaultdict 4 | 5 | from scispacy.file_cache import cached_path 6 | from scispacy.umls_semantic_type_tree import ( 7 | UmlsSemanticTypeTree, 8 | construct_umls_tree_from_tsv, 9 | ) 10 | 11 | 12 | class Entity(NamedTuple): 13 | 14 | concept_id: str 15 | canonical_name: str 16 | aliases: List[str] 17 | types: List[str] = [] 18 | definition: Optional[str] = None 19 | 20 | def __repr__(self): 21 | 22 | rep = "" 23 | num_aliases = len(self.aliases) 24 | rep = rep + f"CUI: {self.concept_id}, Name: {self.canonical_name}\n" 25 | rep = rep + f"Definition: {self.definition}\n" 26 | rep = rep + f"TUI(s): {', '.join(self.types)}\n" 27 | if num_aliases > 10: 28 | rep = ( 29 | rep 30 | + f"Aliases (abbreviated, total: {num_aliases}): \n\t {', '.join(self.aliases[:10])}" 31 | ) 32 | else: 33 | rep = ( 34 | rep + f"Aliases: (total: {num_aliases}): \n\t {', '.join(self.aliases)}" 35 | ) 36 | return rep 37 | 38 | 39 | DEFAULT_UMLS_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl" # noqa 40 | DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv" 41 | 42 | 43 | class KnowledgeBase: 44 | """ 45 | A class representing two commonly needed views of a Knowledge Base: 46 | 1. A mapping from concept_id to an Entity NamedTuple with more information. 47 | 2. A mapping from aliases to the sets of concept ids for which they are aliases. 48 | 49 | Parameters 50 | ---------- 51 | file_path: str, required. 52 | The file path to the json/jsonl representation of the KB to load. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | file_path: str = None, 58 | ): 59 | if file_path is None: 60 | raise ValueError( 61 | "Do not use the default arguments to KnowledgeBase. " 62 | "Instead, use a subclass (e.g UmlsKnowledgeBase) or pass a path to a kb." 
63 | ) 64 | if file_path.endswith("jsonl"): 65 | raw = (json.loads(line) for line in open(cached_path(file_path))) 66 | else: 67 | raw = json.load(open(cached_path(file_path))) 68 | 69 | alias_to_cuis: Dict[str, Set[str]] = defaultdict(set) 70 | self.cui_to_entity: Dict[str, Entity] = {} 71 | 72 | for concept in raw: 73 | unique_aliases = set(concept["aliases"]) 74 | unique_aliases.add(concept["canonical_name"]) 75 | for alias in unique_aliases: 76 | alias_to_cuis[alias].add(concept["concept_id"]) 77 | self.cui_to_entity[concept["concept_id"]] = Entity(**concept) 78 | 79 | self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis} 80 | 81 | 82 | class UmlsKnowledgeBase(KnowledgeBase): 83 | def __init__( 84 | self, 85 | file_path: str = DEFAULT_UMLS_PATH, 86 | types_file_path: str = DEFAULT_UMLS_TYPES_PATH, 87 | ): 88 | 89 | super().__init__(file_path) 90 | 91 | self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv( 92 | types_file_path 93 | ) 94 | 95 | 96 | class Mesh(KnowledgeBase): 97 | def __init__( 98 | self, 99 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/mesh_2020.jsonl", # noqa 100 | ): 101 | super().__init__(file_path) 102 | 103 | 104 | class GeneOntology(KnowledgeBase): 105 | def __init__( 106 | self, 107 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_go.jsonl", # noqa 108 | ): 109 | super().__init__(file_path) 110 | 111 | 112 | class HumanPhenotypeOntology(KnowledgeBase): 113 | def __init__( 114 | self, 115 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_hpo.jsonl", # noqa 116 | ): 117 | super().__init__(file_path) 118 | 119 | 120 | class RxNorm(KnowledgeBase): 121 | def __init__( 122 | self, 123 | file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/kbs/2020-10-09/umls_2020_rxnorm.jsonl", # noqa 124 | ): 125 | super().__init__(file_path) 126 | -------------------------------------------------------------------------------- /tests/custom_tests/test_custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | TEST_CASES = [("using a bag-of-words model", ["using", "a", "bag-of-words", "model"]), 4 | ("activators of cAMP- and cGMP-dependent protein", ["activators", "of", "cAMP-", "and", "cGMP-dependent", "protein"]), 5 | ("phorbol 12-myristate 13-acetate, caused almost", ["phorbol", "12-myristate", "13-acetate", ",", "caused", "almost"]), 6 | pytest.param("let C(j) denote", ["let", "C(j)", "denote"], marks=pytest.mark.xfail), 7 | pytest.param("let (C(j)) denote", ["let", "(", "C(j)", ")", "denote"], marks=pytest.mark.xfail), 8 | pytest.param("let C{j} denote", ["let", "C{j}", "denote"], marks=pytest.mark.xfail), 9 | pytest.param("for the camera(s) and manipulator(s)", ["for", "the", "camera(s)", "and", "manipulator(s)"], marks=pytest.mark.xfail), 10 | ("the (TRAP)-positive genes", ["the", "(TRAP)-positive", "genes"]), 11 | ("the {TRAP}-positive genes", ["the", "{TRAP}-positive", "genes"]), 12 | ("for [Ca2+]i protein", ["for", "[Ca2+]i", "protein"]), 13 | pytest.param("for pyrilamine[3H] protein", ["for", "pyrilamine[3H]", "protein"], marks=pytest.mark.xfail), 14 | ("this is (normal) parens", ["this", "is", "(", "normal", ")", "parens"]), 15 | ("this is [normal] brackets", ["this", "is", "[", "normal", "]", "brackets"]), 16 | ("this is {normal} braces", ["this", "is", "{", "normal", "}", "braces"]), 17 | ("in the 
lan-\nguage of the", ["in", "the", "language", "of", "the"]), 18 | ("in the lan-\n\nguage of the", ["in", "the", "language", "of", "the"]), 19 | ("in the lan- \nguage of the", ["in", "the", "language", "of", "the"]), 20 | ("in the lan- \n\nguage of the", ["in", "the", "language", "of", "the"]), 21 | ("a 28× 28 image", ["a", "28", "×", "28", "image"]), 22 | ("a 28×28 image", ["a", "28", "×", "28", "image"]), 23 | ("a 28 × 28 image", ["a", "28", "×", "28", "image"]), 24 | ("the neurons’ activation", ["the", "neurons", "’", "activation"]), 25 | ("the neurons' activation", ["the", "neurons", "'", "activation"]), 26 | pytest.param("H3G 1Y6", ["H3G", "1Y6"], marks=pytest.mark.xfail), 27 | ("HFG 1Y6", ["HFG", "1Y6"]), 28 | pytest.param("H3g 1Y6", ["H3g", "1Y6"], marks=pytest.mark.xfail), 29 | pytest.param("h3g 1Y6", ["h3g", "1Y6"], marks=pytest.mark.xfail), 30 | pytest.param("h36g 1Y6", ["h36g", "1Y6"], marks=pytest.mark.xfail), 31 | ("h3gh 1Y6", ["h3gh", "1Y6"]), 32 | ("h3g3 1Y6", ["h3g3", "1Y6"]), 33 | ("3g", ["3", "g"]), 34 | ("(3g)", ["(", "3", "g", ")"]), 35 | ("This can be seen in Figure 1D. Therefore", ["This", "can", "be", "seen", "in", "Figure", "1D", ".", "Therefore"]), 36 | ("This can be seen in Figure 1d. Therefore", ["This", "can", "be", "seen", "in", "Figure", "1d", ".", "Therefore"]), 37 | ("This is a sentence.", ["This", "is", "a", "sentence", "."]), 38 | ("result of 1.345 is good", ["result", "of", "1.345", "is", "good"]), 39 | ("This sentence ends with a single 1.", ["This", "sentence", "ends", "with", "a", "single", "1", "."]), 40 | ("This sentence ends with a single 1. This is the next sentence.", ["This", "sentence", "ends", "with", "a", "single", "1", ".", "This", "is", "the", "next", "sentence", "."]), 41 | ("sec. secs. Sec. Secs. fig. figs. Fig. Figs. eq. eqs. Eq. Eqs. no. nos. No. Nos. al.", ["sec.", "secs.", "Sec.", "Secs.", "fig.", "figs.", "Fig.", "Figs.", "eq.", "eqs.", "Eq.", "Eqs.", "no.", "nos.", "No.", "Nos.", "al."]), 42 | ("in the Gq/G11 protein", ["in", "the", "Gq/G11", "protein"]), 43 | ("in the G1/G11 protein", ["in", "the", "G1/G11", "protein"]), 44 | ("in the G1/11 protein", ["in", "the", "G1/11", "protein"]), 45 | ("in the Gq/11 protein", ["in", "the", "Gq/11", "protein"]), 46 | ("This is a sentence.This is another.", ["This", "is", "a", "sentence", ".", "This", "is", "another", "."]), 47 | ("This number 1.456 should not be tokenized.", ["This", "number", "1.456", "should", "not", "be", "tokenized", "."]), 48 | ] 49 | 50 | @pytest.mark.parametrize('text,expected_tokens', TEST_CASES) 51 | def test_custom_tokenization(en_with_combined_rule_tokenizer_fixture, remove_new_lines_fixture, text, expected_tokens): 52 | text = remove_new_lines_fixture(text) 53 | doc = en_with_combined_rule_tokenizer_fixture(text) 54 | tokens = [t.text for t in doc] 55 | assert tokens == expected_tokens -------------------------------------------------------------------------------- /scispacy/umls_semantic_type_tree.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List, Dict, Deque, Any, Optional 2 | from collections import deque 3 | 4 | from scispacy.file_cache import cached_path 5 | 6 | 7 | class SemanticTypeNode(NamedTuple): 8 | 9 | type_id: str 10 | full_name: str 11 | children: List[Any] # Mypy does not support nested types yet :( 12 | level: int 13 | 14 | 15 | class UmlsSemanticTypeTree: 16 | """ 17 | A utility class for manipulating the UMLS Semantic Type Hierarchy. 
18 | Designed to be constructed from a TSV file using `construct_umls_tree_from_tsv`. 19 | """ 20 | 21 | def __init__(self, root: SemanticTypeNode) -> None: 22 | children = self.get_children(root) 23 | children.append(root) 24 | # We'll store the nodes as a flattened list too, because 25 | # we don't just care about the leaves of the tree - sometimes 26 | # we'll need efficient access to intermediate nodes, and the tree 27 | # is tiny anyway. 28 | self.flat_nodes: List[SemanticTypeNode] = children 29 | self.type_id_to_node = {node.type_id: node for node in self.flat_nodes} 30 | self.depth = max([node.level for node in self.flat_nodes]) 31 | 32 | def get_node_from_id(self, type_id: str) -> SemanticTypeNode: 33 | return self.type_id_to_node[type_id] 34 | 35 | def get_canonical_name(self, type_id: str) -> str: 36 | return self.type_id_to_node[type_id].full_name 37 | 38 | def get_nodes_at_depth(self, level: int) -> List[SemanticTypeNode]: 39 | """ 40 | Returns nodes at a particular depth in the tree. 41 | """ 42 | return [node for node in self.flat_nodes if node.level == level] 43 | 44 | def get_children(self, node: SemanticTypeNode) -> List[SemanticTypeNode]: 45 | """ 46 | Recursively build up a flat list of all a node's children. 47 | """ 48 | children = [] 49 | for child in node.children: 50 | children.append(child) 51 | children.extend(self.get_children(child)) 52 | return children 53 | 54 | def get_parent(self, node: SemanticTypeNode) -> Optional[SemanticTypeNode]: 55 | """ 56 | Returns the parent of the input node, returning None if the input node is the root of the tree 57 | """ 58 | current_depth = node.level 59 | possible_parents = self.get_nodes_at_depth(current_depth - 1) 60 | 61 | for possible_parent in possible_parents: 62 | for child in possible_parent.children: 63 | if child.type_id == node.type_id: 64 | return possible_parent 65 | 66 | # If there are no parents, we are at the root and return None 67 | return None 68 | 69 | def get_collapsed_type_id_map_at_level(self, level: int) -> Dict[str, str]: 70 | """ 71 | Constructs a label mapping from the original tree labels to a tree of a fixed depth, 72 | collapsing labels greater than the depth specified to the closest parent which is 73 | still present in the new fixed depth tree. This is effectively mapping to a _coarser_ 74 | label space. 
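For example, with the small tree shown in `construct_umls_tree_from_tsv` below,
calling this with level=3 maps T054 ("Social Behavior") and T055 ("Individual Behavior")
to their parent T053 ("Behavior"), and maps every other type id to itself.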
75 | """ 76 | new_type_id_map: Dict[str, str] = {k: k for k in self.type_id_to_node.keys()} 77 | for node in self.get_nodes_at_depth(level): 78 | for child in self.get_children(node): 79 | new_type_id_map[child.type_id] = node.type_id 80 | return new_type_id_map 81 | 82 | 83 | def construct_umls_tree_from_tsv(filepath: str) -> UmlsSemanticTypeTree: 84 | 85 | """ 86 | Reads in a tsv file which is formatted as a depth first traversal of 87 | a hierarchy tree, where nodes are of the format: 88 | 89 | Name TAB UMLS Semantic Type TAB Tree Depth 90 | 91 | Event T051 1 92 | Activity T052 2 93 | Behavior T053 3 94 | Social Behavior T054 4 95 | Individual Behavior T055 4 96 | Daily or Recreational Activity T056 3 97 | """ 98 | 99 | node_stack: Deque[SemanticTypeNode] = deque() 100 | for line in open(cached_path(filepath), "r"): 101 | name, type_id, level = line.split("\t") 102 | name = name.strip() 103 | int_level = int(level.strip()) 104 | node = SemanticTypeNode(type_id, name, [], int_level) 105 | 106 | node_stack.append(node) 107 | 108 | def attach_children(node: SemanticTypeNode, stack: Deque[SemanticTypeNode]): 109 | while stack and stack[0].level > node.level: 110 | popped = stack.popleft() 111 | attach_children(popped, stack) 112 | node.children.append(popped) 113 | 114 | first = node_stack.popleft() 115 | attach_children(first, node_stack) 116 | 117 | return UmlsSemanticTypeTree(first) 118 | -------------------------------------------------------------------------------- /scripts/export_umls_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Convert a umls release to a jsonl file of concepts. 4 | 5 | """ 6 | import json 7 | import argparse 8 | from scispacy import umls_utils 9 | 10 | def main(meta_path: str, output_path: str, source: str = None): 11 | 12 | concept_details = {} # dictionary of concept_id -> { 13 | # 'concept_id': str, 14 | # 'canonical_name': str 15 | # 'aliases': List[str] 16 | # 'types': List[str] 17 | # 'definition': str 18 | # } 19 | 20 | print('Reading concepts ... ') 21 | umls_utils.read_umls_concepts(meta_path, concept_details, source) 22 | 23 | print('Reading types ... ') 24 | umls_utils.read_umls_types(meta_path, concept_details) 25 | 26 | print('Reading definitions ... 
') 27 | umls_utils.read_umls_definitions(meta_path, concept_details) 28 | 29 | without_canonical_name_count = 0 30 | without_aliases_count = 0 31 | with_one_alias_count = 0 32 | with_more_than_one_alias_count = 0 33 | without_type_count = 0 34 | with_one_type_count = 0 35 | with_more_than_one_type_count = 0 36 | without_definition_count = 0 37 | with_definition_pref_source_count = 0 38 | with_definition_other_sources_count = 0 39 | for concept in concept_details.values(): 40 | without_canonical_name_count += 1 if 'canonical_name' not in concept else 0 41 | without_aliases_count += 1 if len(concept['aliases']) == 0 else 0 42 | with_one_alias_count += 1 if len(concept['aliases']) == 1 else 0 43 | with_more_than_one_alias_count += 1 if len(concept['aliases']) > 1 else 0 44 | without_type_count += 1 if len(concept['types']) == 0 else 0 45 | with_one_type_count += 1 if len(concept['types']) == 1 else 0 46 | with_more_than_one_type_count += 1 if len(concept['types']) > 1 else 0 47 | without_definition_count += 1 if 'definition' not in concept else 0 48 | with_definition_pref_source_count += 1 if concept.get('is_from_preferred_source') == 'Y' else 0 49 | with_definition_other_sources_count += 1 if concept.get('is_from_preferred_source') == 'N' else 0 50 | 51 | print(f'Number of concepts: {len(concept_details)}') 52 | print(f'Number of concepts without canonical name (one of the aliases will be used instead): ' 53 | f'{without_canonical_name_count}') 54 | print(f'Number of concepts with no aliases: {without_aliases_count}') 55 | print(f'Number of concepts with 1 alias: {with_one_alias_count}') 56 | print(f'Number of concepts with > 1 alias: {with_more_than_one_alias_count}') 57 | print(f'Number of concepts with no type: {without_type_count}') 58 | print(f'Number of concepts with 1 type: {with_one_type_count}') 59 | print(f'Number of concepts with > 1 type: {with_more_than_one_type_count}') 60 | print(f'Number of concepts with no definition: {without_definition_count}') 61 | print(f'Number of concepts with definition from preferred sources: {with_definition_pref_source_count}') 62 | print(f'Number of concepts with definition from other sources: {with_definition_other_sources_count}') 63 | 64 | print('Deleting unused fields and choosing a canonical name from aliases ... ') 65 | for concept in concept_details.values(): 66 | 67 | # Some concepts have many duplicate aliases. Here we remove them. 68 | concept["aliases"] = list(set(concept["aliases"])) 69 | 70 | # if a concept doesn't have a canonical name, use the first alias instead 71 | if 'canonical_name' not in concept: 72 | aliases = concept['aliases'] 73 | concept['canonical_name'] = aliases[0] 74 | del aliases[0] 75 | 76 | # deleting `is_from_preferred_source` 77 | if 'is_from_preferred_source' in concept: 78 | del concept['is_from_preferred_source'] 79 | 80 | print('Exporting to a jsonl file {} ...'.format(output_path)) 81 | with open(output_path, 'w') as fout: 82 | 83 | for value in concept_details.values(): 84 | fout.write(json.dumps(value) + "\n") 85 | print('DONE.') 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument( 91 | '--meta_path', 92 | help="Path to the META directory of a UMLS release." 93 | ) 94 | parser.add_argument( 95 | '--output_path', 96 | help="Path to the output jsonl file" 97 | ) 98 | parser.add_argument( 99 | '--source', 100 | type=str, 101 | default=None, 102 | help="Whether to filter for only a single UMLS source."
103 | ) 104 | args = parser.parse_args() 105 | main(args.meta_path, args.output_path, args.source) 106 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Optional 2 | import os 3 | 4 | import pytest 5 | import spacy 6 | from spacy.language import Language as SpacyModelType 7 | from spacy.cli.download import download as spacy_download 8 | 9 | from scispacy.custom_sentence_segmenter import pysbd_sentencizer 10 | from scispacy.custom_tokenizer import combined_rule_tokenizer, combined_rule_prefixes, remove_new_lines 11 | from scispacy.abbreviation import AbbreviationDetector 12 | 13 | LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {} 14 | 15 | 16 | def get_spacy_model( 17 | spacy_model_name: str, 18 | pos_tags: bool, 19 | parse: bool, 20 | ner: bool, 21 | with_custom_tokenizer: bool = False, 22 | with_sentence_segmenter: bool = False, 23 | with_serializable_abbreviation_detector: Optional[bool] = None, 24 | ) -> SpacyModelType: 25 | """ 26 | In order to avoid loading spacy models repeatedly, 27 | we'll save references to them, keyed by the options 28 | we used to create the spacy model, so any particular 29 | configuration only gets loaded once. 30 | """ 31 | options = (spacy_model_name, pos_tags, parse, ner, with_custom_tokenizer, with_sentence_segmenter, with_serializable_abbreviation_detector) 32 | if options not in LOADED_SPACY_MODELS: 33 | disable = ["vectors", "textcat"] 34 | if not pos_tags: 35 | disable.append("tagger") 36 | if not parse: 37 | disable.append("parser") 38 | if not ner: 39 | disable.append("ner") 40 | try: 41 | spacy_model = spacy.load(spacy_model_name, disable=disable) 42 | except OSError: 43 | print(f"Spacy models '{spacy_model_name}' not found. 
Downloading and installing.") 44 | spacy_download(spacy_model_name) 45 | spacy_model = spacy.load(spacy_model_name, disable=disable) 46 | 47 | if with_custom_tokenizer: 48 | spacy_model.tokenizer = combined_rule_tokenizer(spacy_model) 49 | if with_sentence_segmenter: 50 | spacy_model.add_pipe("pysbd_sentencizer", first=True) 51 | if with_serializable_abbreviation_detector is not None: 52 | spacy_model.add_pipe("abbreviation_detector", config={"make_serializable": with_serializable_abbreviation_detector}) 53 | 54 | LOADED_SPACY_MODELS[options] = spacy_model 55 | return LOADED_SPACY_MODELS[options] 56 | 57 | 58 | @pytest.fixture() 59 | def combined_rule_tokenizer_fixture(): 60 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 61 | tokenizer = combined_rule_tokenizer(nlp) 62 | return tokenizer 63 | 64 | 65 | @pytest.fixture() 66 | def en_with_combined_rule_tokenizer_fixture(): 67 | nlp = get_spacy_model("en_core_web_sm", True, True, True, with_custom_tokenizer=True) 68 | return nlp 69 | 70 | 71 | @pytest.fixture() 72 | def en_with_combined_rule_tokenizer_and_segmenter_fixture(): 73 | nlp = get_spacy_model("en_core_web_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=True) 74 | return nlp 75 | 76 | 77 | @pytest.fixture() 78 | def test_data_fixtures_path(): 79 | return os.path.join("tests", "custom_tests", "data_fixtures") 80 | 81 | 82 | @pytest.fixture() 83 | def test_raw_path(): 84 | return os.path.join("tests", "custom_tests", "data_fixtures", "raw") 85 | 86 | 87 | @pytest.fixture() 88 | def test_pmids_path(): 89 | return os.path.join("tests", "custom_tests", "data_fixtures", "test.pmids") 90 | 91 | 92 | @pytest.fixture() 93 | def test_conll_path(): 94 | return os.path.join("tests", "custom_tests", "data_fixtures", "test.conllu") 95 | 96 | 97 | @pytest.fixture() 98 | def test_model_dir(): 99 | return os.path.join("tests", "custom_tests", "data_fixtures", "tmp_model_dir") 100 | 101 | 102 | @pytest.fixture() 103 | def combined_all_model_fixture(): 104 | nlp = get_spacy_model("en_core_sci_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=False, with_serializable_abbreviation_detector=True) 105 | return nlp 106 | 107 | @pytest.fixture() 108 | def combined_all_model_fixture_non_serializable_abbrev(): 109 | nlp = get_spacy_model("en_core_sci_sm", True, True, True, with_custom_tokenizer=True, with_sentence_segmenter=False, with_serializable_abbreviation_detector=False) 110 | return nlp 111 | 112 | @pytest.fixture() 113 | def combined_rule_prefixes_fixture(): 114 | return combined_rule_prefixes() 115 | 116 | 117 | @pytest.fixture() 118 | def remove_new_lines_fixture(): 119 | return remove_new_lines 120 | 121 | 122 | @pytest.fixture() 123 | def default_en_tokenizer_fixture(): 124 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 125 | return nlp.tokenizer 126 | 127 | 128 | @pytest.fixture() 129 | def default_en_model_fixture(): 130 | nlp = get_spacy_model("en_core_web_sm", True, True, True) 131 | return nlp 132 | -------------------------------------------------------------------------------- /scispacy/hyponym_detector.py: -------------------------------------------------------------------------------- 1 | from spacy.matcher import Matcher 2 | from spacy.tokens import Token, Doc 3 | from spacy.language import Language 4 | 5 | from scispacy.hearst_patterns import BASE_PATTERNS, EXTENDED_PATTERNS 6 | 7 | 8 | @Language.factory("hyponym_detector") 9 | class HyponymDetector: 10 | """ 11 | A spaCy pipe for detecting hyponyms using 
Hearst patterns. 12 | This class sets the following attributes: 13 | 14 | - `Doc._.hearst_patterns`: A List[Tuple[str, Span, Span]] corresponding to 15 | the matching predicate, extracted general term and specific term 16 | that matched a Hearst pattern. 17 | 18 | Parts of the implementation taken from 19 | https://github.com/mmichelsonIF/hearst_patterns_python/blob/master/hearstPatterns/hearstPatterns.py 20 | and 21 | https://github.com/Fourthought/CNDPipeline/blob/master/cndlib/hpspacy.py 22 | 23 | The pipe can be used with an instantiated spacy model like so: 24 | ``` 25 | # add the hyponym detector 26 | nlp.add_pipe('hyponym_detector', config={'extended': True}, last=True) 27 | ``` 28 | Parameters 29 | ---------- 30 | 31 | nlp: `Language`, a required argument for spacy to use this as a factory 32 | name: `str`, a required argument for spacy to use this as a factory 33 | extended: `bool`, whether to use the extended Hearst patterns or not 34 | """ 35 | 36 | def __init__( 37 | self, nlp: Language, name: str = "hyponym_detector", extended: bool = False 38 | ): 39 | 40 | self.nlp = nlp 41 | 42 | self.patterns = BASE_PATTERNS 43 | if extended: 44 | self.patterns.extend(EXTENDED_PATTERNS) 45 | 46 | self.matcher = Matcher(self.nlp.vocab) 47 | 48 | Doc.set_extension("hearst_patterns", default=[], force=True) 49 | 50 | self.first = set() 51 | self.last = set() 52 | 53 | # add patterns to matcher 54 | for pattern in self.patterns: 55 | self.matcher.add(pattern["label"], [pattern["pattern"]]) 56 | 57 | # gather list of predicates where the hypernym appears first 58 | if pattern["position"] == "first": 59 | self.first.add(pattern["label"]) 60 | 61 | # gather list of predicates where the hypernym appears last 62 | if pattern["position"] == "last": 63 | self.last.add(pattern["label"]) 64 | 65 | def expand_to_noun_compound(self, token: Token, doc: Doc): 66 | """ 67 | Expand a token to its noun phrase based 68 | on a simple POS tag heuristic. 69 | """ 70 | 71 | start = token.i 72 | while True: 73 | if start - 1 < 0: 74 | break 75 | previous_token = doc[start - 1] 76 | if previous_token.pos_ in {"PROPN", "NOUN", "PRON"}: 77 | start -= 1 78 | else: 79 | break 80 | 81 | end = token.i + 1 82 | while True: 83 | if end >= len(doc): 84 | break 85 | next_token = doc[end] 86 | if next_token.pos_ in {"PROPN", "NOUN", "PRON"}: 87 | end += 1 88 | else: 89 | break 90 | 91 | return doc[start:end] 92 | 93 | def find_noun_compound_head(self, token: Token): 94 | 95 | while token.head.pos_ in {"PROPN", "NOUN", "PRON"} and token.dep_ == "compound": 96 | token = token.head 97 | return token 98 | 99 | def __call__(self, doc: Doc): 100 | """ 101 | Runs the matcher on the Doc object and sets token and 102 | doc level attributes for hypernym and hyponym relations. 103 | """ 104 | # Find matches in doc 105 | matches = self.matcher(doc) 106 | 107 | # If no matches are found, return the doc unchanged 108 | if not matches: 109 | return doc 110 | 111 | for match_id, start, end in matches: 112 | predicate = self.nlp.vocab.strings[match_id] 113 | 114 | # if the predicate is one where the hypernym appears last; otherwise it appears first 115 | if predicate in self.last: 116 | hypernym = doc[end - 1] 117 | hyponym = doc[start] 118 | else: 119 | # An inelegant way to deal with the "such_NOUN_as" pattern, 120 | # since the first token is not the hypernym. 
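                    # For example, in "such authors as Herrick and Shakespeare" the match
                    # starts on "such", so we step one token forward and treat "authors" as
                    # the hypernym; the hyponym is still taken from the end of the match.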
121 | if doc[start].lemma_ == "such": 122 | start += 1 123 | hypernym = doc[start] 124 | hyponym = doc[end - 1] 125 | 126 | hypernym = self.find_noun_compound_head(hypernym) 127 | hyponym = self.find_noun_compound_head(hyponym) 128 | 129 | # For the document level, we expand to contain noun phrases. 130 | hypernym_extended = self.expand_to_noun_compound(hypernym, doc) 131 | hyponym_extended = self.expand_to_noun_compound(hyponym, doc) 132 | 133 | doc._.hearst_patterns.append( 134 | (predicate, hypernym_extended, hyponym_extended) 135 | ) 136 | 137 | for token in hyponym.conjuncts: 138 | 139 | token_extended = self.expand_to_noun_compound(token, doc) 140 | if token != hypernym and token is not None: 141 | doc._.hearst_patterns.append( 142 | (predicate, hypernym_extended, token_extended) 143 | ) 144 | 145 | return doc 146 | -------------------------------------------------------------------------------- /scispacy/file_cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | """ 4 | 5 | import os 6 | import shutil 7 | import tempfile 8 | import json 9 | from urllib.parse import urlparse 10 | from pathlib import Path 11 | from typing import Tuple, Union, IO 12 | from hashlib import sha256 13 | 14 | import requests 15 | 16 | CACHE_ROOT = Path(os.getenv("SCISPACY_CACHE", str(Path.home() / ".scispacy"))) 17 | DATASET_CACHE = str(CACHE_ROOT / "datasets") 18 | 19 | 20 | def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str: 21 | """ 22 | Given something that might be a URL (or might be a local path), 23 | determine which. If it's a URL, download the file and cache it, and 24 | return the path to the cached file. If it's already a local path, 25 | make sure the file exists and then return the path. 26 | """ 27 | if cache_dir is None: 28 | cache_dir = DATASET_CACHE 29 | if isinstance(url_or_filename, Path): 30 | url_or_filename = str(url_or_filename) 31 | 32 | parsed = urlparse(url_or_filename) 33 | 34 | if parsed.scheme in ("http", "https"): 35 | # URL, so get it from the cache (downloading if necessary) 36 | return get_from_cache(url_or_filename, cache_dir) 37 | elif os.path.exists(url_or_filename): 38 | # File, and it exists. 39 | return url_or_filename 40 | elif parsed.scheme == "": 41 | # File, but it doesn't exist. 42 | raise FileNotFoundError("file {} not found".format(url_or_filename)) 43 | else: 44 | # Something unknown 45 | raise ValueError( 46 | "unable to parse {} as a URL or as a local path".format(url_or_filename) 47 | ) 48 | 49 | 50 | def url_to_filename(url: str, etag: str = None) -> str: 51 | """ 52 | Convert `url` into a hashed filename in a repeatable way. 53 | If `etag` is specified, append its hash to the url's, delimited 54 | by a period. 55 | """ 56 | 57 | last_part = url.split("/")[-1] 58 | url_bytes = url.encode("utf-8") 59 | url_hash = sha256(url_bytes) 60 | filename = url_hash.hexdigest() 61 | 62 | if etag: 63 | etag_bytes = etag.encode("utf-8") 64 | etag_hash = sha256(etag_bytes) 65 | filename += "." + etag_hash.hexdigest() 66 | 67 | filename += "." + last_part 68 | return filename 69 | 70 | 71 | def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]: 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. 
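    (The url and etag are read from the ``<cached filename>.json`` sidecar file that ``get_from_cache`` writes next to each downloaded file.)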
75 | """ 76 | if cache_dir is None: 77 | cache_dir = DATASET_CACHE 78 | 79 | cache_path = os.path.join(cache_dir, filename) 80 | if not os.path.exists(cache_path): 81 | raise FileNotFoundError("file {} not found".format(cache_path)) 82 | 83 | meta_path = cache_path + ".json" 84 | if not os.path.exists(meta_path): 85 | raise FileNotFoundError("file {} not found".format(meta_path)) 86 | 87 | with open(meta_path) as meta_file: 88 | metadata = json.load(meta_file) 89 | url = metadata["url"] 90 | etag = metadata["etag"] 91 | 92 | return url, etag 93 | 94 | 95 | def http_get(url: str, temp_file: IO) -> None: 96 | req = requests.get(url, stream=True) 97 | for chunk in req.iter_content(chunk_size=1024): 98 | if chunk: # filter out keep-alive new chunks 99 | temp_file.write(chunk) 100 | 101 | 102 | def get_from_cache(url: str, cache_dir: str = None) -> str: 103 | """ 104 | Given a URL, look for the corresponding dataset in the local cache. 105 | If it's not there, download it. Then return the path to the cached file. 106 | """ 107 | if cache_dir is None: 108 | cache_dir = DATASET_CACHE 109 | 110 | os.makedirs(cache_dir, exist_ok=True) 111 | 112 | response = requests.head(url, allow_redirects=True) 113 | if response.status_code != 200: 114 | raise IOError( 115 | "HEAD request failed for url {} with status code {}".format( 116 | url, response.status_code 117 | ) 118 | ) 119 | etag = response.headers.get("ETag") 120 | 121 | filename = url_to_filename(url, etag) 122 | 123 | # get cache path to put the file 124 | cache_path = os.path.join(cache_dir, filename) 125 | 126 | if not os.path.exists(cache_path): 127 | # Download to temporary file, then copy to cache dir once finished. 128 | # Otherwise you get corrupt cache entries if the download gets interrupted. 129 | with tempfile.NamedTemporaryFile() as temp_file: # type: IO 130 | print(f"{url} not found in cache, downloading to {temp_file.name}") 131 | 132 | # GET file object 133 | http_get(url, temp_file) 134 | 135 | # we are copying the file before closing it, so flush to avoid truncation 136 | temp_file.flush() 137 | # shutil.copyfileobj() starts at the current position, so go to the start 138 | temp_file.seek(0) 139 | 140 | print( 141 | f"Finished download, copying {temp_file.name} to cache at {cache_path}" 142 | ) 143 | with open(cache_path, "wb") as cache_file: 144 | shutil.copyfileobj(temp_file, cache_file) 145 | 146 | meta = {"url": url, "etag": etag} 147 | meta_path = cache_path + ".json" 148 | with open(meta_path, "w") as meta_file: 149 | json.dump(meta, meta_file) 150 | 151 | return cache_path 152 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | **scispaCy is a Python package containing [spaCy](https://spacy.io/) models for processing _biomedical_, _scientific_ or _clinical_ text.** 6 | 7 | 8 | ## Interactive Demo 9 | Just looking to test out the models on your data? Check out our [demo](https://scispacy.apps.allenai.org). 10 | 11 | ## Installing 12 | ```python 13 | pip install scispacy 14 | pip install 15 | ``` 16 | ## Models 17 | 18 | | Model | Description | Install URL 19 | |:---------------|:------------------|:----------| 20 | | en_core_sci_sm | A full spaCy pipeline for biomedical data. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| 21 | | en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| 22 | | en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| 23 | | en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| 24 | | en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| 25 | | en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| 26 | | en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| 27 | | en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| 28 | 29 | 30 | 31 | 32 | ### Performance 33 | 34 | Our models achieve performance within 3% of published state of the art dependency parsers and within 0.4% accuracy of state of the art biomedical POS taggers. 35 | 36 | | model | UAS | LAS | POS | Mentions (F1) | Web UAS | 37 | |:---------------|:----|:------|:------|:---|:---| 38 | | en_core_sci_sm | 89.54| 87.62 | 98.32 | 68.15 | 87.62 | 39 | | en_core_sci_md | 89.61| 87.77 | 98.56 | 69.64 | 88.05 | 40 | | en_core_sci_lg | 89.63| 87.81 | 98.56 | 69.61 | 88.08 | 41 | | en_core_sci_scibert | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | 42 | 43 | 44 | | model | F1 | Entity Types| 45 | |:---------------|:-----|:--------| 46 | | en_ner_craft_md | 76.11|GGP, SO, TAXON, CHEBI, GO, CL| 47 | | en_ner_jnlpba_md | 71.62| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | 48 | | en_ner_bc5cdr_md | 84.49| DISEASE, CHEMICAL| 49 | | en_ner_bionlp13cg_md | 77.75| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | 50 | 51 | 52 | ### Example Usage 53 | 54 | ```python 55 | import scispacy 56 | import spacy 57 | 58 | nlp = spacy.load("en_core_sci_sm") 59 | text = """ 60 | Myeloid derived suppressor cells (MDSC) are immature 61 | myeloid cells with immunosuppressive activity. 62 | They accumulate in tumor-bearing mice and humans 63 | with different types of cancer, including hepatocellular 64 | carcinoma (HCC). 65 | """ 66 | doc = nlp(text) 67 | 68 | print(list(doc.sents)) 69 | >>> ["Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity.", 70 | "They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC)."] 71 | 72 | # Examine the entities extracted by the mention detector. 
73 | # Note that they don't have types like in SpaCy, and they 74 | # are more general (e.g including verbs) - these are any 75 | # spans which might be an entity in UMLS, a large 76 | # biomedical database. 77 | print(doc.ents) 78 | >>> (Myeloid derived suppressor cells, 79 | MDSC, 80 | immature, 81 | myeloid cells, 82 | immunosuppressive activity, 83 | accumulate, 84 | tumor-bearing mice, 85 | humans, 86 | cancer, 87 | hepatocellular carcinoma, 88 | HCC) 89 | 90 | # We can also visualise dependency parses 91 | # (This renders automatically inside a jupyter notebook!): 92 | from spacy import displacy 93 | displacy.render(next(doc.sents), style='dep', jupyter=True) 94 | 95 | # See below for the generated SVG. 96 | # Zoom your browser in a bit! 97 | 98 | ``` 99 | 100 | ![Branching](./example.svg) 101 | 102 | ### Data Sources 103 | 104 | scispaCy models are trained on data from a variety of sources. In particular, 105 | we use: 106 | 107 | * **[The GENIA 1.0 Treebank](https://nlp.stanford.edu/~mcclosky/biomedical.html)**, converted to basic Universal Dependencies using the [Stanford Dependency Converter](https://nlp.stanford.edu/software/stanford-dependencies.shtml). 108 | We have made this [dataset available along with the original raw data](https://github.com/allenai/genia-dependency-trees). 109 | * **[word2vec word vectors](http://bio.nlplab.org/#word-vectors)** trained on the Pubmed Central Open Access Subset. 110 | * **[The MedMentions Entity Linking dataset](https://github.com/chanzuckerberg/MedMentions)**, used for training a mention detector. 111 | * **[Ontonotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19)** to make the parser and tagger more robust to non-biomedical text. Unfortunately this is not publically available. 112 | -------------------------------------------------------------------------------- /scispacy/custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from spacy.lang import char_classes 4 | from spacy.symbols import ORTH 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex 7 | from spacy.language import Language 8 | 9 | from scispacy.consts import ABBREVIATIONS 10 | 11 | 12 | def remove_new_lines(text: str) -> str: 13 | """Used to preprocess away new lines in the middle of words. This function 14 | is intended to be called on a raw string before it is passed through a 15 | spaCy pipeline 16 | 17 | @param text: a string of text to be processed 18 | """ 19 | text = text.replace("-\n\n", "") 20 | text = text.replace("- \n\n", "") 21 | text = text.replace("-\n", "") 22 | text = text.replace("- \n", "") 23 | return text 24 | 25 | 26 | def combined_rule_prefixes() -> List[str]: 27 | """Helper function that returns the prefix pattern for the tokenizer. 28 | It is a helper function to accomodate spacy tests that only test 29 | prefixes. 
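    A minimal sketch of how these prefix patterns are typically consumed (it mirrors the ``compile_prefix_regex`` import at the top of this module; the variable names are illustrative only):

        prefix_re = compile_prefix_regex(combined_rule_prefixes())
        # prefix_re.search is what a spaCy Tokenizer takes as its prefix_search argument.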
30 | """ 31 | # add lookahead assertions for brackets (may not work properly for unbalanced brackets) 32 | prefix_punct = char_classes.PUNCT.replace("|", " ") 33 | prefix_punct = prefix_punct.replace(r"\(", r"\((?![^\(\s]+\)\S+)") 34 | prefix_punct = prefix_punct.replace(r"\[", r"\[(?![^\[\s]+\]\S+)") 35 | prefix_punct = prefix_punct.replace(r"\{", r"\{(?![^\{\s]+\}\S+)") 36 | 37 | prefixes = ( 38 | ["§", "%", "=", r"\+"] 39 | + char_classes.split_chars(prefix_punct) 40 | + char_classes.LIST_ELLIPSES 41 | + char_classes.LIST_QUOTES 42 | + char_classes.LIST_CURRENCY 43 | + char_classes.LIST_ICONS 44 | ) 45 | return prefixes 46 | 47 | 48 | def combined_rule_tokenizer(nlp: Language) -> Tokenizer: 49 | """Creates a custom tokenizer on top of spaCy's default tokenizer. The 50 | intended use of this function is to replace the tokenizer in a spaCy 51 | pipeline like so: 52 | 53 | nlp = spacy.load("some_spacy_model") 54 | nlp.tokenizer = combined_rule_tokenizer(nlp) 55 | 56 | @param nlp: a loaded spaCy model 57 | """ 58 | # remove the first hyphen to prevent tokenization of the normal hyphen 59 | hyphens = char_classes.HYPHENS.replace("-|", "", 1) 60 | 61 | infixes = ( 62 | char_classes.LIST_ELLIPSES 63 | + char_classes.LIST_ICONS 64 | + [ 65 | r"×", # added this special x character to tokenize it separately 66 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 67 | r"(?<=[{al}])\.(?=[{au}])".format( 68 | al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER 69 | ), 70 | r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA), 71 | r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format( 72 | a=char_classes.ALPHA, h=hyphens 73 | ), 74 | # removed / to prevent tokenization of / 75 | r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA), 76 | ] 77 | ) 78 | 79 | prefixes = combined_rule_prefixes() 80 | 81 | # add the last apostrophe 82 | quotes = char_classes.LIST_QUOTES.copy() + ["’"] 83 | 84 | # add lookbehind assertions for brackets (may not work properly for unbalanced brackets) 85 | suffix_punct = char_classes.PUNCT.replace("|", " ") 86 | # These lookbehinds are commented out because they are variable width lookbehinds, and as of spacy 2.1, 87 | # spacy uses the re package instead of the regex package. The re package does not support variable width 88 | # lookbehinds. Hacking spacy internals to allow us to use the regex package is doable, but would require 89 | # creating our own instance of the language class, with our own Tokenizer class, with the from_bytes method 90 | # using the regex package instead of the re package 91 | # suffix_punct = suffix_punct.replace(r"\)", r"(? Doc: 99 | mention_strings = [] 100 | if self.resolve_abbreviations and Doc.has_extension("abbreviations"): 101 | # TODO: This is possibly sub-optimal - we might 102 | # prefer to look up both the long and short forms. 
103 | for ent in doc.ents: 104 | if isinstance(ent._.long_form, Span): 105 | # Long form 106 | mention_strings.append(ent._.long_form.text) 107 | elif isinstance(ent._.long_form, str): 108 | # Long form 109 | mention_strings.append(ent._.long_form) 110 | else: 111 | # no abbreviations case 112 | mention_strings.append(ent.text) 113 | else: 114 | mention_strings = [x.text for x in doc.ents] 115 | 116 | batch_candidates = self.candidate_generator(mention_strings, self.k) 117 | 118 | for mention, candidates in zip(doc.ents, batch_candidates): 119 | predicted = [] 120 | for cand in candidates: 121 | score = max(cand.similarities) 122 | if ( 123 | self.filter_for_definitions 124 | and self.kb.cui_to_entity[cand.concept_id].definition is None 125 | and score < self.no_definition_threshold 126 | ): 127 | continue 128 | if score > self.threshold: 129 | predicted.append((cand.concept_id, score)) 130 | sorted_predicted = sorted(predicted, reverse=True, key=lambda x: x[1]) 131 | mention._.umls_ents = sorted_predicted[: self.max_entities_per_mention] 132 | mention._.kb_ents = sorted_predicted[: self.max_entities_per_mention] 133 | 134 | return doc 135 | -------------------------------------------------------------------------------- /scispacy/umls_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | # TODO(Mark): Remove in scispacy v1.0, for backward compatibility only. 4 | from scispacy.linking_utils import Entity as UmlsEntity, UmlsKnowledgeBase # noqa 5 | 6 | # preferred definition sources (from S2) 7 | DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"} 8 | 9 | 10 | def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: 11 | """ 12 | Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names) 13 | for the given file 14 | 15 | MRFILES.RRF file format: pipe-separated values 16 | Useful columns: 17 | column 0: name of one of the files in the META directory 18 | column 2: column names of that file 19 | 20 | Args: 21 | meta_path: path to the META directory of a UMLS release 22 | filename: name of the file to get its column headers 23 | Returns: 24 | a list of column names 25 | """ 26 | file_descriptors = f"{meta_path}/MRFILES.RRF" # to get column names 27 | with open(file_descriptors) as fin: 28 | for line in fin: 29 | splits = line.split("|") 30 | found_filename = splits[0] 31 | column_names = (splits[2] + ",").split( 32 | "," 33 | ) # ugly hack because all files end with an empty column 34 | if found_filename in filename: 35 | return column_names 36 | assert False, f"Couldn't find column names for file {filename}" 37 | return None 38 | 39 | 40 | def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None): 41 | """ 42 | Read the concepts file MRCONSO.RRF from a UMLS release and store it in 43 | concept_details dictionary. Each concept is represented with 44 | - concept_id 45 | - canonical_name 46 | - aliases 47 | - types 48 | - definition 49 | This function fills the first three. If a canonical name is not found, it is left empty. 50 | 51 | MRCONSO.RRF file format: pipe-separated values 52 | Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT 53 | 54 | Args: 55 | meta_path: path to the META directory of a UMLS release 56 | concept_details: a dictionary to be filled with concept information 57 | source: An optional source identifier, used as a filter to extract only a 58 | specific source from UMLS. 
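            (For example, passing ``source="SNOMEDCT_US"`` would keep only MRCONSO rows whose SAB column is that vocabulary.)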
59 | """ 60 | concepts_filename = "MRCONSO.RRF" 61 | headers = read_umls_file_headers(meta_path, concepts_filename) 62 | with open(f"{meta_path}/{concepts_filename}") as fin: 63 | for line in fin: 64 | splits = line.strip().split("|") 65 | assert len(headers) == len(splits), (headers, splits) 66 | concept = dict(zip(headers, splits)) 67 | if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N": 68 | continue # Keep English non-suppressed concepts only 69 | 70 | if source is not None: 71 | if concept["SAB"] != source: 72 | continue 73 | 74 | concept_id = concept["CUI"] 75 | if concept_id not in concept_details: # a new concept 76 | # add it to the dictionary with an empty list of aliases and types 77 | concept_details[concept_id] = { 78 | "concept_id": concept_id, 79 | "aliases": [], 80 | "types": [], 81 | } 82 | 83 | concept_name = concept["STR"] 84 | # this condition is copied from S2. It checks if the concept name is canonical or not 85 | is_canonical = ( 86 | concept["ISPREF"] == "Y" 87 | and concept["TS"] == "P" 88 | and concept["STT"] == "PF" 89 | ) 90 | 91 | if not is_canonical or "canonical_name" in concept_details[concept_id]: 92 | # not a canonical name or a canonical name already found 93 | concept_details[concept_id]["aliases"].append( 94 | concept_name 95 | ) # add it as an alias 96 | else: 97 | concept_details[concept_id][ 98 | "canonical_name" 99 | ] = concept_name # set as canonical name 100 | 101 | 102 | def read_umls_types(meta_path: str, concept_details: Dict): 103 | """ 104 | Read the types file MRSTY.RRF from a UMLS release and store it in 105 | concept_details dictionary. This function adds the `types` field 106 | to the information of each concept 107 | 108 | MRSTY.RRF file format: a pipe-separated values 109 | Useful columns: CUI, TUI 110 | 111 | Args: 112 | meta_path: path to the META directory of an UMLS release 113 | concept_details: a dictionary to be filled with concept informations 114 | """ 115 | types_filename = "MRSTY.RRF" 116 | headers = read_umls_file_headers(meta_path, types_filename) 117 | with open(f"{meta_path}/{types_filename}") as fin: 118 | for line in fin: 119 | splits = line.strip().split("|") 120 | assert len(headers) == len(splits) 121 | concept_type = dict(zip(headers, splits)) 122 | 123 | concept = concept_details.get(concept_type["CUI"]) 124 | if ( 125 | concept is not None 126 | ): # a small number of types are for concepts that don't exist 127 | concept["types"].append(concept_type["TUI"]) 128 | 129 | 130 | def read_umls_definitions(meta_path: str, concept_details: Dict): 131 | """ 132 | Read the types file MRDEF.RRF from a UMLS release and store it in 133 | concept_details dictionary. 
This function adds the `definition` field 134 | to the information of each concept 135 | 136 | MRDEF.RRF file format: a pipe-separated values 137 | Useful columns: CUI, SAB, SUPPRESS, DEF 138 | 139 | Args: 140 | meta_path: path to the META directory of an UMLS release 141 | concept_details: a dictionary to be filled with concept informations 142 | """ 143 | definitions_filename = "MRDEF.RRF" 144 | headers = read_umls_file_headers(meta_path, definitions_filename) 145 | with open(f"{meta_path}/{definitions_filename}") as fin: 146 | headers = read_umls_file_headers(meta_path, definitions_filename) 147 | for line in fin: 148 | splits = line.strip().split("|") 149 | assert len(headers) == len(splits) 150 | definition = dict(zip(headers, splits)) 151 | 152 | if definition["SUPPRESS"] != "N": 153 | continue 154 | is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED 155 | concept = concept_details.get(definition["CUI"]) 156 | if ( 157 | concept is None 158 | ): # a small number of definitions are for concepts that don't exist 159 | continue 160 | 161 | if ( 162 | "definition" not in concept 163 | or is_from_preferred_source 164 | and concept["is_from_preferred_source"] == "N" 165 | ): 166 | concept["definition"] = definition["DEF"] 167 | concept["is_from_preferred_source"] = ( 168 | "Y" if is_from_preferred_source else "N" 169 | ) 170 | -------------------------------------------------------------------------------- /docs/example.svg: -------------------------------------------------------------------------------- 1 | MyeloidADJderivedVERBsuppressorNOUNcells (NOUNMDSC)NOUNareVERBimmatureADJmyeloidADJcellsNOUNwithADPimmunosuppressiveADJactivity.NOUNamodamodcompoundnsubjapposcopamodamodcaseamodnmod -------------------------------------------------------------------------------- /tests/test_abbreviation_detection.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import spacy 3 | import pytest 4 | 5 | from scispacy.abbreviation import ( 6 | AbbreviationDetector, 7 | find_abbreviation, 8 | filter_matches, 9 | ) 10 | 11 | 12 | class TestAbbreviationDetector(unittest.TestCase): 13 | def setUp(self): 14 | super().setUp() 15 | self.nlp = spacy.load("en_core_web_sm") 16 | self.detector = AbbreviationDetector(self.nlp) 17 | self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \ 18 | inherited motor neuron disease caused by the expansion \ 19 | of a polyglutamine tract within the androgen receptor (AR). \ 20 | SBMA can be caused by this easily." 21 | 22 | def test_find_abbreviation(self): 23 | # Basic case 24 | doc = self.nlp("abbreviation (abbrn)") 25 | long = doc[0:1] 26 | short = doc[2:3] 27 | _, long_form = find_abbreviation(long, short) 28 | assert long_form.text == "abbreviation" 29 | 30 | # Hypenation and numbers within abbreviation 31 | doc = self.nlp("abbreviation (ab-b9rn)") 32 | long = doc[0:1] 33 | short = doc[2:3] 34 | _, long_form = find_abbreviation(long, short) 35 | assert long_form.text == "abbreviation" 36 | 37 | # No match 38 | doc = self.nlp("abbreviation (aeb-b9rn)") 39 | long = doc[0:1] 40 | short = doc[2:3] 41 | _, long_form = find_abbreviation(long, short) 42 | assert long_form is None 43 | 44 | # First letter must match start of word. 45 | doc = self.nlp("aaaabbreviation (ab-b9rn)") 46 | long = doc[0:1] 47 | short = doc[2:3] 48 | _, long_form = find_abbreviation(long, short) 49 | assert long_form.text == "aaaabbreviation" 50 | 51 | # Matching is greedy for first letter (are is not included). 
52 | doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)") 53 | long = doc[0:5] 54 | short = doc[6:7] 55 | _, long_form = find_abbreviation(long, short) 56 | assert long_form.text == "aaaabbreviation" 57 | 58 | def test_filter_matches(self): 59 | doc = self.nlp(self.text) 60 | matches = self.detector.matcher(doc) 61 | matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches] 62 | filtered = filter_matches(matches_no_brackets, doc) 63 | 64 | assert len(filtered) == 2 65 | long, short = filtered[0] 66 | assert long.text_with_ws == "Spinal and bulbar muscular atrophy " 67 | assert short.text == "SBMA" 68 | long, short = filtered[1] 69 | assert long.text_with_ws == "within the androgen receptor " 70 | assert short.text == "AR" 71 | 72 | def test_abbreviation_detection(self): 73 | # Attribute should be registered. 74 | doc = self.nlp(self.text) 75 | assert doc._.abbreviations == [] 76 | doc2 = self.detector(doc) 77 | assert len(doc2._.abbreviations) == 3 78 | 79 | correct = set() 80 | span = doc[33:34] 81 | span._.long_form = doc[0:5] 82 | correct.add(span) 83 | span = doc[6:7] 84 | span._.long_form = doc[0:5] 85 | correct.add(span) 86 | span = doc[29:30] 87 | span._.long_form = doc[26:28] 88 | correct.add(span) 89 | correct_long = {x._.long_form for x in correct} 90 | 91 | assert set(doc2._.abbreviations) == correct 92 | assert {x._.long_form for x in doc2._.abbreviations} == correct_long 93 | 94 | def test_find(self): 95 | doc = self.nlp(self.text) 96 | long, shorts = self.detector.find(doc[6:7], doc) 97 | assert long.text_with_ws == "Spinal and bulbar muscular atrophy " 98 | assert len(shorts) == 2 99 | assert {x.text_with_ws for x in shorts} == {"SBMA", "SBMA "} 100 | 101 | long, shorts = self.detector.find(doc[7:13], doc) 102 | assert shorts == set() 103 | 104 | def test_issue_158(self): 105 | text = ( 106 | "The PVO observations showed that the total transterminator flux " 107 | "was 23% of that at solar maximum and that the largest reductions in the " 108 | "number of ions transported antisunward occurred at the highest altitudes " 109 | "(Spenner et al., 1995)." 110 | ) 111 | doc = self.nlp(text) 112 | doc2 = self.detector(doc) 113 | assert len(doc2._.abbreviations) == 0 114 | 115 | def test_issue_192(self): 116 | # test for () pattern 117 | text = "blah SBMA (Spinal and bulbar muscular atrophy)" 118 | doc = self.nlp(text) 119 | doc2 = self.detector(doc) 120 | 121 | assert len(doc2._.abbreviations) == 1 122 | assert doc2._.abbreviations[0] == doc[1:2] 123 | assert doc2._.abbreviations[0]._.long_form == doc[3:8] 124 | 125 | def test_issue_161(self): 126 | # test some troublesome cases in the abbreviation detector 127 | text = "H2)]+(14)s.t. 
(1), (4).Similarly" 128 | print(f"Text: {text}") 129 | doc = self.nlp(text) 130 | doc2 = self.detector(doc) 131 | assert len(doc2._.abbreviations) == 0 132 | 133 | text = ".(21)In (21), λ" 134 | doc = self.nlp(text) 135 | doc2 = self.detector(doc) 136 | assert len(doc2._.abbreviations) == 0 137 | 138 | text = "map expX (·) : R" 139 | doc = self.nlp(text) 140 | doc2 = self.detector(doc) 141 | assert len(doc2._.abbreviations) == 0 142 | 143 | text = "0,(3)with the following data: (3-i) (q̄" 144 | doc = self.nlp(text) 145 | doc2 = self.detector(doc) 146 | assert len(doc2._.abbreviations) == 0 147 | 148 | text = "Φg(h),ThΦg(v) ) , (h, v)" 149 | doc = self.nlp(text) 150 | doc2 = self.detector(doc) 151 | assert len(doc2._.abbreviations) == 0 152 | 153 | text = "dimension;(S-iii) The optimal control problem obtained in (S-ii) is con-verted" 154 | doc = self.nlp(text) 155 | doc2 = self.detector(doc) 156 | assert len(doc2._.abbreviations) == 0 157 | 158 | text = "z), πut (z)) )" 159 | doc = self.nlp(text) 160 | doc2 = self.detector(doc) 161 | assert len(doc2._.abbreviations) == 0 162 | 163 | text = "repositories he/she already worked with or from previous collaborators. Nevertheless, 88% of the first action of users to a repository (repository discovery) is" 164 | doc = self.nlp(text) 165 | doc2 = self.detector(doc) 166 | assert len(doc2._.abbreviations) == 0 167 | 168 | def test_empty_span(self): 169 | text = "(19, 9, 4) Hadamard Designs and Their Residual Designs" 170 | doc = self.nlp(text) 171 | doc2 = self.detector(doc) 172 | assert len(doc2._.abbreviations) == 0 173 | 174 | def test_space_issue(self): 175 | text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture." 176 | doc = self.nlp(text) 177 | doc2 = self.detector(doc) 178 | assert len(doc2._.abbreviations) == 1 179 | assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT" 180 | 181 | def test_multiple_spaces(self): 182 | text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture." 183 | doc = self.nlp(text) 184 | doc2 = self.detector(doc) 185 | assert len(doc2._.abbreviations) == 1 186 | assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT" 187 | 188 | @pytest.mark.xfail 189 | def test_difficult_cases(self): 190 | # Don't see an obvious way of solving these. They require something more semantic to distinguish 191 | text = "is equivalent to (iv) of Theorem" 192 | doc = self.nlp(text) 193 | doc2 = self.detector(doc) 194 | assert len(doc2._.abbreviations) == 0 195 | 196 | text = "or to fork.Users work more on their repositories (owners) than on" 197 | doc = self.nlp(text) 198 | doc2 = self.detector(doc) 199 | assert len(doc2._.abbreviations) == 0 200 | -------------------------------------------------------------------------------- /tests/fixtures/med_mentions.txt: -------------------------------------------------------------------------------- 1 | 25763772|t|DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection in cystic fibrosis 2 | 25763772|a|Pseudomonas aeruginosa (Pa) infection in cystic fibrosis (CF) patients is associated with worse long-term pulmonary disease and shorter survival, and chronic Pa infection (CPA) is associated with reduced lung function, faster rate of lung decline, increased rates of exacerbations and shorter survival. 
By using exome sequencing and extreme phenotype design, it was recently shown that isoforms of dynactin 4 (DCTN4) may influence Pa infection in CF, leading to worse respiratory disease. The purpose of this study was to investigate the role of DCTN4 missense variants on Pa infection incidence, age at first Pa infection and chronic Pa infection incidence in a cohort of adult CF patients from a single centre. Polymerase chain reaction and direct sequencing were used to screen DNA samples for DCTN4 variants. A total of 121 adult CF patients from the Cochin Hospital CF centre have been included, all of them carrying two CFTR defects: 103 developed at least 1 pulmonary infection with Pa, and 68 patients of them had CPA. DCTN4 variants were identified in 24% (29/121) CF patients with Pa infection and in only 17% (3/18) CF patients with no Pa infection. Of the patients with CPA, 29% (20/68) had DCTN4 missense variants vs 23% (8/35) in patients without CPA. Interestingly, p.Tyr263Cys tend to be more frequently observed in CF patients with CPA than in patients without CPA (4/68 vs 0/35), and DCTN4 missense variants tend to be more frequent in male CF patients with CPA bearing two class II mutations than in male CF patients without CPA bearing two class II mutations (P = 0.06). Our observations reinforce that DCTN4 missense variants, especially p.Tyr263Cys, may be involved in the pathogenesis of CPA in male CF. 3 | 25763772 0 5 DCTN4 T103 UMLS:C4308010 4 | 25763772 23 63 chronic Pseudomonas aeruginosa infection T038 UMLS:C0854135 5 | 25763772 67 82 cystic fibrosis T038 UMLS:C0010674 6 | 25763772 83 120 Pseudomonas aeruginosa (Pa) infection T038 UMLS:C0854135 7 | 25763772 124 139 cystic fibrosis T038 UMLS:C0010674 8 | 25763772 141 143 CF T038 UMLS:C0010674 9 | 25763772 189 206 pulmonary disease T038 UMLS:C0024115 10 | 25763772 233 253 chronic Pa infection T038 UMLS:C0854135 11 | 25763772 255 258 CPA T038 UMLS:C0854135 12 | 25763772 302 329 faster rate of lung decline T033 UMLS:C3160731 13 | 25763772 350 363 exacerbations T033 UMLS:C4086268 14 | 25763772 395 411 exome sequencing T062 UMLS:C3640077 15 | 25763772 469 477 isoforms T103 UMLS:C0597298 16 | 25763772 481 491 dynactin 4 T103 UMLS:C4308010 17 | 25763772 493 498 DCTN4 T103 UMLS:C4308010 18 | 25763772 514 526 Pa infection T038 UMLS:C0854135 19 | 25763772 530 532 CF T038 UMLS:C0010674 20 | 25763772 551 570 respiratory disease T038 UMLS:C0035204 21 | 25763772 592 597 study T062 UMLS:C2603343 22 | 25763772 629 634 DCTN4 T103 UMLS:C4308010 23 | 25763772 644 652 variants T103 UMLS:C0597298 24 | 25763772 656 668 Pa infection T038 UMLS:C0854135 25 | 25763772 693 705 Pa infection T038 UMLS:C0854135 26 | 25763772 710 730 chronic Pa infection T038 UMLS:C0854135 27 | 25763772 746 752 cohort T098 UMLS:C0599755 28 | 25763772 762 764 CF T038 UMLS:C0010674 29 | 25763772 788 794 centre T092 UMLS:C0475309 30 | 25763772 796 821 Polymerase chain reaction T062 UMLS:C0032520 31 | 25763772 826 843 direct sequencing T062 UMLS:C3899368 32 | 25763772 864 875 DNA samples T017 UMLS:C0444245 33 | 25763772 880 885 DCTN4 T103 UMLS:C4308010 34 | 25763772 886 894 variants T103 UMLS:C0597298 35 | 25763772 917 919 CF T038 UMLS:C0010674 36 | 25763772 938 963 Cochin Hospital CF centre T092 UMLS:C0019994 37 | 25763772 1009 1013 CFTR T017 UMLS:C1413365 38 | 25763772 1048 1067 pulmonary infection T038 UMLS:C0876973 39 | 25763772 1073 1075 Pa T007 UMLS:C0033809 40 | 25763772 1105 1108 CPA T038 UMLS:C0854135 41 | 25763772 1110 1115 DCTN4 T103 UMLS:C4308010 42 | 25763772 1116 1124 
variants T103 UMLS:C0597298 43 | 25763772 1157 1159 CF T038 UMLS:C0010674 44 | 25763772 1174 1186 Pa infection T038 UMLS:C0854135 45 | 25763772 1210 1212 CF T038 UMLS:C0010674 46 | 25763772 1230 1242 Pa infection T038 UMLS:C0854135 47 | 25763772 1265 1268 CPA T038 UMLS:C0854135 48 | 25763772 1286 1291 DCTN4 T103 UMLS:C4308010 49 | 25763772 1301 1309 variants T103 UMLS:C0597298 50 | 25763772 1344 1347 CPA T038 UMLS:C0854135 51 | 25763772 1364 1375 p.Tyr263Cys T103 UMLS:C0597298 52 | 25763772 1415 1417 CF T038 UMLS:C0010674 53 | 25763772 1432 1435 CPA T038 UMLS:C0854135 54 | 25763772 1461 1464 CPA T038 UMLS:C0854135 55 | 25763772 1485 1490 DCTN4 T103 UMLS:C4308010 56 | 25763772 1500 1508 variants T103 UMLS:C0597298 57 | 25763772 1542 1544 CF T038 UMLS:C0010674 58 | 25763772 1559 1562 CPA T038 UMLS:C0854135 59 | 25763772 1575 1593 class II mutations T038 UMLS:C0026882 60 | 25763772 1607 1609 CF T038 UMLS:C0010674 61 | 25763772 1627 1630 CPA T038 UMLS:C0854135 62 | 25763772 1643 1661 class II mutations T038 UMLS:C0026882 63 | 25763772 1706 1711 DCTN4 T103 UMLS:C4308010 64 | 25763772 1721 1729 variants T103 UMLS:C0597298 65 | 25763772 1742 1753 p.Tyr263Cys T103 UMLS:C0597298 66 | 25763772 1778 1790 pathogenesis T038 UMLS:C0699748 67 | 25763772 1794 1797 CPA T038 UMLS:C0854135 68 | 25763772 1806 1808 CF T038 UMLS:C0010674 69 | 70 | 25847295|t|Nonylphenol diethoxylate inhibits apoptosis induced in PC12 cells 71 | 25847295|a|Nonylphenol and short-chain nonylphenol ethoxylates such as NP2 EO are present in aquatic environment as wastewater contaminants, and their toxic effects on aquatic species have been reported. Apoptosis has been shown to be induced by serum deprivation or copper treatment. To understand the toxicity of nonylphenol diethoxylate, we investigated the effects of NP2 EO on apoptosis induced by serum deprivation and copper by using PC12 cell system. Nonylphenol diethoxylate itself showed no toxicity and recovered cell viability from apoptosis. In addition, nonylphenol diethoxylate decreased DNA fragmentation caused by apoptosis in PC12 cells. This phenomenon was confirmed after treating apoptotic PC12 cells with nonylphenol diethoxylate, whereas the cytochrome c release into the cytosol decreased as compared to that in apoptotic cells not treated with nonylphenol diethoxylate s. Furthermore, Bax contents in apoptotic cells were reduced after exposure to nonylphenol diethoxylate. Thus, nonylphenol diethoxylate has the opposite effect on apoptosis in PC12 cells compared to nonylphenol, which enhances apoptosis induced by serum deprivation. The difference in structure of the two compounds is hypothesized to be responsible for this phenomenon. These results indicated that nonylphenol diethoxylate has capability to affect cell differentiation and development and has potentially harmful effect on organisms because of its unexpected impact on apoptosis. © 2015 Wiley Periodicals, Inc. Environ Toxicol 31: 1389-1398, 2016. 
72 | 25847295 34 43 apoptosis T038 UMLS:C0162638 73 | 25847295 55 65 PC12 cells T017 UMLS:C0085262 74 | 25847295 137 144 present T033 UMLS:C0150312 75 | 25847295 206 219 toxic effects T037 UMLS:C0600688 76 | 25847295 259 268 Apoptosis T038 UMLS:C0162638 77 | 25847295 301 306 serum T031 UMLS:C0229671 78 | 25847295 322 328 copper T103 UMLS:C0009968 79 | 25847295 437 446 apoptosis T038 UMLS:C0162638 80 | 25847295 458 463 serum T031 UMLS:C0229671 81 | 25847295 480 486 copper T103 UMLS:C0009968 82 | 25847295 496 512 PC12 cell system T017 UMLS:C0085262 83 | 25847295 579 593 cell viability T038 UMLS:C0007620 84 | 25847295 599 608 apoptosis T038 UMLS:C0162638 85 | 25847295 658 675 DNA fragmentation T038 UMLS:C0376669 86 | 25847295 686 695 apoptosis T038 UMLS:C0162638 87 | 25847295 699 709 PC12 cells T017 UMLS:C0085262 88 | 25847295 766 776 PC12 cells T017 UMLS:C0085262 89 | 25847295 820 832 cytochrome c T103 UMLS:C0010749 90 | 25847295 850 857 cytosol T017 UMLS:C1383501 91 | 25847295 891 906 apoptotic cells T017 UMLS:C0007634 92 | 25847295 965 968 Bax T103 UMLS:C0219474 93 | 25847295 981 996 apoptotic cells T017 UMLS:C0007634 94 | 25847295 1112 1121 apoptosis T038 UMLS:C0162638 95 | 25847295 1125 1135 PC12 cells T017 UMLS:C0085262 96 | 25847295 1176 1185 apoptosis T038 UMLS:C0162638 97 | 25847295 1197 1202 serum T031 UMLS:C0229671 98 | 25847295 1234 1243 structure T082 UMLS:C0678594 99 | 25847295 1255 1264 compounds T103 UMLS:C0220806 100 | 25847295 1326 1333 results T033 UMLS:C2825142 101 | 25847295 1399 1419 cell differentiation T038 UMLS:C0007589 102 | 25847295 1424 1435 development T038 UMLS:C0243107 103 | 25847295 1456 1470 harmful effect T037 UMLS:C0600688 104 | 25847295 1520 1529 apoptosis T038 UMLS:C0162638 105 | 106 | 26316050|t|Prevascularized silicon membranes for the enhancement of transport to implanted medical devices 107 | 26316050|a|Recent advances in drug delivery and sensing devices for in situ applications are limited by the diffusion -limiting foreign body response of fibrous encapsulation. In this study, we fabricated prevascularized synthetic device ports to help mitigate this limitation. Membranes with rectilinear arrays of square pores with widths ranging from 40 to 200 μm were created using materials (50 μm thick double-sided polished silicon) and processes (photolithography and directed reactive ion etching) common in the manufacturing of microfabricated sensors. Vascular endothelial cells responded to membrane geometry by either forming vascular tubes that extended through the pore or completely filling membrane pores after 4 days in culture. Although tube formation began to predominate overgrowth around 75 μm and continued to increase at even larger pore sizes, tubes formed at these large pore sizes were not completely round and had relatively thin walls. Thus, the optimum range of pore size for prevascularization of these membranes was estimated to be 75-100 μm. This study lays the foundation for creating a prevascularized port that can be used to reduce fibrous encapsulation and thus enhance diffusion to implanted medical devices and sensors. © 2015 Wiley Periodicals, Inc. J Biomed Mater Res Part B: Appl Biomater, 104B: 1602-1609, 2016. 
108 | 26316050 16 23 silicon T103 UMLS:C0037114 109 | 26316050 70 95 implanted medical devices T033 UMLS:C2828363 110 | 26316050 115 128 drug delivery T074 UMLS:C0085104 111 | 26316050 153 160 in situ T082 UMLS:C0444498 112 | 26316050 161 173 applications T058 UMLS:C0185125 113 | 26316050 213 234 foreign body response T033 UMLS:C1708386 114 | 26316050 400 406 square T082 UMLS:C0205120 115 | 26316050 506 522 polished silicon T103 UMLS:C0037114 116 | 26316050 647 673 Vascular endothelial cells T017 UMLS:C1257792 117 | 26316050 723 737 vascular tubes T017 UMLS:C0005847 118 | 26316050 743 751 extended T082 UMLS:C0231449 119 | 26316050 876 886 overgrowth T033 UMLS:C1849265 120 | 26316050 1012 1017 round T082 UMLS:C0332490 121 | 26316050 1042 1047 walls T082 UMLS:C0442069 122 | 26316050 1164 1169 study T062 UMLS:C2603343 123 | 26316050 1305 1330 implanted medical devices T033 UMLS:C2828363 124 | -------------------------------------------------------------------------------- /scispacy/abbreviation.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Optional, Set, Dict 2 | from collections import defaultdict 3 | from spacy.tokens import Span, Doc 4 | from spacy.matcher import Matcher 5 | from spacy.language import Language 6 | 7 | 8 | def find_abbreviation( 9 | long_form_candidate: Span, short_form_candidate: Span 10 | ) -> Tuple[Span, Optional[Span]]: 11 | """ 12 | Implements the abbreviation detection algorithm in "A simple algorithm 13 | for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003). 14 | 15 | The algorithm works by enumerating the characters in the short form of the abbreviation, 16 | checking that they can be matched against characters in a candidate text for the long form 17 | in order, as well as requiring that the first letter of the abbreviated form matches the 18 | _beginning_ letter of a word. 19 | 20 | Parameters 21 | ---------- 22 | long_form_candidate: Span, required. 23 | The spaCy span for the long form candidate of the definition. 24 | short_form_candidate: Span, required. 25 | The spaCy span for the abbreviation candidate. 26 | 27 | Returns 28 | ------- 29 | A Tuple[Span, Optional[Span]], representing the short form abbreviation and the 30 | span corresponding to the long form expansion, or None if a match is not found. 31 | """ 32 | long_form = " ".join([x.text for x in long_form_candidate]) 33 | short_form = " ".join([x.text for x in short_form_candidate]) 34 | 35 | long_index = len(long_form) - 1 36 | short_index = len(short_form) - 1 37 | 38 | while short_index >= 0: 39 | current_char = short_form[short_index].lower() 40 | # We don't check non alpha-numeric characters. 41 | if not current_char.isalnum(): 42 | short_index -= 1 43 | continue 44 | 45 | # Does the character match at this position? ... 46 | while ( 47 | (long_index >= 0 and long_form[long_index].lower() != current_char) 48 | or 49 | # .... or if we are checking the first character of the abbreviation, we enforce 50 | # to be the _starting_ character of a span. 51 | ( 52 | short_index == 0 53 | and long_index > 0 54 | and long_form[long_index - 1].isalnum() 55 | ) 56 | ): 57 | long_index -= 1 58 | 59 | if long_index < 0: 60 | return short_form_candidate, None 61 | 62 | long_index -= 1 63 | short_index -= 1 64 | 65 | # The last subtraction will either take us on to a whitespace character, or 66 | # off the front of the string (i.e. long_index == -1). 
Either way, we want to add
67 |     # one to get back to the start character of the long form
68 |     long_index += 1
69 | 
70 |     # Now we know the character index of the start of the character span,
71 |     # here we just translate that to the first token beginning after that
72 |     # value, so we can return a spaCy span instead.
73 |     word_lengths = 0
74 |     starting_index = None
75 |     for i, word in enumerate(long_form_candidate):
76 |         # need to add 1 for the space characters
77 |         word_lengths += len(word.text_with_ws)
78 |         if word_lengths > long_index:
79 |             starting_index = i
80 |             break
81 | 
82 |     return short_form_candidate, long_form_candidate[starting_index:]
83 | 
84 | 
85 | def filter_matches(
86 |     matcher_output: List[Tuple[int, int, int]], doc: Doc
87 | ) -> List[Tuple[Span, Span]]:
88 |     # Filter into two cases:
89 |     # 1. <short form> ( <long form> )
90 |     # 2. <long form> ( <short form> ) [this case is most common].
91 |     candidates = []
92 |     for match in matcher_output:
93 |         start = match[1]
94 |         end = match[2]
95 |         # Ignore spans with more than 8 words in them, and spans at the start of the doc
96 |         if end - start > 8 or start == 1:
97 |             continue
98 |         if end - start > 3:
99 |             # Long form is inside the parens.
100 |             # Take one word before.
101 |             short_form_candidate = doc[start - 2 : start - 1]
102 |             long_form_candidate = doc[start:end]
103 |         else:
104 |             # Normal case.
105 |             # Short form is inside the parens.
106 |             short_form_candidate = doc[start:end]
107 | 
108 |             # Sum character lengths of contents of parens.
109 |             abbreviation_length = sum([len(x) for x in short_form_candidate])
110 |             max_words = min(abbreviation_length + 5, abbreviation_length * 2)
111 |             # Look up to max_words backwards
112 |             long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
113 | 
114 |         # Add the candidate pair if the short form passes the filters.
115 |         if short_form_filter(short_form_candidate):
116 |             candidates.append((long_form_candidate, short_form_candidate))
117 | 
118 |     return candidates
119 | 
120 | 
121 | def short_form_filter(span: Span) -> bool:
122 |     # Each word is between 2 and 9 characters long
123 |     if not all([2 <= len(x) < 10 for x in span]):
124 |         return False
125 | 
126 |     # At least 50% of the short form should be alpha
127 |     if (sum([c.isalpha() for c in span.text]) / len(span.text)) < 0.5:
128 |         return False
129 | 
130 |     # The first character of the short form should be alpha
131 |     if not span.text[0].isalpha():
132 |         return False
133 |     return True
134 | 
135 | 
136 | @Language.factory("abbreviation_detector")
137 | class AbbreviationDetector:
138 |     """
139 |     Detects abbreviations using the algorithm in "A simple algorithm for identifying
140 |     abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).
141 | 
142 |     This class sets the `._.abbreviations` attribute on spaCy Doc.
143 | 
144 |     The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
145 |     attribute set to the long form definition of the abbreviation.
146 | 
147 |     Note that this class does not replace the spans, or merge them.
148 | 
149 |     Parameters
150 |     ----------
151 | 
152 |     nlp: `Language`, a required argument for spacy to use this as a factory
153 |     name: `str`, a required argument for spacy to use this as a factory
154 |     make_serializable: `bool`, a required argument for whether we want to use the serializable
155 |         or non-serializable version.
156 | """ 157 | 158 | def __init__( 159 | self, 160 | nlp: Language, 161 | name: str = "abbreviation_detector", 162 | make_serializable: bool = False, 163 | ) -> None: 164 | Doc.set_extension("abbreviations", default=[], force=True) 165 | Span.set_extension("long_form", default=None, force=True) 166 | 167 | self.matcher = Matcher(nlp.vocab) 168 | self.matcher.add("parenthesis", [[{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]]) 169 | self.make_serializable = make_serializable 170 | self.global_matcher = Matcher(nlp.vocab) 171 | 172 | def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]: 173 | """ 174 | Functional version of calling the matcher for a single span. 175 | This method is helpful if you already have an abbreviation which 176 | you want to find a definition for. 177 | """ 178 | dummy_matches = [(-1, int(span.start), int(span.end))] 179 | filtered = filter_matches(dummy_matches, doc) 180 | abbreviations = self.find_matches_for(filtered, doc) 181 | 182 | if not abbreviations: 183 | return span, set() 184 | else: 185 | return abbreviations[0] 186 | 187 | def __call__(self, doc: Doc) -> Doc: 188 | matches = self.matcher(doc) 189 | matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches] 190 | filtered = filter_matches(matches_no_brackets, doc) 191 | occurences = self.find_matches_for(filtered, doc) 192 | 193 | for (long_form, short_forms) in occurences: 194 | for short in short_forms: 195 | short._.long_form = long_form 196 | doc._.abbreviations.append(short) 197 | if self.make_serializable: 198 | abbreviations = doc._.abbreviations 199 | doc._.abbreviations = [ 200 | self.make_short_form_serializable(abbreviation) 201 | for abbreviation in abbreviations 202 | ] 203 | return doc 204 | 205 | def find_matches_for( 206 | self, filtered: List[Tuple[Span, Span]], doc: Doc 207 | ) -> List[Tuple[Span, Set[Span]]]: 208 | rules = {} 209 | all_occurences: Dict[Span, Set[Span]] = defaultdict(set) 210 | already_seen_long: Set[str] = set() 211 | already_seen_short: Set[str] = set() 212 | for (long_candidate, short_candidate) in filtered: 213 | short, long = find_abbreviation(long_candidate, short_candidate) 214 | # We need the long and short form definitions to be unique, because we need 215 | # to store them so we can look them up later. This is a bit of a 216 | # pathalogical case also, as it would mean an abbreviation had been 217 | # defined twice in a document. There's not much we can do about this, 218 | # but at least the case which is discarded will be picked up below by 219 | # the global matcher. So it's likely that things will work out ok most of the time. 220 | new_long = long.text not in already_seen_long if long else False 221 | new_short = short.text not in already_seen_short 222 | if long is not None and new_long and new_short: 223 | already_seen_long.add(long.text) 224 | already_seen_short.add(short.text) 225 | all_occurences[long].add(short) 226 | rules[long.text] = long 227 | # Add a rule to a matcher to find exactly this substring. 228 | self.global_matcher.add(long.text, [[{"ORTH": x.text} for x in short]]) 229 | to_remove = set() 230 | global_matches = self.global_matcher(doc) 231 | for match, start, end in global_matches: 232 | string_key = self.global_matcher.vocab.strings[match] 233 | to_remove.add(string_key) 234 | all_occurences[rules[string_key]].add(doc[start:end]) 235 | for key in to_remove: 236 | # Clean up the global matcher. 
237 | self.global_matcher.remove(key) 238 | 239 | return list((k, v) for k, v in all_occurences.items()) 240 | 241 | def make_short_form_serializable(self, abbreviation: Span): 242 | """ 243 | Converts the abbreviations into a short form that is serializable to enable multiprocessing 244 | 245 | Parameters 246 | ---------- 247 | abbreviation: Span 248 | The abbreviation span identified by the detector 249 | """ 250 | long_form = abbreviation._.long_form 251 | abbreviation._.long_form = long_form.text 252 | serializable_abbr = { 253 | "short_text": abbreviation.text, 254 | "short_start": abbreviation.start, 255 | "short_end": abbreviation.end, 256 | "long_text": long_form.text, 257 | "long_start": long_form.start, 258 | "long_end": long_form.end, 259 | } 260 | return serializable_abbr 261 | -------------------------------------------------------------------------------- /tests/fixtures/umls_test_fixture.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "concept_id": "C0000015", 4 | "aliases": [], 5 | "types": [ 6 | "T116", 7 | "T121", 8 | "T130" 9 | ], 10 | "canonical_name": "(132)I-Macro Albin" 11 | }, 12 | { 13 | "concept_id": "C0000005", 14 | "aliases": [ 15 | "(131)I-MAA" 16 | ], 17 | "types": [ 18 | "T116", 19 | "T121", 20 | "T130" 21 | ], 22 | "canonical_name": "(131)I-Macroaggregated Albumin" 23 | }, 24 | { 25 | "concept_id": "C0000039", 26 | "aliases": [ 27 | "1,2-Dipalmitoylphosphatidylcholine", 28 | "1,2-Dipalmitoylphosphatidylcholine", 29 | "1,2 Dipalmitoylphosphatidylcholine", 30 | "1,2-Dihexadecyl-sn-Glycerophosphocholine", 31 | "1,2-Dihexadecyl-sn-Glycerophosphocholine", 32 | "1,2 Dihexadecyl sn Glycerophosphocholine", 33 | "1,2-Dipalmitoyl-Glycerophosphocholine", 34 | "1,2-Dipalmitoyl-Glycerophosphocholine", 35 | "1,2 Dipalmitoyl Glycerophosphocholine", 36 | "Dipalmitoylphosphatidylcholine", 37 | "Dipalmitoylphosphatidylcholine", 38 | "Dipalmitoylphosphatidylcholine", 39 | "Dipalmitoylphosphatidylcholine", 40 | "Dipalmitoylphosphatidylcholine", 41 | "Dipalmitoylglycerophosphocholine", 42 | "Dipalmitoylglycerophosphocholine", 43 | "Dipalmitoyllecithin", 44 | "Dipalmitoyllecithin", 45 | "3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide", 46 | "3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide", 47 | "Dipalmitoyl Phosphatidylcholine", 48 | "Dipalmitoyl Phosphatidylcholine", 49 | "Phosphatidylcholine, Dipalmitoyl", 50 | "1,2-Dipalmitoylphosphatidylcholine [Chemical/Ingredient]" 51 | ], 52 | "types": [ 53 | "T109", 54 | "T121" 55 | ], 56 | "canonical_name": "1,2-Dipalmitoylphosphatidylcholine" 57 | }, 58 | { 59 | "concept_id": "C0000052", 60 | "aliases": [ 61 | "1,4-alpha-Glucan Branching Enzyme", 62 | "1,4-alpha-Glucan Branching Enzyme", 63 | "1,4-alpha-Glucan branching enzyme", 64 | "1,4-alpha-Glucan branching enzyme", 65 | "1,4-Alpha glucan branching enzyme", 66 | "1,4-Alpha glucan branching enzyme", 67 | "1,4 alpha Glucan Branching Enzyme", 68 | "Branching Enzyme, 1,4-alpha-Glucan", 69 | "Enzyme, 1,4-alpha-Glucan Branching", 70 | "Branching Enzyme", 71 | "Branching Enzyme", 72 | "Branching enzyme", 73 | "Branching enzyme", 74 | "Enzyme, Branching", 75 | "Branching Glycosyltransferase", 76 | "Branching Glycosyltransferase", 77 | "Glycosyltransferase, Branching", 78 | "Starch Branching Enzyme", 79 | "Starch Branching Enzyme", 80 | "Branching Enzyme, Starch", 81 | "Enzyme, Starch 
Branching", 82 | "1,4-alpha-D-Glucan:1,4-alpha-D-glucan 6-alpha-D-(1,4-alpha-D-glucano)-transferase", 83 | "1,4-alpha-D-Glucan:1,4-alpha-D-glucan 6-alpha-D-(1,4-alpha-D-glucano)-transferase", 84 | "Amylo-(1,4,6)-transglycosylase", 85 | "Amylo-(1,4,6)-transglycosylase", 86 | "alpha-Glucan-branching glycosyltransferase", 87 | "Amylo (1-4 to 1-6)-transglucosidase", 88 | "1,4-alpha-Glucan branching enzyme (substance)", 89 | "1,4-alpha-Glucan Branching Enzyme [Chemical/Ingredient]" 90 | ], 91 | "types": [ 92 | "T116", 93 | "T126" 94 | ], 95 | "canonical_name": "1,4-alpha-Glucan Branching Enzyme", 96 | "definition": "In glycogen or amylopectin synthesis, the enzyme that catalyzes the transfer of a segment of a 1,4-alpha-glucan chain to a primary hydroxy group in a similar glucan chain. EC 2.4.1.18." 97 | }, 98 | { 99 | "concept_id": "C0000074", 100 | "aliases": [ 101 | "1 Alkyl 2 Acylphosphatidates" 102 | ], 103 | "types": [ 104 | "T109" 105 | ], 106 | "canonical_name": "1-Alkyl-2-Acylphosphatidates" 107 | }, 108 | { 109 | "concept_id": "C0000084", 110 | "aliases": [ 111 | "1-Carboxyglutamic Acid", 112 | "1 Carboxyglutamic Acid", 113 | "gamma-Carboxyglutamic Acid", 114 | "gamma-Carboxyglutamic Acid", 115 | "gamma Carboxyglutamic Acid", 116 | "3-Amino-1,1,3-propanetricarboxylic Acid", 117 | "3-Amino-1,1,3-propanetricarboxylic Acid", 118 | "1,1,3-Propanetricarboxylic acid, 3-amino-", 119 | "1,1,3-Propanetricarboxylic acid, 3-amino-", 120 | "1-Carboxyglutamic Acid [Chemical/Ingredient]" 121 | ], 122 | "types": [ 123 | "T116", 124 | "T123" 125 | ], 126 | "canonical_name": "1-Carboxyglutamic Acid", 127 | "definition": "Found in various tissues, particularly in four blood-clotting proteins including prothrombin, in kidney protein, in bone protein, and in the protein present in various ectopic calcifications." 
128 | }, 129 | { 130 | "concept_id": "C0000096", 131 | "aliases": [ 132 | "1-Methyl-3-isobutylxanthine", 133 | "1 Methyl 3 isobutylxanthine", 134 | "3-Isobutyl-1-methylxanthine", 135 | "3-Isobutyl-1-methylxanthine", 136 | "3 Isobutyl 1 methylxanthine", 137 | "IBMX", 138 | "IBMX", 139 | "Isobutyltheophylline", 140 | "Isobutyltheophylline", 141 | "1H-Purine-2,6-dione, 3,7-dihydro-1-methyl-3-(2-methylpropyl)-", 142 | "1H-Purine-2,6-dione, 3,7-dihydro-1-methyl-3-(2-methylpropyl)-", 143 | "1-Methyl-3-isobutylxanthine [Chemical/Ingredient]" 144 | ], 145 | "types": [ 146 | "T109", 147 | "T121" 148 | ], 149 | "canonical_name": "1-Methyl-3-isobutylxanthine", 150 | "definition": "A potent cyclic nucleotide phosphodiesterase inhibitor; due to this action, the compound increases cyclic AMP and cyclic GMP in tissue and thereby activates CYCLIC NUCLEOTIDE-REGULATED PROTEIN KINASES" 151 | }, 152 | { 153 | "concept_id": "C0000097", 154 | "aliases": [ 155 | "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 156 | "MPTP", 157 | "MPTP", 158 | "MPTP", 159 | "mptp", 160 | "N-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 161 | "N-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 162 | "Pyridine, 1,2,3,6-tetrahydro-1-methyl-4-phenyl-", 163 | "Pyridine, 1,2,3,6-tetrahydro-1-methyl-4-phenyl-", 164 | "Methylphenyltetrahydropyridine", 165 | "Methylphenyltetrahydropyridine", 166 | "methylphenyltetrahydropyridine", 167 | "Methylphenyltetrahydropyridine (substance)", 168 | "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine [Chemical/Ingredient]", 169 | "1-Methyl-4-Phenyl-1,2,3,6-Tetrahydropyridine (MPTP)" 170 | ], 171 | "types": [ 172 | "T109", 173 | "T131" 174 | ], 175 | "canonical_name": "1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine", 176 | "definition": "A dopaminergic neurotoxic compound which produces irreversible clinical, chemical, and pathological alterations that mimic those found in Parkinson disease." 177 | }, 178 | { 179 | "concept_id": "C0000098", 180 | "aliases": [ 181 | "1-Methyl-4-phenylpyridinium", 182 | "1 Methyl 4 phenylpyridinium", 183 | "1-Methyl-4-phenylpyridinium Ion", 184 | "1-Methyl-4-phenylpyridinium Ion", 185 | "1 Methyl 4 phenylpyridinium Ion", 186 | "Cyperquat", 187 | "Cyperquat", 188 | "CNN", 189 | "N-Methyl-4-phenylpyridine", 190 | "N-Methyl-4-phenylpyridine", 191 | "N Methyl 4 phenylpyridine", 192 | "1-Methyl-4-phenylpyridine", 193 | "1-Methyl-4-phenylpyridine", 194 | "1 Methyl 4 phenylpyridine", 195 | "N-Methyl-4-phenylpyridinium", 196 | "N-Methyl-4-phenylpyridinium", 197 | "Pyridinium, 1-methyl-4-phenyl-", 198 | "Pyridinium, 1-methyl-4-phenyl-", 199 | "1-Methyl-4-phenylpyridinium [Chemical/Ingredient]" 200 | ], 201 | "types": [ 202 | "T109", 203 | "T131" 204 | ], 205 | "canonical_name": "1-Methyl-4-phenylpyridinium", 206 | "definition": "An active neurotoxic metabolite of 1-METHYL-4-PHENYL-1,2,3,6-TETRAHYDROPYRIDINE. The compound reduces dopamine levels, inhibits the biosynthesis of catecholamines, depletes cardiac norepinephrine and inactivates tyrosine hydroxylase. These and other toxic effects lead to cessation of oxidative phosphorylation, ATP depletion, and cell death. The compound, which is related to PARAQUAT, has also been used as an herbicide." 
207 | }, 208 | { 209 | "concept_id": "C0000102", 210 | "aliases": [ 211 | "1-Naphthylamine", 212 | "1-naphthylamine", 213 | "1 Naphthylamine", 214 | "alpha-Naphthylamine", 215 | "alpha-Naphthylamine", 216 | "alpha-naphthylamine", 217 | "alpha Naphthylamine", 218 | "Naphthalidine", 219 | "Naphthalidine", 220 | "8-Aminonaphthalene", 221 | "8-Aminonaphthalene", 222 | "8 Aminonaphthalene", 223 | "1-Aminonaphthalene", 224 | "1-Aminonaphthalene", 225 | "1 Aminonaphthalene", 226 | "1-Naphthalenamine", 227 | "1-Naphthalenamine", 228 | "1-Naththylamine", 229 | "1-Naththylamine", 230 | "a- Naphthylamine", 231 | "a- Naphthylamine", 232 | "a-Naphthylamine", 233 | "1-Naththylamine (substance)", 234 | "1-Naphthylamine [Chemical/Ingredient]" 235 | ], 236 | "types": [ 237 | "T109", 238 | "T131" 239 | ], 240 | "canonical_name": "1-Naphthylamine", 241 | "definition": "A suspected industrial carcinogen (and listed as such by OSHA). Its N-hydroxy metabolite is strongly carcinogenic and mutagenic." 242 | }, 243 | { 244 | "concept_id": "C0000103", 245 | "aliases": [ 246 | "1-Naphthylisothiocyanate", 247 | "1 Naphthylisothiocyanate", 248 | "alpha-Naphthylisothiocyanate", 249 | "alpha-Naphthylisothiocyanate", 250 | "alpha Naphthylisothiocyanate", 251 | "Naphthalene, 1-isothiocyanato-", 252 | "Naphthalene, 1-isothiocyanato-", 253 | "1-Naphthylisothiocyanate [Chemical/Ingredient]" 254 | ], 255 | "types": [ 256 | "T109", 257 | "T130", 258 | "T131" 259 | ], 260 | "canonical_name": "1-Naphthylisothiocyanate", 261 | "definition": "A tool for the study of liver damage which causes bile stasis and hyperbilirubinemia acutely and bile duct hyperplasia and biliary cirrhosis chronically, with changes in hepatocyte function. It may cause skin and kidney damage." 262 | } 263 | ] -------------------------------------------------------------------------------- /scispacy/data_util.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, List, Iterator, Dict, Tuple 2 | import tarfile 3 | import atexit 4 | import os 5 | import shutil 6 | import tempfile 7 | 8 | from scispacy.file_cache import cached_path 9 | 10 | 11 | class MedMentionEntity(NamedTuple): 12 | start: int 13 | end: int 14 | mention_text: str 15 | mention_type: str 16 | umls_id: str 17 | 18 | 19 | class MedMentionExample(NamedTuple): 20 | title: str 21 | abstract: str 22 | text: str 23 | pubmed_id: str 24 | entities: List[MedMentionEntity] 25 | 26 | 27 | def process_example(lines: List[str]) -> MedMentionExample: 28 | """ 29 | Processes the text lines of a file corresponding to a single MedMention abstract, 30 | extracts the title, abstract, pubmed id and entities. The lines of the file should 31 | have the following format: 32 | PMID | t | Title text 33 | PMID | a | Abstract text 34 | PMID TAB StartIndex TAB EndIndex TAB MentionTextSegment TAB SemanticTypeID TAB EntityID 35 | ... 
36 | """ 37 | pubmed_id, _, title = [x.strip() for x in lines[0].split("|", maxsplit=2)] 38 | _, _, abstract = [x.strip() for x in lines[1].split("|", maxsplit=2)] 39 | 40 | entities = [] 41 | for entity_line in lines[2:]: 42 | _, start, end, mention, mention_type, umls_id = entity_line.split("\t") 43 | mention_type = mention_type.split(",")[0] 44 | entities.append( 45 | MedMentionEntity(int(start), int(end), mention, mention_type, umls_id) 46 | ) 47 | return MedMentionExample( 48 | title, abstract, title + " " + abstract, pubmed_id, entities 49 | ) 50 | 51 | 52 | def med_mentions_example_iterator(filename: str) -> Iterator[MedMentionExample]: 53 | """ 54 | Iterates over a Med Mentions file, yielding examples. 55 | """ 56 | with open(filename, "r", encoding="utf-8") as med_mentions_file: 57 | lines = [] 58 | for line in med_mentions_file: 59 | line = line.strip() 60 | if line: 61 | lines.append(line) 62 | else: 63 | yield process_example(lines) 64 | lines = [] 65 | # Pick up stragglers 66 | if lines: 67 | yield process_example(lines) 68 | 69 | 70 | def select_subset_of_overlapping_chain( 71 | chain: List[Tuple[int, int, str]] 72 | ) -> List[Tuple[int, int, str]]: 73 | """ 74 | Select the subset of entities in an overlapping chain to return by greedily choosing the 75 | longest entity in the chain until there are no entities remaining 76 | """ 77 | sorted_chain = sorted(chain, key=lambda x: x[1] - x[0], reverse=True) 78 | selections_from_chain: List[Tuple[int, int, str]] = [] 79 | chain_index = 0 80 | # dump the current chain by greedily keeping the longest entity that doesn't overlap 81 | while chain_index < len(sorted_chain): 82 | entity = sorted_chain[chain_index] 83 | match_found = False 84 | for already_selected_entity in selections_from_chain: 85 | max_start = max(entity[0], already_selected_entity[0]) 86 | min_end = min(entity[1], already_selected_entity[1]) 87 | if len(range(max_start, min_end)) > 0: 88 | match_found = True 89 | break 90 | 91 | if not match_found: 92 | selections_from_chain.append(entity) 93 | 94 | chain_index += 1 95 | 96 | return selections_from_chain 97 | 98 | 99 | def remove_overlapping_entities( 100 | sorted_spacy_format_entities: List[Tuple[int, int, str]] 101 | ) -> List[Tuple[int, int, str]]: 102 | """ 103 | Removes overlapping entities from the entity set, by greedilytaking the longest 104 | entity from each overlapping chain. The input list of entities should be sorted 105 | and follow the spacy format. 
106 | """ 107 | spacy_format_entities_without_overlap = [] 108 | current_overlapping_chain: List[Tuple[int, int, str]] = [] 109 | current_overlapping_chain_start = 0 110 | current_overlapping_chain_end = 0 111 | for i, current_entity in enumerate(sorted_spacy_format_entities): 112 | current_entity = sorted_spacy_format_entities[i] 113 | current_entity_start = current_entity[0] 114 | current_entity_end = current_entity[1] 115 | 116 | if len(current_overlapping_chain) == 0: 117 | current_overlapping_chain.append(current_entity) 118 | current_overlapping_chain_start = current_entity_start 119 | current_overlapping_chain_end = current_entity_end 120 | else: 121 | min_end = min(current_entity_end, current_overlapping_chain_end) 122 | max_start = max(current_entity_start, current_overlapping_chain_start) 123 | if min_end - max_start > 0: 124 | current_overlapping_chain.append(current_entity) 125 | current_overlapping_chain_start = min( 126 | current_entity_start, current_overlapping_chain_start 127 | ) 128 | current_overlapping_chain_end = max( 129 | current_entity_end, current_overlapping_chain_end 130 | ) 131 | else: 132 | selections_from_chain = select_subset_of_overlapping_chain( 133 | current_overlapping_chain 134 | ) 135 | 136 | current_overlapping_chain = [] 137 | spacy_format_entities_without_overlap.extend(selections_from_chain) 138 | current_overlapping_chain.append(current_entity) 139 | current_overlapping_chain_start = current_entity_start 140 | current_overlapping_chain_end = current_entity_end 141 | 142 | spacy_format_entities_without_overlap.extend( 143 | select_subset_of_overlapping_chain(current_overlapping_chain) 144 | ) 145 | 146 | return sorted(spacy_format_entities_without_overlap, key=lambda x: x[0]) 147 | 148 | 149 | def read_full_med_mentions( 150 | directory_path: str, 151 | label_mapping: Dict[str, str] = None, 152 | span_only: bool = False, 153 | spacy_format: bool = True, 154 | ): 155 | def _cleanup_dir(dir_path: str): 156 | if os.path.exists(dir_path): 157 | shutil.rmtree(dir_path) 158 | 159 | resolved_directory_path = cached_path(directory_path) 160 | if "tar.gz" in directory_path: 161 | # Extract dataset to temp dir 162 | tempdir = tempfile.mkdtemp() 163 | print( 164 | f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}" 165 | ) 166 | with tarfile.open(resolved_directory_path, "r:gz") as archive: 167 | archive.extractall(tempdir) 168 | # Postpone cleanup until exit in case the unarchived 169 | # contents are needed outside this function. 
170 | atexit.register(_cleanup_dir, tempdir) 171 | 172 | resolved_directory_path = tempdir 173 | 174 | expected_names = [ 175 | "corpus_pubtator.txt", 176 | "corpus_pubtator_pmids_all.txt", 177 | "corpus_pubtator_pmids_dev.txt", 178 | "corpus_pubtator_pmids_test.txt", 179 | "corpus_pubtator_pmids_trng.txt", 180 | ] 181 | 182 | corpus = os.path.join(resolved_directory_path, expected_names[0]) 183 | examples = med_mentions_example_iterator(corpus) 184 | 185 | train_ids = { 186 | x.strip() 187 | for x in open(os.path.join(resolved_directory_path, expected_names[4])) 188 | } 189 | dev_ids = { 190 | x.strip() 191 | for x in open(os.path.join(resolved_directory_path, expected_names[2])) 192 | } 193 | test_ids = { 194 | x.strip() 195 | for x in open(os.path.join(resolved_directory_path, expected_names[3])) 196 | } 197 | 198 | train_examples = [] 199 | dev_examples = [] 200 | test_examples = [] 201 | 202 | def label_function(label): 203 | if span_only: 204 | return "ENTITY" 205 | if label_mapping is None: 206 | return label 207 | else: 208 | return label_mapping[label] 209 | 210 | for example in examples: 211 | spacy_format_entities = [ 212 | (x.start, x.end, label_function(x.mention_type)) for x in example.entities 213 | ] 214 | spacy_format_entities = remove_overlapping_entities( 215 | sorted(spacy_format_entities, key=lambda x: x[0]) 216 | ) 217 | spacy_example = (example.text, {"entities": spacy_format_entities}) 218 | if example.pubmed_id in train_ids: 219 | train_examples.append(spacy_example if spacy_format else example) 220 | 221 | elif example.pubmed_id in dev_ids: 222 | dev_examples.append(spacy_example if spacy_format else example) 223 | 224 | elif example.pubmed_id in test_ids: 225 | test_examples.append(spacy_example if spacy_format else example) 226 | 227 | return train_examples, dev_examples, test_examples 228 | 229 | 230 | SpacyNerExample = Tuple[str, Dict[str, List[Tuple[int, int, str]]]] 231 | 232 | 233 | def _handle_sentence(examples: List[Tuple[str, str]]) -> SpacyNerExample: 234 | """ 235 | Processes a single sentence by building it up as a space separated string 236 | with its corresponding typed entity spans. 237 | """ 238 | start_index = -1 239 | current_index = 0 240 | in_entity = False 241 | entity_type: str = "" 242 | sent = "" 243 | entities: List[Tuple[int, int, str]] = [] 244 | for word, entity in examples: 245 | sent += word 246 | sent += " " 247 | if entity != "O": 248 | if in_entity: 249 | pass 250 | else: 251 | start_index = current_index 252 | in_entity = True 253 | entity_type = entity[2:].upper() 254 | else: 255 | if in_entity: 256 | end_index = current_index - 1 257 | entities.append((start_index, end_index, entity_type.replace("-", "_"))) 258 | in_entity = False 259 | entity_type = "" 260 | start_index = -1 261 | current_index += len(word) + 1 262 | if in_entity: 263 | end_index = current_index - 1 264 | entities.append((start_index, end_index, entity_type)) 265 | 266 | # Remove last space. 267 | sent = sent[:-1] 268 | return (sent, {"entities": entities}) 269 | 270 | 271 | def read_ner_from_tsv(filename: str) -> List[SpacyNerExample]: 272 | """ 273 | Reads BIO formatted NER data from a TSV file, such as the 274 | NER data found here: 275 | https://github.com/cambridgeltl/MTL-Bioinformatics-2016 276 | 277 | Data is expected to be 2 tab seperated tokens per line, with 278 | sentences denoted by empty lines. Sentences read by this 279 | function will be already tokenized, but returned as a string, 280 | as this is the format required by SpaCy. 
Consider using the 281 | WhitespaceTokenizer(scispacy/util.py) to split this data 282 | with a SpaCy model. 283 | 284 | Parameters 285 | ---------- 286 | filename : str 287 | The path to the tsv data. 288 | 289 | Returns 290 | ------- 291 | spacy_format_data : List[SpacyNerExample] 292 | The BIO tagged NER examples. 293 | """ 294 | spacy_format_data = [] 295 | examples: List[Tuple[str, str]] = [] 296 | for line in open(cached_path(filename)): 297 | line = line.strip() 298 | if line.startswith("-DOCSTART-"): 299 | continue 300 | # We have reached the end of a sentence. 301 | if not line: 302 | if not examples: 303 | continue 304 | spacy_format_data.append(_handle_sentence(examples)) 305 | examples = [] 306 | else: 307 | word, entity = line.split("\t") 308 | examples.append((word, entity)) 309 | if examples: 310 | spacy_format_data.append(_handle_sentence(examples)) 311 | 312 | return spacy_format_data 313 | --------------------------------------------------------------------------------