├── .gitignore
├── .pylintrc
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── docs
│   ├── Makefile
│   ├── README
│   ├── acknowledgments.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── developer_guide.rst
│   ├── disclaimer.rst
│   ├── getting_started.rst
│   ├── index.rst
│   ├── license.rst
│   ├── make.bat
│   ├── reference.rst
│   └── user_guide.rst
├── environment2.7.yml
├── environment3.7.yml
├── examples
│   ├── 00000086.txt
│   ├── 00019248.txt
│   ├── 1.xml
│   ├── 2.xml
│   ├── cuis-cvpr2017.txt
│   ├── openi-testset.txt
│   └── openi_gld_std14.csv
├── images
│   └── negbio.png
├── negbio
│   ├── __init__.py
│   ├── chexpert
│   │   ├── LICENSE
│   │   ├── __init__.py
│   │   ├── constants.py
│   │   ├── patterns
│   │   │   ├── negation.txt
│   │   │   ├── post_negation_uncertainty.txt
│   │   │   └── pre_negation_uncertainty.txt
│   │   ├── phrases
│   │   │   ├── mention
│   │   │   │   ├── airspace_opacity.txt
│   │   │   │   ├── atelectasis.txt
│   │   │   │   ├── cardiomegaly.txt
│   │   │   │   ├── consolidation.txt
│   │   │   │   ├── edema.txt
│   │   │   │   ├── enlarged_cardiomediastinum.txt
│   │   │   │   ├── fracture.txt
│   │   │   │   ├── lung_lesion.txt
│   │   │   │   ├── no_finding.txt
│   │   │   │   ├── pleural_effusion.txt
│   │   │   │   ├── pleural_other.txt
│   │   │   │   ├── pneumonia.txt
│   │   │   │   ├── pneumothorax.txt
│   │   │   │   └── support_devices.txt
│   │   │   └── unmention
│   │   │       ├── airspace_opacity.txt
│   │   │       ├── lung_lesion.txt
│   │   │       └── pleural_effusion.txt
│   │   └── stages
│   │       ├── __init__.py
│   │       ├── aggregate.py
│   │       ├── classify.py
│   │       ├── extract.py
│   │       └── load.py
│   ├── cli_utils.py
│   ├── compat.py
│   ├── ext
│   │   ├── __init__.py
│   │   └── normalize_mimiccxr.py
│   ├── main_chexpert.py
│   ├── main_mm.py
│   ├── neg
│   │   ├── __init__.py
│   │   ├── neg_detector.py
│   │   ├── propagator.py
│   │   ├── semgraph.py
│   │   └── utils.py
│   ├── negbio_clean.py
│   ├── negbio_dner_chexpert.py
│   ├── negbio_dner_matamap.py
│   ├── negbio_neg.py
│   ├── negbio_neg_chexpert.py
│   ├── negbio_normalize.py
│   ├── negbio_parse.py
│   ├── negbio_pipeline.py
│   ├── negbio_ptb2ud.py
│   ├── negbio_section_split.py
│   ├── negbio_ssplit.py
│   ├── negbio_text2bioc.py
│   ├── ngrex
│   │   ├── __init__.py
│   │   ├── parser.out
│   │   ├── parser.py
│   │   ├── parsetab.py
│   │   └── pattern.py
│   ├── patterns
│   │   ├── neg_patterns.txt
│   │   ├── section_titles.txt
│   │   └── uncertainty_patterns.txt
│   └── pipeline
│       ├── __init__.py
│       ├── cleanup.py
│       ├── dner_mm.py
│       ├── negdetect.py
│       ├── parse.py
│       ├── pipeline.py
│       ├── ptb2ud.py
│       ├── scan.py
│       ├── section_split.py
│       ├── ssplit.py
│       └── text2bioc.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── context.py
    └── negbio
        ├── __init__.py
        ├── ngrex
        │   ├── __init__.py
        │   ├── test_parser.py
        │   └── test_pattern.py
        ├── pipeline
        │   ├── __init__.py
        │   └── test_parse.py
        └── test_cli.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 |
3 | .pytest_cache/
4 | backup
5 | examples-local
6 | .DS_Store
7 |
8 | ### Python template
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | env/
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *,cover
55 | .hypothesis/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # dotenv
91 | .env
92 |
93 | # virtualenv
94 | .venv
95 | venv/
96 | ENV/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 | ### JetBrains template
104 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
105 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
106 |
107 | # User-specific stuff:
108 | .idea
109 | .idea/**/workspace.xml
110 | .idea/**/tasks.xml
111 | .idea/dictionaries
112 |
113 | # Sensitive or high-churn files:
114 | .idea/**/dataSources/
115 | .idea/**/dataSources.ids
116 | .idea/**/dataSources.xml
117 | .idea/**/dataSources.local.xml
118 | .idea/**/sqlDataSources.xml
119 | .idea/**/dynamic.xml
120 | .idea/**/uiDesigner.xml
121 |
122 | # Gradle:
123 | .idea/**/gradle.xml
124 | .idea/**/libraries
125 |
126 | # Mongo Explorer plugin:
127 | .idea/**/mongoSettings.xml
128 |
129 | ## File-based project format:
130 | *.iws
131 |
132 | ## Plugin-specific files:
133 |
134 | # IntelliJ
135 | /out/
136 |
137 | # mpeltonen/sbt-idea plugin
138 | .idea_modules/
139 |
140 | # JIRA plugin
141 | atlassian-ide-plugin.xml
142 |
143 | # Crashlytics plugin (for Android Studio and IntelliJ)
144 | com_crashlytics_export_strings.xml
145 | crashlytics.properties
146 | crashlytics-build.properties
147 | fabric.properties
148 |
149 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | install:
5 | - pip install -r requirements.txt
6 | # - sudo apt-get update
7 | # # We do this conditionally because it saves us some downloading if the
8 | # # version is the same.
9 | # - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
10 | # wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
11 | # else
12 | # wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
13 | # fi
14 | # - bash miniconda.sh -b -p $HOME/miniconda
15 | # - export PATH="$HOME/miniconda/bin:$PATH"
16 | # - hash -r
17 | # - conda config --set always_yes yes --set changeps1 no
18 | # - conda update -q conda
19 | # # Useful for debugging any issues with conda
20 | # - conda info -a
21 | #
22 | # # Replace dep1 dep2 ... with your dependencies
23 | # - conda env create --file environment2.7.yml
24 | # - source activate negbio2.7
25 |
26 | script:
27 | - py.test
28 |
29 | notifications:
30 | email: false
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue,
4 | email, or any other method with the owners of this repository before making a change.
5 | This project adheres to the [Contributor Covenant Code of Conduct](http://contributor-covenant.org/).
6 |
7 | # Maintainers
8 |
9 | NegBio is maintained with :heart: by:
10 |
11 | - **@yfpeng**
12 |
13 | See also the list of [contributors](https://github.com/ncbi-nlp/NegBio/contributors) who participated in this project.
14 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | PUBLIC DOMAIN NOTICE
2 | National Center for Biotechnology Information
3 |
4 | This software/database is a "United States Government Work" under the terms of
5 | the United States Copyright Act. It was written as part of the author's
6 | official duties as a United States Government employee and thus cannot be
7 | copyrighted. This software/database is freely available to the public for use.
8 | The National Library of Medicine and the U.S. Government have not placed any
9 | restriction on its use or reproduction.
10 |
11 | Although all reasonable efforts have been taken to ensure the accuracy and
12 | reliability of the software and data, the NLM and the U.S. Government do not and
13 | cannot warrant the performance or results that may be obtained by using this
14 | software or data. The NLM and the U.S. Government disclaim all warranties,
15 | express or implied, including warranties of performance, merchantability or
16 | fitness for any particular purpose.
17 |
18 | Please cite the author in any work or product based on this material:
19 |
20 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z.
21 | NegBio: a high-performance tool for negation and uncertainty detection in radiology reports.
22 | AMIA 2018 Informatics Summit. 2018.
23 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE.txt
3 | include CONTRIBUTING.md
4 | include requirements.txt
5 | include examples/*
6 | recursive-include negbio/patterns *
7 | recursive-include negbio/chexpert/patterns *
8 | recursive-include negbio/chexpert/phrases *
9 |
10 | prune tests
11 | prune backup
12 | prune docs
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true
2 | :target: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true
3 | :alt: NegBio
4 |
5 | -----------------------
6 |
7 | .. image:: https://img.shields.io/travis/yfpeng/NegBio/master.svg
8 | :target: https://travis-ci.org/yfpeng/NegBio
9 | :alt: Build status
10 |
11 | .. image:: https://img.shields.io/pypi/v/negbio.svg
12 | :target: https://pypi.python.org/pypi/negbio
13 | :alt: PyPI version
14 |
15 | .. image:: https://img.shields.io/readthedocs/negbio.svg
16 | :target: http://negbio.readthedocs.io
17 | :alt: RTD version
18 |
19 |
20 | NegBio is a high-performance NLP tool for negation and uncertainty detection in clinical texts (e.g. radiology reports).
21 |
22 |
23 | Get started
24 | ===========
25 |
26 | Install NegBio
27 | ~~~~~~~~~~~~~~
28 |
29 | 1. Installing from source (recommended)
30 |
31 | .. code-block:: bash
32 |
33 | $ git clone https://github.com/ncbi-nlp/NegBio.git
34 | $ cd NegBio
35 | $ python setup.py install --user
36 | $ export PATH=~/.local/bin:$PATH
37 |
38 | 2. Installing from pip
39 |
40 | .. code-block:: bash
41 |
42 | $ pip install negbio
43 |
44 |
45 |
46 |
47 | Prepare the dataset
48 | ~~~~~~~~~~~~~~~~~~~
49 |
50 | The inputs can be in either plain text or `BioC <http://bioc.sourceforge.net/>`_ format.
51 | If the reports are in plain text, each report needs to be in a separate file.
52 | Some examples can be found in the ``examples`` folder.
53 |
54 | Run the script
55 | ~~~~~~~~~~~~~~
56 |
57 | There are two ways to run the pipeline.
58 |
59 | **NOTE**: If you want to process a lot of reports (e.g., > 1000), it is recommended to run the pipeline step-by-step.
60 | See the `User guide <https://negbio.readthedocs.io/en/latest/user_guide.html>`_.
61 |
62 |
63 | Using the CheXpert algorithm
64 | ____________________________
65 |
66 | If you want to use the `CheXpert <https://stanfordmlgroup.github.io/projects/chexpert/>`_ method, run one of the following commands:
67 |
68 | .. code-block:: bash
69 |
70 | $ main_chexpert text --output=examples examples/00000086.txt examples/00019248.txt
71 |
72 | .. code-block:: bash
73 |
74 | $ main_chexpert bioc --output=examples examples/1.xml
75 |
76 |
77 | Using MetaMap
78 | _____________
79 |
80 | If you want to use MetaMap, run one of the following commands, replacing ``<metamap path>`` with the actual **ABSOLUTE**
81 | path to the MetaMap binary, such as **META_MAP_HOME/bin/metamap16**.
82 |
83 | .. code-block:: bash
84 |
85 | $ main_mm text --metamap=<metamap path> --output=examples examples/00000086.txt \
86 | examples/00019248.txt
87 |
88 | .. code-block:: bash
89 |
90 | $ main_mm bioc --metamap=<metamap path> --output=examples examples/1.xml
91 |
92 |
93 | Documentation
94 | =============
95 |
96 | NegBio `documentation <http://negbio.readthedocs.io>`_ is available on Read the Docs.
97 |
98 | See `Getting Started <https://negbio.readthedocs.io/en/latest/getting_started.html>`_ for installation and basic
99 | information. To contribute to NegBio, read our `contribution guide <https://github.com/ncbi-nlp/NegBio/blob/master/CONTRIBUTING.md>`_.
100 |
101 | Citing NegBio
102 | =============
103 |
104 | If you're running the NegBio pipeline, please cite:
105 |
106 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty
107 |   detection in radiology reports <https://arxiv.org/abs/1712.05898>`_. *AMIA 2018 Informatics Summit*. 2018.
108 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks
109 |   on weakly-supervised classification and localization of common thorax diseases <https://arxiv.org/abs/1705.02315>`_.
110 | *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106.
111 |
112 | Acknowledgments
113 | ===============
114 |
115 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of
116 | Medicine and Clinical Center.
117 |
118 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making
119 | their software tools publicly available.
120 |
121 | We thank Dr. Alexis Allot for the helpful discussion.
122 |
123 | Disclaimer
124 | ==========
125 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced
126 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight
127 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information
128 | produced on this website. NIH does not independently verify the validity or utility of the information produced
129 | by this tool. If you have questions about the information produced on this website, please see a health care
130 | professional. More information about NCBI's disclaimer policy is available.
131 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = negbio
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
1 | The documentation in this tree is in plain text files and can be viewed using
2 | any text file viewer.
3 |
4 | It uses ReST (reStructuredText) [1], and the Sphinx documentation system [2].
5 | This allows it to be built into other forms for easier viewing and browsing.
6 |
7 | To create an HTML version of the docs:
8 |
9 | * Install Sphinx (using ``pip install Sphinx sphinx_rtd_theme`` or some other method)
10 |
11 | * In this docs/ directory, type ``make html`` (or ``make.bat html`` on
12 | Windows) at a shell prompt.
13 |
14 | The documentation in _build/html/index.html can then be viewed in a web browser.
15 |
16 | [1] http://docutils.sourceforge.net/rst.html
17 | [2] http://sphinx-doc.org/
--------------------------------------------------------------------------------
/docs/acknowledgments.rst:
--------------------------------------------------------------------------------
1 | Acknowledgments
2 | ---------------
3 |
4 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of
5 | Medicine and Clinical Center.
6 |
7 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making
8 | their software tools publicly available.
9 |
10 | We thank Dr. Alexis Allot for the helpful discussion.
11 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # negbio documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Feb 8 15:24:06 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = []
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | templates_path = ['_templates']
38 |
39 | # The suffix(es) of source filenames.
40 | # You can specify multiple suffix as a list of string:
41 | #
42 | # source_suffix = ['.rst', '.md']
43 | source_suffix = '.rst'
44 |
45 | # The master toctree document.
46 | master_doc = 'index'
47 |
48 | # General information about the project.
49 | project = 'negbio'
50 | copyright = '2019, NCBI, NLM, NIH'
51 | author = 'Yifan Peng'
52 |
53 | # The version info for the project you're documenting, acts as replacement for
54 | # |version| and |release|, also used in various other places throughout the
55 | # built documents.
56 | #
57 | # The short X.Y version.
58 | version = '1.0'
59 | # The full version, including alpha/beta/rc tags.
60 | release = '1.0'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # This patterns also effect to html_static_path and html_extra_path
72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 |
81 | # -- Options for HTML output ----------------------------------------------
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = {}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_static_path = ['_static']
98 |
99 |
100 | # -- Options for HTMLHelp output ------------------------------------------
101 |
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'negbiodoc'
104 |
105 |
106 | # -- Options for LaTeX output ---------------------------------------------
107 |
108 | latex_elements = {
109 | # The paper size ('letterpaper' or 'a4paper').
110 | #
111 | # 'papersize': 'letterpaper',
112 |
113 | # The font size ('10pt', '11pt' or '12pt').
114 | #
115 | # 'pointsize': '10pt',
116 |
117 | # Additional stuff for the LaTeX preamble.
118 | #
119 | # 'preamble': '',
120 |
121 | # Latex figure (float) alignment
122 | #
123 | # 'figure_align': 'htbp',
124 | }
125 |
126 | # Grouping the document tree into LaTeX files. List of tuples
127 | # (source start file, target name, title,
128 | # author, documentclass [howto, manual, or own class]).
129 | latex_documents = [
130 | (master_doc, 'negbio.tex', 'negbio Documentation',
131 | 'Yifan Peng', 'manual'),
132 | ]
133 |
134 |
135 | # -- Options for manual page output ---------------------------------------
136 |
137 | # One entry per manual page. List of tuples
138 | # (source start file, name, description, authors, manual section).
139 | man_pages = [
140 | (master_doc, 'negbio', 'negbio Documentation',
141 | [author], 1)
142 | ]
143 |
144 |
145 | # -- Options for Texinfo output -------------------------------------------
146 |
147 | # Grouping the document tree into Texinfo files. List of tuples
148 | # (source start file, target name, title, author,
149 | # dir menu entry, description, category)
150 | texinfo_documents = [
151 | (master_doc, 'negbio', 'negbio Documentation',
152 | author, 'negbio', 'One line description of project.',
153 | 'Miscellaneous'),
154 | ]
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | Contributing
2 | ------------
3 |
4 | Please read ``CONTRIBUTING.md`` for details on our code of conduct, and the process for submitting pull requests to us.
--------------------------------------------------------------------------------
/docs/developer_guide.rst:
--------------------------------------------------------------------------------
1 | NegBio Developer Guide
2 | ======================
3 |
4 | Create the documentation
5 | ^^^^^^^^^^^^^^^^^^^^^^^^
6 |
7 | Install Sphinx
8 |
9 | .. code-block:: bash
10 | :linenos:
11 |
12 | $ pip install Sphinx
13 | $ pip install sphinx_rtd_theme
14 | $ cd docs
15 | $ make html
--------------------------------------------------------------------------------
/docs/disclaimer.rst:
--------------------------------------------------------------------------------
1 | Disclaimer
2 | ==========
3 |
4 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced
5 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight
6 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information
7 | produced on this website. NIH does not independently verify the validity or utility of the information produced
8 | by this tool. If you have questions about the information produced on this website, please see a health care
9 | professional. More information about NCBI's disclaimer policy is available.
10 |
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
1 | Getting Started with NegBio
2 | ===========================
3 |
4 | These instructions will get you a copy of the project up and running on your local machine for development and testing
5 | purposes. The package should successfully install on Linux (and possibly macOS).
6 |
7 | Installing
8 | ----------
9 |
10 | Prerequisites
11 | ~~~~~~~~~~~~~
12 |
13 | * python >=2.7
14 | * Linux
15 | * Java
16 |
17 | Note: since v1.0, MetaMap is not required. You can use the CheXpert vocabularies (``negbio/chexpert/phrases``) instead.
18 | If you want to use MetaMap, it can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml <https://metamap.nlm.nih.gov/MainDownload.shtml>`_.
19 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml <https://metamap.nlm.nih.gov/Installation.shtml>`_.
20 | Please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
21 |
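Both servers ship with MetaMap; a minimal sketch to start them, assuming MetaMap is installed under ``META_MAP_HOME``:

.. code-block:: bash

   $ META_MAP_HOME/bin/skrmedpostctl start
   $ META_MAP_HOME/bin/wsdserverctl start
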
22 | Installing from source (recommended)
23 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 |
25 | .. code-block:: bash
26 |
27 | $ git clone https://github.com/ncbi-nlp/NegBio.git
28 | $ cd NegBio
29 | $ python setup.py install --user
30 | $ export PATH=~/.local/bin:$PATH
31 |
32 | Installing from pip
33 | ~~~~~~~~~~~~~~~~~~~
34 |
35 | .. code-block:: bash
36 |
37 | $ pip install negbio
38 |
39 |
40 | Using NegBio
41 | ------------
42 |
43 | Prepare the dataset
44 | ~~~~~~~~~~~~~~~~~~~
45 |
46 | The inputs can be in either plain text or `BioC <http://bioc.sourceforge.net/>`_ format. If the reports are in plain
47 | text, each report needs to be in a separate file. Some examples can be found in the ``examples`` folder.
48 |
49 | Run the script
50 | ~~~~~~~~~~~~~~
51 |
52 | There are two ways to run the pipeline.
53 |
54 | Using CheXpert algorithm
55 | ________________________
56 |
57 | If you want to use the CheXpert method, run one of the following commands:
58 |
59 | .. code-block:: bash
60 |
61 | $ main_chexpert text --output=examples/test.neg.xml examples/00000086.txt examples/00019248.txt
62 |
63 | .. code-block:: bash
64 |
65 | $ main_chexpert bioc --output=examples/test.neg.xml examples/1.xml
66 |
67 | The script will
68 |
69 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
70 | 2. Detect concepts using CheXpert pre-defined vocabularies (by default using the list ``negbio/chexpert/phrases``)
71 | 3. Detect positive, negative and uncertain concepts using rules in ``negbio/chexpert/patterns``
72 | 4. Save the results in ``examples/test.neg.xml``
73 |
74 | More options (e.g., setting the CUI list or rules) can be obtained by running
75 |
76 | .. code-block:: bash
77 |
78 | $ main_chexpert --help
79 |
80 | Using MetaMap
81 | _____________
82 |
83 | If you want to use MetaMap, run the following commands, setting ``METAMAP_BIN`` to the actual **ABSOLUTE**
84 | path of the MetaMap binary, such as **META_MAP_HOME/bin/metamap16**.
85 |
86 | .. code-block:: bash
87 |
88 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
89 | $ main_mm text --metamap=$METAMAP_BIN --output=examples/test.neg.xml \
90 | examples/00000086.txt examples/00019248.txt
91 |
92 | .. code-block:: bash
93 |
94 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
95 | $ main_mm bioc --metamap=$METAMAP_BIN --output=examples/test.neg.xml examples/1.xml
96 |
97 | The script will
98 |
99 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
100 | 2. Detect UMLS concepts (CUIs) using MetaMap (by default using the CUI list ``examples/cuis-cvpr2017.txt``)
101 | 3. Detect negative and uncertain CUIs using rules in ``negbio/patterns``
102 | 4. Save the results in ``examples/test.neg.xml``
103 |
104 | More options (e.g., setting the CUI list or rules) can be obtained by running
105 |
106 | .. code-block:: bash
107 |
108 | $ main_mm --help
109 |
110 |
111 | Next Steps
112 | ----------
113 |
114 | To start learning how to use NegBio, see the :doc:`user_guide`.
115 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. negbio documentation master file, created by
2 | sphinx-quickstart on Thu Feb 8 15:24:06 2018.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | NegBio documentation
7 | ====================
8 |
9 | .. toctree::
10 | :maxdepth: 5
11 | :caption: Contents:
12 |
13 | getting_started
14 | user_guide
15 | developer_guide
16 | license
17 | contributing
18 | acknowledgments
19 | disclaimer
20 | reference
21 |
22 |
23 | Indices and tables
24 | ==================
25 |
26 | * :ref:`genindex`
27 | * :ref:`modindex`
28 | * :ref:`search`
29 |
--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | License
2 | =======
3 |
4 | PUBLIC DOMAIN NOTICE
5 |
6 | National Center for Biotechnology Information
7 |
8 | This software/database is a "United States Government Work" under the terms of
9 | the United States Copyright Act. It was written as part of the author's
10 | official duties as a United States Government employee and thus cannot be
11 | copyrighted. This software/database is freely available to the public for use.
12 | The National Library of Medicine and the U.S. Government have not placed any
13 | restriction on its use or reproduction.
14 |
15 | Although all reasonable efforts have been taken to ensure the accuracy and
16 | reliability of the software and data, the NLM and the U.S. Government do not and
17 | cannot warrant the performance or results that may be obtained by using this
18 | software or data. The NLM and the U.S. Government disclaim all warranties,
19 | express or implied, including warranties of performance, merchantability or
20 | fitness for any particular purpose.
21 |
22 | Please cite the author in any work or product based on these materials:
23 |
24 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z.
25 | NegBio: a high-performance tool for negation and uncertainty detection in
26 | radiology reports.
27 | AMIA 2018 Informatics Summit. 2018.
28 |
29 | Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R.
30 | ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks on
31 | weakly-supervised classification and localization of common thorax diseases.
32 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2017, 2097-2106.
33 |
34 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=negbio
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/reference.rst:
--------------------------------------------------------------------------------
1 | Reference
2 | =========
3 |
4 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty
5 |   detection in radiology reports <https://arxiv.org/abs/1712.05898>`_. *AMIA 2018 Informatics Summit*. 2018.
6 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks
7 |   on weakly-supervised classification and localization of common thorax diseases <https://arxiv.org/abs/1705.02315>`_.
8 | *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106.
--------------------------------------------------------------------------------
/docs/user_guide.rst:
--------------------------------------------------------------------------------
1 | NegBio User Guide
2 | =================
3 |
4 | Run the pipeline step-by-step
5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6 |
7 | The step-by-step pipeline generates all intermediate documents, so you can easily rerun a single step if it fails.
8 | The steps are:
9 |
10 | 1. ``text2bioc`` combines text into a BioC XML file.
11 | 2. ``normalize`` removes noisy text such as ``[**Patterns**]``.
12 | 3. ``section_split`` splits the report into sections based on the titles in ``patterns/section_titles.txt``.
13 | 4. ``ssplit`` splits text into sentences.
14 | 5. Named entity recognition
15 |
16 | a. ``dner_mm`` detects UMLS concepts using MetaMap.
17 | b. ``dner_chexpert`` detects concepts using the CheXpert vocabularies at ``negbio/chexpert/phrases``.
18 |
19 | 6. ``parse`` parses sentences using the `Bllip parser <https://github.com/BLLIP/bllip-parser>`_.
20 | 7. ``ptb2ud`` converts the parse tree to universal dependencies using the `Stanford converter <https://github.com/dmcc/PyStanfordDependencies>`_.
21 | 8. Negation detection
22 |
23 | a. ``neg`` detects negative and uncertain findings.
24 | b. ``neg_chexpert`` detects positive, negative, and uncertain findings (recommended).
25 |
26 | 9. ``cleanup`` removes intermediate information.
27 |
28 | Steps 2-9 process the input files one by one and generate the results in the output directory.
29 | Steps 2 and 3 can be skipped, and you can choose either MetaMap (5a) or CheXpert (5b) for named entity recognition. A complete run chaining the steps is sketched below.
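
A minimal end-to-end sketch (the ``step*`` directory names are illustrative; it assumes plain-text reports in ``$TEXT_DIR`` and skips the optional steps 2 and 3):

.. code-block:: bash

   $ negbio_pipeline text2bioc --output=step1/test.xml $TEXT_DIR/*.txt
   $ negbio_pipeline ssplit --output=step2 step1/*.xml
   $ negbio_pipeline dner_chexpert --output=step3 step2/*.xml
   $ negbio_pipeline parse --output=step4 step3/*.xml
   $ negbio_pipeline ptb2ud --output=step5 step4/*.xml
   $ negbio_pipeline neg_chexpert --output=step6 step5/*.xml
   $ negbio_pipeline cleanup --output=step7 step6/*.xml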
30 |
31 | 1. Convert text files to BioC format
32 | ------------------------------------
33 |
34 | You can skip this step if the reports are already in the `BioC <http://bioc.sourceforge.net/>`_ format.
35 | **If you have lots of reports, it is recommended to put them into several BioC files, for example, 100 reports per BioC file.**
36 |
37 | .. code-block:: bash
38 |
39 | $ export BIOC_DIR=/path/to/bioc
40 | $ export TEXT_DIR=/path/to/text
41 | $ negbio_pipeline text2bioc --output=$BIOC_DIR/test.xml $TEXT_DIR/*.txt
42 |
43 | Another commonly used command is:
44 |
45 | .. code-block:: bash
46 |
47 | $ find $TEXT_DIR -type f | negbio_pipeline text2bioc --output=$BIOC_DIR
48 |
49 | 2. Normalize reports
50 | --------------------
51 |
52 | This step removes noisy text, such as the ``[**Patterns**]`` placeholders in the MIMIC-III reports.
53 |
54 | .. code-block:: bash
55 |
56 | $ negbio_pipeline normalize --output=$OUTPUT_DIR $INPUT_DIR/*.xml
57 |
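For reference, MIMIC-III de-identification placeholders look like the bracketed span in this hypothetical report line, which ``normalize`` treats as noise:

.. code-block:: text

   final report [**2151-7-16**] chest pa and lateral
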
58 | 3. Split each report into sections
59 | -----------------------------------
60 |
61 | This step splits the report into sections.
62 | The default list of section titles is in ``patterns/section_titles.txt``.
63 | You can specify customized section titles via the ``--pattern`` option; a hypothetical titles file is sketched below.
64 |
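Assuming one title per line (an illustrative excerpt, not the shipped defaults):

.. code-block:: text

   findings:
   impression:
   comparison:
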
65 | .. code-block:: bash
66 |
67 | $ negbio_pipeline section_split --output=$OUTPUT_DIR $INPUT_DIR/*.xml
68 |
69 |
70 | 4. Split each report into sentences
71 | ------------------------------------
72 |
73 | This step splits the report into sentences using the NLTK splitter
74 | (`nltk.tokenize.sent_tokenize <https://www.nltk.org/api/nltk.tokenize.html>`_).
75 |
76 | .. code-block:: bash
77 |
78 | $ negbio_pipeline ssplit --output=$OUTPUT_DIR $INPUT_DIR/*.xml
79 |
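Under the hood this is NLTK's Punkt-based splitter; a standalone sketch (``nltk.download('punkt')`` may be needed on first use):

.. code-block:: python

   from nltk.tokenize import sent_tokenize

   text = 'no evidence of focal infiltrate, effusion or pneumothorax. stable chest.'
   for sentence in sent_tokenize(text):
       print(sentence)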
80 |
81 | 5. Named entity recognition
82 | ---------------------------
83 |
84 | This step recognizes named entities (e.g., findings, diseases, devices) from the reports.
85 | The first version of NegBio uses MetaMap to detect UMLS concepts.
86 |
87 | MetaMap can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml <https://metamap.nlm.nih.gov/MainDownload.shtml>`_.
88 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml <https://metamap.nlm.nih.gov/Installation.shtml>`_.
89 | Before using MetaMap, please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
90 |
91 | MetaMap tries to extract all UMLS concepts.
92 | Many of them are irrelevant to radiology.
93 | Therefore, it is better to specify the UMLS concepts of interest via the ``--cuis`` option (e.g., ``examples/cuis-cvpr2017.txt``).
94 |
95 | .. code-block:: bash
96 |
97 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
98 | $ negbio_pipeline dner_mm --metamap=$METAMAP_BIN --output=$OUTPUT_DIR $INPUT_DIR/*.xml
99 |
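NegBio talks to MetaMap through the ``pymetamap`` package (pinned in the environment files); a minimal standalone sketch, assuming MetaMap and its two servers are running and the hypothetical binary path below is replaced with your own:

.. code-block:: python

   from pymetamap import MetaMap

   # Hypothetical path; point it at your MetaMap binary.
   mm = MetaMap.get_instance('META_MAP_HOME/bin/metamap16')
   sentences = ['no evidence of focal infiltrate, effusion or pneumothorax.']
   concepts, error = mm.extract_concepts(sentences, ids=[0])
   for concept in concepts:
       print(concept.cui, concept.preferred_name)
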
100 | NegBio also integrates the CheXpert vocabularies to recognize the presence of 14 observations.
101 | All vocabularies can be found at ``negbio/chexpert/phrases``.
102 | Each file in the folder represents one type of named entity, with its various text expressions.
103 | So far, NegBio does not support adding new types to the folder, but you can add more text expressions for an existing type (see the excerpt below).
104 |
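A hypothetical excerpt from a mention file such as ``negbio/chexpert/phrases/mention/cardiomegaly.txt``, assuming one expression per line:

.. code-block:: text

   cardiomegaly
   enlarged heart
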
105 | .. code-block:: bash
106 |
107 | $ negbio_pipeline dner_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
108 |
109 |
110 | In general, MetaMap is more comprehensive, while CheXpert is more accurate on its 14 types of findings.
111 | MetaMap is also slower and more brittle than CheXpert.
112 |
113 |
114 | 6. Parse the sentence
115 | ---------------------
116 |
117 | This step parses sentences using the `Bllip parser <https://github.com/BLLIP/bllip-parser>`_.
118 |
119 | .. code-block:: bash
120 |
121 | $ negbio_pipeline parse --output=$OUTPUT_DIR $INPUT_DIR/*.xml
122 |
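Parsing relies on the ``bllipparser`` package with a biomedical model; a minimal standalone sketch (the ``GENIA+PubMed`` model name is our assumption, downloaded on first use):

.. code-block:: python

   from bllipparser import RerankingParser

   # Fetches the model on the first call, then loads it.
   rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
   # Returns a Penn Treebank-style parse as a string.
   print(rrp.simple_parse('no evidence of focal infiltrate, effusion or pneumothorax.'))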
123 |
124 | 7. Convert the parse tree to UD
125 | -------------------------------
126 |
127 | This step converts the parse tree to universal dependencies using the `Stanford converter <https://github.com/dmcc/PyStanfordDependencies>`_.
128 |
129 | .. code-block:: bash
130 |
131 | $ negbio_pipeline ptb2ud --output=$OUTPUT_DIR $INPUT_DIR/*.xml
132 |
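The conversion goes through the ``StanfordDependencies`` package (``pystanforddependencies`` in the environment files); a minimal standalone sketch, assuming Java is installed (the converter jar is downloaded on first use):

.. code-block:: python

   import StanfordDependencies

   sd = StanfordDependencies.get_instance(backend='subprocess')
   tokens = sd.convert_tree('(S1 (S (NP (DT no) (NN pneumothorax)) (. .)))')
   for token in tokens:
       print(token.index, token.form, token.head, token.deprel)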
133 |
134 | 8. Detect negative and uncertain findings
135 | -----------------------------------------
136 |
137 | This step detects negative and uncertain findings using patterns.
138 | By default, the program uses the negation and uncertainty patterns in the ``negbio/patterns`` folder.
139 | However, you are free to create your own patterns via the ``--neg-patterns`` and ``--uncertainty-patterns`` options.
140 | A pattern is a `semgrex-type <https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html>`_
141 | pattern for matching nodes in the dependency graph.
142 | Currently, we only support ``<`` and ``>`` operations.
143 | A detailed grammar specification (using PLY, Python Lex-Yacc) can be found in ``negbio/ngrex/parser.py``; an example pattern is shown below.
144 |
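For example, this rule from ``negbio/chexpert/patterns/negation.txt`` negates a finding that is the nominal subject of the lemma *absent* (as in "pneumothorax is absent"):

.. code-block:: text

   {} < {dependency:/nsubj/} {lemma:/absent/}
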
145 | .. code-block:: bash
146 |
147 | $ negbio_pipeline neg --output=$OUTPUT_DIR $INPUT_DIR/*.xml
148 |
149 | NegBio also integrates the CheXpert algorithms.
150 | Different from the original NegBio, CheXpert utilizes a 3-phase pipeline consisting of pre-negation uncertainty,
151 | negation, and post-negation uncertainty (`Irvin et al., 2019 <https://arxiv.org/abs/1901.07031>`_).
152 | Each phase consists of rules which are matched against the mention; if a match is found, then the mention is classified
153 | accordingly (as uncertain in the first or third phase, and as negative in the second phase).
154 | If a mention is not matched in any of the phases, it is classified as positive.
155 |
156 | Generally, CheXpert contains more rules and is more accurate than the original NegBio patterns.
157 |
158 | .. code-block:: bash
159 |
160 | $ negbio_pipeline neg_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
161 |
162 | Similarly, you are free to create patterns via the ``--neg-patterns``, ``--pre-uncertainty-patterns``, and
163 | ``--post-uncertainty-patterns`` options.
164 |
165 | 9. Remove intermediate information
166 | ----------------------------------
167 |
168 | This step removes intermediate information (sentence annotations) from the BioC files.
169 |
170 | .. code-block:: bash
171 |
172 | $ negbio_pipeline cleanup --output=$OUTPUT_DIR $INPUT_DIR/*.xml
173 |
174 |
--------------------------------------------------------------------------------
/environment2.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio2.7
2 | channels:
3 | - anaconda
4 | - conda-forge
5 | - auto
6 | dependencies:
7 | - python=2.7.11
8 | - future=0.16.0
9 | - docutils=0.13.1
10 | - docopt=0.6.2
11 | - pytest=3.1.3
12 | - networkx=1.11
13 | - ply=3.10
14 | - tqdm=4.19.5
15 | - nltk=3.2.4
16 | - pathlib2=2.3.3
17 | - numpy=1.15.4
18 | - jpype1=0.6.3
19 | - pip:
20 | - bioc==1.1.dev3
21 | - pystanforddependencies==0.3.1
22 | - bllipparser==2016.9.11
23 | - pymetamap==0.1
24 |
--------------------------------------------------------------------------------
/environment3.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio3.7
2 | channels:
3 | - anaconda
4 | - conda-forge
5 | - auto
6 | dependencies:
7 | - python=3.7
8 | - docutils=0.14
9 | - docopt=0.6.2
10 | - pytest=4.2.0
11 | - networkx=2.2
12 | - ply=3.11
13 | - tqdm=4.31
14 | - nltk=3.4
15 | - numpy=1.16
16 | - jpype1=0.6.3
17 | - pip:
18 | - bioc==1.3.1
19 | - pystanforddependencies==0.3.1
20 | - bllipparser==2016.9.11
21 | - pymetamap==0.1
22 |
--------------------------------------------------------------------------------
/examples/00000086.txt:
--------------------------------------------------------------------------------
1 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
2 | stable. lungs are unchanged. air- filled cystic changes. no
3 | pneumothorax. osseous structures unchanged scoliosis
4 | impression: stable chest.
5 | dictating
--------------------------------------------------------------------------------
/examples/00019248.txt:
--------------------------------------------------------------------------------
1 | findings:
2 | chest: four images:
3 | right picc with tip within the upper svc.
4 | probable enlargement of the main pulmonary artery.
5 | mild cardiomegaly.
6 | no evidence of focal infiltrate, effusion or pneumothorax.
7 | dictating
--------------------------------------------------------------------------------
/examples/1.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 2017-05-31
5 |
6 |
7 |
8 | 00019248
9 |
10 | 0
11 | findings:
12 | chest: four images:
13 | right picc with tip within the upper svc.
14 | probable enlargement of the main pulmonary artery.
15 | mild cardiomegaly.
16 | no evidence of focal infiltrate, effusion or pneumothorax.
17 | dictating
18 |
19 | Cardiomegaly
20 | C0018800
21 | MetaMap
22 | fndg
23 |
24 | Mild cardiomegaly.
25 |
26 |
27 | Infiltration
28 | C0332448
29 | MetaMap
30 | ftcn
31 |
32 | infiltrate
33 |
34 |
35 | effusion
36 | C0013687
37 | MetaMap
38 | patf
39 |
40 | effusion
41 |
42 |
43 | Pneumothorax
44 | C0032326
45 | MetaMap
46 | dsyn
47 |
48 | pneumothorax.
49 |
50 |
51 |
52 |
53 | 00000086
54 |
55 | 0
56 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
57 | stable. lungs are unchanged. air- filled cystic changes. no
58 | pneumothorax. osseous structures unchanged scoliosis
59 | impression: stable chest.
60 | dictating
61 |
62 | True
63 | Pneumothorax
64 | C0032326
65 | MetaMap
66 | dsyn
67 |
68 | pneumothorax
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/examples/2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 2017-05-31
5 |
6 |
7 | 00000086
8 |
9 | 0
10 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
11 | stable. lungs are unchanged. air- filled cystic changes. no
12 | pneumothorax. osseous structures unchanged scoliosis
13 | impression: stable chest.
14 | dictating
15 |
16 | True
17 | Pneumothorax
18 | C0032326
19 | MetaMap
20 | dsyn
21 |
22 | pneumothorax
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/examples/cuis-cvpr2017.txt:
--------------------------------------------------------------------------------
1 | C0264494
2 | C0264496
3 | C0004144
4 | C0264495
5 | C0018800
6 | C0702116
7 | C0521530
8 | C0013604
9 | C0013608
10 | C0034063
11 | C0031039
12 | C0013687
13 | C0747635
14 | C1265808
15 | C0747639
16 | C0032227
17 | C0034067
18 | C0038536
19 | C0016059
20 | C0034069
21 | C0019270
22 | C3489393
23 | C0744895
24 | C0332448
25 | C0235896
26 | C0577559
27 | C3152252
28 | C0748419
29 | C1265602
30 | C0028259
31 | C0332558
32 | C0034079
33 | C0746923
34 | C0748164
35 | C0264545
36 | C1960024
37 | C0585104
38 | C0585105
39 | C0585106
40 | C0032285
41 | C0578577
42 | C0578576
43 | C0577702
44 | C0747651
45 | C0546333
46 | C0032326
47 | C0264557
48 | C0546334
49 |
--------------------------------------------------------------------------------
/examples/openi-testset.txt:
--------------------------------------------------------------------------------
1 | CXR10
2 | CXR1002
3 | CXR1007
4 | CXR1008
5 | CXR101
6 | CXR102
7 | CXR1020
8 | CXR1028
9 | CXR1042
10 | CXR105
11 | CXR1055
12 | CXR1056
13 | CXR1058
14 | CXR1062
15 | CXR1074
16 | CXR1076
17 | CXR1077
18 | CXR1078
19 | CXR1091
20 | CXR1092
21 | CXR1099
22 | CXR11
23 | CXR1102
24 | CXR1109
25 | CXR1118
26 | CXR112
27 | CXR1121
28 | CXR1138
29 | CXR1140
30 | CXR1159
31 | CXR1161
32 | CXR1163
33 | CXR1167
34 | CXR1169
35 | CXR1179
36 | CXR1190
37 | CXR1194
38 | CXR1199
39 | CXR1202
40 | CXR1206
41 | CXR1208
42 | CXR1210
43 | CXR1213
44 | CXR1218
45 | CXR1222
46 | CXR1226
47 | CXR1231
48 | CXR1239
49 | CXR124
50 | CXR1243
51 | CXR1248
52 | CXR1255
53 | CXR1258
54 | CXR1260
55 | CXR1265
56 | CXR1267
57 | CXR127
58 | CXR1270
59 | CXR1273
60 | CXR1286
61 | CXR1287
62 | CXR1288
63 | CXR1292
64 | CXR1295
65 | CXR1297
66 | CXR1314
67 | CXR1316
68 | CXR1332
69 | CXR1334
70 | CXR1374
71 | CXR1378
72 | CXR1391
73 | CXR1392
74 | CXR1396
75 | CXR1397
76 | CXR1398
77 | CXR1399
78 | CXR14
79 | CXR1401
80 | CXR1409
81 | CXR141
82 | CXR1411
83 | CXR1413
84 | CXR1439
85 | CXR144
86 | CXR1443
87 | CXR1444
88 | CXR1448
89 | CXR145
90 | CXR1452
91 | CXR1460
92 | CXR1461
93 | CXR1468
94 | CXR1481
95 | CXR1487
96 | CXR1497
97 | CXR1500
98 | CXR1510
99 | CXR1515
100 | CXR1518
101 | CXR1519
102 | CXR1527
103 | CXR1528
104 | CXR1529
105 | CXR153
106 | CXR154
107 | CXR1540
108 | CXR1548
109 | CXR1551
110 | CXR1556
111 | CXR1563
112 | CXR1568
113 | CXR1570
114 | CXR1576
115 | CXR1581
116 | CXR1583
117 | CXR1586
118 | CXR159
119 | CXR1593
120 | CXR1602
121 | CXR1605
122 | CXR1608
123 | CXR1614
124 | CXR1617
125 | CXR1624
126 | CXR1627
127 | CXR163
128 | CXR1632
129 | CXR1638
130 | CXR1639
131 | CXR1643
132 | CXR1647
133 | CXR166
134 | CXR1660
135 | CXR167
136 | CXR1671
137 | CXR1691
138 | CXR1709
139 | CXR1711
140 | CXR1716
141 | CXR1724
142 | CXR1725
143 | CXR1728
144 | CXR1729
145 | CXR1733
146 | CXR1734
147 | CXR1736
148 | CXR1738
149 | CXR1739
150 | CXR1740
151 | CXR1746
152 | CXR1756
153 | CXR1763
154 | CXR1764
155 | CXR1765
156 | CXR1766
157 | CXR1767
158 | CXR1773
159 | CXR1777
160 | CXR1783
161 | CXR1801
162 | CXR1806
163 | CXR1813
164 | CXR1814
165 | CXR1816
166 | CXR1823
167 | CXR1831
168 | CXR1832
169 | CXR1841
170 | CXR1845
171 | CXR1861
172 | CXR1868
173 | CXR1871
174 | CXR1877
175 | CXR1881
176 | CXR1883
177 | CXR1884
178 | CXR1892
179 | CXR1895
180 | CXR1896
181 | CXR190
182 | CXR1903
183 | CXR1904
184 | CXR1909
185 | CXR191
186 | CXR1912
187 | CXR1914
188 | CXR1920
189 | CXR1923
190 | CXR1926
191 | CXR1929
192 | CXR193
193 | CXR1934
194 | CXR194
195 | CXR1942
196 | CXR1944
197 | CXR1946
198 | CXR1951
199 | CXR1952
200 | CXR1954
201 | CXR1958
202 | CXR1960
203 | CXR1964
204 | CXR1965
205 | CXR1969
206 | CXR1972
207 | CXR1977
208 | CXR1978
209 | CXR1979
210 | CXR1992
211 | CXR1993
212 | CXR1994
213 | CXR1999
214 | CXR2011
215 | CXR2012
216 | CXR2014
217 | CXR2029
218 | CXR2032
219 | CXR2038
220 | CXR2039
221 | CXR204
222 | CXR2040
223 | CXR2050
224 | CXR2053
225 | CXR2059
226 | CXR2061
227 | CXR2062
228 | CXR2066
229 | CXR2067
230 | CXR207
231 | CXR2072
232 | CXR2080
233 | CXR2086
234 | CXR2087
235 | CXR2089
236 | CXR2098
237 | CXR21
238 | CXR211
239 | CXR2111
240 | CXR2114
241 | CXR2115
242 | CXR2126
243 | CXR2131
244 | CXR2140
245 | CXR2142
246 | CXR2145
247 | CXR2152
248 | CXR2162
249 | CXR2163
250 | CXR2165
251 | CXR2167
252 | CXR2170
253 | CXR2171
254 | CXR2172
255 | CXR2177
256 | CXR2183
257 | CXR2191
258 | CXR2195
259 | CXR2199
260 | CXR2202
261 | CXR2205
262 | CXR2210
263 | CXR2211
264 | CXR2221
265 | CXR2222
266 | CXR2225
267 | CXR2244
268 | CXR2247
269 | CXR2250
270 | CXR2257
271 | CXR2264
272 | CXR2265
273 | CXR2268
274 | CXR2275
275 | CXR2287
276 | CXR2288
277 | CXR2289
278 | CXR2301
279 | CXR2307
280 | CXR2308
281 | CXR2324
282 | CXR2326
283 | CXR233
284 | CXR235
285 | CXR2352
286 | CXR2353
287 | CXR2357
288 | CXR2360
289 | CXR2368
290 | CXR237
291 | CXR2371
292 | CXR2372
293 | CXR2378
294 | CXR2380
295 | CXR2382
296 | CXR2388
297 | CXR2392
298 | CXR2395
299 | CXR2396
300 | CXR2397
301 | CXR240
302 | CXR2409
303 | CXR2414
304 | CXR2419
305 | CXR242
306 | CXR2421
307 | CXR243
308 | CXR2430
309 | CXR2437
310 | CXR2438
311 | CXR2448
312 | CXR2450
313 | CXR2455
314 | CXR2460
315 | CXR2462
316 | CXR2463
317 | CXR2465
318 | CXR2472
319 | CXR2474
320 | CXR2482
321 | CXR2494
322 | CXR2495
323 | CXR2496
324 | CXR2497
325 | CXR2498
326 | CXR2499
327 | CXR2503
328 | CXR2506
329 | CXR2515
330 | CXR2516
331 | CXR2519
332 | CXR2523
333 | CXR2525
334 | CXR2526
335 | CXR2530
336 | CXR2533
337 | CXR2536
338 | CXR2540
339 | CXR2542
340 | CXR2547
341 | CXR2557
342 | CXR256
343 | CXR2573
344 | CXR2577
345 | CXR2582
346 | CXR2583
347 | CXR2585
348 | CXR2595
349 | CXR2601
350 | CXR2604
351 | CXR2607
352 | CXR2608
353 | CXR261
354 | CXR2610
355 | CXR2617
356 | CXR2619
357 | CXR2620
358 | CXR2622
359 | CXR2625
360 | CXR2629
361 | CXR2636
362 | CXR2642
363 | CXR2649
364 | CXR2654
365 | CXR2655
366 | CXR2673
367 | CXR2684
368 | CXR2688
369 | CXR2699
370 | CXR27
371 | CXR2714
372 | CXR2716
373 | CXR2730
374 | CXR2739
375 | CXR2752
376 | CXR2759
377 | CXR276
378 | CXR2768
379 | CXR2776
380 | CXR2780
381 | CXR2782
382 | CXR2791
383 | CXR28
384 | CXR2808
385 | CXR2817
386 | CXR2820
387 | CXR2824
388 | CXR2827
389 | CXR2832
390 | CXR2833
391 | CXR284
392 | CXR2847
393 | CXR2852
394 | CXR2856
395 | CXR2858
396 | CXR286
397 | CXR287
398 | CXR2871
399 | CXR2876
400 | CXR2879
401 | CXR288
402 | CXR2887
403 | CXR2890
404 | CXR29
405 | CXR2901
406 | CXR2906
407 | CXR2909
408 | CXR2911
409 | CXR2924
410 | CXR2926
411 | CXR2927
412 | CXR2931
413 | CXR2940
414 | CXR2942
415 | CXR2951
416 | CXR2960
417 | CXR2966
418 | CXR2968
419 | CXR2969
420 | CXR297
421 | CXR2979
422 | CXR2981
423 | CXR2992
424 | CXR2997
425 | CXR300
426 | CXR3008
427 | CXR3011
428 | CXR3012
429 | CXR3016
430 | CXR302
431 | CXR3034
432 | CXR3038
433 | CXR304
434 | CXR3045
435 | CXR3046
436 | CXR305
437 | CXR3050
438 | CXR3053
439 | CXR3056
440 | CXR3057
441 | CXR3063
442 | CXR307
443 | CXR3070
444 | CXR3071
445 | CXR3083
446 | CXR3084
447 | CXR309
448 | CXR3093
449 | CXR3094
450 | CXR310
451 | CXR3100
452 | CXR3101
453 | CXR3106
454 | CXR3109
455 | CXR3112
456 | CXR3121
457 | CXR3123
458 | CXR3132
459 | CXR3133
460 | CXR3135
461 | CXR3145
462 | CXR3152
463 | CXR3153
464 | CXR3154
465 | CXR3155
466 | CXR3156
467 | CXR3159
468 | CXR3163
469 | CXR3176
470 | CXR3177
471 | CXR3178
472 | CXR3184
473 | CXR3197
474 | CXR3199
475 | CXR3206
476 | CXR3208
477 | CXR3213
478 | CXR3218
479 | CXR3230
480 | CXR3238
481 | CXR3242
482 | CXR3254
483 | CXR3255
484 | CXR3257
485 | CXR326
486 | CXR3261
487 | CXR3262
488 | CXR3271
489 | CXR3272
490 | CXR3288
491 | CXR3290
492 | CXR3292
493 | CXR3296
494 | CXR33
495 | CXR3307
496 | CXR3315
497 | CXR3318
498 | CXR3319
499 | CXR332
500 | CXR3323
501 | CXR3325
502 | CXR3329
503 | CXR333
504 | CXR3332
505 | CXR3333
506 | CXR3337
507 | CXR334
508 | CXR3342
509 | CXR3355
510 | CXR3356
511 | CXR3368
512 | CXR3373
513 | CXR3395
514 | CXR3405
515 | CXR3410
516 | CXR3413
517 | CXR3416
518 | CXR3419
519 | CXR342
520 | CXR3428
521 | CXR3432
522 | CXR3437
523 | CXR3439
524 | CXR3443
525 | CXR3449
526 | CXR3451
527 | CXR3473
528 | CXR3477
529 | CXR3479
530 | CXR3485
531 | CXR349
532 | CXR3499
533 | CXR3514
534 | CXR3521
535 | CXR3523
536 | CXR3524
537 | CXR3525
538 | CXR353
539 | CXR3530
540 | CXR3539
541 | CXR3543
542 | CXR3559
543 | CXR3562
544 | CXR357
545 | CXR3575
546 | CXR358
547 | CXR3585
548 | CXR3586
549 | CXR3587
550 | CXR3589
551 | CXR3596
552 | CXR3599
553 | CXR36
554 | CXR3603
555 | CXR3606
556 | CXR3609
557 | CXR3610
558 | CXR3619
559 | CXR3623
560 | CXR3632
561 | CXR3640
562 | CXR3641
563 | CXR3645
564 | CXR3648
565 | CXR366
566 | CXR3661
567 | CXR3663
568 | CXR3666
569 | CXR3668
570 | CXR3670
571 | CXR3677
572 | CXR368
573 | CXR3683
574 | CXR3684
575 | CXR3685
576 | CXR3698
577 | CXR370
578 | CXR3700
579 | CXR3714
580 | CXR3715
581 | CXR3718
582 | CXR3726
583 | CXR3735
584 | CXR3741
585 | CXR3744
586 | CXR3747
587 | CXR3762
588 | CXR3777
589 | CXR3785
590 | CXR379
591 | CXR3792
592 | CXR3795
593 | CXR3798
594 | CXR38
595 | CXR3803
596 | CXR3806
597 | CXR3817
598 | CXR3825
599 | CXR383
600 | CXR3830
601 | CXR3832
602 | CXR3837
603 | CXR3838
604 | CXR3846
605 | CXR3847
606 | CXR3849
607 | CXR3851
608 | CXR3852
609 | CXR3858
610 | CXR3860
611 | CXR3865
612 | CXR3867
613 | CXR3869
614 | CXR3870
615 | CXR3879
616 | CXR3881
617 | CXR3885
618 | CXR3888
619 | CXR3898
620 | CXR3899
621 | CXR3901
622 | CXR3906
623 | CXR3908
624 | CXR3913
625 | CXR392
626 | CXR3921
627 | CXR3923
628 | CXR3925
629 | CXR3928
630 | CXR3934
631 | CXR3935
632 | CXR3937
633 | CXR3946
634 | CXR3948
635 | CXR3952
636 | CXR3963
637 | CXR398
638 | CXR399
639 | CXR3998
640 | CXR40
641 | CXR402
642 | CXR403
643 | CXR406
644 | CXR408
645 | CXR416
646 | CXR420
647 | CXR423
648 | CXR427
649 | CXR432
650 | CXR439
651 | CXR444
652 | CXR445
653 | CXR46
654 | CXR467
655 | CXR47
656 | CXR471
657 | CXR473
658 | CXR474
659 | CXR477
660 | CXR48
661 | CXR481
662 | CXR493
663 | CXR494
664 | CXR503
665 | CXR508
666 | CXR512
667 | CXR522
668 | CXR53
669 | CXR530
670 | CXR540
671 | CXR55
672 | CXR565
673 | CXR570
674 | CXR573
675 | CXR577
676 | CXR584
677 | CXR585
678 | CXR589
679 | CXR590
680 | CXR598
681 | CXR60
682 | CXR601
683 | CXR606
684 | CXR611
685 | CXR616
686 | CXR617
687 | CXR622
688 | CXR639
689 | CXR64
690 | CXR645
691 | CXR646
692 | CXR654
693 | CXR661
694 | CXR665
695 | CXR668
696 | CXR671
697 | CXR672
698 | CXR673
699 | CXR674
700 | CXR680
701 | CXR686
702 | CXR695
703 | CXR698
704 | CXR700
705 | CXR703
706 | CXR705
707 | CXR706
708 | CXR707
709 | CXR71
710 | CXR712
711 | CXR719
712 | CXR726
713 | CXR73
714 | CXR733
715 | CXR737
716 | CXR738
717 | CXR741
718 | CXR742
719 | CXR743
720 | CXR751
721 | CXR752
722 | CXR756
723 | CXR760
724 | CXR781
725 | CXR792
726 | CXR795
727 | CXR797
728 | CXR8
729 | CXR800
730 | CXR805
731 | CXR831
732 | CXR833
733 | CXR837
734 | CXR840
735 | CXR843
736 | CXR846
737 | CXR853
738 | CXR855
739 | CXR856
740 | CXR859
741 | CXR871
742 | CXR875
743 | CXR885
744 | CXR888
745 | CXR889
746 | CXR892
747 | CXR897
748 | CXR903
749 | CXR904
750 | CXR906
751 | CXR907
752 | CXR909
753 | CXR919
754 | CXR920
755 | CXR921
756 | CXR925
757 | CXR927
758 | CXR929
759 | CXR932
760 | CXR934
761 | CXR935
762 | CXR939
763 | CXR941
764 | CXR943
765 | CXR95
766 | CXR964
767 | CXR970
768 | CXR975
769 | CXR981
770 | CXR989
771 | CXR992
--------------------------------------------------------------------------------
/images/negbio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/images/negbio.png
--------------------------------------------------------------------------------
/negbio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/__init__.py
--------------------------------------------------------------------------------
/negbio/chexpert/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Stanford Machine Learning Group
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/negbio/chexpert/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The code and patterns in this package are built on the CheXpert labeler.
3 | https://github.com/stanfordmlgroup/chexpert-labeler
4 | """
5 |
--------------------------------------------------------------------------------
/negbio/chexpert/constants.py:
--------------------------------------------------------------------------------
1 | # Observation constants
2 | CARDIOMEGALY = "Cardiomegaly"
3 | ENLARGED_CARDIOMEDIASTINUM = "Enlarged Cardiomediastinum"
4 | SUPPORT_DEVICES = "Support Devices"
5 | NO_FINDING = "No Finding"
6 | OBSERVATION = "observation"
7 | CATEGORIES = ["No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly",
8 | "Lung Lesion", "Airspace Opacity", "Edema", "Consolidation",
9 | "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion",
10 | "Pleural Other", "Fracture", "Support Devices"]
11 |
12 | # Numeric constants
13 | POSITIVE = 1
14 | NEGATIVE = 0
15 | UNCERTAIN = -1
16 |
17 | # Misc. constants
18 | UNCERTAINTY = "uncertainty"
19 | NEGATION = "negation"
20 | REPORTS = "Reports"
21 |
--------------------------------------------------------------------------------
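For orientation, these constants combine into the per-report label vectors used downstream (see stages/aggregate.py): one entry per category, in CATEGORIES order. A minimal sketch of the convention, with illustrative values only:

    import numpy as np
    from negbio.chexpert.constants import CATEGORIES, POSITIVE, NEGATIVE, UNCERTAIN

    # np.nan -> category never mentioned; 1 / 0 / -1 -> positive / negative / uncertain
    row = [np.nan] * len(CATEGORIES)
    row[CATEGORIES.index("Cardiomegaly")] = POSITIVE      # 1
    row[CATEGORIES.index("Pleural Effusion")] = NEGATIVE  # 0
    row[CATEGORIES.index("Edema")] = UNCERTAIN            # -1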
/negbio/chexpert/patterns/negation.txt:
--------------------------------------------------------------------------------
1 | # No definite XXX
2 | ({} > {} {lemma:/definite/}) > {dependency:/neg/} {}
3 |
4 | # No obvious XXX
5 | ({} > {} {lemma:/obvious/}) > {dependency:/neg/} {}
6 |
7 |
8 | {} > {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/}
9 | {} < {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/}
10 | ({} > {} {}) < {dependency:/nsubj|dobj/} {lemma:/unremarkable|normal/}
11 | {} < {} ({} > {dependency:/amod/} {lemma:/normal|unremarkable/})
12 | {} < {} ({} < {dependency:/nsubj/} {lemma:/normal|unremarkable/})
13 | {} < {dependency:/conj:no/} {}
14 | {} < {} ({} < {dependency:/conj:or/} ({} > {} {lemma:/no/}))
15 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})
16 | {} < {} ({dependency:/exclude/} < {} ({} > {} {lemma:/no/}))
17 |
18 |
19 | ({lemma:/silhouette/} > {} {}) < {dependency:/dobj|nsubj/} {lemma:/obscure/}
20 |
21 | ({} > {dependency:/amod/} {lemma:/normal|unremarkable/}) < {dependency:/dobj|nsubj/} {lemma:/demonstrate.*|show|present|display/}
22 | {} < {dependency:/nmod:of/} ( {lemma:/appearance/} > {dependency:/amod/} {lemma:/normal/} & < {dependency:/dobj/} {lemma:/demonstrate.*|show|present|display/})
23 |
24 | {} < {dependency:/amod/} ({} < {dependency:/dep|nsubj/} {lemma:/normal|unremarkable/})
25 | {} < {dependency:/amod/} ({} > {dependency:/neg/} {lemma:/no/})
26 | {} < {dependency:/amod/} ({lemma:/finding.*/} < {dependency:/dobj/} ({lemma:/acute/} > {dependency:/nsubj/} {lemma:/no/}))
27 | {} < {dependency:/amod/} ({lemma:/structure.*/} < {dependency:/dep|nsubj/} ({lemma:/appear/} > {dependency:/xcomp/} {lemma:/normal|unremarkable/}))
28 |
29 | {} < {dependency:/compound/} ({} > {dependency:/neg/} {})
30 | {} < {dependency:/nsubj/} {lemma:/absent/}
31 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/case/} {lemma:/without/}))
32 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/neg/} {}))
33 |
34 | # XXX within normal limits
35 | {} < {} ({} < {} ({lemma:/show|demonstrate|present/} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/})))
36 | ({} > {} {}) > {dependency:/nmod:within/} {lemma:/limit.*/}
37 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})
38 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/}))
39 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/}))
40 | ({lemma:/vascularity/} > {dependency:/amod/} {lemma:/pulmonary/}) > {dependency:/amod/} {lemma:/normal/}
41 | {} < {dependency:/dobj|nsubj/} ({} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/}))
42 | {} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/})
43 | {} > {} ({lemma:/limit/} > {} {lemma:/normal/})
44 |
45 | # XXX is/appears/are/appear/remain/remains (now, otherwise) normal/unremarkable
46 | {} < {} ({lemma:/appear|remain/} > {} {lemma:/normal|unremarkable/})
47 |
48 | # XXX is/appears/are/appear/remain/remains (now, otherwise) within normal limits
49 | {} > {} ({lemma:/remain|appear/} > {} ({lemma:/limit/} > {} {lemma:/normal/}))
50 |
51 |
52 | # rather than XXX / without XXX
53 | {} <{dependency:/conj:negcc/} {}
54 | {} <{dependency:/nmod:without/} {}
55 |
56 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key
57 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key
58 |
59 | # removal of XXX
60 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|removal/}
61 | {} <{dependency:/nmod:for/} {lemma:/negative/}
62 |
63 | # exclude XXX
64 | {} <{} {lemma:/exclude/}
65 |
66 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/}
67 |
68 | # XXX has resolved
69 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {})
70 |
71 | # there is no XXX
72 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})
73 |
74 | # without evidence|finding of|for XXX
75 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {})
76 |
77 | # without development of XXX
78 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/without/})
79 |
80 | # No development of XXX
81 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/no/})
82 |
83 | # no evidence of|for XXX
84 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {})
85 |
86 | # without evidence|finding of|for XXX
87 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/})
88 |
89 | # no focus of XXX
90 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {})
91 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/})
92 |
93 | # no moderate to XXX
94 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {})
95 |
96 | # no evidence of developing XXX
97 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {}))
98 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/}))
99 |
100 | # no focal XXX
101 | {} <{dependency:/dobj/} ({} >{dependency:/nsubj/} {lemma:/no/})
102 |
103 | # XXX is previously demonstrated/visualized
104 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/})
105 |
106 | # there is no NN to suggest/explain XXX
107 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/V.*/} > {} ({tag:/N.*/} > {} {lemma:/no/})))
108 |
109 | # no NN to suggest/explain XXX
110 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/N.*/} > {} {lemma:/no/}))
111 |
112 | # no area of XXX
113 | {} < {dependency:/nmod:of/} ({lemma:/area/} > {dependency:/compound/} {lemma:/no/})
114 |
115 | # XXX is not enlarged
116 | {} < {dependency:/nsubjpass/} ({lemma:/enlarge/} > {dependency:/neg/} {})
117 |
118 | # without development of XXX
119 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {dependency:/case/} {lemma:/without/})
120 |
121 | # XXX removed
122 | {} < {} {lemma:/remove/}
123 | {} > {} {lemma:/remove/}
124 |
125 | # XXX is no longer seen
126 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} ({} > {dependency:/neg/} {lemma:/no/}))
127 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} {lemma:/no/})
128 |
129 | # without evidence seen for XXX
130 | {} < {} ({lemma:/see/} > {} ({} > {} ({lemma:/evidence/} > {} {lemma:/without/})))
131 | {} < {} ({lemma:/see/} > {} ({lemma:/evidence/} > {} {lemma:/without/}))
132 |
133 | # normal/unremarkable appearance of XXX
134 | {} < {} ({lemma:/appearance/} > {} {lemma:/normal|unremarkable/})
135 |
136 | # normal/unremarkable XXX | XXX is/appears normal/unremarkable
137 | # make more general
138 | {} > {} {lemma:/normal|unremarkable/}
139 | {} < {} {lemma:/normal|unremarkable/}
140 |
141 | # XXX has/have cleared
142 | # cleared XXX
143 | {} < {} {lemma:/clear/}
144 | {} > {} {lemma:/clear/}
145 |
146 | # no obvious associated XXX
147 | {} < {} ({lemma:/associate.*/} > {} ({lemma:/obvious/} > {dependency:/neg/} {}))
148 | {} > {dependency:/neg/} {} & > {} {lemma:/obvious/} & > {} {lemma:/associate.*/}
149 |
150 | # XXX with interval resolution
151 | {} > {} ({lemma:/resolution/} > {} {lemma:/interval/})
152 |
153 | # no XXX / general negative case
154 | {} >{dependency:/neg/} {}
155 | {} >{} {lemma:/no/}
156 | {} >{dependency:/case/} {lemma:/without/}
157 |
--------------------------------------------------------------------------------
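These rules are ngrex graph patterns, matched against each sentence's universal dependency graph rather than its surface text. A minimal sketch of how a single rule from this file is applied, assuming `sentence` is a BioCSentence that already carries dependency annotations (produced by the parse and ptb2ud stages):

    from negbio import ngrex
    from negbio.neg import semgraph, propagator

    # "there is no XXX": the mention's nsubj head is lemma 'be' with a 'no' child
    pattern = ngrex.compile('{} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})')

    g = semgraph.load(sentence)  # build the dependency graph of the sentence
    propagator.propagate(g)      # augment the graph, as the detectors do before matching
    for m in pattern.finditer(g):
        print(m.group(0))        # the node matched by the first, unnamed {} term

In the detectors, a mention counts as negated when m.group(0) coincides with a node whose span overlaps the mention.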
/negbio/chexpert/patterns/post_negation_uncertainty.txt:
--------------------------------------------------------------------------------
1 | # Added Rules
2 |
3 | # Stable/unchanged silhouette/cardiomediastinal
4 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {dependency:/amod/} {lemma:/stable|unchanged/}
5 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/change/} > {dependency:/neg/} {})
6 |
7 | # Silhouette/cardiomediastinal is stable|unchanged|not changed
8 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nsubj/} {lemma:/stable|unchanged/}
9 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/change/} > {dependency:/neg/} {})
10 |
11 | # {} < {} ({lemma:/change/} > {dependency:/neg/} {})
12 |
13 | # Silhouette/cardiomediastinal similar to prior
14 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/})
15 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/}))
16 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/})
17 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/}))
18 |
19 | # Stable appearance of silhouette/cardiomediastinal
20 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nmod:of/} ({lemma:/appearance/} > {} {lemma:/stable/})
21 |
22 | {} < {} ({lemma:/excluded/} > {dependency:/neg/} {})
23 | {} < {dependency:/nmod:for/} {lemma:/suspicious/}
24 | {} < {dependency:/dobj/} ({lemma:/represent/} > {dependency:/advmod/} {lemma:/possibly/})
25 | {} > {dependency:/cc/} {lemma:/and.or/}
26 | {} < {dependency:/conj:and.or/} {}
27 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/}
28 |
29 | {} < {dependency:/dep/} ({} > {dependency:/acl:relcl/} ({lemma:/represent/} < {dependency:/aux/} {lemma:/may/}))
30 | {} < {dependency:/nmod:for/} {lemma:/worrisome/}
31 |
32 | # XXX versus YYY
33 | {} < {dependency:/conj:versus/} {}
34 | {} > {dependency:/conj:versus/} {}
35 |
36 | # {} < {dependency:/nsubjpass/} ({lemma:/change/} > {dependency:/neg/} {})
37 | ({lemma:/angle/} > {dependency:/nsubj/} {lemma:/costophrenic/}) > {dependency:/nmod:of/} {lemma:/blunt.*/}
38 | {} < {dependency:/nsubj/} ({} > {} ({lemma:/likely/} > {} {lemma:/less/}))
39 |
40 | {} < {dependency:/nmod:out/} {lemma:/cannot/}
41 |
42 | # outgoing edge
43 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/}
44 | {} >{} {lemma:/question/}
45 |
46 | # May/might/would/could be XXX
47 | {} > {} {lemma:/may|might|would|could/}
48 |
49 | # '{} >{dependency:/cop/} {lemma:/may|would|could/}
50 |
51 | # incoming edge
52 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/}
53 | {} <{dependency:/dobj/} {lemma:/suspect|favor|question|consider/}
54 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/}
55 | {} <{dependency:/nsubjpass/} {lemma:/suspect/}
56 | {} <{} {lemma:/possible/}
57 |
58 | # parsing error
59 | # suspected XXX
60 | {} <{dependency:/dobj/} {lemma:/suspect/}
61 | {} >{dependency:/advmod/} {lemma:/suspect/}
62 |
63 | # maybe due to XXX
64 | {} <{dependency:/dep/} {lemma:/maybe/}
65 |
66 | # may/could represent/reflect/indicate/include XXX
67 | {} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/})
68 |
69 | # may/could represent/reflect/indicate/include the presence of XXX
70 | {} < {} ({lemma:/presence/} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/}))
71 |
72 | # maybe secondary to XXX
73 | {} <{dependency:/nmod:to/} {lemma:/secondary/}
74 |
75 | # may be due to XXX
76 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/})
77 |
78 | # could be related to XXX
79 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/})
80 |
81 | # may be compatible with XXX
82 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/})
83 |
84 | # question left XXX
85 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/})
86 | {} >{} {lemma:/left/} <{} {lemma:/question/}
87 |
88 | # differential diagnosis includes
89 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/}))
90 |
91 | # may be XXX
92 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/}
93 |
94 | # parsing error
95 | # XXX suspected
96 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}
97 |
98 | # Correlation for symptoms of XXX
99 | {} < {dependency:/nmod:of/} ({lemma:/symptom/} < {dependency:/nmod:for/} {lemma:/correlation/})
100 |
101 | # borderline heart size
102 | {lemma:/heart/} < {dependency:/compound/} ({lemma:/size/} > {} {lemma:/borderline/})
103 |
104 | # XXX could/might/may/possibly be present
105 | {} < {} ({lemma:/present/} > {dependency:/aux/} {lemma:/could|might|may|possibly|can/})
106 |
107 | # XXX is poorly evaluated
108 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/poorly/})
109 |
110 | # XXX is incompletely evaluated
111 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/incompletely/})
112 |
113 | # XXX is not well visualized/evaluated
114 | {} < {} ({lemma:/evaluate|visualize/} >{dependency:/neg/} {})
115 | {} > {} ({lemma:/evaluate|visualize/} > {dependency:/neg/} {})
116 |
117 | # obscuring the XXX | XXX is obscured | obscured XXX
118 | {} < {} {lemma:/obscure/}
119 |
120 | # XXX could appear
121 | {} < {dependency:/nsubj/} ({lemma:/appear/} > {} {lemma:/could|may|might|can/})
122 |
123 | # may be consistent/compatible with XXX
124 | {} < {dependency:/nmod:with/} ({lemma:/consistent/} > {} {lemma:/may|might|can|could/})
125 |
126 | # correlate clinically for XXX
127 | {} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/})
128 |
129 | # correlate clinically for evidence of XXX
130 | {} < {dependency:/nmod:of/} ({lemma:/evidence|sign|signs|symptoms|symptom/} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/}))
131 |
132 | # XXX are not clearly seen
133 | {} < {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/})
134 | {} > {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/})
135 |
136 | # possibly reflecting a XXX
137 | {} < {} ({lemma:/reflect/} > {} {lemma:/possibly/})
138 |
139 | # XXX was not appreciated
140 | {} < {} ({lemma:/appreciate/} > {dependency:/neg/} {})
141 |
142 | # XXX may|might|could (also) have this appearance
143 | {} < {} (({lemma:/have/} > {} {lemma:/may|might|could/}) > {} {lemma:/appearance/})
144 |
145 | # vascular congestion
146 | # pulmonary congestion
147 | # indistinctness
148 | # vascular prominence
149 | {lemma:/congestion/} > {} {lemma:/vascular/}
150 | {lemma:/congestion/} > {} {lemma:/pulmonary/}
151 | {lemma:/indistinctness/}
152 | {lemma:/prominence/} > {} {lemma:/vascular/}
153 |
154 | # XXX or YYY
155 | {} > {dependency:/conj:or/} {}
156 | {} < {dependency:/conj:or/} {}
157 |
158 |
--------------------------------------------------------------------------------
/negbio/chexpert/patterns/pre_negation_uncertainty.txt:
--------------------------------------------------------------------------------
1 | # Reserved for uncertainty rules that need to be matched first.
2 |
3 | # cannot exclude some XXX
4 | {} < {} ({lemma:/exclude/} >{} {lemma:/cannot/})
5 |
6 | # XXX is not excluded
7 | {} < {} ({lemma:/exclude/} > {dependency:/neg/} {})
8 |
9 | # no new XXX
10 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/}
11 | {} < {} ({lemma:/new/} > {} {lemma:/no/})
12 | {} < {dependency:/compound/} ({} > {} {lemma:/new/} & > {} {lemma:/no/})
13 |
14 | # no new area of XXX
15 | {} < {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/})
16 | {} > {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/})
17 |
18 | # cannot rule out XXX
19 | {} <{dependency:/nmod:out/} ({lemma:/rule/} > {} {lemma:/cannot/})
20 |
21 | # no evidence to rule out XXX
22 |
23 | {} < {dependency:/nmod:out/} ({lemma:/rule/} < {} ({lemma:/evidence/} > {} {lemma:/no/}))
24 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/airspace_opacity.txt:
--------------------------------------------------------------------------------
1 | opaci
2 | decreased translucency
3 | increased density
4 | airspace disease
5 | air-space disease
6 | air space disease
7 | infiltrate
8 | infiltration
9 | interstitial marking
10 | interstitial pattern
11 | interstitial lung
12 | reticular pattern
13 | reticular marking
14 | reticulation
15 | parenchymal scarring
16 | peribronchial thickening
17 | wall thickening
18 | scar
19 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/atelectasis.txt:
--------------------------------------------------------------------------------
1 | atelecta
2 | collapse
3 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/cardiomegaly.txt:
--------------------------------------------------------------------------------
1 | cardiomegaly
2 | the heart
3 | heart size
4 | cardiac enlargement
5 | cardiac size
6 | cardiac shadow
7 | cardiac contour
8 | cardiac silhouette
9 | enlarged heart
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/consolidation.txt:
--------------------------------------------------------------------------------
1 | consolidat
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/edema.txt:
--------------------------------------------------------------------------------
1 | edema
2 | heart failure
3 | chf
4 | vascular congestion
5 | pulmonary congestion
6 | indistinctness
7 | vascular prominence
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/enlarged_cardiomediastinum.txt:
--------------------------------------------------------------------------------
1 | _mediastinum
2 | cardiomediastinum
3 | contour
4 | mediastinal configuration
5 | mediastinal silhouette
6 | pericardial silhouette
7 | cardiac silhouette and vascularity
8 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/fracture.txt:
--------------------------------------------------------------------------------
1 | fracture
2 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/lung_lesion.txt:
--------------------------------------------------------------------------------
1 | mass
2 | nodular density
3 | nodular densities
4 | nodular opacity
5 | nodular opacities
6 | nodular opacification
7 | nodule
8 | lump
9 | cavitary lesion
10 | carcinoma
11 | neoplasm
12 | tumor
13 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/no_finding.txt:
--------------------------------------------------------------------------------
1 | emphysema
2 | blunt
3 | density
4 | elevation
5 | eventration
6 | scoliosis
7 | degenera
8 | calcifi
9 | hyperinflation
10 | bronchospasm
11 | asthma
12 | hernia
13 | copd
14 | interstitial markings
15 | plaque
16 | osteophytosis
17 | aortic disease
18 | bronchiolitis
19 | airways disease
20 | thickening
21 | cephalization
22 | aspiration
23 | bullae
24 | hyperinflat
25 | contusion
26 | atherosclero
27 | osteopenia
28 | metastasis
29 | granuloma
30 | pneumomediastinum
31 | pneumoperitoneum
32 | osteodystrophy
33 | cuffing
34 | irregular lucency
35 | inflam
36 | fissure
37 | hypertension
38 | prominen
39 | kyphosis
40 | defib
41 | hyperexpansion
42 | bullet
43 | reticula
44 | thoracentesis
45 | bronchitis
46 | volume loss
47 | deformity
48 | hemorrhage
49 | hematoma
50 | radiopaque
51 | aerophagia
52 | arthropathy
53 | tracheostomy
54 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pleural_effusion.txt:
--------------------------------------------------------------------------------
1 | pleural fluid
2 | effusion
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pleural_other.txt:
--------------------------------------------------------------------------------
1 | pleural thickening
2 | fibrosis
3 | fibrothorax
4 | pleural scar
5 | pleural parenchymal scar
6 | pleuro-parenchymal scar
7 | pleuro-pericardial scar
8 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pneumonia.txt:
--------------------------------------------------------------------------------
1 | pneumonia
2 | infection
3 | infectious process
4 | infectious
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pneumothorax.txt:
--------------------------------------------------------------------------------
1 | pneumothorax
2 | pneumothoraces
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/support_devices.txt:
--------------------------------------------------------------------------------
1 | pacer
2 | _line_
3 | lines
4 | picc
5 | tube
6 | valve
7 | catheter
8 | pacemaker
9 | hardware
10 | arthroplast
11 | marker
12 | icd
13 | defib
14 | device
15 | drain_
16 | plate
17 | screw
18 | cannula
19 | apparatus
20 | coil
21 | support
22 | equipment
23 | mediport
24 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/airspace_opacity.txt:
--------------------------------------------------------------------------------
1 | pleural scar
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/lung_lesion.txt:
--------------------------------------------------------------------------------
1 | calcified nodul
2 | massive
3 | massengale
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/pleural_effusion.txt:
--------------------------------------------------------------------------------
1 | pericardial effusion
--------------------------------------------------------------------------------
/negbio/chexpert/stages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/chexpert/stages/__init__.py
--------------------------------------------------------------------------------
/negbio/chexpert/stages/aggregate.py:
--------------------------------------------------------------------------------
1 | """Define mention aggregator class."""
2 | import numpy as np
3 | from tqdm import tqdm
4 |
5 | from negbio.chexpert.constants import NEGATIVE, UNCERTAIN, POSITIVE, SUPPORT_DEVICES, NO_FINDING, OBSERVATION, \
6 | NEGATION, UNCERTAINTY, CARDIOMEGALY
7 |
8 |
9 | class Aggregator(object):
10 | """Aggregate mentions of observations from radiology reports."""
11 |
12 | def __init__(self, categories, verbose=False):
13 | self.categories = categories
14 |
15 | self.verbose = verbose
16 |
17 | def dict_to_vec(self, d):
18 | """
19 | Convert a dictionary of the form
20 |
21 | {cardiomegaly: [1],
22 | opacity: [u, 1],
23 | fracture: [0]}
24 |
25 | into a vector of the form
26 |
27 | [np.nan, np.nan, 1, u, np.nan, ..., 0, np.nan]
28 | """
29 | vec = []
30 | for category in self.categories:
31 | # There was a mention of the category.
32 | if category in d:
33 | label_list = d[category]
34 | # Only one label, no conflicts.
35 | if len(label_list) == 1:
36 | vec.append(label_list[0])
37 | # Multiple labels.
38 | else:
39 | # Case 1. There is negated and uncertain.
40 | if NEGATIVE in label_list and UNCERTAIN in label_list:
41 | vec.append(UNCERTAIN)
42 | # Case 2. There is negated and positive.
43 | elif NEGATIVE in label_list and POSITIVE in label_list:
44 | vec.append(POSITIVE)
45 | # Case 3. There is uncertain and positive.
46 | elif UNCERTAIN in label_list and POSITIVE in label_list:
47 | vec.append(POSITIVE)
48 | # Case 4. All labels are the same.
49 | else:
50 | vec.append(label_list[0])
51 |
52 | # No mention of the category
53 | else:
54 | vec.append(np.nan)
55 |
56 | return vec
57 |
58 | def aggregate(self, collection):
59 | labels = []
60 | documents = collection.documents
61 | if self.verbose:
62 | print("Aggregating mentions...")
63 | documents = tqdm(documents)
64 | for document in documents:
65 | label_dict = {}
66 | impression_passage = document.passages[0]
67 | no_finding = True
68 | for annotation in impression_passage.annotations:
69 | category = annotation.infons[OBSERVATION]
70 |
71 | if NEGATION in annotation.infons:
72 | label = NEGATIVE
73 | elif UNCERTAINTY in annotation.infons:
74 | label = UNCERTAIN
75 | else:
76 | label = POSITIVE
77 |
78 | # If at least one non-support category has an uncertain or
79 | # positive label, there was a finding
80 | if (category != SUPPORT_DEVICES and
81 | label in [UNCERTAIN, POSITIVE]):
82 | no_finding = False
83 |
84 | # Don't add any labels for No Finding
85 | if category == NO_FINDING:
86 | continue
87 |
88 | # add exception for 'chf' and 'heart failure'
89 | if ((label in [UNCERTAIN, POSITIVE]) and
90 | (annotation.text == 'chf' or
91 | annotation.text == 'heart failure')):
92 | if CARDIOMEGALY not in label_dict:
93 | label_dict[CARDIOMEGALY] = [UNCERTAIN]
94 | else:
95 | label_dict[CARDIOMEGALY].append(UNCERTAIN)
96 |
97 | if category not in label_dict:
98 | label_dict[category] = [label]
99 | else:
100 | label_dict[category].append(label)
101 |
102 | if no_finding:
103 | label_dict[NO_FINDING] = [POSITIVE]
104 |
105 | label_vec = self.dict_to_vec(label_dict)
106 |
107 | labels.append(label_vec)
108 |
109 | return np.array(labels)
110 |
111 |
112 | class NegBioAggregator(Aggregator):
113 | LABEL_MAP = {UNCERTAIN: 'Uncertain', POSITIVE: 'Positive', NEGATIVE: 'Negative'}
114 |
115 | def aggregate_doc(self, document):
116 | """
117 | Aggregate mentions of observations from radiology reports.
118 |
119 | Args:
120 | document (BioCDocument):
121 |
122 | Returns:
123 | BioCDocument
124 | """
125 | label_dict = {}
126 | no_finding = True
127 | for passage in document.passages:
128 | for annotation in passage.annotations:
129 | category = annotation.infons[OBSERVATION]
130 |
131 | if NEGATION in annotation.infons:
132 | label = NEGATIVE
133 | elif UNCERTAINTY in annotation.infons:
134 | label = UNCERTAIN
135 | else:
136 | label = POSITIVE
137 |
138 | # If at least one non-support category has an uncertain or
139 | # positive label, there was a finding
140 | if category != SUPPORT_DEVICES \
141 | and label in [UNCERTAIN, POSITIVE]:
142 | no_finding = False
143 |
144 | # Don't add any labels for No Finding
145 | if category == NO_FINDING:
146 | continue
147 |
148 | # add exception for 'chf' and 'heart failure'
149 | if label in [UNCERTAIN, POSITIVE] \
150 | and (annotation.text == 'chf' or annotation.text == 'heart failure'):
151 | if CARDIOMEGALY not in label_dict:
152 | label_dict[CARDIOMEGALY] = [UNCERTAIN]
153 | else:
154 | label_dict[CARDIOMEGALY].append(UNCERTAIN)
155 |
156 | if category not in label_dict:
157 | label_dict[category] = [label]
158 | else:
159 | label_dict[category].append(label)
160 |
161 | if no_finding:
162 | label_dict[NO_FINDING] = [POSITIVE]
163 |
164 | for category in self.categories:
165 | key = 'CheXpert/{}'.format(category)
166 | # There was a mention of the category.
167 | if category in label_dict:
168 | label_list = label_dict[category]
169 | # Only one label, no conflicts.
170 | if len(label_list) == 1:
171 | document.infons[key] = self.LABEL_MAP[label_list[0]]
172 | # Multiple labels.
173 | else:
174 | # Case 1. There is negated and uncertain.
175 | if NEGATIVE in label_list and UNCERTAIN in label_list:
176 | document.infons[key] = self.LABEL_MAP[UNCERTAIN]
177 | # Case 2. There is negated and positive.
178 | elif NEGATIVE in label_list and POSITIVE in label_list:
179 | document.infons[key] = self.LABEL_MAP[POSITIVE]
180 | # Case 3. There is uncertain and positive.
181 | elif UNCERTAIN in label_list and POSITIVE in label_list:
182 | document.infons[key] = self.LABEL_MAP[POSITIVE]
183 | # Case 4. All labels are the same.
184 | else:
185 | document.infons[key] = self.LABEL_MAP[label_list[0]]
186 |
187 | # No mention of the category
188 | else:
189 | pass
190 | return document
191 |
--------------------------------------------------------------------------------
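The conflict-resolution rules above can be exercised directly through dict_to_vec: negative together with uncertain yields uncertain, and anything together with positive yields positive. A small sketch:

    from negbio.chexpert.constants import CATEGORIES, POSITIVE, NEGATIVE, UNCERTAIN
    from negbio.chexpert.stages.aggregate import Aggregator

    agg = Aggregator(CATEGORIES)
    vec = agg.dict_to_vec({
        'Cardiomegaly': [POSITIVE],         # single label, kept as-is
        'Edema': [NEGATIVE, UNCERTAIN],     # conflict -> UNCERTAIN
        'Pneumonia': [NEGATIVE, POSITIVE],  # conflict -> POSITIVE
    })
    # vec has len(CATEGORIES) entries; categories absent from the dict are np.nan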
/negbio/chexpert/stages/classify.py:
--------------------------------------------------------------------------------
1 | """Define mention classifier class.
2 |
3 | Author: stanfordmlgroup
4 | Modified by: Yifan Peng
5 | """
6 | import logging
7 |
8 | from negbio import ngrex
9 | from negbio.chexpert.constants import *
10 | from negbio.neg import semgraph, propagator, neg_detector
11 |
12 |
13 | class ModifiedDetector(neg_detector.Detector):
14 | """Child class of NegBio Detector class.
15 |
16 | Overrides parent methods __init__, detect, and match_uncertainty, and adds match_prenegation_uncertainty.
17 | """
18 |
19 | def __init__(self, pre_negation_uncertainty_path,
20 | negation_path, post_negation_uncertainty_path):
21 | super(ModifiedDetector, self).__init__(negation_path, post_negation_uncertainty_path)
22 | self.preneg_uncertain_patterns = ngrex.load(pre_negation_uncertainty_path)
23 |
24 | def detect(self, sentence, locs):
25 | """Detect rules in report sentences.
26 |
27 | Args:
28 | sentence(BioCSentence): a sentence with universal dependencies
29 | locs(list): a list of (begin, end)
30 |
31 | Yields:
32 | (str, MatcherObj, (begin, end)): negation or uncertainty,
33 | matcher, matched annotation
34 | """
35 | try:
36 | g = semgraph.load(sentence)
37 | propagator.propagate(g)
38 | except Exception:
39 | logging.exception('Cannot parse dependency graph [offset=%s]', sentence.offset)
40 | raise
41 | else:
42 | for loc in locs:
43 | for node in neg_detector.find_nodes(g, loc[0], loc[1]):
44 | # Match pre-negation uncertainty rules first.
45 | preneg_m = self.match_prenegation_uncertainty(g, node)
46 | if preneg_m:
47 | yield UNCERTAINTY, preneg_m, loc
48 | else:
49 | # Then match negation rules.
50 | neg_m = self.match_neg(g, node)
51 | if neg_m:
52 | yield NEGATION, neg_m, loc
53 | else:
54 | # Finally match post-negation uncertainty rules.
55 | postneg_m = self.match_uncertainty(g, node)
56 | if postneg_m:
57 | yield UNCERTAINTY, postneg_m, loc
58 |
59 | def match_uncertainty(self, graph, node):
60 | for pattern in self.uncertain_patterns:
61 | for m in pattern.finditer(graph):
62 | n0 = m.group(0)
63 | if n0 == node:
64 | return m
65 |
66 | def match_prenegation_uncertainty(self, graph, node):
67 | for pattern in self.preneg_uncertain_patterns:
68 | for m in pattern.finditer(graph):
69 | n0 = m.group(0)
70 | if n0 == node:
71 | return m
72 |
73 |
--------------------------------------------------------------------------------
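Construction mirrors main_chexpert.py: the three pattern files are compiled once, then tried in a fixed order for every mention node, with the first match winning. A sketch, using the default pattern paths from main_chexpert.py (run from the repository root):

    from negbio.chexpert.stages.classify import ModifiedDetector

    detector = ModifiedDetector(
        'negbio/chexpert/patterns/pre_negation_uncertainty.txt',
        'negbio/chexpert/patterns/negation.txt',
        'negbio/chexpert/patterns/post_negation_uncertainty.txt')

    # detector.detect(sentence, locs) yields (label, matcher, loc) triples, where
    # label is 'uncertainty' or 'negation'; mentions with no match stay positive.

The ordering matters: "cannot exclude pneumonia" must be caught by the pre-negation uncertainty rules before the plain negation rules can claim it.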
/negbio/chexpert/stages/extract.py:
--------------------------------------------------------------------------------
1 | """Define observation extractor class."""
2 | import re
3 | import itertools
4 | from collections import defaultdict
5 | from tqdm import tqdm
6 | from negbio.chexpert.constants import CARDIOMEGALY, ENLARGED_CARDIOMEDIASTINUM, OBSERVATION
7 |
8 | import bioc
9 |
10 |
11 | class Extractor(object):
12 | """Extract observations from impression sections of reports."""
13 | def __init__(self, mention_phrases_dir, unmention_phrases_dir,
14 | verbose=False):
15 | self.verbose = verbose
16 | self.observation2mention_phrases\
17 | = self.load_phrases(mention_phrases_dir, "mention")
18 | self.observation2unmention_phrases\
19 | = self.load_phrases(unmention_phrases_dir, "unmention")
20 | self.add_unmention_phrases()
21 |
22 | def load_phrases(self, phrases_dir, phrases_type):
23 | """Read in map from observations to phrases for matching."""
24 | observation2phrases = defaultdict(list)
25 | for phrases_path in phrases_dir.glob("*.txt"):
26 | with phrases_path.open() as f:
27 | for line in f:
28 | phrase = line.strip().replace("_", " ")
29 | observation = phrases_path.stem.replace("_", " ").title()
30 | if phrase:  # 'line' is always truthy here; test the stripped phrase instead
31 | observation2phrases[observation].append(phrase)
32 |
33 | if self.verbose:
34 | print("Loading {} phrases for {} observations.".format(phrases_type, len(observation2phrases)))
35 |
36 | return observation2phrases
37 |
38 | def add_unmention_phrases(self):
39 | cardiomegaly_mentions\
40 | = self.observation2mention_phrases[CARDIOMEGALY]
41 | enlarged_cardiom_mentions\
42 | = self.observation2mention_phrases[ENLARGED_CARDIOMEDIASTINUM]
43 | positional_phrases = (["over the", "overly the", "in the"],
44 | ["", " superior", " left", " right"])
45 | positional_unmentions = [e1 + e2
46 | for e1 in positional_phrases[0]
47 | for e2 in positional_phrases[1]]
48 | cardiomegaly_unmentions = [e1 + " " + e2.replace("the ", "")
49 | for e1 in positional_unmentions
50 | for e2 in cardiomegaly_mentions
51 | if e2 not in ["cardiomegaly",
52 | "cardiac enlargement"]]
53 | enlarged_cardiomediastinum_unmentions\
54 | = [e1 + " " + e2
55 | for e1 in positional_unmentions
56 | for e2 in enlarged_cardiom_mentions]
57 |
58 | self.observation2unmention_phrases[CARDIOMEGALY]\
59 | = cardiomegaly_unmentions
60 | self.observation2unmention_phrases[ENLARGED_CARDIOMEDIASTINUM]\
61 | = enlarged_cardiomediastinum_unmentions
62 |
63 | def overlaps_with_unmention(self, sentence, observation, start, end):
64 | """Return True if a given match overlaps with an unmention phrase."""
65 | unmention_overlap = False
66 | unmention_list = self.observation2unmention_phrases.get(observation,
67 | [])
68 | for unmention in unmention_list:
69 | unmention_matches = re.finditer(unmention, sentence.text)
70 | for unmention_match in unmention_matches:
71 | unmention_start, unmention_end = unmention_match.span(0)
72 | if start < unmention_end and end > unmention_start:
73 | unmention_overlap = True
74 | break # break early if overlap is found
75 | if unmention_overlap:
76 | break # break early if overlap is found
77 |
78 | return unmention_overlap
79 |
80 | def add_match(self, impression, sentence, ann_index, phrase,
81 | observation, start, end):
82 | """Add the match data and metadata to the impression object
83 | in place."""
84 | annotation = bioc.BioCAnnotation()
85 | annotation.id = ann_index
86 | annotation.infons['CUI'] = None
87 | annotation.infons['semtype'] = None
88 | annotation.infons['term'] = phrase
89 | annotation.infons[OBSERVATION] = observation
90 | annotation.infons['annotator'] = 'CheXpert labeler'
91 | length = end - start
92 | annotation.add_location(bioc.BioCLocation(sentence.offset + start,
93 | length))
94 | annotation.text = sentence.text[start:start+length]
95 |
96 | impression.annotations.append(annotation)
97 |
98 | def extract(self, collection):
99 | """Extract the observations in each report.
100 |
101 | Args:
102 | collection (BioCCollection): Impression passages of each report.
103 |
104 | Return:
105 | None; annotations are added to the collection in place.
106 | """
107 |
108 | # The BioCCollection consists of a series of documents.
109 | # Each document is a report (just the Impression section
110 | # of the report.)
111 | documents = collection.documents
112 | if self.verbose:
113 | print("Extracting mentions...")
114 | documents = tqdm(documents)
115 | for document in documents:
116 | # Get the Impression section.
117 | impression = document.passages[0]
118 | annotation_index = itertools.count(len(impression.annotations))
119 |
120 | for sentence in impression.sentences:
121 | obs_phrases = self.observation2mention_phrases.items()
122 | for observation, phrases in obs_phrases:
123 | for phrase in phrases:
124 | matches = re.finditer(phrase, sentence.text)
125 | for match in matches:
126 | start, end = match.span(0)
127 |
128 | if self.overlaps_with_unmention(sentence,
129 | observation,
130 | start,
131 | end):
132 | continue
133 |
134 | self.add_match(impression,
135 | sentence,
136 | str(next(annotation_index)),
137 | phrase,
138 | observation,
139 | start,
140 | end)
141 |
142 |
143 | class NegBioExtractor(Extractor):
144 | def extract_doc(self, document):
145 | annotation_index = itertools.count()
146 | for passage in document.passages:
147 | for sentence in passage.sentences:
148 | obs_phrases = self.observation2mention_phrases.items()
149 | for observation, phrases in obs_phrases:
150 | for phrase in phrases:
151 | matches = re.finditer(phrase, sentence.text)
152 | for match in matches:
153 | start, end = match.span(0)
154 | if self.overlaps_with_unmention(sentence, observation, start, end):
155 | continue
156 | self.add_match(passage, sentence, str(next(annotation_index)), phrase,
157 | observation, start, end)
158 | return document
159 |
160 | def extract_all(self, collection):
161 | """Extract the observations in each report."""
162 | annotation_index = itertools.count()
163 | for doc in collection.documents:
164 | for passage in doc.passages:
165 | for sentence in passage.sentences:
166 | obs_phrases = self.observation2mention_phrases.items()
167 | for observation, phrases in obs_phrases:
168 | for phrase in phrases:
169 | matches = re.finditer(phrase, sentence.text)
170 | for match in matches:
171 | start, end = match.span(0)
172 | if self.overlaps_with_unmention(sentence, observation, start, end):
173 | continue
174 | self.add_match(passage, sentence, str(next(annotation_index)), phrase,
175 | observation, start, end)
176 | return collection
177 |
--------------------------------------------------------------------------------
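Note that each line of a phrase file is passed to re.finditer as an (uncompiled) regular expression, so phrases match as substrings: the stem "opaci" covers opacity, opacities, and opacification. Underscores become spaces in load_phrases, so entries like "_line_" in support_devices.txt demand surrounding spaces and cannot match inside a longer word; the unmention lists (e.g. "massive" for the "mass" phrase) then veto overlapping matches. A standalone sketch of the core matching step:

    import re

    sentence_text = "increased airspace opacities in the left lung base"
    for phrase in ("opaci", "infiltrate"):
        for m in re.finditer(phrase, sentence_text):
            # add_match later shifts these offsets by sentence.offset
            print(phrase, m.span(0))  # -> opaci (19, 24); no match for infiltrate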
/negbio/chexpert/stages/load.py:
--------------------------------------------------------------------------------
1 | """Define report loader class."""
2 | import re
3 |
4 | from negbio.pipeline.section_split import split_document
5 |
6 |
7 | def _maketrans(s):
8 | s = s.replace(',', ', ')
9 | s = s.replace('.', '. ')
10 | return s
11 |
12 |
13 | def extract_impression_from_passages(document):
14 | """Extract the Impression section from a Bioc Document."""
15 | document.passages = [passage for passage in document.passages
16 | if passage.infons['title'] == "impression"]
17 |
18 | assert len(document.passages) <= 1, "The document contains {} impression passages.".format(len(document.passages))
19 |
20 | assert len(document.passages) >= 1, "The document contains no explicit impression passage."
21 |
22 |
23 | class NegBioLoader(object):
24 | """Report impression loader."""
25 | def __init__(self, extract_impression=False):
26 | self.extract_impression = extract_impression
27 | # self.punctuation_spacer = string.maketrans({key: "{} ".format(key)
28 | # for key in ".,"})
29 | # self.stop_spacer = string.maketrans('.', '. ')
30 | # self.comma_spacer = string.maketrans(',', ', ')
31 |
32 | def clean_doc(self, document):
33 | """Load and clean the reports."""
34 | for passage in document.passages:
35 | passage.text = self.clean(passage.text)
36 |
37 | if self.extract_impression:
38 | document = split_document(document)
39 | extract_impression_from_passages(document)
40 |
41 | return document
42 |
43 | def clean(self, report):
44 | """Clean the report text."""
45 | lower_report = report.lower()
46 | # Change `and/or` to `or`.
47 | corrected_report = re.sub('and/or',
48 | 'or',
49 | lower_report)
50 | # Change any `XXX/YYY` to `XXX or YYY`.
51 | corrected_report = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])',
52 | ' or ',
53 | corrected_report)
54 | # Clean double periods
55 | clean_report = corrected_report.replace("..", ".")
56 | # Insert space after commas and periods.
57 | clean_report = _maketrans(clean_report)
58 | # Convert any multi white spaces to single white spaces.
59 | clean_report = ' '.join(clean_report.split())
60 |
61 | return clean_report
62 |
--------------------------------------------------------------------------------
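The effect of clean() is easiest to see on a worked example; each comment below corresponds to one substitution in the method:

    from negbio.chexpert.stages.load import NegBioLoader

    loader = NegBioLoader()
    text = "Pneumonia and/or edema..  No focal consolidation/effusion."
    print(loader.clean(text))
    # lowercased; 'and/or' -> 'or'; letter/letter -> ' or '; '..' -> '.';
    # spaces inserted after '.' and ','; whitespace collapsed:
    # -> pneumonia or edema. no focal consolidation or effusion.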
/negbio/cli_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import docopt
5 |
6 |
7 | __root__ = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))))
8 |
9 |
10 | def get_args(args):
11 | s = ''
12 | for k in args:
13 | s += ' {}: {}\n'.format(k, args[k])
14 | return s
15 |
16 |
17 | def parse_args(doc, **kwargs):
18 | argv = docopt.docopt(doc, **kwargs)
19 | if argv['--verbose']:
20 | logging.basicConfig(level=logging.DEBUG)
21 | else:
22 | logging.basicConfig(level=logging.INFO)
23 | logging.debug('Arguments:\n%s', get_args(argv))
24 | return argv
25 |
26 |
27 | def get_absolute_path(argv, key, default_value):
28 | logging.debug('NegBio root directory: %s', __root__)
29 | if argv[key] == default_value:
30 | argv[key] = os.path.join(__root__, argv[key])
31 | return argv
--------------------------------------------------------------------------------
/negbio/compat.py:
--------------------------------------------------------------------------------
1 | """
2 | Python 3 compatibility tools.
3 | """
4 | import sys
5 |
6 | try:
7 | from pathlib import Path, PurePath
8 | except ImportError:
9 | try:
10 | from pathlib2 import Path, PurePath
11 | except ImportError:
12 | Path = PurePath = None
13 |
14 | if sys.version_info[0] >= 3:
15 | basestring = str
16 | else:
17 | basestring = basestring
18 |
19 |
20 | def is_pathlib_path(obj):
21 | """
22 | Check whether obj is a pathlib.Path object.
23 | Prefer using `isinstance(obj, os_PathLike)` instead of this function.
24 | """
25 | return Path is not None and isinstance(obj, Path)
26 |
--------------------------------------------------------------------------------
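A small usage sketch: Path resolves to pathlib on Python 3, pathlib2 on Python 2, or None when neither is available, and is_pathlib_path guards against the last case:

    from negbio.compat import Path, is_pathlib_path

    if Path is not None:
        p = Path('negbio/patterns')
        assert is_pathlib_path(p)                  # True for Path instances
    assert not is_pathlib_path('negbio/patterns')  # plain strings are rejected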
/negbio/ext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/ext/__init__.py
--------------------------------------------------------------------------------
/negbio/ext/normalize_mimiccxr.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 |
5 | def pattern_repl(matchobj):
6 | """
7 | Replace [**Patterns**] with spaces.
8 | """
9 | s = matchobj.group(0).lower()
10 | return ' '.rjust(len(s))
11 |
12 |
13 | def sub(text):
14 | text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
15 | text = re.sub(r'_', ' ', text)
16 | return text
17 |
18 |
19 | def find_start(text):
20 | return 0
21 |
22 |
23 | def find_end(text):
24 | ends = [len(text)]
25 | patterns = [
26 | re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
27 | re.compile(r'\n {3,}DR.', re.I),
28 | re.compile(r'[ ]{1,}RADLINE ', re.I),
29 | re.compile(r'.*electronically signed on', re.I),
30 | re.compile(r'M\[0KM\[0KM')
31 | ]
32 | for pattern in patterns:
33 | m = pattern.search(text)
34 | if m:
35 | ends.append(m.start())
36 | return min(ends)
37 |
38 |
39 | def trim(text):
40 | text = sub(text)
41 | start = find_start(text)
42 | end = find_end(text)
43 |
44 | new_text = ''
45 | if start > 0:
46 | new_text += ' ' * start
47 | new_text += text[start:end]
48 | if len(text) - end > 0:
49 | new_text += ' ' * (len(text) - end)
50 | return new_text
51 |
52 |
53 | def normalize(document):
54 | """
55 | Assume there is only one passage in the document
56 | """
57 | try:
58 | if len(document.passages) == 0:
59 | logging.warning('Skipped: there is no text in document %s', document.id)
60 | elif len(document.passages) > 1:
61 | logging.warning('Skipped: there is more than one passage in document %s', document.id)
62 | else:
63 | document.passages[0].text = trim(document.passages[0].text)
64 | return document
65 | except Exception:
66 | logging.exception('Cannot find text in document %s', document.id)
67 |
--------------------------------------------------------------------------------
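trim() is length-preserving by construction: de-identification placeholders and trailing sign-off boilerplate are overwritten with runs of spaces instead of being deleted, so character offsets computed on the normalized text still line up with the original report. A quick check of that invariant:

    from negbio.ext.normalize_mimiccxr import trim

    text = "FINAL REPORT [**Hospital 1**]\nNo acute process."
    out = trim(text)
    assert len(out) == len(text)   # offsets preserved
    assert '[**' not in out        # placeholder blanked out with spaces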
/negbio/main_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negative and uncertain findings from SOURCE and output to DEST
3 | Example: python negbio/main_chexpert.py text --output=examples/test.neg.xml examples/1.txt examples/2.txt
4 | python negbio/main_chexpert.py bioc --output=examples/test.neg.xml examples/1.xml
5 |
6 | Usage:
7 | main_chexpert text [options] --output=DEST SOURCES ...
8 | main_chexpert bioc [options] --output=DEST SOURCE
9 |
10 | Options:
11 | --mention_phrases_dir= Directory containing mention phrases for each observation.
12 | [default: negbio/chexpert/phrases/mention]
13 | --unmention_phrases_dir= Directory containing unmention phrases for each observation.
14 | [default: negbio/chexpert/phrases/unmention]
15 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt]
16 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules
17 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt]
18 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules
19 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt]
20 | --bllip-model=MODEL_DIR Bllip parser model directory
21 | [default: ~/.local/share/bllipparser/GENIA+PubMed]
22 | --split-document Split document into passages based on section titles such as "Finding",
23 | "Impression"
24 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline
25 | is always a sentence break. False means to ignore newlines for the
26 | purpose of sentence splitting. This is appropriate for continuous text,
27 | when just the non-whitespace characters should be used to determine
28 | sentence breaks.
29 | --verbose Print more information about progress.
30 | """
31 | from __future__ import print_function
32 |
33 | import os
34 |
35 | import bioc
36 | import tqdm
37 | from pathlib2 import Path
38 |
39 | from negbio.chexpert.stages.aggregate import NegBioAggregator
40 | from negbio.chexpert.stages.classify import ModifiedDetector, CATEGORIES
41 | from negbio.chexpert.stages.extract import NegBioExtractor
42 | from negbio.chexpert.stages.load import NegBioLoader
43 | from negbio.cli_utils import parse_args, get_absolute_path
44 | from negbio.pipeline import text2bioc, negdetect
45 | from negbio.pipeline.parse import NegBioParser
46 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
47 | from negbio.pipeline.ssplit import NegBioSSplitter
48 |
49 |
50 | def pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator, verbose=False):
51 | """
52 | Args:
53 | loader (NegBioLoader)
54 | ssplitter (NegBioSSplitter)
55 | parser (NegBioParser)
56 | extractor (NegBioExtractor)
57 | ptb2dep (NegBioPtb2DepConverter)
58 | neg_detector (ModifiedDetector)
59 | aggregator (NegBioAggregator)
60 | """
61 | # for document in collection.documents:
62 | #
63 | # for passage in document.passages:
64 | # passage.text = clean(passage.text)
65 | # ssplitter.split_doc(document)
66 | for document in tqdm.tqdm(collection.documents, disable=not verbose):
67 | document = loader.clean_doc(document)
68 | document = ssplitter.split_doc(document)
69 | document = extractor.extract_doc(document)
70 | document = parser.parse_doc(document)
71 | document = ptb2dep.convert_doc(document)
72 | document = negdetect.detect(document, neg_detector)
73 | document = aggregator.aggregate_doc(document)
74 | # remove sentence
75 | for passage in document.passages:
76 | del passage.sentences[:]
77 |
78 | return collection
79 |
80 |
81 | def main():
82 | argv = parse_args(__doc__, version='version 2')
83 | print(argv)
84 |
85 | lemmatizer = Lemmatizer()
86 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
87 | ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
88 | parser = NegBioParser(model_dir=argv['--bllip-model'])
89 |
90 | argv = get_absolute_path(argv,
91 | '--mention_phrases_dir',
92 | 'negbio/chexpert/phrases/mention')
93 | argv = get_absolute_path(argv,
94 | '--unmention_phrases_dir',
95 | 'negbio/chexpert/phrases/unmention')
96 | argv = get_absolute_path(argv,
97 | '--pre-negation-uncertainty-patterns',
98 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
99 | argv = get_absolute_path(argv,
100 | '--post-negation-uncertainty-patterns',
101 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt')
102 | argv = get_absolute_path(argv,
103 | '--neg-patterns',
104 | 'negbio/chexpert/patterns/negation.txt')
105 |
106 | # chexpert
107 | loader = NegBioLoader()
108 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
109 | Path(argv['--unmention_phrases_dir']),
110 | verbose=argv['--verbose'])
111 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
112 | argv['--neg-patterns'],
113 | argv['--post-negation-uncertainty-patterns'])
114 | aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])
115 |
116 | if argv['text']:
117 | collection = text2bioc.text2collection(argv['SOURCES'])
118 | elif argv['bioc']:
119 | with open(argv['SOURCE']) as fp:
120 | collection = bioc.load(fp)
121 | else:
122 | raise KeyError
123 |
124 | pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator,
125 | verbose=argv['--verbose'])
126 |
127 | with open(os.path.expanduser(argv['--output']), 'w') as fp:
128 | bioc.dump(collection, fp)
129 |
130 |
131 | if __name__ == '__main__':
132 | main()
133 |
--------------------------------------------------------------------------------
/negbio/main_mm.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negative and uncertain findings from SOURCE and output to DEST
3 | Example: python negbio/main_mm.py text --metamap=/opt/public_mm/bin/metamap16 --output=examples/test.neg.xml examples/1.txt examples/2.txt
4 |
5 | Usage:
6 | main_mm text [options] --metamap=BINARY --output=DEST SOURCES ...
7 | main_mm bioc [options] --metamap=BINARY --output=DEST SOURCE
8 |
9 | Options:
10 | --neg-patterns=FILE negation rules [default: negbio/patterns/neg_patterns.txt]
11 | --uncertainty-patterns=FILE uncertainty rules [default: negbio/patterns/uncertainty_patterns.txt]
12 | --bllip-model=MODEL_DIR Bllip parser model directory
13 | --split-document Split document into passages based on section titles such as "Finding", "Impression"
14 | --cuis=FILE CUI list. To keep all CUIs, set it to None [default: examples/cuis-cvpr2017.txt]
15 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline is always a
16 | sentence break. False means to ignore newlines for the purpose of sentence
17 | splitting. This is appropriate for continuous text, when just the non-whitespace
18 | characters should be used to determine sentence breaks.
19 | --word_sense_disambiguation Whether to use word sense disambiguation.
20 | --verbose Print more information about progress.
21 | """
22 | from __future__ import print_function
23 | import logging
24 | import sys
25 | import os
26 | import bioc
27 | import docopt
28 |
29 | import pymetamap
30 |
31 | from negbio.cli_utils import parse_args, get_absolute_path
32 | from negbio.pipeline import negdetect, text2bioc, dner_mm
33 | from negbio.negbio_dner_matamap import read_cuis
34 | from negbio.pipeline.parse import NegBioParser
35 | from negbio.pipeline.ssplit import NegBioSSplitter
36 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
37 |
38 |
39 | def pipeline(collection, metamap, splitter, parser, ptb2dep, neg_detector, cuis, extra_args):
40 | """
41 |
42 | Args:
43 | collection(BioCCollection):
44 | metamap(MetaMap): MetaMap instance
45 | splitter (NegBioSSplitter):
46 | parser (NegBioParser)
47 | ptb2dep (NegBioPtb2DepConverter)
48 | neg_detector (Detector):
49 |
50 | Returns:
51 | BioCCollection
52 | """
53 | for document in collection.documents:
54 | splitter.split_doc(document)
55 |
56 | dner_mm.run_metamap_col(collection, metamap, cuis, extra_args)
57 |
58 | for document in collection.documents:
59 | document = parser.parse_doc(document)
60 | document = ptb2dep.convert_doc(document)
61 | document = negdetect.detect(document, neg_detector)
62 | # remove sentence
63 | for passage in document.passages:
64 | del passage.sentences[:]
65 |
66 | return collection
67 |
68 |
69 | def main():
70 | argv = parse_args(__doc__, version='version 2')
71 | print(argv)
72 |
73 | lemmatizer = Lemmatizer()
74 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
75 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
76 | parser = NegBioParser(model_dir=argv['--bllip-model'])
77 |
78 | argv = get_absolute_path(argv,
79 | '--neg-patterns',
80 | 'negbio/patterns/neg_patterns.txt')
81 | argv = get_absolute_path(argv,
82 | '--uncertainty-patterns',
83 | 'negbio/patterns/uncertainty_patterns.txt')
84 |
85 | mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
86 | neg_detector = negdetect.Detector(argv['--neg-patterns'], argv['--uncertainty-patterns'])
87 |
88 | if argv['--cuis'] == 'None':
89 | cuis = None
90 | else:
91 | cuis = read_cuis(argv['--cuis'])
92 |
93 | if argv['text']:
94 | collection = text2bioc.text2collection(argv['SOURCES'])
95 | elif argv['bioc']:
96 | with open(argv['SOURCE']) as fp:
97 | collection = bioc.load(fp)
98 | else:
99 | raise KeyError
100 |
101 | extra_args = dict()
102 | if argv['--word_sense_disambiguation']:
103 | extra_args['word_sense_disambiguation'] = True
104 |
105 | # Converting empty dict to None
106 | if len(extra_args) == 0:
107 | extra_args = None
108 |
109 | pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis, extra_args)
110 |
111 | with open(os.path.expanduser(argv['--output']), 'w') as fp:
112 | bioc.dump(collection, fp)
113 |
114 |
115 | if __name__ == '__main__':
116 | main()
117 |
--------------------------------------------------------------------------------
/negbio/neg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/neg/__init__.py
--------------------------------------------------------------------------------
/negbio/neg/neg_detector.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import logging
4 |
5 | from negbio.neg import utils, semgraph, propagator
6 | from negbio import ngrex
7 |
8 | NEGATION = 'negation'
9 | UNCERTAINTY = 'uncertainty'
10 |
11 |
12 | class Detector(object):
13 |
14 | NEGATION = 'negation'
15 | UNCERTAINTY = 'uncertainty'
16 |
17 | def __init__(self,
18 | neg_pattern_file,
19 | uncertainty_pattern_file,
20 | sentence_rule=False):
21 | self.sentence_rule = sentence_rule
22 | self.neg_patterns = ngrex.load(neg_pattern_file)
23 | self.uncertain_patterns = ngrex.load(uncertainty_pattern_file)
24 |
25 | def detect(self, sentence, locs):
26 | """
27 | Args:
28 | sentence(BioCSentence): a sentence with universal dependencies
29 | locs(list): a list of (begin, end)
30 | Yields:
31 | (str, MatcherObj, (begin, end)): negation or uncertainty, matcher, matched annotation
32 | """
33 | try:
34 | g = semgraph.load(sentence)
35 | propagator.propagate(g)
36 |         except Exception:
37 | logging.exception('Cannot parse dependency graph [offset={}]'.format(sentence.offset))
38 | raise
39 | else:
40 | if self.sentence_rule and is_neg_graph1(g):
41 | for loc in locs:
42 | yield NEGATION, None, loc
43 | return
44 | for loc in locs:
45 | if self.sentence_rule and is_neg_graph2(g, loc[0], loc[1]):
46 | yield NEGATION, None, loc
47 | for node in find_nodes(g, loc[0], loc[1]):
48 | m = self.match_neg(g, node)
49 | if m:
50 | yield NEGATION, m, loc
51 | m = self.match_uncertainty(g, node)
52 | if m:
53 | yield UNCERTAINTY, m, loc
54 |
55 | def match_neg(self, graph, node):
56 | """
57 | Returns a matcher
58 | """
59 | for pattern in self.neg_patterns:
60 | for m in pattern.finditer(graph):
61 | n0 = m.group(0)
62 | if n0 == node:
63 | try:
64 | key = m.get('key')
65 | if semgraph.has_out_edge(graph, key, ['neg']):
66 | continue
67 |                     except KeyError:
68 |                         pass  # this pattern does not define a 'key' node
69 | if semgraph.has_out(graph, n0, ['new'], ['amod']):
70 | continue
71 | return m
72 | return None
73 |
74 | def match_uncertainty(self, graph, node):
75 | for pattern in self.uncertain_patterns:
76 | for m in pattern.finditer(graph):
77 | n0 = m.group(0)
78 | if n0 == node:
79 | return m
80 |
81 | # parsing error
82 | # suggestive of XXX
83 | p = ngrex.compile('{} <{dependency:/nmod:of/} {lemma:/suggestive/}')
84 | for m in p.finditer(graph):
85 | n0 = m.group(0)
86 | if n0 == node:
87 | if semgraph.has_out_node(graph, m.group(1), ['most']):
88 | return None
89 | elif semgraph.has_out(graph, n0, ['new', 'develop'], ['amod']):
90 | continue
91 | else:
92 | return m
93 | return None
94 |
95 |
96 | def find_nodes(graph, begin, end):
97 | for node in graph.nodes():
98 | if utils.intersect((begin, end), (graph.node[node]['start'], graph.node[node]['end'])):
99 | yield node
100 |
101 |
102 | def is_neg_graph1(graph):
103 | # no XXX
104 | # resolution of XXX
105 | if 'T0' in graph.node and graph.node['T0']['lemma'] in ['no', 'resolution', 'resolved']:
106 | # no verb
107 | has_verb = utils.contains(lambda x: graph.node[x]['tag'][0] == 'V', graph.nodes())
108 | if not has_verb:
109 | return True
110 | return False
111 |
112 |
113 | def is_neg_graph2(graph, begin, end):
114 | """
115 | Return True if the sentence is like "without [begin, end]"
116 |
117 | """
118 |
119 | # without n [, n]
120 | state = 0
121 | # sort nodes
122 | for node in sorted(graph.nodes(), key=lambda n: graph.node[n]['start']):
123 | if graph.node[node]['end'] > end:
124 | break
125 |
126 | if state == 0:
127 |             if graph.node[node]['lemma'] in (
128 |                     'without', 'no', 'resolve', 'resolution', 'rosolution'):  # 'rosolution' likely matches a common report typo
129 | state = 1
130 | elif state == 1:
131 | if graph.node[node]['tag'].startswith('N'):
132 | state = 1
133 | if utils.intersect((begin, end), (graph.node[node]['start'], graph.node[node]['end'])):
134 | return True
135 | elif graph.node[node]['tag'] in ('JJ', 'CC', ',', 'VBN'):
136 | state = 1
137 | else:
138 | return False
139 | return False
140 |
141 |
142 | def is_neg(annotation):
143 | return NEGATION in annotation.infons and annotation.infons[NEGATION] == 'True'
144 |
145 |
146 | def is_uncertain(annotation):
147 | return UNCERTAINTY in annotation.infons and annotation.infons[UNCERTAINTY] == 'True'
148 |
--------------------------------------------------------------------------------
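
The `is_neg`/`is_uncertain` helpers read the infons that `negbio.pipeline.negdetect.detect` writes onto annotations. A small sanity check, assuming only the `bioc` package:

import bioc

from negbio.neg.neg_detector import is_neg, is_uncertain

ann = bioc.BioCAnnotation()
ann.infons['negation'] = 'True'  # as written by the detector
assert is_neg(ann)
assert not is_uncertain(ann)
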
/negbio/neg/propagator.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import logging
4 |
5 | from negbio.neg import semgraph
6 | import collections
7 |
8 |
9 | Edge = collections.namedtuple('Edge', ['gov', 'dep', 'data'])
10 |
11 |
12 | def propagate(G):
13 |     """Propagate negation-related edges through graph G (at most two passes)."""
14 |     for _ in range(2):
15 | edges = []
16 | for node in G.nodes():
17 |             # handle coordinations like "hypoinflated but clear of ..."
18 | if G.node[node]['lemma'] == 'hypoinflated':
19 | for child in G.successors(node):
20 | edge_dep = G[node][child]['dependency']
21 | if G.node[child]['lemma'] == 'clear' and edge_dep == 'conj:but':
22 | for of in G.successors(node):
23 | of_dep = G[node][of]['dependency']
24 | if of_dep == 'nmod:of':
25 | edges.append(Edge(child, of, of_dep))
26 | break
27 |
28 | for p, c, d in G.edges(data=True):
29 | # propagate appos
30 | if d['dependency'] == 'appos':
31 | # x > y >appos > z
32 | for grandpa in G.predecessors(p):
33 | edge_dep = G[grandpa][p]['dependency']
34 | edges.append(Edge(grandpa, c, edge_dep))
35 | # x appos > z
36 | for child in G.successors(p):
37 | edge_dep = G[p][child]['dependency']
38 | if edge_dep == 'neg':
39 | edges.append(Edge(c, child, edge_dep))
40 | # propagate dep
41 | if d['dependency'] == 'dep' \
42 | and G.node[p]['tag'].startswith('N') \
43 | and G.node[c]['tag'].startswith('N'):
44 | for grandchild in G.successors(c):
45 | edge_dep = G[c][grandchild]['dependency']
46 | if edge_dep == 'neg':
47 | edges.append(Edge(p, grandchild, edge_dep))
48 | # propagate cop conjunction
49 | if d['dependency'].startswith('conj') \
50 | and G.node[p]['tag'].startswith('N') \
51 | and G.node[c]['tag'].startswith('N'):
52 | for child in G.successors(p):
53 | edge_dep = G[p][child]['dependency']
54 | if edge_dep in ('aux', 'cop', 'neg', 'amod'):
55 | edges.append(Edge(c, child, edge_dep))
56 | if edge_dep in ('dep', 'compound') and G.node[child]['lemma'] == 'no':
57 | edges.append(Edge(c, child, edge_dep))
58 | if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
59 | edges.append(Edge(c, child, edge_dep))
60 |
61 | # propagate area/amount >of XXX
62 | if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] in ('area', 'amount'):
63 | for grandpa in G.predecessors(p):
64 | edge_dep = G[grandpa][p]['dependency']
65 | edges.append(Edge(grandpa, c, edge_dep))
66 | # propagate combination of XXX
67 | if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] == 'combination':
68 | for grandpa in G.predecessors(p):
69 | edge_dep = G[grandpa][p]['dependency']
70 | edges.append(Edge(grandpa, c, edge_dep))
71 | if d['dependency'] == 'nmod:of':
72 | for child in G.successors(p):
73 | edge_dep = G[p][child]['dependency']
74 | # propagate no of XXX
75 | if edge_dep == 'neg':
76 | edges.append(Edge(c, child, edge_dep))
77 | # propagate without of XXX
78 |                     if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
79 | edges.append(Edge(c, child, edge_dep))
80 | # parse error
81 | # no xx and xxx
82 | if d['dependency'] == 'neg' and semgraph.has_out_node(G, p, ['or', 'and']):
83 | for child in G.successors(p):
84 | edge_dep = G[p][child]['dependency']
85 | if edge_dep == 'compound' and G.node[child]['tag'].startswith('N'):
86 | edges.append(Edge(child, c, 'neg'))
87 |
88 | has_more_edges = False
89 | for e in edges:
90 | if not G.has_edge(e.gov, e.dep):
91 |                 assert isinstance(e.data, (str, type(u''))), type(e.data)  # str or unicode on Py2, str on Py3
92 | G.add_edge(e.gov, e.dep, dependency=e.data)
93 | has_more_edges = True
94 |
95 | if not has_more_edges:
96 | break
97 |
98 |
--------------------------------------------------------------------------------
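
A toy illustration of the conjunction rule above, assuming the networkx 1.x API (`G.node`) used throughout this module: a negation attached to the first conjunct is copied to the second, so "no effusion or pneumothorax" negates both nouns.

import networkx as nx

from negbio.neg.propagator import propagate

# "no effusion or pneumothorax": the 'neg' edge initially hangs off 'effusion' only
G = nx.DiGraph()
G.add_node('T0', lemma='no', tag='DT', start=0, end=2)
G.add_node('T1', lemma='effusion', tag='NN', start=3, end=11)
G.add_node('T2', lemma='pneumothorax', tag='NN', start=15, end=27)
G.add_edge('T1', 'T0', dependency='neg')
G.add_edge('T1', 'T2', dependency='conj:or')

propagate(G)
assert G.has_edge('T2', 'T0')  # negation propagated across the conjunction
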
/negbio/neg/semgraph.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import networkx as nx
4 |
5 |
6 | def load(sentence):
7 | """
8 | Args:
9 | sentence(BioCSentence): a sentence with tag, text, lemma, start and end
10 |
11 | Returns:
12 | DiGraph: dependency graph
13 |
14 |     Examples:
15 |         ```xml
16 |         <annotation id="T0">
17 |             <infon key="tag">JJ</infon>
18 |             <infon key="lemma">small</infon>
19 |             <location offset="0" length="5"/>
20 |             <text>Small</text>
21 |         </annotation>
22 |         ```
23 | """
24 | graph = nx.DiGraph()
25 | for ann in sentence.annotations:
26 | loc = ann.get_total_location()
27 | graph.add_node(ann.id, tag=ann.infons['tag'], text=ann.text, lemma=ann.infons['lemma'].lower(),
28 | start=loc.offset, end=loc.offset + loc.length)
29 | for rel in sentence.relations:
30 | dependant = None
31 | governor = None
32 | for node in rel.nodes:
33 | if node.role == 'dependant':
34 | dependant = node.refid
35 | elif node.role == 'governor':
36 | governor = node.refid
37 |         if not dependant or not governor:
38 |             logging.debug('Cannot find dependant or governor at %s', sentence)
39 |             continue  # skip malformed relations instead of adding an edge with None endpoints
40 |         graph.add_edge(governor, dependant, dependency=rel.infons['dependency'], id=rel.id)
40 | return graph
41 |
42 |
43 | def has_out_edge(graph, node, dependencies):
44 | for _, _, d in graph.out_edges(node, data=True):
45 | if d['dependency'] in dependencies:
46 | return True
47 | return False
48 |
49 |
50 | def has_in_edge(graph, node, dependencies):
51 | for _, _, d in graph.in_edges(node, data=True):
52 | if d['dependency'] in dependencies:
53 | return True
54 | return False
55 |
56 |
57 | def has_out(graph, node, lemmas, dependencies):
58 | return get_out(graph, node, lemmas, dependencies) is not None
59 |
60 |
61 | def get_out(graph, node, lemmas, dependencies):
62 | for _, c, d in graph.out_edges(node, data=True):
63 | if d['dependency'] in dependencies and graph.node[c]['lemma'] in lemmas:
64 | return c
65 | return None
66 |
67 |
68 | def get_in(graph, node, lemmas, dependencies):
69 | for p, _, d in graph.in_edges(node, data=True):
70 | if d['dependency'] in dependencies and graph.node[p]['lemma'] in lemmas:
71 | return p
72 | return None
73 |
74 |
75 | def has_in(graph, node, lemmas, dependencies):
76 | return get_in(graph, node, lemmas, dependencies) is not None
77 |
78 |
79 | def has_out_node(graph, node, lemmas):
80 | for child in graph.successors(node):
81 | if graph.node[child]['lemma'] in lemmas:
82 | return True
83 | return False
84 |
85 |
86 | def has_in_node(graph, node, lemmas):
87 |     for parent in graph.predecessors(node):
88 |         if graph.node[parent]['lemma'] in lemmas:
89 |             return True
90 |     return False
91 |
--------------------------------------------------------------------------------
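
A quick sketch of the traversal helpers on a hand-built graph (networkx 1.x API, using the node attributes produced by `load`):

import networkx as nx

from negbio.neg import semgraph

G = nx.DiGraph()
G.add_node('T0', lemma='clear', tag='JJ', start=0, end=5)
G.add_node('T1', lemma='effusion', tag='NN', start=9, end=17)
G.add_edge('T0', 'T1', dependency='nmod:of')

assert semgraph.has_out_edge(G, 'T0', ['nmod:of'])
assert semgraph.get_out(G, 'T0', ['effusion'], ['nmod:of']) == 'T1'
assert semgraph.has_in(G, 'T1', ['clear'], ['nmod:of'])
assert not semgraph.has_out_node(G, 'T1', ['clear'])
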
/negbio/neg/utils.py:
--------------------------------------------------------------------------------
1 | def contains(func, iterable):
2 |     """
3 |     Return True if func returns True for at least one element of iterable.
4 |     """
5 | if func is None:
6 | func = bool
7 | for x in iterable:
8 | if func(x):
9 | return True
10 | return False
11 |
12 |
13 | def intersect(range1, range2):
14 |     """Return True if the two half-open ranges overlap.
15 |     Args:
16 |         range1(int, int): [begin, end)
17 |         range2(int, int): [begin, end)
18 |     """
19 | if range1[0] <= range2[0] < range1[1]:
20 | return True
21 | elif range1[0] < range2[1] <= range1[1]:
22 | return True
23 | elif range2[0] <= range1[0] < range2[1]:
24 | return True
25 | elif range2[0] < range1[1] <= range2[1]:
26 | return True
27 | return False
28 |
--------------------------------------------------------------------------------
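
Both helpers are small but easy to misread; note that `intersect` treats its ranges as half-open, so adjacent spans do not overlap:

from negbio.neg.utils import contains, intersect

assert intersect((0, 5), (3, 8))      # overlapping spans
assert not intersect((0, 3), (3, 8))  # adjacent half-open spans do not intersect
assert contains(lambda x: x > 2, [1, 2, 3])
assert not contains(None, [0, '', None])  # falls back to bool()
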
/negbio/negbio_clean.py:
--------------------------------------------------------------------------------
1 | """
2 | Clean up sentences
3 |
4 | Usage:
5 |     negbio_pipeline cleanup [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .negbio.xml]
9 |     --verbose               Print more information about progress.
10 |     --output=<directory>    Specify the output directory.
11 | """
12 |
13 | from negbio.cli_utils import parse_args
14 | from negbio.pipeline.cleanup import clean_sentences
15 | from negbio.pipeline.scan import scan_document
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
20 | fn=clean_sentences)
21 |
--------------------------------------------------------------------------------
/negbio/negbio_dner_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect concepts from vocab
3 |
4 | Usage:
5 |     negbio_pipeline dner_chexpert [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>                    Append an additional SUFFIX to file names. [default: .chexpert.xml]
9 |     --output=<directory>                 Specify the output directory.
10 |     --verbose                            Print more information about progress.
11 |     --mention_phrases_dir=<directory>    Directory containing mention phrases for each observation. [default: negbio/chexpert/phrases/mention]
12 |     --unmention_phrases_dir=<directory>  Directory containing unmention phrases for each observation. [default: negbio/chexpert/phrases/unmention]
13 | """
14 | from pathlib2 import Path
15 |
16 | from negbio.chexpert.stages.extract import NegBioExtractor
17 | from negbio.cli_utils import parse_args, get_absolute_path
18 | from negbio.pipeline.scan import scan_collection
19 |
20 |
21 | def run_extractor(collection, extractor):
22 | """
23 | Args:
24 | collection (BioCCollection):
25 | extractor (NegBioExtractor):
26 | """
27 | extractor.extract_all(collection)
28 |
29 |
30 | if __name__ == '__main__':
31 | argv = parse_args(__doc__)
32 |
33 | argv = get_absolute_path(argv,
34 | '--mention_phrases_dir',
35 | 'negbio/chexpert/phrases/mention')
36 | argv = get_absolute_path(argv,
37 | '--unmention_phrases_dir',
38 | 'negbio/chexpert/phrases/unmention')
39 |
40 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
41 | Path(argv['--unmention_phrases_dir']),
42 | verbose=argv['--verbose'])
43 |     scan_collection(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
44 | fn=run_extractor, non_sequences=[extractor])
45 |
--------------------------------------------------------------------------------
/negbio/negbio_dner_matamap.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect UMLS concepts
3 |
4 | Usage:
5 |     negbio_pipeline dner_mm [options] --metamap=<binary> --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .mm.xml]
9 |     --output=<directory>    Specify the output directory.
10 |     --verbose               Print more information about progress.
11 |     --metamap=<binary>      Path to the MetaMap binary.
12 |     --cuis=<file>           Specify the CUI list.
13 | """
14 |
15 | from negbio.cli_utils import parse_args
16 | from negbio.pipeline.dner_mm import run_metamap_col
17 | from negbio.pipeline.scan import scan_collection
18 | from pymetamap import MetaMap
19 |
20 |
21 | def read_cuis(pathname):
22 | cuis = set()
23 | with open(pathname) as fp:
24 | for line in fp:
25 | line = line.strip()
26 | if line:
27 | cuis.add(line)
28 | return cuis
29 |
30 |
31 | if __name__ == '__main__':
32 | argv = parse_args(__doc__)
33 | mm = MetaMap.get_instance(argv['--metamap'])
34 |
35 | if argv['--cuis'] is None:
36 | cuis = None
37 | else:
38 | cuis = read_cuis(argv['--cuis'])
39 |
40 |     scan_collection(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
41 | fn=run_metamap_col, non_sequences=[mm, cuis])
42 |
--------------------------------------------------------------------------------
/negbio/negbio_neg.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negation and uncertainty
3 |
4 | Usage:
5 |     negbio_pipeline neg [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --neg-patterns=<file>            Specify negation rules. [default: negbio/patterns/neg_patterns.txt]
9 |     --uncertainty-patterns=<file>    Specify uncertainty rules. [default: negbio/patterns/uncertainty_patterns.txt]
10 |     --suffix=<suffix>                Append an additional SUFFIX to file names. [default: .neg.xml]
11 |     --verbose                        Print more information about progress.
12 |     --output=<directory>             Specify the output directory.
13 | """
14 | import os
15 |
16 | from negbio.cli_utils import parse_args, get_absolute_path
17 | from negbio.neg.neg_detector import Detector
18 | from negbio.pipeline.negdetect import detect
19 | from negbio.pipeline.scan import scan_document
20 |
21 | if __name__ == '__main__':
22 | argv = parse_args(__doc__)
23 |
24 | argv = get_absolute_path(argv,
25 | '--neg-patterns',
26 | 'negbio/patterns/neg_patterns.txt')
27 | argv = get_absolute_path(argv,
28 | '--uncertainty-patterns',
29 | 'negbio/patterns/uncertainty_patterns.txt')
30 |
31 | neg_detector = Detector(os.path.realpath(argv['--neg-patterns']),
32 | os.path.realpath(argv['--uncertainty-patterns']))
33 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
34 | fn=detect, non_sequences=[neg_detector])
35 |
--------------------------------------------------------------------------------
/negbio/negbio_neg_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negation and uncertainty
3 |
4 | Usage:
5 |     negbio_pipeline neg_chexpert [options] --output=<directory> <file> ...
6 |
7 | Options:
8 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt]
9 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules
10 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt]
11 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules
12 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt]
13 |     --suffix=<suffix>                           Append an additional SUFFIX to file names. [default: .neg.xml]
14 |     --verbose                                   Print more information about progress.
15 |     --output=<directory>                        Specify the output directory.
16 | """
17 | import os
18 |
19 | from negbio.chexpert.stages.classify import ModifiedDetector
20 | from negbio.cli_utils import parse_args, get_absolute_path
21 | from negbio.pipeline.negdetect import detect
22 | from negbio.pipeline.scan import scan_document
23 |
24 |
25 | if __name__ == '__main__':
26 | argv = parse_args(__doc__)
27 |
28 | argv = get_absolute_path(argv,
29 | '--pre-negation-uncertainty-patterns',
30 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
31 | argv = get_absolute_path(argv,
32 | '--post-negation-uncertainty-patterns',
33 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt')
34 | argv = get_absolute_path(argv,
35 | '--neg-patterns',
36 | 'negbio/chexpert/patterns/negation.txt')
37 |
38 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
39 | argv['--neg-patterns'],
40 | argv['--post-negation-uncertainty-patterns'])
41 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
42 | fn=detect, non_sequences=[neg_detector])
43 |
--------------------------------------------------------------------------------
/negbio/negbio_normalize.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 |     negbio_pipeline normalize [options] --output=<directory> <file> ...
4 | 
5 | Options:
6 |     --output=<directory>    Specify the output directory.
7 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .normalized.xml]
8 |     --verbose               Print more information about progress.
9 | """
10 |
11 | from negbio.cli_utils import parse_args
12 | from negbio.ext.normalize_mimiccxr import normalize
13 | from negbio.pipeline.scan import scan_document
14 |
15 | if __name__ == '__main__':
16 | argv = parse_args(__doc__)
17 |     scan_document(source=argv['<file>'], verbose=argv['--verbose'], suffix=argv['--suffix'],
18 | directory=argv['--output'], fn=normalize)
19 |
--------------------------------------------------------------------------------
/negbio/negbio_parse.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse sentences
3 |
4 | Usage:
5 |     negbio_pipeline parse [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --model=<directory>     Bllip parser model directory.
9 |     --output=<directory>    Specify the output directory.
10 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .bllip.xml]
11 |     --verbose               Print more information about progress.
12 | """
13 |
14 | from negbio.cli_utils import parse_args
15 | from negbio.pipeline.parse import NegBioParser
16 | from negbio.pipeline.scan import scan_document
17 |
18 |
19 | if __name__ == '__main__':
20 | argv = parse_args(__doc__)
21 | parser = NegBioParser(model_dir=argv['--model'])
22 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
23 | fn=parser.parse_doc, non_sequences=[])
24 |
--------------------------------------------------------------------------------
/negbio/negbio_pipeline.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 |     negbio_pipeline [--verbose] <command> [<args>...]
4 |
5 | Options:
6 | --verbose Print more information about progress.
7 |
8 | The most commonly used negbio commands are:
9 | text2bioc
10 | normalize
11 | section_split
12 | ssplit
13 | parse
14 | ptb2ud
15 | dner_mm
16 | dner_chexpert
17 | neg
18 | neg_chexpert
19 | cleanup
20 | """
21 | from subprocess import call
22 | import logging
23 | import os
24 | from negbio.cli_utils import parse_args
25 |
26 |
27 | def main():
28 | args = parse_args(__doc__, version='negbio version 2', options_first=True)
29 | logging.debug('CWD: %s', os.getcwd())
30 |
31 |     argv = [args['<command>']] + args['<args>']
32 |     if args['<command>'] == 'text2bioc':
33 |         exit(call(['python', '-m', 'negbio.negbio_text2bioc'] + argv))
34 |     elif args['<command>'] == 'normalize':
35 |         exit(call(['python', '-m', 'negbio.negbio_normalize'] + argv))
36 |     elif args['<command>'] == 'section_split':
37 |         exit(call(['python', '-m', 'negbio.negbio_section_split'] + argv))
38 |     elif args['<command>'] == 'ssplit':
39 |         exit(call(['python', '-m', 'negbio.negbio_ssplit'] + argv))
40 |     elif args['<command>'] == 'parse':
41 |         exit(call(['python', '-m', 'negbio.negbio_parse'] + argv))
42 |     elif args['<command>'] == 'ptb2ud':
43 |         exit(call(['python', '-m', 'negbio.negbio_ptb2ud'] + argv))
44 |     elif args['<command>'] == 'dner_mm':
45 |         exit(call(['python', '-m', 'negbio.negbio_dner_matamap'] + argv))
46 |     elif args['<command>'] == 'dner_chexpert':
47 |         exit(call(['python', '-m', 'negbio.negbio_dner_chexpert'] + argv))
48 |     elif args['<command>'] == 'neg':
49 |         exit(call(['python', '-m', 'negbio.negbio_neg'] + argv))
50 |     elif args['<command>'] == 'neg_chexpert':
51 |         exit(call(['python', '-m', 'negbio.negbio_neg_chexpert'] + argv))
52 |     elif args['<command>'] == 'cleanup':
53 |         exit(call(['python', '-m', 'negbio.negbio_clean'] + argv))
54 |     elif args['<command>'] in ['help', None]:
55 |         exit(call(['python', '-m', 'negbio.negbio_pipeline', '--help']))
56 |     else:
57 |         exit("%r is not a negbio command. See 'negbio_pipeline help'." % args['<command>'])
58 |
59 |
60 | if __name__ == '__main__':
61 | main()
62 |
--------------------------------------------------------------------------------
/negbio/negbio_ptb2ud.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert from parse tree to universal dependencies
3 |
4 | Usage:
5 |     negbio_pipeline ptb2ud [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --output=<directory>    Specify the output directory.
9 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .ud.xml]
10 |     --verbose               Print more information about progress.
11 | """
12 | from negbio.cli_utils import parse_args
13 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
14 | from negbio.pipeline.scan import scan_document
15 |
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 | lemmatizer = Lemmatizer()
20 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
21 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
22 | fn=ptb2dep.convert_doc, non_sequences=[])
23 |
--------------------------------------------------------------------------------
/negbio/negbio_section_split.py:
--------------------------------------------------------------------------------
1 | """
2 | Split the report into sections based on titles.
3 |
4 | Usage:
5 |     negbio_pipeline section_split [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .secsplit.xml]
9 |     --output=<directory>    Specify the output directory.
10 |     --verbose               Print more information about progress.
11 |     --pattern=<file>        Specify the section title list for matching.
12 | """
13 | import re
14 |
15 | from negbio.cli_utils import parse_args
16 | from negbio.pipeline.scan import scan_document
17 | from negbio.pipeline.section_split import split_document
18 |
19 |
20 | def read_section_titles(pathname):
21 | with open(pathname) as fp:
22 | return re.compile('|'.join(fp.readlines()), re.MULTILINE)
23 |
24 |
25 | if __name__ == '__main__':
26 | argv = parse_args(__doc__)
27 |
28 | if argv['--pattern'] is None:
29 | patterns = None
30 | else:
31 | patterns = read_section_titles(argv['--pattern'])
32 |
33 |     scan_document(source=argv['<file>'], verbose=argv['--verbose'], suffix=argv['--suffix'],
34 | directory=argv['--output'], fn=split_document, non_sequences=[patterns])
35 |
--------------------------------------------------------------------------------
/negbio/negbio_ssplit.py:
--------------------------------------------------------------------------------
1 | """
2 | Split text into sentences
3 |
4 | Usage:
5 |     negbio_pipeline ssplit [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --newline_is_sentence_break    Whether to treat newlines as sentence breaks. True means that a newline is always a
9 |                                    sentence break. False means to ignore newlines for the purpose of sentence
10 |                                    splitting. This is appropriate for continuous text, when just the non-whitespace
11 |                                    characters should be used to determine sentence breaks. [default: False]
12 |     --suffix=<suffix>              Append an additional SUFFIX to file names. [default: .ssplit.xml]
13 |     --output=<directory>           Specify the output directory.
14 |     --verbose                      Print more information about progress.
15 | """
16 | from negbio.pipeline.scan import scan_document
17 | from negbio.pipeline.ssplit import NegBioSSplitter
18 | from negbio.cli_utils import parse_args
19 |
20 | if __name__ == '__main__':
21 | argv = parse_args(__doc__)
22 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
23 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
24 | fn=splitter.split_doc, non_sequences=[])
25 |
--------------------------------------------------------------------------------
/negbio/negbio_text2bioc.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert text FILEs to the BioC output file
3 |
4 | Usage:
5 |     negbio_pipeline text2bioc [options] --output=<file> <file> ...
6 | 
7 | Options:
8 |     --output=<file>     Specify the output file name.
9 | --verbose Print more information about progress.
10 | """
11 |
12 | import bioc
13 |
14 | from negbio.cli_utils import parse_args
15 | from negbio.pipeline.text2bioc import text2collection
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 |     collection = text2collection(argv['<file>'])
20 | with open(argv['--output'], 'w') as fp:
21 | bioc.dump(collection, fp)
22 |
--------------------------------------------------------------------------------
/negbio/ngrex/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | A NgrexPattern is a tgrep-type pattern for matching node configurations in one of the Networkx
3 | structures. Unlike tgrep but like Unix grep, there is no pre-indexing of the data to be searched.
4 | Rather there is a linear scan through the graph where matches are sought.
5 |
6 | A node/edge is represented by a set of attributes and their values contained by curly braces:
7 | `{attr1:value1;attr2:value2;...}`. Therefore, {} represents any node/edge in the graph.
8 | Attributes must be plain strings; values can be regular expressions blocked off by "/".
9 | (Regular expressions must match the whole attribute value, so /NN/ matches "NN" only,
10 | while /NN.*/ matches "NN", "NNS", "NNP", etc.)
11 | """
12 | from . import parser
13 | from . import pattern
14 |
15 |
16 | def compile(ngrex):
17 | """
18 | Compiles the given expression into a pattern
19 |
20 | Args:
21 | ngrex(str): expression
22 |
23 | Returns:
24 | NgrexPattern: a pattern
25 | """
26 | p = parser.yacc.parse(ngrex)
27 | pattern.validate_names(p)
28 | return p
29 |
30 |
31 | def load(filename):
32 | """
33 | Read a pattern file
34 |
35 | Args:
36 | filename(str): file name
37 |
38 | Returns:
39 |         list: a list of NgrexPattern
40 | """
41 | patterns = []
42 | with open(filename) as fp:
43 | for line in fp:
44 | line = line.strip()
45 | if not line:
46 | continue
47 | if line[0] == '#':
48 | continue
49 | patterns.append(compile(line))
50 | return patterns
51 |
--------------------------------------------------------------------------------
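
A minimal end-to-end use of `compile` on a hand-built dependency graph, assuming networkx 1.x and the attribute names produced by `negbio.neg.semgraph.load` (`lemma` and `tag` on nodes, `dependency` on edges):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='effusion', tag='NN')
G.add_node('T1', lemma='no', tag='DT')
G.add_edge('T0', 'T1', dependency='neg')

pattern = ngrex.compile('{} >{dependency:/neg/} {lemma:/no/}')
matches = [m.group(0) for m in pattern.finditer(G)]
assert matches == ['T0']  # 'effusion' governs a 'neg' edge to 'no'
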
/negbio/ngrex/parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Start : ALIGNRELN SubNode "\n"
3 | | SubNode ( ":" SubNode )* "\n"
4 | ;
5 |
6 | SubNode : "(" SubNode ")" RelationDisj?
7 | | ModNode RelationDisj?
8 | ;
9 |
10 | RelationDisj : RelationConj ( "|" RelationConj )*
11 |
12 | RelationConj : ModRelation ( "&"? ModRelation )*
13 |
14 | ModRelation : RelChild
15 | | "!" RelChild
16 | | "?" RelChild
17 | ;
18 |
19 | RelChild : "[" RelationDisj "]"
20 | | Relation
21 | ;
22 |
23 | Relation : ( ( ( (IDENTIFIER ("," IDENTIFIER)?)? RELATION ( IDENTIFIER | REGEX )? ) ( "=" IDENTIFIER )? ) | ALIGNRELN)
24 | ( ModNode | "(" SubNode ")" )
25 | ;
26 |
27 | NodeDisj : "[" NodeConj ( "|" NodeConj )* "]"
28 | ;
29 |
30 | NodeConj : ModNode ( "&"? ModNode )*
31 | ;
32 |
33 | ModNode : Child
34 | | "!" Child
35 | ;
36 |
37 | Child : NodeDisj
38 | | Description
39 | ;
40 |
41 | Description :
42 | "{" (
43 | ( ( IDENTIFIER ":" (IDENTIFIER | REGEX) ) (";" ( IDENTIFIER ":" ( IDENTIFIER | REGEX ) ) )* "}")
44 | | ( ROOT "}" )
45 | | ( EMPTY "}" )
46 | | "}" )
47 | ("=" IDENTIFIER )?
48 | """
49 | from ply import lex
50 | from ply import yacc
51 |
52 | from negbio.ngrex import pattern
53 |
54 |
55 | t_ignore = ' \t\r'
56 |
57 | tokens = (
58 | 'RELATION',
59 | 'IDENTIFIER',
60 | 'REGEX',
61 | )
62 |
63 | literals = '{}()&[]:|,='
64 |
65 | t_RELATION = r'[<>]'
66 | t_IDENTIFIER = r'([^ \n\r!@#$%^&*()+={}\[\]\|\\;\':",./<>?`~-])+'
67 | t_REGEX = r'/(/|[^\n\r/])*?/'
68 |
69 |
70 | def t_error(t):
71 | raise TypeError('Unknown text "%s"' % (t.value,))
72 |
73 | lexer = lex.lex()
74 |
75 |
76 | def p_SubNode(p):
77 | """
78 | SubNode : ModNode
79 | | ModNode RelationDisj
80 | | '(' SubNode ')'
81 | | '(' SubNode ')' RelationDisj
82 | """
83 | if len(p) == 2:
84 | p[0] = p[1]
85 | elif len(p) == 3:
86 | conj_patterns = []
87 | for relation_conj in p[2][1]:
88 | conj_patterns.append(_merge_conj(p[1], relation_conj[1]))
89 | p[0] = _merge_disj(conj_patterns)
90 | elif len(p) == 4:
91 | p[0] = p[2]
92 | elif len(p) == 5:
93 | conj_patterns = []
94 | for relation_conj in p[4][1]:
95 | conj_patterns.append(_merge_conj(p[2], relation_conj[1]))
96 | p[0] = _merge_disj(conj_patterns)
97 |
98 |
99 | def _merge_disj(patterns):
100 | while len(patterns) > 1:
101 | p1 = patterns.pop()
102 | p2 = patterns.pop()
103 | patterns.append(pattern.CoordinationPattern(p1, p2, False))
104 | return patterns[0]
105 |
106 |
107 | def _merge_conj(p1, relations):
108 | patterns = []
109 | for reln, attributes, node in relations:
110 | if reln == '<':
111 | p = pattern.EdgePattern(node, p1, attributes, direction=pattern.L)
112 | else:
113 | p = pattern.EdgePattern(p1, node, attributes, direction=pattern.R)
114 | patterns.append(p)
115 | if len(patterns) == 1:
116 | return patterns[0]
117 | else:
118 | while len(patterns) > 1:
119 | p1 = patterns.pop()
120 | p2 = patterns.pop()
121 | patterns.append(pattern.CoordinationPattern(p1, p2, True))
122 | return patterns[0]
123 |
124 |
125 | def p_RelationDisj(p):
126 | """
127 | RelationDisj : RelationConj
128 | | RelationConj '|' RelationDisj
129 | """
130 |     # Returns:
131 |     #     ("OR", relation_list)
134 | if len(p) == 2:
135 | p[0] = ('OR', [p[1]])
136 | elif len(p) == 4:
137 | p[0] = ('OR', [p[1]] + p[3][1])
138 |
139 |
140 | def p_RelationConj(p):
141 | """
142 | RelationConj : ModRelation
143 | | ModRelation RelationConj
144 | | ModRelation '&' RelationConj
145 | """
146 | # (AND, [ModRelations])
147 | if len(p) == 2:
148 | p[0] = ('AND', [p[1]])
149 |     elif len(p) == 3:
150 |         p[0] = ('AND', [p[1]] + p[2][1])
151 |     elif len(p) == 4:
152 |         p[0] = ('AND', [p[1]] + p[3][1])
153 |
154 |
155 | def p_ModRelation(p):
156 | """
157 | ModRelation : RelChild
158 | """
159 | p[0] = p[1]
160 |
161 |
162 | def p_RelChild(p):
163 | """
164 | RelChild : Relation
165 | """
166 | p[0] = p[1]
167 |
168 |
169 | def p_Relation(p):
170 | """
171 | Relation : RELATION '{' Attributes '}' Relation_Next
172 | """
173 |     # Returns:
174 |     #     (relation, edge_attributes, node)
177 | p[0] = (p[1], p[3], p[5])
178 |
179 |
180 | def p_Relation_Next(p):
181 | """
182 | Relation_Next : ModNode
183 | | '(' SubNode ')'
184 | """
185 | if len(p) == 2:
186 | p[0] = p[1]
187 | else:
188 | p[0] = p[2]
189 |
190 |
191 | def p_ModNode(p):
192 | """
193 | ModNode : Child
194 | """
195 | p[0] = p[1]
196 |
197 |
198 | def p_Child(p):
199 | """
200 | Child : Description
201 | """
202 | p[0] = p[1]
203 |
204 |
205 | def p_Description(p):
206 | """
207 | Description : '{' Attributes '}'
208 | | '{' Attributes '}' '=' IDENTIFIER
209 | """
210 | if len(p) == 4:
211 | p[0] = pattern.NodePattern(p[2])
212 | else:
213 | p[0] = pattern.NodePattern(p[2], p[5])
214 |
215 | def p_Attributes(p):
216 | """
217 | Attributes : IDENTIFIER ':' REGEX
218 | | IDENTIFIER ':' REGEX ',' Attributes
219 | | empty
220 | """
221 | if len(p) == 4:
222 | p[0] = {p[1]: p[3]}
223 | elif len(p) == 6:
224 | p[0] = {p[1]: p[3]}
225 | p[0].update(p[5])
226 | else:
227 | p[0] = {}
228 |
229 |
230 | def p_empty(p):
231 | 'empty :'
232 | pass
233 |
234 |
235 | def p_error(p):
236 | raise TypeError("Syntax error at '%s'" % p.value)
237 |
238 | parser = yacc.yacc()
239 |
240 |
--------------------------------------------------------------------------------
/negbio/ngrex/parsetab.py:
--------------------------------------------------------------------------------
1 |
2 | # parsetab.py
3 | # This file is automatically generated. Do not edit.
4 | _tabversion = '3.10'
5 |
6 | _lr_method = 'LALR'
7 |
8 | _lr_signature = "RELATION IDENTIFIER REGEX\n SubNode : ModNode\n | ModNode RelationDisj\n | '(' SubNode ')' \n | '(' SubNode ')' RelationDisj\n \n RelationDisj : RelationConj\n | RelationConj '|' RelationDisj\n \n RelationConj : ModRelation\n | ModRelation RelationConj\n | ModRelation '&' RelationConj\n \n ModRelation : RelChild\n \n RelChild : Relation\n \n Relation : RELATION '{' Attributes '}' Relation_Next\n \n Relation_Next : ModNode\n | '(' SubNode ')'\n \n ModNode : Child\n \n Child : Description\n \n Description : '{' Attributes '}'\n | '{' Attributes '}' '=' IDENTIFIER\n \n Attributes : IDENTIFIER ':' REGEX\n | IDENTIFIER ':' REGEX ',' Attributes\n | empty\n empty :"
9 |
10 | _lr_action_items = {'REGEX':([23,],[29,]),':':([15,],[23,]),'&':([1,4,8,10,11,22,31,33,35,38,],[-16,-15,18,-10,-11,-17,-18,-12,-13,-14,]),')':([1,3,4,7,8,9,10,11,12,17,19,22,24,25,26,31,33,35,37,38,],[-16,-1,-15,17,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,38,-14,]),'(':([0,2,30,34,],[2,2,34,2,]),'=':([22,],[28,]),',':([29,],[32,]),'RELATION':([1,3,4,8,10,11,17,18,20,22,31,33,35,38,],[-16,13,-15,13,-10,-11,13,13,13,-17,-18,-12,-13,-14,]),'{':([0,2,13,30,34,],[5,5,21,5,5,]),'IDENTIFIER':([5,21,28,32,],[15,15,31,15,]),'}':([5,14,16,21,27,29,32,36,],[-22,22,-21,-22,30,-19,-22,-20,]),'|':([1,4,8,10,11,12,19,22,25,31,33,35,38,],[-16,-15,-7,-10,-11,20,-8,-17,-9,-18,-12,-13,-14,]),'$end':([1,3,4,6,8,9,10,11,12,17,19,22,24,25,26,31,33,35,38,],[-16,-1,-15,0,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,-14,]),}
11 |
12 | _lr_action = {}
13 | for _k, _v in _lr_action_items.items():
14 | for _x,_y in zip(_v[0],_v[1]):
15 | if not _x in _lr_action: _lr_action[_x] = {}
16 | _lr_action[_x][_k] = _y
17 | del _lr_action_items
18 |
19 | _lr_goto_items = {'Description':([0,2,30,34,],[1,1,1,1,]),'ModRelation':([3,8,17,18,20,],[8,8,8,8,8,]),'RelationDisj':([3,17,20,],[9,24,26,]),'RelChild':([3,8,17,18,20,],[10,10,10,10,10,]),'Child':([0,2,30,34,],[4,4,4,4,]),'Relation':([3,8,17,18,20,],[11,11,11,11,11,]),'RelationConj':([3,8,17,18,20,],[12,19,12,25,12,]),'ModNode':([0,2,30,34,],[3,3,35,3,]),'Attributes':([5,21,32,],[14,27,36,]),'Relation_Next':([30,],[33,]),'SubNode':([0,2,34,],[6,7,37,]),'empty':([5,21,32,],[16,16,16,]),}
20 |
21 | _lr_goto = {}
22 | for _k, _v in _lr_goto_items.items():
23 | for _x, _y in zip(_v[0], _v[1]):
24 | if not _x in _lr_goto: _lr_goto[_x] = {}
25 | _lr_goto[_x][_k] = _y
26 | del _lr_goto_items
27 | _lr_productions = [
28 | ("S' -> SubNode","S'",1,None,None,None),
29 | ('SubNode -> ModNode','SubNode',1,'p_SubNode','parser.py',78),
30 | ('SubNode -> ModNode RelationDisj','SubNode',2,'p_SubNode','parser.py',79),
31 | ('SubNode -> ( SubNode )','SubNode',3,'p_SubNode','parser.py',80),
32 | ('SubNode -> ( SubNode ) RelationDisj','SubNode',4,'p_SubNode','parser.py',81),
33 | ('RelationDisj -> RelationConj','RelationDisj',1,'p_RelationDisj','parser.py',127),
34 | ('RelationDisj -> RelationConj | RelationDisj','RelationDisj',3,'p_RelationDisj','parser.py',128),
35 | ('RelationConj -> ModRelation','RelationConj',1,'p_RelationConj','parser.py',142),
36 | ('RelationConj -> ModRelation RelationConj','RelationConj',2,'p_RelationConj','parser.py',143),
37 | ('RelationConj -> ModRelation & RelationConj','RelationConj',3,'p_RelationConj','parser.py',144),
38 | ('ModRelation -> RelChild','ModRelation',1,'p_ModRelation','parser.py',157),
39 | ('RelChild -> Relation','RelChild',1,'p_RelChild','parser.py',164),
40 | ('Relation -> RELATION { Attributes } Relation_Next','Relation',5,'p_Relation','parser.py',171),
41 | ('Relation_Next -> ModNode','Relation_Next',1,'p_Relation_Next','parser.py',182),
42 | ('Relation_Next -> ( SubNode )','Relation_Next',3,'p_Relation_Next','parser.py',183),
43 | ('ModNode -> Child','ModNode',1,'p_ModNode','parser.py',193),
44 | ('Child -> Description','Child',1,'p_Child','parser.py',200),
45 | ('Description -> { Attributes }','Description',3,'p_Description','parser.py',207),
46 | ('Description -> { Attributes } = IDENTIFIER','Description',5,'p_Description','parser.py',208),
47 | ('Attributes -> IDENTIFIER : REGEX','Attributes',3,'p_Attributes','parser.py',217),
48 | ('Attributes -> IDENTIFIER : REGEX , Attributes','Attributes',5,'p_Attributes','parser.py',218),
49 | ('Attributes -> empty','Attributes',1,'p_Attributes','parser.py',219),
50 | ('empty -> ','empty',0,'p_empty','parser.py',231),
51 | ]
52 |
--------------------------------------------------------------------------------
/negbio/ngrex/pattern.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import re
3 | import collections
4 |
5 | L = '<'
6 | R = '>'
7 | LEFT = '<'
8 | RIGHT = '>'
9 |
10 |
11 | class NgrexPattern(object):
12 | """
13 | A NgrexPattern is a tgrep-type pattern for matching node configurations in Networkx structures.
14 | """
15 |
16 | def __init__(self):
17 | self._pattern = None
18 |
19 | def finditer(self, graph):
20 | """
21 | Returns an iterator yielding MatcherObj instances over all matches for the ngrex pattern
22 | in graph.
23 |
24 | Args:
25 | graph(DiGraph): graph
26 |
27 | Yields:
28 | MatcherObj: an iterator yielding MatcherObj instances over all matches for the
29 | ngrex pattern in graph.
30 | """
31 | raise NotImplementedError('Should have implemented this')
32 |
33 | @property
34 | def pattern(self):
35 | """
36 | str: The pattern string from which the ngrex object was compiled.
37 | """
38 | return self._pattern
39 |
40 | def __str__(self):
41 | return self.pattern
42 |
43 |
44 | class NodePattern(NgrexPattern):
45 | def __init__(self, attributes, name=None):
46 | super(NodePattern, self).__init__()
47 | self._name = name
48 | self._attributes = _get_attributes_regex(attributes)
49 | self._pattern = '{' + _attributes_to_str(self._attributes) + '}'
50 | if name:
51 | self._pattern += '=' + name
52 |
53 | def finditer(self, graph):
54 | for node in graph.nodes():
55 | if self._attributes:
56 | if _match(self._attributes, graph.node[node]):
57 | yield MatcherObj(self, graph, [(self._name, node)])
58 | else:
59 | yield MatcherObj(self, graph, [(self._name, node)])
60 |
61 |
62 | class EdgePattern(NgrexPattern):
63 |     def __init__(self, governor, dependant, edge_attributes, direction=LEFT):
64 |         """
65 |         Args:
66 |             direction(str): right if 'governor >edge dependant',
67 |                 left if 'dependant <edge governor'
68 |         """
69 |         super(EdgePattern, self).__init__()
70 |         self._governor = governor
71 |         self._dependant = dependant
72 |         self._edge_attributes = _get_attributes_regex(edge_attributes)
73 |         self._direction = direction
74 |         if direction == LEFT:
75 |             args = (dependant, '<', governor)
76 |         else:
77 |             args = (governor, '>', dependant)
78 |         self._pattern = '({args[0].pattern}) {args[1]}{{{edge}}} ({args[2].pattern})'.format(
79 |             args=args, edge=_attributes_to_str(self._edge_attributes))
80 |
81 | def finditer(self, graph):
82 | governors = self._governor.finditer(graph)
83 | dependants = self._dependant.finditer(graph)
84 | for g, d in itertools.product(governors, dependants):
85 | for p, c, e in graph.edges(data=True):
86 | if p == g.group(0) and c == d.group(0):
87 | if _match(self._edge_attributes, e):
88 | if self._direction == LEFT:
89 | yield MatcherObj(self, graph, d._nodes + g._nodes)
90 | else:
91 | yield MatcherObj(self, graph, g._nodes + d._nodes)
92 |
93 |
94 | class CoordinationPattern(NgrexPattern):
95 | def __init__(self, pattern1, pattern2, is_conj=True):
96 | """
97 | Args:
98 | is_conj(bool): if is_conj is true, then it is an "AND"; otherwise, it is an "OR".
99 | """
100 | super(CoordinationPattern, self).__init__()
101 | self._pattern1 = pattern1
102 | self._pattern2 = pattern2
103 | self._is_conj = is_conj
104 | self._pattern = '{} {} {}'.format(pattern2.pattern,
105 | '&' if is_conj else '|',
106 | pattern1.pattern)
107 |
108 | def finditer(self, graph):
109 | if self._is_conj:
110 | matchers1 = self._pattern1.finditer(graph)
111 | matchers2 = self._pattern2.finditer(graph)
112 | for m1, m2 in itertools.product(matchers1, matchers2):
113 | if m1.group(0) == m2.group(0):
114 | nodes = list(m1._nodes)
115 | if len(m2._nodes) > 2:
116 | nodes.extend(m2._nodes[1:])
117 | yield MatcherObj(self, graph, nodes)
118 | else:
119 | for m in self._pattern1.finditer(graph):
120 | yield m
121 | for m in self._pattern2.finditer(graph):
122 | yield m
123 |
124 |
125 | class MatcherObj(object):
126 | """
127 | Match objects always have a boolean value of True.
128 | """
129 |
130 | def __init__(self, pattern, graph, nodes):
131 | """
132 | Args:
133 | nodes(list): [(name, node)]
134 | """
135 | self._pattern = pattern
136 | self._graph = graph
137 | self._nodes = nodes
138 |
139 |     def __bool__(self):
140 |         return True
141 | 
142 |     __nonzero__ = __bool__  # Python 2 compatibility
141 |
142 | def group(self, index):
143 | """
144 | Returns the input node captured by the given group during the previous match operation.
145 | """
146 | return self._nodes[index][1]
147 |
148 | def groups(self):
149 | """
150 | Returns a list containing all the subgroups of the match, from 0 up to however many nodes
151 | are in the pattern.
152 | """
153 |         return [node[1] for node in self._nodes]
154 |
155 | def get(self, name):
156 | for node in self._nodes:
157 | if node[0] == name:
158 | return node[1]
159 | raise KeyError(name)
160 |
161 | @property
162 | def pattern(self):
163 | """
164 | The expression object whose `finditer()` produced this instance
165 | """
166 | return self._pattern
167 |
168 | @property
169 | def graph(self):
170 | """
171 | The graph passed to `finditer()`
172 | """
173 | return self._graph
174 |
175 |
176 | def validate_names(pattern):
177 | def _helper(p, names):
178 | if isinstance(p, NodePattern):
179 | if p._name in names:
180 | raise KeyError(p._name)
181 | if p._name:
182 | names.add(p._name)
183 | elif isinstance(p, EdgePattern):
184 | _helper(p._governor, names)
185 | _helper(p._dependant, names)
186 | elif isinstance(p, CoordinationPattern):
187 | _helper(p._pattern1, names)
188 | _helper(p._pattern2, names)
189 | _helper(pattern, set())
190 |
191 |
192 | def _get_attributes_regex(attributes):
193 | def _get_regex(v):
194 | v = v[1:-1]
195 | if v:
196 | if v[0] != '^':
197 | v = '^' + v
198 | if v[-1] != '$':
199 | v += '$'
200 | return re.compile(v)
201 | return {k: _get_regex(v) for k, v in attributes.items()}
202 |
203 |
204 | def _match(attributes, element):
205 | for k, v in attributes.items():
206 | if k not in element or not v.match(element[k]):
207 | return False
208 | return True
209 |
210 |
211 | def _attributes_to_str(attributes):
212 | return ','.join(['{}:/{}/'.format(k, v.pattern) for k, v in attributes.items()])
213 |
--------------------------------------------------------------------------------
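
Named nodes (`=name` in a pattern) can be pulled out of a match with `MatcherObj.get`, which is how `neg_detector` retrieves the `=key` node in its rules. A sketch for the "rule out XXX" pattern (networkx 1.x):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='rule', tag='VB')
G.add_node('T1', lemma='pneumonia', tag='NN')
G.add_edge('T0', 'T1', dependency='nmod:out')

pattern = ngrex.compile('{} <{dependency:/nmod:out/} {lemma:/rule/}=key')
m = next(pattern.finditer(G))
assert m.group(0) == 'T1'    # the mention being negated
assert m.get('key') == 'T0'  # the named 'rule' node
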
/negbio/patterns/neg_patterns.txt:
--------------------------------------------------------------------------------
1 |
2 | {} >{dependency:/neg/} {}
3 | {} >{} {lemma:/no/}
4 | {} >{dependency:/case/} {lemma:/without/}
5 |
6 | # rather than XXX
7 | {} <{dependency:/conj:negcc/} {}
8 | {} <{dependency:/nmod:without/} {}
9 | {} <{dependency:/conj:versus/} {}
10 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key
11 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key
12 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|resolution|removal/}
13 | {} <{dependency:/nmod:for/} {lemma:/negative/}
14 | {} <{} {lemma:/resolve|resolving|exclude/}=key
15 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/}
16 |
17 | # XXX has resolved
18 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {})
19 |
20 | # there is no XXX
21 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})
22 |
23 | # without evidence|finding of|for XXX
24 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {})
25 |
26 | # no evidence of|for XXX
27 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {})
28 |
29 | # without evidence|finding of|for XXX
30 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/})
31 |
32 | # no focus of XXX
33 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {})
34 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/})
35 |
36 | # no moderate to XXX
37 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {})
38 |
39 | # no evidence of developing XXX
40 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {}))
41 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/}))
42 |
43 | # no focal XXX
44 | {} <{dependency:/dobj/} ({} >{dependency:/nsubj/} {lemma:/no/})
45 |
46 | # do not demonstrate|visualize XXX
47 | # XXX is not demonstrated/visualized
48 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{dependency:/neg/} {})
49 |
50 | # XXX is previously demonstrated/visualized
51 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/})
52 |
53 | # there is no NN to suggest/explain XXX
54 | {} <{dependency:/dobj/} ({tag:/V.*/} <{} ({tag:/N.*/} >{dependency:/neg/} {}))
55 |
56 | # no NN to suggest/explain XXX
57 | {} <{dependency:/dobj/} ({tag:/V.*/} >{} ({tag:/N.*/} >{dependency:/neg/} {}))
--------------------------------------------------------------------------------
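
To see one of these rules fire, here is a sketch matching the "no evidence of XXX" pattern above against a hand-built graph for "no evidence of pneumothorax" (networkx 1.x):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='no', tag='DT')
G.add_node('T1', lemma='evidence', tag='NN')
G.add_node('T2', lemma='pneumothorax', tag='NN')
G.add_edge('T1', 'T0', dependency='neg')
G.add_edge('T1', 'T2', dependency='nmod:of')

rule = ngrex.compile('{} <{dependency:/nmod:of/} ({lemma:/evidence/} >{dependency:/neg/} {})')
assert next(rule.finditer(G)).group(0) == 'T2'  # 'pneumothorax' is negated
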
/negbio/patterns/uncertainty_patterns.txt:
--------------------------------------------------------------------------------
1 | # outgoing edge
2 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/}
3 | {} >{} {lemma:/question/}
4 |
5 | # {} >{dependency:/cop/} {lemma:/may|would|could/}
6 |
7 | # incoming edge
8 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/}
9 | {} <{dependency:/dobj/} {lemma:/suspect|favor|suggest|suggesting|question|consider/}
10 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/}
11 | {} <{dependency:/nsubjpass/} {lemma:/suspect/}
12 | {} <{} {lemma:/possible/}
13 |
14 | # parsing error
15 | # suspected XXX
16 | {} <{dependency:/dobj/} {lemma:/suspect/}
17 | {} >{dependency:/advmod/} {lemma:/suspect/}
18 |
19 | # maybe due to XXX
20 | {} <{dependency:/dep/} {lemma:/maybe/}
21 |
22 | # may/could represent/reflect/indicate/include XXX
23 | {} <{} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would/})
24 |
25 | # maybe secondary to XXX
26 | {} <{dependency:/nmod:to/} {lemma:/secondary/}
27 |
28 | # may be due to XXX
29 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/})
30 |
31 | # could related to XXX
32 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/})
33 |
34 | # may be compatible with XXX
35 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/})
36 |
37 | # question left XXX
38 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/})
39 | {} >{} {lemma:/left/} <{} {lemma:/question/}
40 |
41 | # cannot exclude XXX
42 | {} <{dependency:/dobj/} ({lemma:/exclude/} >{} {lemma:/cannot/})
43 |
44 | # cannot rule out XXX
45 | {} <{dependency:/nmod:out/} ({lemma:/rule/} >{} {lemma:/cannot/})
46 |
47 | # XXX is not excluded
48 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{dependency:/neg/} {})
49 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{} {lemma:/cannot/})
50 |
51 | # differential diagnosis includes
52 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/}))
53 |
54 | # may be XXX
55 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/}
56 |
57 | # parsing error
58 | # XXX suspected
59 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}
60 |
61 | # suggestive of XXX
62 | # {} <{dependency:/nmod:of/} {lemma:/suggestive/}
--------------------------------------------------------------------------------
/negbio/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/pipeline/__init__.py
--------------------------------------------------------------------------------
/negbio/pipeline/cleanup.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def clean_sentences(document, sort_anns=False):
5 | """
6 | Remove sentences in each passage
7 |
8 | Args:
9 | document(BioCDocument): a document
10 | sort_anns(bool): sort ann by its location
11 | """
12 | try:
13 | for passage in document.passages:
14 | del passage.sentences[:]
15 |
16 | if sort_anns:
17 |             key_func = lambda ann: ann.get_total_location().offset
18 |             ann_id = 0
19 |             for passage in document.passages:
20 |                 for ann in sorted(passage.annotations, key=key_func):
21 |                     ann.id = str(ann_id)
22 |                     ann_id += 1
23 |     except Exception:
24 | logging.exception("Cannot process %s", document.id)
25 | return document
26 |
--------------------------------------------------------------------------------
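
A minimal check that `clean_sentences` drops sentences while leaving passage annotations alone, assuming the `bioc` package's no-argument constructors:

import bioc

from negbio.pipeline.cleanup import clean_sentences

doc = bioc.BioCDocument()
doc.id = '1'
passage = bioc.BioCPassage()
passage.offset = 0
sentence = bioc.BioCSentence()
sentence.offset = 0
sentence.text = 'No pneumothorax.'
passage.sentences.append(sentence)
doc.passages.append(passage)

clean_sentences(doc)
assert doc.passages[0].sentences == []
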
/negbio/pipeline/dner_mm.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import itertools
3 | import logging
4 | import re
5 |
6 | import bioc
7 |
8 |
9 | def remove_newline(s):
10 | return re.sub(r'[\n\r]', ' ', s)
11 |
12 |
13 | def adapt_concept_index(index):
14 | m = re.match(r"'.*?'", index)
15 | if m:
16 | return index[1:-1]
17 | m = re.match(r"'.*", index)
18 | if m:
19 | return index[1:]
20 | return index
21 |
22 |
23 | def run_metamap_col(collection, mm, cuis=None, extra_args=None):
24 | """
25 | Get CUIs from metamap.
26 |
27 |     Args:
28 |         collection(BioCCollection): input collection
29 |         mm(MetaMap): MetaMap instance
30 |         cuis(set): CUIs to keep, or None to keep all concepts
31 |         extra_args(dict): extra keyword arguments for MetaMap, or None
30 |
31 | Returns:
32 | BioCCollection
33 | """
34 | try:
35 | annIndex = itertools.count()
36 | sentence_map = collections.OrderedDict()
37 | for document in collection.documents:
38 | for passage in document.passages:
39 | for sentence in passage.sentences:
40 | sentence_map['{}-{}'.format(document.id.replace('.', '-'), sentence.offset)] = (passage, sentence)
41 |
42 | sents = []
43 | ids = []
44 | for k in sentence_map:
45 | ids.append(k)
46 | sents.append(remove_newline(sentence_map[k][1].text))
47 |
48 | if extra_args is None:
49 | concepts, error = mm.extract_concepts(sents, ids)
50 | else:
51 | concepts, error = mm.extract_concepts(sents, ids, **extra_args)
52 |
53 | if error is None:
54 | for concept in concepts:
55 | concept_index = adapt_concept_index(concept.index)
56 | try:
57 |                         if cuis is not None:
58 |                             # skip concepts whose CUI is missing or not in the requested set
59 |                             concept_cui = getattr(concept, 'cui', None)
60 |                             if concept_cui not in cuis:
61 |                                 continue
62 | m = re.match(r'(\d+)/(\d+)', concept.pos_info)
63 | if m:
64 | passage = sentence_map[concept_index][0]
65 | sentence = sentence_map[concept_index][1]
66 | start = int(m.group(1)) - 1
67 | length = int(m.group(2))
68 | ann = bioc.BioCAnnotation()
69 | ann.id = str(next(annIndex))
70 | ann.infons['CUI'] = concept.cui
71 | ann.infons['semtype'] = concept.semtypes[1:-1]
72 | ann.infons['term'] = concept.preferred_name
73 | ann.infons['annotator'] = 'MetaMap'
74 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length))
75 | ann.text = sentence.text[start:start+length]
76 | passage.annotations.append(ann)
77 |                         except Exception:
78 |                             logging.exception('Cannot process concept %s', concept_index)
79 |     except Exception:
80 |         logging.exception("Cannot process %s", collection.source)
81 | return collection
82 |
83 |
84 | def run_metamap(document, mm, cuis=None):
85 | """
86 | Get CUIs from metamap.
87 |
88 | Args:
89 | document(BioCDocument):
90 | mm(MetaMap): MetaMap instance
91 |
92 | Returns:
93 | BioCDocument
94 | """
95 | try:
96 | annIndex = itertools.count()
97 | sentence_map = collections.OrderedDict()
98 | for passage in document.passages:
99 | for sentence in passage.sentences:
100 | sentence_map[str(sentence.offset)] = (passage, sentence)
101 |
102 | sents = []
103 | ids = []
104 | for k in sentence_map:
105 | ids.append(k)
106 | sents.append(remove_newline(sentence_map[k][1].text))
107 |
108 | concepts, error = mm.extract_concepts(sents, ids)
109 | if error is None:
110 | for concept in concepts:
111 | concept_index = adapt_concept_index(concept.index)
112 | try:
113 | if cuis is not None and concept.cui not in cuis:
114 | continue
115 | m = re.match(r'(\d+)/(\d+)', concept.pos_info)
116 | if m:
117 | passage = sentence_map[concept_index][0]
118 | sentence = sentence_map[concept_index][1]
119 | start = int(m.group(1)) - 1
120 | length = int(m.group(2))
121 | ann = bioc.BioCAnnotation()
122 | ann.id = str(next(annIndex))
123 | ann.infons['CUI'] = concept.cui
124 | ann.infons['semtype'] = concept.semtypes[1:-1]
125 | ann.infons['term'] = concept.preferred_name
126 | ann.infons['annotator'] = 'MetaMap'
127 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length))
128 | ann.text = sentence.text[start:start+length]
129 | passage.annotations.append(ann)
130 |                         except Exception:
131 |                             logging.exception('Cannot process concept %s', concept_index)
132 |     except Exception:
133 |         logging.exception("Cannot process %s", document.id)
134 | return document
135 |
--------------------------------------------------------------------------------
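
`adapt_concept_index` strips the quoting that pymetamap sometimes leaves on concept indices, and `pos_info` is interpreted as `start/length` with a 1-based start. A few sanity checks:

from negbio.pipeline.dner_mm import adapt_concept_index, remove_newline

assert adapt_concept_index("'00000086-23'") == '00000086-23'  # fully quoted
assert adapt_concept_index("'00000086-23") == '00000086-23'   # dangling quote
assert adapt_concept_index('00000086-23') == '00000086-23'    # unquoted
assert remove_newline('no acute\ndisease') == 'no acute disease'
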
/negbio/pipeline/negdetect.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 |
4 | from negbio.neg.neg_detector import Detector
5 |
6 |
7 | def neg_mesh(annotations):
8 | """
9 | Detect negative MeSH
10 | """
11 | for ann in annotations:
12 | if ann.infons.get('CUI', None) == 'C0332125':
13 | ann.infons[Detector.NEGATION] = 'True'
14 |
15 |
16 | def uncertain_mesh(annotations):
17 | """
18 | Detect uncertain MeSH
19 | """
20 | for ann in annotations:
21 | if ann.infons.get('CUI', None) == 'C0332148':
22 | ann.infons[Detector.UNCERTAINTY] = 'True'
23 |
24 |
25 | def is_neg_regex(text):
26 | if re.search(r'^(findings|impression): no ', text, re.I):
27 | return True
28 | return False
29 |
30 |
31 | def _mark_anns(annotations, begin, end, type):
32 | """Mark all annotations in [begin:end] as type"""
33 | for ann in annotations:
34 | total_loc = ann.get_total_location()
35 | if begin <= total_loc.offset and total_loc.offset + total_loc.length <= end:
36 | ann.infons[type] = 'True'
37 |
38 |
39 | def _extend(document, type):
40 | def _is_type(annotation):
41 | return annotation.infons.get(type, None) == 'True'
42 |
43 | neg_anns = []
44 | for passage in document.passages:
45 | for ann in passage.annotations:
46 | if _is_type(ann):
47 | neg_anns.append(ann)
48 |
49 | for passage in document.passages:
50 | for ann in passage.annotations:
51 | if not _is_type(ann):
52 | for nann in neg_anns:
53 | if ann in nann:
54 | ann.infons[type] = 'True'
55 | break
56 | if nann in ann and 'CUI' in ann and 'CUI' in nann and ann.infons['CUI'] == nann.infons['CUI']:
57 | ann.infons[type] = 'True'
58 | break
59 |
60 |
61 | def detect(document, detector):
62 | """
63 | Args:
64 | document(BioCDocument):
65 | detector(Detector): detector. Define customized patterns in the detector
66 | """
67 | try:
68 |
69 | for passage in document.passages:
70 | neg_mesh(passage.annotations)
71 | uncertain_mesh(passage.annotations)
72 |
73 | locs = []
74 | for ann in passage.annotations:
75 | total_loc = ann.get_total_location()
76 | locs.append((total_loc.offset, total_loc.offset + total_loc.length))
77 |
78 | for sentence in passage.sentences:
79 | if is_neg_regex(sentence.text):
80 | _mark_anns(passage.annotations, sentence.offset, sentence.offset + len(sentence.text),
81 | Detector.NEGATION)
82 | continue
83 | for name, matcher, loc in detector.detect(sentence, locs):
84 | logging.debug('Find: %s, %s, %s', name, matcher.pattern, loc)
85 | _mark_anns(passage.annotations, loc[0], loc[1], name)
86 |
87 | # _extend(document, Detector.NEGATION)
88 | # _extend(document, Detector.UNCERTAINTY)
89 |     except Exception:
90 | logging.exception("Cannot process %s", document.id)
91 | return document
92 |
--------------------------------------------------------------------------------
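
Before consulting the dependency patterns, detect() above short-circuits sentences that open with a negated section header via is_neg_regex; every annotation covered by such a sentence is marked negated. A small sketch of that shortcut:

    from negbio.pipeline.negdetect import is_neg_regex

    assert is_neg_regex('IMPRESSION: no acute cardiopulmonary process.')
    assert is_neg_regex('findings: no pleural effusion or pneumothorax.')
    assert not is_neg_regex('FINDINGS: stable cardiomegaly.')
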
/negbio/pipeline/parse.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, absolute_import
2 |
3 | import logging
4 | import os
5 | import tempfile
6 |
7 | from bllipparser import ModelFetcher
8 | from bllipparser import RerankingParser
9 |
10 |
11 | class Bllip(object):
12 | def __init__(self, model_dir=None):
13 | if model_dir is None:
14 | logging.debug("downloading GENIA+PubMed model if necessary ...")
15 | model_dir = ModelFetcher.download_and_install_model(
16 | 'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
17 | self.model_dir = os.path.expanduser(model_dir)
18 |
19 |         logging.debug('loading model %s ...', self.model_dir)
20 | self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
21 |
22 | def parse(self, s):
23 |         """Parse one sentence text with the BLLIP reranking parser.
24 |
25 | Args:
26 | s(str): one sentence
27 |
28 | Returns:
29 |             Tree: the PTB parse tree of the best-scoring parse; None if parsing failed
30 | """
31 | if not s:
32 | raise ValueError('Cannot parse empty sentence: {}'.format(s))
33 |
34 | nbest = self.rrp.parse(str(s))
35 | if nbest:
36 | return nbest[0].ptb_parse
37 |
38 | return None
39 |
40 |
41 | class NegBioParser(Bllip):
42 | def parse_doc(self, document):
43 | """
44 | Parse sentences in BioC format
45 |
46 | Args:
47 | document(BioCDocument): one document
48 |
49 | Returns:
50 | BioCDocument
51 | """
52 | for passage in document.passages:
53 | for sentence in passage.sentences:
54 | text = sentence.text
55 | tree = self.parse(text)
56 | if tree:
57 | sentence.infons['parse tree'] = str(tree)
58 | else:
59 | sentence.infons['parse tree'] = None
60 |                     logging.warning(
61 | 'No parse tree for sentence: %s', sentence.offset)
62 | return document
63 |
--------------------------------------------------------------------------------
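
A minimal sketch of driving the parser; when no model_dir is given, the first construction downloads the GENIA+PubMed model into the system temp directory, so this assumes network access and an installed bllipparser:

    from negbio.pipeline.parse import NegBioParser

    parser = NegBioParser()  # downloads GENIA+PubMed on first use if needed
    tree = parser.parse('No pneumothorax or pleural effusion.')
    if tree is not None:
        print(str(tree))     # bracketed PTB parse
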
/negbio/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 |
2 | from negbio.pipeline import cleanup, dner_mm, negdetect, section_split, text2bioc
3 | from negbio.ext import normalize_mimiccxr
4 |
5 |
6 | def process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns):
 7 |     for i, document in enumerate(collection.documents):
 8 |         normalize_mimiccxr.normalize(document)
 9 |         collection.documents[i] = document = section_split.split_document(document, sec_title_patterns)
10 |         splitter.split_doc(document)
11 | 
12 |     dner_mm.run_metamap_col(collection, metamap, cuis)
13 | 
14 |     for document in collection.documents:
15 |         document = parser.parse_doc(document)
16 |         document = ptb2dep.convert_doc(document)
17 |         document = negdetect.detect(document, neg_detector)
18 |         cleanup.clean_sentences(document)
19 |
20 | return collection
21 |
22 |
23 | def process_text(sources, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns):
24 | collection = text2bioc.text2collection(*sources)
25 | return process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns)
26 |
--------------------------------------------------------------------------------
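
A hedged wiring sketch for process_text. The MetaMap binary path and the input file names are hypothetical, and Detector() is assumed to be constructible with its default pattern files; otherwise pass explicit paths to the negbio/patterns/*.txt files:

    from pymetamap import MetaMap
    from negbio.neg.neg_detector import Detector
    from negbio.pipeline import pipeline
    from negbio.pipeline.parse import NegBioParser
    from negbio.pipeline.ptb2ud import Lemmatizer, NegBioPtb2DepConverter
    from negbio.pipeline.ssplit import NegBioSSplitter

    mm = MetaMap.get_instance('/opt/public_mm/bin/metamap')  # hypothetical path
    splitter = NegBioSSplitter(newline=True)
    parser = NegBioParser()
    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    detector = Detector()  # assumed default pattern files

    collection = pipeline.process_text(
        ['report1.txt', 'report2.txt'],  # hypothetical input files
        mm, splitter, parser, ptb2dep, lemmatizer, detector,
        cuis=None, sec_title_patterns=None)
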
/negbio/pipeline/ptb2ud.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import StanfordDependencies
4 | import bioc
5 | from nltk.corpus import wordnet
6 | from nltk.stem.wordnet import WordNetLemmatizer
7 | from nltk.tag.mapping import tagset_mapping
8 |
9 |
10 | class Lemmatizer(object):
11 | def __init__(self):
12 | self.wordnet_lemmatizer = WordNetLemmatizer()
13 | self.mapping = tagset_mapping('en-ptb', 'universal')
14 |
15 | def lemmatize(self, word, pos=None):
16 | """
17 | Determines the lemma for a given word
18 |
19 | Args:
20 | word(str): word
21 | pos(str): part-of-speech
22 |
23 | Returns:
24 | str: lemma
25 | """
26 | if pos:
27 | return self.wordnet_lemmatizer.lemmatize(word=word, pos=pos)
28 | else:
29 | return self.wordnet_lemmatizer.lemmatize(word=word)
30 |
31 | def map_tag(self, tag):
32 | if tag in self.mapping:
33 | tag = self.mapping[tag]
34 | if tag == 'NOUN':
35 | return wordnet.NOUN
36 | elif tag == 'VERB':
37 | return wordnet.VERB
38 | elif tag == 'ADJ':
39 | return wordnet.ADJ
40 | elif tag == 'ADV':
41 | return wordnet.ADV
42 | elif tag == 'ADJ_SAT':
43 | return wordnet.ADJ_SAT
44 | return None
45 |
46 |
47 | class Ptb2DepConverter(object):
48 | """
49 | Convert ptb trees to universal dependencies
50 | """
51 |
52 | basic = 'basic'
53 | collapsed = 'collapsed'
54 | CCprocessed = 'CCprocessed'
55 | collapsedTree = 'collapsedTree'
56 |
57 | def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
58 | """
59 | Args:
60 | representation(str): Currently supported representations are
61 | 'basic', 'collapsed', 'CCprocessed', and 'collapsedTree'
62 | universal(bool): if True, use universal dependencies if they're available
63 | """
64 | try:
65 | import jpype
66 | self._backend = 'jpype'
67 | except ImportError:
68 | self._backend = 'subprocess'
69 | self.lemmatizer = lemmatizer
70 | self.__sd = StanfordDependencies.get_instance(backend=self._backend)
71 | self.representation = representation
72 | self.universal = universal
73 |
74 | def convert(self, parse_tree):
75 | """
76 |         Convert a single PTB parse tree to a dependency graph
77 |
78 | Args:
79 | parse_tree(str): parse tree in PTB format
80 |
81 | Examples:
82 | (ROOT (NP (JJ hello) (NN world) (. !)))
83 | """
84 | if self._backend == 'jpype':
85 | dependency_graph = self.__sd.convert_tree(parse_tree,
86 | representation=self.representation,
87 | universal=self.universal,
88 | add_lemmas=True)
89 | else:
90 | dependency_graph = self.__sd.convert_tree(parse_tree,
91 | representation=self.representation,
92 | universal=self.universal)
93 | return dependency_graph
94 |
95 |
96 | class NegBioPtb2DepConverter(Ptb2DepConverter):
97 | def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
98 | """
99 | Args:
100 | lemmatizer (Lemmatizer)
101 | """
102 | super(NegBioPtb2DepConverter, self).__init__(
103 | lemmatizer, representation, universal)
104 |
105 | def convert_doc(self, document):
106 | for passage in document.passages:
107 | for sentence in passage.sentences:
108 | # check for empty infons, don't process if empty
109 | # this sometimes happens with poorly tokenized sentences
110 | if not sentence.infons:
111 | continue
112 |                 elif not sentence.infons.get('parse tree'):
113 | continue
114 |
115 | try:
116 | dependency_graph = self.convert(
117 | sentence.infons['parse tree'])
118 | anns, rels = convert_dg(dependency_graph, sentence.text,
119 | sentence.offset,
120 | has_lemmas=self._backend == 'jpype')
121 | sentence.annotations = anns
122 | sentence.relations = rels
123 | except KeyboardInterrupt:
124 | raise
125 |                 except Exception:
126 | logging.exception(
127 | "Cannot process sentence %d in %s", sentence.offset, document.id)
128 |
129 | if self._backend != 'jpype':
130 | for ann in sentence.annotations:
131 | text = ann.text
132 | pos = ann.infons['tag']
133 | pos = self.lemmatizer.map_tag(pos)
134 | lemma = self.lemmatizer.lemmatize(word=text, pos=pos)
135 | ann.infons['lemma'] = lemma.lower()
136 | return document
137 |
138 |
139 | def adapt_value(value):
140 | """
141 | Adapt string in PTB
142 | """
143 | value = value.replace("-LRB-", "(")
144 | value = value.replace("-RRB-", ")")
145 | value = value.replace("-LSB-", "[")
146 | value = value.replace("-RSB-", "]")
147 | value = value.replace("-LCB-", "{")
148 | value = value.replace("-RCB-", "}")
149 | value = value.replace("-lrb-", "(")
150 | value = value.replace("-rrb-", ")")
151 | value = value.replace("-lsb-", "[")
152 | value = value.replace("-rsb-", "]")
153 | value = value.replace("``", "\"")
154 | value = value.replace("''", "\"")
155 | value = value.replace("`", "'")
156 | return value
157 |
158 |
159 | def convert_dg(dependency_graph, text, offset, ann_index=0, rel_index=0, has_lemmas=True):
160 | """
161 |     Convert a dependency graph to BioC annotations and relations; returns None if the tokens cannot be aligned with the text
162 | """
163 | annotations = []
164 | relations = []
165 | annotation_id_map = {}
166 | start = 0
167 | for node in dependency_graph:
168 | if node.index in annotation_id_map:
169 | continue
170 | node_form = node.form
171 | index = text.find(node_form, start)
172 | if index == -1:
173 | node_form = adapt_value(node.form)
174 | index = text.find(node_form, start)
175 | if index == -1:
176 | logging.debug('Cannot convert parse tree to dependency graph at %d\n%d\n%s',
177 | start, offset, str(dependency_graph))
178 | return
179 |
180 | ann = bioc.BioCAnnotation()
181 | ann.id = 'T{}'.format(ann_index)
182 | ann.text = node_form
183 | ann.infons['tag'] = node.pos
184 | if has_lemmas:
185 | ann.infons['lemma'] = node.lemma.lower()
186 |
187 | start = index
188 |
189 | ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
190 | annotations.append(ann)
191 | annotation_id_map[node.index] = ann_index
192 | ann_index += 1
193 | start += len(node_form)
194 |
195 | for node in dependency_graph:
196 | if node.head == 0:
197 | ann = annotations[annotation_id_map[node.index]]
198 | ann.infons['ROOT'] = True
199 | continue
200 | relation = bioc.BioCRelation()
201 | relation.id = 'R{}'.format(rel_index)
202 | relation.infons['dependency'] = node.deprel
203 | if node.extra:
204 | relation.infons['extra'] = node.extra
205 | relation.add_node(bioc.BioCNode('T{}'.format(
206 | annotation_id_map[node.index]), 'dependant'))
207 | relation.add_node(bioc.BioCNode('T{}'.format(
208 | annotation_id_map[node.head]), 'governor'))
209 | relations.append(relation)
210 | rel_index += 1
211 |
212 | return annotations, relations
213 |
--------------------------------------------------------------------------------
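
A self-contained sketch of convert_dg using stand-in token objects; the attribute names (index, form, pos, lemma, head, deprel, extra) mirror the tokens PyStanfordDependencies returns, with head == 0 marking the root:

    import collections

    from negbio.pipeline.ptb2ud import convert_dg

    Token = collections.namedtuple('Token', 'index form pos lemma head deprel extra')
    dg = [Token(1, 'hello', 'UH', 'hello', 2, 'discourse', None),
          Token(2, 'world', 'NN', 'world', 0, 'root', None)]

    anns, rels = convert_dg(dg, 'hello world', offset=0)
    assert anns[1].infons.get('ROOT') is True
    assert rels[0].infons['dependency'] == 'discourse'
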
/negbio/pipeline/scan.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 |
5 | import bioc
6 | import tqdm
7 |
8 |
9 | def scan_document(*_, **kwargs):
10 |     """
11 |     Scan each document in a list of BioC source files, apply fn, and
12 |     write each resulting collection to the output directory.
13 | 
14 |     Args:
15 |         kwargs:
16 |             source(list): a list of source pathnames
17 |             directory(str): output directory
18 |             suffix(str): suffix appended to each output file name,
19 |                 e.g. '.secsplit.xml'
20 |             fn: a callable invoked as fn(document, *non_sequences);
21 |                 its return value replaces the original document in
22 |                 the collection
23 |             non_sequences(list): extra arguments passed to fn after
24 |                 the document; defaults to []
25 |             verbose(boolean): if True, show a progress bar
26 |     """
27 | source = kwargs.pop('source')
28 | verbose = kwargs.pop('verbose', True)
29 | directory = os.path.expanduser(kwargs.pop('directory'))
30 | suffix = kwargs.pop('suffix')
31 | fn = kwargs.pop('fn')
32 | non_sequences = kwargs.pop('non_sequences', [])
33 |
34 | if not os.path.exists(directory):
35 | os.makedirs(directory)
36 |
37 |     def catch(document, non_sequences):
38 |         try:
39 |             return fn(document, *non_sequences)
40 |         except Exception:
41 |             logging.exception('Cannot process %s', document.id)
42 |             return document  # keep the unprocessed document on failure
43 | for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
44 | basename = os.path.splitext(os.path.basename(pathname))[0]
45 | dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
46 | with io.open(pathname, encoding='utf8') as fp:
47 | collection = bioc.load(fp)
48 | collection.documents = [catch(doc, non_sequences) for doc in collection.documents]
49 | with io.open(dstname, 'w', encoding='utf8') as fp:
50 | bioc.dump(collection, fp)
51 |
52 |
53 | def scan_collection(*_, **kwargs):
54 |     """
55 |     Scan each collection in a list of BioC source files, apply fn, and
56 |     write each resulting collection to the output directory.
57 | 
58 |     Args:
59 |         kwargs:
60 |             source(list): a list of source pathnames
61 |             directory(str): output directory
62 |             suffix(str): suffix appended to each output file name,
63 |                 e.g. '.neg.xml'
64 |             fn: a callable invoked as fn(collection, *non_sequences);
65 |                 it should modify the collection in place; the
66 |                 collection is written out even if fn raises
67 |             non_sequences(list): extra arguments passed to fn after
68 |                 the collection; defaults to []
69 |             verbose(boolean): if True, show a progress bar
70 |     """
71 | source = kwargs.pop('source')
72 | verbose = kwargs.pop('verbose', True)
73 | directory = os.path.expanduser(kwargs.pop('directory'))
74 | suffix = kwargs.pop('suffix')
75 | fn = kwargs.pop('fn')
76 | non_sequences = kwargs.pop('non_sequences', [])
77 |
78 | if not os.path.exists(directory):
79 | os.makedirs(directory)
80 |
81 | for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
82 | basename = os.path.splitext(os.path.basename(pathname))[0]
83 | dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
84 | with io.open(pathname, encoding='utf8') as fp:
85 | collection = bioc.load(fp)
86 | try:
87 | args = [collection] + non_sequences
88 | fn(*args)
89 |         except Exception:
90 | logging.exception('Cannot process %s', collection.source)
91 | with io.open(dstname, 'w', encoding='utf8') as fp:
92 | bioc.dump(collection, fp)
93 |
--------------------------------------------------------------------------------
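
A usage sketch for scan_document; the file and directory names are hypothetical. Each BioC file in source is loaded, every document is passed through fn with the non_sequences appended, and the collection is written to directory with the given suffix:

    from negbio.pipeline.scan import scan_document
    from negbio.pipeline.section_split import split_document

    scan_document(source=['reports/1.xml', 'reports/2.xml'],  # hypothetical inputs
                  directory='sections',
                  suffix='.secsplit.xml',
                  fn=split_document,
                  non_sequences=[None])  # pattern=None -> default SECTION_TITLES
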
/negbio/pipeline/section_split.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 |
4 | import bioc
5 |
6 |
7 | SECTION_TITLES = re.compile(r'('
8 | r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
9 | r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
10 | r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
11 | r'|TECHNIQUE'
12 | r'):|FINAL REPORT',
13 | re.IGNORECASE | re.MULTILINE)
14 |
15 |
16 | def is_empty(passage):
17 | return len(passage.text) == 0
18 |
19 |
20 | def strip(passage):
21 | start = 0
22 | while start < len(passage.text) and passage.text[start].isspace():
23 | start += 1
24 |
25 | end = len(passage.text)
26 | while end > start and passage.text[end - 1].isspace():
27 | end -= 1
28 |
29 | passage.offset += start
30 |     logging.debug('before: %r', passage.text)
31 |     passage.text = passage.text[start:end]
32 |     logging.debug('after: %r', passage.text)
33 | return passage
34 |
35 |
36 | def split_document(document, pattern=None):
37 |     """
38 |     Split one report into sections delimited by the section title pattern.
39 | 
40 |     Args:
41 |         document(BioCDocument): one document whose first passage holds the full report text.
42 |         pattern: compiled regular expression matching section titles; defaults to SECTION_TITLES.
43 | 
44 |     Returns:
45 |         BioCDocument: a new BioCDocument instance
46 |     """
47 | if pattern is None:
48 | pattern = SECTION_TITLES
49 |
50 | new_document = bioc.BioCDocument()
51 | new_document.id = document.id
52 | new_document.infons = document.infons
53 |
54 | text = document.passages[0].text
55 | offset = document.passages[0].offset
56 |
57 | def create_passage(start, end, title=None):
58 | passage = bioc.BioCPassage()
59 | passage.offset = start + offset
60 | passage.text = text[start:end]
61 | if title is not None:
62 | passage.infons['title'] = title[:-1].strip() if title[-1] == ':' else title.strip()
63 | passage.infons['type'] = 'title_1'
64 | strip(passage)
65 | return passage
66 |
67 | start = 0
68 | for matcher in pattern.finditer(text):
69 | logging.debug('Match: %s', matcher.group())
70 | # add last
71 | end = matcher.start()
72 | if end != start:
73 | passage = create_passage(start, end)
74 | if not is_empty(passage):
75 | new_document.add_passage(passage)
76 |
77 | start = end
78 |
79 | # add title
80 | end = matcher.end()
81 | passage = create_passage(start, end, text[start:end])
82 | if not is_empty(passage):
83 | new_document.add_passage(passage)
84 |
85 | start = end
86 |
87 | # add last piece
88 | end = len(text)
89 | if start < end:
90 | passage = create_passage(start, end)
91 | if not is_empty(passage):
92 | new_document.add_passage(passage)
93 | return new_document
94 |
--------------------------------------------------------------------------------
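
A minimal sketch: wrap raw text in a one-passage document with text2document (defined in negbio/pipeline/text2bioc.py below) and split it on the default titles:

    from negbio.pipeline.section_split import split_document
    from negbio.pipeline.text2bioc import text2document

    text = 'INDICATION: cough.\nFINDINGS: no effusion.\nIMPRESSION: normal.'
    document = split_document(text2document('example', text))
    for passage in document.passages:
        print(passage.offset, passage.infons.get('title'), repr(passage.text))
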
/negbio/pipeline/ssplit.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import bioc
4 |
5 |
6 | class NltkSSplitter(object):
7 | """NLTK sentence splitter"""
8 |
9 | def __init__(self, **kwargs):
10 | self.newline = kwargs.pop('newline', False)
11 |
12 | def split(self, text, **kwargs):
13 | import nltk
14 | if not text:
15 | return
16 |
17 | if self.newline:
18 | line_splitter = self.split_line
19 | else:
20 | line_splitter = self.no_split
21 |
22 | for line, line_offset in line_splitter(text):
23 | sent_list = nltk.sent_tokenize(line)
24 | offset = 0
25 | for sent in sent_list:
26 |                 offset = line.find(sent, offset)
27 |                 if offset == -1:
28 |                     logging.debug('Cannot find %s in %s', sent, text)
29 |                     continue
30 |                 yield sent, offset + line_offset
31 |                 offset += len(sent)
32 | @classmethod
33 | def split_line(cls, text, sep='\n'):
34 | lines = text.split(sep)
35 | offset = 0
36 | for line in lines:
37 | offset = text.index(line, offset)
38 | yield line, offset
39 |
40 | @classmethod
41 | def no_split(cls, text, **kwargs):
42 | yield text, 0
43 |
44 | def __repr__(self):
45 | return 'NLTK SSplitter'
46 |
47 |
48 | class NegBioSSplitter(NltkSSplitter):
49 | def split_doc(self, document):
50 | """
51 | Split text into sentences with offsets.
52 |
53 |         Args:
54 | document(BioCDocument): one document
55 |
56 | Returns:
57 | BioCDocument
58 | """
59 | for passage in document.passages:
60 | for text, offset in self.split(passage.text):
61 | sentence = bioc.BioCSentence()
62 | sentence.offset = offset + passage.offset
63 | sentence.text = text
64 | passage.add_sentence(sentence)
65 | # passage.text = None
66 | return document
67 |
--------------------------------------------------------------------------------
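
A minimal sentence-splitting sketch; it assumes the NLTK 'punkt' model is installed (see the downloader step in setup.py below):

    from negbio.pipeline.ssplit import NegBioSSplitter
    from negbio.pipeline.text2bioc import text2document

    splitter = NegBioSSplitter(newline=True)
    document = splitter.split_doc(
        text2document('example', 'No effusion. Heart size is normal.'))
    for sentence in document.passages[0].sentences:
        print(sentence.offset, sentence.text)
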
/negbio/pipeline/text2bioc.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import string
3 | from pathlib2 import Path
4 |
5 | import bioc
6 |
7 |
8 | def printable(s, func=None):
 9 |     """
10 |     Return the printable-ASCII version of s.
11 | 
12 |     Args:
13 |         s(str): string
14 |         func: function used to convert non-printable characters; if None, they are dropped with a warning
15 |     """
16 | out = ''
17 | for c in s:
18 | if c in string.printable:
19 | out += c
20 | elif func is not None:
21 | out += func(c)
22 | else:
23 | logging.warning('Cannot convert char: %s', c)
24 | return out
25 |
26 |
27 | def text2document(id, text):
28 | """
29 | Convert text to a BioCDocument instance
30 |
31 | Args:
32 | id (str): BioCDocument id
33 | text (str): text
34 |
35 | Returns:
36 | BioCDocument: a BioCDocument instance
37 | """
38 | document = bioc.BioCDocument()
39 | document.id = id
40 | text = printable(text).replace('\r\n', '\n')
41 |
42 | passage = bioc.BioCPassage()
43 | passage.offset = 0
44 | passage.text = text
45 | document.add_passage(passage)
46 |
47 | return document
48 |
49 |
50 | def text2collection(*sources):
51 | """
52 | Returns a BioCCollection containing documents specified in sources.
53 |
54 | Args:
55 |         sources: pathnames of the input text files
56 | """
57 |
58 | collection = bioc.BioCCollection()
59 |     for pathname in sources:
60 | logging.debug('Process %s', pathname)
61 | try:
62 | with open(pathname) as fp:
63 | text = fp.read()
64 | id = Path(pathname).stem
65 | document = text2document(id, text)
66 | collection.add_document(document)
67 |         except Exception:
68 | logging.exception('Cannot convert %s', pathname)
69 | return collection
70 |
71 |
--------------------------------------------------------------------------------
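
A small sketch of printable: characters outside string.printable are dropped with a warning unless a converter function is supplied:

    from negbio.pipeline.text2bioc import printable

    assert printable(u'50\u00b0 fever') == '50 fever'
    assert printable(u'50\u00b0 fever', func=lambda c: '?') == '50? fever'
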
/requirements.txt:
--------------------------------------------------------------------------------
1 | future==0.16.0
2 | docutils==0.14
3 | docopt==0.6.2
4 | pytest==4.4.1
5 | networkx==1.11
6 | ply==3.10
7 | tqdm==4.19.5
8 | nltk==3.6.6
9 | bioc==1.3.1
10 | pystanforddependencies==0.3.1
11 | bllipparser==2016.9.11
12 | pymetamap==0.1
13 | JPype1>=0.6.3
14 | pathlib2==2.3.3
15 | numpy==1.21.0
16 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Always prefer setuptools over distutils.
2 | # Use codecs.open to read files with a consistent encoding.
3 | from __future__ import print_function
4 | from codecs import open
5 | import os
6 | from subprocess import check_call
7 |
8 | from setuptools import setup, find_packages
9 | from setuptools.command.develop import develop
10 | from setuptools.command.egg_info import egg_info
11 | from setuptools.command.install import install
12 |
13 | here = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
14 |
15 |
16 | def readme():
17 | # Get the long description from the README file
18 | with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
19 | return f.read()
20 |
21 |
22 | def read_requirements():
23 | """parses requirements from requirements.txt"""
24 | reqs_path = os.path.join(here, 'requirements.txt')
25 | with open(reqs_path, encoding='utf8') as f:
26 | reqs = [line.strip() for line in f if not line.strip().startswith('#')]
27 |
28 | names = []
29 | links = []
30 | for req in reqs:
31 | if '://' in req:
32 | links.append(req)
33 | else:
34 | names.append(req)
35 | return {'install_requires': names, 'dependency_links': links}
36 |
37 |
38 | def custom_command():
39 | check_call("python -m nltk.downloader universal_tagset punkt wordnet".split())
40 |
41 |
42 | class CustomInstallCommand(install):
43 | def run(self):
44 | custom_command()
45 | install.run(self)
46 |
47 |
48 | class CustomDevelopCommand(develop):
49 | def run(self):
50 | custom_command()
51 | develop.run(self)
52 |
53 |
54 | class CustomEggInfoCommand(egg_info):
55 | def run(self):
56 | custom_command()
57 | egg_info.run(self)
58 |
59 |
60 | setup(
61 | name='negbio',
62 |
63 | # Versions should comply with PEP440. For a discussion on single-sourcing
64 | # the version across setup.py and the project code, see
65 | # https://packaging.python.org/en/latest/single_source_version.html
66 | version='0.9.4',
67 |
68 | description='NegBio: a tool for negation and uncertainty detection',
69 | long_description=readme(),
70 |
71 | # The project's main homepage.
72 | url='https://github.com/ncbi-nlp/NegBio.git',
73 |
74 | # Author details
75 | author='Yifan Peng',
76 | author_email='yifan.peng@nih.gov',
77 |
78 | license='Public Domain',
79 |
80 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
81 | classifiers=[
82 | 'Development Status :: 3 - Alpha',
83 | # Indicate who your project is intended for
84 | 'Intended Audience :: Developers',
85 | 'Intended Audience :: Science/Research',
86 |
87 | # Pick your license as you wish (should match "license" above)
88 | 'License :: Public Domain',
89 |
90 | 'Operating System :: MacOS',
91 | 'Operating System :: POSIX',
92 | 'Operating System :: POSIX :: Linux',
93 |
94 | # Specify the Python versions you support here.
95 | 'Programming Language :: Python',
96 | 'Topic :: Software Development',
97 | 'Topic :: Software Development :: Libraries :: Application Frameworks',
98 | ],
99 |
100 | keywords='negbio',
101 |
102 | packages=find_packages(exclude=["tests.*", "tests", "backup", "docs"]),
103 | include_package_data=True,
104 |
105 | cmdclass={
106 | 'install': CustomInstallCommand,
107 | 'develop': CustomDevelopCommand,
108 | 'egg_info': CustomEggInfoCommand
109 | },
110 |
111 |     entry_points={
112 | 'console_scripts': ['negbio_pipeline=negbio.negbio_pipeline:main',
113 | 'main_chexpert=negbio.main_chexpert:main',
114 | 'main_mm=negbio.main_mm:main'],
115 | },
116 |
117 | **read_requirements()
118 | )
119 |
--------------------------------------------------------------------------------
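
The custom install/develop/egg_info commands above shell out to the NLTK downloader; the equivalent in-process calls, should the subprocess route be unavailable, are:

    import nltk

    for package in ('universal_tagset', 'punkt', 'wordnet'):
        nltk.download(package)
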
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/__init__.py
--------------------------------------------------------------------------------
/tests/context.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import os
4 | import sys
5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6 |
7 | import bioc
--------------------------------------------------------------------------------
/tests/negbio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/ngrex/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/ngrex/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/ngrex/test_parser.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from negbio import ngrex
4 | from negbio.ngrex import parser
5 | from ply.lex import LexToken
6 |
7 |
8 | def test_lex():
9 | _test_lex('{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}')
10 | _test_lex('{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}')
11 | _test_lex('{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key')
12 | with pytest.raises(TypeError):
13 | _test_yacc("xxx")
14 |
15 |
16 | def _test_lex(s):
17 | parser.lexer.input(s)
18 | for tok in parser.lexer:
19 | print(tok)
20 |
21 |
22 | def test_yacc():
23 | # _test_yacc("{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}")
24 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} {lemma:/yyy/}")
25 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} ({lemma:/yyy/} >{} {lemma:/zzz/})")
26 | # _test_yacc("{} >{} {lemma:/left/} <{} {lemma:/question/}")
27 | # _test_yacc("{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}")
28 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key")
29 | with pytest.raises(KeyError):
30 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=t")
31 |
32 |
33 | def _test_yacc(s):
34 | pattern = ngrex.compile(s)
35 | print(pattern)
36 |
37 |
38 | if __name__ == '__main__':
39 | test_lex()
40 | test_yacc()
41 |
--------------------------------------------------------------------------------
/tests/negbio/ngrex/test_pattern.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from negbio import ngrex
3 |
4 |
5 | def get_graph():
6 | G = nx.DiGraph()
7 | G.add_node('xxx', attr_dict={'lemma': 'xxx'})
8 | G.add_node('yyy', attr_dict={'lemma': 'yyy'})
9 | G.add_node('zzz', attr_dict={'lemma': 'zzz'})
10 | G.add_edge('xxx', 'yyy', attr_dict={'dependency': 'aaa'})
11 | G.add_edge('yyy', 'zzz', attr_dict={'dependency': 'bbb'})
12 | G.add_edge('xxx', 'zzz', attr_dict={'dependency': 'ccc'})
13 | return G
14 |
15 |
16 | def helper(G, p, expected):
17 | pattern = ngrex.compile(p)
18 | print(pattern.pattern)
19 | # actual = {m.group(0) for m in pattern.finditer(G)}
20 | actual = set()
21 | for m in pattern.finditer(G):
22 | actual.add(m.group(0))
23 | assert actual == expected, '{} vs {}'.format(actual, expected)
24 |
25 |
26 | def test_regex():
27 | G = get_graph()
28 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'})
29 |
30 |
31 | def test_attribute():
32 | G = get_graph()
33 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'})
34 | helper(G, '{} >{tag:/aaa|bbb/} {}', set())
35 |
36 |
37 | def test_relation():
38 | G = get_graph()
39 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} {lemma:/yyy/}', {'xxx'})
40 | helper(G, '{lemma:/yyy/} <{dependency:/aaa/} {lemma:/xxx/}', {'yyy'})
41 | helper(G, '{} >{} {}', {'xxx', 'yyy'})
42 |
43 |
44 | def test_relation_next():
45 | G = get_graph()
46 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} ({lemma:/yyy/} >{dependency:/bbb/} {lemma:/zzz/})',
47 | {'xxx'})
48 |
49 |
50 | def test_relation_conj():
51 | G = get_graph()
52 | helper(G, '{} >{} {lemma:/yyy/} >{} {lemma:/zzz/}', {'xxx'})
53 | helper(G, '{} >{} {lemma:/yyy/} <{} {lemma:/zzz/}', set())
54 |
55 |
56 | def test_relation_disj():
57 | G = get_graph()
58 | helper(G, '{} >{dependency:/aaa/} {} | >{dependency:/bbb/} {}', {'xxx', 'yyy'})
59 |
60 |
61 | def test_variables():
62 | G = get_graph()
63 | pattern = ngrex.compile('{}=t >{dependency:/aaa|bbb/} {}')
64 | print(pattern.pattern)
65 | actual = {m.get('t') for m in pattern.finditer(G)}
66 | assert actual == {'xxx', 'yyy'}
67 |
68 |
69 | if __name__ == '__main__':
70 | # test_relation()
71 | # test_relation_next()
72 | test_relation_conj()
73 | # test_relation_disj()
74 | # test_regex()
75 | # test_attribute()
76 | # test_variables()
77 |
--------------------------------------------------------------------------------
/tests/negbio/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/pipeline/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/pipeline/test_parse.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from negbio.pipeline.parse import Bllip
4 |
5 |
6 | def test_Bllip():
7 | b = Bllip()
8 | t = b.parse('hello world!')
9 | assert str(t) == '(S1 (S (NP (NN hello) (NN world) (NN !))))'
10 |
11 |
12 | if __name__ == '__main__':
13 | logging.basicConfig(level=logging.WARNING)
14 | test_Bllip()
15 |
--------------------------------------------------------------------------------
/tests/negbio/test_cli.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from docopt import docopt
4 |
5 | from negbio import negbio_pipeline, negbio_text2bioc, negbio_ssplit, negbio_section_split, negbio_parse
6 |
7 |
8 | def test_negbio():
9 | doc = negbio_pipeline.__doc__
10 | args = docopt(doc, 'text2bioc a b c'.split())
11 |     assert args['<command>'] == 'text2bioc'
12 |     assert args['<argv>'] == ['a', 'b', 'c']
13 |
14 |
15 | def test_text2bioc():
16 | doc = negbio_text2bioc.__doc__
17 | args = docopt(doc, 'text2bioc --verbose --output=out a b c'.split())
18 | assert args['--verbose']
19 | assert args['--output'] == 'out'
20 |     assert args['<file>'] == ['a', 'b', 'c']
21 | args = docopt(doc, 'text2bioc --output=out a b c'.split())
22 | assert not args['--verbose']
23 |
24 |
25 | def test_ssplit():
26 | doc = negbio_ssplit.__doc__
27 | args = docopt(doc, 'ssplit --suffix suffix --newline_is_sentence_break --output out a b c'.split())
28 | assert args['--newline_is_sentence_break']
29 | assert args['--output'] == 'out'
30 | assert args['--suffix'] == 'suffix'
31 |     assert args['<file>'] == ['a', 'b', 'c']
32 |
33 |
34 | def test_section_split():
35 | doc = negbio_section_split.__doc__
36 | args = docopt(doc, 'section_split --pattern pattern --output out a b c'.split())
37 | assert args['--output'] == 'out'
38 | assert args['--pattern'] == 'pattern'
39 |     assert args['<file>'] == ['a', 'b', 'c']
40 |
41 |
42 | def test_parse():
43 | doc = negbio_parse.__doc__
44 | args = docopt(doc, 'parse --model model --output out a b c'.split())
45 | assert args['--output'] == 'out'
46 | assert args['--model'] == 'model'
47 |     assert args['<file>'] == ['a', 'b', 'c']
48 |
49 |
50 | if __name__ == '__main__':
51 | logging.basicConfig(level=logging.WARNING)
52 | test_ssplit()
53 |
--------------------------------------------------------------------------------