├── .gitignore ├── .pylintrc ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── README ├── acknowledgments.rst ├── conf.py ├── contributing.rst ├── developer_guide.rst ├── disclaimer.rst ├── getting_started.rst ├── index.rst ├── license.rst ├── make.bat ├── reference.rst └── user_guide.rst ├── environment2.7.yml ├── environment3.7.yml ├── examples ├── 00000086.txt ├── 00019248.txt ├── 1.xml ├── 2.xml ├── cuis-cvpr2017.txt ├── openi-testset.txt └── openi_gld_std14.csv ├── images └── negbio.png ├── negbio ├── __init__.py ├── chexpert │ ├── LICENSE │ ├── __init__.py │ ├── constants.py │ ├── patterns │ │ ├── negation.txt │ │ ├── post_negation_uncertainty.txt │ │ └── pre_negation_uncertainty.txt │ ├── phrases │ │ ├── mention │ │ │ ├── airspace_opacity.txt │ │ │ ├── atelectasis.txt │ │ │ ├── cardiomegaly.txt │ │ │ ├── consolidation.txt │ │ │ ├── edema.txt │ │ │ ├── enlarged_cardiomediastinum.txt │ │ │ ├── fracture.txt │ │ │ ├── lung_lesion.txt │ │ │ ├── no_finding.txt │ │ │ ├── pleural_effusion.txt │ │ │ ├── pleural_other.txt │ │ │ ├── pneumonia.txt │ │ │ ├── pneumothorax.txt │ │ │ └── support_devices.txt │ │ └── unmention │ │ │ ├── airspace_opacity.txt │ │ │ ├── lung_lesion.txt │ │ │ └── pleural_effusion.txt │ └── stages │ │ ├── __init__.py │ │ ├── aggregate.py │ │ ├── classify.py │ │ ├── extract.py │ │ └── load.py ├── cli_utils.py ├── compat.py ├── ext │ ├── __init__.py │ └── normalize_mimiccxr.py ├── main_chexpert.py ├── main_mm.py ├── neg │ ├── __init__.py │ ├── neg_detector.py │ ├── propagator.py │ ├── semgraph.py │ └── utils.py ├── negbio_clean.py ├── negbio_dner_chexpert.py ├── negbio_dner_matamap.py ├── negbio_neg.py ├── negbio_neg_chexpert.py ├── negbio_normalize.py ├── negbio_parse.py ├── negbio_pipeline.py ├── negbio_ptb2ud.py ├── negbio_section_split.py ├── negbio_ssplit.py ├── negbio_text2bioc.py ├── ngrex │ ├── __init__.py │ ├── parser.out │ ├── parser.py │ ├── parsetab.py │ └── pattern.py ├── patterns │ ├── neg_patterns.txt │ ├── section_titles.txt │ └── uncertainty_patterns.txt └── pipeline │ ├── __init__.py │ ├── cleanup.py │ ├── dner_mm.py │ ├── negdetect.py │ ├── parse.py │ ├── pipeline.py │ ├── ptb2ud.py │ ├── scan.py │ ├── section_split.py │ ├── ssplit.py │ └── text2bioc.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── context.py └── negbio ├── __init__.py ├── ngrex ├── __init__.py ├── test_parser.py └── test_pattern.py ├── pipeline ├── __init__.py └── test_parse.py └── test_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | 3 | .pytest_cache/ 4 | backup 5 | examples-local 6 | .DS_store 7 | 8 | ### Python template 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # dotenv 91 | .env 92 | 93 | # virtualenv 94 | .venv 95 | venv/ 96 | ENV/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | ### JetBrains template 104 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 105 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 106 | 107 | # User-specific stuff: 108 | .idea 109 | .idea/**/workspace.xml 110 | .idea/**/tasks.xml 111 | .idea/dictionaries 112 | 113 | # Sensitive or high-churn files: 114 | .idea/**/dataSources/ 115 | .idea/**/dataSources.ids 116 | .idea/**/dataSources.xml 117 | .idea/**/dataSources.local.xml 118 | .idea/**/sqlDataSources.xml 119 | .idea/**/dynamic.xml 120 | .idea/**/uiDesigner.xml 121 | 122 | # Gradle: 123 | .idea/**/gradle.xml 124 | .idea/**/libraries 125 | 126 | # Mongo Explorer plugin: 127 | .idea/**/mongoSettings.xml 128 | 129 | ## File-based project format: 130 | *.iws 131 | 132 | ## Plugin-specific files: 133 | 134 | # IntelliJ 135 | /out/ 136 | 137 | # mpeltonen/sbt-idea plugin 138 | .idea_modules/ 139 | 140 | # JIRA plugin 141 | atlassian-ide-plugin.xml 142 | 143 | # Crashlytics plugin (for Android Studio and IntelliJ) 144 | com_crashlytics_export_strings.xml 145 | crashlytics.properties 146 | crashlytics-build.properties 147 | fabric.properties 148 | 149 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | install: 5 | pip install -r requirements.txt 6 | # - sudo apt-get update 7 | # # We do this conditionally because it saves us some downloading if the 8 | # # version is the same. 9 | # - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 10 | # wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 11 | # else 12 | # wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | # fi 14 | # - bash miniconda.sh -b -p $HOME/miniconda 15 | # - export PATH="$HOME/miniconda/bin:$PATH" 16 | # - hash -r 17 | # - conda config --set always_yes yes --set changeps1 no 18 | # - conda update -q conda 19 | # # Useful for debugging any issues with conda 20 | # - conda info -a 21 | # 22 | # # Replace dep1 dep2 ... 
with your dependencies 23 | # - conda env create --file environment2.7.yml 24 | # - source activate negbio2.7 25 | 26 | script: 27 | - py.test 28 | 29 | notifications: 30 | email: false -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | This project adheres to the [Contributor Covenant Code of Conduct](http://contributor-covenant.org/). 6 | 7 | # Maintainers 8 | 9 | NegBio is maintained with :heart: by: 10 | 11 | -- **@yfpeng** 12 | 13 | See also the list of [contributors](https://github.com/ncbi-nlp/NegBio/contributors) who participated in this project. 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | PUBLIC DOMAIN NOTICE 2 | National Center for Biotechnology Information 3 | 4 | This software/database is a "United States Government Work" under the terms of 5 | the United States Copyright Act. It was written as part of the author's 6 | official duties as a United States Government employee and thus cannot be 7 | copyrighted. This software/database is freely available to the public for use. 8 | The National Library of Medicine and the U.S. Government have not placed any 9 | restriction on its use or reproduction. 10 | 11 | Although all reasonable efforts have been taken to ensure the accuracy and 12 | reliability of the software and data, the NLM and the U.S. Government do not and 13 | cannot warrant the performance or results that may be obtained by using this 14 | software or data. The NLM and the U.S. Government disclaim all warranties, 15 | express or implied, including warranties of performance, merchantability or 16 | fitness for any particular purpose. 17 | 18 | Please cite the author in any work or product based on this material: 19 | 20 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. 21 | NegBio: a high-performance tool for negation and uncertainty detection in radiology reports. 22 | AMIA 2018 Informatics Summit. 2018. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE.txt 3 | include CONTRIBUTING.md 4 | include requirements.txt 5 | include examples/* 6 | recursive-include negbio/patterns * 7 | recursive-include negbio/chexpert/patterns * 8 | recursive-include negbio/chexpert/phrases * 9 | 10 | exclude tests 11 | exclude backup 12 | exclude docs -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true 2 | :target: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true 3 | :alt: NegBio 4 | 5 | ----------------------- 6 | 7 | .. image:: https://img.shields.io/travis/yfpeng/NegBio/master.svg 8 | :target: https://travis-ci.org/yfpeng/NegBio 9 | :alt: Build status 10 | 11 | .. 
image:: https://img.shields.io/pypi/v/negbio.svg 12 | :target: https://pypi.python.org/pypi/negbio 13 | :alt: PyPI version 14 | 15 | .. image:: https://img.shields.io/readthedocs/negbio.svg 16 | :target: http://negbio.readthedocs.io 17 | :alt: RTD version 18 | 19 | 20 | NegBio is a high-performance NLP tool for negation and uncertainty detection in clinical texts (e.g. radiology reports). 21 | 22 | 23 | Get started 24 | =========== 25 | 26 | Install NegBio 27 | ~~~~~~~~~~~~~~ 28 | 29 | 1. Installing from source (recommended) 30 | 31 | .. code-block:: bash 32 | 33 | $ git clone https://github.com/ncbi-nlp/NegBio.git 34 | $ cd /path/to/negbio 35 | $ python setup.py install --user 36 | $ export PATH=~/.local/bin:$PATH 37 | 38 | 2. Installing from pip 39 | 40 | .. code-block:: bash 41 | 42 | $ pip install negbio 43 | 44 | 45 | 46 | 47 | Prepare the dataset 48 | ~~~~~~~~~~~~~~~~~~~ 49 | 50 | The inputs can be in either plain text or `BioC `_ format. 51 | If the reports are in plain text, each report needs to be in a single file. 52 | Some examples can be found in the ``examples`` folder. 53 | 54 | Run the script 55 | ~~~~~~~~~~~~~~ 56 | 57 | There are two ways to run the pipeline. 58 | 59 | **NOTE**: If you want to process a lot of reports (e.g., > 1000), it is recommended to run the pipeline step-by-step. 60 | See `User guide `_. 61 | 62 | 63 | Using the CheXpert algorithm 64 | ____________________________ 65 | 66 | If you want to use the `CheXpert `_ method, run one of the following lines 67 | 68 | .. code-block:: bash 69 | 70 | $ main_chexpert text --output=examples examples/00000086.txt examples/00019248.txt 71 | 72 | .. code-block:: bash 73 | 74 | $ main_chexpert bioc --output=examples examples/1.xml 75 | 76 | 77 | Using MetaMap 78 | _____________ 79 | 80 | If you want to use MetaMap, run the following command by replacing ```` with the actual **ABSOLUTE** 81 | path, such as **META_MAP_HOME/bin/metamap16** 82 | 83 | .. code-block:: bash 84 | 85 | $ main_mm text --metamap= --output=examples examples/00000086.txt \ 86 | examples/00019248.txt 87 | 88 | .. code-block:: bash 89 | 90 | $ main_mm bioc --metamap= --output=examples examples/1.xml 91 | 92 | 93 | Documentation 94 | ============= 95 | 96 | negbio `documentation `_ is available on Read The Docs. 97 | 98 | See `Getting Started `_ for installation and basic 99 | information. To contribute to negbio, read our `contribution guide `_. 100 | 101 | Citing NegBio 102 | ============= 103 | 104 | If you're running the NegBio pipeline, please cite: 105 | 106 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty 107 | detection in radiology reports `_. *AMIA 2018 Informatics Summit*. 2018. 108 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks 109 | on weakly-supervised classification and localization of common thorax diseases `_. 110 | *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106. 111 | 112 | Acknowledgments 113 | =============== 114 | 115 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of 116 | Medicine and Clinical Center. 117 | 118 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making 119 | their software tools publicly available. 120 | 121 | We thank Dr. Alexis Allot for the helpful discussion. 
122 | 123 | Disclaimer 124 | ========== 125 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced 126 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight 127 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information 128 | produced on this website. NIH does not independently verify the validity or utility of the information produced 129 | by this tool. If you have questions about the information produced on this website, please see a health care 130 | professional. More information about NCBI's disclaimer policy is available. 131 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = negbio 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README: -------------------------------------------------------------------------------- 1 | The documentation in this tree is in plain text files and can be viewed using 2 | any text file viewer. 3 | 4 | It uses ReST (reStructuredText) [1], and the Sphinx documentation system [2]. 5 | This allows it to be built into other forms for easier viewing and browsing. 6 | 7 | To create an HTML version of the docs: 8 | 9 | * Install Sphinx (using ``pip install Sphinx sphinx_rtd_theme`` or some other method) 10 | 11 | * In this docs/ directory, type ``make html`` (or ``make.bat html`` on 12 | Windows) at a shell prompt. 13 | 14 | The documentation in _build/html/index.html can then be viewed in a web browser. 15 | 16 | [1] http://docutils.sourceforge.net/rst.html 17 | [2] http://sphinx-doc.org/ -------------------------------------------------------------------------------- /docs/acknowledgments.rst: -------------------------------------------------------------------------------- 1 | Acknowledgments 2 | --------------- 3 | 4 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of 5 | Medicine and Clinical Center. 6 | 7 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making 8 | their software tools publicly available. 9 | 10 | We thank Dr. Alexis Allot for the helpful discussion. 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # negbio documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Feb 8 15:24:06 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 
9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'negbio' 50 | copyright = '2019, NCBI, NLM, NIH' 51 | author = 'Yifan Peng' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.0' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.0' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | 100 | # -- Options for HTMLHelp output ------------------------------------------ 101 | 102 | # Output file base name for HTML help builder. 
103 | htmlhelp_basename = 'negbiodoc' 104 | 105 | 106 | # -- Options for LaTeX output --------------------------------------------- 107 | 108 | latex_elements = { 109 | # The paper size ('letterpaper' or 'a4paper'). 110 | # 111 | # 'papersize': 'letterpaper', 112 | 113 | # The font size ('10pt', '11pt' or '12pt'). 114 | # 115 | # 'pointsize': '10pt', 116 | 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | 121 | # Latex figure (float) alignment 122 | # 123 | # 'figure_align': 'htbp', 124 | } 125 | 126 | # Grouping the document tree into LaTeX files. List of tuples 127 | # (source start file, target name, title, 128 | # author, documentclass [howto, manual, or own class]). 129 | latex_documents = [ 130 | (master_doc, 'negbio.tex', 'negbio Documentation', 131 | 'Yifan Peng', 'manual'), 132 | ] 133 | 134 | 135 | # -- Options for manual page output --------------------------------------- 136 | 137 | # One entry per manual page. List of tuples 138 | # (source start file, name, description, authors, manual section). 139 | man_pages = [ 140 | (master_doc, 'negbio', 'negbio Documentation', 141 | [author], 1) 142 | ] 143 | 144 | 145 | # -- Options for Texinfo output ------------------------------------------- 146 | 147 | # Grouping the document tree into Texinfo files. List of tuples 148 | # (source start file, target name, title, author, 149 | # dir menu entry, description, category) 150 | texinfo_documents = [ 151 | (master_doc, 'negbio', 'negbio Documentation', 152 | author, 'negbio', 'One line description of project.', 153 | 'Miscellaneous'), 154 | ] 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ------------ 3 | 4 | Please read ``CONTRIBUTING.md`` for details on our code of conduct, and the process for submitting pull requests to us. -------------------------------------------------------------------------------- /docs/developer_guide.rst: -------------------------------------------------------------------------------- 1 | NegBio Developer Guide 2 | ====================== 3 | 4 | Create the documentation 5 | ^^^^^^^^^^^^^^^^^^^^^^^^ 6 | 7 | Install Sphinx 8 | 9 | .. code-block:: bash 10 | :linenos: 11 | 12 | $ pip install Sphinx 13 | $ pip install sphinx_rtd_theme 14 | $ cd docs 15 | $ make html -------------------------------------------------------------------------------- /docs/disclaimer.rst: -------------------------------------------------------------------------------- 1 | Disclaimer 2 | ========== 3 | 4 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced 5 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight 6 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information 7 | produced on this website. NIH does not independently verify the validity or utility of the information produced 8 | by this tool. If you have questions about the information produced on this website, please see a health care 9 | professional. More information about NCBI's disclaimer policy is available. 
10 |
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
1 | Getting Started with NegBio
2 | ===========================
3 |
4 | These instructions will get you a copy of the project up and running on your local machine for development and testing
5 | purposes. The package should successfully install on Linux (and possibly macOS).
6 |
7 | Installing
8 | ----------
9 |
10 | Prerequisites
11 | ~~~~~~~~~~~~~
12 |
13 | * python >2.4
14 | * Linux
15 | * Java
16 |
17 | Note: since v1.0, MetaMap is not required. You can use the CheXpert vocabularies (``negbio/chexpert/phrases``) instead.
18 | If you want to use MetaMap, it can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml `_.
19 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml `_.
20 | Please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
21 |
22 | Installing from source (recommended)
23 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 |
25 | .. code-block:: bash
26 |
27 |     $ git clone https://github.com/ncbi-nlp/NegBio.git
28 |     $ cd /path/to/negbio
29 |     $ python setup.py install --user
30 |     $ export PATH=~/.local/bin:$PATH
31 |
32 | Installing from pip
33 | ~~~~~~~~~~~~~~~~~~~
34 |
35 | .. code-block:: bash
36 |
37 |     $ pip install negbio
38 |
39 |
40 | Using NegBio
41 | ------------
42 |
43 | Prepare the dataset
44 | ~~~~~~~~~~~~~~~~~~~
45 |
46 | The inputs can be in either plain text or `BioC `_ format. If the reports are in plain
47 | text, each report needs to be in a single file. Some examples can be found in the ``examples`` folder.
48 |
49 | Run the script
50 | ~~~~~~~~~~~~~~
51 |
52 | There are two ways to run the pipeline.
53 |
54 | Using CheXpert algorithm
55 | ________________________
56 |
57 | If you want to use the CheXpert method, run one of the following commands
58 |
59 | .. code-block:: bash
60 |
61 |     $ main_chexpert text --output=examples/test.neg.xml examples/00000086.txt examples/00019248.txt
62 |
63 | .. code-block:: bash
64 |
65 |     $ main_chexpert bioc --output=examples/test.neg.xml examples/1.xml
66 |
67 | The script will
68 |
69 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
70 | 2. Detect concepts using CheXpert pre-defined vocabularies (by default using the list ``negbio/chexpert/phrases``)
71 | 3. Detect positive, negative and uncertain concepts using rules in ``negbio/chexpert/patterns``
72 | 4. Save the results in ``examples/test.neg.xml``
73 |
74 | More options (e.g., setting the CUI list or rules) can be obtained by running
75 |
76 | .. code-block:: bash
77 |
78 |     $ main_chexpert --help
79 |
80 | Using MetaMap
81 | _____________
82 |
83 | If you want to use MetaMap, run the following commands, replacing ``META_MAP_HOME/bin/metamap16`` with the actual **ABSOLUTE**
84 | path of your MetaMap binary
85 |
86 | .. code-block:: bash
87 |
88 |     $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
89 |     $ main_mm text --metamap=$METAMAP_BIN --output=examples/test.neg.xml \
90 |         examples/00000086.txt examples/00019248.txt
91 |
92 | .. code-block:: bash
93 |
94 |     $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
95 |     $ main_mm bioc --metamap=$METAMAP_BIN --output=examples/test.neg.xml examples/1.xml
96 |
97 | The script will
98 |
99 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
100 | 2. Detect UMLS concepts (CUIs) using MetaMap (by default using the CUI list ``examples/cuis-cvpr2017.txt``)
101 | 3. Detect negative and uncertain CUIs using rules in ``negbio/patterns``
102 | 4. Save the results in ``examples/test.neg.xml``
103 |
104 | More options (e.g., setting the CUI list or rules) can be obtained by running
105 |
106 | .. code-block:: bash
107 |
108 |     $ main_mm --help
109 |
110 |
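Inspect the output
__________________

The output is a BioC XML file. Below is a minimal sketch for reading it with the ``bioc``
package (already installed as a NegBio dependency). The exact infon keys attached to each
annotation (e.g., negation or uncertainty flags) depend on the pipeline options used, so
treat the printed dictionary as illustrative rather than a fixed schema.

.. code-block:: python

    import bioc

    # Load the BioC collection written by main_chexpert or main_mm.
    with open('examples/test.neg.xml') as fp:
        collection = bioc.load(fp)

    for document in collection.documents:
        for passage in document.passages:
            for annotation in passage.annotations:
                # Each annotation carries the detected concept text plus its
                # infons, such as the concept name, CUI, and negation or
                # uncertainty flags when they were detected.
                print(annotation.text, dict(annotation.infons))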
111 | Next Steps
112 | ----------
113 |
114 | To start learning how to use NegBio, see the :doc:`user_guide`.
115 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. negbio documentation master file, created by
2 |    sphinx-quickstart on Thu Feb 8 15:24:06 2018.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | NegBio documentation
7 | ====================
8 |
9 | .. toctree::
10 |    :maxdepth: 5
11 |    :caption: Contents:
12 |
13 |    getting_started
14 |    user_guide
15 |    developer_guide
16 |    license
17 |    contributing
18 |    acknowledgments
19 |    disclaimer
20 |    reference
21 |
22 |
23 | Indices and tables
24 | ==================
25 |
26 | * :ref:`genindex`
27 | * :ref:`modindex`
28 | * :ref:`search`
29 |
--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | License
2 | =======
3 |
4 | PUBLIC DOMAIN NOTICE
5 |
6 | National Center for Biotechnology Information
7 |
8 | This software/database is a "United States Government Work" under the terms of
9 | the United States Copyright Act. It was written as part of the author's
10 | official duties as a United States Government employee and thus cannot be
11 | copyrighted. This software/database is freely available to the public for use.
12 | The National Library of Medicine and the U.S. Government have not placed any
13 | restriction on its use or reproduction.
14 |
15 | Although all reasonable efforts have been taken to ensure the accuracy and
16 | reliability of the software and data, the NLM and the U.S. Government do not and
17 | cannot warrant the performance or results that may be obtained by using this
18 | software or data. The NLM and the U.S. Government disclaim all warranties,
19 | express or implied, including warranties of performance, merchantability or
20 | fitness for any particular purpose.
21 |
22 | Please cite the author in any work or product based on these materials:
23 |
24 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z.
25 | NegBio: a high-performance tool for negation and uncertainty detection in
26 | radiology reports.
27 | AMIA 2018 Informatics Summit. 2018.
28 |
29 | Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R.
30 | ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks on
31 | weakly-supervised classification and localization of common thorax diseases.
32 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2017, 2097-2106.
33 |
34 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=negbio
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/reference.rst:
--------------------------------------------------------------------------------
1 | Reference
2 | =========
3 |
4 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty
5 |   detection in radiology reports `_. *AMIA 2018 Informatics Summit*. 2018.
6 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks
7 |   on weakly-supervised classification and localization of common thorax diseases `_.
8 |   *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106.
--------------------------------------------------------------------------------
/docs/user_guide.rst:
--------------------------------------------------------------------------------
1 | NegBio User Guide
2 | =================
3 |
4 | Run the pipeline step-by-step
5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6 |
7 | The step-by-step pipeline generates all intermediate documents. You can easily rerun a step if it fails.
8 | The steps are:
9 |
10 | 1. ``text2bioc`` combines text into a BioC XML file.
11 | 2. ``normalize`` removes noisy text such as ``[**Patterns**]``.
12 | 3. ``section_split`` splits the report into sections based on titles at ``patterns/section_titles.txt``.
13 | 4. ``ssplit`` splits text into sentences.
14 | 5. Named entity recognition
15 |
16 |    a. ``dner_mm`` detects UMLS concepts using MetaMap.
17 |    b. ``dner_chexpert`` detects concepts using the CheXpert vocabularies at ``negbio/chexpert/phrases``.
18 |
19 | 6. ``parse`` parses sentences using the `Bllip parser `_.
20 | 7. ``ptb2ud`` converts the parse tree to universal dependencies using `Stanford converter `_.
21 | 8. Negation detection
22 |
23 |    a. ``neg`` detects negative and uncertain findings.
24 |    b. ``neg_chexpert`` detects positive, negative and uncertain findings (recommended).
25 |
26 | 9. ``cleanup`` removes intermediate information.
27 |
28 | Steps 2-9 process the input files one by one and generate the results in the output directory.
29 | The 2nd and 3rd steps can be skipped. You can choose either step 5a or 5b for named entity recognition. A sketch that chains all the steps is shown below.
30 |
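Before the detailed steps, here is a sketch of one full pass over the pipeline using the
CheXpert options (steps 5b and 8b). The directory layout (one sub-directory per step) is only
an assumption for illustration; any layout works as long as each step reads the previous
step's output.

.. code-block:: bash

    # Assumed layout: $TEXT_DIR holds the plain-text reports and $OUT is a
    # scratch directory with one sub-directory per step.
    $ export TEXT_DIR=/path/to/text
    $ export OUT=/path/to/output
    $ negbio_pipeline text2bioc --output=$OUT/bioc/reports.xml $TEXT_DIR/*.txt
    $ negbio_pipeline normalize --output=$OUT/normalized $OUT/bioc/*.xml
    $ negbio_pipeline section_split --output=$OUT/sections $OUT/normalized/*.xml
    $ negbio_pipeline ssplit --output=$OUT/sentences $OUT/sections/*.xml
    $ negbio_pipeline dner_chexpert --output=$OUT/dner $OUT/sentences/*.xml
    $ negbio_pipeline parse --output=$OUT/parse $OUT/dner/*.xml
    $ negbio_pipeline ptb2ud --output=$OUT/ud $OUT/parse/*.xml
    $ negbio_pipeline neg_chexpert --output=$OUT/neg $OUT/ud/*.xml
    $ negbio_pipeline cleanup --output=$OUT/clean $OUT/neg/*.xml

Because every step writes its own directory, a failed step can be rerun without repeating the
earlier ones.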
31 | 1. Convert text files to BioC format
32 | ------------------------------------
33 |
34 | You can skip this step if the reports are already in the `BioC `_ format.
35 | **If you have lots of reports, it is recommended to put them into several BioC files, for example, 100 reports per BioC file.**
36 |
37 | .. code-block:: bash
38 |
39 |     $ export BIOC_DIR=/path/to/bioc
40 |     $ export TEXT_DIR=/path/to/text
41 |     $ negbio_pipeline text2bioc --output=$BIOC_DIR/test.xml $TEXT_DIR/*.txt
42 |
43 | Another commonly used command is:
44 |
45 | .. code-block:: bash
46 |
47 |     $ find $TEXT_DIR -type f | negbio_pipeline text2bioc --output=$BIOC_DIR
48 |
49 | 2. Normalize reports
50 | --------------------
51 |
52 | This step removes noisy text such as ``[**Patterns**]`` in the MIMIC-III reports.
53 |
54 | .. code-block:: bash
55 |
56 |     $ negbio_pipeline normalize --output=$OUTPUT_DIR $INPUT_DIR/*.xml
57 |
58 | 3. Split each report into sections
59 | -----------------------------------
60 |
61 | This step splits the report into sections.
62 | The default section titles are at ``patterns/section_titles.txt``.
63 | You can specify customized section titles using the option ``--pattern=``.
64 |
65 | .. code-block:: bash
66 |
67 |     $ negbio_pipeline section_split --output=$OUTPUT_DIR $INPUT_DIR/*.xml
68 |
69 |
70 | 4. Split each report into sentences
71 | ------------------------------------
72 |
73 | This step splits the report into sentences using the NLTK splitter
74 | (`nltk.tokenize.sent_tokenize `_).
75 |
76 | .. code-block:: bash
77 |
78 |     $ negbio_pipeline ssplit --output=$OUTPUT_DIR $INPUT_DIR/*.xml
79 |
80 |
81 | 5. Named entity recognition
82 | ---------------------------
83 |
84 | This step recognizes named entities (e.g., findings, diseases, devices) from the reports.
85 | The first version of NegBio uses MetaMap to detect UMLS concepts.
86 |
87 | MetaMap can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml `_.
88 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml `_.
89 | Before using MetaMap, please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
90 |
91 | MetaMap tries to extract all UMLS concepts.
92 | Many of them are not relevant to radiology.
93 | Therefore, it is better to specify the UMLS concepts of interest via ``--cuis=``.
94 |
95 | .. code-block:: bash
96 |
97 |     $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
98 |     $ negbio_pipeline dner_mm --metamap=$METAMAP_BIN --output=$OUTPUT_DIR $INPUT_DIR/*.xml
99 |
100 | NegBio also integrates the CheXpert vocabularies to recognize the presence of 14 observations.
101 | All vocabularies can be found at ``negbio/chexpert/phrases``.
102 | Each file in the folder represents one type of named entity with various text expressions.
103 | So far, NegBio does not support adding more types in the folder, but you can add more text expressions for each existing type.
104 |
105 | .. code-block:: bash
106 |
107 |     $ negbio_pipeline dner_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
108 |
109 |
110 | In general, MetaMap is more comprehensive while CheXpert is more accurate on 14 types of findings.
111 | MetaMap is also slower and more fragile than CheXpert.
112 |
113 |
114 | 6. Parse the sentences
115 | ----------------------
116 |
117 | This step parses sentences using the `Bllip parser `_.
118 |
119 | .. code-block:: bash
120 |
121 |     $ negbio_pipeline parse --output=$OUTPUT_DIR $INPUT_DIR/*.xml
122 |
123 |
124 | 7. Convert the parse tree to UD
125 | -------------------------------
126 |
127 | This step converts the parse tree to universal dependencies using `Stanford converter `_.
128 |
129 | .. code-block:: bash
130 |
131 |     $ negbio_pipeline ptb2ud --output=$OUTPUT_DIR $INPUT_DIR/*.xml
132 |
133 |
134 | 8. Detect negative and uncertain findings
135 | -----------------------------------------
136 |
137 | This step detects negative and uncertain findings using patterns.
138 | By default, the program uses the negation and uncertainty patterns in the ``negbio/patterns`` folder.
139 | However, you are free to create your own patterns via ``--neg-patterns=`` and ``--uncertainty-patterns=``.
140 | The pattern is a `semgrex-type `_
141 | pattern for matching nodes in the dependency graph.
142 | Currently, we only support ``<`` and ``>`` operations.
143 | A detailed grammar specification (using PLY, Python Lex-Yacc) can be found in ``ngrex/parser.py``.
144 |
145 | .. code-block:: bash
146 |
147 |     $ negbio_pipeline neg --output=$OUTPUT_DIR $INPUT_DIR/*.xml
148 |
149 | NegBio also integrates the CheXpert algorithms.
150 | Different from the original NegBio, CheXpert utilizes a 3-phase pipeline consisting of pre-negation uncertainty,
151 | negation, and post-negation uncertainty (`Irvin et al., 2019 `_).
152 | Each phase consists of rules which are matched against the mention; if a match is found, then the mention is classified
153 | accordingly (as uncertain in the first or third phase, and as negative in the second phase).
154 | If a mention is not matched in any of the phases, it is classified as positive; see the sketch at the end of this section.
155 |
156 | Generally, the CheXpert algorithm contains more rules and is more accurate than the original NegBio.
157 |
158 | .. code-block:: bash
159 |
160 |     $ negbio_pipeline neg_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
161 |
162 | Similarly, you are free to create patterns via ``--neg-patterns=``, ``--pre-uncertainty-patterns=``, and
163 | ``--post-uncertainty-patterns=``.
164 |
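To make the three-phase decision logic concrete, here is a minimal sketch. It is an
illustration, not NegBio's actual implementation: ``matches`` is a hypothetical stand-in for
the dependency-graph pattern matching, and the label constants mirror
``negbio/chexpert/constants.py``.

.. code-block:: python

    # Illustrative sketch of the CheXpert 3-phase mention classification.
    # `matches(mention, patterns)` is a stand-in for NegBio's pattern matcher.
    POSITIVE, NEGATIVE, UNCERTAIN = 1, 0, -1

    def classify_mention(mention, pre_neg_uncertainty, negation,
                         post_neg_uncertainty, matches):
        if matches(mention, pre_neg_uncertainty):
            return UNCERTAIN            # phase 1: pre-negation uncertainty
        if matches(mention, negation):
            return NEGATIVE             # phase 2: negation
        if matches(mention, post_neg_uncertainty):
            return UNCERTAIN            # phase 3: post-negation uncertainty
        return POSITIVE                 # unmatched mentions default to positive

Note that the phase order encodes precedence: a mention caught by a pre-negation uncertainty
rule is labeled uncertain even if a negation rule would also match it.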
165 | 9. Clean up intermediate information
166 | ------------------------------------
167 |
168 | This step removes intermediate information (sentence annotations) from the BioC files.
169 |
170 | .. code-block:: bash
171 |
172 |     $ negbio_pipeline cleanup --output=$OUTPUT_DIR $INPUT_DIR/*.xml
173 |
174 |
--------------------------------------------------------------------------------
/environment2.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio2.7
2 | channels:
3 |   - anaconda
4 |   - conda-forge
5 |   - auto
6 | dependencies:
7 |   - python=2.7.11
8 |   - future=0.16.0
9 |   - docutils=0.13.1
10 |   - docopt=0.6.2
11 |   - pytest=3.1.3
12 |   - networkx=1.11
13 |   - ply=3.10
14 |   - tqdm=4.19.5
15 |   - nltk=3.2.4
16 |   - pathlib2=2.3.3
17 |   - numpy=1.15.4
18 |   - jpype1=0.6.3
19 |   - pip:
20 |     - bioc==1.1.dev3
21 |     - pystanforddependencies==0.3.1
22 |     - bllipparser==2016.9.11
23 |     - pymetamap==0.1
24 |
--------------------------------------------------------------------------------
/environment3.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio3.7
2 | channels:
3 |   - anaconda
4 |   - conda-forge
5 |   - auto
6 | dependencies:
7 |   - python=3.7
8 |   - docutils=0.14
9 |   - docopt=0.6.2
10 |   - pytest=4.2.0
11 |   - networkx=2.2
12 |   - ply=3.11
13 |   - tqdm=4.31
14 |   - nltk=3.4
15 |   - numpy=1.16
16 |   - jpype1=0.6.3
17 |   - pip:
18 |     - bioc==1.3.1
19 |     - pystanforddependencies==0.3.1
20 |     - bllipparser==2016.9.11
21 |     - pymetamap==0.1
22 |
--------------------------------------------------------------------------------
/examples/00000086.txt:
--------------------------------------------------------------------------------
1 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
2 | stable. lungs are unchanged. air- filled cystic changes.
no 3 | pneumothorax. osseous structures unchanged scoliosis 4 | impression: stable chest. 5 | dictating -------------------------------------------------------------------------------- /examples/00019248.txt: -------------------------------------------------------------------------------- 1 | findings: 2 | chest: four images: 3 | right picc with tip within the upper svc. 4 | probable enlargement of the main pulmonary artery. 5 | mild cardiomegaly. 6 | no evidence of focal infiltrate, effusion or pneumothorax. 7 | dictating -------------------------------------------------------------------------------- /examples/1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 2017-05-31 5 | 6 | 7 | 8 | 00019248 9 | 10 | 0 11 | findings: 12 | chest: four images: 13 | right picc with tip within the upper svc. 14 | probable enlargement of the main pulmonary artery. 15 | mild cardiomegaly. 16 | no evidence of focal infiltrate, effusion or pneumothorax. 17 | dictating 18 | 19 | Cardiomegaly 20 | C0018800 21 | MetaMap 22 | fndg 23 | 24 | Mild cardiomegaly. 25 | 26 | 27 | Infiltration 28 | C0332448 29 | MetaMap 30 | ftcn 31 | 32 | infiltrate 33 | 34 | 35 | effusion 36 | C0013687 37 | MetaMap 38 | patf 39 | 40 | effusion 41 | 42 | 43 | Pneumothorax 44 | C0032326 45 | MetaMap 46 | dsyn 47 | 48 | pneumothorax. 49 | 50 | 51 | 52 | 53 | 00000086 54 | 55 | 0 56 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are 57 | stable. lungs are unchanged. air- filled cystic changes. no 58 | pneumothorax. osseous structures unchanged scoliosis 59 | impression: stable chest. 60 | dictating 61 | 62 | True 63 | Pneumothorax 64 | C0032326 65 | MetaMap 66 | dsyn 67 | 68 | pneumothorax 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /examples/2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 2017-05-31 5 | 6 | 7 | 00000086 8 | 9 | 0 10 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are 11 | stable. lungs are unchanged. air- filled cystic changes. no 12 | pneumothorax. osseous structures unchanged scoliosis 13 | impression: stable chest. 
14 | dictating 15 | 16 | True 17 | Pneumothorax 18 | C0032326 19 | MetaMap 20 | dsyn 21 | 22 | pneumothorax 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /examples/cuis-cvpr2017.txt: -------------------------------------------------------------------------------- 1 | C0264494 2 | C0264496 3 | C0004144 4 | C0264495 5 | C0018800 6 | C0702116 7 | C0521530 8 | C0013604 9 | C0013608 10 | C0034063 11 | C0031039 12 | C0013687 13 | C0747635 14 | C1265808 15 | C0747639 16 | C0032227 17 | C0034067 18 | C0038536 19 | C0016059 20 | C0034069 21 | C0019270 22 | C3489393 23 | C0744895 24 | C0332448 25 | C0235896 26 | C0577559 27 | C3152252 28 | C0748419 29 | C1265602 30 | C0028259 31 | C0332558 32 | C0034079 33 | C0746923 34 | C0748164 35 | C0264545 36 | C1960024 37 | C0585104 38 | C0585105 39 | C0585106 40 | C0032285 41 | C0578577 42 | C0578576 43 | C0577702 44 | C0747651 45 | C0546333 46 | C0032326 47 | C0264557 48 | C0546334 49 | -------------------------------------------------------------------------------- /examples/openi-testset.txt: -------------------------------------------------------------------------------- 1 | CXR10 2 | CXR1002 3 | CXR1007 4 | CXR1008 5 | CXR101 6 | CXR102 7 | CXR1020 8 | CXR1028 9 | CXR1042 10 | CXR105 11 | CXR1055 12 | CXR1056 13 | CXR1058 14 | CXR1062 15 | CXR1074 16 | CXR1076 17 | CXR1077 18 | CXR1078 19 | CXR1091 20 | CXR1092 21 | CXR1099 22 | CXR11 23 | CXR1102 24 | CXR1109 25 | CXR1118 26 | CXR112 27 | CXR1121 28 | CXR1138 29 | CXR1140 30 | CXR1159 31 | CXR1161 32 | CXR1163 33 | CXR1167 34 | CXR1169 35 | CXR1179 36 | CXR1190 37 | CXR1194 38 | CXR1199 39 | CXR1202 40 | CXR1206 41 | CXR1208 42 | CXR1210 43 | CXR1213 44 | CXR1218 45 | CXR1222 46 | CXR1226 47 | CXR1231 48 | CXR1239 49 | CXR124 50 | CXR1243 51 | CXR1248 52 | CXR1255 53 | CXR1258 54 | CXR1260 55 | CXR1265 56 | CXR1267 57 | CXR127 58 | CXR1270 59 | CXR1273 60 | CXR1286 61 | CXR1287 62 | CXR1288 63 | CXR1292 64 | CXR1295 65 | CXR1297 66 | CXR1314 67 | CXR1316 68 | CXR1332 69 | CXR1334 70 | CXR1374 71 | CXR1378 72 | CXR1391 73 | CXR1392 74 | CXR1396 75 | CXR1397 76 | CXR1398 77 | CXR1399 78 | CXR14 79 | CXR1401 80 | CXR1409 81 | CXR141 82 | CXR1411 83 | CXR1413 84 | CXR1439 85 | CXR144 86 | CXR1443 87 | CXR1444 88 | CXR1448 89 | CXR145 90 | CXR1452 91 | CXR1460 92 | CXR1461 93 | CXR1468 94 | CXR1481 95 | CXR1487 96 | CXR1497 97 | CXR1500 98 | CXR1510 99 | CXR1515 100 | CXR1518 101 | CXR1519 102 | CXR1527 103 | CXR1528 104 | CXR1529 105 | CXR153 106 | CXR154 107 | CXR1540 108 | CXR1548 109 | CXR1551 110 | CXR1556 111 | CXR1563 112 | CXR1568 113 | CXR1570 114 | CXR1576 115 | CXR1581 116 | CXR1583 117 | CXR1586 118 | CXR159 119 | CXR1593 120 | CXR1602 121 | CXR1605 122 | CXR1608 123 | CXR1614 124 | CXR1617 125 | CXR1624 126 | CXR1627 127 | CXR163 128 | CXR1632 129 | CXR1638 130 | CXR1639 131 | CXR1643 132 | CXR1647 133 | CXR166 134 | CXR1660 135 | CXR167 136 | CXR1671 137 | CXR1691 138 | CXR1709 139 | CXR1711 140 | CXR1716 141 | CXR1724 142 | CXR1725 143 | CXR1728 144 | CXR1729 145 | CXR1733 146 | CXR1734 147 | CXR1736 148 | CXR1738 149 | CXR1739 150 | CXR1740 151 | CXR1746 152 | CXR1756 153 | CXR1763 154 | CXR1764 155 | CXR1765 156 | CXR1766 157 | CXR1767 158 | CXR1773 159 | CXR1777 160 | CXR1783 161 | CXR1801 162 | CXR1806 163 | CXR1813 164 | CXR1814 165 | CXR1816 166 | CXR1823 167 | CXR1831 168 | CXR1832 169 | CXR1841 170 | CXR1845 171 | CXR1861 172 | CXR1868 173 | CXR1871 174 | CXR1877 175 | CXR1881 176 | CXR1883 177 | CXR1884 178 | CXR1892 179 | 
CXR1895 180 | CXR1896 181 | CXR190 182 | CXR1903 183 | CXR1904 184 | CXR1909 185 | CXR191 186 | CXR1912 187 | CXR1914 188 | CXR1920 189 | CXR1923 190 | CXR1926 191 | CXR1929 192 | CXR193 193 | CXR1934 194 | CXR194 195 | CXR1942 196 | CXR1944 197 | CXR1946 198 | CXR1951 199 | CXR1952 200 | CXR1954 201 | CXR1958 202 | CXR1960 203 | CXR1964 204 | CXR1965 205 | CXR1969 206 | CXR1972 207 | CXR1977 208 | CXR1978 209 | CXR1979 210 | CXR1992 211 | CXR1993 212 | CXR1994 213 | CXR1999 214 | CXR2011 215 | CXR2012 216 | CXR2014 217 | CXR2029 218 | CXR2032 219 | CXR2038 220 | CXR2039 221 | CXR204 222 | CXR2040 223 | CXR2050 224 | CXR2053 225 | CXR2059 226 | CXR2061 227 | CXR2062 228 | CXR2066 229 | CXR2067 230 | CXR207 231 | CXR2072 232 | CXR2080 233 | CXR2086 234 | CXR2087 235 | CXR2089 236 | CXR2098 237 | CXR21 238 | CXR211 239 | CXR2111 240 | CXR2114 241 | CXR2115 242 | CXR2126 243 | CXR2131 244 | CXR2140 245 | CXR2142 246 | CXR2145 247 | CXR2152 248 | CXR2162 249 | CXR2163 250 | CXR2165 251 | CXR2167 252 | CXR2170 253 | CXR2171 254 | CXR2172 255 | CXR2177 256 | CXR2183 257 | CXR2191 258 | CXR2195 259 | CXR2199 260 | CXR2202 261 | CXR2205 262 | CXR2210 263 | CXR2211 264 | CXR2221 265 | CXR2222 266 | CXR2225 267 | CXR2244 268 | CXR2247 269 | CXR2250 270 | CXR2257 271 | CXR2264 272 | CXR2265 273 | CXR2268 274 | CXR2275 275 | CXR2287 276 | CXR2288 277 | CXR2289 278 | CXR2301 279 | CXR2307 280 | CXR2308 281 | CXR2324 282 | CXR2326 283 | CXR233 284 | CXR235 285 | CXR2352 286 | CXR2353 287 | CXR2357 288 | CXR2360 289 | CXR2368 290 | CXR237 291 | CXR2371 292 | CXR2372 293 | CXR2378 294 | CXR2380 295 | CXR2382 296 | CXR2388 297 | CXR2392 298 | CXR2395 299 | CXR2396 300 | CXR2397 301 | CXR240 302 | CXR2409 303 | CXR2414 304 | CXR2419 305 | CXR242 306 | CXR2421 307 | CXR243 308 | CXR2430 309 | CXR2437 310 | CXR2438 311 | CXR2448 312 | CXR2450 313 | CXR2455 314 | CXR2460 315 | CXR2462 316 | CXR2463 317 | CXR2465 318 | CXR2472 319 | CXR2474 320 | CXR2482 321 | CXR2494 322 | CXR2495 323 | CXR2496 324 | CXR2497 325 | CXR2498 326 | CXR2499 327 | CXR2503 328 | CXR2506 329 | CXR2515 330 | CXR2516 331 | CXR2519 332 | CXR2523 333 | CXR2525 334 | CXR2526 335 | CXR2530 336 | CXR2533 337 | CXR2536 338 | CXR2540 339 | CXR2542 340 | CXR2547 341 | CXR2557 342 | CXR256 343 | CXR2573 344 | CXR2577 345 | CXR2582 346 | CXR2583 347 | CXR2585 348 | CXR2595 349 | CXR2601 350 | CXR2604 351 | CXR2607 352 | CXR2608 353 | CXR261 354 | CXR2610 355 | CXR2617 356 | CXR2619 357 | CXR2620 358 | CXR2622 359 | CXR2625 360 | CXR2629 361 | CXR2636 362 | CXR2642 363 | CXR2649 364 | CXR2654 365 | CXR2655 366 | CXR2673 367 | CXR2684 368 | CXR2688 369 | CXR2699 370 | CXR27 371 | CXR2714 372 | CXR2716 373 | CXR2730 374 | CXR2739 375 | CXR2752 376 | CXR2759 377 | CXR276 378 | CXR2768 379 | CXR2776 380 | CXR2780 381 | CXR2782 382 | CXR2791 383 | CXR28 384 | CXR2808 385 | CXR2817 386 | CXR2820 387 | CXR2824 388 | CXR2827 389 | CXR2832 390 | CXR2833 391 | CXR284 392 | CXR2847 393 | CXR2852 394 | CXR2856 395 | CXR2858 396 | CXR286 397 | CXR287 398 | CXR2871 399 | CXR2876 400 | CXR2879 401 | CXR288 402 | CXR2887 403 | CXR2890 404 | CXR29 405 | CXR2901 406 | CXR2906 407 | CXR2909 408 | CXR2911 409 | CXR2924 410 | CXR2926 411 | CXR2927 412 | CXR2931 413 | CXR2940 414 | CXR2942 415 | CXR2951 416 | CXR2960 417 | CXR2966 418 | CXR2968 419 | CXR2969 420 | CXR297 421 | CXR2979 422 | CXR2981 423 | CXR2992 424 | CXR2997 425 | CXR300 426 | CXR3008 427 | CXR3011 428 | CXR3012 429 | CXR3016 430 | CXR302 431 | CXR3034 432 | CXR3038 433 | CXR304 434 | CXR3045 435 | 
CXR3046 436 | CXR305 437 | CXR3050 438 | CXR3053 439 | CXR3056 440 | CXR3057 441 | CXR3063 442 | CXR307 443 | CXR3070 444 | CXR3071 445 | CXR3083 446 | CXR3084 447 | CXR309 448 | CXR3093 449 | CXR3094 450 | CXR310 451 | CXR3100 452 | CXR3101 453 | CXR3106 454 | CXR3109 455 | CXR3112 456 | CXR3121 457 | CXR3123 458 | CXR3132 459 | CXR3133 460 | CXR3135 461 | CXR3145 462 | CXR3152 463 | CXR3153 464 | CXR3154 465 | CXR3155 466 | CXR3156 467 | CXR3159 468 | CXR3163 469 | CXR3176 470 | CXR3177 471 | CXR3178 472 | CXR3184 473 | CXR3197 474 | CXR3199 475 | CXR3206 476 | CXR3208 477 | CXR3213 478 | CXR3218 479 | CXR3230 480 | CXR3238 481 | CXR3242 482 | CXR3254 483 | CXR3255 484 | CXR3257 485 | CXR326 486 | CXR3261 487 | CXR3262 488 | CXR3271 489 | CXR3272 490 | CXR3288 491 | CXR3290 492 | CXR3292 493 | CXR3296 494 | CXR33 495 | CXR3307 496 | CXR3315 497 | CXR3318 498 | CXR3319 499 | CXR332 500 | CXR3323 501 | CXR3325 502 | CXR3329 503 | CXR333 504 | CXR3332 505 | CXR3333 506 | CXR3337 507 | CXR334 508 | CXR3342 509 | CXR3355 510 | CXR3356 511 | CXR3368 512 | CXR3373 513 | CXR3395 514 | CXR3405 515 | CXR3410 516 | CXR3413 517 | CXR3416 518 | CXR3419 519 | CXR342 520 | CXR3428 521 | CXR3432 522 | CXR3437 523 | CXR3439 524 | CXR3443 525 | CXR3449 526 | CXR3451 527 | CXR3473 528 | CXR3477 529 | CXR3479 530 | CXR3485 531 | CXR349 532 | CXR3499 533 | CXR3514 534 | CXR3521 535 | CXR3523 536 | CXR3524 537 | CXR3525 538 | CXR353 539 | CXR3530 540 | CXR3539 541 | CXR3543 542 | CXR3559 543 | CXR3562 544 | CXR357 545 | CXR3575 546 | CXR358 547 | CXR3585 548 | CXR3586 549 | CXR3587 550 | CXR3589 551 | CXR3596 552 | CXR3599 553 | CXR36 554 | CXR3603 555 | CXR3606 556 | CXR3609 557 | CXR3610 558 | CXR3619 559 | CXR3623 560 | CXR3632 561 | CXR3640 562 | CXR3641 563 | CXR3645 564 | CXR3648 565 | CXR366 566 | CXR3661 567 | CXR3663 568 | CXR3666 569 | CXR3668 570 | CXR3670 571 | CXR3677 572 | CXR368 573 | CXR3683 574 | CXR3684 575 | CXR3685 576 | CXR3698 577 | CXR370 578 | CXR3700 579 | CXR3714 580 | CXR3715 581 | CXR3718 582 | CXR3726 583 | CXR3735 584 | CXR3741 585 | CXR3744 586 | CXR3747 587 | CXR3762 588 | CXR3777 589 | CXR3785 590 | CXR379 591 | CXR3792 592 | CXR3795 593 | CXR3798 594 | CXR38 595 | CXR3803 596 | CXR3806 597 | CXR3817 598 | CXR3825 599 | CXR383 600 | CXR3830 601 | CXR3832 602 | CXR3837 603 | CXR3838 604 | CXR3846 605 | CXR3847 606 | CXR3849 607 | CXR3851 608 | CXR3852 609 | CXR3858 610 | CXR3860 611 | CXR3865 612 | CXR3867 613 | CXR3869 614 | CXR3870 615 | CXR3879 616 | CXR3881 617 | CXR3885 618 | CXR3888 619 | CXR3898 620 | CXR3899 621 | CXR3901 622 | CXR3906 623 | CXR3908 624 | CXR3913 625 | CXR392 626 | CXR3921 627 | CXR3923 628 | CXR3925 629 | CXR3928 630 | CXR3934 631 | CXR3935 632 | CXR3937 633 | CXR3946 634 | CXR3948 635 | CXR3952 636 | CXR3963 637 | CXR398 638 | CXR399 639 | CXR3998 640 | CXR40 641 | CXR402 642 | CXR403 643 | CXR406 644 | CXR408 645 | CXR416 646 | CXR420 647 | CXR423 648 | CXR427 649 | CXR432 650 | CXR439 651 | CXR444 652 | CXR445 653 | CXR46 654 | CXR467 655 | CXR47 656 | CXR471 657 | CXR473 658 | CXR474 659 | CXR477 660 | CXR48 661 | CXR481 662 | CXR493 663 | CXR494 664 | CXR503 665 | CXR508 666 | CXR512 667 | CXR522 668 | CXR53 669 | CXR530 670 | CXR540 671 | CXR55 672 | CXR565 673 | CXR570 674 | CXR573 675 | CXR577 676 | CXR584 677 | CXR585 678 | CXR589 679 | CXR590 680 | CXR598 681 | CXR60 682 | CXR601 683 | CXR606 684 | CXR611 685 | CXR616 686 | CXR617 687 | CXR622 688 | CXR639 689 | CXR64 690 | CXR645 691 | CXR646 692 | CXR654 693 | CXR661 694 | CXR665 695 | 
CXR668 696 | CXR671 697 | CXR672 698 | CXR673 699 | CXR674 700 | CXR680 701 | CXR686 702 | CXR695 703 | CXR698 704 | CXR700 705 | CXR703 706 | CXR705 707 | CXR706 708 | CXR707 709 | CXR71 710 | CXR712 711 | CXR719 712 | CXR726 713 | CXR73 714 | CXR733 715 | CXR737 716 | CXR738 717 | CXR741 718 | CXR742 719 | CXR743 720 | CXR751 721 | CXR752 722 | CXR756 723 | CXR760 724 | CXR781 725 | CXR792 726 | CXR795 727 | CXR797 728 | CXR8 729 | CXR800 730 | CXR805 731 | CXR831 732 | CXR833 733 | CXR837 734 | CXR840 735 | CXR843 736 | CXR846 737 | CXR853 738 | CXR855 739 | CXR856 740 | CXR859 741 | CXR871 742 | CXR875 743 | CXR885 744 | CXR888 745 | CXR889 746 | CXR892 747 | CXR897 748 | CXR903 749 | CXR904 750 | CXR906 751 | CXR907 752 | CXR909 753 | CXR919 754 | CXR920 755 | CXR921 756 | CXR925 757 | CXR927 758 | CXR929 759 | CXR932 760 | CXR934 761 | CXR935 762 | CXR939 763 | CXR941 764 | CXR943 765 | CXR95 766 | CXR964 767 | CXR970 768 | CXR975 769 | CXR981 770 | CXR989 771 | CXR992 -------------------------------------------------------------------------------- /images/negbio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/images/negbio.png -------------------------------------------------------------------------------- /negbio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/__init__.py -------------------------------------------------------------------------------- /negbio/chexpert/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Stanford Machine Learning Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /negbio/chexpert/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The codes and patterns in this package are built on CheXpert labeler. 
3 | https://github.com/stanfordmlgroup/chexpert-labeler 4 | """ 5 | -------------------------------------------------------------------------------- /negbio/chexpert/constants.py: -------------------------------------------------------------------------------- 1 | # Observation constants 2 | CARDIOMEGALY = "Cardiomegaly" 3 | ENLARGED_CARDIOMEDIASTINUM = "Enlarged Cardiomediastinum" 4 | SUPPORT_DEVICES = "Support Devices" 5 | NO_FINDING = "No Finding" 6 | OBSERVATION = "observation" 7 | CATEGORIES = ["No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly", 8 | "Lung Lesion", "Airspace Opacity", "Edema", "Consolidation", 9 | "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion", 10 | "Pleural Other", "Fracture", "Support Devices"] 11 | 12 | # Numeric constants 13 | POSITIVE = 1 14 | NEGATIVE = 0 15 | UNCERTAIN = -1 16 | 17 | # Misc. constants 18 | UNCERTAINTY = "uncertainty" 19 | NEGATION = "negation" 20 | REPORTS = "Reports" 21 | -------------------------------------------------------------------------------- /negbio/chexpert/patterns/negation.txt: -------------------------------------------------------------------------------- 1 | # No definite XXX 2 | ({} > {} {lemma:/definite/}) > {dependency:/neg/} {} 3 | 4 | # No obvious XXX 5 | ({} > {} {lemma:/obvious/}) > {dependency:/neg/} {} 6 | 7 | 8 | {} > {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/} 9 | {} < {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/} 10 | ({} > {} {}) < {dependency:/nsubj|dobj/} {lemma:/unremarkable|normal/} 11 | {} < {} ({} > {dependency:/amod/} {lemma:/normal|unremarkable/}) 12 | {} < {} ({} < {dependency:/nsubj/} {lemma:/normal|unremarkable/}) 13 | {} < {dependency:/conj:no/} {} 14 | {} < {} ({} < {dependency:/conj:or/} ({} > {} {lemma:/no/})) 15 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/}) 16 | {} < {} ({dependency:/exclude/} < {} ({} > {} {lemma:/no/})) 17 | 18 | 19 | ({lemma:/silhouette/} > {} {}) < {dependency:/dobj|nsubj/} {lemma:/obscure/} 20 | 21 | ({} > {dependency:/amod/} {lemma:/normal|unremarkable/}) < {dependency:/dobj|nsubj/} {lemma:/demonstrate.*|show|present|display/} 22 | {} < {dependency:/nmod:of/} ( {lemma:/appearance/} > {dependency:/amod/} {lemma:/normal/} & < {dependency:/dobj/} {lemma:/demonstrate.*|show|present|display/}) 23 | 24 | {} < {dependency:/amod/} ({} < {dependency:/dep|nsubj/} {lemma:/normal|unremarkable/}) 25 | {} < {dependency:/amod/} ({} > {dependency:/neg/} {lemma:/no/}) 26 | {} < {dependency:/amod/}({lemma:/finding.*/} < {dependency:/dobj/} ({lemma:/acute/} > {dependency:/nsubj/} {lemma:/no/})) 27 | {} < {dependency:/amod/} ({lemma:/structure.*/} < {dependency:/dep|nsubj/} ({lemma:/appear/} > {dependency:/xcomp/} {lemma:/normal|unremarkable/})) 28 | 29 | {} < {dependency:/compound/} ({} > {dependency:/neg/} {}) 30 | {} < {dependency:/nsubj/} {lemma:/absent/} 31 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/case/} {lemma:/without/})) 32 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/neg/} {})) 33 | 34 | # XXX within normal limits 35 | {} < {} ({} < {} ({lemma:/show|demonstrate|present/} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/}))) 36 | ({} > {} {}) > {dependency:/nmod:within/} {lemma:/limit.*/} 37 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > 
{dependency:/case/} {lemma:/at|within/}) 38 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})) 39 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})) 40 | ({lemma:/vascularity/} > {dependency:/amod/} {lemma:/pulmonary/}) > {dependency:/amod/} {lemma:/normal/} 41 | {} < {dependency:/dobj|nsubj/} ({} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/})) 42 | {} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/}) 43 | {} > {} ({lemma:/limit/} > {} {lemma:/normal/}) 44 | 45 | # XXX is/appears/are/appear/remain/remains (now, otherwise) normal/unremarkable 46 | {} < {} ({lemma:/appear|remain/} > {} {lemma:/normal|unremarkable/}) 47 | 48 | # XXX is/appears/are/appear/remain/remains (now, otherwise) within normal limits 49 | {} > {} ({lemma:/remain|appear/} > {} ({lemma:/limit/} > {} {lemma:/normal/})) 50 | 51 | 52 | # rather than XXX 53 | {} <{dependency:/conj:negcc/} {} 54 | {} <{dependency:/nmod:without/} {} 55 | 56 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key 57 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key 58 | 59 | # removal of XXX 60 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|resolution|removal/} 61 | {} <{dependency:/nmod:for/} {lemma:/negative/} 62 | 63 | # exclude XXX 64 | {} <{} {lemma:/exclude/} 65 | 66 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/} 67 | 68 | # XXX has resolved 69 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {}) 70 | 71 | # there is no XXX 72 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/}) 73 | 74 | # without evidence|finding of|for XXX 75 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {}) 76 | 77 | # without development of XXX 78 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/without/}) 79 | 80 | # No development of XXX 81 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/no/}) 82 | 83 | # no evidence of|for XXX 84 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {}) 85 | 86 | # without evidence|finding of|for XXX 87 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/}) 88 | 89 | # no focus of XXX 90 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {}) 91 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/}) 92 | 93 | # no moderate to XXX 94 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {}) 95 | 96 | # no evidence of developing XXX 97 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {})) 98 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/})) 99 | 100 | # no focal XXX 101 | {} <{dependency:/dobj/} ({} >{dependency:/nsubj/} {lemma:/no/}) 102 | 103 | # XXX is previously demonstrated/visualized 104 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/}) 105 | 106 | # there is no NN to suggest/explain XXX 107 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/V.*/} > {} ({tag:/N.*/} > {} {lemma:/no/}))) 108 | 109 | # no NN to suggest/explain XXX 110 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/N.*/} > {} {lemma:/no/})) 111 | 112 | # no area of XXX 113 | {} < {dependency:/nmod:of/} ({lemma:/area/} > 
{dependency:/compound/} {lemma:/no/}) 114 | 115 | # XXX is not enlarged 116 | {} < {dependency:/nsubjpass/} ({lemma:/enlarge/} > {dependency:/neg/} {}) 117 | 118 | # without development of XXX 119 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {dependency:/case/} {lemma:/without/}) 120 | 121 | # XXX removed 122 | {} < {} {lemma:/remove/} 123 | {} > {} {lemma:/remove/} 124 | 125 | # XXX is no longer seen 126 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} ({} > {dependency:/neg/} {lemma:/no/})) 127 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} {lemma:/no/}) 128 | 129 | # without evidence seen for XXX 130 | {} < {} ({lemma:/see/} > {} ({} > {} ({lemma:/evidence/} > {} {lemma:/without/}))) 131 | {} < {} ({lemma:/see/} > {} ({lemma:/evidence/} > {} {lemma:/without/})) 132 | 133 | # normal/unremarkable appearance of XXX 134 | {} < {} ({lemma:/appearance/} > {} {lemma:/normal|unremarkable/}) 135 | 136 | # normal/unremarkable XXX | XXX is/appears normal/unremarkable 137 | # make more general 138 | {} > {} {lemma:/normal|unremarkable/} 139 | {} < {} {lemma:/normal|unremarkable/} 140 | 141 | # XXX has/have cleared 142 | # cleared XXX 143 | {} < {} {lemma:/clear/} 144 | {} > {} {lemma:/clear/} 145 | 146 | # no obvious associated XXX 147 | {} < {} ({lemma:/associate.*/} > {} ({lemma:/obvious/} > {dependency:/neg/} {})) 148 | {} > {dependency:/neg/} {} & > {} {lemma:/obvious/} & > {} {lemma:/associate.*/} 149 | 150 | # XXX with interval resolution 151 | {} > {} ({lemma:/resolution/} > {} {lemma:/interval/}) 152 | 153 | # no XXX / general negative case 154 | {} >{dependency:/neg/} {} 155 | {} >{} {lemma:/no/} 156 | {} >{dependency:/case/} {lemma:/without/} 157 | -------------------------------------------------------------------------------- /negbio/chexpert/patterns/post_negation_uncertainty.txt: -------------------------------------------------------------------------------- 1 | # Added Rules 2 | 3 | # Stable/unchanged silhouette/cardiomediastinal 4 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {dependency:/amod/} {lemma:/stable|unchanged/} 5 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/change/} > {dependency:/neg/} {}) 6 | 7 | # Silhouette/cardiomediastinal is stable|unchanged|not changed 8 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nsubj/} {lemma:/stable|unchanged/} 9 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/change/} > {dependency:/neg/} {}) 10 | 11 | # {} < {} ({lemma:/change/} > {dependency:/neg/} {}) 12 | 13 | # Silhouette/cardiomediastinal similar to prior 14 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/}) 15 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/})) 16 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/}) 17 | 
{lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/})) 18 | 19 | # Stable appearance of silhouette/cardiomediastinal 20 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nmod:of/} ({lemma:/appearance/} > {} {lemma:/stable/}) 21 | 22 | {} < {} ({lemma:/excluded/} > {dependency:/neg/} {}) 23 | {} < {dependency:/nmod:for/} {lemma:/suspicious/} 24 | {} < {dependency:/dobj/} ({lemma:/represent/} > {dependency:/advmod/} {lemma:/possibly/}) 25 | {} > {dependency:/cc/} {lemma:/and.or/} 26 | {} < {dependency:/conj:and.or/} {} 27 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/} 28 | 29 | {} < {dependency:/dep/} ({} > {dependency:/acl:relcl/} ({lemma:/represent/} < {dependency:/aux/} {lemma:/may/})) 30 | {} < {dependency:/nmod:for/} {lemma:/worrisome/} 31 | 32 | # XXX versus YYY 33 | {} < {dependency:/conj:versus/} {} 34 | {} > {dependency:/conj:versus/} {} 35 | 36 | # {} < {dependency:/nsubjpass/} ({lemma:/change/} > {dependency:/neg/} {}) 37 | ({lemma:/angle/} > {dependency:/nsubj/} {lemma:/costophrenic/}) > {dependency:/nmod:of/} {lemma:/blunt.*/} 38 | {} < {dependency:/nsubj/} ({} > {} ({lemma:/likely/} > {} {lemma:/less/})) 39 | 40 | {} < {dependency:/nmod:out/} {lemma:/cannot/} 41 | 42 | # outgoing edge 43 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/} 44 | {} >{} {lemma:/question/} 45 | 46 | # May/might/would/could be XXX 47 | {} > {} {lemma:/may|might|would|could/} 48 | 49 | # '{} >{dependency:/cop/} {lemma:/may|would|could/} 50 | 51 | # incoming edge 52 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/} 53 | {} <{dependency:/dobj/} {lemma:/suspect|favor|question|consider/} 54 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/} 55 | {} <{dependency:/nsubjpass/} {lemma:/suspect/} 56 | {} <{} {lemma:/possible/} 57 | 58 | # parsing error 59 | # suspected XXX 60 | {} <{dependency:/dobj/} {lemma:/suspect/} 61 | {} >{dependency:/advmod/} {lemma:/suspect/} 62 | 63 | # maybe due to XXX 64 | {} <{dependency:/dep/} {lemma:/maybe/} 65 | 66 | # may/could represent/reflect/indicate/include XXX 67 | {} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/}) 68 | 69 | # may/could represent/reflect/indicate/include the presence of XXX 70 | {} < {} ({lemma:/presence/} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/})) 71 | 72 | # maybe secondary to XXX 73 | {} <{dependency:/nmod:to/} {lemma:/secondary/} 74 | 75 | # may be due to XXX 76 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/}) 77 | 78 | # could related to XXX 79 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/}) 80 | 81 | # may be compatible with XXX 82 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/}) 83 | 84 | # question left XXX 85 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/}) 86 | {} >{} {lemma:/left/} <{} {lemma:/question/} 87 | 88 | # differential diagnosis includes 89 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/})) 90 | 91 | # may be XXX 92 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/} 93 | 94 | # parsing error 
95 | # XXX suspected 96 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/} 97 | 98 | # Correlation for symptoms of XXX 99 | {} < {dependency:/nmod:of/} ({lemma:/symptom/} < {dependency:/nmod:for/} {lemma:/correlation/}) 100 | 101 | # borderline heart size 102 | {lemma:/heart/} < {dependency:/compound/} ({lemma:/size/} > {} {lemma:/borderline/}) 103 | 104 | # XXX could/might/may/possibly be present 105 | {} < {} ({lemma:/present/} > {dependency:/aux/} {lemma:/could|might|may|possibly|can/}) 106 | 107 | # XXX is poorly evaluated 108 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/poorly/}) 109 | 110 | # XXX is incompletely evaluated 111 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/incompletely/}) 112 | 113 | # XXX is not well visualized/evaluated 114 | {} < {} ({lemma:/evaluate|visualize/} >{dependency:/neg/} {}) 115 | {} > {} ({lemma:/evaluate|visualize/} > {dependency:/neg/} {}) 116 | 117 | # obscuring the XXX | XXX is obscured | obscured XXX 118 | {} < {} {lemma:/obscure/} 119 | 120 | # XXX could appear 121 | {} < {dependency:/nsubj/} ({lemma:/appear/} > {} {lemma:/could|may|might|can/}) 122 | 123 | # may be consistent/compatible with XXX 124 | {} < {dependency:/nmod:with/} ({lemma:/consistent/} > {} {lemma:/may|might|can|could/}) 125 | 126 | # correlate clinically for XXX 127 | {} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/}) 128 | 129 | # correlate clinically for evidence of XXX 130 | {} < {dependency:/nmod:of/} ({lemma:/evidence|sign|signs|symptoms|symptom/} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/})) 131 | 132 | # XXX are not clearly seen 133 | {} < {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/}) 134 | {} > {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/}) 135 | 136 | # possibly reflecting a XXX 137 | {} < {} ({lemma:/reflect/} > {} {lemma:/possibly/}) 138 | 139 | # XXX was not appreciated 140 | {} < {} ({lemma:/appreciate/} > {dependency:/neg/} {}) 141 | 142 | # XXX may|might|could (also) have this appearance 143 | {} < {} (({lemma:/have/} > {} {lemma:/may|might|could/}) > {} {lemma:/appearance/}) 144 | 145 | # vascular congestion 146 | # pulmonary congestion 147 | # indistinctness 148 | # vascular prominence 149 | {lemma:/congestion/} > {} {lemma:/vascular/} 150 | {lemma:/congestion/} > {} {lemma:/pulmonary/} 151 | {lemma:/indistinctness/} 152 | {lemma:/prominence/} > {} {lemma:/vascular/} 153 | 154 | # XXX or YYY 155 | {} > {dependency:/conj:or/} {} 156 | {} < {dependency:/conj:or/} {} 157 | 158 | -------------------------------------------------------------------------------- /negbio/chexpert/patterns/pre_negation_uncertainty.txt: -------------------------------------------------------------------------------- 1 | # Reserved for uncertainty rules that need to be matched first. 
2 | 3 | # cannot exclude some XXX 4 | {} < {} ({lemma:/exclude/} >{} {lemma:/cannot/}) 5 | 6 | # XXX is not excluded 7 | {} < {} ({lemma:/exclude/} > {dependency:/neg/} {}) 8 | 9 | # no new XXX 10 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/} 11 | {} < {} ({lemma:/new/} > {} {lemma:/no/}) 12 | {} < {dependency:/compound/} ({} > {} {lemma:/new/} & > {} {lemma:/no/}) 13 | 14 | # no new area of XXX 15 | {} < {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/}) 16 | {} > {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/}) 17 | 18 | # cannot rule out XXX 19 | {} <{dependency:/nmod:out/} ({lemma:/rule/} > {} {lemma:/cannot/}) 20 | 21 | # no evidence to rule out XXX 22 | 23 | {} < {dependency:/nmod:out/} ({lemma:/rule/} < {} ({lemma:/evidence/} > {} {lemma:/no/})) 24 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/airspace_opacity.txt: -------------------------------------------------------------------------------- 1 | opaci 2 | decreased translucency 3 | increased density 4 | airspace disease 5 | air-space disease 6 | air space disease 7 | infiltrate 8 | infiltration 9 | interstitial marking 10 | interstitial pattern 11 | interstitial lung 12 | reticular pattern 13 | reticular marking 14 | reticulation 15 | parenchymal scarring 16 | peribronchial thickening 17 | wall thickening 18 | scar 19 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/atelectasis.txt: -------------------------------------------------------------------------------- 1 | atelecta 2 | collapse 3 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/cardiomegaly.txt: -------------------------------------------------------------------------------- 1 | cardiomegaly 2 | the heart 3 | heart size 4 | cardiac enlargement 5 | cardiac size 6 | cardiac shadow 7 | cardiac contour 8 | cardiac silhouette 9 | enlarged heart -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/consolidation.txt: -------------------------------------------------------------------------------- 1 | consolidat -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/edema.txt: -------------------------------------------------------------------------------- 1 | edema 2 | heart failure 3 | chf 4 | vascular congestion 5 | pulmonary congestion 6 | indistinctness 7 | vascular prominence -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/enlarged_cardiomediastinum.txt: -------------------------------------------------------------------------------- 1 | _mediastinum 2 | cardiomediastinum 3 | contour 4 | mediastinal configuration 5 | mediastinal silhouette 6 | pericardial silhouette 7 | cardiac silhouette and vascularity 8 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/fracture.txt: -------------------------------------------------------------------------------- 1 | fracture 2 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/lung_lesion.txt: -------------------------------------------------------------------------------- 1 | mass 2 | nodular density 3 | nodular densities 4 | nodular opacity 5 | nodular opacities 6 | nodular opacification 7 | 
nodule 8 | lump 9 | cavitary lesion 10 | carcinoma 11 | neoplasm 12 | tumor 13 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/no_finding.txt: -------------------------------------------------------------------------------- 1 | emphysema 2 | blunt 3 | density 4 | elevation 5 | eventration 6 | scoliosis 7 | degenera 8 | calcifi 9 | hyperinflation 10 | bronchospasm 11 | asthma 12 | hernia 13 | copd 14 | interstitial markings 15 | plaque 16 | osteophytosis 17 | aortic disease 18 | bronchiolitis 19 | airways disease 20 | thickening 21 | cephalization 22 | aspiration 23 | bullae 24 | hyperinflat 25 | contusion 26 | atherosclero 27 | osteopenia 28 | metastasis 29 | granuloma 30 | pneumomediastinum 31 | pneumoperitoneum 32 | osteodystrophy 33 | cuffing 34 | irregular lucency 35 | inflam 36 | fissure 37 | hypertension 38 | prominen 39 | kyphosis 40 | defib 41 | hyperexpansion 42 | bullet 43 | reticula 44 | thoracentesis 45 | bronchitis 46 | volume loss 47 | deformity 48 | hemorrhage 49 | hematoma 50 | radiopaque 51 | aerophagia 52 | arthropathy 53 | tracheostomy 54 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/pleural_effusion.txt: -------------------------------------------------------------------------------- 1 | pleural fluid 2 | effusion -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/pleural_other.txt: -------------------------------------------------------------------------------- 1 | pleural thickening 2 | fibrosis 3 | fibrothorax 4 | pleural scar 5 | pleural parenchymal scar 6 | pleuro-parenchymal scar 7 | pleuro-pericardial scar 8 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/pneumonia.txt: -------------------------------------------------------------------------------- 1 | pneumonia 2 | infection 3 | infectious process 4 | infectious -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/pneumothorax.txt: -------------------------------------------------------------------------------- 1 | pneumothorax 2 | pneumothoraces -------------------------------------------------------------------------------- /negbio/chexpert/phrases/mention/support_devices.txt: -------------------------------------------------------------------------------- 1 | pacer 2 | _line_ 3 | lines 4 | picc 5 | tube 6 | valve 7 | catheter 8 | pacemaker 9 | hardware 10 | arthroplast 11 | marker 12 | icd 13 | defib 14 | device 15 | drain_ 16 | plate 17 | screw 18 | cannula 19 | apparatus 20 | coil 21 | support 22 | equipment 23 | mediport 24 | -------------------------------------------------------------------------------- /negbio/chexpert/phrases/unmention/airspace_opacity.txt: -------------------------------------------------------------------------------- 1 | pleural scar -------------------------------------------------------------------------------- /negbio/chexpert/phrases/unmention/lung_lesion.txt: -------------------------------------------------------------------------------- 1 | calcified nodul 2 | massive 3 | massengale -------------------------------------------------------------------------------- /negbio/chexpert/phrases/unmention/pleural_effusion.txt: -------------------------------------------------------------------------------- 1 | pericardial effusion 
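The phrase files above drive mention and unmention matching, while the files under negbio/chexpert/patterns hold the negation and uncertainty rules proper. Each non-comment line in those pattern files is an ngrex expression over a universal-dependency graph: {} matches any node, an attribute constraint such as {lemma:/no/} is a regular expression over a node attribute, and > / < walk an edge from governor to dependent (or back) whose attributes match the brace group attached to the operator, e.g. {dependency:/neg/}. The sketch below is illustrative only and not a repository file: it assumes negbio.ngrex.compile and pattern.finditer behave as they are used in negbio/neg/neg_detector.py, and it hand-builds a networkx DiGraph carrying the node and edge attributes ('lemma', 'tag', 'start', 'end'; 'dependency') that negbio.neg.semgraph.load would normally produce from a parsed sentence.

# Illustrative sketch, not part of the repository (see assumptions above).
import networkx as nx

from negbio import ngrex

# Dependency graph for the fragment "no effusion": the finding node governs
# "no" through a "neg" edge. Node ids follow the 'T<n>' convention that
# neg_detector.is_neg_graph1 also relies on.
g = nx.DiGraph()
g.add_node('T0', lemma='no', tag='DT', start=0, end=2)
g.add_node('T1', lemma='effusion', tag='NN', start=3, end=11)
g.add_edge('T1', 'T0', dependency='neg')

# The general negation rule from the end of negation.txt: any node with an
# outgoing "neg" edge is treated as negated.
pattern = ngrex.compile('{} >{dependency:/neg/} {}')
for m in pattern.finditer(g):
    print(m.group(0))  # expected: 'T1', the negated finding node

Detector.match_neg in negbio/neg/neg_detector.py wraps this same loop and only accepts a match when the matched node, m.group(0), overlaps the character span of the finding being checked.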
-------------------------------------------------------------------------------- /negbio/chexpert/stages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/chexpert/stages/__init__.py -------------------------------------------------------------------------------- /negbio/chexpert/stages/aggregate.py: -------------------------------------------------------------------------------- 1 | """Define mention aggregator class.""" 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | from negbio.chexpert.constants import NEGATIVE, UNCERTAIN, POSITIVE, SUPPORT_DEVICES, NO_FINDING, OBSERVATION, \ 6 | NEGATION, UNCERTAINTY, CARDIOMEGALY 7 | 8 | 9 | class Aggregator(object): 10 | """Aggregate mentions of observations from radiology reports.""" 11 | 12 | def __init__(self, categories, verbose=False): 13 | self.categories = categories 14 | 15 | self.verbose = verbose 16 | 17 | def dict_to_vec(self, d): 18 | """ 19 | Convert a dictionary of the form 20 | 21 | {cardiomegaly: [1], 22 | opacity: [u, 1], 23 | fracture: [0]} 24 | 25 | into a vector of the form 26 | 27 | [np.nan, np.nan, 1, u, np.nan, ..., 0, np.nan] 28 | """ 29 | vec = [] 30 | for category in self.categories: 31 | # There was a mention of the category. 32 | if category in d: 33 | label_list = d[category] 34 | # Only one label, no conflicts. 35 | if len(label_list) == 1: 36 | vec.append(label_list[0]) 37 | # Multiple labels. 38 | else: 39 | # Case 1. There is negated and uncertain. 40 | if NEGATIVE in label_list and UNCERTAIN in label_list: 41 | vec.append(UNCERTAIN) 42 | # Case 2. There is negated and positive. 43 | elif NEGATIVE in label_list and POSITIVE in label_list: 44 | vec.append(POSITIVE) 45 | # Case 3. There is uncertain and positive. 46 | elif UNCERTAIN in label_list and POSITIVE in label_list: 47 | vec.append(POSITIVE) 48 | # Case 4. All labels are the same. 
49 | else: 50 | vec.append(label_list[0]) 51 | 52 | # No mention of the category 53 | else: 54 | vec.append(np.nan) 55 | 56 | return vec 57 | 58 | def aggregate(self, collection): 59 | labels = [] 60 | documents = collection.documents 61 | if self.verbose: 62 | print("Aggregating mentions...") 63 | documents = tqdm(documents) 64 | for document in documents: 65 | label_dict = {} 66 | impression_passage = document.passages[0] 67 | no_finding = True 68 | for annotation in impression_passage.annotations: 69 | category = annotation.infons[OBSERVATION] 70 | 71 | if NEGATION in annotation.infons: 72 | label = NEGATIVE 73 | elif UNCERTAINTY in annotation.infons: 74 | label = UNCERTAIN 75 | else: 76 | label = POSITIVE 77 | 78 | # If at least one non-support category has an uncertain or 79 | # positive label, there was a finding 80 | if (category != SUPPORT_DEVICES and 81 | label in [UNCERTAIN, POSITIVE]): 82 | no_finding = False 83 | 84 | # Don't add any labels for No Finding 85 | if category == NO_FINDING: 86 | continue 87 | 88 | # add exception for 'chf' and 'heart failure' 89 | if ((label in [UNCERTAIN, POSITIVE]) and 90 | (annotation.text == 'chf' or 91 | annotation.text == 'heart failure')): 92 | if CARDIOMEGALY not in label_dict: 93 | label_dict[CARDIOMEGALY] = [UNCERTAIN] 94 | else: 95 | label_dict[CARDIOMEGALY].append(UNCERTAIN) 96 | 97 | if category not in label_dict: 98 | label_dict[category] = [label] 99 | else: 100 | label_dict[category].append(label) 101 | 102 | if no_finding: 103 | label_dict[NO_FINDING] = [POSITIVE] 104 | 105 | label_vec = self.dict_to_vec(label_dict) 106 | 107 | labels.append(label_vec) 108 | 109 | return np.array(labels) 110 | 111 | 112 | class NegBioAggregator(Aggregator): 113 | LABEL_MAP = {UNCERTAIN: 'Uncertain', POSITIVE: 'Positive', NEGATIVE: 'Negative'} 114 | 115 | def aggregate_doc(self, document): 116 | """ 117 | Aggregate mentions of observations from radiology reports. 118 | 119 | Args: 120 | document (BioCDocument): 121 | 122 | Returns: 123 | BioCDocument 124 | """ 125 | label_dict = {} 126 | no_finding = True 127 | for passage in document.passages: 128 | for annotation in passage.annotations: 129 | category = annotation.infons[OBSERVATION] 130 | 131 | if NEGATION in annotation.infons: 132 | label = NEGATIVE 133 | elif UNCERTAINTY in annotation.infons: 134 | label = UNCERTAIN 135 | else: 136 | label = POSITIVE 137 | 138 | # If at least one non-support category has an uncertain or 139 | # positive label, there was a finding 140 | if category != SUPPORT_DEVICES \ 141 | and label in [UNCERTAIN, POSITIVE]: 142 | no_finding = False 143 | 144 | # Don't add any labels for No Finding 145 | if category == NO_FINDING: 146 | continue 147 | 148 | # add exception for 'chf' and 'heart failure' 149 | if label in [UNCERTAIN, POSITIVE] \ 150 | and (annotation.text == 'chf' or annotation.text == 'heart failure'): 151 | if CARDIOMEGALY not in label_dict: 152 | label_dict[CARDIOMEGALY] = [UNCERTAIN] 153 | else: 154 | label_dict[CARDIOMEGALY].append(UNCERTAIN) 155 | 156 | if category not in label_dict: 157 | label_dict[category] = [label] 158 | else: 159 | label_dict[category].append(label) 160 | 161 | if no_finding: 162 | label_dict[NO_FINDING] = [POSITIVE] 163 | 164 | for category in self.categories: 165 | key = 'CheXpert/{}'.format(category) 166 | # There was a mention of the category. 167 | if category in label_dict: 168 | label_list = label_dict[category] 169 | # Only one label, no conflicts. 
170 | if len(label_list) == 1: 171 | document.infons[key] = self.LABEL_MAP[label_list[0]] 172 | # Multiple labels. 173 | else: 174 | # Case 1. There is negated and uncertain. 175 | if NEGATIVE in label_list and UNCERTAIN in label_list: 176 | document.infons[key] = self.LABEL_MAP[UNCERTAIN] 177 | # Case 2. There is negated and positive. 178 | elif NEGATIVE in label_list and POSITIVE in label_list: 179 | document.infons[key] = self.LABEL_MAP[POSITIVE] 180 | # Case 3. There is uncertain and positive. 181 | elif UNCERTAIN in label_list and POSITIVE in label_list: 182 | document.infons[key] = self.LABEL_MAP[POSITIVE] 183 | # Case 4. All labels are the same. 184 | else: 185 | document.infons[key] = self.LABEL_MAP[label_list[0]] 186 | 187 | # No mention of the category 188 | else: 189 | pass 190 | return document 191 | -------------------------------------------------------------------------------- /negbio/chexpert/stages/classify.py: -------------------------------------------------------------------------------- 1 | """Define mention classifier class. 2 | 3 | Author: stanfordmlgroup 4 | Modified by: Yifan Peng 5 | """ 6 | import logging 7 | 8 | from negbio import ngrex 9 | from negbio.chexpert.constants import * 10 | from negbio.neg import semgraph, propagator, neg_detector 11 | 12 | 13 | class ModifiedDetector(neg_detector.Detector): 14 | """Child class of NegBio Detector class. 15 | 16 | Overrides parent methods __init__, detect, and match_uncertainty. 17 | """ 18 | 19 | def __init__(self, pre_negation_uncertainty_path, 20 | negation_path, post_negation_uncertainty_path): 21 | super(ModifiedDetector, self).__init__(negation_path, post_negation_uncertainty_path) 22 | self.preneg_uncertain_patterns = ngrex.load(pre_negation_uncertainty_path) 23 | 24 | def detect(self, sentence, locs): 25 | """Detect rules in report sentences. 26 | 27 | Args: 28 | sentence(BioCSentence): a sentence with universal dependencies 29 | locs(list): a list of (begin, end) 30 | 31 | Return: 32 | (str, MatcherObj, (begin, end)): negation or uncertainty, 33 | matcher, matched annotation 34 | """ 35 | try: 36 | g = semgraph.load(sentence) 37 | propagator.propagate(g) 38 | except Exception: 39 | logging.exception('Cannot parse dependency graph [offset=%s]', sentence.offset) 40 | raise 41 | else: 42 | for loc in locs: 43 | for node in neg_detector.find_nodes(g, loc[0], loc[1]): 44 | # Match pre-negation uncertainty rules first. 45 | preneg_m = self.match_prenegation_uncertainty(g, node) 46 | if preneg_m: 47 | yield UNCERTAINTY, preneg_m, loc 48 | else: 49 | # Then match negation rules. 50 | neg_m = self.match_neg(g, node) 51 | if neg_m: 52 | yield NEGATION, neg_m, loc 53 | else: 54 | # Finally match post-negation uncertainty rules. 
55 | postneg_m = self.match_uncertainty(g, node) 56 | if postneg_m: 57 | yield UNCERTAINTY, postneg_m, loc 58 | 59 | def match_uncertainty(self, graph, node): 60 | for pattern in self.uncertain_patterns: 61 | for m in pattern.finditer(graph): 62 | n0 = m.group(0) 63 | if n0 == node: 64 | return m 65 | 66 | def match_prenegation_uncertainty(self, graph, node): 67 | for pattern in self.preneg_uncertain_patterns: 68 | for m in pattern.finditer(graph): 69 | n0 = m.group(0) 70 | if n0 == node: 71 | return m 72 | 73 | -------------------------------------------------------------------------------- /negbio/chexpert/stages/extract.py: -------------------------------------------------------------------------------- 1 | """Define observation extractor class.""" 2 | import re 3 | import itertools 4 | from collections import defaultdict 5 | from tqdm import tqdm 6 | from negbio.chexpert.constants import CARDIOMEGALY, ENLARGED_CARDIOMEDIASTINUM, OBSERVATION 7 | 8 | import bioc 9 | 10 | 11 | class Extractor(object): 12 | """Extract observations from impression sections of reports.""" 13 | def __init__(self, mention_phrases_dir, unmention_phrases_dir, 14 | verbose=False): 15 | self.verbose = verbose 16 | self.observation2mention_phrases\ 17 | = self.load_phrases(mention_phrases_dir, "mention") 18 | self.observation2unmention_phrases\ 19 | = self.load_phrases(unmention_phrases_dir, "unmention") 20 | self.add_unmention_phrases() 21 | 22 | def load_phrases(self, phrases_dir, phrases_type): 23 | """Read in map from observations to phrases for matching.""" 24 | observation2phrases = defaultdict(list) 25 | for phrases_path in phrases_dir.glob("*.txt"): 26 | with phrases_path.open() as f: 27 | for line in f: 28 | phrase = line.strip().replace("_", " ") 29 | observation = phrases_path.stem.replace("_", " ").title() 30 | if phrase:  # skip blank lines 31 | observation2phrases[observation].append(phrase) 32 | 33 | if self.verbose: 34 | print("Loading {} phrases for {} observations.".format(phrases_type, len(observation2phrases))) 35 | 36 | return observation2phrases 37 | 38 | def add_unmention_phrases(self): 39 | cardiomegaly_mentions\ 40 | = self.observation2mention_phrases[CARDIOMEGALY] 41 | enlarged_cardiom_mentions\ 42 | = self.observation2mention_phrases[ENLARGED_CARDIOMEDIASTINUM] 43 | positional_phrases = (["over the", "overly the", "in the"], 44 | ["", " superior", " left", " right"]) 45 | positional_unmentions = [e1 + e2 46 | for e1 in positional_phrases[0] 47 | for e2 in positional_phrases[1]] 48 | cardiomegaly_unmentions = [e1 + " " + e2.replace("the ", "") 49 | for e1 in positional_unmentions 50 | for e2 in cardiomegaly_mentions 51 | if e2 not in ["cardiomegaly", 52 | "cardiac enlargement"]] 53 | enlarged_cardiomediastinum_unmentions\ 54 | = [e1 + " " + e2 55 | for e1 in positional_unmentions 56 | for e2 in enlarged_cardiom_mentions] 57 | 58 | self.observation2unmention_phrases[CARDIOMEGALY]\ 59 | = cardiomegaly_unmentions 60 | self.observation2unmention_phrases[ENLARGED_CARDIOMEDIASTINUM]\ 61 | = enlarged_cardiomediastinum_unmentions 62 | 63 | def overlaps_with_unmention(self, sentence, observation, start, end): 64 | """Return True if a given match overlaps with an unmention phrase.""" 65 | unmention_overlap = False 66 | unmention_list = self.observation2unmention_phrases.get(observation, 67 | []) 68 | for unmention in unmention_list: 69 | unmention_matches = re.finditer(unmention, sentence.text) 70 | for unmention_match in unmention_matches: 71 | unmention_start, unmention_end = unmention_match.span(0) 72 | if start < 
unmention_end and end > unmention_start: 73 | unmention_overlap = True 74 | break # break early if overlap is found 75 | if unmention_overlap: 76 | break # break early if overlap is found 77 | 78 | return unmention_overlap 79 | 80 | def add_match(self, impression, sentence, ann_index, phrase, 81 | observation, start, end): 82 | """Add the match data and metadata to the impression object 83 | in place.""" 84 | annotation = bioc.BioCAnnotation() 85 | annotation.id = ann_index 86 | annotation.infons['CUI'] = None 87 | annotation.infons['semtype'] = None 88 | annotation.infons['term'] = phrase 89 | annotation.infons[OBSERVATION] = observation 90 | annotation.infons['annotator'] = 'CheXpert labeler' 91 | length = end - start 92 | annotation.add_location(bioc.BioCLocation(sentence.offset + start, 93 | length)) 94 | annotation.text = sentence.text[start:start+length] 95 | 96 | impression.annotations.append(annotation) 97 | 98 | def extract(self, collection): 99 | """Extract the observations in each report. 100 | 101 | Args: 102 | collection (BioCCollection): Impression passages of each report. 103 | 104 | Return: 105 | extracted_mentions 106 | """ 107 | 108 | # The BioCCollection consists of a series of documents. 109 | # Each document is a report (just the Impression section 110 | # of the report.) 111 | documents = collection.documents 112 | if self.verbose: 113 | print("Extracting mentions...") 114 | documents = tqdm(documents) 115 | for document in documents: 116 | # Get the Impression section. 117 | impression = document.passages[0] 118 | annotation_index = itertools.count(len(impression.annotations)) 119 | 120 | for sentence in impression.sentences: 121 | obs_phrases = self.observation2mention_phrases.items() 122 | for observation, phrases in obs_phrases: 123 | for phrase in phrases: 124 | matches = re.finditer(phrase, sentence.text) 125 | for match in matches: 126 | start, end = match.span(0) 127 | 128 | if self.overlaps_with_unmention(sentence, 129 | observation, 130 | start, 131 | end): 132 | continue 133 | 134 | self.add_match(impression, 135 | sentence, 136 | str(next(annotation_index)), 137 | phrase, 138 | observation, 139 | start, 140 | end) 141 | 142 | 143 | class NegBioExtractor(Extractor): 144 | def extract_doc(self, document): 145 | annotation_index = itertools.count() 146 | for passage in document.passages: 147 | for sentence in passage.sentences: 148 | obs_phrases = self.observation2mention_phrases.items() 149 | for observation, phrases in obs_phrases: 150 | for phrase in phrases: 151 | matches = re.finditer(phrase, sentence.text) 152 | for match in matches: 153 | start, end = match.span(0) 154 | if self.overlaps_with_unmention(sentence, observation, start, end): 155 | continue 156 | self.add_match(passage, sentence, str(next(annotation_index)), phrase, 157 | observation, start, end) 158 | return document 159 | 160 | def extract_all(self, collection): 161 | """Extract the observations in each report.""" 162 | annotation_index = itertools.count() 163 | for doc in collection.documents: 164 | for passage in doc.passages: 165 | for sentence in passage.sentences: 166 | obs_phrases = self.observation2mention_phrases.items() 167 | for observation, phrases in obs_phrases: 168 | for phrase in phrases: 169 | matches = re.finditer(phrase, sentence.text) 170 | for match in matches: 171 | start, end = match.span(0) 172 | if self.overlaps_with_unmention(sentence, observation, start, end): 173 | continue 174 | self.add_match(passage, sentence, str(next(annotation_index)), phrase, 175 | 
observation, start, end) 176 | return collection 177 | -------------------------------------------------------------------------------- /negbio/chexpert/stages/load.py: -------------------------------------------------------------------------------- 1 | """Define report loader class.""" 2 | import re 3 | 4 | from negbio.pipeline.section_split import split_document 5 | 6 | 7 | def _maketrans(s): 8 | s = s.replace(',', ', ') 9 | s = s.replace('.', '. ') 10 | return s 11 | 12 | 13 | def extract_impression_from_passages(document): 14 | """Extract the Impression section from a BioC document.""" 15 | document.passages = [passage for passage in document.passages 16 | if passage.infons['title'] == "impression"] 17 | 18 | assert len(document.passages) <= 1, "The document contains {} impression passages.".format(len(document.passages)) 19 | 20 | assert len(document.passages) >= 1, "The document contains no explicit impression passage." 21 | 22 | 23 | class NegBioLoader(object): 24 | """Report impression loader.""" 25 | def __init__(self, extract_impression=False): 26 | self.extract_impression = extract_impression 27 | # self.punctuation_spacer = string.maketrans({key: "{} ".format(key) 28 | # for key in ".,"}) 29 | # self.stop_spacer = string.maketrans('.', '. ') 30 | # self.comma_spacer = string.maketrans(',', ', ') 31 | 32 | def clean_doc(self, document): 33 | """Load and clean the reports.""" 34 | for passage in document.passages: 35 | passage.text = self.clean(passage.text) 36 | 37 | if self.extract_impression: 38 | document = split_document(document) 39 | extract_impression_from_passages(document) 40 | 41 | return document 42 | 43 | def clean(self, report): 44 | """Clean the report text.""" 45 | lower_report = report.lower() 46 | # Change `and/or` to `or`. 47 | corrected_report = re.sub('and/or', 48 | 'or', 49 | lower_report) 50 | # Change any `XXX/YYY` to `XXX or YYY`. 51 | corrected_report = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])', 52 | ' or ', 53 | corrected_report) 54 | # Clean double periods 55 | clean_report = corrected_report.replace("..", ".") 56 | # Insert space after commas and periods. 57 | clean_report = _maketrans(clean_report) 58 | # Convert any multi white spaces to single white spaces. 59 | clean_report = ' '.join(clean_report.split()) 60 | 61 | return clean_report 62 | -------------------------------------------------------------------------------- /negbio/cli_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import docopt 5 | 6 | 7 | __root__ = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))) 8 | 9 | 10 | def get_args(args): 11 | s = '' 12 | for k in args: 13 | s += ' {}: {}\n'.format(k, args[k]) 14 | return s 15 | 16 | 17 | def parse_args(doc, **kwargs): 18 | argv = docopt.docopt(doc, **kwargs) 19 | if argv['--verbose']: 20 | logging.basicConfig(level=logging.DEBUG) 21 | else: 22 | logging.basicConfig(level=logging.INFO) 23 | logging.debug('Arguments:\n%s', get_args(argv)) 24 | return argv 25 | 26 | 27 | def get_absolute_path(argv, key, default_value): 28 | logging.debug(__root__) 29 | if argv[key] == default_value: 30 | argv[key] = os.path.join(__root__, argv[key]) 31 | return argv -------------------------------------------------------------------------------- /negbio/compat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python 3 compatibility tools. 
3 | """ 4 | import sys 5 | 6 | try: 7 | from pathlib import Path, PurePath 8 | except ImportError: 9 | try: 10 | from pathlib2 import Path, PurePath 11 | except ImportError: 12 | Path = PurePath = None 13 | 14 | if sys.version_info[0] >= 3: 15 | basestring = str 16 | else: 17 | basestring = basestring 18 | 19 | 20 | def is_pathlib_path(obj): 21 | """ 22 | Check whether obj is a pathlib.Path object. 23 | Prefer using `isinstance(obj, os_PathLike)` instead of this function. 24 | """ 25 | return Path is not None and isinstance(obj, Path) 26 | -------------------------------------------------------------------------------- /negbio/ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/ext/__init__.py -------------------------------------------------------------------------------- /negbio/ext/normalize_mimiccxr.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | 5 | def pattern_repl(matchobj): 6 | """ 7 | Replace [**Patterns**] with spaces. 8 | """ 9 | s = matchobj.group(0).lower() 10 | return ' '.rjust(len(s)) 11 | 12 | 13 | def sub(text): 14 | text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text) 15 | text = re.sub(r'_', ' ', text) 16 | return text 17 | 18 | 19 | def find_start(text): 20 | return 0 21 | 22 | 23 | def find_end(text): 24 | ends = [len(text)] 25 | patterns = [ 26 | re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I), 27 | re.compile(r'\n {3,}DR.', re.I), 28 | re.compile(r'[ ]{1,}RADLINE ', re.I), 29 | re.compile(r'.*electronically signed on', re.I), 30 | re.compile(r'M\[0KM\[0KM') 31 | ] 32 | for pattern in patterns: 33 | m = pattern.search(text) 34 | if m: 35 | ends.append(m.start()) 36 | return min(ends) 37 | 38 | 39 | def trim(text): 40 | text = sub(text) 41 | start = find_start(text) 42 | end = find_end(text) 43 | 44 | new_text = '' 45 | if start > 0: 46 | new_text += ' ' * start 47 | new_text += text[start:end] 48 | if len(text) - end > 0: 49 | new_text += ' ' * (len(text) - end) 50 | return new_text 51 | 52 | 53 | def normalize(document): 54 | """ 55 | Assume there are only one passage in the document 56 | """ 57 | try: 58 | if len(document.passages) == 0: 59 | logging.warning('Skipped: there is no text in document %s', document.id) 60 | elif len(document.passages) > 1: 61 | logging.warning('Skipped: there is more than one passage in document %s', document.id) 62 | else: 63 | document.passages[0].text = trim(document.passages[0].text) 64 | return document 65 | except: 66 | logging.exception('Cannot find text in document %s', document.id) 67 | -------------------------------------------------------------------------------- /negbio/main_chexpert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect negative and uncertain findings from SOURCE and output to DEST 3 | Example: python negbio/main_chexpert.py --output=examples/test.neg.xml examples/1.txt examples/2.txt 4 | python negbio/main_chexpert.py --skip-to-bioc --output=examples/test.neg.xml examples/1.xml 5 | 6 | Usage: 7 | main_chexpert text [options] --output=DEST SOURCES ... 8 | main_chexpert bioc [options] --output=DEST SOURCE 9 | 10 | Options: 11 | --mention_phrases_dir= Directory containing mention phrases for each observation. 
12 | [default: negbio/chexpert/phrases/mention] 13 | --unmention_phrases_dir= Directory containing unmention phrases for each observation. 14 | [default: negbio/chexpert/phrases/unmention] 15 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt] 16 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules 17 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt] 18 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules 19 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt] 20 | --bllip-model=MODEL_DIR Bllip parser model directory 21 | [default: ~/.local/share/bllipparser/GENIA+PubMed] 22 | --split-document Split document into passages based on section titles such as "Finding", 23 | "Impression" 24 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline 25 | is always a sentence break. False means to ignore newlines for the 26 | purpose of sentence splitting. This is appropriate for continuous text, 27 | when just the non-whitespace characters should be used to determine 28 | sentence breaks. 29 | --verbose Print more information about progress. 30 | """ 31 | from __future__ import print_function 32 | 33 | import os 34 | 35 | import bioc 36 | import tqdm 37 | from pathlib2 import Path 38 | 39 | from negbio.chexpert.stages.aggregate import NegBioAggregator 40 | from negbio.chexpert.stages.classify import ModifiedDetector, CATEGORIES 41 | from negbio.chexpert.stages.extract import NegBioExtractor 42 | from negbio.chexpert.stages.load import NegBioLoader 43 | from negbio.cli_utils import parse_args, get_absolute_path 44 | from negbio.pipeline import text2bioc, negdetect 45 | from negbio.pipeline.parse import NegBioParser 46 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer 47 | from negbio.pipeline.ssplit import NegBioSSplitter 48 | 49 | 50 | def pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator, verbose=False): 51 | """ 52 | Args: 53 | loader (NegBioLoader) 54 | ssplitter (NegBioSSplitter) 55 | parser (NegBioParser) 56 | extractor (NegBioExtractor) 57 | ptb2dep (NegBioPtb2DepConverter) 58 | neg_detector (ModifiedDetector) 59 | aggregator (NegBioAggregator) 60 | """ 61 | # for document in collection.documents: 62 | # 63 | # for passage in document.passages: 64 | # passage.text = clean(passage.text) 65 | # ssplitter.split_doc(document) 66 | for document in tqdm.tqdm(collection.documents, disable=not verbose): 67 | document = loader.clean_doc(document) 68 | document = ssplitter.split_doc(document) 69 | document = extractor.extract_doc(document) 70 | document = parser.parse_doc(document) 71 | document = ptb2dep.convert_doc(document) 72 | document = negdetect.detect(document, neg_detector) 73 | document = aggregator.aggregate_doc(document) 74 | # remove sentence 75 | for passage in document.passages: 76 | del passage.sentences[:] 77 | 78 | return collection 79 | 80 | 81 | def main(): 82 | argv = parse_args(__doc__, version='version 2') 83 | print(argv) 84 | 85 | lemmatizer = Lemmatizer() 86 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True) 87 | ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break']) 88 | parser = NegBioParser(model_dir=argv['--bllip-model']) 89 | 90 | argv = get_absolute_path(argv, 91 | '--mention_phrases_dir', 92 | 'negbio/chexpert/phrases/mention') 93 | argv = get_absolute_path(argv, 94 | '--unmention_phrases_dir', 95 | 
'negbio/chexpert/phrases/unmention') 96 | argv = get_absolute_path(argv, 97 | '--pre-negation-uncertainty-patterns', 98 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt') 99 | argv = get_absolute_path(argv, 100 | '--post-negation-uncertainty-patterns', 101 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt') 102 | argv = get_absolute_path(argv, 103 | '--neg-patterns', 104 | 'negbio/chexpert/patterns/negation.txt') 105 | 106 | # chexpert 107 | loader = NegBioLoader() 108 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']), 109 | Path(argv['--unmention_phrases_dir']), 110 | verbose=argv['--verbose']) 111 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'], 112 | argv['--neg-patterns'], 113 | argv['--post-negation-uncertainty-patterns']) 114 | aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose']) 115 | 116 | if argv['text']: 117 | collection = text2bioc.text2collection(argv['SOURCES']) 118 | elif argv['bioc']: 119 | with open(argv['SOURCE']) as fp: 120 | collection = bioc.load(fp) 121 | else: 122 | raise KeyError 123 | 124 | pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator, 125 | verbose=argv['--verbose']) 126 | 127 | with open(os.path.expanduser(argv['--output']), 'w') as fp: 128 | bioc.dump(collection, fp) 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /negbio/main_mm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect negative and uncertain findings from SOURCE and output to DEST 3 | Example: python negbio/main_mm.py --metamap=/opt/public_mm/bin/metamap16 --output=examples/test.neg.xml examples/1.txt examples/2.txt 4 | 5 | Usage: 6 | main_mm text [options] --metamap=BINARY --output=DEST SOURCES ... 7 | main_mm bioc [options] --metamap=BINARY --output=DEST SOURCE ... 8 | 9 | Options: 10 | --neg-patterns=FILE negation rules [default: negbio/patterns/neg_patterns.txt] 11 | --uncertainty-patterns=FILE uncertainty rules [default: negbio/patterns/uncertainty_patterns.txt] 12 | --bllip-model=MODEL_DIR Bllip parser model directory 13 | --split-document Split document into passages based on section titles such as "Finding", "Impression" 14 | --cuis=FILE CUI list. To keep all CUIs, set it to None [default: examples/cuis-cvpr2017.txt] 15 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline is always a 16 | sentence break. False means to ignore newlines for the purpose of sentence 17 | splitting. This is appropriate for continuous text, when just the non-whitespace 18 | characters should be used to determine sentence breaks. 19 | --word_sense_disambiguation Whether to use word sense disambiguation. 20 | --verbose Print more information about progress. 
21 | """ 22 | from __future__ import print_function 23 | import logging 24 | import sys 25 | import os 26 | import bioc 27 | import docopt 28 | 29 | import pymetamap 30 | 31 | from negbio.cli_utils import parse_args, get_absolute_path 32 | from negbio.pipeline import negdetect, text2bioc, dner_mm 33 | from negbio.negbio_dner_matamap import read_cuis 34 | from negbio.pipeline.parse import NegBioParser 35 | from negbio.pipeline.ssplit import NegBioSSplitter 36 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer 37 | 38 | 39 | def pipeline(collection, metamap, splitter, parser, ptb2dep, neg_detector, cuis, extra_args): 40 | """ 41 | 42 | Args: 43 | collection(BioCCollection): 44 | metamap(MetaMap): MetaMap instance 45 | splitter (NegBioSSplitter): 46 | parser (NegBioParser) 47 | ptb2dep (NegBioPtb2DepConverter) 48 | neg_detector (Detector): 49 | 50 | Returns: 51 | BioCCollection 52 | """ 53 | for document in collection.documents: 54 | splitter.split_doc(document) 55 | 56 | dner_mm.run_metamap_col(collection, metamap, cuis, extra_args) 57 | 58 | for document in collection.documents: 59 | document = parser.parse_doc(document) 60 | document = ptb2dep.convert_doc(document) 61 | document = negdetect.detect(document, neg_detector) 62 | # remove sentence 63 | for passage in document.passages: 64 | del passage.sentences[:] 65 | 66 | return collection 67 | 68 | 69 | def main(): 70 | argv = parse_args(__doc__, version='version 2') 71 | print(argv) 72 | 73 | lemmatizer = Lemmatizer() 74 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True) 75 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break']) 76 | parser = NegBioParser(model_dir=argv['--bllip-model']) 77 | 78 | argv = get_absolute_path(argv, 79 | '--neg-patterns', 80 | 'negbio/patterns/neg_patterns.txt') 81 | argv = get_absolute_path(argv, 82 | '--uncertainty-patterns', 83 | 'negbio/patterns/uncertainty_patterns.txt') 84 | 85 | mm = pymetamap.MetaMap.get_instance(argv['--metamap']) 86 | neg_detector = negdetect.Detector(argv['--neg-patterns'], argv['--uncertainty-patterns']) 87 | 88 | if argv['--cuis'] == 'None': 89 | cuis = None 90 | else: 91 | cuis = read_cuis(argv['--cuis']) 92 | 93 | if argv['text']: 94 | collection = text2bioc.text2collection(argv['SOURCES']) 95 | elif argv['bioc']: 96 | with open(argv['SOURCE']) as fp: 97 | collection = bioc.load(fp) 98 | else: 99 | raise KeyError 100 | 101 | extra_args = dict() 102 | if argv['--word_sense_disambiguation']: 103 | extra_args['word_sense_disambiguation'] = True 104 | 105 | # Converting empty dict to None 106 | if len(extra_args) == 0: 107 | extra_args = None 108 | 109 | pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis, extra_args) 110 | 111 | with open(os.path.expanduser(argv['--output']), 'w') as fp: 112 | bioc.dump(collection, fp) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /negbio/neg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/neg/__init__.py -------------------------------------------------------------------------------- /negbio/neg/neg_detector.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import logging 4 | 5 | from negbio.neg import utils, semgraph, propagator 6 | from negbio import 
ngrex 7 | 8 | NEGATION = 'negation' 9 | UNCERTAINTY = 'uncertainty' 10 | 11 | 12 | class Detector(object): 13 | 14 | NEGATION = 'negation' 15 | UNCERTAINTY = 'uncertainty' 16 | 17 | def __init__(self, 18 | neg_pattern_file, 19 | uncertainty_pattern_file, 20 | sentence_rule=False): 21 | self.sentence_rule = sentence_rule 22 | self.neg_patterns = ngrex.load(neg_pattern_file) 23 | self.uncertain_patterns = ngrex.load(uncertainty_pattern_file) 24 | 25 | def detect(self, sentence, locs): 26 | """ 27 | Args: 28 | sentence(BioCSentence): a sentence with universal dependencies 29 | locs(list): a list of (begin, end) 30 | Yields: 31 | (str, MatcherObj, (begin, end)): negation or uncertainty, matcher, matched annotation 32 | """ 33 | try: 34 | g = semgraph.load(sentence) 35 | propagator.propagate(g) 36 | except: 37 | logging.exception('Cannot parse dependency graph [offset={}]'.format(sentence.offset)) 38 | raise 39 | else: 40 | if self.sentence_rule and is_neg_graph1(g): 41 | for loc in locs: 42 | yield NEGATION, None, loc 43 | return 44 | for loc in locs: 45 | if self.sentence_rule and is_neg_graph2(g, loc[0], loc[1]): 46 | yield NEGATION, None, loc 47 | for node in find_nodes(g, loc[0], loc[1]): 48 | m = self.match_neg(g, node) 49 | if m: 50 | yield NEGATION, m, loc 51 | m = self.match_uncertainty(g, node) 52 | if m: 53 | yield UNCERTAINTY, m, loc 54 | 55 | def match_neg(self, graph, node): 56 | """ 57 | Returns a matcher 58 | """ 59 | for pattern in self.neg_patterns: 60 | for m in pattern.finditer(graph): 61 | n0 = m.group(0) 62 | if n0 == node: 63 | try: 64 | key = m.get('key') 65 | if semgraph.has_out_edge(graph, key, ['neg']): 66 | continue 67 | except: 68 | pass 69 | if semgraph.has_out(graph, n0, ['new'], ['amod']): 70 | continue 71 | return m 72 | return None 73 | 74 | def match_uncertainty(self, graph, node): 75 | for pattern in self.uncertain_patterns: 76 | for m in pattern.finditer(graph): 77 | n0 = m.group(0) 78 | if n0 == node: 79 | return m 80 | 81 | # parsing error 82 | # suggestive of XXX 83 | p = ngrex.compile('{} <{dependency:/nmod:of/} {lemma:/suggestive/}') 84 | for m in p.finditer(graph): 85 | n0 = m.group(0) 86 | if n0 == node: 87 | if semgraph.has_out_node(graph, m.group(1), ['most']): 88 | return None 89 | elif semgraph.has_out(graph, n0, ['new', 'develop'], ['amod']): 90 | continue 91 | else: 92 | return m 93 | return None 94 | 95 | 96 | def find_nodes(graph, begin, end): 97 | for node in graph.nodes(): 98 | if utils.intersect((begin, end), (graph.node[node]['start'], graph.node[node]['end'])): 99 | yield node 100 | 101 | 102 | def is_neg_graph1(graph): 103 | # no XXX 104 | # resolution of XXX 105 | if 'T0' in graph.node and graph.node['T0']['lemma'] in ['no', 'resolution', 'resolved']: 106 | # no verb 107 | has_verb = utils.contains(lambda x: graph.node[x]['tag'][0] == 'V', graph.nodes()) 108 | if not has_verb: 109 | return True 110 | return False 111 | 112 | 113 | def is_neg_graph2(graph, begin, end): 114 | """ 115 | Return True if the sentence is like "without [begin, end]" 116 | 117 | """ 118 | 119 | # without n [, n] 120 | state = 0 121 | # sort nodes 122 | for node in sorted(graph.nodes(), key=lambda n: graph.node[n]['start']): 123 | if graph.node[node]['end'] > end: 124 | break 125 | 126 | if state == 0: 127 | if graph.node[node]['lemma'] in ( 128 | 'without', 'no', 'resolve', 'resolution', 'rosolution'): 129 | state = 1 130 | elif state == 1: 131 | if graph.node[node]['tag'].startswith('N'): 132 | state = 1 133 | if utils.intersect((begin, end), 
(graph.node[node]['start'], graph.node[node]['end'])):
134 |                     return True
135 |             elif graph.node[node]['tag'] in ('JJ', 'CC', ',', 'VBN'):
136 |                 state = 1
137 |             else:
138 |                 return False
139 |     return False
140 |
141 |
142 | def is_neg(annotation):
143 |     return NEGATION in annotation.infons and annotation.infons[NEGATION] == 'True'
144 |
145 |
146 | def is_uncertain(annotation):
147 |     return UNCERTAINTY in annotation.infons and annotation.infons[UNCERTAINTY] == 'True'
148 |
--------------------------------------------------------------------------------
/negbio/neg/propagator.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import logging
4 |
5 | from negbio.neg import semgraph
6 | import collections
7 |
8 |
9 | Edge = collections.namedtuple('Edge', ['gov', 'dep', 'data'])
10 |
11 |
12 | def propagate(G):
13 |
14 |     for i in range(0, 2):
15 |         edges = []
16 |         for node in G.nodes():
17 |             # hypoinflated but clear of
18 |             if G.node[node]['lemma'] == 'hypoinflated':
19 |                 for child in G.successors(node):
20 |                     edge_dep = G[node][child]['dependency']
21 |                     if G.node[child]['lemma'] == 'clear' and edge_dep == 'conj:but':
22 |                         for of in G.successors(node):
23 |                             of_dep = G[node][of]['dependency']
24 |                             if of_dep == 'nmod:of':
25 |                                 edges.append(Edge(child, of, of_dep))
26 |                                 break
27 |
28 |         for p, c, d in G.edges(data=True):
29 |             # propagate appos
30 |             if d['dependency'] == 'appos':
31 |                 # x > y >appos > z
32 |                 for grandpa in G.predecessors(p):
33 |                     edge_dep = G[grandpa][p]['dependency']
34 |                     edges.append(Edge(grandpa, c, edge_dep))
35 |                 # x appos > z
36 |                 for child in G.successors(p):
37 |                     edge_dep = G[p][child]['dependency']
38 |                     if edge_dep == 'neg':
39 |                         edges.append(Edge(c, child, edge_dep))
40 |             # propagate dep
41 |             if d['dependency'] == 'dep' \
42 |                     and G.node[p]['tag'].startswith('N') \
43 |                     and G.node[c]['tag'].startswith('N'):
44 |                 for grandchild in G.successors(c):
45 |                     edge_dep = G[c][grandchild]['dependency']
46 |                     if edge_dep == 'neg':
47 |                         edges.append(Edge(p, grandchild, edge_dep))
48 |             # propagate cop conjunction
49 |             if d['dependency'].startswith('conj') \
50 |                     and G.node[p]['tag'].startswith('N') \
51 |                     and G.node[c]['tag'].startswith('N'):
52 |                 for child in G.successors(p):
53 |                     edge_dep = G[p][child]['dependency']
54 |                     if edge_dep in ('aux', 'cop', 'neg', 'amod'):
55 |                         edges.append(Edge(c, child, edge_dep))
56 |                     if edge_dep in ('dep', 'compound') and G.node[child]['lemma'] == 'no':
57 |                         edges.append(Edge(c, child, edge_dep))
58 |                     if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
59 |                         edges.append(Edge(c, child, edge_dep))
60 |
61 |             # propagate area/amount >of XXX
62 |             if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] in ('area', 'amount'):
63 |                 for grandpa in G.predecessors(p):
64 |                     edge_dep = G[grandpa][p]['dependency']
65 |                     edges.append(Edge(grandpa, c, edge_dep))
66 |             # propagate combination of XXX
67 |             if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] == 'combination':
68 |                 for grandpa in G.predecessors(p):
69 |                     edge_dep = G[grandpa][p]['dependency']
70 |                     edges.append(Edge(grandpa, c, edge_dep))
71 |             if d['dependency'] == 'nmod:of':
72 |                 for child in G.successors(p):
73 |                     edge_dep = G[p][child]['dependency']
74 |                     # propagate no of XXX
75 |                     if edge_dep == 'neg':
76 |                         edges.append(Edge(c, child, edge_dep))
77 |                     # propagate without of XXX
78 |                     if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
79 |                         edges.append(Edge(c, child, edge_dep))
80 |             # parse error
81 |             # no xx and xxx
82 |             if d['dependency'] == 'neg' and semgraph.has_out_node(G, p, ['or', 'and']):
83 |                 for child in G.successors(p):
84 |                     edge_dep = G[p][child]['dependency']
85 |                     if edge_dep == 'compound' and G.node[child]['tag'].startswith('N'):
86 |                         edges.append(Edge(child, c, 'neg'))
87 |
88 |         has_more_edges = False
89 |         for e in edges:
90 |             if not G.has_edge(e.gov, e.dep):
91 |                 assert isinstance(e.data, str) or isinstance(e.data, unicode), type(e.data)
92 |                 G.add_edge(e.gov, e.dep, dependency=e.data)
93 |                 has_more_edges = True
94 |
95 |         if not has_more_edges:
96 |             break
97 |
--------------------------------------------------------------------------------
/negbio/neg/semgraph.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import networkx as nx
4 |
5 |
6 | def load(sentence):
7 |     """
8 |     Args:
9 |         sentence(BioCSentence): a sentence with tag, text, lemma, start and end
10 |
11 |     Returns:
12 |         DiGraph: dependency graph
13 |
14 |     Examples:
15 |         ```xml
16 |         <annotation id="T0">
17 |             <infon key="lemma">small</infon>
18 |             <infon key="tag">JJ</infon>
19 |             <location offset="0" length="5"/>
20 |             <text>Small</text>
21 |         </annotation>
22 |         ```
23 |     """
24 |     graph = nx.DiGraph()
25 |     for ann in sentence.annotations:
26 |         loc = ann.get_total_location()
27 |         graph.add_node(ann.id, tag=ann.infons['tag'], text=ann.text, lemma=ann.infons['lemma'].lower(),
28 |                        start=loc.offset, end=loc.offset + loc.length)
29 |     for rel in sentence.relations:
30 |         dependant = None
31 |         governor = None
32 |         for node in rel.nodes:
33 |             if node.role == 'dependant':
34 |                 dependant = node.refid
35 |             elif node.role == 'governor':
36 |                 governor = node.refid
37 |         if not dependant or not governor:
38 |             logging.debug('Cannot find dependant or governor at {}'.format(sentence))
39 |         graph.add_edge(governor, dependant, dependency=rel.infons['dependency'], id=rel.id)
40 |     return graph
41 |
42 |
43 | def has_out_edge(graph, node, dependencies):
44 |     for _, _, d in graph.out_edges(node, data=True):
45 |         if d['dependency'] in dependencies:
46 |             return True
47 |     return False
48 |
49 |
50 | def has_in_edge(graph, node, dependencies):
51 |     for _, _, d in graph.in_edges(node, data=True):
52 |         if d['dependency'] in dependencies:
53 |             return True
54 |     return False
55 |
56 |
57 | def has_out(graph, node, lemmas, dependencies):
58 |     return get_out(graph, node, lemmas, dependencies) is not None
59 |
60 |
61 | def get_out(graph, node, lemmas, dependencies):
62 |     for _, c, d in graph.out_edges(node, data=True):
63 |         if d['dependency'] in dependencies and graph.node[c]['lemma'] in lemmas:
64 |             return c
65 |     return None
66 |
67 |
68 | def get_in(graph, node, lemmas, dependencies):
69 |     for p, _, d in graph.in_edges(node, data=True):
70 |         if d['dependency'] in dependencies and graph.node[p]['lemma'] in lemmas:
71 |             return p
72 |     return None
73 |
74 |
75 | def has_in(graph, node, lemmas, dependencies):
76 |     return get_in(graph, node, lemmas, dependencies) is not None
77 |
78 |
79 | def has_out_node(graph, node, lemmas):
80 |     for child in graph.successors(node):
81 |         if graph.node[child]['lemma'] in lemmas:
82 |             return True
83 |     return False
84 |
85 |
86 | def has_in_node(graph, node, lemmas):
87 |     for child in graph.predecessors(node):
88 |         if graph.node[child]['lemma'] in lemmas:
89 |             return True
90 |     return False
91 |
--------------------------------------------------------------------------------
/negbio/neg/utils.py:
--------------------------------------------------------------------------------
1 | def contains(func, iterable):
2 |     """
3 |     Return True if func(x) is true for at least one element x of iterable.
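
    For example::

        >>> contains(lambda x: x > 2, [1, 2, 3])
        True
        >>> contains(None, [0, '', None])
        False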
4 | """ 5 | if func is None: 6 | func = bool 7 | for x in iterable: 8 | if func(x): 9 | return True 10 | return False 11 | 12 | 13 | def intersect(range1, range2): 14 | """ 15 | Args: 16 | range1(int, int): [begin, end) 17 | range2(int, int): [begin, end) 18 | """ 19 | if range1[0] <= range2[0] < range1[1]: 20 | return True 21 | elif range1[0] < range2[1] <= range1[1]: 22 | return True 23 | elif range2[0] <= range1[0] < range2[1]: 24 | return True 25 | elif range2[0] < range1[1] <= range2[1]: 26 | return True 27 | return False 28 | -------------------------------------------------------------------------------- /negbio/negbio_clean.py: -------------------------------------------------------------------------------- 1 | """ 2 | Clean up sentences 3 | 4 | Usage: 5 | negbio_pipeline cleanup [options] --output= ... 6 | 7 | Options: 8 | --suffix= Append an additional SUFFIX to file names. [default: .negbio.xml] 9 | --verbose Print more information about progress. 10 | --output= Specify the output directory. 11 | """ 12 | 13 | from negbio.cli_utils import parse_args 14 | from negbio.pipeline.cleanup import clean_sentences 15 | from negbio.pipeline.scan import scan_document 16 | 17 | if __name__ == '__main__': 18 | argv = parse_args(__doc__) 19 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 20 | fn=clean_sentences) 21 | -------------------------------------------------------------------------------- /negbio/negbio_dner_chexpert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect concepts from vocab 3 | 4 | Usage: 5 | negbio_pipeline dner_chexpert [options] --output= ... 6 | 7 | Options: 8 | --suffix= Append an additional SUFFIX to file names. [default: .chexpert.xml] 9 | --output= Specify the output directory. 10 | --verbose Print more information about progress. 11 | --mention_phrases_dir= Directory containing mention phrases for each observation. [default: negbio/chexpert/phrases/mention] 12 | --unmention_phrases_dir= Directory containing unmention phrases for each observation. [default: negbio/chexpert/phrases/unmention] 13 | """ 14 | from pathlib2 import Path 15 | 16 | from negbio.chexpert.stages.extract import NegBioExtractor 17 | from negbio.cli_utils import parse_args, get_absolute_path 18 | from negbio.pipeline.scan import scan_collection 19 | 20 | 21 | def run_extractor(collection, extractor): 22 | """ 23 | Args: 24 | collection (BioCCollection): 25 | extractor (NegBioExtractor): 26 | """ 27 | extractor.extract_all(collection) 28 | 29 | 30 | if __name__ == '__main__': 31 | argv = parse_args(__doc__) 32 | 33 | argv = get_absolute_path(argv, 34 | '--mention_phrases_dir', 35 | 'negbio/chexpert/phrases/mention') 36 | argv = get_absolute_path(argv, 37 | '--unmention_phrases_dir', 38 | 'negbio/chexpert/phrases/unmention') 39 | 40 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']), 41 | Path(argv['--unmention_phrases_dir']), 42 | verbose=argv['--verbose']) 43 | scan_collection(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 44 | fn=run_extractor, non_sequences=[extractor]) 45 | -------------------------------------------------------------------------------- /negbio/negbio_dner_matamap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect UMLS concepts 3 | 4 | Usage: 5 | negbio_pipeline dner_mm [options] --metamap= --output= ... 6 | 7 | Options: 8 | --suffix= Append an additional SUFFIX to file names. 
[default: .mm.xml] 9 | --output= Specify the output directory. 10 | --verbose Print more information about progress. 11 | --metamap= The MetaMap binary 12 | --cuis= Specify CUI list 13 | """ 14 | 15 | from negbio.cli_utils import parse_args 16 | from negbio.pipeline.dner_mm import run_metamap_col 17 | from negbio.pipeline.scan import scan_collection 18 | from pymetamap import MetaMap 19 | 20 | 21 | def read_cuis(pathname): 22 | cuis = set() 23 | with open(pathname) as fp: 24 | for line in fp: 25 | line = line.strip() 26 | if line: 27 | cuis.add(line) 28 | return cuis 29 | 30 | 31 | if __name__ == '__main__': 32 | argv = parse_args(__doc__) 33 | mm = MetaMap.get_instance(argv['--metamap']) 34 | 35 | if argv['--cuis'] is None: 36 | cuis = None 37 | else: 38 | cuis = read_cuis(argv['--cuis']) 39 | 40 | scan_collection(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 41 | fn=run_metamap_col, non_sequences=[mm, cuis]) 42 | -------------------------------------------------------------------------------- /negbio/negbio_neg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect negation and uncertainty 3 | 4 | Usage: 5 | negbio_pipeline neg [options] --output= ... 6 | 7 | Options: 8 | --neg-patterns= Specify negation rules [default: negbio/patterns/neg_patterns.txt] 9 | --uncertainty-patterns= Specify uncertainty rules [default: negbio/patterns/uncertainty_patterns.txt] 10 | --suffix= Append an additional SUFFIX to file names. [default: .neg.xml] 11 | --verbose Print more information about progress. 12 | --output= Specify the output directory. 13 | """ 14 | import os 15 | 16 | from negbio.cli_utils import parse_args, get_absolute_path 17 | from negbio.neg.neg_detector import Detector 18 | from negbio.pipeline.negdetect import detect 19 | from negbio.pipeline.scan import scan_document 20 | 21 | if __name__ == '__main__': 22 | argv = parse_args(__doc__) 23 | 24 | argv = get_absolute_path(argv, 25 | '--neg-patterns', 26 | 'negbio/patterns/neg_patterns.txt') 27 | argv = get_absolute_path(argv, 28 | '--uncertainty-patterns', 29 | 'negbio/patterns/uncertainty_patterns.txt') 30 | 31 | neg_detector = Detector(os.path.realpath(argv['--neg-patterns']), 32 | os.path.realpath(argv['--uncertainty-patterns'])) 33 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 34 | fn=detect, non_sequences=[neg_detector]) 35 | -------------------------------------------------------------------------------- /negbio/negbio_neg_chexpert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detect negation and uncertainty 3 | 4 | Usage: 5 | negbio_pipeline neg_chexpert [options] --output= ... 6 | 7 | Options: 8 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt] 9 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules 10 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt] 11 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules 12 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt] 13 | --suffix= Append an additional SUFFIX to file names. [default: .neg.xml] 14 | --verbose Print more information about progress. 15 | --output= Specify the output directory. 
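
Example (output directory and file names are illustrative):
    negbio_pipeline neg_chexpert --output=out/ reports/1.xml reports/2.xml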
16 | """ 17 | import os 18 | 19 | from negbio.chexpert.stages.classify import ModifiedDetector 20 | from negbio.cli_utils import parse_args, get_absolute_path 21 | from negbio.pipeline.negdetect import detect 22 | from negbio.pipeline.scan import scan_document 23 | 24 | 25 | if __name__ == '__main__': 26 | argv = parse_args(__doc__) 27 | 28 | argv = get_absolute_path(argv, 29 | '--pre-negation-uncertainty-patterns', 30 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt') 31 | argv = get_absolute_path(argv, 32 | '--post-negation-uncertainty-patterns', 33 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt') 34 | argv = get_absolute_path(argv, 35 | '--neg-patterns', 36 | 'negbio/chexpert/patterns/negation.txt') 37 | 38 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'], 39 | argv['--neg-patterns'], 40 | argv['--post-negation-uncertainty-patterns']) 41 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 42 | fn=detect, non_sequences=[neg_detector]) 43 | -------------------------------------------------------------------------------- /negbio/negbio_normalize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | negbio_pipeline normalize [options] --output= ... 4 | 5 | Options: 6 | --output= Specify the output directory. 7 | --suffix= Append an additional SUFFIX to file names. [default: .normalized.xml] 8 | --verbose Print more information about progress. 9 | """ 10 | 11 | from negbio.cli_utils import parse_args 12 | from negbio.ext.normalize_mimiccxr import normalize 13 | from negbio.pipeline.scan import scan_document 14 | 15 | if __name__ == '__main__': 16 | argv = parse_args(__doc__) 17 | scan_document(source=argv[''], verbose=argv['--verbose'], suffix=argv['--suffix'], 18 | directory=argv['--output'], fn=normalize) 19 | -------------------------------------------------------------------------------- /negbio/negbio_parse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parse sentences 3 | 4 | Usage: 5 | negbio_pipeline parse [options] --output= ... 6 | 7 | Options: 8 | --model= Bllip parser model directory. 9 | --output= Specify the output directory. 10 | --suffix= Append an additional SUFFIX to file names. [default: .bllip.xml] 11 | --verbose Print more information about progress. 12 | """ 13 | 14 | from negbio.cli_utils import parse_args 15 | from negbio.pipeline.parse import NegBioParser 16 | from negbio.pipeline.scan import scan_document 17 | 18 | 19 | if __name__ == '__main__': 20 | argv = parse_args(__doc__) 21 | parser = NegBioParser(model_dir=argv['--model']) 22 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 23 | fn=parser.parse_doc, non_sequences=[]) 24 | -------------------------------------------------------------------------------- /negbio/negbio_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | negbio_pipeline [--verbose] [...] 4 | 5 | Options: 6 | --verbose Print more information about progress. 
7 | 8 | The most commonly used negbio commands are: 9 | text2bioc 10 | normalize 11 | section_split 12 | ssplit 13 | parse 14 | ptb2ud 15 | dner_mm 16 | dner_chexpert 17 | neg 18 | neg_chexpert 19 | cleanup 20 | """ 21 | from subprocess import call 22 | import logging 23 | import os 24 | from negbio.cli_utils import parse_args 25 | 26 | 27 | def main(): 28 | args = parse_args(__doc__, version='negbio version 2', options_first=True) 29 | logging.debug('CWD: %s', os.getcwd()) 30 | 31 | argv = [args['']] + args[''] 32 | if args[''] == 'text2bioc': 33 | exit(call(['python', '-m', 'negbio.negbio_text2bioc'] + argv)) 34 | elif args[''] == 'normalize': 35 | exit(call(['python', '-m', 'negbio.negbio_normalize'] + argv)) 36 | elif args[''] == 'section_split': 37 | exit(call(['python', '-m', 'negbio.negbio_section_split'] + argv)) 38 | elif args[''] == 'ssplit': 39 | exit(call(['python', '-m', 'negbio.negbio_ssplit'] + argv)) 40 | elif args[''] == 'parse': 41 | exit(call(['python', '-m', 'negbio.negbio_parse'] + argv)) 42 | elif args[''] == 'ptb2ud': 43 | exit(call(['python', '-m', 'negbio.negbio_ptb2ud'] + argv)) 44 | elif args[''] == 'dner_mm': 45 | exit(call(['python', '-m', 'negbio.negbio_dner_matamap'] + argv)) 46 | elif args[''] == 'dner_chexpert': 47 | exit(call(['python', '-m', 'negbio.negbio_dner_chexpert'] + argv)) 48 | elif args[''] == 'neg': 49 | exit(call(['python', '-m', 'negbio.negbio_neg'] + argv)) 50 | elif args[''] == 'neg_chexpert': 51 | exit(call(['python', '-m', 'negbio.negbio_neg_chexpert'] + argv)) 52 | elif args[''] == 'cleanup': 53 | exit(call(['python', '-m', 'negbio.negbio_clean'] + argv)) 54 | elif args[''] in ['help', None]: 55 | exit(call(['python', '-m', 'negbio.negbio_pipeline', '--help'])) 56 | else: 57 | exit("%r is not a negbio command. See 'negbio help'." % args['']) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /negbio/negbio_ptb2ud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert from parse tree to universal dependencies 3 | 4 | Usage: 5 | negbio_pipeline ptb2ud [options] --output= ... 6 | 7 | Options: 8 | --output= Specify the output directory. 9 | --suffix= Append an additional SUFFIX to file names. [default: .ud.xml] 10 | --verbose Print more information about progress. 11 | """ 12 | from negbio.cli_utils import parse_args 13 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer 14 | from negbio.pipeline.scan import scan_document 15 | 16 | 17 | if __name__ == '__main__': 18 | argv = parse_args(__doc__) 19 | lemmatizer = Lemmatizer() 20 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True) 21 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 22 | fn=ptb2dep.convert_doc, non_sequences=[]) 23 | -------------------------------------------------------------------------------- /negbio/negbio_section_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Split the report into sections based on titles. 3 | 4 | Usage: 5 | negbio_pipeline section_split [options] --output= ... 6 | 7 | Options: 8 | --suffix= Append an additional SUFFIX to file names. [default: .secsplit.xml] 9 | --output= Specify the output directory. 10 | --verbose Print more information about progress. 11 | --pattern= Specify section title list for matching. 
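
The pattern file is read one regular expression per line and OR-ed into a
single multiline regex (see read_section_titles below). A hypothetical pattern
file might contain:
    ^FINDINGS:
    ^IMPRESSION: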
12 | """ 13 | import re 14 | 15 | from negbio.cli_utils import parse_args 16 | from negbio.pipeline.scan import scan_document 17 | from negbio.pipeline.section_split import split_document 18 | 19 | 20 | def read_section_titles(pathname): 21 | with open(pathname) as fp: 22 | return re.compile('|'.join(fp.readlines()), re.MULTILINE) 23 | 24 | 25 | if __name__ == '__main__': 26 | argv = parse_args(__doc__) 27 | 28 | if argv['--pattern'] is None: 29 | patterns = None 30 | else: 31 | patterns = read_section_titles(argv['--pattern']) 32 | 33 | scan_document(source=argv[''], verbose=argv['--verbose'], suffix=argv['--suffix'], 34 | directory=argv['--output'], fn=split_document, non_sequences=[patterns]) 35 | -------------------------------------------------------------------------------- /negbio/negbio_ssplit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Split text into sentences 3 | 4 | Usage: 5 | negbio_pipeline ssplit [options] --output= ... 6 | 7 | Options: 8 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline is always a 9 | sentence break. False means to ignore newlines for the purpose of sentence 10 | splitting. This is appropriate for continuous text, when just the non-whitespace 11 | characters should be used to determine sentence breaks. [default=False] 12 | --suffix= Append an additional SUFFIX to file names. [default: .ssplit.xml] 13 | --output= Specify the output directory. 14 | --verbose Print more information about progress. 15 | """ 16 | from negbio.pipeline.scan import scan_document 17 | from negbio.pipeline.ssplit import NegBioSSplitter 18 | from negbio.cli_utils import parse_args 19 | 20 | if __name__ == '__main__': 21 | argv = parse_args(__doc__) 22 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break']) 23 | scan_document(source=argv[''], directory=argv['--output'], suffix=argv['--suffix'], 24 | fn=splitter.split_doc, non_sequences=[]) 25 | -------------------------------------------------------------------------------- /negbio/negbio_text2bioc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert text FILEs to the BioC output file 3 | 4 | Usage: 5 | negbio_pipeline text2bioc [options] --output= ... 6 | 7 | Options: 8 | --output= Specify the output file name. 9 | --verbose Print more information about progress. 10 | """ 11 | 12 | import bioc 13 | 14 | from negbio.cli_utils import parse_args 15 | from negbio.pipeline.text2bioc import text2collection 16 | 17 | if __name__ == '__main__': 18 | argv = parse_args(__doc__) 19 | collection = text2collection(argv['']) 20 | with open(argv['--output'], 'w') as fp: 21 | bioc.dump(collection, fp) 22 | -------------------------------------------------------------------------------- /negbio/ngrex/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A NgrexPattern is a tgrep-type pattern for matching node configurations in one of the Networkx 3 | structures. Unlike tgrep but like Unix grep, there is no pre-indexing of the data to be searched. 4 | Rather there is a linear scan through the graph where matches are sought. 5 | 6 | A node/edge is represented by a set of attributes and their values contained by curly braces: 7 | `{attr1:value1;attr2:value2;...}`. Therefore, {} represents any node/edge in the graph. 8 | Attributes must be plain strings; values can be regular expressions blocked off by "/". 
9 | (I think regular expressions must match the whole attribute value; so that /NN/ matches "NN" only, 10 | while /NN.* / matches "NN", "NNS", "NNP", etc.) 11 | """ 12 | from . import parser 13 | from . import pattern 14 | 15 | 16 | def compile(ngrex): 17 | """ 18 | Compiles the given expression into a pattern 19 | 20 | Args: 21 | ngrex(str): expression 22 | 23 | Returns: 24 | NgrexPattern: a pattern 25 | """ 26 | p = parser.yacc.parse(ngrex) 27 | pattern.validate_names(p) 28 | return p 29 | 30 | 31 | def load(filename): 32 | """ 33 | Read a pattern file 34 | 35 | Args: 36 | filename(str): file name 37 | 38 | Returns: 39 | list: a list of NgexPattern 40 | """ 41 | patterns = [] 42 | with open(filename) as fp: 43 | for line in fp: 44 | line = line.strip() 45 | if not line: 46 | continue 47 | if line[0] == '#': 48 | continue 49 | patterns.append(compile(line)) 50 | return patterns 51 | -------------------------------------------------------------------------------- /negbio/ngrex/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Start : ALIGNRELN SubNode "\n" 3 | | SubNode ( ":" SubNode )* "\n" 4 | ; 5 | 6 | SubNode : "(" SubNode ")" RelationDisj? 7 | | ModNode RelationDisj? 8 | ; 9 | 10 | RelationDisj : RelationConj ( "|" RelationConj )* 11 | 12 | RelationConj : ModRelation ( "&"? ModRelation )* 13 | 14 | ModRelation : RelChild 15 | | "!" RelChild 16 | | "?" RelChild 17 | ; 18 | 19 | RelChild : "[" RelationDisj "]" 20 | | Relation 21 | ; 22 | 23 | Relation : ( ( ( (IDENTIFIER ("," IDENTIFIER)?)? RELATION ( IDENTIFIER | REGEX )? ) ( "=" IDENTIFIER )? ) | ALIGNRELN) 24 | ( ModNode | "(" SubNode ")" ) 25 | ; 26 | 27 | NodeDisj : "[" NodeConj ( "|" NodeConj )* "]" 28 | ; 29 | 30 | NodeConj : ModNode ( "&"? ModNode )* 31 | ; 32 | 33 | ModNode : Child 34 | | "!" Child 35 | ; 36 | 37 | Child : NodeDisj 38 | | Description 39 | ; 40 | 41 | Description : 42 | "{" ( 43 | ( ( IDENTIFIER ":" (IDENTIFIER | REGEX) ) (";" ( IDENTIFIER ":" ( IDENTIFIER | REGEX ) ) )* "}") 44 | | ( ROOT "}" ) 45 | | ( EMPTY "}" ) 46 | | "}" ) 47 | ("=" IDENTIFIER )? 
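
Example expressions accepted by this grammar (both appear in
negbio/patterns/neg_patterns.txt):
    {} >{dependency:/neg/} {}
    {} <{dependency:/nmod:for/} {lemma:/negative/}

Note: in the implementation below (see p_Attributes), multiple attributes are
separated by ',' rather than ';', e.g. {lemma:/suspect/,tag:/VBN/}.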
48 | """ 49 | from ply import lex 50 | from ply import yacc 51 | 52 | from negbio.ngrex import pattern 53 | 54 | 55 | t_ignore = ' \t\r' 56 | 57 | tokens = ( 58 | 'RELATION', 59 | 'IDENTIFIER', 60 | 'REGEX', 61 | ) 62 | 63 | literals = '{}()&[]:|,=' 64 | 65 | t_RELATION = r'[<>]' 66 | t_IDENTIFIER = r'([^ \n\r!@#$%^&*()+={}\[\]\|\\;\':",./<>?`~-])+' 67 | t_REGEX = r'/(/|[^\n\r/])*?/' 68 | 69 | 70 | def t_error(t): 71 | raise TypeError('Unknown text "%s"' % (t.value,)) 72 | 73 | lexer = lex.lex() 74 | 75 | 76 | def p_SubNode(p): 77 | """ 78 | SubNode : ModNode 79 | | ModNode RelationDisj 80 | | '(' SubNode ')' 81 | | '(' SubNode ')' RelationDisj 82 | """ 83 | if len(p) == 2: 84 | p[0] = p[1] 85 | elif len(p) == 3: 86 | conj_patterns = [] 87 | for relation_conj in p[2][1]: 88 | conj_patterns.append(_merge_conj(p[1], relation_conj[1])) 89 | p[0] = _merge_disj(conj_patterns) 90 | elif len(p) == 4: 91 | p[0] = p[2] 92 | elif len(p) == 5: 93 | conj_patterns = [] 94 | for relation_conj in p[4][1]: 95 | conj_patterns.append(_merge_conj(p[2], relation_conj[1])) 96 | p[0] = _merge_disj(conj_patterns) 97 | 98 | 99 | def _merge_disj(patterns): 100 | while len(patterns) > 1: 101 | p1 = patterns.pop() 102 | p2 = patterns.pop() 103 | patterns.append(pattern.CoordinationPattern(p1, p2, False)) 104 | return patterns[0] 105 | 106 | 107 | def _merge_conj(p1, relations): 108 | patterns = [] 109 | for reln, attributes, node in relations: 110 | if reln == '<': 111 | p = pattern.EdgePattern(node, p1, attributes, direction=pattern.L) 112 | else: 113 | p = pattern.EdgePattern(p1, node, attributes, direction=pattern.R) 114 | patterns.append(p) 115 | if len(patterns) == 1: 116 | return patterns[0] 117 | else: 118 | while len(patterns) > 1: 119 | p1 = patterns.pop() 120 | p2 = patterns.pop() 121 | patterns.append(pattern.CoordinationPattern(p1, p2, True)) 122 | return patterns[0] 123 | 124 | 125 | def p_RelationDisj(p): 126 | """ 127 | RelationDisj : RelationConj 128 | | RelationConj '|' RelationDisj 129 | """ 130 | """ 131 | Returns: 132 | ("OR", relation_list) 133 | """ 134 | if len(p) == 2: 135 | p[0] = ('OR', [p[1]]) 136 | elif len(p) == 4: 137 | p[0] = ('OR', [p[1]] + p[3][1]) 138 | 139 | 140 | def p_RelationConj(p): 141 | """ 142 | RelationConj : ModRelation 143 | | ModRelation RelationConj 144 | | ModRelation '&' RelationConj 145 | """ 146 | # (AND, [ModRelations]) 147 | if len(p) == 2: 148 | p[0] = ('AND', [p[1]]) 149 | if len(p) == 3: 150 | p[0] = ('AND', [p[1]] + p[2][1]) 151 | if len(p) == 4: 152 | p[0] = ('AND', [p[1]] + p[3][1]) 153 | 154 | 155 | def p_ModRelation(p): 156 | """ 157 | ModRelation : RelChild 158 | """ 159 | p[0] = p[1] 160 | 161 | 162 | def p_RelChild(p): 163 | """ 164 | RelChild : Relation 165 | """ 166 | p[0] = p[1] 167 | 168 | 169 | def p_Relation(p): 170 | """ 171 | Relation : RELATION '{' Attributes '}' Relation_Next 172 | """ 173 | """ 174 | Returns: 175 | < edge_attributes node 176 | """ 177 | p[0] = (p[1], p[3], p[5]) 178 | 179 | 180 | def p_Relation_Next(p): 181 | """ 182 | Relation_Next : ModNode 183 | | '(' SubNode ')' 184 | """ 185 | if len(p) == 2: 186 | p[0] = p[1] 187 | else: 188 | p[0] = p[2] 189 | 190 | 191 | def p_ModNode(p): 192 | """ 193 | ModNode : Child 194 | """ 195 | p[0] = p[1] 196 | 197 | 198 | def p_Child(p): 199 | """ 200 | Child : Description 201 | """ 202 | p[0] = p[1] 203 | 204 | 205 | def p_Description(p): 206 | """ 207 | Description : '{' Attributes '}' 208 | | '{' Attributes '}' '=' IDENTIFIER 209 | """ 210 | if len(p) == 4: 211 | p[0] = 
pattern.NodePattern(p[2]) 212 | else: 213 | p[0] = pattern.NodePattern(p[2], p[5]) 214 | 215 | def p_Attributes(p): 216 | """ 217 | Attributes : IDENTIFIER ':' REGEX 218 | | IDENTIFIER ':' REGEX ',' Attributes 219 | | empty 220 | """ 221 | if len(p) == 4: 222 | p[0] = {p[1]: p[3]} 223 | elif len(p) == 6: 224 | p[0] = {p[1]: p[3]} 225 | p[0].update(p[5]) 226 | else: 227 | p[0] = {} 228 | 229 | 230 | def p_empty(p): 231 | 'empty :' 232 | pass 233 | 234 | 235 | def p_error(p): 236 | raise TypeError("Syntax error at '%s'" % p.value) 237 | 238 | parser = yacc.yacc() 239 | 240 | -------------------------------------------------------------------------------- /negbio/ngrex/parsetab.py: -------------------------------------------------------------------------------- 1 | 2 | # parsetab.py 3 | # This file is automatically generated. Do not edit. 4 | _tabversion = '3.10' 5 | 6 | _lr_method = 'LALR' 7 | 8 | _lr_signature = "RELATION IDENTIFIER REGEX\n SubNode : ModNode\n | ModNode RelationDisj\n | '(' SubNode ')' \n | '(' SubNode ')' RelationDisj\n \n RelationDisj : RelationConj\n | RelationConj '|' RelationDisj\n \n RelationConj : ModRelation\n | ModRelation RelationConj\n | ModRelation '&' RelationConj\n \n ModRelation : RelChild\n \n RelChild : Relation\n \n Relation : RELATION '{' Attributes '}' Relation_Next\n \n Relation_Next : ModNode\n | '(' SubNode ')'\n \n ModNode : Child\n \n Child : Description\n \n Description : '{' Attributes '}'\n | '{' Attributes '}' '=' IDENTIFIER\n \n Attributes : IDENTIFIER ':' REGEX\n | IDENTIFIER ':' REGEX ',' Attributes\n | empty\n empty :" 9 | 10 | _lr_action_items = {'REGEX':([23,],[29,]),':':([15,],[23,]),'&':([1,4,8,10,11,22,31,33,35,38,],[-16,-15,18,-10,-11,-17,-18,-12,-13,-14,]),')':([1,3,4,7,8,9,10,11,12,17,19,22,24,25,26,31,33,35,37,38,],[-16,-1,-15,17,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,38,-14,]),'(':([0,2,30,34,],[2,2,34,2,]),'=':([22,],[28,]),',':([29,],[32,]),'RELATION':([1,3,4,8,10,11,17,18,20,22,31,33,35,38,],[-16,13,-15,13,-10,-11,13,13,13,-17,-18,-12,-13,-14,]),'{':([0,2,13,30,34,],[5,5,21,5,5,]),'IDENTIFIER':([5,21,28,32,],[15,15,31,15,]),'}':([5,14,16,21,27,29,32,36,],[-22,22,-21,-22,30,-19,-22,-20,]),'|':([1,4,8,10,11,12,19,22,25,31,33,35,38,],[-16,-15,-7,-10,-11,20,-8,-17,-9,-18,-12,-13,-14,]),'$end':([1,3,4,6,8,9,10,11,12,17,19,22,24,25,26,31,33,35,38,],[-16,-1,-15,0,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,-14,]),} 11 | 12 | _lr_action = {} 13 | for _k, _v in _lr_action_items.items(): 14 | for _x,_y in zip(_v[0],_v[1]): 15 | if not _x in _lr_action: _lr_action[_x] = {} 16 | _lr_action[_x][_k] = _y 17 | del _lr_action_items 18 | 19 | _lr_goto_items = {'Description':([0,2,30,34,],[1,1,1,1,]),'ModRelation':([3,8,17,18,20,],[8,8,8,8,8,]),'RelationDisj':([3,17,20,],[9,24,26,]),'RelChild':([3,8,17,18,20,],[10,10,10,10,10,]),'Child':([0,2,30,34,],[4,4,4,4,]),'Relation':([3,8,17,18,20,],[11,11,11,11,11,]),'RelationConj':([3,8,17,18,20,],[12,19,12,25,12,]),'ModNode':([0,2,30,34,],[3,3,35,3,]),'Attributes':([5,21,32,],[14,27,36,]),'Relation_Next':([30,],[33,]),'SubNode':([0,2,34,],[6,7,37,]),'empty':([5,21,32,],[16,16,16,]),} 20 | 21 | _lr_goto = {} 22 | for _k, _v in _lr_goto_items.items(): 23 | for _x, _y in zip(_v[0], _v[1]): 24 | if not _x in _lr_goto: _lr_goto[_x] = {} 25 | _lr_goto[_x][_k] = _y 26 | del _lr_goto_items 27 | _lr_productions = [ 28 | ("S' -> SubNode","S'",1,None,None,None), 29 | ('SubNode -> ModNode','SubNode',1,'p_SubNode','parser.py',78), 30 | ('SubNode -> ModNode 
RelationDisj','SubNode',2,'p_SubNode','parser.py',79), 31 | ('SubNode -> ( SubNode )','SubNode',3,'p_SubNode','parser.py',80), 32 | ('SubNode -> ( SubNode ) RelationDisj','SubNode',4,'p_SubNode','parser.py',81), 33 | ('RelationDisj -> RelationConj','RelationDisj',1,'p_RelationDisj','parser.py',127), 34 | ('RelationDisj -> RelationConj | RelationDisj','RelationDisj',3,'p_RelationDisj','parser.py',128), 35 | ('RelationConj -> ModRelation','RelationConj',1,'p_RelationConj','parser.py',142), 36 | ('RelationConj -> ModRelation RelationConj','RelationConj',2,'p_RelationConj','parser.py',143), 37 | ('RelationConj -> ModRelation & RelationConj','RelationConj',3,'p_RelationConj','parser.py',144), 38 | ('ModRelation -> RelChild','ModRelation',1,'p_ModRelation','parser.py',157), 39 | ('RelChild -> Relation','RelChild',1,'p_RelChild','parser.py',164), 40 | ('Relation -> RELATION { Attributes } Relation_Next','Relation',5,'p_Relation','parser.py',171), 41 | ('Relation_Next -> ModNode','Relation_Next',1,'p_Relation_Next','parser.py',182), 42 | ('Relation_Next -> ( SubNode )','Relation_Next',3,'p_Relation_Next','parser.py',183), 43 | ('ModNode -> Child','ModNode',1,'p_ModNode','parser.py',193), 44 | ('Child -> Description','Child',1,'p_Child','parser.py',200), 45 | ('Description -> { Attributes }','Description',3,'p_Description','parser.py',207), 46 | ('Description -> { Attributes } = IDENTIFIER','Description',5,'p_Description','parser.py',208), 47 | ('Attributes -> IDENTIFIER : REGEX','Attributes',3,'p_Attributes','parser.py',217), 48 | ('Attributes -> IDENTIFIER : REGEX , Attributes','Attributes',5,'p_Attributes','parser.py',218), 49 | ('Attributes -> empty','Attributes',1,'p_Attributes','parser.py',219), 50 | ('empty -> ','empty',0,'p_empty','parser.py',231), 51 | ] 52 | -------------------------------------------------------------------------------- /negbio/ngrex/pattern.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import re 3 | import collections 4 | 5 | L = '<' 6 | R = '>' 7 | LEFT = '<' 8 | RIGHT = '>' 9 | 10 | 11 | class NgrexPattern(object): 12 | """ 13 | A NgrexPattern is a tgrep-type pattern for matching node configurations in Networkx structures. 14 | """ 15 | 16 | def __init__(self): 17 | self._pattern = None 18 | 19 | def finditer(self, graph): 20 | """ 21 | Returns an iterator yielding MatcherObj instances over all matches for the ngrex pattern 22 | in graph. 23 | 24 | Args: 25 | graph(DiGraph): graph 26 | 27 | Yields: 28 | MatcherObj: an iterator yielding MatcherObj instances over all matches for the 29 | ngrex pattern in graph. 30 | """ 31 | raise NotImplementedError('Should have implemented this') 32 | 33 | @property 34 | def pattern(self): 35 | """ 36 | str: The pattern string from which the ngrex object was compiled. 
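        Note that this string is reconstructed from the parsed pattern, so it
        may differ cosmetically from the source expression (for example,
        regexes are printed with the '^' and '$' anchors added at compile time).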
37 | """ 38 | return self._pattern 39 | 40 | def __str__(self): 41 | return self.pattern 42 | 43 | 44 | class NodePattern(NgrexPattern): 45 | def __init__(self, attributes, name=None): 46 | super(NodePattern, self).__init__() 47 | self._name = name 48 | self._attributes = _get_attributes_regex(attributes) 49 | self._pattern = '{' + _attributes_to_str(self._attributes) + '}' 50 | if name: 51 | self._pattern += '=' + name 52 | 53 | def finditer(self, graph): 54 | for node in graph.nodes(): 55 | if self._attributes: 56 | if _match(self._attributes, graph.node[node]): 57 | yield MatcherObj(self, graph, [(self._name, node)]) 58 | else: 59 | yield MatcherObj(self, graph, [(self._name, node)]) 60 | 61 | 62 | class EdgePattern(NgrexPattern): 63 | def __init__(self, governor, dependant, edge_attributes, direction=LEFT): 64 | """ 65 | Args: 66 | direction(str): right if 'governor >edge dependant', left if 'dependant ', dependant) 78 | self._pattern = '({args[0].pattern}) {args[1]}{{{edge}}} ({args[2].pattern})'.format( 79 | args=args, edge=_attributes_to_str(self._edge_attributes)) 80 | 81 | def finditer(self, graph): 82 | governors = self._governor.finditer(graph) 83 | dependants = self._dependant.finditer(graph) 84 | for g, d in itertools.product(governors, dependants): 85 | for p, c, e in graph.edges(data=True): 86 | if p == g.group(0) and c == d.group(0): 87 | if _match(self._edge_attributes, e): 88 | if self._direction == LEFT: 89 | yield MatcherObj(self, graph, d._nodes + g._nodes) 90 | else: 91 | yield MatcherObj(self, graph, g._nodes + d._nodes) 92 | 93 | 94 | class CoordinationPattern(NgrexPattern): 95 | def __init__(self, pattern1, pattern2, is_conj=True): 96 | """ 97 | Args: 98 | is_conj(bool): if is_conj is true, then it is an "AND"; otherwise, it is an "OR". 99 | """ 100 | super(CoordinationPattern, self).__init__() 101 | self._pattern1 = pattern1 102 | self._pattern2 = pattern2 103 | self._is_conj = is_conj 104 | self._pattern = '{} {} {}'.format(pattern2.pattern, 105 | '&' if is_conj else '|', 106 | pattern1.pattern) 107 | 108 | def finditer(self, graph): 109 | if self._is_conj: 110 | matchers1 = self._pattern1.finditer(graph) 111 | matchers2 = self._pattern2.finditer(graph) 112 | for m1, m2 in itertools.product(matchers1, matchers2): 113 | if m1.group(0) == m2.group(0): 114 | nodes = list(m1._nodes) 115 | if len(m2._nodes) > 2: 116 | nodes.extend(m2._nodes[1:]) 117 | yield MatcherObj(self, graph, nodes) 118 | else: 119 | for m in self._pattern1.finditer(graph): 120 | yield m 121 | for m in self._pattern2.finditer(graph): 122 | yield m 123 | 124 | 125 | class MatcherObj: 126 | """ 127 | Match objects always have a boolean value of True. 128 | """ 129 | 130 | def __init__(self, pattern, graph, nodes): 131 | """ 132 | Args: 133 | nodes(list): [(name, node)] 134 | """ 135 | self._pattern = pattern 136 | self._graph = graph 137 | self._nodes = nodes 138 | 139 | def __bool__(self): 140 | return True 141 | 142 | def group(self, index): 143 | """ 144 | Returns the input node captured by the given group during the previous match operation. 145 | """ 146 | return self._nodes[index][1] 147 | 148 | def groups(self): 149 | """ 150 | Returns a list containing all the subgroups of the match, from 0 up to however many nodes 151 | are in the pattern. 
152 | """ 153 | return (node[1] for node in self._nodes) 154 | 155 | def get(self, name): 156 | for node in self._nodes: 157 | if node[0] == name: 158 | return node[1] 159 | raise KeyError(name) 160 | 161 | @property 162 | def pattern(self): 163 | """ 164 | The expression object whose `finditer()` produced this instance 165 | """ 166 | return self._pattern 167 | 168 | @property 169 | def graph(self): 170 | """ 171 | The graph passed to `finditer()` 172 | """ 173 | return self._graph 174 | 175 | 176 | def validate_names(pattern): 177 | def _helper(p, names): 178 | if isinstance(p, NodePattern): 179 | if p._name in names: 180 | raise KeyError(p._name) 181 | if p._name: 182 | names.add(p._name) 183 | elif isinstance(p, EdgePattern): 184 | _helper(p._governor, names) 185 | _helper(p._dependant, names) 186 | elif isinstance(p, CoordinationPattern): 187 | _helper(p._pattern1, names) 188 | _helper(p._pattern2, names) 189 | _helper(pattern, set()) 190 | 191 | 192 | def _get_attributes_regex(attributes): 193 | def _get_regex(v): 194 | v = v[1:-1] 195 | if v: 196 | if v[0] != '^': 197 | v = '^' + v 198 | if v[-1] != '$': 199 | v += '$' 200 | return re.compile(v) 201 | return {k: _get_regex(v) for k, v in attributes.items()} 202 | 203 | 204 | def _match(attributes, element): 205 | for k, v in attributes.items(): 206 | if k not in element or not v.match(element[k]): 207 | return False 208 | return True 209 | 210 | 211 | def _attributes_to_str(attributes): 212 | return ','.join(['{}:/{}/'.format(k, v.pattern) for k, v in attributes.items()]) 213 | -------------------------------------------------------------------------------- /negbio/patterns/neg_patterns.txt: -------------------------------------------------------------------------------- 1 | 2 | {} >{dependency:/neg/} {} 3 | {} >{} {lemma:/no/} 4 | {} >{dependency:/case/} {lemma:/without/} 5 | 6 | # rather than XXX 7 | {} <{dependency:/conj:negcc/} {} 8 | {} <{dependency:/nmod:without/} {} 9 | {} <{dependency:/conj:versus/} {} 10 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key 11 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key 12 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|resolution|removal/} 13 | {} <{dependency:/nmod:for/} {lemma:/negative/} 14 | {} <{} {lemma:/resolve|resolving|exclude/}=key 15 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/} 16 | 17 | # XXX has resolved 18 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {}) 19 | 20 | # there is no XXX 21 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/}) 22 | 23 | # without evidence|finding of|for XXX 24 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {}) 25 | 26 | # no evidence of|for XXX 27 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {}) 28 | 29 | # without evidence|finding of|for XXX 30 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/}) 31 | 32 | # no focus of XXX 33 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {}) 34 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/}) 35 | 36 | # no moderate to XXX 37 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {}) 38 | 39 | # no evidence of developing XXX 40 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {})) 41 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/})) 42 | 43 | # no focal XXX 44 | {} <{dependency:/dobj/} ({} 
>{dependency:/nsubj/} {lemma:/no/}) 45 | 46 | # do not demonstrate|visualize XXX 47 | # XXX is not demonstrated/visualized 48 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{dependency:/neg/} {}) 49 | 50 | # XXX is previously demonstrated/visualized 51 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/}) 52 | 53 | # there is no NN to suggest/explain XXX 54 | {} <{dependency:/dobj/} ({tag:/V.*/} <{} ({tag:/N.*/} >{dependency:/neg/} {})) 55 | 56 | # no NN to suggest/explain XXX 57 | {} <{dependency:/dobj/} ({tag:/V.*/} >{} ({tag:/N.*/} >{dependency:/neg/} {})) -------------------------------------------------------------------------------- /negbio/patterns/uncertainty_patterns.txt: -------------------------------------------------------------------------------- 1 | # outgoing edge 2 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/} 3 | {} >{} {lemma:/question/} 4 | 5 | # '{} >{dependency:/cop/} {lemma:/may|would|could/} 6 | 7 | # incoming edge 8 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/} 9 | {} <{dependency:/dobj/} {lemma:/suspect|favor|suggest|suggesting|question|consider/} 10 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/} 11 | {} <{dependency:/nsubjpass/} {lemma:/suspect/} 12 | {} <{} {lemma:/possible/} 13 | 14 | # parsing error 15 | # suspected XXX 16 | {} <{dependency:/dobj/} {lemma:/suspect/} 17 | {} >{dependency:/advmod/} {lemma:/suspect/} 18 | 19 | # maybe due to XXX 20 | {} <{dependency:/dep/} {lemma:/maybe/} 21 | 22 | # may/could represent/reflect/indicate/include XXX 23 | {} <{} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would/}) 24 | 25 | # maybe secondary to XXX 26 | {} <{dependency:/nmod:to/} {lemma:/secondary/} 27 | 28 | # may be due to XXX 29 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/}) 30 | 31 | # could related to XXX 32 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/}) 33 | 34 | # may be compatible with XXX 35 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/}) 36 | 37 | # question left XXX 38 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/}) 39 | {} >{} {lemma:/left/} <{} {lemma:/question/} 40 | 41 | # cannot exclude XXX 42 | {} <{dependency:/dobj/} ({lemma:/exclude/} >{} {lemma:/cannot/}) 43 | 44 | # cannot rule out XXX 45 | {} <{dependency:/nmod:out/} ({lemma:/rule/} >{} {lemma:/cannot/}) 46 | 47 | # XXX is not excluded 48 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{dependency:/neg/} {}) 49 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{} {lemma:/cannot/}) 50 | 51 | # differential diagnosis includes 52 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/})) 53 | 54 | # may be XXX 55 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/} 56 | 57 | # parsing error 58 | # XXX suspected 59 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/} 60 | 61 | # suggestive of XXX 62 | # {} <{dependency:/nmod:of/} {lemma:/suggestive/}' -------------------------------------------------------------------------------- /negbio/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/pipeline/__init__.py -------------------------------------------------------------------------------- 
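
A minimal usage sketch for patterns like those above (hypothetical example, not
from the repository): it builds the same kind of graph that
negbio.neg.semgraph.load produces, using the networkx 1.x `graph.node`
attribute API that the code above relies on.

    import networkx as nx
    from negbio import ngrex

    # hypothetical mini-graph for the sentence "There is no effusion"
    g = nx.DiGraph()
    g.add_node('T0', lemma='be', tag='VBZ', text='is', start=6, end=8)
    g.add_node('T1', lemma='effusion', tag='NN', text='effusion', start=12, end=20)
    g.add_node('T2', lemma='no', tag='DT', text='no', start=9, end=11)
    g.add_edge('T0', 'T1', dependency='nsubj')
    g.add_edge('T1', 'T2', dependency='neg')

    pattern = ngrex.compile('{} >{dependency:/neg/} {}')
    for m in pattern.finditer(g):
        print(m.group(0))  # prints 'T1', the negated node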
/negbio/pipeline/cleanup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def clean_sentences(document, sort_anns=False): 5 | """ 6 | Remove sentences in each passage 7 | 8 | Args: 9 | document(BioCDocument): a document 10 | sort_anns(bool): sort ann by its location 11 | """ 12 | try: 13 | for passage in document.passages: 14 | del passage.sentences[:] 15 | 16 | if sort_anns: 17 | key_func = lambda ann: ann.get_total_location().offset 18 | id = 0 19 | for passage in document.passages: 20 | for ann in sorted(passage.annotations, key=key_func): 21 | ann.id = str(id) 22 | id += 1 23 | except: 24 | logging.exception("Cannot process %s", document.id) 25 | return document 26 | -------------------------------------------------------------------------------- /negbio/pipeline/dner_mm.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import itertools 3 | import logging 4 | import re 5 | 6 | import bioc 7 | 8 | 9 | def remove_newline(s): 10 | return re.sub(r'[\n\r]', ' ', s) 11 | 12 | 13 | def adapt_concept_index(index): 14 | m = re.match(r"'.*?'", index) 15 | if m: 16 | return index[1:-1] 17 | m = re.match(r"'.*", index) 18 | if m: 19 | return index[1:] 20 | return index 21 | 22 | 23 | def run_metamap_col(collection, mm, cuis=None, extra_args=None): 24 | """ 25 | Get CUIs from metamap. 26 | 27 | Args: 28 | collection(BioCCollection): 29 | mm(MetaMap): MetaMap instance 30 | 31 | Returns: 32 | BioCCollection 33 | """ 34 | try: 35 | annIndex = itertools.count() 36 | sentence_map = collections.OrderedDict() 37 | for document in collection.documents: 38 | for passage in document.passages: 39 | for sentence in passage.sentences: 40 | sentence_map['{}-{}'.format(document.id.replace('.', '-'), sentence.offset)] = (passage, sentence) 41 | 42 | sents = [] 43 | ids = [] 44 | for k in sentence_map: 45 | ids.append(k) 46 | sents.append(remove_newline(sentence_map[k][1].text)) 47 | 48 | if extra_args is None: 49 | concepts, error = mm.extract_concepts(sents, ids) 50 | else: 51 | concepts, error = mm.extract_concepts(sents, ids, **extra_args) 52 | 53 | if error is None: 54 | for concept in concepts: 55 | concept_index = adapt_concept_index(concept.index) 56 | try: 57 | if cuis is not None: 58 | # if no CUI is returned for this concept - skip it 59 | concept_cui = getattr(concept, 'cui', None) 60 | if concept_cui not in cuis: 61 | continue 62 | m = re.match(r'(\d+)/(\d+)', concept.pos_info) 63 | if m: 64 | passage = sentence_map[concept_index][0] 65 | sentence = sentence_map[concept_index][1] 66 | start = int(m.group(1)) - 1 67 | length = int(m.group(2)) 68 | ann = bioc.BioCAnnotation() 69 | ann.id = str(next(annIndex)) 70 | ann.infons['CUI'] = concept.cui 71 | ann.infons['semtype'] = concept.semtypes[1:-1] 72 | ann.infons['term'] = concept.preferred_name 73 | ann.infons['annotator'] = 'MetaMap' 74 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length)) 75 | ann.text = sentence.text[start:start+length] 76 | passage.annotations.append(ann) 77 | except: 78 | logging.exception('') 79 | except: 80 | logging.exception("Cannot process %s", collection.source) 81 | return collection 82 | 83 | 84 | def run_metamap(document, mm, cuis=None): 85 | """ 86 | Get CUIs from metamap. 
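    If `cuis` is given, concepts whose CUI is not in that set are skipped.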
87 | 88 | Args: 89 | document(BioCDocument): 90 | mm(MetaMap): MetaMap instance 91 | 92 | Returns: 93 | BioCDocument 94 | """ 95 | try: 96 | annIndex = itertools.count() 97 | sentence_map = collections.OrderedDict() 98 | for passage in document.passages: 99 | for sentence in passage.sentences: 100 | sentence_map[str(sentence.offset)] = (passage, sentence) 101 | 102 | sents = [] 103 | ids = [] 104 | for k in sentence_map: 105 | ids.append(k) 106 | sents.append(remove_newline(sentence_map[k][1].text)) 107 | 108 | concepts, error = mm.extract_concepts(sents, ids) 109 | if error is None: 110 | for concept in concepts: 111 | concept_index = adapt_concept_index(concept.index) 112 | try: 113 | if cuis is not None and concept.cui not in cuis: 114 | continue 115 | m = re.match(r'(\d+)/(\d+)', concept.pos_info) 116 | if m: 117 | passage = sentence_map[concept_index][0] 118 | sentence = sentence_map[concept_index][1] 119 | start = int(m.group(1)) - 1 120 | length = int(m.group(2)) 121 | ann = bioc.BioCAnnotation() 122 | ann.id = str(next(annIndex)) 123 | ann.infons['CUI'] = concept.cui 124 | ann.infons['semtype'] = concept.semtypes[1:-1] 125 | ann.infons['term'] = concept.preferred_name 126 | ann.infons['annotator'] = 'MetaMap' 127 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length)) 128 | ann.text = sentence.text[start:start+length] 129 | passage.annotations.append(ann) 130 | except: 131 | logging.exception('') 132 | except: 133 | logging.exception("Cannot process %s", document.id) 134 | return document 135 | -------------------------------------------------------------------------------- /negbio/pipeline/negdetect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from negbio.neg.neg_detector import Detector 5 | 6 | 7 | def neg_mesh(annotations): 8 | """ 9 | Detect negative MeSH 10 | """ 11 | for ann in annotations: 12 | if ann.infons.get('CUI', None) == 'C0332125': 13 | ann.infons[Detector.NEGATION] = 'True' 14 | 15 | 16 | def uncertain_mesh(annotations): 17 | """ 18 | Detect uncertain MeSH 19 | """ 20 | for ann in annotations: 21 | if ann.infons.get('CUI', None) == 'C0332148': 22 | ann.infons[Detector.UNCERTAINTY] = 'True' 23 | 24 | 25 | def is_neg_regex(text): 26 | if re.search(r'^(findings|impression): no ', text, re.I): 27 | return True 28 | return False 29 | 30 | 31 | def _mark_anns(annotations, begin, end, type): 32 | """Mark all annotations in [begin:end] as type""" 33 | for ann in annotations: 34 | total_loc = ann.get_total_location() 35 | if begin <= total_loc.offset and total_loc.offset + total_loc.length <= end: 36 | ann.infons[type] = 'True' 37 | 38 | 39 | def _extend(document, type): 40 | def _is_type(annotation): 41 | return annotation.infons.get(type, None) == 'True' 42 | 43 | neg_anns = [] 44 | for passage in document.passages: 45 | for ann in passage.annotations: 46 | if _is_type(ann): 47 | neg_anns.append(ann) 48 | 49 | for passage in document.passages: 50 | for ann in passage.annotations: 51 | if not _is_type(ann): 52 | for nann in neg_anns: 53 | if ann in nann: 54 | ann.infons[type] = 'True' 55 | break 56 | if nann in ann and 'CUI' in ann and 'CUI' in nann and ann.infons['CUI'] == nann.infons['CUI']: 57 | ann.infons[type] = 'True' 58 | break 59 | 60 | 61 | def detect(document, detector): 62 | """ 63 | Args: 64 | document(BioCDocument): 65 | detector(Detector): detector. 
Define customized patterns in the detector 66 | """ 67 | try: 68 | 69 | for passage in document.passages: 70 | neg_mesh(passage.annotations) 71 | uncertain_mesh(passage.annotations) 72 | 73 | locs = [] 74 | for ann in passage.annotations: 75 | total_loc = ann.get_total_location() 76 | locs.append((total_loc.offset, total_loc.offset + total_loc.length)) 77 | 78 | for sentence in passage.sentences: 79 | if is_neg_regex(sentence.text): 80 | _mark_anns(passage.annotations, sentence.offset, sentence.offset + len(sentence.text), 81 | Detector.NEGATION) 82 | continue 83 | for name, matcher, loc in detector.detect(sentence, locs): 84 | logging.debug('Find: %s, %s, %s', name, matcher.pattern, loc) 85 | _mark_anns(passage.annotations, loc[0], loc[1], name) 86 | 87 | # _extend(document, Detector.NEGATION) 88 | # _extend(document, Detector.UNCERTAINTY) 89 | except: 90 | logging.exception("Cannot process %s", document.id) 91 | return document 92 | -------------------------------------------------------------------------------- /negbio/pipeline/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, absolute_import 2 | 3 | import logging 4 | import os 5 | import tempfile 6 | 7 | from bllipparser import ModelFetcher 8 | from bllipparser import RerankingParser 9 | 10 | 11 | class Bllip(object): 12 | def __init__(self, model_dir=None): 13 | if model_dir is None: 14 | logging.debug("downloading GENIA+PubMed model if necessary ...") 15 | model_dir = ModelFetcher.download_and_install_model( 16 | 'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models')) 17 | self.model_dir = os.path.expanduser(model_dir) 18 | 19 | logging.debug('loading model %s ...' % self.model_dir) 20 | self.rrp = RerankingParser.from_unified_model_dir(self.model_dir) 21 | 22 | def parse(self, s): 23 | """Parse the sentence text using Reranking parser. 
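        The top-ranked tree from the reranking parser's n-best list is returned.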
24 | 25 | Args: 26 | s(str): one sentence 27 | 28 | Returns: 29 | ScoredParse: parse tree, ScoredParse object in RerankingParser; None if failed 30 | """ 31 | if not s: 32 | raise ValueError('Cannot parse empty sentence: {}'.format(s)) 33 | 34 | nbest = self.rrp.parse(str(s)) 35 | if nbest: 36 | return nbest[0].ptb_parse 37 | 38 | return None 39 | 40 | 41 | class NegBioParser(Bllip): 42 | def parse_doc(self, document): 43 | """ 44 | Parse sentences in BioC format 45 | 46 | Args: 47 | document(BioCDocument): one document 48 | 49 | Returns: 50 | BioCDocument 51 | """ 52 | for passage in document.passages: 53 | for sentence in passage.sentences: 54 | text = sentence.text 55 | tree = self.parse(text) 56 | if tree: 57 | sentence.infons['parse tree'] = str(tree) 58 | else: 59 | sentence.infons['parse tree'] = None 60 | logging.exception( 61 | 'No parse tree for sentence: %s', sentence.offset) 62 | return document 63 | -------------------------------------------------------------------------------- /negbio/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | from negbio.pipeline import parse, ssplit, ptb2ud, negdetect, text2bioc, dner_mm, section_split, cleanup 3 | from negbio.ext import normalize_mimiccxr 4 | 5 | 6 | def process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns): 7 | for document in collection.documents: 8 | normalize_mimiccxr.normalize(document) 9 | section_split.split_document(document, sec_title_patterns) 10 | ssplit.ssplit(document, splitter) 11 | 12 | dner_mm.run_metamap_col(collection, metamap, cuis) 13 | 14 | for document in collection.documents: 15 | document = parse.parse(document, parser) 16 | document = ptb2ud.convert(document, ptb2dep, lemmatizer) 17 | document = negdetect.detect(document, neg_detector) 18 | cleanup.clean_sentences(document) 19 | 20 | return collection 21 | 22 | 23 | def process_text(sources, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns): 24 | collection = text2bioc.text2collection(*sources) 25 | return process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns) 26 | -------------------------------------------------------------------------------- /negbio/pipeline/ptb2ud.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import StanfordDependencies 4 | import bioc 5 | from nltk.corpus import wordnet 6 | from nltk.stem.wordnet import WordNetLemmatizer 7 | from nltk.tag.mapping import tagset_mapping 8 | 9 | 10 | class Lemmatizer(object): 11 | def __init__(self): 12 | self.wordnet_lemmatizer = WordNetLemmatizer() 13 | self.mapping = tagset_mapping('en-ptb', 'universal') 14 | 15 | def lemmatize(self, word, pos=None): 16 | """ 17 | Determines the lemma for a given word 18 | 19 | Args: 20 | word(str): word 21 | pos(str): part-of-speech 22 | 23 | Returns: 24 | str: lemma 25 | """ 26 | if pos: 27 | return self.wordnet_lemmatizer.lemmatize(word=word, pos=pos) 28 | else: 29 | return self.wordnet_lemmatizer.lemmatize(word=word) 30 | 31 | def map_tag(self, tag): 32 | if tag in self.mapping: 33 | tag = self.mapping[tag] 34 | if tag == 'NOUN': 35 | return wordnet.NOUN 36 | elif tag == 'VERB': 37 | return wordnet.VERB 38 | elif tag == 'ADJ': 39 | return wordnet.ADJ 40 | elif tag == 'ADV': 41 | return wordnet.ADV 42 | elif tag == 'ADJ_SAT': 43 | return wordnet.ADJ_SAT 44 | return None 45 | 46 
47 | class Ptb2DepConverter(object):
48 |     """
49 |     Convert PTB trees to universal dependencies
50 |     """
51 | 
52 |     basic = 'basic'
53 |     collapsed = 'collapsed'
54 |     CCprocessed = 'CCprocessed'
55 |     collapsedTree = 'collapsedTree'
56 | 
57 |     def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
58 |         """
59 |         Args:
60 |             lemmatizer(Lemmatizer): used to add lemmas when the backend does not provide them
61 |             representation(str): one of 'basic', 'collapsed', 'CCprocessed', or 'collapsedTree'
62 |             universal(bool): if True, use universal dependencies if they're available
63 |         """
64 |         # Prefer the in-process jpype backend; fall back to a subprocess
65 |         # if jpype is not installed.
66 |         try:
67 |             import jpype
68 |             self._backend = 'jpype'
69 |         except ImportError:
70 |             self._backend = 'subprocess'
71 |         self.lemmatizer = lemmatizer
72 |         self.__sd = StanfordDependencies.get_instance(backend=self._backend)
73 |         self.representation = representation
74 |         self.universal = universal
75 | 
76 |     def convert(self, parse_tree):
77 |         """
78 |         Convert a single PTB parse tree to a dependency graph
79 | 
80 |         Args:
81 |             parse_tree(str): parse tree in PTB format
82 | 
83 |         Examples:
84 |             (ROOT (NP (JJ hello) (NN world) (. !)))
85 |         """
86 |         if self._backend == 'jpype':
87 |             dependency_graph = self.__sd.convert_tree(parse_tree,
88 |                                                       representation=self.representation,
89 |                                                       universal=self.universal,
90 |                                                       add_lemmas=True)
91 |         else:
92 |             dependency_graph = self.__sd.convert_tree(parse_tree,
93 |                                                       representation=self.representation,
94 |                                                       universal=self.universal)
95 |         return dependency_graph
96 | 
97 | 
98 | class NegBioPtb2DepConverter(Ptb2DepConverter):
99 |     def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
100 |         """
101 |         Args:
102 |             lemmatizer(Lemmatizer): used to add lemmas with the subprocess backend
103 |         """
104 |         super(NegBioPtb2DepConverter, self).__init__(
105 |             lemmatizer, representation, universal)
106 | 
107 |     def convert_doc(self, document):
108 |         for passage in document.passages:
109 |             for sentence in passage.sentences:
110 |                 # Skip sentences without a parse tree; this sometimes
111 |                 # happens with poorly tokenized sentences.
112 |                 if not sentence.infons:
113 |                     continue
114 |                 elif not sentence.infons.get('parse tree'):
115 |                     continue
116 | 
117 |                 try:
118 |                     dependency_graph = self.convert(
119 |                         sentence.infons['parse tree'])
120 |                     anns, rels = convert_dg(dependency_graph, sentence.text,
121 |                                             sentence.offset,
122 |                                             has_lemmas=self._backend == 'jpype')
123 |                     sentence.annotations = anns
124 |                     sentence.relations = rels
125 |                 except KeyboardInterrupt:
126 |                     raise
127 |                 except Exception:
128 |                     logging.exception(
129 |                         "Cannot process sentence %d in %s", sentence.offset, document.id)
130 | 
131 |                 if self._backend != 'jpype':
132 |                     for ann in sentence.annotations:
133 |                         text = ann.text
134 |                         pos = ann.infons['tag']
135 |                         pos = self.lemmatizer.map_tag(pos)
136 |                         lemma = self.lemmatizer.lemmatize(word=text, pos=pos)
137 |                         ann.infons['lemma'] = lemma.lower()
138 |         return document
139 | 
140 | 
141 | def adapt_value(value):
142 |     """
143 |     Convert PTB escape sequences (e.g. -LRB-, -RRB-, ``) back to the original characters
144 |     """
145 |     value = value.replace("-LRB-", "(")
146 |     value = value.replace("-RRB-", ")")
147 |     value = value.replace("-LSB-", "[")
148 |     value = value.replace("-RSB-", "]")
149 |     value = value.replace("-LCB-", "{")
150 |     value = value.replace("-RCB-", "}")
151 |     value = value.replace("-lrb-", "(")
152 |     value = value.replace("-rrb-", ")")
153 |     value = value.replace("-lsb-", "[")
154 |     value = value.replace("-rsb-", "]")
155 |     value = value.replace("``", "\"")
156 |     value = value.replace("''", "\"")
157 |     value = value.replace("`", "'")
158 |     return value
159 | 
160 | 
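# A brief sketch (not from the original file) of how the converter above is
# used on a document whose sentences already carry a 'parse tree' infon from
# NegBioParser:
#
#     converter = NegBioPtb2DepConverter(Lemmatizer(), universal=True)
#     document = converter.convert_doc(document)
#     # each sentence now has token annotations and dependency relations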
161 | def convert_dg(dependency_graph, text, offset, ann_index=0, rel_index=0, has_lemmas=True):
162 |     """
163 |     Convert a dependency graph to BioC annotations and relations
164 |     """
165 |     annotations = []
166 |     relations = []
167 |     annotation_id_map = {}
168 |     start = 0
169 |     for node in dependency_graph:
170 |         if node.index in annotation_id_map:
171 |             continue
172 |         node_form = node.form
173 |         index = text.find(node_form, start)
174 |         if index == -1:
175 |             node_form = adapt_value(node.form)
176 |             index = text.find(node_form, start)
177 |             if index == -1:
178 |                 logging.debug('Cannot find token %r in the sentence at offset %d\n%s',
179 |                               node_form, offset, str(dependency_graph))
180 |                 return None
181 | 
182 |         ann = bioc.BioCAnnotation()
183 |         ann.id = 'T{}'.format(ann_index)
184 |         ann.text = node_form
185 |         ann.infons['tag'] = node.pos
186 |         if has_lemmas:
187 |             ann.infons['lemma'] = node.lemma.lower()
188 | 
189 |         start = index
190 |         ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
191 |         annotations.append(ann)
192 |         annotation_id_map[node.index] = ann_index
193 |         ann_index += 1
194 |         start += len(node_form)
195 | 
196 |     for node in dependency_graph:
197 |         if node.head == 0:
198 |             ann = annotations[annotation_id_map[node.index]]
199 |             ann.infons['ROOT'] = True
200 |             continue
201 |         relation = bioc.BioCRelation()
202 |         relation.id = 'R{}'.format(rel_index)
203 |         relation.infons['dependency'] = node.deprel
204 |         if node.extra:
205 |             relation.infons['extra'] = node.extra
206 |         relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.index]), 'dependant'))
207 |         relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.head]), 'governor'))
208 |         relations.append(relation)
209 |         rel_index += 1
210 | 
211 |     return annotations, relations
212 | 
--------------------------------------------------------------------------------
/negbio/pipeline/scan.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 | 
5 | import bioc
6 | import tqdm
7 | 
8 | 
9 | def scan_document(*_, **kwargs):
10 |     """
11 |     Scan each document in a list of BioC source files, apply fn, and write the results to a directory.
12 | 
13 |     Args:
14 |         kwargs:
15 |             source(list): a list of source pathnames
16 |             directory(str): output directory
17 |             suffix(str): suffix appended to each output file name
18 |             fn: a callable applied to each document; it is invoked as
19 |                 fn(document, *non_sequences) and should return the
20 |                 processed document
21 |             non_sequences(list): extra positional arguments passed to fn
22 |                 after the document
23 |             verbose(boolean): if True, show a progress bar
24 |     """
25 |     source = kwargs.pop('source')
26 |     verbose = kwargs.pop('verbose', True)
27 |     directory = os.path.expanduser(kwargs.pop('directory'))
28 |     suffix = kwargs.pop('suffix')
29 |     fn = kwargs.pop('fn')
30 |     non_sequences = kwargs.pop('non_sequences', [])
31 | 
32 |     if not os.path.exists(directory):
33 |         os.makedirs(directory)
34 | 
35 |     def catch(document, non_sequences):
36 |         try:
37 |             return fn(document, *non_sequences)
38 |         except Exception:
39 |             logging.exception('Cannot process %s', document.id)
40 |             # keep the original document so the output collection
41 |             # contains no None entries
42 |             return document
43 | 
44 |     for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
45 |         basename = os.path.splitext(os.path.basename(pathname))[0]
46 |         dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
47 |         with io.open(pathname, encoding='utf8') as fp:
48 |             collection = bioc.load(fp)
49 |         collection.documents = [catch(doc, non_sequences) for doc in collection.documents]
50 |         with io.open(dstname, 'w', encoding='utf8') as fp:
51 |             bioc.dump(collection, fp)
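# A sketch (not from the original file) of how scan_document is typically
# driven; 'clean' here is a hypothetical stand-in for any per-document
# function that returns the processed document:
#
#     scan_document(source=['1.xml', '2.xml'], directory='out',
#                   suffix='.neg.xml', fn=clean, non_sequences=[])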
52 | 
53 | def scan_collection(*_, **kwargs):
54 |     """
55 |     Scan each collection in a list of BioC source files, apply fn, and write the results to a directory.
56 | 
57 |     Args:
58 |         kwargs:
59 |             source(list): a list of source pathnames
60 |             directory(str): output directory
61 |             suffix(str): suffix appended to each output file name
62 |             fn: a callable applied to each collection; it is invoked as
63 |                 fn(collection, *non_sequences) and is expected to modify
64 |                 the collection in place
65 |             non_sequences(list): extra positional arguments passed to fn
66 |                 after the collection
67 |             verbose(boolean): if True, show a progress bar
68 |     """
69 |     source = kwargs.pop('source')
70 |     verbose = kwargs.pop('verbose', True)
71 |     directory = os.path.expanduser(kwargs.pop('directory'))
72 |     suffix = kwargs.pop('suffix')
73 |     fn = kwargs.pop('fn')
74 |     non_sequences = kwargs.pop('non_sequences', [])
75 | 
76 |     if not os.path.exists(directory):
77 |         os.makedirs(directory)
78 | 
79 |     for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
80 |         basename = os.path.splitext(os.path.basename(pathname))[0]
81 |         dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
82 |         with io.open(pathname, encoding='utf8') as fp:
83 |             collection = bioc.load(fp)
84 |         try:
85 |             fn(collection, *non_sequences)
86 |         except Exception:
87 |             logging.exception('Cannot process %s', collection.source)
88 |         with io.open(dstname, 'w', encoding='utf8') as fp:
89 |             bioc.dump(collection, fp)
90 | 
--------------------------------------------------------------------------------
/negbio/pipeline/section_split.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | 
4 | import bioc
5 | 
6 | 
7 | SECTION_TITLES = re.compile(r'('
8 |                             r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
9 |                             r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
10 |                             r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
11 |                             r'|TECHNIQUE'
12 |                             r'):|FINAL REPORT',
13 |                             re.IGNORECASE | re.MULTILINE)
14 | 
15 | 
16 | def is_empty(passage):
17 |     return len(passage.text) == 0
18 | 
19 | 
20 | def strip(passage):
21 |     """Strip leading and trailing whitespace from a passage, adjusting its offset."""
22 |     start = 0
23 |     while start < len(passage.text) and passage.text[start].isspace():
24 |         start += 1
25 | 
26 |     end = len(passage.text)
27 |     while end > start and passage.text[end - 1].isspace():
28 |         end -= 1
29 | 
30 |     passage.offset += start
31 |     logging.debug('before: %r', passage.text)
32 |     passage.text = passage.text[start:end]
33 |     logging.debug('after: %r', passage.text)
34 |     return passage
35 | 
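# A sketch (not part of the original file) of what split_document below
# produces for a one-passage report built with text2document (defined in
# text2bioc.py later in this tree):
#
#     doc = text2document('1', 'FINDINGS: Normal heart.\nIMPRESSION: Clear.')
#     new_doc = split_document(doc)
#     # new_doc has four passages: 'FINDINGS:' (a title), 'Normal heart.',
#     # 'IMPRESSION:' (a title), and 'Clear.'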
36 | def split_document(document, pattern=None):
37 |     """
38 |     Split one report into passages, one per section; sections are located
39 |     by matching the section-title pattern against the report text.
40 | 
41 |     Args:
42 |         document(BioCDocument): one document that contains a single passage.
43 |         pattern: a compiled regular expression for section titles;
44 |             defaults to SECTION_TITLES.
45 | 
46 |     Returns:
47 |         BioCDocument: a new BioCDocument instance
48 |     """
49 |     if pattern is None:
50 |         pattern = SECTION_TITLES
51 | 
52 |     new_document = bioc.BioCDocument()
53 |     new_document.id = document.id
54 |     new_document.infons = document.infons
55 | 
56 |     text = document.passages[0].text
57 |     offset = document.passages[0].offset
58 | 
59 |     def create_passage(start, end, title=None):
60 |         passage = bioc.BioCPassage()
61 |         passage.offset = start + offset
62 |         passage.text = text[start:end]
63 |         if title is not None:
64 |             passage.infons['title'] = title[:-1].strip() if title[-1] == ':' else title.strip()
65 |             passage.infons['type'] = 'title_1'
66 |         strip(passage)
67 |         return passage
68 | 
69 |     start = 0
70 |     for matcher in pattern.finditer(text):
71 |         logging.debug('Match: %s', matcher.group())
72 |         # add the text between the previous match and this title
73 |         end = matcher.start()
74 |         if end != start:
75 |             passage = create_passage(start, end)
76 |             if not is_empty(passage):
77 |                 new_document.add_passage(passage)
78 |         start = end
79 | 
80 |         # add the title passage
81 |         end = matcher.end()
82 |         passage = create_passage(start, end, text[start:end])
83 |         if not is_empty(passage):
84 |             new_document.add_passage(passage)
85 | 
86 |         start = end
87 | 
88 |     # add the trailing text after the last title
89 |     end = len(text)
90 |     if start < end:
91 |         passage = create_passage(start, end)
92 |         if not is_empty(passage):
93 |             new_document.add_passage(passage)
94 |     return new_document
95 | 
--------------------------------------------------------------------------------
/negbio/pipeline/ssplit.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import bioc
4 | 
5 | 
6 | class NltkSSplitter(object):
7 |     """NLTK sentence splitter"""
8 | 
9 |     def __init__(self, **kwargs):
10 |         self.newline = kwargs.pop('newline', False)
11 | 
12 |     def split(self, text, **kwargs):
13 |         import nltk
14 |         if not text:
15 |             return
16 | 
17 |         if self.newline:
18 |             line_splitter = self.split_line
19 |         else:
20 |             line_splitter = self.no_split
21 | 
22 |         for line, line_offset in line_splitter(text):
23 |             sent_list = nltk.sent_tokenize(line)
24 |             offset = 0
25 |             for sent in sent_list:
26 |                 offset = line.find(sent, offset)
27 |                 if offset == -1:
28 |                     logging.debug('Cannot find %s in %s', sent, text)
29 |                 yield sent, offset + line_offset
30 |                 offset += len(sent)
31 | 
32 |     @classmethod
33 |     def split_line(cls, text, sep='\n'):
34 |         lines = text.split(sep)
35 |         offset = 0
36 |         for line in lines:
37 |             offset = text.index(line, offset)
38 |             yield line, offset
39 |             offset += len(line)
40 | 
41 |     @classmethod
42 |     def no_split(cls, text, **kwargs):
43 |         yield text, 0
44 | 
45 |     def __repr__(self):
46 |         return 'NLTK SSplitter'
47 | 
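# A brief usage sketch (not part of the original file) for NegBioSSplitter,
# defined below, which inherits split() from NltkSSplitter:
#
#     splitter = NegBioSSplitter(newline=True)
#     for sent, offset in splitter.split('No pneumothorax.\nNo effusion.'):
#         print(offset, sent)   # (0, 'No pneumothorax.'), (17, 'No effusion.')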
48 | 
49 | class NegBioSSplitter(NltkSSplitter):
50 |     def split_doc(self, document):
51 |         """
52 |         Split the text of each passage into sentences with offsets.
53 | 
54 |         Args:
55 |             document(BioCDocument): one document
56 | 
57 |         Returns:
58 |             BioCDocument
59 |         """
60 |         for passage in document.passages:
61 |             for text, offset in self.split(passage.text):
62 |                 sentence = bioc.BioCSentence()
63 |                 sentence.offset = offset + passage.offset
64 |                 sentence.text = text
65 |                 passage.add_sentence(sentence)
66 |             # passage.text = None
67 |         return document
68 | 
--------------------------------------------------------------------------------
/negbio/pipeline/text2bioc.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import string
3 | from pathlib2 import Path
4 | 
5 | import bioc
6 | 
7 | 
8 | def printable(s, func=None):
9 |     """
10 |     Return s reduced to printable ASCII characters.
11 | 
12 |     Args:
13 |         s(str): string
14 |         func: an optional function used to convert non-ASCII characters;
15 |             without it, such characters are dropped with a warning
16 |     """
17 |     out = ''
18 |     for c in s:
19 |         if c in string.printable:
20 |             out += c
21 |         elif func is not None:
22 |             out += func(c)
23 |         else:
24 |             logging.warning('Cannot convert char: %s', c)
25 |     return out
26 | 
27 | 
28 | def text2document(id, text):
29 |     """
30 |     Convert text to a BioCDocument instance
31 | 
32 |     Args:
33 |         id (str): BioCDocument id
34 |         text (str): text
35 | 
36 |     Returns:
37 |         BioCDocument: a BioCDocument instance
38 |     """
39 |     document = bioc.BioCDocument()
40 |     document.id = id
41 |     text = printable(text).replace('\r\n', '\n')
42 | 
43 |     passage = bioc.BioCPassage()
44 |     passage.offset = 0
45 |     passage.text = text
46 |     document.add_passage(passage)
47 | 
48 |     return document
49 | 
50 | 
51 | def text2collection(*sources):
52 |     """
53 |     Return a BioCCollection containing one document per source file.
54 | 
55 |     Args:
56 |         sources: pathnames of text files
57 |     """
58 |     collection = bioc.BioCCollection()
59 |     for pathname in sources:
60 |         logging.debug('Process %s', pathname)
61 |         try:
62 |             with open(pathname) as fp:
63 |                 text = fp.read()
64 |             id = Path(pathname).stem
65 |             document = text2document(id, text)
66 |             collection.add_document(document)
67 |         except Exception:
68 |             logging.exception('Cannot convert %s', pathname)
69 |     return collection
70 | 
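# A brief sketch (not part of the original file):
#
#     collection = text2collection('a.txt', 'b.txt')
#     # one BioCDocument per file, with the id taken from the file stem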
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | future==0.16.0
2 | docutils==0.14
3 | docopt==0.6.2
4 | pytest==4.4.1
5 | networkx==1.11
6 | ply==3.10
7 | tqdm==4.19.5
8 | nltk==3.6.6
9 | bioc==1.3.1
10 | pystanforddependencies==0.3.1
11 | bllipparser==2016.9.11
12 | pymetamap==0.1
13 | JPype1>=0.6.3
14 | pathlib2==2.3.3
15 | numpy==1.21.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Always prefer setuptools over distutils
2 | # To use a consistent encoding
3 | from __future__ import print_function
4 | from codecs import open
5 | import os
6 | from subprocess import check_call
7 | 
8 | from setuptools import setup, find_packages
9 | from setuptools.command.develop import develop
10 | from setuptools.command.egg_info import egg_info
11 | from setuptools.command.install import install
12 | 
13 | here = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
14 | 
15 | 
16 | def readme():
17 |     # Get the long description from the README file
18 |     with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
19 |         return f.read()
20 | 
21 | 
22 | def read_requirements():
23 |     """Parse requirements.txt into package names and dependency links."""
24 |     reqs_path = os.path.join(here, 'requirements.txt')
25 |     with open(reqs_path, encoding='utf8') as f:
26 |         reqs = [line.strip() for line in f if line.strip() and not line.strip().startswith('#')]
27 | 
28 |     names = []
29 |     links = []
30 |     for req in reqs:
31 |         if '://' in req:
32 |             links.append(req)
33 |         else:
34 |             names.append(req)
35 |     return {'install_requires': names, 'dependency_links': links}
36 | 
37 | 
38 | def custom_command():
39 |     check_call("python -m nltk.downloader universal_tagset punkt wordnet".split())
40 | 
41 | 
42 | class CustomInstallCommand(install):
43 |     def run(self):
44 |         custom_command()
45 |         install.run(self)
46 | 
47 | 
48 | class CustomDevelopCommand(develop):
49 |     def run(self):
50 |         custom_command()
51 |         develop.run(self)
52 | 
53 | 
54 | class CustomEggInfoCommand(egg_info):
55 |     def run(self):
56 |         custom_command()
57 |         egg_info.run(self)
58 | 
59 | 
60 | setup(
61 |     name='negbio',
62 | 
63 |     # Versions should comply with PEP 440. For a discussion on single-sourcing
64 |     # the version across setup.py and the project code, see
65 |     # https://packaging.python.org/en/latest/single_source_version.html
66 |     version='0.9.4',
67 | 
68 |     description='NegBio: a tool for negation and uncertainty detection',
69 |     long_description=readme(),
70 | 
71 |     # The project's main homepage.
72 |     url='https://github.com/ncbi-nlp/NegBio.git',
73 | 
74 |     # Author details
75 |     author='Yifan Peng',
76 |     author_email='yifan.peng@nih.gov',
77 | 
78 |     license='Public Domain',
79 | 
80 |     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
81 |     classifiers=[
82 |         'Development Status :: 3 - Alpha',
83 |         # Indicate who your project is intended for
84 |         'Intended Audience :: Developers',
85 |         'Intended Audience :: Science/Research',
86 | 
87 |         # Pick your license as you wish (should match "license" above)
88 |         'License :: Public Domain',
89 | 
90 |         'Operating System :: MacOS',
91 |         'Operating System :: POSIX',
92 |         'Operating System :: POSIX :: Linux',
93 | 
94 |         # Specify the Python versions you support here.
95 | 'Programming Language :: Python', 96 | 'Topic :: Software Development', 97 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 98 | ], 99 | 100 | keywords='negbio', 101 | 102 | packages=find_packages(exclude=["tests.*", "tests", "backup", "docs"]), 103 | include_package_data=True, 104 | 105 | cmdclass={ 106 | 'install': CustomInstallCommand, 107 | 'develop': CustomDevelopCommand, 108 | 'egg_info': CustomEggInfoCommand 109 | }, 110 | 111 | entry_points = { 112 | 'console_scripts': ['negbio_pipeline=negbio.negbio_pipeline:main', 113 | 'main_chexpert=negbio.main_chexpert:main', 114 | 'main_mm=negbio.main_mm:main'], 115 | }, 116 | 117 | **read_requirements() 118 | ) 119 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/__init__.py -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import os 4 | import sys 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | 7 | import bioc -------------------------------------------------------------------------------- /tests/negbio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/__init__.py -------------------------------------------------------------------------------- /tests/negbio/ngrex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/ngrex/__init__.py -------------------------------------------------------------------------------- /tests/negbio/ngrex/test_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from negbio import ngrex 4 | from negbio.ngrex import parser 5 | from ply.lex import LexToken 6 | 7 | 8 | def test_lex(): 9 | _test_lex('{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}') 10 | _test_lex('{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}') 11 | _test_lex('{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key') 12 | with pytest.raises(TypeError): 13 | _test_yacc("xxx") 14 | 15 | 16 | def _test_lex(s): 17 | parser.lexer.input(s) 18 | for tok in parser.lexer: 19 | print(tok) 20 | 21 | 22 | def test_yacc(): 23 | # _test_yacc("{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}") 24 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} {lemma:/yyy/}") 25 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} ({lemma:/yyy/} >{} {lemma:/zzz/})") 26 | # _test_yacc("{} >{} {lemma:/left/} <{} {lemma:/question/}") 27 | # _test_yacc("{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}") 28 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key") 29 | with pytest.raises(KeyError): 30 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=t") 31 | 32 | 33 | def _test_yacc(s): 34 | pattern = ngrex.compile(s) 35 | print(pattern) 36 | 37 | 38 | if __name__ == '__main__': 39 | test_lex() 40 | test_yacc() 41 | 
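# A sketch (not part of the original test suite) of the ngrex pattern syntax
# exercised above, which resembles Stanford semgrex: '{lemma:/xxx/}' matches a
# node whose lemma matches the regex, '>{dependency:/d/}' follows a
# governor -> dependent edge, '<' follows the reverse direction, and '=name'
# binds the matched node:
#
#     pattern = ngrex.compile('{}=t <{dependency:/nsubj/} {lemma:/suspect/}')
#     for m in pattern.finditer(graph):   # 'graph' is a networkx DiGraph
#         print(m.get('t'))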
-------------------------------------------------------------------------------- /tests/negbio/ngrex/test_pattern.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from negbio import ngrex 3 | 4 | 5 | def get_graph(): 6 | G = nx.DiGraph() 7 | G.add_node('xxx', attr_dict={'lemma': 'xxx'}) 8 | G.add_node('yyy', attr_dict={'lemma': 'yyy'}) 9 | G.add_node('zzz', attr_dict={'lemma': 'zzz'}) 10 | G.add_edge('xxx', 'yyy', attr_dict={'dependency': 'aaa'}) 11 | G.add_edge('yyy', 'zzz', attr_dict={'dependency': 'bbb'}) 12 | G.add_edge('xxx', 'zzz', attr_dict={'dependency': 'ccc'}) 13 | return G 14 | 15 | 16 | def helper(G, p, expected): 17 | pattern = ngrex.compile(p) 18 | print(pattern.pattern) 19 | # actual = {m.group(0) for m in pattern.finditer(G)} 20 | actual = set() 21 | for m in pattern.finditer(G): 22 | actual.add(m.group(0)) 23 | assert actual == expected, '{} vs {}'.format(actual, expected) 24 | 25 | 26 | def test_regex(): 27 | G = get_graph() 28 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'}) 29 | 30 | 31 | def test_attribute(): 32 | G = get_graph() 33 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'}) 34 | helper(G, '{} >{tag:/aaa|bbb/} {}', set()) 35 | 36 | 37 | def test_relation(): 38 | G = get_graph() 39 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} {lemma:/yyy/}', {'xxx'}) 40 | helper(G, '{lemma:/yyy/} <{dependency:/aaa/} {lemma:/xxx/}', {'yyy'}) 41 | helper(G, '{} >{} {}', {'xxx', 'yyy'}) 42 | 43 | 44 | def test_relation_next(): 45 | G = get_graph() 46 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} ({lemma:/yyy/} >{dependency:/bbb/} {lemma:/zzz/})', 47 | {'xxx'}) 48 | 49 | 50 | def test_relation_conj(): 51 | G = get_graph() 52 | helper(G, '{} >{} {lemma:/yyy/} >{} {lemma:/zzz/}', {'xxx'}) 53 | helper(G, '{} >{} {lemma:/yyy/} <{} {lemma:/zzz/}', set()) 54 | 55 | 56 | def test_relation_disj(): 57 | G = get_graph() 58 | helper(G, '{} >{dependency:/aaa/} {} | >{dependency:/bbb/} {}', {'xxx', 'yyy'}) 59 | 60 | 61 | def test_variables(): 62 | G = get_graph() 63 | pattern = ngrex.compile('{}=t >{dependency:/aaa|bbb/} {}') 64 | print(pattern.pattern) 65 | actual = {m.get('t') for m in pattern.finditer(G)} 66 | assert actual == {'xxx', 'yyy'} 67 | 68 | 69 | if __name__ == '__main__': 70 | # test_relation() 71 | # test_relation_next() 72 | test_relation_conj() 73 | # test_relation_disj() 74 | # test_regex() 75 | # test_attribute() 76 | # test_variables() 77 | -------------------------------------------------------------------------------- /tests/negbio/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/negbio/pipeline/test_parse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from negbio.pipeline.parse import Bllip 4 | 5 | 6 | def test_Bllip(): 7 | b = Bllip() 8 | t = b.parse('hello world!') 9 | assert str(t) == '(S1 (S (NP (NN hello) (NN world) (NN !))))' 10 | 11 | 12 | if __name__ == '__main__': 13 | logging.basicConfig(level=logging.WARNING) 14 | test_Bllip() 15 | -------------------------------------------------------------------------------- /tests/negbio/test_cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 
from docopt import docopt
4 | 
5 | from negbio import negbio_pipeline, negbio_text2bioc, negbio_ssplit, negbio_section_split, negbio_parse
6 | 
7 | 
8 | def test_negbio():
9 |     doc = negbio_pipeline.__doc__
10 |     args = docopt(doc, 'text2bioc a b c'.split())
11 |     assert args['<command>'] == 'text2bioc'
12 |     assert args['<args>'] == ['a', 'b', 'c']
13 | 
14 | 
15 | def test_text2bioc():
16 |     doc = negbio_text2bioc.__doc__
17 |     args = docopt(doc, 'text2bioc --verbose --output=out a b c'.split())
18 |     assert args['--verbose']
19 |     assert args['--output'] == 'out'
20 |     assert args['<file>'] == ['a', 'b', 'c']
21 |     args = docopt(doc, 'text2bioc --output=out a b c'.split())
22 |     assert not args['--verbose']
23 | 
24 | 
25 | def test_ssplit():
26 |     doc = negbio_ssplit.__doc__
27 |     args = docopt(doc, 'ssplit --suffix suffix --newline_is_sentence_break --output out a b c'.split())
28 |     assert args['--newline_is_sentence_break']
29 |     assert args['--output'] == 'out'
30 |     assert args['--suffix'] == 'suffix'
31 |     assert args['<file>'] == ['a', 'b', 'c']
32 | 
33 | 
34 | def test_section_split():
35 |     doc = negbio_section_split.__doc__
36 |     args = docopt(doc, 'section_split --pattern pattern --output out a b c'.split())
37 |     assert args['--output'] == 'out'
38 |     assert args['--pattern'] == 'pattern'
39 |     assert args['<file>'] == ['a', 'b', 'c']
40 | 
41 | 
42 | def test_parse():
43 |     doc = negbio_parse.__doc__
44 |     args = docopt(doc, 'parse --model model --output out a b c'.split())
45 |     assert args['--output'] == 'out'
46 |     assert args['--model'] == 'model'
47 |     assert args['<file>'] == ['a', 'b', 'c']
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     logging.basicConfig(level=logging.WARNING)
52 |     test_ssplit()
53 | 
--------------------------------------------------------------------------------
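# A minimal end-to-end sketch (not part of the repository) composing the
# classes defined in negbio/pipeline using only the APIs shown above; MetaMap
# named-entity recognition and negation detection are elided:
#
#     from negbio.pipeline.text2bioc import text2document
#     from negbio.pipeline.section_split import split_document
#     from negbio.pipeline.ssplit import NegBioSSplitter
#     from negbio.pipeline.parse import NegBioParser
#     from negbio.pipeline.ptb2ud import Lemmatizer, NegBioPtb2DepConverter
#
#     doc = text2document('1', 'FINDINGS: No pneumothorax.')
#     doc = split_document(doc)
#     doc = NegBioSSplitter(newline=True).split_doc(doc)
#     doc = NegBioParser().parse_doc(doc)
#     doc = NegBioPtb2DepConverter(Lemmatizer(), universal=True).convert_doc(doc)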