├── .gitignore
├── .pylintrc
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── docs
│   ├── Makefile
│   ├── README
│   ├── acknowledgments.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── developer_guide.rst
│   ├── disclaimer.rst
│   ├── getting_started.rst
│   ├── index.rst
│   ├── license.rst
│   ├── make.bat
│   ├── reference.rst
│   └── user_guide.rst
├── environment2.7.yml
├── environment3.7.yml
├── examples
│   ├── 00000086.txt
│   ├── 00019248.txt
│   ├── 1.xml
│   ├── 2.xml
│   ├── cuis-cvpr2017.txt
│   ├── openi-testset.txt
│   └── openi_gld_std14.csv
├── images
│   └── negbio.png
├── negbio
│   ├── __init__.py
│   ├── chexpert
│   │   ├── LICENSE
│   │   ├── __init__.py
│   │   ├── constants.py
│   │   ├── patterns
│   │   │   ├── negation.txt
│   │   │   ├── post_negation_uncertainty.txt
│   │   │   └── pre_negation_uncertainty.txt
│   │   ├── phrases
│   │   │   ├── mention
│   │   │   │   ├── airspace_opacity.txt
│   │   │   │   ├── atelectasis.txt
│   │   │   │   ├── cardiomegaly.txt
│   │   │   │   ├── consolidation.txt
│   │   │   │   ├── edema.txt
│   │   │   │   ├── enlarged_cardiomediastinum.txt
│   │   │   │   ├── fracture.txt
│   │   │   │   ├── lung_lesion.txt
│   │   │   │   ├── no_finding.txt
│   │   │   │   ├── pleural_effusion.txt
│   │   │   │   ├── pleural_other.txt
│   │   │   │   ├── pneumonia.txt
│   │   │   │   ├── pneumothorax.txt
│   │   │   │   └── support_devices.txt
│   │   │   └── unmention
│   │   │       ├── airspace_opacity.txt
│   │   │       ├── lung_lesion.txt
│   │   │       └── pleural_effusion.txt
│   │   └── stages
│   │       ├── __init__.py
│   │       ├── aggregate.py
│   │       ├── classify.py
│   │       ├── extract.py
│   │       └── load.py
│   ├── cli_utils.py
│   ├── compat.py
│   ├── ext
│   │   ├── __init__.py
│   │   └── normalize_mimiccxr.py
│   ├── main_chexpert.py
│   ├── main_mm.py
│   ├── neg
│   │   ├── __init__.py
│   │   ├── neg_detector.py
│   │   ├── propagator.py
│   │   ├── semgraph.py
│   │   └── utils.py
│   ├── negbio_clean.py
│   ├── negbio_dner_chexpert.py
│   ├── negbio_dner_matamap.py
│   ├── negbio_neg.py
│   ├── negbio_neg_chexpert.py
│   ├── negbio_normalize.py
│   ├── negbio_parse.py
│   ├── negbio_pipeline.py
│   ├── negbio_ptb2ud.py
│   ├── negbio_section_split.py
│   ├── negbio_ssplit.py
│   ├── negbio_text2bioc.py
│   ├── ngrex
│   │   ├── __init__.py
│   │   ├── parser.out
│   │   ├── parser.py
│   │   ├── parsetab.py
│   │   └── pattern.py
│   ├── patterns
│   │   ├── neg_patterns.txt
│   │   ├── section_titles.txt
│   │   └── uncertainty_patterns.txt
│   └── pipeline
│       ├── __init__.py
│       ├── cleanup.py
│       ├── dner_mm.py
│       ├── negdetect.py
│       ├── parse.py
│       ├── pipeline.py
│       ├── ptb2ud.py
│       ├── scan.py
│       ├── section_split.py
│       ├── ssplit.py
│       └── text2bioc.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── context.py
    └── negbio
        ├── __init__.py
        ├── ngrex
        │   ├── __init__.py
        │   ├── test_parser.py
        │   └── test_pattern.py
        ├── pipeline
        │   ├── __init__.py
        │   └── test_parse.py
        └── test_cli.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 |
3 | .pytest_cache/
4 | backup
5 | examples-local
6 | .DS_Store
7 |
8 | ### Python template
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | env/
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *,cover
55 | .hypothesis/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # dotenv
91 | .env
92 |
93 | # virtualenv
94 | .venv
95 | venv/
96 | ENV/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 | ### JetBrains template
104 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
105 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
106 |
107 | # User-specific stuff:
108 | .idea
109 | .idea/**/workspace.xml
110 | .idea/**/tasks.xml
111 | .idea/dictionaries
112 |
113 | # Sensitive or high-churn files:
114 | .idea/**/dataSources/
115 | .idea/**/dataSources.ids
116 | .idea/**/dataSources.xml
117 | .idea/**/dataSources.local.xml
118 | .idea/**/sqlDataSources.xml
119 | .idea/**/dynamic.xml
120 | .idea/**/uiDesigner.xml
121 |
122 | # Gradle:
123 | .idea/**/gradle.xml
124 | .idea/**/libraries
125 |
126 | # Mongo Explorer plugin:
127 | .idea/**/mongoSettings.xml
128 |
129 | ## File-based project format:
130 | *.iws
131 |
132 | ## Plugin-specific files:
133 |
134 | # IntelliJ
135 | /out/
136 |
137 | # mpeltonen/sbt-idea plugin
138 | .idea_modules/
139 |
140 | # JIRA plugin
141 | atlassian-ide-plugin.xml
142 |
143 | # Crashlytics plugin (for Android Studio and IntelliJ)
144 | com_crashlytics_export_strings.xml
145 | crashlytics.properties
146 | crashlytics-build.properties
147 | fabric.properties
148 |
149 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | install:
5 | - pip install -r requirements.txt
6 | # - sudo apt-get update
7 | # # We do this conditionally because it saves us some downloading if the
8 | # # version is the same.
9 | # - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
10 | # wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
11 | # else
12 | # wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
13 | # fi
14 | # - bash miniconda.sh -b -p $HOME/miniconda
15 | # - export PATH="$HOME/miniconda/bin:$PATH"
16 | # - hash -r
17 | # - conda config --set always_yes yes --set changeps1 no
18 | # - conda update -q conda
19 | # # Useful for debugging any issues with conda
20 | # - conda info -a
21 | #
22 | # # Replace dep1 dep2 ... with your dependencies
23 | # - conda env create --file environment2.7.yml
24 | # - source activate negbio2.7
25 |
26 | script:
27 | - py.test
28 |
29 | notifications:
30 | email: false
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue,
4 | email, or any other method with the owners of this repository before making a change.
5 | This project adheres to the [Contributor Covenant Code of Conduct](http://contributor-covenant.org/).
6 |
7 | # Maintainers
8 |
9 | NegBio is maintained with :heart: by:
10 |
11 | - **@yfpeng**
12 |
13 | See also the list of [contributors](https://github.com/ncbi-nlp/NegBio/contributors) who participated in this project.
14 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | PUBLIC DOMAIN NOTICE
2 | National Center for Biotechnology Information
3 |
4 | This software/database is a "United States Government Work" under the terms of
5 | the United States Copyright Act. It was written as part of the author's
6 | official duties as a United States Government employee and thus cannot be
7 | copyrighted. This software/database is freely available to the public for use.
8 | The National Library of Medicine and the U.S. Government have not placed any
9 | restriction on its use or reproduction.
10 |
11 | Although all reasonable efforts have been taken to ensure the accuracy and
12 | reliability of the software and data, the NLM and the U.S. Government do not and
13 | cannot warrant the performance or results that may be obtained by using this
14 | software or data. The NLM and the U.S. Government disclaim all warranties,
15 | express or implied, including warranties of performance, merchantability or
16 | fitness for any particular purpose.
17 |
18 | Please cite the author in any work or product based on this material:
19 |
20 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z.
21 | NegBio: a high-performance tool for negation and uncertainty detection in radiology reports.
22 | AMIA 2018 Informatics Summit. 2018.
23 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE.txt
3 | include CONTRIBUTING.md
4 | include requirements.txt
5 | include examples/*
6 | recursive-include negbio/patterns *
7 | recursive-include negbio/chexpert/patterns *
8 | recursive-include negbio/chexpert/phrases *
9 |
10 | prune tests
11 | prune backup
12 | prune docs
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true
2 | :target: https://raw.githubusercontent.com/ncbi-nlp/NegBio/master/images/negbio.png?raw=true
3 | :alt: NegBio
4 |
5 | -----------------------
6 |
7 | .. image:: https://img.shields.io/travis/yfpeng/NegBio/master.svg
8 | :target: https://travis-ci.org/yfpeng/NegBio
9 | :alt: Build status
10 |
11 | .. image:: https://img.shields.io/pypi/v/negbio.svg
12 | :target: https://pypi.python.org/pypi/negbio
13 | :alt: PyPI version
14 |
15 | .. image:: https://img.shields.io/readthedocs/negbio.svg
16 | :target: http://negbio.readthedocs.io
17 | :alt: RTD version
18 |
19 |
20 | NegBio is a high-performance NLP tool for negation and uncertainty detection in clinical texts (e.g. radiology reports).
21 |
22 |
23 | Get started
24 | ===========
25 |
26 | Install NegBio
27 | ~~~~~~~~~~~~~~
28 |
29 | 1. Installing from source (recommended)
30 |
31 | .. code-block:: bash
32 |
33 | $ git clone https://github.com/ncbi-nlp/NegBio.git
34 | $ cd NegBio
35 | $ python setup.py install --user
36 | $ export PATH=~/.local/bin:$PATH
37 |
38 | 2. Installing from pip
39 |
40 | .. code-block:: bash
41 |
42 | $ pip install negbio
43 |
44 |
45 |
46 |
47 | Prepare the dataset
48 | ~~~~~~~~~~~~~~~~~~~
49 |
50 | The inputs can be in either plain text or `BioC <http://bioc.sourceforge.net/>`_ format.
51 | If the reports are in plain text, each report needs to be in a separate file.
52 | Some examples can be found in the ``examples`` folder.
53 |
54 | Run the script
55 | ~~~~~~~~~~~~~~
56 |
57 | There are two ways to run the pipeline.
58 |
59 | **NOTE**: If you want to process a lot of reports (e.g., > 1000), it is recommended to run the pipeline step-by-step.
60 | See the `User guide <https://negbio.readthedocs.io/en/latest/user_guide.html>`_.
61 |
62 |
63 | Using the CheXpert algorithm
64 | ____________________________
65 |
66 | If you want to use the `CheXpert <https://stanfordmlgroup.github.io/projects/chexpert/>`_ method, run one of the following commands:
67 |
68 | .. code-block:: bash
69 |
70 | $ main_chexpert text --output=examples examples/00000086.txt examples/00019248.txt
71 |
72 | .. code-block:: bash
73 |
74 | $ main_chexpert bioc --output=examples examples/1.xml
75 |
76 |
77 | Using MetaMap
78 | _____________
79 |
80 | If you want to use MetaMap, run one of the following commands, replacing ``<metamap path>`` with the actual **ABSOLUTE**
81 | path to the MetaMap binary, such as **META_MAP_HOME/bin/metamap16**.
82 |
83 | .. code-block:: bash
84 |
85 | $ main_mm text --metamap=<metamap path> --output=examples examples/00000086.txt \
86 | examples/00019248.txt
87 |
88 | .. code-block:: bash
89 |
90 | $ main_mm bioc --metamap=<metamap path> --output=examples examples/1.xml
91 |
92 |
93 | Documentation
94 | =============
95 |
96 | NegBio `documentation <http://negbio.readthedocs.io>`_ is available on Read the Docs.
97 |
98 | See `Getting Started <https://negbio.readthedocs.io/en/latest/getting_started.html>`_ for installation and basic
99 | information. To contribute to NegBio, read our `contribution guide <https://github.com/ncbi-nlp/NegBio/blob/master/CONTRIBUTING.md>`_.
100 |
101 | Citing NegBio
102 | =============
103 |
104 | If you're running the NegBio pipeline, please cite:
105 |
106 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty
107 |   detection in radiology reports <https://arxiv.org/abs/1712.05898>`_. *AMIA 2018 Informatics Summit*. 2018.
108 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks
109 |   on weakly-supervised classification and localization of common thorax diseases <https://arxiv.org/abs/1705.02315>`_.
110 | *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106.
111 |
112 | Acknowledgments
113 | ===============
114 |
115 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of
116 | Medicine and Clinical Center.
117 |
118 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making
119 | their software tools publicly available.
120 |
121 | We thank Dr. Alexis Allot for the helpful discussion.
122 |
123 | Disclaimer
124 | ==========
125 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced
126 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight
127 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information
128 | produced on this website. NIH does not independently verify the validity or utility of the information produced
129 | by this tool. If you have questions about the information produced on this website, please see a health care
130 | professional. More information about NCBI's disclaimer policy is available.
131 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = negbio
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
1 | The documentation in this tree is in plain text files and can be viewed using
2 | any text file viewer.
3 |
4 | It uses ReST (reStructuredText) [1], and the Sphinx documentation system [2].
5 | This allows it to be built into other forms for easier viewing and browsing.
6 |
7 | To create an HTML version of the docs:
8 |
9 | * Install Sphinx (using ``pip install Sphinx sphinx_rtd_theme`` or some other method)
10 |
11 | * In this docs/ directory, type ``make html`` (or ``make.bat html`` on
12 | Windows) at a shell prompt.
13 |
14 | The documentation in _build/html/index.html can then be viewed in a web browser.
15 |
16 | [1] http://docutils.sourceforge.net/rst.html
17 | [2] http://sphinx-doc.org/
--------------------------------------------------------------------------------
/docs/acknowledgments.rst:
--------------------------------------------------------------------------------
1 | Acknowledgments
2 | ---------------
3 |
4 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of
5 | Medicine and Clinical Center.
6 |
7 | We are grateful to the authors of NegEx, MetaMap, Stanford CoreNLP, Bllip parser, and CheXpert labeler for making
8 | their software tools publicly available.
9 |
10 | We thank Dr. Alexis Allot for the helpful discussion.
11 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # negbio documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Feb 8 15:24:06 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = []
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | templates_path = ['_templates']
38 |
39 | # The suffix(es) of source filenames.
40 | # You can specify multiple suffix as a list of string:
41 | #
42 | # source_suffix = ['.rst', '.md']
43 | source_suffix = '.rst'
44 |
45 | # The master toctree document.
46 | master_doc = 'index'
47 |
48 | # General information about the project.
49 | project = 'negbio'
50 | copyright = '2019, NCBI, NLM, NIH'
51 | author = 'Yifan Peng'
52 |
53 | # The version info for the project you're documenting, acts as replacement for
54 | # |version| and |release|, also used in various other places throughout the
55 | # built documents.
56 | #
57 | # The short X.Y version.
58 | version = '1.0'
59 | # The full version, including alpha/beta/rc tags.
60 | release = '1.0'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # This patterns also effect to html_static_path and html_extra_path
72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 |
81 | # -- Options for HTML output ----------------------------------------------
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'sphinx_rtd_theme'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = {}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_static_path = ['_static']
98 |
99 |
100 | # -- Options for HTMLHelp output ------------------------------------------
101 |
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'negbiodoc'
104 |
105 |
106 | # -- Options for LaTeX output ---------------------------------------------
107 |
108 | latex_elements = {
109 | # The paper size ('letterpaper' or 'a4paper').
110 | #
111 | # 'papersize': 'letterpaper',
112 |
113 | # The font size ('10pt', '11pt' or '12pt').
114 | #
115 | # 'pointsize': '10pt',
116 |
117 | # Additional stuff for the LaTeX preamble.
118 | #
119 | # 'preamble': '',
120 |
121 | # Latex figure (float) alignment
122 | #
123 | # 'figure_align': 'htbp',
124 | }
125 |
126 | # Grouping the document tree into LaTeX files. List of tuples
127 | # (source start file, target name, title,
128 | # author, documentclass [howto, manual, or own class]).
129 | latex_documents = [
130 | (master_doc, 'negbio.tex', 'negbio Documentation',
131 | 'Yifan Peng', 'manual'),
132 | ]
133 |
134 |
135 | # -- Options for manual page output ---------------------------------------
136 |
137 | # One entry per manual page. List of tuples
138 | # (source start file, name, description, authors, manual section).
139 | man_pages = [
140 | (master_doc, 'negbio', 'negbio Documentation',
141 | [author], 1)
142 | ]
143 |
144 |
145 | # -- Options for Texinfo output -------------------------------------------
146 |
147 | # Grouping the document tree into Texinfo files. List of tuples
148 | # (source start file, target name, title, author,
149 | # dir menu entry, description, category)
150 | texinfo_documents = [
151 | (master_doc, 'negbio', 'negbio Documentation',
152 | author, 'negbio', 'One line description of project.',
153 | 'Miscellaneous'),
154 | ]
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | Contributing
2 | ------------
3 |
4 | Please read ``CONTRIBUTING.md`` for details on our code of conduct, and the process for submitting pull requests to us.
--------------------------------------------------------------------------------
/docs/developer_guide.rst:
--------------------------------------------------------------------------------
1 | NegBio Developer Guide
2 | ======================
3 |
4 | Create the documentation
5 | ^^^^^^^^^^^^^^^^^^^^^^^^
6 |
7 | Install Sphinx
8 |
9 | .. code-block:: bash
10 | :linenos:
11 |
12 | $ pip install Sphinx
13 | $ pip install sphinx_rtd_theme
14 | $ cd docs
15 | $ make html
--------------------------------------------------------------------------------
/docs/disclaimer.rst:
--------------------------------------------------------------------------------
1 | Disclaimer
2 | ==========
3 |
4 | This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced
5 | on this website is not intended for direct diagnostic use or medical decision-making without review and oversight
6 | by a clinical professional. Individuals should not change their health behavior solely on the basis of information
7 | produced on this website. NIH does not independently verify the validity or utility of the information produced
8 | by this tool. If you have questions about the information produced on this website, please see a health care
9 | professional. More information about NCBI's disclaimer policy is available.
10 |
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
1 | Getting Started with NegBio
2 | ===========================
3 |
4 | These instructions will get you a copy of the project up and running on your local machine for development and testing
5 | purposes. The package should successfully install on Linux (and possibly macOS).
6 |
7 | Installing
8 | ----------
9 |
10 | Prerequisites
11 | ~~~~~~~~~~~~~
12 |
13 | * python >=2.7
14 | * Linux
15 | * Java
16 |
17 | Note: since v1.0, MetaMap is not required. You can use the CheXpert vocabularies (``negbio/chexpert/phrases``) instead.
18 | If you want to use MetaMap, it can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml <https://metamap.nlm.nih.gov/MainDownload.shtml>`_.
19 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml <https://metamap.nlm.nih.gov/Installation.shtml>`_.
20 | Please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
21 |
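Both servers ship with MetaMap; a minimal sketch to start them, assuming MetaMap is installed under ``META_MAP_HOME``:

.. code-block:: bash

   $ META_MAP_HOME/bin/skrmedpostctl start
   $ META_MAP_HOME/bin/wsdserverctl start
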
22 | Installing from source (recommended)
23 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 |
25 | .. code-block:: bash
26 |
27 | $ git clone https://github.com/ncbi-nlp/NegBio.git
28 | $ cd NegBio
29 | $ python setup.py install --user
30 | $ export PATH=~/.local/bin:$PATH
31 |
32 | Installing from pip
33 | ~~~~~~~~~~~~~~~~~~~
34 |
35 | .. code-block:: bash
36 |
37 | $ pip install negbio
38 |
39 |
40 | Using NegBio
41 | ------------
42 |
43 | Prepare the dataset
44 | ~~~~~~~~~~~~~~~~~~~
45 |
46 | The inputs can be in either plain text or `BioC <http://bioc.sourceforge.net/>`_ format. If the reports are in plain
47 | text, each report needs to be in a separate file. Some examples can be found in the ``examples`` folder.
48 |
49 | Run the script
50 | ~~~~~~~~~~~~~~
51 |
52 | There are two ways to run the pipeline.
53 |
54 | Using CheXpert algorithm
55 | ________________________
56 |
57 | If you want to use the CheXpert method, run one of the following commands:
58 |
59 | .. code-block:: bash
60 |
61 | $ main_chexpert text --output=examples/test.neg.xml examples/00000086.txt examples/00019248.txt
62 |
63 | .. code-block:: bash
64 |
65 | $ main_chexpert bioc --output=examples/test.neg.xml examples/1.xml
66 |
67 | The script will
68 |
69 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
70 | 2. Detect concepts using CheXpert pre-defined vocabularies (by default using the list ``negbio/chexpert/phrases``)
71 | 3. Detect positive, negative and uncertain concepts using rules in ``negbio/chexpert/patterns``
72 | 4. Save the results in ``examples/test.neg.xml``
73 |
74 | More options (e.g., setting the CUI list or rules) can be obtained by running
75 |
76 | .. code-block:: bash
77 |
78 | $ main_chexpert --help
79 |
80 | Using MetaMap
81 | _____________
82 |
83 | If you want to use MetaMap, run the following commands, setting ``METAMAP_BIN`` to the actual **ABSOLUTE**
84 | path of the MetaMap binary, such as **META_MAP_HOME/bin/metamap16**.
85 |
86 | .. code-block:: bash
87 |
88 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
89 | $ main_mm text --metamap=$METAMAP_BIN --output=examples/test.neg.xml \
90 | examples/00000086.txt examples/00019248.txt
91 |
92 | .. code-block:: bash
93 |
94 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
95 | $ main_mm bioc --metamap=$METAMAP_BIN --output=examples/test.neg.xml examples/1.xml
96 |
97 | The script will
98 |
99 | 1. [Optional] Combine ``examples/00000086.txt`` and ``examples/00019248.txt`` into one BioC XML file
100 | 2. Detect UMLS concepts (CUIs) using MetaMap (by default using the CUI list ``examples/cuis-cvpr2017.txt``)
101 | 3. Detect negative and uncertain CUIs using rules in ``negbio/patterns``
102 | 4. Save the results in ``examples/test.neg.xml``
103 |
104 | More options (e.g., setting the CUI list or rules) can be obtained by running
105 |
106 | .. code-block:: bash
107 |
108 | $ main_mm --help
109 |
110 |
111 | Next Steps
112 | ----------
113 |
114 | To start learning how to use NegBio, see the :doc:`user_guide`.
115 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. negbio documentation master file, created by
2 | sphinx-quickstart on Thu Feb 8 15:24:06 2018.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | NegBio documentation
7 | ====================
8 |
9 | .. toctree::
10 | :maxdepth: 5
11 | :caption: Contents:
12 |
13 | getting_started
14 | user_guide
15 | developer_guide
16 | license
17 | contributing
18 | acknowledgments
19 | disclaimer
20 | reference
21 |
22 |
23 | Indices and tables
24 | ==================
25 |
26 | * :ref:`genindex`
27 | * :ref:`modindex`
28 | * :ref:`search`
29 |
--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | License
2 | =======
3 |
4 | PUBLIC DOMAIN NOTICE
5 |
6 | National Center for Biotechnology Information
7 |
8 | This software/database is a "United States Government Work" under the terms of
9 | the United States Copyright Act. It was written as part of the author's
10 | official duties as a United States Government employee and thus cannot be
11 | copyrighted. This software/database is freely available to the public for use.
12 | The National Library of Medicine and the U.S. Government have not placed any
13 | restriction on its use or reproduction.
14 |
15 | Although all reasonable efforts have been taken to ensure the accuracy and
16 | reliability of the software and data, the NLM and the U.S. Government do not and
17 | cannot warrant the performance or results that may be obtained by using this
18 | software or data. The NLM and the U.S. Government disclaim all warranties,
19 | express or implied, including warranties of performance, merchantability or
20 | fitness for any particular purpose.
21 |
22 | Please cite the author in any work or product based on these materials:
23 |
24 | Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z.
25 | NegBio: a high-performance tool for negation and uncertainty detection in
26 | radiology reports.
27 | AMIA 2018 Informatics Summit. 2018.
28 |
29 | Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R.
30 | ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks on
31 | weakly-supervised classification and localization of common thorax diseases.
32 | IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2017, 2097-2106.
33 |
34 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=negbio
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/reference.rst:
--------------------------------------------------------------------------------
1 | Reference
2 | =========
3 |
4 | * Peng Y, Wang X, Lu L, Bagheri M, Summers RM, Lu Z. `NegBio: a high-performance tool for negation and uncertainty
5 |   detection in radiology reports <https://arxiv.org/abs/1712.05898>`_. *AMIA 2018 Informatics Summit*. 2018.
6 | * Wang X, Peng Y, Lu L, Bagheri M, Lu Z, Summers R. `ChestX-ray8: Hospital-scale Chest X-ray database and benchmarks
7 |   on weakly-supervised classification and localization of common thorax diseases <https://arxiv.org/abs/1705.02315>`_.
8 | *IEEE Conference on Computer Vision and Pattern Recognition (CVPR)*. 2017, 2097-2106.
--------------------------------------------------------------------------------
/docs/user_guide.rst:
--------------------------------------------------------------------------------
1 | NegBio User Guide
2 | =================
3 |
4 | Run the pipeline step-by-step
5 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6 |
7 | The step-by-step pipeline generates all intermediate documents, so you can easily rerun a single step if it fails.
8 | The steps are:
9 |
10 | 1. ``text2bioc`` combines text into a BioC XML file.
11 | 2. ``normalize`` removes noisy text such as ``[**Patterns**]``.
12 | 3. ``section_split`` splits the report into sections based on the titles in ``patterns/section_titles.txt``.
13 | 4. ``ssplit`` splits text into sentences.
14 | 5. Named entity recognition
15 |
16 | a. ``dner_mm`` detects UMLS concepts using MetaMap.
17 | b. ``dner_chexpert`` detects concepts using the CheXpert vocabularies at ``negbio/chexpert/phrases``.
18 |
19 | 6. ``parse`` parses sentences using the `Bllip parser <https://github.com/BLLIP/bllip-parser>`_.
20 | 7. ``ptb2ud`` converts the parse tree to universal dependencies using the `Stanford converter <https://github.com/dmcc/PyStanfordDependencies>`_.
21 | 8. Negation detection
22 |
23 | a. ``neg`` detects negative and uncertain findings.
24 | b. ``neg_chexpert`` detects positive, negative, and uncertain findings (recommended).
25 |
26 | 9. ``cleanup`` removes intermediate information.
27 |
28 | Steps 2-9 process the input files one by one and generate the results in the output directory.
29 | Steps 2 and 3 can be skipped, and you can choose either MetaMap (5a) or CheXpert (5b) for named entity recognition. A complete run chaining the steps is sketched below.
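
A minimal end-to-end sketch (the ``step*`` directory names are illustrative; it assumes plain-text reports in ``$TEXT_DIR`` and skips the optional steps 2 and 3):

.. code-block:: bash

   $ negbio_pipeline text2bioc --output=step1/test.xml $TEXT_DIR/*.txt
   $ negbio_pipeline ssplit --output=step2 step1/*.xml
   $ negbio_pipeline dner_chexpert --output=step3 step2/*.xml
   $ negbio_pipeline parse --output=step4 step3/*.xml
   $ negbio_pipeline ptb2ud --output=step5 step4/*.xml
   $ negbio_pipeline neg_chexpert --output=step6 step5/*.xml
   $ negbio_pipeline cleanup --output=step7 step6/*.xml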
30 |
31 | 1. Convert text files to BioC format
32 | ------------------------------------
33 |
34 | You can skip this step if the reports are already in the `BioC <http://bioc.sourceforge.net/>`_ format.
35 | **If you have lots of reports, it is recommended to put them into several BioC files, for example, 100 reports per BioC file.**
36 |
37 | .. code-block:: bash
38 |
39 | $ export BIOC_DIR=/path/to/bioc
40 | $ export TEXT_DIR=/path/to/text
41 | $ negbio_pipeline text2bioc --output=$BIOC_DIR/test.xml $TEXT_DIR/*.txt
42 |
43 | Another commonly used command is:
44 |
45 | .. code-block:: bash
46 |
47 | $ find $TEXT_DIR -type f | negbio_pipeline text2bioc --output=$BIOC_DIR
48 |
49 | 2. Normalize reports
50 | --------------------
51 |
52 | This step removes noisy text, such as the ``[**Patterns**]`` placeholders in the MIMIC-III reports.
53 |
54 | .. code-block:: bash
55 |
56 | $ negbio_pipeline normalize --output=$OUTPUT_DIR $INPUT_DIR/*.xml
57 |
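For reference, MIMIC-III de-identification placeholders look like the bracketed span in this hypothetical report line, which ``normalize`` treats as noise:

.. code-block:: text

   final report [**2151-7-16**] chest pa and lateral
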
58 | 3. Split each report into sections
59 | -----------------------------------
60 |
61 | This step splits the report into sections.
62 | The default list of section titles is in ``patterns/section_titles.txt``.
63 | You can specify customized section titles via the ``--pattern`` option; a hypothetical titles file is sketched below.
64 |
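Assuming one title per line (an illustrative excerpt, not the shipped defaults):

.. code-block:: text

   findings:
   impression:
   comparison:
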
65 | .. code-block:: bash
66 |
67 | $ negbio_pipeline section_split --output=$OUTPUT_DIR $INPUT_DIR/*.xml
68 |
69 |
70 | 4. Split each report into sentences
71 | ------------------------------------
72 |
73 | This step splits the report into sentences using the NLTK splitter
74 | (`nltk.tokenize.sent_tokenize <https://www.nltk.org/api/nltk.tokenize.html>`_).
75 |
76 | .. code-block:: bash
77 |
78 | $ negbio_pipeline ssplit --output=$OUTPUT_DIR $INPUT_DIR/*.xml
79 |
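Under the hood this is NLTK's Punkt-based splitter; a standalone sketch (``nltk.download('punkt')`` may be needed on first use):

.. code-block:: python

   from nltk.tokenize import sent_tokenize

   text = 'no evidence of focal infiltrate, effusion or pneumothorax. stable chest.'
   for sentence in sent_tokenize(text):
       print(sentence)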
80 |
81 | 5. Named entity recognition
82 | ---------------------------
83 |
84 | This step recognizes named entities (e.g., findings, diseases, devices) from the reports.
85 | The first version of NegBio uses MetaMap to detect UMLS concepts.
86 |
87 | MetaMap can be downloaded from `https://metamap.nlm.nih.gov/MainDownload.shtml <https://metamap.nlm.nih.gov/MainDownload.shtml>`_.
88 | Installation instructions can be found at `https://metamap.nlm.nih.gov/Installation.shtml <https://metamap.nlm.nih.gov/Installation.shtml>`_.
89 | Before using MetaMap, please make sure that both ``skrmedpostctl`` and ``wsdserverctl`` are started.
90 |
91 | MetaMap tries to extract all UMLS concepts.
92 | Many of them are irrelevant to radiology.
93 | Therefore, it is better to specify the UMLS concepts of interest via the ``--cuis`` option (e.g., ``examples/cuis-cvpr2017.txt``).
94 |
95 | .. code-block:: bash
96 |
97 | $ export METAMAP_BIN=META_MAP_HOME/bin/metamap16
98 | $ negbio_pipeline dner_mm --metamap=$METAMAP_BIN --output=$OUTPUT_DIR $INPUT_DIR/*.xml
99 |
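NegBio talks to MetaMap through the ``pymetamap`` package (pinned in the environment files); a minimal standalone sketch, assuming MetaMap and its two servers are running and the hypothetical binary path below is replaced with your own:

.. code-block:: python

   from pymetamap import MetaMap

   # Hypothetical path; point it at your MetaMap binary.
   mm = MetaMap.get_instance('META_MAP_HOME/bin/metamap16')
   sentences = ['no evidence of focal infiltrate, effusion or pneumothorax.']
   concepts, error = mm.extract_concepts(sentences, ids=[0])
   for concept in concepts:
       print(concept.cui, concept.preferred_name)
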
100 | NegBio also integrates the CheXpert vocabularies to recognize the presence of 14 observations.
101 | All vocabularies can be found at ``negbio/chexpert/phrases``.
102 | Each file in the folder represents one type of named entity, with its various text expressions.
103 | So far, NegBio does not support adding new types to the folder, but you can add more text expressions for an existing type (see the excerpt below).
104 |
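A hypothetical excerpt from a mention file such as ``negbio/chexpert/phrases/mention/cardiomegaly.txt``, assuming one expression per line:

.. code-block:: text

   cardiomegaly
   enlarged heart
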
105 | .. code-block:: bash
106 |
107 | $ negbio_pipeline dner_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
108 |
109 |
110 | In general, MetaMap is more comprehensive, while CheXpert is more accurate on its 14 types of findings.
111 | MetaMap is also slower and more brittle than CheXpert.
112 |
113 |
114 | 6. Parse the sentence
115 | ---------------------
116 |
117 | This step parses sentences using the `Bllip parser <https://github.com/BLLIP/bllip-parser>`_.
118 |
119 | .. code-block:: bash
120 |
121 | $ negbio_pipeline parse --output=$OUTPUT_DIR $INPUT_DIR/*.xml
122 |
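Parsing relies on the ``bllipparser`` package with a biomedical model; a minimal standalone sketch (the ``GENIA+PubMed`` model name is our assumption, downloaded on first use):

.. code-block:: python

   from bllipparser import RerankingParser

   # Fetches the model on the first call, then loads it.
   rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
   # Returns a Penn Treebank-style parse as a string.
   print(rrp.simple_parse('no evidence of focal infiltrate, effusion or pneumothorax.'))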
123 |
124 | 7. Convert the parse tree to UD
125 | -------------------------------
126 |
127 | This step converts the parse tree to universal dependencies using the `Stanford converter <https://github.com/dmcc/PyStanfordDependencies>`_.
128 |
129 | .. code-block:: bash
130 |
131 | $ negbio_pipeline ptb2ud --output=$OUTPUT_DIR $INPUT_DIR/*.xml
132 |
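The conversion goes through the ``StanfordDependencies`` package (``pystanforddependencies`` in the environment files); a minimal standalone sketch, assuming Java is installed (the converter jar is downloaded on first use):

.. code-block:: python

   import StanfordDependencies

   sd = StanfordDependencies.get_instance(backend='subprocess')
   tokens = sd.convert_tree('(S1 (S (NP (DT no) (NN pneumothorax)) (. .)))')
   for token in tokens:
       print(token.index, token.form, token.head, token.deprel)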
133 |
134 | 8. Detect negative and uncertain findings
135 | -----------------------------------------
136 |
137 | This step detects negative and uncertain findings using patterns.
138 | By default, the program uses the negation and uncertainty patterns in the ``negbio/patterns`` folder.
139 | However, you are free to create your own patterns via the ``--neg-patterns`` and ``--uncertainty-patterns`` options.
140 | A pattern is a `semgrex-type <https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html>`_
141 | pattern for matching nodes in the dependency graph.
142 | Currently, we only support ``<`` and ``>`` operations.
143 | A detailed grammar specification (using PLY, Python Lex-Yacc) can be found in ``negbio/ngrex/parser.py``; an example pattern is shown below.
144 |
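For example, this rule from ``negbio/chexpert/patterns/negation.txt`` negates a finding that is the nominal subject of the lemma *absent* (as in "pneumothorax is absent"):

.. code-block:: text

   {} < {dependency:/nsubj/} {lemma:/absent/}
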
145 | .. code-block:: bash
146 |
147 | $ negbio_pipeline neg --output=$OUTPUT_DIR $INPUT_DIR/*.xml
148 |
149 | NegBio also integrates the CheXpert algorithms.
150 | Different from the original NegBio, CheXpert utilizes a 3-phase pipeline consisting of pre-negation uncertainty,
151 | negation, and post-negation uncertainty (`Irvin et al., 2019 <https://arxiv.org/abs/1901.07031>`_).
152 | Each phase consists of rules which are matched against the mention; if a match is found, then the mention is classified
153 | accordingly (as uncertain in the first or third phase, and as negative in the second phase).
154 | If a mention is not matched in any of the phases, it is classified as positive.
155 |
156 | Generally, CheXpert contains more rules and is more accurate than the original NegBio patterns.
157 |
158 | .. code-block:: bash
159 |
160 | $ negbio_pipeline neg_chexpert --output=$OUTPUT_DIR $INPUT_DIR/*.xml
161 |
162 | Similarly, you are free to create patterns via the ``--neg-patterns``, ``--pre-uncertainty-patterns``, and
163 | ``--post-uncertainty-patterns`` options.
164 |
165 | 9. Remove intermediate information
166 | ----------------------------------
167 |
168 | This step removes intermediate information (sentence annotations) from the BioC files.
169 |
170 | .. code-block:: bash
171 |
172 | $ negbio_pipeline cleanup --output=$OUTPUT_DIR $INPUT_DIR/*.xml
173 |
174 |
--------------------------------------------------------------------------------
/environment2.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio2.7
2 | channels:
3 | - anaconda
4 | - conda-forge
5 | - auto
6 | dependencies:
7 | - python=2.7.11
8 | - future=0.16.0
9 | - docutils=0.13.1
10 | - docopt=0.6.2
11 | - pytest=3.1.3
12 | - networkx=1.11
13 | - ply=3.10
14 | - tqdm=4.19.5
15 | - nltk=3.2.4
16 | - pathlib2=2.3.3
17 | - numpy=1.15.4
18 | - jpype1=0.6.3
19 | - pip:
20 | - bioc==1.1.dev3
21 | - pystanforddependencies==0.3.1
22 | - bllipparser==2016.9.11
23 | - pymetamap==0.1
24 |
--------------------------------------------------------------------------------
/environment3.7.yml:
--------------------------------------------------------------------------------
1 | name: negbio3.7
2 | channels:
3 | - anaconda
4 | - conda-forge
5 | - auto
6 | dependencies:
7 | - python=3.7
8 | - docutils=0.14
9 | - docopt=0.6.2
10 | - pytest=4.2.0
11 | - networkx=2.2
12 | - ply=3.11
13 | - tqdm=4.31
14 | - nltk=3.4
15 | - numpy=1.16
16 | - jpype1=0.6.3
17 | - pip:
18 | - bioc==1.3.1
19 | - pystanforddependencies==0.3.1
20 | - bllipparser==2016.9.11
21 | - pymetamap==0.1
22 |
--------------------------------------------------------------------------------
/examples/00000086.txt:
--------------------------------------------------------------------------------
1 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
2 | stable. lungs are unchanged. air- filled cystic changes. no
3 | pneumothorax. osseous structures unchanged scoliosis
4 | impression: stable chest.
5 | dictating
--------------------------------------------------------------------------------
/examples/00019248.txt:
--------------------------------------------------------------------------------
1 | findings:
2 | chest: four images:
3 | right picc with tip within the upper svc.
4 | probable enlargement of the main pulmonary artery.
5 | mild cardiomegaly.
6 | no evidence of focal infiltrate, effusion or pneumothorax.
7 | dictating
--------------------------------------------------------------------------------
/examples/1.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 2017-05-31
5 |
6 |
7 |
8 | 00019248
9 |
10 | 0
11 | findings:
12 | chest: four images:
13 | right picc with tip within the upper svc.
14 | probable enlargement of the main pulmonary artery.
15 | mild cardiomegaly.
16 | no evidence of focal infiltrate, effusion or pneumothorax.
17 | dictating
18 |
19 | Cardiomegaly
20 | C0018800
21 | MetaMap
22 | fndg
23 |
24 | Mild cardiomegaly.
25 |
26 |
27 | Infiltration
28 | C0332448
29 | MetaMap
30 | ftcn
31 |
32 | infiltrate
33 |
34 |
35 | effusion
36 | C0013687
37 | MetaMap
38 | patf
39 |
40 | effusion
41 |
42 |
43 | Pneumothorax
44 | C0032326
45 | MetaMap
46 | dsyn
47 |
48 | pneumothorax.
49 |
50 |
51 |
52 |
53 | 00000086
54 |
55 | 0
56 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
57 | stable. lungs are unchanged. air- filled cystic changes. no
58 | pneumothorax. osseous structures unchanged scoliosis
59 | impression: stable chest.
60 | dictating
61 |
62 | True
63 | Pneumothorax
64 | C0032326
65 | MetaMap
66 | dsyn
67 |
68 | pneumothorax
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/examples/2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 2017-05-31
5 |
6 |
7 | 00000086
8 |
9 | 0
10 | findings: pa and lat cxr at 7:34 p.m.. heart and mediastinum are
11 | stable. lungs are unchanged. air- filled cystic changes. no
12 | pneumothorax. osseous structures unchanged scoliosis
13 | impression: stable chest.
14 | dictating
15 |
16 | True
17 | Pneumothorax
18 | C0032326
19 | MetaMap
20 | dsyn
21 |
22 | pneumothorax
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/examples/cuis-cvpr2017.txt:
--------------------------------------------------------------------------------
1 | C0264494
2 | C0264496
3 | C0004144
4 | C0264495
5 | C0018800
6 | C0702116
7 | C0521530
8 | C0013604
9 | C0013608
10 | C0034063
11 | C0031039
12 | C0013687
13 | C0747635
14 | C1265808
15 | C0747639
16 | C0032227
17 | C0034067
18 | C0038536
19 | C0016059
20 | C0034069
21 | C0019270
22 | C3489393
23 | C0744895
24 | C0332448
25 | C0235896
26 | C0577559
27 | C3152252
28 | C0748419
29 | C1265602
30 | C0028259
31 | C0332558
32 | C0034079
33 | C0746923
34 | C0748164
35 | C0264545
36 | C1960024
37 | C0585104
38 | C0585105
39 | C0585106
40 | C0032285
41 | C0578577
42 | C0578576
43 | C0577702
44 | C0747651
45 | C0546333
46 | C0032326
47 | C0264557
48 | C0546334
49 |
--------------------------------------------------------------------------------
/examples/openi-testset.txt:
--------------------------------------------------------------------------------
1 | CXR10
2 | CXR1002
3 | CXR1007
4 | CXR1008
5 | CXR101
6 | CXR102
7 | CXR1020
8 | CXR1028
9 | CXR1042
10 | CXR105
11 | CXR1055
12 | CXR1056
13 | CXR1058
14 | CXR1062
15 | CXR1074
16 | CXR1076
17 | CXR1077
18 | CXR1078
19 | CXR1091
20 | CXR1092
21 | CXR1099
22 | CXR11
23 | CXR1102
24 | CXR1109
25 | CXR1118
26 | CXR112
27 | CXR1121
28 | CXR1138
29 | CXR1140
30 | CXR1159
31 | CXR1161
32 | CXR1163
33 | CXR1167
34 | CXR1169
35 | CXR1179
36 | CXR1190
37 | CXR1194
38 | CXR1199
39 | CXR1202
40 | CXR1206
41 | CXR1208
42 | CXR1210
43 | CXR1213
44 | CXR1218
45 | CXR1222
46 | CXR1226
47 | CXR1231
48 | CXR1239
49 | CXR124
50 | CXR1243
51 | CXR1248
52 | CXR1255
53 | CXR1258
54 | CXR1260
55 | CXR1265
56 | CXR1267
57 | CXR127
58 | CXR1270
59 | CXR1273
60 | CXR1286
61 | CXR1287
62 | CXR1288
63 | CXR1292
64 | CXR1295
65 | CXR1297
66 | CXR1314
67 | CXR1316
68 | CXR1332
69 | CXR1334
70 | CXR1374
71 | CXR1378
72 | CXR1391
73 | CXR1392
74 | CXR1396
75 | CXR1397
76 | CXR1398
77 | CXR1399
78 | CXR14
79 | CXR1401
80 | CXR1409
81 | CXR141
82 | CXR1411
83 | CXR1413
84 | CXR1439
85 | CXR144
86 | CXR1443
87 | CXR1444
88 | CXR1448
89 | CXR145
90 | CXR1452
91 | CXR1460
92 | CXR1461
93 | CXR1468
94 | CXR1481
95 | CXR1487
96 | CXR1497
97 | CXR1500
98 | CXR1510
99 | CXR1515
100 | CXR1518
101 | CXR1519
102 | CXR1527
103 | CXR1528
104 | CXR1529
105 | CXR153
106 | CXR154
107 | CXR1540
108 | CXR1548
109 | CXR1551
110 | CXR1556
111 | CXR1563
112 | CXR1568
113 | CXR1570
114 | CXR1576
115 | CXR1581
116 | CXR1583
117 | CXR1586
118 | CXR159
119 | CXR1593
120 | CXR1602
121 | CXR1605
122 | CXR1608
123 | CXR1614
124 | CXR1617
125 | CXR1624
126 | CXR1627
127 | CXR163
128 | CXR1632
129 | CXR1638
130 | CXR1639
131 | CXR1643
132 | CXR1647
133 | CXR166
134 | CXR1660
135 | CXR167
136 | CXR1671
137 | CXR1691
138 | CXR1709
139 | CXR1711
140 | CXR1716
141 | CXR1724
142 | CXR1725
143 | CXR1728
144 | CXR1729
145 | CXR1733
146 | CXR1734
147 | CXR1736
148 | CXR1738
149 | CXR1739
150 | CXR1740
151 | CXR1746
152 | CXR1756
153 | CXR1763
154 | CXR1764
155 | CXR1765
156 | CXR1766
157 | CXR1767
158 | CXR1773
159 | CXR1777
160 | CXR1783
161 | CXR1801
162 | CXR1806
163 | CXR1813
164 | CXR1814
165 | CXR1816
166 | CXR1823
167 | CXR1831
168 | CXR1832
169 | CXR1841
170 | CXR1845
171 | CXR1861
172 | CXR1868
173 | CXR1871
174 | CXR1877
175 | CXR1881
176 | CXR1883
177 | CXR1884
178 | CXR1892
179 | CXR1895
180 | CXR1896
181 | CXR190
182 | CXR1903
183 | CXR1904
184 | CXR1909
185 | CXR191
186 | CXR1912
187 | CXR1914
188 | CXR1920
189 | CXR1923
190 | CXR1926
191 | CXR1929
192 | CXR193
193 | CXR1934
194 | CXR194
195 | CXR1942
196 | CXR1944
197 | CXR1946
198 | CXR1951
199 | CXR1952
200 | CXR1954
201 | CXR1958
202 | CXR1960
203 | CXR1964
204 | CXR1965
205 | CXR1969
206 | CXR1972
207 | CXR1977
208 | CXR1978
209 | CXR1979
210 | CXR1992
211 | CXR1993
212 | CXR1994
213 | CXR1999
214 | CXR2011
215 | CXR2012
216 | CXR2014
217 | CXR2029
218 | CXR2032
219 | CXR2038
220 | CXR2039
221 | CXR204
222 | CXR2040
223 | CXR2050
224 | CXR2053
225 | CXR2059
226 | CXR2061
227 | CXR2062
228 | CXR2066
229 | CXR2067
230 | CXR207
231 | CXR2072
232 | CXR2080
233 | CXR2086
234 | CXR2087
235 | CXR2089
236 | CXR2098
237 | CXR21
238 | CXR211
239 | CXR2111
240 | CXR2114
241 | CXR2115
242 | CXR2126
243 | CXR2131
244 | CXR2140
245 | CXR2142
246 | CXR2145
247 | CXR2152
248 | CXR2162
249 | CXR2163
250 | CXR2165
251 | CXR2167
252 | CXR2170
253 | CXR2171
254 | CXR2172
255 | CXR2177
256 | CXR2183
257 | CXR2191
258 | CXR2195
259 | CXR2199
260 | CXR2202
261 | CXR2205
262 | CXR2210
263 | CXR2211
264 | CXR2221
265 | CXR2222
266 | CXR2225
267 | CXR2244
268 | CXR2247
269 | CXR2250
270 | CXR2257
271 | CXR2264
272 | CXR2265
273 | CXR2268
274 | CXR2275
275 | CXR2287
276 | CXR2288
277 | CXR2289
278 | CXR2301
279 | CXR2307
280 | CXR2308
281 | CXR2324
282 | CXR2326
283 | CXR233
284 | CXR235
285 | CXR2352
286 | CXR2353
287 | CXR2357
288 | CXR2360
289 | CXR2368
290 | CXR237
291 | CXR2371
292 | CXR2372
293 | CXR2378
294 | CXR2380
295 | CXR2382
296 | CXR2388
297 | CXR2392
298 | CXR2395
299 | CXR2396
300 | CXR2397
301 | CXR240
302 | CXR2409
303 | CXR2414
304 | CXR2419
305 | CXR242
306 | CXR2421
307 | CXR243
308 | CXR2430
309 | CXR2437
310 | CXR2438
311 | CXR2448
312 | CXR2450
313 | CXR2455
314 | CXR2460
315 | CXR2462
316 | CXR2463
317 | CXR2465
318 | CXR2472
319 | CXR2474
320 | CXR2482
321 | CXR2494
322 | CXR2495
323 | CXR2496
324 | CXR2497
325 | CXR2498
326 | CXR2499
327 | CXR2503
328 | CXR2506
329 | CXR2515
330 | CXR2516
331 | CXR2519
332 | CXR2523
333 | CXR2525
334 | CXR2526
335 | CXR2530
336 | CXR2533
337 | CXR2536
338 | CXR2540
339 | CXR2542
340 | CXR2547
341 | CXR2557
342 | CXR256
343 | CXR2573
344 | CXR2577
345 | CXR2582
346 | CXR2583
347 | CXR2585
348 | CXR2595
349 | CXR2601
350 | CXR2604
351 | CXR2607
352 | CXR2608
353 | CXR261
354 | CXR2610
355 | CXR2617
356 | CXR2619
357 | CXR2620
358 | CXR2622
359 | CXR2625
360 | CXR2629
361 | CXR2636
362 | CXR2642
363 | CXR2649
364 | CXR2654
365 | CXR2655
366 | CXR2673
367 | CXR2684
368 | CXR2688
369 | CXR2699
370 | CXR27
371 | CXR2714
372 | CXR2716
373 | CXR2730
374 | CXR2739
375 | CXR2752
376 | CXR2759
377 | CXR276
378 | CXR2768
379 | CXR2776
380 | CXR2780
381 | CXR2782
382 | CXR2791
383 | CXR28
384 | CXR2808
385 | CXR2817
386 | CXR2820
387 | CXR2824
388 | CXR2827
389 | CXR2832
390 | CXR2833
391 | CXR284
392 | CXR2847
393 | CXR2852
394 | CXR2856
395 | CXR2858
396 | CXR286
397 | CXR287
398 | CXR2871
399 | CXR2876
400 | CXR2879
401 | CXR288
402 | CXR2887
403 | CXR2890
404 | CXR29
405 | CXR2901
406 | CXR2906
407 | CXR2909
408 | CXR2911
409 | CXR2924
410 | CXR2926
411 | CXR2927
412 | CXR2931
413 | CXR2940
414 | CXR2942
415 | CXR2951
416 | CXR2960
417 | CXR2966
418 | CXR2968
419 | CXR2969
420 | CXR297
421 | CXR2979
422 | CXR2981
423 | CXR2992
424 | CXR2997
425 | CXR300
426 | CXR3008
427 | CXR3011
428 | CXR3012
429 | CXR3016
430 | CXR302
431 | CXR3034
432 | CXR3038
433 | CXR304
434 | CXR3045
435 | CXR3046
436 | CXR305
437 | CXR3050
438 | CXR3053
439 | CXR3056
440 | CXR3057
441 | CXR3063
442 | CXR307
443 | CXR3070
444 | CXR3071
445 | CXR3083
446 | CXR3084
447 | CXR309
448 | CXR3093
449 | CXR3094
450 | CXR310
451 | CXR3100
452 | CXR3101
453 | CXR3106
454 | CXR3109
455 | CXR3112
456 | CXR3121
457 | CXR3123
458 | CXR3132
459 | CXR3133
460 | CXR3135
461 | CXR3145
462 | CXR3152
463 | CXR3153
464 | CXR3154
465 | CXR3155
466 | CXR3156
467 | CXR3159
468 | CXR3163
469 | CXR3176
470 | CXR3177
471 | CXR3178
472 | CXR3184
473 | CXR3197
474 | CXR3199
475 | CXR3206
476 | CXR3208
477 | CXR3213
478 | CXR3218
479 | CXR3230
480 | CXR3238
481 | CXR3242
482 | CXR3254
483 | CXR3255
484 | CXR3257
485 | CXR326
486 | CXR3261
487 | CXR3262
488 | CXR3271
489 | CXR3272
490 | CXR3288
491 | CXR3290
492 | CXR3292
493 | CXR3296
494 | CXR33
495 | CXR3307
496 | CXR3315
497 | CXR3318
498 | CXR3319
499 | CXR332
500 | CXR3323
501 | CXR3325
502 | CXR3329
503 | CXR333
504 | CXR3332
505 | CXR3333
506 | CXR3337
507 | CXR334
508 | CXR3342
509 | CXR3355
510 | CXR3356
511 | CXR3368
512 | CXR3373
513 | CXR3395
514 | CXR3405
515 | CXR3410
516 | CXR3413
517 | CXR3416
518 | CXR3419
519 | CXR342
520 | CXR3428
521 | CXR3432
522 | CXR3437
523 | CXR3439
524 | CXR3443
525 | CXR3449
526 | CXR3451
527 | CXR3473
528 | CXR3477
529 | CXR3479
530 | CXR3485
531 | CXR349
532 | CXR3499
533 | CXR3514
534 | CXR3521
535 | CXR3523
536 | CXR3524
537 | CXR3525
538 | CXR353
539 | CXR3530
540 | CXR3539
541 | CXR3543
542 | CXR3559
543 | CXR3562
544 | CXR357
545 | CXR3575
546 | CXR358
547 | CXR3585
548 | CXR3586
549 | CXR3587
550 | CXR3589
551 | CXR3596
552 | CXR3599
553 | CXR36
554 | CXR3603
555 | CXR3606
556 | CXR3609
557 | CXR3610
558 | CXR3619
559 | CXR3623
560 | CXR3632
561 | CXR3640
562 | CXR3641
563 | CXR3645
564 | CXR3648
565 | CXR366
566 | CXR3661
567 | CXR3663
568 | CXR3666
569 | CXR3668
570 | CXR3670
571 | CXR3677
572 | CXR368
573 | CXR3683
574 | CXR3684
575 | CXR3685
576 | CXR3698
577 | CXR370
578 | CXR3700
579 | CXR3714
580 | CXR3715
581 | CXR3718
582 | CXR3726
583 | CXR3735
584 | CXR3741
585 | CXR3744
586 | CXR3747
587 | CXR3762
588 | CXR3777
589 | CXR3785
590 | CXR379
591 | CXR3792
592 | CXR3795
593 | CXR3798
594 | CXR38
595 | CXR3803
596 | CXR3806
597 | CXR3817
598 | CXR3825
599 | CXR383
600 | CXR3830
601 | CXR3832
602 | CXR3837
603 | CXR3838
604 | CXR3846
605 | CXR3847
606 | CXR3849
607 | CXR3851
608 | CXR3852
609 | CXR3858
610 | CXR3860
611 | CXR3865
612 | CXR3867
613 | CXR3869
614 | CXR3870
615 | CXR3879
616 | CXR3881
617 | CXR3885
618 | CXR3888
619 | CXR3898
620 | CXR3899
621 | CXR3901
622 | CXR3906
623 | CXR3908
624 | CXR3913
625 | CXR392
626 | CXR3921
627 | CXR3923
628 | CXR3925
629 | CXR3928
630 | CXR3934
631 | CXR3935
632 | CXR3937
633 | CXR3946
634 | CXR3948
635 | CXR3952
636 | CXR3963
637 | CXR398
638 | CXR399
639 | CXR3998
640 | CXR40
641 | CXR402
642 | CXR403
643 | CXR406
644 | CXR408
645 | CXR416
646 | CXR420
647 | CXR423
648 | CXR427
649 | CXR432
650 | CXR439
651 | CXR444
652 | CXR445
653 | CXR46
654 | CXR467
655 | CXR47
656 | CXR471
657 | CXR473
658 | CXR474
659 | CXR477
660 | CXR48
661 | CXR481
662 | CXR493
663 | CXR494
664 | CXR503
665 | CXR508
666 | CXR512
667 | CXR522
668 | CXR53
669 | CXR530
670 | CXR540
671 | CXR55
672 | CXR565
673 | CXR570
674 | CXR573
675 | CXR577
676 | CXR584
677 | CXR585
678 | CXR589
679 | CXR590
680 | CXR598
681 | CXR60
682 | CXR601
683 | CXR606
684 | CXR611
685 | CXR616
686 | CXR617
687 | CXR622
688 | CXR639
689 | CXR64
690 | CXR645
691 | CXR646
692 | CXR654
693 | CXR661
694 | CXR665
695 | CXR668
696 | CXR671
697 | CXR672
698 | CXR673
699 | CXR674
700 | CXR680
701 | CXR686
702 | CXR695
703 | CXR698
704 | CXR700
705 | CXR703
706 | CXR705
707 | CXR706
708 | CXR707
709 | CXR71
710 | CXR712
711 | CXR719
712 | CXR726
713 | CXR73
714 | CXR733
715 | CXR737
716 | CXR738
717 | CXR741
718 | CXR742
719 | CXR743
720 | CXR751
721 | CXR752
722 | CXR756
723 | CXR760
724 | CXR781
725 | CXR792
726 | CXR795
727 | CXR797
728 | CXR8
729 | CXR800
730 | CXR805
731 | CXR831
732 | CXR833
733 | CXR837
734 | CXR840
735 | CXR843
736 | CXR846
737 | CXR853
738 | CXR855
739 | CXR856
740 | CXR859
741 | CXR871
742 | CXR875
743 | CXR885
744 | CXR888
745 | CXR889
746 | CXR892
747 | CXR897
748 | CXR903
749 | CXR904
750 | CXR906
751 | CXR907
752 | CXR909
753 | CXR919
754 | CXR920
755 | CXR921
756 | CXR925
757 | CXR927
758 | CXR929
759 | CXR932
760 | CXR934
761 | CXR935
762 | CXR939
763 | CXR941
764 | CXR943
765 | CXR95
766 | CXR964
767 | CXR970
768 | CXR975
769 | CXR981
770 | CXR989
771 | CXR992
--------------------------------------------------------------------------------
/images/negbio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/images/negbio.png
--------------------------------------------------------------------------------
/negbio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/__init__.py
--------------------------------------------------------------------------------
/negbio/chexpert/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Stanford Machine Learning Group
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/negbio/chexpert/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The code and patterns in this package are built on the CheXpert labeler.
3 | https://github.com/stanfordmlgroup/chexpert-labeler
4 | """
5 |
--------------------------------------------------------------------------------
/negbio/chexpert/constants.py:
--------------------------------------------------------------------------------
1 | # Observation constants
2 | CARDIOMEGALY = "Cardiomegaly"
3 | ENLARGED_CARDIOMEDIASTINUM = "Enlarged Cardiomediastinum"
4 | SUPPORT_DEVICES = "Support Devices"
5 | NO_FINDING = "No Finding"
6 | OBSERVATION = "observation"
7 | CATEGORIES = ["No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly",
8 | "Lung Lesion", "Airspace Opacity", "Edema", "Consolidation",
9 | "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion",
10 | "Pleural Other", "Fracture", "Support Devices"]
11 |
12 | # Numeric constants
13 | POSITIVE = 1
14 | NEGATIVE = 0
15 | UNCERTAIN = -1
16 |
17 | # Misc. constants
18 | UNCERTAINTY = "uncertainty"
19 | NEGATION = "negation"
20 | REPORTS = "Reports"
21 |
--------------------------------------------------------------------------------
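For orientation, these constants combine into the per-report label vectors used downstream (see stages/aggregate.py): one entry per category, in CATEGORIES order. A minimal sketch of the convention, with illustrative values only:

    import numpy as np
    from negbio.chexpert.constants import CATEGORIES, POSITIVE, NEGATIVE, UNCERTAIN

    # np.nan -> category never mentioned; 1 / 0 / -1 -> positive / negative / uncertain
    row = [np.nan] * len(CATEGORIES)
    row[CATEGORIES.index("Cardiomegaly")] = POSITIVE      # 1
    row[CATEGORIES.index("Pleural Effusion")] = NEGATIVE  # 0
    row[CATEGORIES.index("Edema")] = UNCERTAIN            # -1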
/negbio/chexpert/patterns/negation.txt:
--------------------------------------------------------------------------------
1 | # No definite XXX
2 | ({} > {} {lemma:/definite/}) > {dependency:/neg/} {}
3 |
4 | # No obvious XXX
5 | ({} > {} {lemma:/obvious/}) > {dependency:/neg/} {}
6 |
7 |
8 | {} > {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/}
9 | {} < {dependency:/amod|nsubj/} {lemma:/normal|unremarkable/}
10 | ({} > {} {}) < {dependency:/nsubj|dobj/} {lemma:/unremarkable|normal/}
11 | {} < {} ({} > {dependency:/amod/} {lemma:/normal|unremarkable/})
12 | {} < {} ({} < {dependency:/nsubj/} {lemma:/normal|unremarkable/})
13 | {} < {dependency:/conj:no/} {}
14 | {} < {} ({} < {dependency:/conj:or/} ({} > {} {lemma:/no/}))
15 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})
16 | {} < {} ({dependency:/exclude/} < {} ({} > {} {lemma:/no/}))
17 |
18 |
19 | ({lemma:/silhouette/} > {} {}) < {dependency:/dobj|nsubj/} {lemma:/obscure/}
20 |
21 | ({} > {dependency:/amod/} {lemma:/normal|unremarkable/}) < {dependency:/dobj|nsubj/} {lemma:/demonstrate.*|show|present|display/}
22 | {} < {dependency:/nmod:of/} ( {lemma:/appearance/} > {dependency:/amod/} {lemma:/normal/} & < {dependency:/dobj/} {lemma:/demonstrate.*|show|present|display/})
23 |
24 | {} < {dependency:/amod/} ({} < {dependency:/dep|nsubj/} {lemma:/normal|unremarkable/})
25 | {} < {dependency:/amod/} ({} > {dependency:/neg/} {lemma:/no/})
26 | {} < {dependency:/amod/} ({lemma:/finding.*/} < {dependency:/dobj/} ({lemma:/acute/} > {dependency:/nsubj/} {lemma:/no/}))
27 | {} < {dependency:/amod/} ({lemma:/structure.*/} < {dependency:/dep|nsubj/} ({lemma:/appear/} > {dependency:/xcomp/} {lemma:/normal|unremarkable/}))
28 |
29 | {} < {dependency:/compound/} ({} > {dependency:/neg/} {})
30 | {} < {dependency:/nsubj/} {lemma:/absent/}
31 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/case/} {lemma:/without/}))
32 | {} < {dependency:/amod/} ({} < {dependency:/nmod:of/} ({lemma:/evidence/} > {dependency:/neg/} {}))
33 |
34 | # XXX within normal limits
35 | {} < {} ({} < {} ({lemma:/show|demonstrate|present/} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/})))
36 | ({} > {} {}) > {dependency:/nmod:within/} {lemma:/limit.*/}
37 | {} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/})
38 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {} {lemma:/upper/} & > {dependency:/nmod:of/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/}))
39 | {} < {} ({} < {dependency:/nsubj/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/} & > {dependency:/case/} {lemma:/at|within/}))
40 | ({lemma:/vascularity/} > {dependency:/amod/} {lemma:/pulmonary/}) > {dependency:/amod/} {lemma:/normal/}
41 | {} < {dependency:/dobj|nsubj/} ({} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {} {lemma:/normal/}))
42 | {} > {dependency:/nmod:within/} ({lemma:/limit.*/} > {dependency:/amod/} {lemma:/normal/})
43 | {} > {} ({lemma:/limit/} > {} {lemma:/normal/})
44 |
45 | # XXX is/appears/are/appear/remain/remains (now, otherwise) normal/unremarkable
46 | {} < {} ({lemma:/appear|remain/} > {} {lemma:/normal|unremarkable/})
47 |
48 | # XXX is/appears/are/appear/remain/remains (now, otherwise) within normal limits
49 | {} > {} ({lemma:/remain|appear/} > {} ({lemma:/limit/} > {} {lemma:/normal/}))
50 |
51 |
52 | # rather than XXX / without XXX
53 | {} <{dependency:/conj:negcc/} {}
54 | {} <{dependency:/nmod:without/} {}
55 |
56 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key
57 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key
58 |
59 | # removal of XXX
60 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|removal/}
61 | {} <{dependency:/nmod:for/} {lemma:/negative/}
62 |
63 | # exclude XXX
64 | {} <{} {lemma:/exclude/}
65 |
66 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/}
67 |
68 | # XXX has resolved
69 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {})
70 |
71 | # there is no XXX
72 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})
73 |
74 | # without evidence|finding of|for XXX
75 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {})
76 |
77 | # without development of XXX
78 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/without/})
79 |
80 | # No development of XXX
81 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {} {lemma:/no/})
82 |
83 | # no evidence of|for XXX
84 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {})
85 |
86 | # without evidence|finding of|for XXX
87 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/})
88 |
89 | # no focus of XXX
90 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {})
91 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/})
92 |
93 | # no moderate to XXX
94 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {})
95 |
96 | # no evidence of developing XXX
97 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {}))
98 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/}))
99 |
100 | # no focal XXX
101 | {} <{dependency:/dobj/} ({} >{dependency:/nsubj/} {lemma:/no/})
102 |
103 | # XXX is previously demonstrated/visualized
104 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/})
105 |
106 | # there is no NN to suggest/explain XXX
107 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/V.*/} > {} ({tag:/N.*/} > {} {lemma:/no/})))
108 |
109 | # no NN to suggest/explain XXX
110 | {} < {} ({lemma:/suggest|explain|diagnose/} < {} ({tag:/N.*/} > {} {lemma:/no/}))
111 |
112 | # no area of XXX
113 | {} < {dependency:/nmod:of/} ({lemma:/area/} > {dependency:/compound/} {lemma:/no/})
114 |
115 | # XXX is not enlarged
116 | {} < {dependency:/nsubjpass/} ({lemma:/enlarge/} > {dependency:/neg/} {})
117 |
118 | # without development of XXX
119 | {} < {dependency:/nmod:of/} ({lemma:/development/} > {dependency:/case/} {lemma:/without/})
120 |
121 | # XXX removed
122 | {} < {} {lemma:/remove/}
123 | {} > {} {lemma:/remove/}
124 |
125 | # XXX is no longer seen
126 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} ({} > {dependency:/neg/} {lemma:/no/}))
127 | {} < {dependency:/nsubjpass/} ({lemma:/see/} > {} {lemma:/no/})
128 |
129 | # without evidence seen for XXX
130 | {} < {} ({lemma:/see/} > {} ({} > {} ({lemma:/evidence/} > {} {lemma:/without/})))
131 | {} < {} ({lemma:/see/} > {} ({lemma:/evidence/} > {} {lemma:/without/}))
132 |
133 | # normal/unremarkable appearance of XXX
134 | {} < {} ({lemma:/appearance/} > {} {lemma:/normal|unremarkable/})
135 |
136 | # normal/unremarkable XXX | XXX is/appears normal/unremarkable
137 | # make more general
138 | {} > {} {lemma:/normal|unremarkable/}
139 | {} < {} {lemma:/normal|unremarkable/}
140 |
141 | # XXX has/have cleared
142 | # cleared XXX
143 | {} < {} {lemma:/clear/}
144 | {} > {} {lemma:/clear/}
145 |
146 | # no obvious associated XXX
147 | {} < {} ({lemma:/associate.*/} > {} ({lemma:/obvious/} > {dependency:/neg/} {}))
148 | {} > {dependency:/neg/} {} & > {} {lemma:/obvious/} & > {} {lemma:/associate.*/}
149 |
150 | # XXX with interval resolution
151 | {} > {} ({lemma:/resolution/} > {} {lemma:/interval/})
152 |
153 | # no XXX / general negative case
154 | {} >{dependency:/neg/} {}
155 | {} >{} {lemma:/no/}
156 | {} >{dependency:/case/} {lemma:/without/}
157 |
--------------------------------------------------------------------------------
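These rules are ngrex graph patterns, matched against each sentence's universal dependency graph rather than its surface text. A minimal sketch of how a single rule from this file is applied, assuming `sentence` is a BioCSentence that already carries dependency annotations (produced by the parse and ptb2ud stages):

    from negbio import ngrex
    from negbio.neg import semgraph, propagator

    # "there is no XXX": the mention's nsubj head is lemma 'be' with a 'no' child
    pattern = ngrex.compile('{} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})')

    g = semgraph.load(sentence)  # build the dependency graph of the sentence
    propagator.propagate(g)      # augment the graph, as the detectors do before matching
    for m in pattern.finditer(g):
        print(m.group(0))        # the node matched by the first, unnamed {} term

In the detectors, a mention counts as negated when m.group(0) coincides with a node whose span overlaps the mention.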
/negbio/chexpert/patterns/post_negation_uncertainty.txt:
--------------------------------------------------------------------------------
1 | # Added Rules
2 |
3 | # Stable/unchanged silhouette/cardiomediastinal
4 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {dependency:/amod/} {lemma:/stable|unchanged/}
5 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/change/} > {dependency:/neg/} {})
6 |
7 | # Silhouette/cardiomediastinal is stable|unchanged|not changed
8 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nsubj/} {lemma:/stable|unchanged/}
9 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/change/} > {dependency:/neg/} {})
10 |
11 | # {} < {} ({lemma:/change/} > {dependency:/neg/} {})
12 |
13 | # Silhouette/cardiomediastinal similar to prior
14 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/})
15 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/}))
16 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} {lemma:/prior/})
17 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} > {} ({lemma:/similar/} > {dependency:/nmod:to/} ({} >{} {lemma:/prior/}))
18 |
19 | # Stable appearance of silhouette/cardiomediastinal
20 | {lemma:/cardiomediastinal|pericardial|mediastinal|mediastinum|cardiomediastinum|contour|configuration|silhouette|size|heart|shadow/} < {dependency:/nmod:of/} ({lemma:/appearance/} > {} {lemma:/stable/})
21 |
22 | {} < {} ({lemma:/excluded/} > {dependency:/neg/} {})
23 | {} < {dependency:/nmod:for/} {lemma:/suspicious/}
24 | {} < {dependency:/dobj/} ({lemma:/represent/} > {dependency:/advmod/} {lemma:/possibly/})
25 | {} > {dependency:/cc/} {lemma:/and.or/}
26 | {} < {dependency:/conj:and.or/} {}
27 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/}
28 |
29 | {} < {dependency:/dep/} ({} > {dependency:/acl:relcl/} ({lemma:/represent/} < {dependency:/aux/} {lemma:/may/}))
30 | {} < {dependency:/nmod:for/} {lemma:/worrisome/}
31 |
32 | # XXX versus YYY
33 | {} < {dependency:/conj:versus/} {}
34 | {} > {dependency:/conj:versus/} {}
35 |
36 | # {} < {dependency:/nsubjpass/} ({lemma:/change/} > {dependency:/neg/} {})
37 | ({lemma:/angle/} > {dependency:/nsubj/} {lemma:/costophrenic/}) > {dependency:/nmod:of/} {lemma:/blunt.*/}
38 | {} < {dependency:/nsubj/} ({} > {} ({lemma:/likely/} > {} {lemma:/less/}))
39 |
40 | {} < {dependency:/nmod:out/} {lemma:/cannot/}
41 |
42 | # outgoing edge
43 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/}
44 | {} >{} {lemma:/question/}
45 |
46 | # May/might/would/could be XXX
47 | {} > {} {lemma:/may|might|would|could/}
48 |
49 | # '{} >{dependency:/cop/} {lemma:/may|would|could/}
50 |
51 | # incoming edge
52 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/}
53 | {} <{dependency:/dobj/} {lemma:/suspect|favor|question|consider/}
54 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/}
55 | {} <{dependency:/nsubjpass/} {lemma:/suspect/}
56 | {} <{} {lemma:/possible/}
57 |
58 | # parsing error
59 | # suspected XXX
60 | {} <{dependency:/dobj/} {lemma:/suspect/}
61 | {} >{dependency:/advmod/} {lemma:/suspect/}
62 |
63 | # maybe due to XXX
64 | {} <{dependency:/dep/} {lemma:/maybe/}
65 |
66 | # may/could represent/reflect/indicate/include XXX
67 | {} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/})
68 |
69 | # may/could represent/reflect/indicate/include the presence of XXX
70 | {} < {} ({lemma:/presence/} <{dependency:/dobj/} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would|might|possibly|can/}))
71 |
72 | # maybe secondary to XXX
73 | {} <{dependency:/nmod:to/} {lemma:/secondary/}
74 |
75 | # may be due to XXX
76 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/})
77 |
78 | # could be related to XXX
79 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/})
80 |
81 | # may be compatible with XXX
82 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/})
83 |
84 | # question left XXX
85 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/})
86 | {} >{} {lemma:/left/} <{} {lemma:/question/}
87 |
88 | # differential diagnosis includes
89 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/}))
90 |
91 | # may be XXX
92 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/}
93 |
94 | # parsing error
95 | # XXX suspected
96 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}
97 |
98 | # Correlation for symptoms of XXX
99 | {} < {dependency:/nmod:of/} ({lemma:/symptom/} < {dependency:/nmod:for/} {lemma:/correlation/})
100 |
101 | # borderline heart size
102 | {lemma:/heart/} < {dependency:/compound/} ({lemma:/size/} > {} {lemma:/borderline/})
103 |
104 | # XXX could/might/may/possibly be present
105 | {} < {} ({lemma:/present/} > {dependency:/aux/} {lemma:/could|might|may|possibly|can/})
106 |
107 | # XXX is poorly evaluated
108 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/poorly/})
109 |
110 | # XXX is incompletely evaluated
111 | {} < {} ({lemma:/evaluate/} > {dependency:/advmod/} {lemma:/incompletely/})
112 |
113 | # XXX is not well visualized/evaluated
114 | {} < {} ({lemma:/evaluate|visualize/} >{dependency:/neg/} {})
115 | {} > {} ({lemma:/evaluate|visualize/} > {dependency:/neg/} {})
116 |
117 | # obscuring the XXX | XXX is obscured | obscured XXX
118 | {} < {} {lemma:/obscure/}
119 |
120 | # XXX could appear
121 | {} < {dependency:/nsubj/} ({lemma:/appear/} > {} {lemma:/could|may|might|can/})
122 |
123 | # may be consistent/compatible with XXX
124 | {} < {dependency:/nmod:with/} ({lemma:/consistent/} > {} {lemma:/may|might|can|could/})
125 |
126 | # correlate clinically for XXX
127 | {} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/})
128 |
129 | # correlate clinically for evidence of XXX
130 | {} < {dependency:/nmod:of/} ({lemma:/evidence|sign|signs|symptoms|symptom/} < {dependency:/nmod:for/} ({lemma:/correlate/} > {dependency:/advmod/} {lemma:/clinically/}))
131 |
132 | # XXX are not clearly seen
133 | {} < {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/})
134 | {} > {} (({lemma:/see/} > {dependency:/neg/} {}) > {} {lemma:/clearly/})
135 |
136 | # possibly reflecting a XXX
137 | {} < {} ({lemma:/reflect/} > {} {lemma:/possibly/})
138 |
139 | # XXX was not appreciated
140 | {} < {} ({lemma:/appreciate/} > {dependency:/neg/} {})
141 |
142 | # XXX may|might|could (also) have this appearance
143 | {} < {} (({lemma:/have/} > {} {lemma:/may|might|could/}) > {} {lemma:/appearance/})
144 |
145 | # vascular congestion
146 | # pulmonary congestion
147 | # indistinctness
148 | # vascular prominence
149 | {lemma:/congestion/} > {} {lemma:/vascular/}
150 | {lemma:/congestion/} > {} {lemma:/pulmonary/}
151 | {lemma:/indistinctness/}
152 | {lemma:/prominence/} > {} {lemma:/vascular/}
153 |
154 | # XXX or YYY
155 | {} > {dependency:/conj:or/} {}
156 | {} < {dependency:/conj:or/} {}
157 |
158 |
--------------------------------------------------------------------------------
/negbio/chexpert/patterns/pre_negation_uncertainty.txt:
--------------------------------------------------------------------------------
1 | # Reserved for uncertainty rules that need to be matched first.
2 |
3 | # cannot exclude some XXX
4 | {} < {} ({lemma:/exclude/} >{} {lemma:/cannot/})
5 |
6 | # XXX is not excluded
7 | {} < {} ({lemma:/exclude/} > {dependency:/neg/} {})
8 |
9 | # no new XXX
10 | {} > {} {lemma:/new/} & > {dependency:/neg/} {lemma:/no/}
11 | {} < {} ({lemma:/new/} > {} {lemma:/no/})
12 | {} < {dependency:/compound/} ({} > {} {lemma:/new/} & > {} {lemma:/no/})
13 |
14 | # no new area of XXX
15 | {} < {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/})
16 | {} > {} ({lemma:/area/} > {} {lemma:/no/} > {} {lemma:/new/})
17 |
18 | # cannot rule out XXX
19 | {} <{dependency:/nmod:out/} ({lemma:/rule/} > {} {lemma:/cannot/})
20 |
21 | # no evidence to rule out XXX
22 |
23 | {} < {dependency:/nmod:out/} ({lemma:/rule/} < {} ({lemma:/evidence/} > {} {lemma:/no/}))
24 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/airspace_opacity.txt:
--------------------------------------------------------------------------------
1 | opaci
2 | decreased translucency
3 | increased density
4 | airspace disease
5 | air-space disease
6 | air space disease
7 | infiltrate
8 | infiltration
9 | interstitial marking
10 | interstitial pattern
11 | interstitial lung
12 | reticular pattern
13 | reticular marking
14 | reticulation
15 | parenchymal scarring
16 | peribronchial thickening
17 | wall thickening
18 | scar
19 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/atelectasis.txt:
--------------------------------------------------------------------------------
1 | atelecta
2 | collapse
3 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/cardiomegaly.txt:
--------------------------------------------------------------------------------
1 | cardiomegaly
2 | the heart
3 | heart size
4 | cardiac enlargement
5 | cardiac size
6 | cardiac shadow
7 | cardiac contour
8 | cardiac silhouette
9 | enlarged heart
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/consolidation.txt:
--------------------------------------------------------------------------------
1 | consolidat
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/edema.txt:
--------------------------------------------------------------------------------
1 | edema
2 | heart failure
3 | chf
4 | vascular congestion
5 | pulmonary congestion
6 | indistinctness
7 | vascular prominence
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/enlarged_cardiomediastinum.txt:
--------------------------------------------------------------------------------
1 | _mediastinum
2 | cardiomediastinum
3 | contour
4 | mediastinal configuration
5 | mediastinal silhouette
6 | pericardial silhouette
7 | cardiac silhouette and vascularity
8 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/fracture.txt:
--------------------------------------------------------------------------------
1 | fracture
2 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/lung_lesion.txt:
--------------------------------------------------------------------------------
1 | mass
2 | nodular density
3 | nodular densities
4 | nodular opacity
5 | nodular opacities
6 | nodular opacification
7 | nodule
8 | lump
9 | cavitary lesion
10 | carcinoma
11 | neoplasm
12 | tumor
13 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/no_finding.txt:
--------------------------------------------------------------------------------
1 | emphysema
2 | blunt
3 | density
4 | elevation
5 | eventration
6 | scoliosis
7 | degenera
8 | calcifi
9 | hyperinflation
10 | bronchospasm
11 | asthma
12 | hernia
13 | copd
14 | interstitial markings
15 | plaque
16 | osteophytosis
17 | aortic disease
18 | bronchiolitis
19 | airways disease
20 | thickening
21 | cephalization
22 | aspiration
23 | bullae
24 | hyperinflat
25 | contusion
26 | atherosclero
27 | osteopenia
28 | metastasis
29 | granuloma
30 | pneumomediastinum
31 | pneumoperitoneum
32 | osteodystrophy
33 | cuffing
34 | irregular lucency
35 | inflam
36 | fissure
37 | hypertension
38 | prominen
39 | kyphosis
40 | defib
41 | hyperexpansion
42 | bullet
43 | reticula
44 | thoracentesis
45 | bronchitis
46 | volume loss
47 | deformity
48 | hemorrhage
49 | hematoma
50 | radiopaque
51 | aerophagia
52 | arthropathy
53 | tracheostomy
54 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pleural_effusion.txt:
--------------------------------------------------------------------------------
1 | pleural fluid
2 | effusion
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pleural_other.txt:
--------------------------------------------------------------------------------
1 | pleural thickening
2 | fibrosis
3 | fibrothorax
4 | pleural scar
5 | pleural parenchymal scar
6 | pleuro-parenchymal scar
7 | pleuro-pericardial scar
8 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pneumonia.txt:
--------------------------------------------------------------------------------
1 | pneumonia
2 | infection
3 | infectious process
4 | infectious
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/pneumothorax.txt:
--------------------------------------------------------------------------------
1 | pneumothorax
2 | pneumothoraces
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/mention/support_devices.txt:
--------------------------------------------------------------------------------
1 | pacer
2 | _line_
3 | lines
4 | picc
5 | tube
6 | valve
7 | catheter
8 | pacemaker
9 | hardware
10 | arthroplast
11 | marker
12 | icd
13 | defib
14 | device
15 | drain_
16 | plate
17 | screw
18 | cannula
19 | apparatus
20 | coil
21 | support
22 | equipment
23 | mediport
24 |
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/airspace_opacity.txt:
--------------------------------------------------------------------------------
1 | pleural scar
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/lung_lesion.txt:
--------------------------------------------------------------------------------
1 | calcified nodul
2 | massive
3 | massengale
--------------------------------------------------------------------------------
/negbio/chexpert/phrases/unmention/pleural_effusion.txt:
--------------------------------------------------------------------------------
1 | pericardial effusion
--------------------------------------------------------------------------------
/negbio/chexpert/stages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/chexpert/stages/__init__.py
--------------------------------------------------------------------------------
/negbio/chexpert/stages/aggregate.py:
--------------------------------------------------------------------------------
1 | """Define mention aggregator class."""
2 | import numpy as np
3 | from tqdm import tqdm
4 |
5 | from negbio.chexpert.constants import NEGATIVE, UNCERTAIN, POSITIVE, SUPPORT_DEVICES, NO_FINDING, OBSERVATION, \
6 | NEGATION, UNCERTAINTY, CARDIOMEGALY
7 |
8 |
9 | class Aggregator(object):
10 | """Aggregate mentions of observations from radiology reports."""
11 |
12 | def __init__(self, categories, verbose=False):
13 | self.categories = categories
14 |
15 | self.verbose = verbose
16 |
17 | def dict_to_vec(self, d):
18 | """
19 | Convert a dictionary of the form
20 |
21 | {cardiomegaly: [1],
22 | opacity: [u, 1],
23 | fracture: [0]}
24 |
25 | into a vector of the form
26 |
27 | [np.nan, np.nan, 1, u, np.nan, ..., 0, np.nan]
28 | """
29 | vec = []
30 | for category in self.categories:
31 | # There was a mention of the category.
32 | if category in d:
33 | label_list = d[category]
34 | # Only one label, no conflicts.
35 | if len(label_list) == 1:
36 | vec.append(label_list[0])
37 | # Multiple labels.
38 | else:
39 | # Case 1. There is negated and uncertain.
40 | if NEGATIVE in label_list and UNCERTAIN in label_list:
41 | vec.append(UNCERTAIN)
42 | # Case 2. There is negated and positive.
43 | elif NEGATIVE in label_list and POSITIVE in label_list:
44 | vec.append(POSITIVE)
45 | # Case 3. There is uncertain and positive.
46 | elif UNCERTAIN in label_list and POSITIVE in label_list:
47 | vec.append(POSITIVE)
48 | # Case 4. All labels are the same.
49 | else:
50 | vec.append(label_list[0])
51 |
52 | # No mention of the category
53 | else:
54 | vec.append(np.nan)
55 |
56 | return vec
57 |
58 | def aggregate(self, collection):
59 | labels = []
60 | documents = collection.documents
61 | if self.verbose:
62 | print("Aggregating mentions...")
63 | documents = tqdm(documents)
64 | for document in documents:
65 | label_dict = {}
66 | impression_passage = document.passages[0]
67 | no_finding = True
68 | for annotation in impression_passage.annotations:
69 | category = annotation.infons[OBSERVATION]
70 |
71 | if NEGATION in annotation.infons:
72 | label = NEGATIVE
73 | elif UNCERTAINTY in annotation.infons:
74 | label = UNCERTAIN
75 | else:
76 | label = POSITIVE
77 |
78 | # If at least one non-support category has an uncertain or
79 | # positive label, there was a finding
80 | if (category != SUPPORT_DEVICES and
81 | label in [UNCERTAIN, POSITIVE]):
82 | no_finding = False
83 |
84 | # Don't add any labels for No Finding
85 | if category == NO_FINDING:
86 | continue
87 |
88 | # add exception for 'chf' and 'heart failure'
89 | if ((label in [UNCERTAIN, POSITIVE]) and
90 | (annotation.text == 'chf' or
91 | annotation.text == 'heart failure')):
92 | if CARDIOMEGALY not in label_dict:
93 | label_dict[CARDIOMEGALY] = [UNCERTAIN]
94 | else:
95 | label_dict[CARDIOMEGALY].append(UNCERTAIN)
96 |
97 | if category not in label_dict:
98 | label_dict[category] = [label]
99 | else:
100 | label_dict[category].append(label)
101 |
102 | if no_finding:
103 | label_dict[NO_FINDING] = [POSITIVE]
104 |
105 | label_vec = self.dict_to_vec(label_dict)
106 |
107 | labels.append(label_vec)
108 |
109 | return np.array(labels)
110 |
111 |
112 | class NegBioAggregator(Aggregator):
113 | LABEL_MAP = {UNCERTAIN: 'Uncertain', POSITIVE: 'Positive', NEGATIVE: 'Negative'}
114 |
115 | def aggregate_doc(self, document):
116 | """
117 | Aggregate mentions of observations from radiology reports.
118 |
119 | Args:
120 | document (BioCDocument):
121 |
122 | Returns:
123 | BioCDocument
124 | """
125 | label_dict = {}
126 | no_finding = True
127 | for passage in document.passages:
128 | for annotation in passage.annotations:
129 | category = annotation.infons[OBSERVATION]
130 |
131 | if NEGATION in annotation.infons:
132 | label = NEGATIVE
133 | elif UNCERTAINTY in annotation.infons:
134 | label = UNCERTAIN
135 | else:
136 | label = POSITIVE
137 |
138 | # If at least one non-support category has an uncertain or
139 | # positive label, there was a finding
140 | if category != SUPPORT_DEVICES \
141 | and label in [UNCERTAIN, POSITIVE]:
142 | no_finding = False
143 |
144 | # Don't add any labels for No Finding
145 | if category == NO_FINDING:
146 | continue
147 |
148 | # add exception for 'chf' and 'heart failure'
149 | if label in [UNCERTAIN, POSITIVE] \
150 | and (annotation.text == 'chf' or annotation.text == 'heart failure'):
151 | if CARDIOMEGALY not in label_dict:
152 | label_dict[CARDIOMEGALY] = [UNCERTAIN]
153 | else:
154 | label_dict[CARDIOMEGALY].append(UNCERTAIN)
155 |
156 | if category not in label_dict:
157 | label_dict[category] = [label]
158 | else:
159 | label_dict[category].append(label)
160 |
161 | if no_finding:
162 | label_dict[NO_FINDING] = [POSITIVE]
163 |
164 | for category in self.categories:
165 | key = 'CheXpert/{}'.format(category)
166 | # There was a mention of the category.
167 | if category in label_dict:
168 | label_list = label_dict[category]
169 | # Only one label, no conflicts.
170 | if len(label_list) == 1:
171 | document.infons[key] = self.LABEL_MAP[label_list[0]]
172 | # Multiple labels.
173 | else:
174 | # Case 1. There is negated and uncertain.
175 | if NEGATIVE in label_list and UNCERTAIN in label_list:
176 | document.infons[key] = self.LABEL_MAP[UNCERTAIN]
177 | # Case 2. There is negated and positive.
178 | elif NEGATIVE in label_list and POSITIVE in label_list:
179 | document.infons[key] = self.LABEL_MAP[POSITIVE]
180 | # Case 3. There is uncertain and positive.
181 | elif UNCERTAIN in label_list and POSITIVE in label_list:
182 | document.infons[key] = self.LABEL_MAP[POSITIVE]
183 | # Case 4. All labels are the same.
184 | else:
185 | document.infons[key] = self.LABEL_MAP[label_list[0]]
186 |
187 | # No mention of the category
188 | else:
189 | pass
190 | return document
191 |
--------------------------------------------------------------------------------
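The conflict-resolution rules above can be exercised directly through dict_to_vec: negative together with uncertain yields uncertain, and anything together with positive yields positive. A small sketch:

    from negbio.chexpert.constants import CATEGORIES, POSITIVE, NEGATIVE, UNCERTAIN
    from negbio.chexpert.stages.aggregate import Aggregator

    agg = Aggregator(CATEGORIES)
    vec = agg.dict_to_vec({
        'Cardiomegaly': [POSITIVE],         # single label, kept as-is
        'Edema': [NEGATIVE, UNCERTAIN],     # conflict -> UNCERTAIN
        'Pneumonia': [NEGATIVE, POSITIVE],  # conflict -> POSITIVE
    })
    # vec has len(CATEGORIES) entries; categories absent from the dict are np.nan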
/negbio/chexpert/stages/classify.py:
--------------------------------------------------------------------------------
1 | """Define mention classifier class.
2 |
3 | Author: stanfordmlgroup
4 | Modified by: Yifan Peng
5 | """
6 | import logging
7 |
8 | from negbio import ngrex
9 | from negbio.chexpert.constants import *
10 | from negbio.neg import semgraph, propagator, neg_detector
11 |
12 |
13 | class ModifiedDetector(neg_detector.Detector):
14 | """Child class of NegBio Detector class.
15 |
16 | Overrides parent methods __init__, detect, and match_uncertainty, and adds match_prenegation_uncertainty.
17 | """
18 |
19 | def __init__(self, pre_negation_uncertainty_path,
20 | negation_path, post_negation_uncertainty_path):
21 | super(ModifiedDetector, self).__init__(negation_path, post_negation_uncertainty_path)
22 | self.preneg_uncertain_patterns = ngrex.load(pre_negation_uncertainty_path)
23 |
24 | def detect(self, sentence, locs):
25 | """Detect rules in report sentences.
26 |
27 | Args:
28 | sentence(BioCSentence): a sentence with universal dependencies
29 | locs(list): a list of (begin, end)
30 |
31 | Yields:
32 | (str, MatcherObj, (begin, end)): negation or uncertainty,
33 | matcher, matched annotation
34 | """
35 | try:
36 | g = semgraph.load(sentence)
37 | propagator.propagate(g)
38 | except Exception:
39 | logging.exception('Cannot parse dependency graph [offset=%s]', sentence.offset)
40 | raise
41 | else:
42 | for loc in locs:
43 | for node in neg_detector.find_nodes(g, loc[0], loc[1]):
44 | # Match pre-negation uncertainty rules first.
45 | preneg_m = self.match_prenegation_uncertainty(g, node)
46 | if preneg_m:
47 | yield UNCERTAINTY, preneg_m, loc
48 | else:
49 | # Then match negation rules.
50 | neg_m = self.match_neg(g, node)
51 | if neg_m:
52 | yield NEGATION, neg_m, loc
53 | else:
54 | # Finally match post-negation uncertainty rules.
55 | postneg_m = self.match_uncertainty(g, node)
56 | if postneg_m:
57 | yield UNCERTAINTY, postneg_m, loc
58 |
59 | def match_uncertainty(self, graph, node):
60 | for pattern in self.uncertain_patterns:
61 | for m in pattern.finditer(graph):
62 | n0 = m.group(0)
63 | if n0 == node:
64 | return m
65 |
66 | def match_prenegation_uncertainty(self, graph, node):
67 | for pattern in self.preneg_uncertain_patterns:
68 | for m in pattern.finditer(graph):
69 | n0 = m.group(0)
70 | if n0 == node:
71 | return m
72 |
73 |
--------------------------------------------------------------------------------
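Construction mirrors main_chexpert.py: the three pattern files are compiled once, then tried in a fixed order for every mention node, with the first match winning. A sketch, using the default pattern paths from main_chexpert.py (run from the repository root):

    from negbio.chexpert.stages.classify import ModifiedDetector

    detector = ModifiedDetector(
        'negbio/chexpert/patterns/pre_negation_uncertainty.txt',
        'negbio/chexpert/patterns/negation.txt',
        'negbio/chexpert/patterns/post_negation_uncertainty.txt')

    # detector.detect(sentence, locs) yields (label, matcher, loc) triples, where
    # label is 'uncertainty' or 'negation'; mentions with no match stay positive.

The ordering matters: "cannot exclude pneumonia" must be caught by the pre-negation uncertainty rules before the plain negation rules can claim it.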
/negbio/chexpert/stages/extract.py:
--------------------------------------------------------------------------------
1 | """Define observation extractor class."""
2 | import re
3 | import itertools
4 | from collections import defaultdict
5 | from tqdm import tqdm
6 | from negbio.chexpert.constants import CARDIOMEGALY, ENLARGED_CARDIOMEDIASTINUM, OBSERVATION
7 |
8 | import bioc
9 |
10 |
11 | class Extractor(object):
12 | """Extract observations from impression sections of reports."""
13 | def __init__(self, mention_phrases_dir, unmention_phrases_dir,
14 | verbose=False):
15 | self.verbose = verbose
16 | self.observation2mention_phrases\
17 | = self.load_phrases(mention_phrases_dir, "mention")
18 | self.observation2unmention_phrases\
19 | = self.load_phrases(unmention_phrases_dir, "unmention")
20 | self.add_unmention_phrases()
21 |
22 | def load_phrases(self, phrases_dir, phrases_type):
23 | """Read in map from observations to phrases for matching."""
24 | observation2phrases = defaultdict(list)
25 | for phrases_path in phrases_dir.glob("*.txt"):
26 | with phrases_path.open() as f:
27 | for line in f:
28 | phrase = line.strip().replace("_", " ")
29 | observation = phrases_path.stem.replace("_", " ").title()
30 | if phrase:  # 'line' is always truthy here; test the stripped phrase instead
31 | observation2phrases[observation].append(phrase)
32 |
33 | if self.verbose:
34 | print("Loading {} phrases for {} observations.".format(phrases_type, len(observation2phrases)))
35 |
36 | return observation2phrases
37 |
38 | def add_unmention_phrases(self):
39 | cardiomegaly_mentions\
40 | = self.observation2mention_phrases[CARDIOMEGALY]
41 | enlarged_cardiom_mentions\
42 | = self.observation2mention_phrases[ENLARGED_CARDIOMEDIASTINUM]
43 | positional_phrases = (["over the", "overly the", "in the"],
44 | ["", " superior", " left", " right"])
45 | positional_unmentions = [e1 + e2
46 | for e1 in positional_phrases[0]
47 | for e2 in positional_phrases[1]]
48 | cardiomegaly_unmentions = [e1 + " " + e2.replace("the ", "")
49 | for e1 in positional_unmentions
50 | for e2 in cardiomegaly_mentions
51 | if e2 not in ["cardiomegaly",
52 | "cardiac enlargement"]]
53 | enlarged_cardiomediastinum_unmentions\
54 | = [e1 + " " + e2
55 | for e1 in positional_unmentions
56 | for e2 in enlarged_cardiom_mentions]
57 |
58 | self.observation2unmention_phrases[CARDIOMEGALY]\
59 | = cardiomegaly_unmentions
60 | self.observation2unmention_phrases[ENLARGED_CARDIOMEDIASTINUM]\
61 | = enlarged_cardiomediastinum_unmentions
62 |
63 | def overlaps_with_unmention(self, sentence, observation, start, end):
64 | """Return True if a given match overlaps with an unmention phrase."""
65 | unmention_overlap = False
66 | unmention_list = self.observation2unmention_phrases.get(observation,
67 | [])
68 | for unmention in unmention_list:
69 | unmention_matches = re.finditer(unmention, sentence.text)
70 | for unmention_match in unmention_matches:
71 | unmention_start, unmention_end = unmention_match.span(0)
72 | if start < unmention_end and end > unmention_start:
73 | unmention_overlap = True
74 | break # break early if overlap is found
75 | if unmention_overlap:
76 | break # break early if overlap is found
77 |
78 | return unmention_overlap
79 |
80 | def add_match(self, impression, sentence, ann_index, phrase,
81 | observation, start, end):
82 | """Add the match data and metadata to the impression object
83 | in place."""
84 | annotation = bioc.BioCAnnotation()
85 | annotation.id = ann_index
86 | annotation.infons['CUI'] = None
87 | annotation.infons['semtype'] = None
88 | annotation.infons['term'] = phrase
89 | annotation.infons[OBSERVATION] = observation
90 | annotation.infons['annotator'] = 'CheXpert labeler'
91 | length = end - start
92 | annotation.add_location(bioc.BioCLocation(sentence.offset + start,
93 | length))
94 | annotation.text = sentence.text[start:start+length]
95 |
96 | impression.annotations.append(annotation)
97 |
98 | def extract(self, collection):
99 | """Extract the observations in each report.
100 |
101 | Args:
102 | collection (BioCCollection): Impression passages of each report.
103 |
104 | Return:
105 | None; annotations are added to the collection in place.
106 | """
107 |
108 | # The BioCCollection consists of a series of documents.
109 | # Each document is a report (just the Impression section
110 | # of the report.)
111 | documents = collection.documents
112 | if self.verbose:
113 | print("Extracting mentions...")
114 | documents = tqdm(documents)
115 | for document in documents:
116 | # Get the Impression section.
117 | impression = document.passages[0]
118 | annotation_index = itertools.count(len(impression.annotations))
119 |
120 | for sentence in impression.sentences:
121 | obs_phrases = self.observation2mention_phrases.items()
122 | for observation, phrases in obs_phrases:
123 | for phrase in phrases:
124 | matches = re.finditer(phrase, sentence.text)
125 | for match in matches:
126 | start, end = match.span(0)
127 |
128 | if self.overlaps_with_unmention(sentence,
129 | observation,
130 | start,
131 | end):
132 | continue
133 |
134 | self.add_match(impression,
135 | sentence,
136 | str(next(annotation_index)),
137 | phrase,
138 | observation,
139 | start,
140 | end)
141 |
142 |
143 | class NegBioExtractor(Extractor):
144 | def extract_doc(self, document):
145 | annotation_index = itertools.count()
146 | for passage in document.passages:
147 | for sentence in passage.sentences:
148 | obs_phrases = self.observation2mention_phrases.items()
149 | for observation, phrases in obs_phrases:
150 | for phrase in phrases:
151 | matches = re.finditer(phrase, sentence.text)
152 | for match in matches:
153 | start, end = match.span(0)
154 | if self.overlaps_with_unmention(sentence, observation, start, end):
155 | continue
156 | self.add_match(passage, sentence, str(next(annotation_index)), phrase,
157 | observation, start, end)
158 | return document
159 |
160 | def extract_all(self, collection):
161 | """Extract the observations in each report."""
162 | annotation_index = itertools.count()
163 | for doc in collection.documents:
164 | for passage in doc.passages:
165 | for sentence in passage.sentences:
166 | obs_phrases = self.observation2mention_phrases.items()
167 | for observation, phrases in obs_phrases:
168 | for phrase in phrases:
169 | matches = re.finditer(phrase, sentence.text)
170 | for match in matches:
171 | start, end = match.span(0)
172 | if self.overlaps_with_unmention(sentence, observation, start, end):
173 | continue
174 | self.add_match(passage, sentence, str(next(annotation_index)), phrase,
175 | observation, start, end)
176 | return collection
177 |
--------------------------------------------------------------------------------
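Note that each line of a phrase file is passed to re.finditer as an (uncompiled) regular expression, so phrases match as substrings: the stem "opaci" covers opacity, opacities, and opacification. Underscores become spaces in load_phrases, so entries like "_line_" in support_devices.txt demand surrounding spaces and cannot match inside a longer word; the unmention lists (e.g. "massive" for the "mass" phrase) then veto overlapping matches. A standalone sketch of the core matching step:

    import re

    sentence_text = "increased airspace opacities in the left lung base"
    for phrase in ("opaci", "infiltrate"):
        for m in re.finditer(phrase, sentence_text):
            # add_match later shifts these offsets by sentence.offset
            print(phrase, m.span(0))  # -> opaci (19, 24); no match for infiltrate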
/negbio/chexpert/stages/load.py:
--------------------------------------------------------------------------------
1 | """Define report loader class."""
2 | import re
3 |
4 | from negbio.pipeline.section_split import split_document
5 |
6 |
7 | def _maketrans(s):
8 | s = s.replace(',', ', ')
9 | s = s.replace('.', '. ')
10 | return s
11 |
12 |
13 | def extract_impression_from_passages(document):
14 | """Extract the Impression section from a Bioc Document."""
15 | document.passages = [passage for passage in document.passages
16 | if passage.infons['title'] == "impression"]
17 |
18 | assert len(document.passages) <= 1, "The document contains {} impression passages.".format(len(document.passages))
19 |
20 | assert len(document.passages) >= 1, "The document contains no explicit impression passage."
21 |
22 |
23 | class NegBioLoader(object):
24 | """Report impression loader."""
25 | def __init__(self, extract_impression=False):
26 | self.extract_impression = extract_impression
27 | # self.punctuation_spacer = string.maketrans({key: "{} ".format(key)
28 | # for key in ".,"})
29 | # self.stop_spacer = string.maketrans('.', '. ')
30 | # self.comma_spacer = string.maketrans(',', ', ')
31 |
32 | def clean_doc(self, document):
33 | """Load and clean the reports."""
34 | for passage in document.passages:
35 | passage.text = self.clean(passage.text)
36 |
37 | if self.extract_impression:
38 | document = split_document(document)
39 | extract_impression_from_passages(document)
40 |
41 | return document
42 |
43 | def clean(self, report):
44 | """Clean the report text."""
45 | lower_report = report.lower()
46 | # Change `and/or` to `or`.
47 | corrected_report = re.sub('and/or',
48 | 'or',
49 | lower_report)
50 | # Change any `XXX/YYY` to `XXX or YYY`.
51 | corrected_report = re.sub('(?<=[a-zA-Z])/(?=[a-zA-Z])',
52 | ' or ',
53 | corrected_report)
54 | # Clean double periods
55 | clean_report = corrected_report.replace("..", ".")
56 | # Insert space after commas and periods.
57 | clean_report = _maketrans(clean_report)
58 | # Convert any multi white spaces to single white spaces.
59 | clean_report = ' '.join(clean_report.split())
60 |
61 | return clean_report
62 |
--------------------------------------------------------------------------------
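The effect of clean() is easiest to see on a worked example; each comment below corresponds to one substitution in the method:

    from negbio.chexpert.stages.load import NegBioLoader

    loader = NegBioLoader()
    text = "Pneumonia and/or edema..  No focal consolidation/effusion."
    print(loader.clean(text))
    # lowercased; 'and/or' -> 'or'; letter/letter -> ' or '; '..' -> '.';
    # spaces inserted after '.' and ','; whitespace collapsed:
    # -> pneumonia or edema. no focal consolidation or effusion.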
/negbio/cli_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import docopt
5 |
6 |
7 | __root__ = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))))
8 |
9 |
10 | def get_args(args):
11 | s = ''
12 | for k in args:
13 | s += ' {}: {}\n'.format(k, args[k])
14 | return s
15 |
16 |
17 | def parse_args(doc, **kwargs):
18 | argv = docopt.docopt(doc, **kwargs)
19 | if argv['--verbose']:
20 | logging.basicConfig(level=logging.DEBUG)
21 | else:
22 | logging.basicConfig(level=logging.INFO)
23 | logging.debug('Arguments:\n%s', get_args(argv))
24 | return argv
25 |
26 |
27 | def get_absolute_path(argv, key, default_value):
28 | logging.debug('NegBio root directory: %s', __root__)
29 | if argv[key] == default_value:
30 | argv[key] = os.path.join(__root__, argv[key])
31 | return argv
--------------------------------------------------------------------------------
/negbio/compat.py:
--------------------------------------------------------------------------------
1 | """
2 | Python 3 compatibility tools.
3 | """
4 | import sys
5 |
6 | try:
7 | from pathlib import Path, PurePath
8 | except ImportError:
9 | try:
10 | from pathlib2 import Path, PurePath
11 | except ImportError:
12 | Path = PurePath = None
13 |
14 | if sys.version_info[0] >= 3:
15 | basestring = str
16 | else:
17 | basestring = basestring
18 |
19 |
20 | def is_pathlib_path(obj):
21 | """
22 | Check whether obj is a pathlib.Path object.
23 | Prefer using `isinstance(obj, os_PathLike)` instead of this function.
24 | """
25 | return Path is not None and isinstance(obj, Path)
26 |
--------------------------------------------------------------------------------
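A small usage sketch: Path resolves to pathlib on Python 3, pathlib2 on Python 2, or None when neither is available, and is_pathlib_path guards against the last case:

    from negbio.compat import Path, is_pathlib_path

    if Path is not None:
        p = Path('negbio/patterns')
        assert is_pathlib_path(p)                  # True for Path instances
    assert not is_pathlib_path('negbio/patterns')  # plain strings are rejected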
/negbio/ext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/ext/__init__.py
--------------------------------------------------------------------------------
/negbio/ext/normalize_mimiccxr.py:
--------------------------------------------------------------------------------
1 | import re
2 | import logging
3 |
4 |
5 | def pattern_repl(matchobj):
6 | """
7 | Replace [**Patterns**] with spaces.
8 | """
9 | s = matchobj.group(0).lower()
10 | return ' '.rjust(len(s))
11 |
12 |
13 | def sub(text):
14 | text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
15 | text = re.sub(r'_', ' ', text)
16 | return text
17 |
18 |
19 | def find_start(text):
20 | return 0
21 |
22 |
23 | def find_end(text):
24 | ends = [len(text)]
25 | patterns = [
26 | re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
27 | re.compile(r'\n {3,}DR.', re.I),
28 | re.compile(r'[ ]{1,}RADLINE ', re.I),
29 | re.compile(r'.*electronically signed on', re.I),
30 | re.compile(r'M\[0KM\[0KM')
31 | ]
32 | for pattern in patterns:
33 | m = pattern.search(text)
34 | if m:
35 | ends.append(m.start())
36 | return min(ends)
37 |
38 |
39 | def trim(text):
40 | text = sub(text)
41 | start = find_start(text)
42 | end = find_end(text)
43 |
44 | new_text = ''
45 | if start > 0:
46 | new_text += ' ' * start
47 | new_text += text[start:end]
48 | if len(text) - end > 0:
49 | new_text += ' ' * (len(text) - end)
50 | return new_text
51 |
52 |
53 | def normalize(document):
54 | """
55 | Assume there is only one passage in the document
56 | """
57 | try:
58 | if len(document.passages) == 0:
59 | logging.warning('Skipped: there is no text in document %s', document.id)
60 | elif len(document.passages) > 1:
61 | logging.warning('Skipped: there is more than one passage in document %s', document.id)
62 | else:
63 | document.passages[0].text = trim(document.passages[0].text)
64 | return document
65 | except Exception:
66 | logging.exception('Cannot find text in document %s', document.id)
67 |
--------------------------------------------------------------------------------
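trim() is length-preserving by construction: de-identification placeholders and trailing sign-off boilerplate are overwritten with runs of spaces instead of being deleted, so character offsets computed on the normalized text still line up with the original report. A quick check of that invariant:

    from negbio.ext.normalize_mimiccxr import trim

    text = "FINAL REPORT [**Hospital 1**]\nNo acute process."
    out = trim(text)
    assert len(out) == len(text)   # offsets preserved
    assert '[**' not in out        # placeholder blanked out with spaces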
/negbio/main_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negative and uncertain findings from SOURCE and output to DEST
3 | Example: python negbio/main_chexpert.py text --output=examples/test.neg.xml examples/1.txt examples/2.txt
4 | python negbio/main_chexpert.py bioc --output=examples/test.neg.xml examples/1.xml
5 |
6 | Usage:
7 | main_chexpert text [options] --output=DEST SOURCES ...
8 | main_chexpert bioc [options] --output=DEST SOURCE
9 |
10 | Options:
11 | --mention_phrases_dir= Directory containing mention phrases for each observation.
12 | [default: negbio/chexpert/phrases/mention]
13 | --unmention_phrases_dir= Directory containing unmention phrases for each observation.
14 | [default: negbio/chexpert/phrases/unmention]
15 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt]
16 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules
17 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt]
18 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules
19 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt]
20 | --bllip-model=MODEL_DIR Bllip parser model directory
21 | [default: ~/.local/share/bllipparser/GENIA+PubMed]
22 | --split-document Split document into passages based on section titles such as "Finding",
23 | "Impression"
24 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline
25 | is always a sentence break. False means to ignore newlines for the
26 | purpose of sentence splitting. This is appropriate for continuous text,
27 | when just the non-whitespace characters should be used to determine
28 | sentence breaks.
29 | --verbose Print more information about progress.
30 | """
31 | from __future__ import print_function
32 |
33 | import os
34 |
35 | import bioc
36 | import tqdm
37 | from pathlib2 import Path
38 |
39 | from negbio.chexpert.stages.aggregate import NegBioAggregator
40 | from negbio.chexpert.stages.classify import ModifiedDetector, CATEGORIES
41 | from negbio.chexpert.stages.extract import NegBioExtractor
42 | from negbio.chexpert.stages.load import NegBioLoader
43 | from negbio.cli_utils import parse_args, get_absolute_path
44 | from negbio.pipeline import text2bioc, negdetect
45 | from negbio.pipeline.parse import NegBioParser
46 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
47 | from negbio.pipeline.ssplit import NegBioSSplitter
48 |
49 |
50 | def pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator, verbose=False):
51 | """
52 | Args:
53 | loader (NegBioLoader)
54 | ssplitter (NegBioSSplitter)
55 | parser (NegBioParser)
56 | extractor (NegBioExtractor)
57 | ptb2dep (NegBioPtb2DepConverter)
58 | neg_detector (ModifiedDetector)
59 | aggregator (NegBioAggregator)
60 | """
61 | # for document in collection.documents:
62 | #
63 | # for passage in document.passages:
64 | # passage.text = clean(passage.text)
65 | # ssplitter.split_doc(document)
66 | for document in tqdm.tqdm(collection.documents, disable=not verbose):
67 | document = loader.clean_doc(document)
68 | document = ssplitter.split_doc(document)
69 | document = extractor.extract_doc(document)
70 | document = parser.parse_doc(document)
71 | document = ptb2dep.convert_doc(document)
72 | document = negdetect.detect(document, neg_detector)
73 | document = aggregator.aggregate_doc(document)
74 | # remove sentence
75 | for passage in document.passages:
76 | del passage.sentences[:]
77 |
78 | return collection
79 |
80 |
81 | def main():
82 | argv = parse_args(__doc__, version='version 2')
83 | print(argv)
84 |
85 | lemmatizer = Lemmatizer()
86 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
87 | ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
88 | parser = NegBioParser(model_dir=argv['--bllip-model'])
89 |
90 | argv = get_absolute_path(argv,
91 | '--mention_phrases_dir',
92 | 'negbio/chexpert/phrases/mention')
93 | argv = get_absolute_path(argv,
94 | '--unmention_phrases_dir',
95 | 'negbio/chexpert/phrases/unmention')
96 | argv = get_absolute_path(argv,
97 | '--pre-negation-uncertainty-patterns',
98 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
99 | argv = get_absolute_path(argv,
100 | '--post-negation-uncertainty-patterns',
101 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt')
102 | argv = get_absolute_path(argv,
103 | '--neg-patterns',
104 | 'negbio/chexpert/patterns/negation.txt')
105 |
106 | # chexpert
107 | loader = NegBioLoader()
108 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
109 | Path(argv['--unmention_phrases_dir']),
110 | verbose=argv['--verbose'])
111 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
112 | argv['--neg-patterns'],
113 | argv['--post-negation-uncertainty-patterns'])
114 | aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])
115 |
116 | if argv['text']:
117 | collection = text2bioc.text2collection(argv['SOURCES'])
118 | elif argv['bioc']:
119 | with open(argv['SOURCE']) as fp:
120 | collection = bioc.load(fp)
121 | else:
122 | raise KeyError
123 |
124 | pipeline(collection, loader, ssplitter, extractor, parser, ptb2dep, neg_detector, aggregator,
125 | verbose=argv['--verbose'])
126 |
127 | with open(os.path.expanduser(argv['--output']), 'w') as fp:
128 | bioc.dump(collection, fp)
129 |
130 |
131 | if __name__ == '__main__':
132 | main()
133 |
--------------------------------------------------------------------------------
/negbio/main_mm.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negative and uncertain findings from SOURCE and output to DEST
3 | Example: python negbio/main_mm.py text --metamap=/opt/public_mm/bin/metamap16 --output=examples/test.neg.xml examples/1.txt examples/2.txt
4 |
5 | Usage:
6 | main_mm text [options] --metamap=BINARY --output=DEST SOURCES ...
7 | main_mm bioc [options] --metamap=BINARY --output=DEST SOURCE
8 |
9 | Options:
10 | --neg-patterns=FILE negation rules [default: negbio/patterns/neg_patterns.txt]
11 | --uncertainty-patterns=FILE uncertainty rules [default: negbio/patterns/uncertainty_patterns.txt]
12 | --bllip-model=MODEL_DIR Bllip parser model directory
13 | --split-document Split document into passages based on section titles such as "Finding", "Impression"
14 | --cuis=FILE CUI list. To keep all CUIs, set it to None [default: examples/cuis-cvpr2017.txt]
15 | --newline_is_sentence_break Whether to treat newlines as sentence breaks. True means that a newline is always a
16 | sentence break. False means to ignore newlines for the purpose of sentence
17 | splitting. This is appropriate for continuous text, when just the non-whitespace
18 | characters should be used to determine sentence breaks.
19 | --word_sense_disambiguation Whether to use word sense disambiguation.
20 | --verbose Print more information about progress.
21 | """
22 | from __future__ import print_function
23 | import logging
24 | import sys
25 | import os
26 | import bioc
27 | import docopt
28 |
29 | import pymetamap
30 |
31 | from negbio.cli_utils import parse_args, get_absolute_path
32 | from negbio.pipeline import negdetect, text2bioc, dner_mm
33 | from negbio.negbio_dner_matamap import read_cuis
34 | from negbio.pipeline.parse import NegBioParser
35 | from negbio.pipeline.ssplit import NegBioSSplitter
36 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
37 |
38 |
39 | def pipeline(collection, metamap, splitter, parser, ptb2dep, neg_detector, cuis, extra_args):
40 | """
41 |
42 | Args:
43 | collection(BioCCollection):
44 | metamap(MetaMap): MetaMap instance
45 | splitter (NegBioSSplitter):
46 | parser (NegBioParser)
47 | ptb2dep (NegBioPtb2DepConverter)
48 | neg_detector (Detector):
49 |
50 | Returns:
51 | BioCCollection
52 | """
53 | for document in collection.documents:
54 | splitter.split_doc(document)
55 |
56 | dner_mm.run_metamap_col(collection, metamap, cuis, extra_args)
57 |
58 | for document in collection.documents:
59 | document = parser.parse_doc(document)
60 | document = ptb2dep.convert_doc(document)
61 | document = negdetect.detect(document, neg_detector)
62 | # remove sentence
63 | for passage in document.passages:
64 | del passage.sentences[:]
65 |
66 | return collection
67 |
68 |
69 | def main():
70 | argv = parse_args(__doc__, version='version 2')
71 | print(argv)
72 |
73 | lemmatizer = Lemmatizer()
74 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
75 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
76 | parser = NegBioParser(model_dir=argv['--bllip-model'])
77 |
78 | argv = get_absolute_path(argv,
79 | '--neg-patterns',
80 | 'negbio/patterns/neg_patterns.txt')
81 | argv = get_absolute_path(argv,
82 | '--uncertainty-patterns',
83 | 'negbio/patterns/uncertainty_patterns.txt')
84 |
85 | mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
86 | neg_detector = negdetect.Detector(argv['--neg-patterns'], argv['--uncertainty-patterns'])
87 |
88 | if argv['--cuis'] == 'None':
89 | cuis = None
90 | else:
91 | cuis = read_cuis(argv['--cuis'])
92 |
93 | if argv['text']:
94 | collection = text2bioc.text2collection(argv['SOURCES'])
95 | elif argv['bioc']:
96 | with open(argv['SOURCE']) as fp:
97 | collection = bioc.load(fp)
98 | else:
99 | raise KeyError
100 |
101 | extra_args = dict()
102 | if argv['--word_sense_disambiguation']:
103 | extra_args['word_sense_disambiguation'] = True
104 |
105 | # Converting empty dict to None
106 | if len(extra_args) == 0:
107 | extra_args = None
108 |
109 | pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis, extra_args)
110 |
111 | with open(os.path.expanduser(argv['--output']), 'w') as fp:
112 | bioc.dump(collection, fp)
113 |
114 |
115 | if __name__ == '__main__':
116 | main()
117 |
--------------------------------------------------------------------------------
/negbio/neg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/neg/__init__.py
--------------------------------------------------------------------------------
/negbio/neg/neg_detector.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import logging
4 |
5 | from negbio.neg import utils, semgraph, propagator
6 | from negbio import ngrex
7 |
8 | NEGATION = 'negation'
9 | UNCERTAINTY = 'uncertainty'
10 |
11 |
12 | class Detector(object):
13 |
14 | NEGATION = 'negation'
15 | UNCERTAINTY = 'uncertainty'
16 |
17 | def __init__(self,
18 | neg_pattern_file,
19 | uncertainty_pattern_file,
20 | sentence_rule=False):
21 | self.sentence_rule = sentence_rule
22 | self.neg_patterns = ngrex.load(neg_pattern_file)
23 | self.uncertain_patterns = ngrex.load(uncertainty_pattern_file)
24 |
25 | def detect(self, sentence, locs):
26 | """
27 | Args:
28 | sentence(BioCSentence): a sentence with universal dependencies
29 | locs(list): a list of (begin, end)
30 | Yields:
31 | (str, MatcherObj, (begin, end)): negation or uncertainty, matcher, matched annotation
32 | """
33 | try:
34 | g = semgraph.load(sentence)
35 | propagator.propagate(g)
36 |         except Exception:
37 | logging.exception('Cannot parse dependency graph [offset={}]'.format(sentence.offset))
38 | raise
39 | else:
40 | if self.sentence_rule and is_neg_graph1(g):
41 | for loc in locs:
42 | yield NEGATION, None, loc
43 | return
44 | for loc in locs:
45 | if self.sentence_rule and is_neg_graph2(g, loc[0], loc[1]):
46 | yield NEGATION, None, loc
47 | for node in find_nodes(g, loc[0], loc[1]):
48 | m = self.match_neg(g, node)
49 | if m:
50 | yield NEGATION, m, loc
51 | m = self.match_uncertainty(g, node)
52 | if m:
53 | yield UNCERTAINTY, m, loc
54 |
55 | def match_neg(self, graph, node):
56 | """
57 | Returns a matcher
58 | """
59 | for pattern in self.neg_patterns:
60 | for m in pattern.finditer(graph):
61 | n0 = m.group(0)
62 | if n0 == node:
63 | try:
64 | key = m.get('key')
65 | if semgraph.has_out_edge(graph, key, ['neg']):
66 | continue
67 |                     except KeyError:
68 |                         pass  # this pattern does not define a 'key' node
69 | if semgraph.has_out(graph, n0, ['new'], ['amod']):
70 | continue
71 | return m
72 | return None
73 |
74 | def match_uncertainty(self, graph, node):
75 | for pattern in self.uncertain_patterns:
76 | for m in pattern.finditer(graph):
77 | n0 = m.group(0)
78 | if n0 == node:
79 | return m
80 |
81 | # parsing error
82 | # suggestive of XXX
83 | p = ngrex.compile('{} <{dependency:/nmod:of/} {lemma:/suggestive/}')
84 | for m in p.finditer(graph):
85 | n0 = m.group(0)
86 | if n0 == node:
87 | if semgraph.has_out_node(graph, m.group(1), ['most']):
88 | return None
89 | elif semgraph.has_out(graph, n0, ['new', 'develop'], ['amod']):
90 | continue
91 | else:
92 | return m
93 | return None
94 |
95 |
96 | def find_nodes(graph, begin, end):
97 | for node in graph.nodes():
98 | if utils.intersect((begin, end), (graph.node[node]['start'], graph.node[node]['end'])):
99 | yield node
100 |
101 |
102 | def is_neg_graph1(graph):
103 | # no XXX
104 | # resolution of XXX
105 | if 'T0' in graph.node and graph.node['T0']['lemma'] in ['no', 'resolution', 'resolved']:
106 | # no verb
107 | has_verb = utils.contains(lambda x: graph.node[x]['tag'][0] == 'V', graph.nodes())
108 | if not has_verb:
109 | return True
110 | return False
111 |
112 |
113 | def is_neg_graph2(graph, begin, end):
114 | """
115 | Return True if the sentence is like "without [begin, end]"
116 |
117 | """
118 |
119 | # without n [, n]
120 | state = 0
121 | # sort nodes
122 | for node in sorted(graph.nodes(), key=lambda n: graph.node[n]['start']):
123 | if graph.node[node]['end'] > end:
124 | break
125 |
126 | if state == 0:
127 |             if graph.node[node]['lemma'] in (
128 |                     'without', 'no', 'resolve', 'resolution', 'rosolution'):  # 'rosolution' likely matches a common report typo
129 | state = 1
130 | elif state == 1:
131 | if graph.node[node]['tag'].startswith('N'):
132 | state = 1
133 | if utils.intersect((begin, end), (graph.node[node]['start'], graph.node[node]['end'])):
134 | return True
135 | elif graph.node[node]['tag'] in ('JJ', 'CC', ',', 'VBN'):
136 | state = 1
137 | else:
138 | return False
139 | return False
140 |
141 |
142 | def is_neg(annotation):
143 | return NEGATION in annotation.infons and annotation.infons[NEGATION] == 'True'
144 |
145 |
146 | def is_uncertain(annotation):
147 | return UNCERTAINTY in annotation.infons and annotation.infons[UNCERTAINTY] == 'True'
148 |
--------------------------------------------------------------------------------
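
The `is_neg`/`is_uncertain` helpers read the infons that `negbio.pipeline.negdetect.detect` writes onto annotations. A small sanity check, assuming only the `bioc` package:

import bioc

from negbio.neg.neg_detector import is_neg, is_uncertain

ann = bioc.BioCAnnotation()
ann.infons['negation'] = 'True'  # as written by the detector
assert is_neg(ann)
assert not is_uncertain(ann)
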
/negbio/neg/propagator.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import logging
4 |
5 | from negbio.neg import semgraph
6 | import collections
7 |
8 |
9 | Edge = collections.namedtuple('Edge', ['gov', 'dep', 'data'])
10 |
11 |
12 | def propagate(G):
13 |     """Propagate negation-related edges through graph G (at most two passes)."""
14 |     for _ in range(2):
15 | edges = []
16 | for node in G.nodes():
17 |             # handle coordinations like "hypoinflated but clear of ..."
18 | if G.node[node]['lemma'] == 'hypoinflated':
19 | for child in G.successors(node):
20 | edge_dep = G[node][child]['dependency']
21 | if G.node[child]['lemma'] == 'clear' and edge_dep == 'conj:but':
22 | for of in G.successors(node):
23 | of_dep = G[node][of]['dependency']
24 | if of_dep == 'nmod:of':
25 | edges.append(Edge(child, of, of_dep))
26 | break
27 |
28 | for p, c, d in G.edges(data=True):
29 | # propagate appos
30 | if d['dependency'] == 'appos':
31 | # x > y >appos > z
32 | for grandpa in G.predecessors(p):
33 | edge_dep = G[grandpa][p]['dependency']
34 | edges.append(Edge(grandpa, c, edge_dep))
35 | # x appos > z
36 | for child in G.successors(p):
37 | edge_dep = G[p][child]['dependency']
38 | if edge_dep == 'neg':
39 | edges.append(Edge(c, child, edge_dep))
40 | # propagate dep
41 | if d['dependency'] == 'dep' \
42 | and G.node[p]['tag'].startswith('N') \
43 | and G.node[c]['tag'].startswith('N'):
44 | for grandchild in G.successors(c):
45 | edge_dep = G[c][grandchild]['dependency']
46 | if edge_dep == 'neg':
47 | edges.append(Edge(p, grandchild, edge_dep))
48 | # propagate cop conjunction
49 | if d['dependency'].startswith('conj') \
50 | and G.node[p]['tag'].startswith('N') \
51 | and G.node[c]['tag'].startswith('N'):
52 | for child in G.successors(p):
53 | edge_dep = G[p][child]['dependency']
54 | if edge_dep in ('aux', 'cop', 'neg', 'amod'):
55 | edges.append(Edge(c, child, edge_dep))
56 | if edge_dep in ('dep', 'compound') and G.node[child]['lemma'] == 'no':
57 | edges.append(Edge(c, child, edge_dep))
58 | if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
59 | edges.append(Edge(c, child, edge_dep))
60 |
61 | # propagate area/amount >of XXX
62 | if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] in ('area', 'amount'):
63 | for grandpa in G.predecessors(p):
64 | edge_dep = G[grandpa][p]['dependency']
65 | edges.append(Edge(grandpa, c, edge_dep))
66 | # propagate combination of XXX
67 | if d['dependency'] == 'nmod:of' and G.node[p]['lemma'] == 'combination':
68 | for grandpa in G.predecessors(p):
69 | edge_dep = G[grandpa][p]['dependency']
70 | edges.append(Edge(grandpa, c, edge_dep))
71 | if d['dependency'] == 'nmod:of':
72 | for child in G.successors(p):
73 | edge_dep = G[p][child]['dependency']
74 | # propagate no of XXX
75 | if edge_dep == 'neg':
76 | edges.append(Edge(c, child, edge_dep))
77 | # propagate without of XXX
78 |                     if edge_dep == 'case' and G.node[child]['lemma'] == 'without':
79 | edges.append(Edge(c, child, edge_dep))
80 | # parse error
81 | # no xx and xxx
82 | if d['dependency'] == 'neg' and semgraph.has_out_node(G, p, ['or', 'and']):
83 | for child in G.successors(p):
84 | edge_dep = G[p][child]['dependency']
85 | if edge_dep == 'compound' and G.node[child]['tag'].startswith('N'):
86 | edges.append(Edge(child, c, 'neg'))
87 |
88 | has_more_edges = False
89 | for e in edges:
90 | if not G.has_edge(e.gov, e.dep):
91 |                 assert isinstance(e.data, (str, type(u''))), type(e.data)  # str or unicode on Py2, str on Py3
92 | G.add_edge(e.gov, e.dep, dependency=e.data)
93 | has_more_edges = True
94 |
95 | if not has_more_edges:
96 | break
97 |
98 |
--------------------------------------------------------------------------------
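
A toy illustration of the conjunction rule above, assuming the networkx 1.x API (`G.node`) used throughout this module: a negation attached to the first conjunct is copied to the second, so "no effusion or pneumothorax" negates both nouns.

import networkx as nx

from negbio.neg.propagator import propagate

# "no effusion or pneumothorax": the 'neg' edge initially hangs off 'effusion' only
G = nx.DiGraph()
G.add_node('T0', lemma='no', tag='DT', start=0, end=2)
G.add_node('T1', lemma='effusion', tag='NN', start=3, end=11)
G.add_node('T2', lemma='pneumothorax', tag='NN', start=15, end=27)
G.add_edge('T1', 'T0', dependency='neg')
G.add_edge('T1', 'T2', dependency='conj:or')

propagate(G)
assert G.has_edge('T2', 'T0')  # negation propagated across the conjunction
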
/negbio/neg/semgraph.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import networkx as nx
4 |
5 |
6 | def load(sentence):
7 | """
8 | Args:
9 | sentence(BioCSentence): a sentence with tag, text, lemma, start and end
10 |
11 | Returns:
12 | DiGraph: dependency graph
13 |
14 |     Examples:
15 |         ```xml
16 |         <annotation id="T0">
17 |             <infon key="tag">JJ</infon>
18 |             <infon key="lemma">small</infon>
19 |             <location offset="0" length="5"/>
20 |             <text>Small</text>
21 |         </annotation>
22 |         ```
23 | """
24 | graph = nx.DiGraph()
25 | for ann in sentence.annotations:
26 | loc = ann.get_total_location()
27 | graph.add_node(ann.id, tag=ann.infons['tag'], text=ann.text, lemma=ann.infons['lemma'].lower(),
28 | start=loc.offset, end=loc.offset + loc.length)
29 | for rel in sentence.relations:
30 | dependant = None
31 | governor = None
32 | for node in rel.nodes:
33 | if node.role == 'dependant':
34 | dependant = node.refid
35 | elif node.role == 'governor':
36 | governor = node.refid
37 |         if not dependant or not governor:
38 |             logging.debug('Cannot find dependant or governor at %s', sentence)
39 |             continue  # skip malformed relations instead of adding an edge with None endpoints
40 |         graph.add_edge(governor, dependant, dependency=rel.infons['dependency'], id=rel.id)
40 | return graph
41 |
42 |
43 | def has_out_edge(graph, node, dependencies):
44 | for _, _, d in graph.out_edges(node, data=True):
45 | if d['dependency'] in dependencies:
46 | return True
47 | return False
48 |
49 |
50 | def has_in_edge(graph, node, dependencies):
51 | for _, _, d in graph.in_edges(node, data=True):
52 | if d['dependency'] in dependencies:
53 | return True
54 | return False
55 |
56 |
57 | def has_out(graph, node, lemmas, dependencies):
58 | return get_out(graph, node, lemmas, dependencies) is not None
59 |
60 |
61 | def get_out(graph, node, lemmas, dependencies):
62 | for _, c, d in graph.out_edges(node, data=True):
63 | if d['dependency'] in dependencies and graph.node[c]['lemma'] in lemmas:
64 | return c
65 | return None
66 |
67 |
68 | def get_in(graph, node, lemmas, dependencies):
69 | for p, _, d in graph.in_edges(node, data=True):
70 | if d['dependency'] in dependencies and graph.node[p]['lemma'] in lemmas:
71 | return p
72 | return None
73 |
74 |
75 | def has_in(graph, node, lemmas, dependencies):
76 | return get_in(graph, node, lemmas, dependencies) is not None
77 |
78 |
79 | def has_out_node(graph, node, lemmas):
80 | for child in graph.successors(node):
81 | if graph.node[child]['lemma'] in lemmas:
82 | return True
83 | return False
84 |
85 |
86 | def has_in_node(graph, node, lemmas):
87 |     for parent in graph.predecessors(node):
88 |         if graph.node[parent]['lemma'] in lemmas:
89 |             return True
90 |     return False
91 |
--------------------------------------------------------------------------------
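
A quick sketch of the traversal helpers on a hand-built graph (networkx 1.x API, using the node attributes produced by `load`):

import networkx as nx

from negbio.neg import semgraph

G = nx.DiGraph()
G.add_node('T0', lemma='clear', tag='JJ', start=0, end=5)
G.add_node('T1', lemma='effusion', tag='NN', start=9, end=17)
G.add_edge('T0', 'T1', dependency='nmod:of')

assert semgraph.has_out_edge(G, 'T0', ['nmod:of'])
assert semgraph.get_out(G, 'T0', ['effusion'], ['nmod:of']) == 'T1'
assert semgraph.has_in(G, 'T1', ['clear'], ['nmod:of'])
assert not semgraph.has_out_node(G, 'T1', ['clear'])
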
/negbio/neg/utils.py:
--------------------------------------------------------------------------------
1 | def contains(func, iterable):
2 |     """
3 |     Return True if func returns True for at least one element of iterable.
4 |     """
5 | if func is None:
6 | func = bool
7 | for x in iterable:
8 | if func(x):
9 | return True
10 | return False
11 |
12 |
13 | def intersect(range1, range2):
14 |     """Return True if the two half-open ranges overlap.
15 |     Args:
16 |         range1(int, int): [begin, end)
17 |         range2(int, int): [begin, end)
18 |     """
19 | if range1[0] <= range2[0] < range1[1]:
20 | return True
21 | elif range1[0] < range2[1] <= range1[1]:
22 | return True
23 | elif range2[0] <= range1[0] < range2[1]:
24 | return True
25 | elif range2[0] < range1[1] <= range2[1]:
26 | return True
27 | return False
28 |
--------------------------------------------------------------------------------
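
Both helpers are small but easy to misread; note that `intersect` treats its ranges as half-open, so adjacent spans do not overlap:

from negbio.neg.utils import contains, intersect

assert intersect((0, 5), (3, 8))      # overlapping spans
assert not intersect((0, 3), (3, 8))  # adjacent half-open spans do not intersect
assert contains(lambda x: x > 2, [1, 2, 3])
assert not contains(None, [0, '', None])  # falls back to bool()
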
/negbio/negbio_clean.py:
--------------------------------------------------------------------------------
1 | """
2 | Clean up sentences
3 |
4 | Usage:
5 |     negbio_pipeline cleanup [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .negbio.xml]
9 |     --verbose               Print more information about progress.
10 |     --output=<directory>    Specify the output directory.
11 | """
12 |
13 | from negbio.cli_utils import parse_args
14 | from negbio.pipeline.cleanup import clean_sentences
15 | from negbio.pipeline.scan import scan_document
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
20 | fn=clean_sentences)
21 |
--------------------------------------------------------------------------------
/negbio/negbio_dner_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect concepts from vocab
3 |
4 | Usage:
5 |     negbio_pipeline dner_chexpert [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>                    Append an additional SUFFIX to file names. [default: .chexpert.xml]
9 |     --output=<directory>                 Specify the output directory.
10 |     --verbose                            Print more information about progress.
11 |     --mention_phrases_dir=<directory>    Directory containing mention phrases for each observation. [default: negbio/chexpert/phrases/mention]
12 |     --unmention_phrases_dir=<directory>  Directory containing unmention phrases for each observation. [default: negbio/chexpert/phrases/unmention]
13 | """
14 | from pathlib2 import Path
15 |
16 | from negbio.chexpert.stages.extract import NegBioExtractor
17 | from negbio.cli_utils import parse_args, get_absolute_path
18 | from negbio.pipeline.scan import scan_collection
19 |
20 |
21 | def run_extractor(collection, extractor):
22 | """
23 | Args:
24 | collection (BioCCollection):
25 | extractor (NegBioExtractor):
26 | """
27 | extractor.extract_all(collection)
28 |
29 |
30 | if __name__ == '__main__':
31 | argv = parse_args(__doc__)
32 |
33 | argv = get_absolute_path(argv,
34 | '--mention_phrases_dir',
35 | 'negbio/chexpert/phrases/mention')
36 | argv = get_absolute_path(argv,
37 | '--unmention_phrases_dir',
38 | 'negbio/chexpert/phrases/unmention')
39 |
40 | extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
41 | Path(argv['--unmention_phrases_dir']),
42 | verbose=argv['--verbose'])
43 |     scan_collection(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
44 | fn=run_extractor, non_sequences=[extractor])
45 |
--------------------------------------------------------------------------------
/negbio/negbio_dner_matamap.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect UMLS concepts
3 |
4 | Usage:
5 |     negbio_pipeline dner_mm [options] --metamap=<binary> --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .mm.xml]
9 |     --output=<directory>    Specify the output directory.
10 |     --verbose               Print more information about progress.
11 |     --metamap=<binary>      Path to the MetaMap binary.
12 |     --cuis=<file>           Specify the CUI list.
13 | """
14 |
15 | from negbio.cli_utils import parse_args
16 | from negbio.pipeline.dner_mm import run_metamap_col
17 | from negbio.pipeline.scan import scan_collection
18 | from pymetamap import MetaMap
19 |
20 |
21 | def read_cuis(pathname):
22 | cuis = set()
23 | with open(pathname) as fp:
24 | for line in fp:
25 | line = line.strip()
26 | if line:
27 | cuis.add(line)
28 | return cuis
29 |
30 |
31 | if __name__ == '__main__':
32 | argv = parse_args(__doc__)
33 | mm = MetaMap.get_instance(argv['--metamap'])
34 |
35 | if argv['--cuis'] is None:
36 | cuis = None
37 | else:
38 | cuis = read_cuis(argv['--cuis'])
39 |
40 |     scan_collection(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
41 | fn=run_metamap_col, non_sequences=[mm, cuis])
42 |
--------------------------------------------------------------------------------
/negbio/negbio_neg.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negation and uncertainty
3 |
4 | Usage:
5 |     negbio_pipeline neg [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --neg-patterns=<file>            Specify negation rules. [default: negbio/patterns/neg_patterns.txt]
9 |     --uncertainty-patterns=<file>    Specify uncertainty rules. [default: negbio/patterns/uncertainty_patterns.txt]
10 |     --suffix=<suffix>                Append an additional SUFFIX to file names. [default: .neg.xml]
11 |     --verbose                        Print more information about progress.
12 |     --output=<directory>             Specify the output directory.
13 | """
14 | import os
15 |
16 | from negbio.cli_utils import parse_args, get_absolute_path
17 | from negbio.neg.neg_detector import Detector
18 | from negbio.pipeline.negdetect import detect
19 | from negbio.pipeline.scan import scan_document
20 |
21 | if __name__ == '__main__':
22 | argv = parse_args(__doc__)
23 |
24 | argv = get_absolute_path(argv,
25 | '--neg-patterns',
26 | 'negbio/patterns/neg_patterns.txt')
27 | argv = get_absolute_path(argv,
28 | '--uncertainty-patterns',
29 | 'negbio/patterns/uncertainty_patterns.txt')
30 |
31 | neg_detector = Detector(os.path.realpath(argv['--neg-patterns']),
32 | os.path.realpath(argv['--uncertainty-patterns']))
33 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
34 | fn=detect, non_sequences=[neg_detector])
35 |
--------------------------------------------------------------------------------
/negbio/negbio_neg_chexpert.py:
--------------------------------------------------------------------------------
1 | """
2 | Detect negation and uncertainty
3 |
4 | Usage:
5 |     negbio_pipeline neg_chexpert [options] --output=<directory> <file> ...
6 |
7 | Options:
8 | --neg-patterns=FILE Negation rules [default: negbio/chexpert/patterns/negation.txt]
9 | --pre-negation-uncertainty-patterns=FILE Pre negation uncertainty rules
10 | [default: negbio/chexpert/patterns/pre_negation_uncertainty.txt]
11 | --post-negation-uncertainty-patterns=FILE Post negation uncertainty rules
12 | [default: negbio/chexpert/patterns/post_negation_uncertainty.txt]
13 |     --suffix=<suffix>                           Append an additional SUFFIX to file names. [default: .neg.xml]
14 |     --verbose                                   Print more information about progress.
15 |     --output=<directory>                        Specify the output directory.
16 | """
17 | import os
18 |
19 | from negbio.chexpert.stages.classify import ModifiedDetector
20 | from negbio.cli_utils import parse_args, get_absolute_path
21 | from negbio.pipeline.negdetect import detect
22 | from negbio.pipeline.scan import scan_document
23 |
24 |
25 | if __name__ == '__main__':
26 | argv = parse_args(__doc__)
27 |
28 | argv = get_absolute_path(argv,
29 | '--pre-negation-uncertainty-patterns',
30 | 'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
31 | argv = get_absolute_path(argv,
32 | '--post-negation-uncertainty-patterns',
33 | 'negbio/chexpert/patterns/post_negation_uncertainty.txt')
34 | argv = get_absolute_path(argv,
35 | '--neg-patterns',
36 | 'negbio/chexpert/patterns/negation.txt')
37 |
38 | neg_detector = ModifiedDetector(argv['--pre-negation-uncertainty-patterns'],
39 | argv['--neg-patterns'],
40 | argv['--post-negation-uncertainty-patterns'])
41 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
42 | fn=detect, non_sequences=[neg_detector])
43 |
--------------------------------------------------------------------------------
/negbio/negbio_normalize.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 |     negbio_pipeline normalize [options] --output=<directory> <file> ...
4 | 
5 | Options:
6 |     --output=<directory>    Specify the output directory.
7 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .normalized.xml]
8 |     --verbose               Print more information about progress.
9 | """
10 |
11 | from negbio.cli_utils import parse_args
12 | from negbio.ext.normalize_mimiccxr import normalize
13 | from negbio.pipeline.scan import scan_document
14 |
15 | if __name__ == '__main__':
16 | argv = parse_args(__doc__)
17 |     scan_document(source=argv['<file>'], verbose=argv['--verbose'], suffix=argv['--suffix'],
18 | directory=argv['--output'], fn=normalize)
19 |
--------------------------------------------------------------------------------
/negbio/negbio_parse.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse sentences
3 |
4 | Usage:
5 |     negbio_pipeline parse [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --model=<directory>     Bllip parser model directory.
9 |     --output=<directory>    Specify the output directory.
10 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .bllip.xml]
11 |     --verbose               Print more information about progress.
12 | """
13 |
14 | from negbio.cli_utils import parse_args
15 | from negbio.pipeline.parse import NegBioParser
16 | from negbio.pipeline.scan import scan_document
17 |
18 |
19 | if __name__ == '__main__':
20 | argv = parse_args(__doc__)
21 | parser = NegBioParser(model_dir=argv['--model'])
22 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
23 | fn=parser.parse_doc, non_sequences=[])
24 |
--------------------------------------------------------------------------------
/negbio/negbio_pipeline.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 |     negbio_pipeline [--verbose] <command> [<args>...]
4 |
5 | Options:
6 | --verbose Print more information about progress.
7 |
8 | The most commonly used negbio commands are:
9 | text2bioc
10 | normalize
11 | section_split
12 | ssplit
13 | parse
14 | ptb2ud
15 | dner_mm
16 | dner_chexpert
17 | neg
18 | neg_chexpert
19 | cleanup
20 | """
21 | from subprocess import call
22 | import logging
23 | import os
24 | from negbio.cli_utils import parse_args
25 |
26 |
27 | def main():
28 | args = parse_args(__doc__, version='negbio version 2', options_first=True)
29 | logging.debug('CWD: %s', os.getcwd())
30 |
31 |     argv = [args['<command>']] + args['<args>']
32 |     if args['<command>'] == 'text2bioc':
33 |         exit(call(['python', '-m', 'negbio.negbio_text2bioc'] + argv))
34 |     elif args['<command>'] == 'normalize':
35 |         exit(call(['python', '-m', 'negbio.negbio_normalize'] + argv))
36 |     elif args['<command>'] == 'section_split':
37 |         exit(call(['python', '-m', 'negbio.negbio_section_split'] + argv))
38 |     elif args['<command>'] == 'ssplit':
39 |         exit(call(['python', '-m', 'negbio.negbio_ssplit'] + argv))
40 |     elif args['<command>'] == 'parse':
41 |         exit(call(['python', '-m', 'negbio.negbio_parse'] + argv))
42 |     elif args['<command>'] == 'ptb2ud':
43 |         exit(call(['python', '-m', 'negbio.negbio_ptb2ud'] + argv))
44 |     elif args['<command>'] == 'dner_mm':
45 |         exit(call(['python', '-m', 'negbio.negbio_dner_matamap'] + argv))
46 |     elif args['<command>'] == 'dner_chexpert':
47 |         exit(call(['python', '-m', 'negbio.negbio_dner_chexpert'] + argv))
48 |     elif args['<command>'] == 'neg':
49 |         exit(call(['python', '-m', 'negbio.negbio_neg'] + argv))
50 |     elif args['<command>'] == 'neg_chexpert':
51 |         exit(call(['python', '-m', 'negbio.negbio_neg_chexpert'] + argv))
52 |     elif args['<command>'] == 'cleanup':
53 |         exit(call(['python', '-m', 'negbio.negbio_clean'] + argv))
54 |     elif args['<command>'] in ['help', None]:
55 |         exit(call(['python', '-m', 'negbio.negbio_pipeline', '--help']))
56 |     else:
57 |         exit("%r is not a negbio command. See 'negbio_pipeline help'." % args['<command>'])
58 |
59 |
60 | if __name__ == '__main__':
61 | main()
62 |
--------------------------------------------------------------------------------
/negbio/negbio_ptb2ud.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert from parse tree to universal dependencies
3 |
4 | Usage:
5 |     negbio_pipeline ptb2ud [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --output=<directory>    Specify the output directory.
9 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .ud.xml]
10 |     --verbose               Print more information about progress.
11 | """
12 | from negbio.cli_utils import parse_args
13 | from negbio.pipeline.ptb2ud import NegBioPtb2DepConverter, Lemmatizer
14 | from negbio.pipeline.scan import scan_document
15 |
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 | lemmatizer = Lemmatizer()
20 | ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
21 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
22 | fn=ptb2dep.convert_doc, non_sequences=[])
23 |
--------------------------------------------------------------------------------
/negbio/negbio_section_split.py:
--------------------------------------------------------------------------------
1 | """
2 | Split the report into sections based on titles.
3 |
4 | Usage:
5 |     negbio_pipeline section_split [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .secsplit.xml]
9 |     --output=<directory>    Specify the output directory.
10 |     --verbose               Print more information about progress.
11 |     --pattern=<file>        Specify the section title list for matching.
12 | """
13 | import re
14 |
15 | from negbio.cli_utils import parse_args
16 | from negbio.pipeline.scan import scan_document
17 | from negbio.pipeline.section_split import split_document
18 |
19 |
20 | def read_section_titles(pathname):
21 | with open(pathname) as fp:
22 | return re.compile('|'.join(fp.readlines()), re.MULTILINE)
23 |
24 |
25 | if __name__ == '__main__':
26 | argv = parse_args(__doc__)
27 |
28 | if argv['--pattern'] is None:
29 | patterns = None
30 | else:
31 | patterns = read_section_titles(argv['--pattern'])
32 |
33 |     scan_document(source=argv['<file>'], verbose=argv['--verbose'], suffix=argv['--suffix'],
34 | directory=argv['--output'], fn=split_document, non_sequences=[patterns])
35 |
--------------------------------------------------------------------------------
/negbio/negbio_ssplit.py:
--------------------------------------------------------------------------------
1 | """
2 | Split text into sentences
3 |
4 | Usage:
5 |     negbio_pipeline ssplit [options] --output=<directory> <file> ...
6 | 
7 | Options:
8 |     --newline_is_sentence_break    Whether to treat newlines as sentence breaks. True means that a newline is always a
9 |                                    sentence break. False means to ignore newlines for the purpose of sentence
10 |                                    splitting. This is appropriate for continuous text, when just the non-whitespace
11 |                                    characters should be used to determine sentence breaks. [default: False]
12 |     --suffix=<suffix>              Append an additional SUFFIX to file names. [default: .ssplit.xml]
13 |     --output=<directory>           Specify the output directory.
14 |     --verbose                      Print more information about progress.
15 | """
16 | from negbio.pipeline.scan import scan_document
17 | from negbio.pipeline.ssplit import NegBioSSplitter
18 | from negbio.cli_utils import parse_args
19 |
20 | if __name__ == '__main__':
21 | argv = parse_args(__doc__)
22 | splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
23 |     scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
24 | fn=splitter.split_doc, non_sequences=[])
25 |
--------------------------------------------------------------------------------
/negbio/negbio_text2bioc.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert text FILEs to the BioC output file
3 |
4 | Usage:
5 |     negbio_pipeline text2bioc [options] --output=<file> <file> ...
6 | 
7 | Options:
8 |     --output=<file>     Specify the output file name.
9 | --verbose Print more information about progress.
10 | """
11 |
12 | import bioc
13 |
14 | from negbio.cli_utils import parse_args
15 | from negbio.pipeline.text2bioc import text2collection
16 |
17 | if __name__ == '__main__':
18 | argv = parse_args(__doc__)
19 |     collection = text2collection(argv['<file>'])
20 | with open(argv['--output'], 'w') as fp:
21 | bioc.dump(collection, fp)
22 |
--------------------------------------------------------------------------------
/negbio/ngrex/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | A NgrexPattern is a tgrep-type pattern for matching node configurations in one of the Networkx
3 | structures. Unlike tgrep but like Unix grep, there is no pre-indexing of the data to be searched.
4 | Rather there is a linear scan through the graph where matches are sought.
5 |
6 | A node/edge is represented by a set of attributes and their values contained by curly braces:
7 | `{attr1:value1;attr2:value2;...}`. Therefore, {} represents any node/edge in the graph.
8 | Attributes must be plain strings; values can be regular expressions blocked off by "/".
9 | (Regular expressions must match the whole attribute value, so /NN/ matches "NN" only,
10 | while /NN.*/ matches "NN", "NNS", "NNP", etc.)
11 | """
12 | from . import parser
13 | from . import pattern
14 |
15 |
16 | def compile(ngrex):
17 | """
18 | Compiles the given expression into a pattern
19 |
20 | Args:
21 | ngrex(str): expression
22 |
23 | Returns:
24 | NgrexPattern: a pattern
25 | """
26 | p = parser.yacc.parse(ngrex)
27 | pattern.validate_names(p)
28 | return p
29 |
30 |
31 | def load(filename):
32 | """
33 | Read a pattern file
34 |
35 | Args:
36 | filename(str): file name
37 |
38 | Returns:
39 |         list: a list of NgrexPattern
40 | """
41 | patterns = []
42 | with open(filename) as fp:
43 | for line in fp:
44 | line = line.strip()
45 | if not line:
46 | continue
47 | if line[0] == '#':
48 | continue
49 | patterns.append(compile(line))
50 | return patterns
51 |
--------------------------------------------------------------------------------
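
A minimal end-to-end use of `compile` on a hand-built dependency graph, assuming networkx 1.x and the attribute names produced by `negbio.neg.semgraph.load` (`lemma` and `tag` on nodes, `dependency` on edges):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='effusion', tag='NN')
G.add_node('T1', lemma='no', tag='DT')
G.add_edge('T0', 'T1', dependency='neg')

pattern = ngrex.compile('{} >{dependency:/neg/} {lemma:/no/}')
matches = [m.group(0) for m in pattern.finditer(G)]
assert matches == ['T0']  # 'effusion' governs a 'neg' edge to 'no'
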
/negbio/ngrex/parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Start : ALIGNRELN SubNode "\n"
3 | | SubNode ( ":" SubNode )* "\n"
4 | ;
5 |
6 | SubNode : "(" SubNode ")" RelationDisj?
7 | | ModNode RelationDisj?
8 | ;
9 |
10 | RelationDisj : RelationConj ( "|" RelationConj )*
11 |
12 | RelationConj : ModRelation ( "&"? ModRelation )*
13 |
14 | ModRelation : RelChild
15 | | "!" RelChild
16 | | "?" RelChild
17 | ;
18 |
19 | RelChild : "[" RelationDisj "]"
20 | | Relation
21 | ;
22 |
23 | Relation : ( ( ( (IDENTIFIER ("," IDENTIFIER)?)? RELATION ( IDENTIFIER | REGEX )? ) ( "=" IDENTIFIER )? ) | ALIGNRELN)
24 | ( ModNode | "(" SubNode ")" )
25 | ;
26 |
27 | NodeDisj : "[" NodeConj ( "|" NodeConj )* "]"
28 | ;
29 |
30 | NodeConj : ModNode ( "&"? ModNode )*
31 | ;
32 |
33 | ModNode : Child
34 | | "!" Child
35 | ;
36 |
37 | Child : NodeDisj
38 | | Description
39 | ;
40 |
41 | Description :
42 | "{" (
43 | ( ( IDENTIFIER ":" (IDENTIFIER | REGEX) ) (";" ( IDENTIFIER ":" ( IDENTIFIER | REGEX ) ) )* "}")
44 | | ( ROOT "}" )
45 | | ( EMPTY "}" )
46 | | "}" )
47 | ("=" IDENTIFIER )?
48 | """
49 | from ply import lex
50 | from ply import yacc
51 |
52 | from negbio.ngrex import pattern
53 |
54 |
55 | t_ignore = ' \t\r'
56 |
57 | tokens = (
58 | 'RELATION',
59 | 'IDENTIFIER',
60 | 'REGEX',
61 | )
62 |
63 | literals = '{}()&[]:|,='
64 |
65 | t_RELATION = r'[<>]'
66 | t_IDENTIFIER = r'([^ \n\r!@#$%^&*()+={}\[\]\|\\;\':",./<>?`~-])+'
67 | t_REGEX = r'/(/|[^\n\r/])*?/'
68 |
69 |
70 | def t_error(t):
71 | raise TypeError('Unknown text "%s"' % (t.value,))
72 |
73 | lexer = lex.lex()
74 |
75 |
76 | def p_SubNode(p):
77 | """
78 | SubNode : ModNode
79 | | ModNode RelationDisj
80 | | '(' SubNode ')'
81 | | '(' SubNode ')' RelationDisj
82 | """
83 | if len(p) == 2:
84 | p[0] = p[1]
85 | elif len(p) == 3:
86 | conj_patterns = []
87 | for relation_conj in p[2][1]:
88 | conj_patterns.append(_merge_conj(p[1], relation_conj[1]))
89 | p[0] = _merge_disj(conj_patterns)
90 | elif len(p) == 4:
91 | p[0] = p[2]
92 | elif len(p) == 5:
93 | conj_patterns = []
94 | for relation_conj in p[4][1]:
95 | conj_patterns.append(_merge_conj(p[2], relation_conj[1]))
96 | p[0] = _merge_disj(conj_patterns)
97 |
98 |
99 | def _merge_disj(patterns):
100 | while len(patterns) > 1:
101 | p1 = patterns.pop()
102 | p2 = patterns.pop()
103 | patterns.append(pattern.CoordinationPattern(p1, p2, False))
104 | return patterns[0]
105 |
106 |
107 | def _merge_conj(p1, relations):
108 | patterns = []
109 | for reln, attributes, node in relations:
110 | if reln == '<':
111 | p = pattern.EdgePattern(node, p1, attributes, direction=pattern.L)
112 | else:
113 | p = pattern.EdgePattern(p1, node, attributes, direction=pattern.R)
114 | patterns.append(p)
115 | if len(patterns) == 1:
116 | return patterns[0]
117 | else:
118 | while len(patterns) > 1:
119 | p1 = patterns.pop()
120 | p2 = patterns.pop()
121 | patterns.append(pattern.CoordinationPattern(p1, p2, True))
122 | return patterns[0]
123 |
124 |
125 | def p_RelationDisj(p):
126 | """
127 | RelationDisj : RelationConj
128 | | RelationConj '|' RelationDisj
129 | """
130 |     # Returns:
131 |     #     ("OR", relation_list)
134 | if len(p) == 2:
135 | p[0] = ('OR', [p[1]])
136 | elif len(p) == 4:
137 | p[0] = ('OR', [p[1]] + p[3][1])
138 |
139 |
140 | def p_RelationConj(p):
141 | """
142 | RelationConj : ModRelation
143 | | ModRelation RelationConj
144 | | ModRelation '&' RelationConj
145 | """
146 | # (AND, [ModRelations])
147 | if len(p) == 2:
148 | p[0] = ('AND', [p[1]])
149 |     elif len(p) == 3:
150 |         p[0] = ('AND', [p[1]] + p[2][1])
151 |     elif len(p) == 4:
152 |         p[0] = ('AND', [p[1]] + p[3][1])
153 |
154 |
155 | def p_ModRelation(p):
156 | """
157 | ModRelation : RelChild
158 | """
159 | p[0] = p[1]
160 |
161 |
162 | def p_RelChild(p):
163 | """
164 | RelChild : Relation
165 | """
166 | p[0] = p[1]
167 |
168 |
169 | def p_Relation(p):
170 | """
171 | Relation : RELATION '{' Attributes '}' Relation_Next
172 | """
173 |     # Returns:
174 |     #     (relation, edge_attributes, node)
177 | p[0] = (p[1], p[3], p[5])
178 |
179 |
180 | def p_Relation_Next(p):
181 | """
182 | Relation_Next : ModNode
183 | | '(' SubNode ')'
184 | """
185 | if len(p) == 2:
186 | p[0] = p[1]
187 | else:
188 | p[0] = p[2]
189 |
190 |
191 | def p_ModNode(p):
192 | """
193 | ModNode : Child
194 | """
195 | p[0] = p[1]
196 |
197 |
198 | def p_Child(p):
199 | """
200 | Child : Description
201 | """
202 | p[0] = p[1]
203 |
204 |
205 | def p_Description(p):
206 | """
207 | Description : '{' Attributes '}'
208 | | '{' Attributes '}' '=' IDENTIFIER
209 | """
210 | if len(p) == 4:
211 | p[0] = pattern.NodePattern(p[2])
212 | else:
213 | p[0] = pattern.NodePattern(p[2], p[5])
214 |
215 | def p_Attributes(p):
216 | """
217 | Attributes : IDENTIFIER ':' REGEX
218 | | IDENTIFIER ':' REGEX ',' Attributes
219 | | empty
220 | """
221 | if len(p) == 4:
222 | p[0] = {p[1]: p[3]}
223 | elif len(p) == 6:
224 | p[0] = {p[1]: p[3]}
225 | p[0].update(p[5])
226 | else:
227 | p[0] = {}
228 |
229 |
230 | def p_empty(p):
231 | 'empty :'
232 | pass
233 |
234 |
235 | def p_error(p):
236 | raise TypeError("Syntax error at '%s'" % p.value)
237 |
238 | parser = yacc.yacc()
239 |
240 |
--------------------------------------------------------------------------------
/negbio/ngrex/parsetab.py:
--------------------------------------------------------------------------------
1 |
2 | # parsetab.py
3 | # This file is automatically generated. Do not edit.
4 | _tabversion = '3.10'
5 |
6 | _lr_method = 'LALR'
7 |
8 | _lr_signature = "RELATION IDENTIFIER REGEX\n SubNode : ModNode\n | ModNode RelationDisj\n | '(' SubNode ')' \n | '(' SubNode ')' RelationDisj\n \n RelationDisj : RelationConj\n | RelationConj '|' RelationDisj\n \n RelationConj : ModRelation\n | ModRelation RelationConj\n | ModRelation '&' RelationConj\n \n ModRelation : RelChild\n \n RelChild : Relation\n \n Relation : RELATION '{' Attributes '}' Relation_Next\n \n Relation_Next : ModNode\n | '(' SubNode ')'\n \n ModNode : Child\n \n Child : Description\n \n Description : '{' Attributes '}'\n | '{' Attributes '}' '=' IDENTIFIER\n \n Attributes : IDENTIFIER ':' REGEX\n | IDENTIFIER ':' REGEX ',' Attributes\n | empty\n empty :"
9 |
10 | _lr_action_items = {'REGEX':([23,],[29,]),':':([15,],[23,]),'&':([1,4,8,10,11,22,31,33,35,38,],[-16,-15,18,-10,-11,-17,-18,-12,-13,-14,]),')':([1,3,4,7,8,9,10,11,12,17,19,22,24,25,26,31,33,35,37,38,],[-16,-1,-15,17,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,38,-14,]),'(':([0,2,30,34,],[2,2,34,2,]),'=':([22,],[28,]),',':([29,],[32,]),'RELATION':([1,3,4,8,10,11,17,18,20,22,31,33,35,38,],[-16,13,-15,13,-10,-11,13,13,13,-17,-18,-12,-13,-14,]),'{':([0,2,13,30,34,],[5,5,21,5,5,]),'IDENTIFIER':([5,21,28,32,],[15,15,31,15,]),'}':([5,14,16,21,27,29,32,36,],[-22,22,-21,-22,30,-19,-22,-20,]),'|':([1,4,8,10,11,12,19,22,25,31,33,35,38,],[-16,-15,-7,-10,-11,20,-8,-17,-9,-18,-12,-13,-14,]),'$end':([1,3,4,6,8,9,10,11,12,17,19,22,24,25,26,31,33,35,38,],[-16,-1,-15,0,-7,-2,-10,-11,-5,-3,-8,-17,-4,-9,-6,-18,-12,-13,-14,]),}
11 |
12 | _lr_action = {}
13 | for _k, _v in _lr_action_items.items():
14 | for _x,_y in zip(_v[0],_v[1]):
15 | if not _x in _lr_action: _lr_action[_x] = {}
16 | _lr_action[_x][_k] = _y
17 | del _lr_action_items
18 |
19 | _lr_goto_items = {'Description':([0,2,30,34,],[1,1,1,1,]),'ModRelation':([3,8,17,18,20,],[8,8,8,8,8,]),'RelationDisj':([3,17,20,],[9,24,26,]),'RelChild':([3,8,17,18,20,],[10,10,10,10,10,]),'Child':([0,2,30,34,],[4,4,4,4,]),'Relation':([3,8,17,18,20,],[11,11,11,11,11,]),'RelationConj':([3,8,17,18,20,],[12,19,12,25,12,]),'ModNode':([0,2,30,34,],[3,3,35,3,]),'Attributes':([5,21,32,],[14,27,36,]),'Relation_Next':([30,],[33,]),'SubNode':([0,2,34,],[6,7,37,]),'empty':([5,21,32,],[16,16,16,]),}
20 |
21 | _lr_goto = {}
22 | for _k, _v in _lr_goto_items.items():
23 | for _x, _y in zip(_v[0], _v[1]):
24 | if not _x in _lr_goto: _lr_goto[_x] = {}
25 | _lr_goto[_x][_k] = _y
26 | del _lr_goto_items
27 | _lr_productions = [
28 | ("S' -> SubNode","S'",1,None,None,None),
29 | ('SubNode -> ModNode','SubNode',1,'p_SubNode','parser.py',78),
30 | ('SubNode -> ModNode RelationDisj','SubNode',2,'p_SubNode','parser.py',79),
31 | ('SubNode -> ( SubNode )','SubNode',3,'p_SubNode','parser.py',80),
32 | ('SubNode -> ( SubNode ) RelationDisj','SubNode',4,'p_SubNode','parser.py',81),
33 | ('RelationDisj -> RelationConj','RelationDisj',1,'p_RelationDisj','parser.py',127),
34 | ('RelationDisj -> RelationConj | RelationDisj','RelationDisj',3,'p_RelationDisj','parser.py',128),
35 | ('RelationConj -> ModRelation','RelationConj',1,'p_RelationConj','parser.py',142),
36 | ('RelationConj -> ModRelation RelationConj','RelationConj',2,'p_RelationConj','parser.py',143),
37 | ('RelationConj -> ModRelation & RelationConj','RelationConj',3,'p_RelationConj','parser.py',144),
38 | ('ModRelation -> RelChild','ModRelation',1,'p_ModRelation','parser.py',157),
39 | ('RelChild -> Relation','RelChild',1,'p_RelChild','parser.py',164),
40 | ('Relation -> RELATION { Attributes } Relation_Next','Relation',5,'p_Relation','parser.py',171),
41 | ('Relation_Next -> ModNode','Relation_Next',1,'p_Relation_Next','parser.py',182),
42 | ('Relation_Next -> ( SubNode )','Relation_Next',3,'p_Relation_Next','parser.py',183),
43 | ('ModNode -> Child','ModNode',1,'p_ModNode','parser.py',193),
44 | ('Child -> Description','Child',1,'p_Child','parser.py',200),
45 | ('Description -> { Attributes }','Description',3,'p_Description','parser.py',207),
46 | ('Description -> { Attributes } = IDENTIFIER','Description',5,'p_Description','parser.py',208),
47 | ('Attributes -> IDENTIFIER : REGEX','Attributes',3,'p_Attributes','parser.py',217),
48 | ('Attributes -> IDENTIFIER : REGEX , Attributes','Attributes',5,'p_Attributes','parser.py',218),
49 | ('Attributes -> empty','Attributes',1,'p_Attributes','parser.py',219),
50 | ('empty -> ','empty',0,'p_empty','parser.py',231),
51 | ]
52 |
--------------------------------------------------------------------------------
/negbio/ngrex/pattern.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import re
3 | import collections
4 |
5 | L = '<'
6 | R = '>'
7 | LEFT = '<'
8 | RIGHT = '>'
9 |
10 |
11 | class NgrexPattern(object):
12 | """
13 | A NgrexPattern is a tgrep-type pattern for matching node configurations in Networkx structures.
14 | """
15 |
16 | def __init__(self):
17 | self._pattern = None
18 |
19 | def finditer(self, graph):
20 | """
21 | Returns an iterator yielding MatcherObj instances over all matches for the ngrex pattern
22 | in graph.
23 |
24 | Args:
25 | graph(DiGraph): graph
26 |
27 | Yields:
28 | MatcherObj: an iterator yielding MatcherObj instances over all matches for the
29 | ngrex pattern in graph.
30 | """
31 | raise NotImplementedError('Should have implemented this')
32 |
33 | @property
34 | def pattern(self):
35 | """
36 | str: The pattern string from which the ngrex object was compiled.
37 | """
38 | return self._pattern
39 |
40 | def __str__(self):
41 | return self.pattern
42 |
43 |
44 | class NodePattern(NgrexPattern):
45 | def __init__(self, attributes, name=None):
46 | super(NodePattern, self).__init__()
47 | self._name = name
48 | self._attributes = _get_attributes_regex(attributes)
49 | self._pattern = '{' + _attributes_to_str(self._attributes) + '}'
50 | if name:
51 | self._pattern += '=' + name
52 |
53 | def finditer(self, graph):
54 | for node in graph.nodes():
55 | if self._attributes:
56 | if _match(self._attributes, graph.node[node]):
57 | yield MatcherObj(self, graph, [(self._name, node)])
58 | else:
59 | yield MatcherObj(self, graph, [(self._name, node)])
60 |
61 |
62 | class EdgePattern(NgrexPattern):
63 |     def __init__(self, governor, dependant, edge_attributes, direction=LEFT):
64 |         """
65 |         Args:
66 |             direction(str): right if 'governor >edge dependant',
67 |                 left if 'dependant <edge governor'
68 |         """
69 |         super(EdgePattern, self).__init__()
70 |         self._governor = governor
71 |         self._dependant = dependant
72 |         self._edge_attributes = _get_attributes_regex(edge_attributes)
73 |         self._direction = direction
74 |         if direction == LEFT:
75 |             args = (dependant, '<', governor)
76 |         else:
77 |             args = (governor, '>', dependant)
78 |         self._pattern = '({args[0].pattern}) {args[1]}{{{edge}}} ({args[2].pattern})'.format(
79 |             args=args, edge=_attributes_to_str(self._edge_attributes))
80 |
81 | def finditer(self, graph):
82 | governors = self._governor.finditer(graph)
83 | dependants = self._dependant.finditer(graph)
84 | for g, d in itertools.product(governors, dependants):
85 | for p, c, e in graph.edges(data=True):
86 | if p == g.group(0) and c == d.group(0):
87 | if _match(self._edge_attributes, e):
88 | if self._direction == LEFT:
89 | yield MatcherObj(self, graph, d._nodes + g._nodes)
90 | else:
91 | yield MatcherObj(self, graph, g._nodes + d._nodes)
92 |
93 |
94 | class CoordinationPattern(NgrexPattern):
95 | def __init__(self, pattern1, pattern2, is_conj=True):
96 | """
97 | Args:
98 | is_conj(bool): if is_conj is true, then it is an "AND"; otherwise, it is an "OR".
99 | """
100 | super(CoordinationPattern, self).__init__()
101 | self._pattern1 = pattern1
102 | self._pattern2 = pattern2
103 | self._is_conj = is_conj
104 | self._pattern = '{} {} {}'.format(pattern2.pattern,
105 | '&' if is_conj else '|',
106 | pattern1.pattern)
107 |
108 | def finditer(self, graph):
109 | if self._is_conj:
110 | matchers1 = self._pattern1.finditer(graph)
111 | matchers2 = self._pattern2.finditer(graph)
112 | for m1, m2 in itertools.product(matchers1, matchers2):
113 | if m1.group(0) == m2.group(0):
114 | nodes = list(m1._nodes)
115 | if len(m2._nodes) > 2:
116 | nodes.extend(m2._nodes[1:])
117 | yield MatcherObj(self, graph, nodes)
118 | else:
119 | for m in self._pattern1.finditer(graph):
120 | yield m
121 | for m in self._pattern2.finditer(graph):
122 | yield m
123 |
124 |
125 | class MatcherObj(object):
126 | """
127 | Match objects always have a boolean value of True.
128 | """
129 |
130 | def __init__(self, pattern, graph, nodes):
131 | """
132 | Args:
133 | nodes(list): [(name, node)]
134 | """
135 | self._pattern = pattern
136 | self._graph = graph
137 | self._nodes = nodes
138 |
139 |     def __bool__(self):
140 |         return True
141 | 
142 |     __nonzero__ = __bool__  # Python 2 compatibility
141 |
142 | def group(self, index):
143 | """
144 | Returns the input node captured by the given group during the previous match operation.
145 | """
146 | return self._nodes[index][1]
147 |
148 | def groups(self):
149 | """
150 | Returns a list containing all the subgroups of the match, from 0 up to however many nodes
151 | are in the pattern.
152 | """
153 |         return [node[1] for node in self._nodes]
154 |
155 | def get(self, name):
156 | for node in self._nodes:
157 | if node[0] == name:
158 | return node[1]
159 | raise KeyError(name)
160 |
161 | @property
162 | def pattern(self):
163 | """
164 | The expression object whose `finditer()` produced this instance
165 | """
166 | return self._pattern
167 |
168 | @property
169 | def graph(self):
170 | """
171 | The graph passed to `finditer()`
172 | """
173 | return self._graph
174 |
175 |
176 | def validate_names(pattern):
177 | def _helper(p, names):
178 | if isinstance(p, NodePattern):
179 | if p._name in names:
180 | raise KeyError(p._name)
181 | if p._name:
182 | names.add(p._name)
183 | elif isinstance(p, EdgePattern):
184 | _helper(p._governor, names)
185 | _helper(p._dependant, names)
186 | elif isinstance(p, CoordinationPattern):
187 | _helper(p._pattern1, names)
188 | _helper(p._pattern2, names)
189 | _helper(pattern, set())
190 |
191 |
192 | def _get_attributes_regex(attributes):
193 | def _get_regex(v):
194 | v = v[1:-1]
195 | if v:
196 | if v[0] != '^':
197 | v = '^' + v
198 | if v[-1] != '$':
199 | v += '$'
200 | return re.compile(v)
201 | return {k: _get_regex(v) for k, v in attributes.items()}
202 |
203 |
204 | def _match(attributes, element):
205 | for k, v in attributes.items():
206 | if k not in element or not v.match(element[k]):
207 | return False
208 | return True
209 |
210 |
211 | def _attributes_to_str(attributes):
212 | return ','.join(['{}:/{}/'.format(k, v.pattern) for k, v in attributes.items()])
213 |
--------------------------------------------------------------------------------
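
Named nodes (`=name` in a pattern) can be pulled out of a match with `MatcherObj.get`, which is how `neg_detector` retrieves the `=key` node in its rules. A sketch for the "rule out XXX" pattern (networkx 1.x):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='rule', tag='VB')
G.add_node('T1', lemma='pneumonia', tag='NN')
G.add_edge('T0', 'T1', dependency='nmod:out')

pattern = ngrex.compile('{} <{dependency:/nmod:out/} {lemma:/rule/}=key')
m = next(pattern.finditer(G))
assert m.group(0) == 'T1'    # the mention being negated
assert m.get('key') == 'T0'  # the named 'rule' node
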
/negbio/patterns/neg_patterns.txt:
--------------------------------------------------------------------------------
1 |
2 | {} >{dependency:/neg/} {}
3 | {} >{} {lemma:/no/}
4 | {} >{dependency:/case/} {lemma:/without/}
5 |
6 | # rather than XXX
7 | {} <{dependency:/conj:negcc/} {}
8 | {} <{dependency:/nmod:without/} {}
9 | {} <{dependency:/conj:versus/} {}
10 | {} <{dependency:/nmod:without|nmod:of/} {lemma:/clear|clearing/}=key
11 | {} <{dependency:/nmod:out/} {lemma:/rule/}=key
12 | {} <{dependency:/nmod:of/} {lemma:/history|free|disappearance|resolution|drainage|resolution|removal/}
13 | {} <{dependency:/nmod:for/} {lemma:/negative/}
14 | {} <{} {lemma:/resolve|resolving|exclude/}=key
15 | {} <{dependency:/advmod|dep|conj:or/} {lemma:/no/}
16 |
17 | # XXX has resolved
18 | {} <{dependency:/nsubj/} ({lemma:/resolve/}=key >{dependency:/aux/} {})
19 |
20 | # there is no XXX
21 | {} <{dependency:/nsubj/} ({lemma:/be/} >{} {lemma:/no/})
22 |
23 | # without evidence|finding of|for XXX
24 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} <{dependency:/nmod:without/} {})
25 |
26 | # no evidence of|for XXX
27 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence/} >{dependency:/neg/} {})
28 |
29 | # without evidence|finding of|for XXX
30 | {} <{dependency:/nmod:of|nmod:for/} ({lemma:/evidence|finding/} >{} {lemma:/without/})
31 |
32 | # no focus of XXX
33 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{dependency:/neg/} {})
34 | {} <{dependency:/nmod:of/} ({lemma:/focus/} >{} {lemma:/no/})
35 |
36 | # no moderate to XXX
37 | {} <{dependency:/nmod:to/} ({lemma:/moderate/} >{dependency:/neg/} {})
38 |
39 | # no evidence of developing XXX
40 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} <{dependency:/nmod:without/} {}))
41 | {} <{} ({lemma:/developing/} <{} ({lemma:/evidence/} >{} {lemma:/no/}))
42 |
43 | # no focal XXX
44 | {} <{dependency:/dobj/} ({} >{dependency:/nsubj/} {lemma:/no/})
45 |
46 | # do not demonstrate|visualize XXX
47 | # XXX is not demonstrated/visualized
48 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{dependency:/neg/} {})
49 |
50 | # XXX is previously demonstrated/visualized
51 | {} <{dependency:/dobj|nsubjpass/} ({lemma:/demonstrate|visualize/} >{} {lemma:/previously/})
52 |
53 | # there is no NN to suggest/explain XXX
54 | {} <{dependency:/dobj/} ({tag:/V.*/} <{} ({tag:/N.*/} >{dependency:/neg/} {}))
55 |
56 | # no NN to suggest/explain XXX
57 | {} <{dependency:/dobj/} ({tag:/V.*/} >{} ({tag:/N.*/} >{dependency:/neg/} {}))
--------------------------------------------------------------------------------
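
To see one of these rules fire, here is a sketch matching the "no evidence of XXX" pattern above against a hand-built graph for "no evidence of pneumothorax" (networkx 1.x):

import networkx as nx

from negbio import ngrex

G = nx.DiGraph()
G.add_node('T0', lemma='no', tag='DT')
G.add_node('T1', lemma='evidence', tag='NN')
G.add_node('T2', lemma='pneumothorax', tag='NN')
G.add_edge('T1', 'T0', dependency='neg')
G.add_edge('T1', 'T2', dependency='nmod:of')

rule = ngrex.compile('{} <{dependency:/nmod:of/} ({lemma:/evidence/} >{dependency:/neg/} {})')
assert next(rule.finditer(G)).group(0) == 'T2'  # 'pneumothorax' is negated
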
/negbio/patterns/uncertainty_patterns.txt:
--------------------------------------------------------------------------------
1 | # outgoing edge
2 | {} >{} {lemma:/possible|possibly|presumably|probable|questionable|suspect|suspected|suspicious/}
3 | {} >{} {lemma:/question/}
4 |
5 | # {} >{dependency:/cop/} {lemma:/may|would|could/}
6 |
7 | # incoming edge
8 | {} <{dependency:/nmod:of/} {lemma:/question|suggestion/}
9 | {} <{dependency:/dobj/} {lemma:/suspect|favor|suggest|suggesting|question|consider/}
10 | {} <{dependency:/nmod:for/} {lemma:/concern|suspicion/}
11 | {} <{dependency:/nsubjpass/} {lemma:/suspect/}
12 | {} <{} {lemma:/possible/}
13 |
14 | # parsing error
15 | # suspected XXX
16 | {} <{dependency:/dobj/} {lemma:/suspect/}
17 | {} >{dependency:/advmod/} {lemma:/suspect/}
18 |
19 | # maybe due to XXX
20 | {} <{dependency:/dep/} {lemma:/maybe/}
21 |
22 | # may/could represent/reflect/indicate/include XXX
23 | {} <{} ({lemma:/reflect|represent|indicate|include/} >{} {lemma:/may|could|would/})
24 |
25 | # maybe secondary to XXX
26 | {} <{dependency:/nmod:to/} {lemma:/secondary/}
27 |
28 | # may be due to XXX
29 | {} <{dependency:/nmod:to/} ({lemma:/due/} >{} {lemma:/can|could|may|would|possibly/})
30 |
31 | # could related to XXX
32 | {} <{dependency:/nmod:to/} ({lemma:/relate/} >{} {lemma:/can|could|may|would|possibly/})
33 |
34 | # may be compatible with XXX
35 | {} <{dependency:/nmod:with/} ({lemma:/compatible/} >{} {lemma:/be|could|may|would/})
36 |
37 | # question left XXX
38 | {} <{dependency:/dobj/} ({lemma:/left/} <{} {lemma:/question/})
39 | {} >{} {lemma:/left/} <{} {lemma:/question/}
40 |
41 | # cannot exclude XXX
42 | {} <{dependency:/dobj/} ({lemma:/exclude/} >{} {lemma:/cannot/})
43 |
44 | # cannot rule out XXX
45 | {} <{dependency:/nmod:out/} ({lemma:/rule/} >{} {lemma:/cannot/})
46 |
47 | # XXX is not excluded
48 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{dependency:/neg/} {})
49 | {} <{dependency:/nsubjpass/} ({lemma:/exclude/} >{} {lemma:/cannot/})
50 |
51 | # differential diagnosis includes
52 | {} <{dependency:/dobj/} ({lemma:/include/} >{} ({lemma:/diagnosis/} >{} {lemma:/differential/}))
53 |
54 | # may be XXX
55 | {} <{} {lemma:/be/} >{} {lemma:/may|could|would/}
56 |
57 | # parsing error
58 | # XXX suspected
59 | {} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}
60 |
61 | # suggestive of XXX
62 | # {} <{dependency:/nmod:of/} {lemma:/suggestive/}
--------------------------------------------------------------------------------
/negbio/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/negbio/pipeline/__init__.py
--------------------------------------------------------------------------------
/negbio/pipeline/cleanup.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def clean_sentences(document, sort_anns=False):
5 | """
6 | Remove sentences in each passage
7 |
8 | Args:
9 | document(BioCDocument): a document
10 | sort_anns(bool): sort ann by its location
11 | """
12 | try:
13 | for passage in document.passages:
14 | del passage.sentences[:]
15 |
16 | if sort_anns:
17 |             key_func = lambda ann: ann.get_total_location().offset
18 |             ann_id = 0
19 |             for passage in document.passages:
20 |                 for ann in sorted(passage.annotations, key=key_func):
21 |                     ann.id = str(ann_id)
22 |                     ann_id += 1
23 |     except Exception:
24 | logging.exception("Cannot process %s", document.id)
25 | return document
26 |
--------------------------------------------------------------------------------
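
A minimal check that `clean_sentences` drops sentences while leaving passage annotations alone, assuming the `bioc` package's no-argument constructors:

import bioc

from negbio.pipeline.cleanup import clean_sentences

doc = bioc.BioCDocument()
doc.id = '1'
passage = bioc.BioCPassage()
passage.offset = 0
sentence = bioc.BioCSentence()
sentence.offset = 0
sentence.text = 'No pneumothorax.'
passage.sentences.append(sentence)
doc.passages.append(passage)

clean_sentences(doc)
assert doc.passages[0].sentences == []
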
/negbio/pipeline/dner_mm.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import itertools
3 | import logging
4 | import re
5 |
6 | import bioc
7 |
8 |
9 | def remove_newline(s):
10 | return re.sub(r'[\n\r]', ' ', s)
11 |
12 |
13 | def adapt_concept_index(index):
14 | m = re.match(r"'.*?'", index)
15 | if m:
16 | return index[1:-1]
17 | m = re.match(r"'.*", index)
18 | if m:
19 | return index[1:]
20 | return index
21 |
22 |
23 | def run_metamap_col(collection, mm, cuis=None, extra_args=None):
24 | """
25 | Get CUIs from metamap.
26 |
27 |     Args:
28 |         collection(BioCCollection): input collection
29 |         mm(MetaMap): MetaMap instance
30 |         cuis(set): CUIs to keep, or None to keep all concepts
31 |         extra_args(dict): extra keyword arguments for MetaMap, or None
30 |
31 | Returns:
32 | BioCCollection
33 | """
34 | try:
35 | annIndex = itertools.count()
36 | sentence_map = collections.OrderedDict()
37 | for document in collection.documents:
38 | for passage in document.passages:
39 | for sentence in passage.sentences:
40 | sentence_map['{}-{}'.format(document.id.replace('.', '-'), sentence.offset)] = (passage, sentence)
41 |
42 | sents = []
43 | ids = []
44 | for k in sentence_map:
45 | ids.append(k)
46 | sents.append(remove_newline(sentence_map[k][1].text))
47 |
48 | if extra_args is None:
49 | concepts, error = mm.extract_concepts(sents, ids)
50 | else:
51 | concepts, error = mm.extract_concepts(sents, ids, **extra_args)
52 |
53 | if error is None:
54 | for concept in concepts:
55 | concept_index = adapt_concept_index(concept.index)
56 | try:
57 |                         if cuis is not None:
58 |                             # skip concepts whose CUI is missing or not in the requested set
59 |                             concept_cui = getattr(concept, 'cui', None)
60 |                             if concept_cui not in cuis:
61 |                                 continue
62 | m = re.match(r'(\d+)/(\d+)', concept.pos_info)
63 | if m:
64 | passage = sentence_map[concept_index][0]
65 | sentence = sentence_map[concept_index][1]
66 | start = int(m.group(1)) - 1
67 | length = int(m.group(2))
68 | ann = bioc.BioCAnnotation()
69 | ann.id = str(next(annIndex))
70 | ann.infons['CUI'] = concept.cui
71 | ann.infons['semtype'] = concept.semtypes[1:-1]
72 | ann.infons['term'] = concept.preferred_name
73 | ann.infons['annotator'] = 'MetaMap'
74 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length))
75 | ann.text = sentence.text[start:start+length]
76 | passage.annotations.append(ann)
77 |                         except Exception:
78 |                             logging.exception('Cannot process concept %s', concept_index)
79 |     except Exception:
80 |         logging.exception("Cannot process %s", collection.source)
81 | return collection
82 |
83 |
84 | def run_metamap(document, mm, cuis=None):
85 | """
86 | Get CUIs from metamap.
87 |
88 | Args:
89 | document(BioCDocument):
90 | mm(MetaMap): MetaMap instance
91 |
92 | Returns:
93 | BioCDocument
94 | """
95 | try:
96 | annIndex = itertools.count()
97 | sentence_map = collections.OrderedDict()
98 | for passage in document.passages:
99 | for sentence in passage.sentences:
100 | sentence_map[str(sentence.offset)] = (passage, sentence)
101 |
102 | sents = []
103 | ids = []
104 | for k in sentence_map:
105 | ids.append(k)
106 | sents.append(remove_newline(sentence_map[k][1].text))
107 |
108 | concepts, error = mm.extract_concepts(sents, ids)
109 | if error is None:
110 | for concept in concepts:
111 | concept_index = adapt_concept_index(concept.index)
112 | try:
113 | if cuis is not None and concept.cui not in cuis:
114 | continue
115 | m = re.match(r'(\d+)/(\d+)', concept.pos_info)
116 | if m:
117 | passage = sentence_map[concept_index][0]
118 | sentence = sentence_map[concept_index][1]
119 | start = int(m.group(1)) - 1
120 | length = int(m.group(2))
121 | ann = bioc.BioCAnnotation()
122 | ann.id = str(next(annIndex))
123 | ann.infons['CUI'] = concept.cui
124 | ann.infons['semtype'] = concept.semtypes[1:-1]
125 | ann.infons['term'] = concept.preferred_name
126 | ann.infons['annotator'] = 'MetaMap'
127 | ann.add_location(bioc.BioCLocation(sentence.offset + start, length))
128 | ann.text = sentence.text[start:start+length]
129 | passage.annotations.append(ann)
130 |                         except Exception:
131 |                             logging.exception('Cannot process concept %s', concept_index)
132 |     except Exception:
133 |         logging.exception("Cannot process %s", document.id)
134 | return document
135 |
--------------------------------------------------------------------------------
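
`adapt_concept_index` strips the quoting that pymetamap sometimes leaves on concept indices, and `pos_info` is interpreted as `start/length` with a 1-based start. A few sanity checks:

from negbio.pipeline.dner_mm import adapt_concept_index, remove_newline

assert adapt_concept_index("'00000086-23'") == '00000086-23'  # fully quoted
assert adapt_concept_index("'00000086-23") == '00000086-23'   # dangling quote
assert adapt_concept_index('00000086-23') == '00000086-23'    # unquoted
assert remove_newline('no acute\ndisease') == 'no acute disease'
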
/negbio/pipeline/negdetect.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 |
4 | from negbio.neg.neg_detector import Detector
5 |
6 |
7 | def neg_mesh(annotations):
8 | """
9 | Detect negative MeSH
10 | """
11 | for ann in annotations:
12 | if ann.infons.get('CUI', None) == 'C0332125':
13 | ann.infons[Detector.NEGATION] = 'True'
14 |
15 |
16 | def uncertain_mesh(annotations):
17 | """
18 | Detect uncertain MeSH
19 | """
20 | for ann in annotations:
21 | if ann.infons.get('CUI', None) == 'C0332148':
22 | ann.infons[Detector.UNCERTAINTY] = 'True'
23 |
24 |
25 | def is_neg_regex(text):
26 | if re.search(r'^(findings|impression): no ', text, re.I):
27 | return True
28 | return False
29 |
30 |
31 | def _mark_anns(annotations, begin, end, type):
32 | """Mark all annotations in [begin:end] as type"""
33 | for ann in annotations:
34 | total_loc = ann.get_total_location()
35 | if begin <= total_loc.offset and total_loc.offset + total_loc.length <= end:
36 | ann.infons[type] = 'True'
37 |
38 |
39 | def _extend(document, type):
40 | def _is_type(annotation):
41 | return annotation.infons.get(type, None) == 'True'
42 |
43 | neg_anns = []
44 | for passage in document.passages:
45 | for ann in passage.annotations:
46 | if _is_type(ann):
47 | neg_anns.append(ann)
48 |
49 | for passage in document.passages:
50 | for ann in passage.annotations:
51 | if not _is_type(ann):
52 | for nann in neg_anns:
53 | if ann in nann:
54 | ann.infons[type] = 'True'
55 | break
56 | if nann in ann and 'CUI' in ann and 'CUI' in nann and ann.infons['CUI'] == nann.infons['CUI']:
57 | ann.infons[type] = 'True'
58 | break
59 |
60 |
61 | def detect(document, detector):
62 | """
63 | Args:
64 | document(BioCDocument):
65 | detector(Detector): detector. Define customized patterns in the detector
66 | """
67 | try:
68 |
69 | for passage in document.passages:
70 | neg_mesh(passage.annotations)
71 | uncertain_mesh(passage.annotations)
72 |
73 | locs = []
74 | for ann in passage.annotations:
75 | total_loc = ann.get_total_location()
76 | locs.append((total_loc.offset, total_loc.offset + total_loc.length))
77 |
78 | for sentence in passage.sentences:
79 | if is_neg_regex(sentence.text):
80 | _mark_anns(passage.annotations, sentence.offset, sentence.offset + len(sentence.text),
81 | Detector.NEGATION)
82 | continue
83 | for name, matcher, loc in detector.detect(sentence, locs):
84 | logging.debug('Find: %s, %s, %s', name, matcher.pattern, loc)
85 | _mark_anns(passage.annotations, loc[0], loc[1], name)
86 |
87 | # _extend(document, Detector.NEGATION)
88 | # _extend(document, Detector.UNCERTAINTY)
89 |     except Exception:
90 | logging.exception("Cannot process %s", document.id)
91 | return document
92 |
--------------------------------------------------------------------------------
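
Before consulting the dependency patterns, detect() above short-circuits sentences that open with a negated section header via is_neg_regex; every annotation covered by such a sentence is marked negated. A small sketch of that shortcut:

    from negbio.pipeline.negdetect import is_neg_regex

    assert is_neg_regex('IMPRESSION: no acute cardiopulmonary process.')
    assert is_neg_regex('findings: no pleural effusion or pneumothorax.')
    assert not is_neg_regex('FINDINGS: stable cardiomegaly.')
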
/negbio/pipeline/parse.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, absolute_import
2 |
3 | import logging
4 | import os
5 | import tempfile
6 |
7 | from bllipparser import ModelFetcher
8 | from bllipparser import RerankingParser
9 |
10 |
11 | class Bllip(object):
12 | def __init__(self, model_dir=None):
13 | if model_dir is None:
14 | logging.debug("downloading GENIA+PubMed model if necessary ...")
15 | model_dir = ModelFetcher.download_and_install_model(
16 | 'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
17 | self.model_dir = os.path.expanduser(model_dir)
18 |
19 |         logging.debug('loading model %s ...', self.model_dir)
20 | self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
21 |
22 | def parse(self, s):
23 |         """Parse one sentence text with the BLLIP reranking parser.
24 |
25 | Args:
26 | s(str): one sentence
27 |
28 | Returns:
29 |             Tree: the PTB parse tree of the best-scoring parse; None if parsing failed
30 | """
31 | if not s:
32 | raise ValueError('Cannot parse empty sentence: {}'.format(s))
33 |
34 | nbest = self.rrp.parse(str(s))
35 | if nbest:
36 | return nbest[0].ptb_parse
37 |
38 | return None
39 |
40 |
41 | class NegBioParser(Bllip):
42 | def parse_doc(self, document):
43 | """
44 | Parse sentences in BioC format
45 |
46 | Args:
47 | document(BioCDocument): one document
48 |
49 | Returns:
50 | BioCDocument
51 | """
52 | for passage in document.passages:
53 | for sentence in passage.sentences:
54 | text = sentence.text
55 | tree = self.parse(text)
56 | if tree:
57 | sentence.infons['parse tree'] = str(tree)
58 | else:
59 | sentence.infons['parse tree'] = None
60 |                     logging.warning(
61 | 'No parse tree for sentence: %s', sentence.offset)
62 | return document
63 |
--------------------------------------------------------------------------------
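
A minimal sketch of driving the parser; when no model_dir is given, the first construction downloads the GENIA+PubMed model into the system temp directory, so this assumes network access and an installed bllipparser:

    from negbio.pipeline.parse import NegBioParser

    parser = NegBioParser()  # downloads GENIA+PubMed on first use if needed
    tree = parser.parse('No pneumothorax or pleural effusion.')
    if tree is not None:
        print(str(tree))     # bracketed PTB parse
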
/negbio/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 |
2 | from negbio.pipeline import cleanup, dner_mm, negdetect, section_split, text2bioc
3 | from negbio.ext import normalize_mimiccxr
4 |
5 |
6 | def process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns):
 7 |     for i, document in enumerate(collection.documents):
 8 |         normalize_mimiccxr.normalize(document)
 9 |         collection.documents[i] = document = section_split.split_document(document, sec_title_patterns)
10 |         splitter.split_doc(document)
11 | 
12 |     dner_mm.run_metamap_col(collection, metamap, cuis)
13 | 
14 |     for document in collection.documents:
15 |         document = parser.parse_doc(document)
16 |         document = ptb2dep.convert_doc(document)
17 |         document = negdetect.detect(document, neg_detector)
18 |         cleanup.clean_sentences(document)
19 |
20 | return collection
21 |
22 |
23 | def process_text(sources, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns):
24 | collection = text2bioc.text2collection(*sources)
25 | return process_collection(collection, metamap, splitter, parser, ptb2dep, lemmatizer, neg_detector, cuis, sec_title_patterns)
26 |
--------------------------------------------------------------------------------
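
A hedged wiring sketch for process_text. The MetaMap binary path and the input file names are hypothetical, and Detector() is assumed to be constructible with its default pattern files; otherwise pass explicit paths to the negbio/patterns/*.txt files:

    from pymetamap import MetaMap
    from negbio.neg.neg_detector import Detector
    from negbio.pipeline import pipeline
    from negbio.pipeline.parse import NegBioParser
    from negbio.pipeline.ptb2ud import Lemmatizer, NegBioPtb2DepConverter
    from negbio.pipeline.ssplit import NegBioSSplitter

    mm = MetaMap.get_instance('/opt/public_mm/bin/metamap')  # hypothetical path
    splitter = NegBioSSplitter(newline=True)
    parser = NegBioParser()
    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    detector = Detector()  # assumed default pattern files

    collection = pipeline.process_text(
        ['report1.txt', 'report2.txt'],  # hypothetical input files
        mm, splitter, parser, ptb2dep, lemmatizer, detector,
        cuis=None, sec_title_patterns=None)
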
/negbio/pipeline/ptb2ud.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import StanfordDependencies
4 | import bioc
5 | from nltk.corpus import wordnet
6 | from nltk.stem.wordnet import WordNetLemmatizer
7 | from nltk.tag.mapping import tagset_mapping
8 |
9 |
10 | class Lemmatizer(object):
11 | def __init__(self):
12 | self.wordnet_lemmatizer = WordNetLemmatizer()
13 | self.mapping = tagset_mapping('en-ptb', 'universal')
14 |
15 | def lemmatize(self, word, pos=None):
16 | """
17 | Determines the lemma for a given word
18 |
19 | Args:
20 | word(str): word
21 | pos(str): part-of-speech
22 |
23 | Returns:
24 | str: lemma
25 | """
26 | if pos:
27 | return self.wordnet_lemmatizer.lemmatize(word=word, pos=pos)
28 | else:
29 | return self.wordnet_lemmatizer.lemmatize(word=word)
30 |
31 | def map_tag(self, tag):
32 | if tag in self.mapping:
33 | tag = self.mapping[tag]
34 | if tag == 'NOUN':
35 | return wordnet.NOUN
36 | elif tag == 'VERB':
37 | return wordnet.VERB
38 | elif tag == 'ADJ':
39 | return wordnet.ADJ
40 | elif tag == 'ADV':
41 | return wordnet.ADV
42 | elif tag == 'ADJ_SAT':
43 | return wordnet.ADJ_SAT
44 | return None
45 |
46 |
47 | class Ptb2DepConverter(object):
48 | """
49 | Convert ptb trees to universal dependencies
50 | """
51 |
52 | basic = 'basic'
53 | collapsed = 'collapsed'
54 | CCprocessed = 'CCprocessed'
55 | collapsedTree = 'collapsedTree'
56 |
57 | def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
58 | """
59 | Args:
60 | representation(str): Currently supported representations are
61 | 'basic', 'collapsed', 'CCprocessed', and 'collapsedTree'
62 | universal(bool): if True, use universal dependencies if they're available
63 | """
64 | try:
65 | import jpype
66 | self._backend = 'jpype'
67 | except ImportError:
68 | self._backend = 'subprocess'
69 | self.lemmatizer = lemmatizer
70 | self.__sd = StanfordDependencies.get_instance(backend=self._backend)
71 | self.representation = representation
72 | self.universal = universal
73 |
74 | def convert(self, parse_tree):
75 | """
76 |         Convert a single PTB parse tree to a dependency graph
77 |
78 | Args:
79 | parse_tree(str): parse tree in PTB format
80 |
81 | Examples:
82 | (ROOT (NP (JJ hello) (NN world) (. !)))
83 | """
84 | if self._backend == 'jpype':
85 | dependency_graph = self.__sd.convert_tree(parse_tree,
86 | representation=self.representation,
87 | universal=self.universal,
88 | add_lemmas=True)
89 | else:
90 | dependency_graph = self.__sd.convert_tree(parse_tree,
91 | representation=self.representation,
92 | universal=self.universal)
93 | return dependency_graph
94 |
95 |
96 | class NegBioPtb2DepConverter(Ptb2DepConverter):
97 | def __init__(self, lemmatizer, representation='CCprocessed', universal=False):
98 | """
99 | Args:
100 | lemmatizer (Lemmatizer)
101 | """
102 | super(NegBioPtb2DepConverter, self).__init__(
103 | lemmatizer, representation, universal)
104 |
105 | def convert_doc(self, document):
106 | for passage in document.passages:
107 | for sentence in passage.sentences:
108 | # check for empty infons, don't process if empty
109 | # this sometimes happens with poorly tokenized sentences
110 | if not sentence.infons:
111 | continue
112 |                 elif not sentence.infons.get('parse tree'):
113 | continue
114 |
115 | try:
116 | dependency_graph = self.convert(
117 | sentence.infons['parse tree'])
118 | anns, rels = convert_dg(dependency_graph, sentence.text,
119 | sentence.offset,
120 | has_lemmas=self._backend == 'jpype')
121 | sentence.annotations = anns
122 | sentence.relations = rels
123 | except KeyboardInterrupt:
124 | raise
125 |                 except Exception:
126 | logging.exception(
127 | "Cannot process sentence %d in %s", sentence.offset, document.id)
128 |
129 | if self._backend != 'jpype':
130 | for ann in sentence.annotations:
131 | text = ann.text
132 | pos = ann.infons['tag']
133 | pos = self.lemmatizer.map_tag(pos)
134 | lemma = self.lemmatizer.lemmatize(word=text, pos=pos)
135 | ann.infons['lemma'] = lemma.lower()
136 | return document
137 |
138 |
139 | def adapt_value(value):
140 | """
141 | Adapt string in PTB
142 | """
143 | value = value.replace("-LRB-", "(")
144 | value = value.replace("-RRB-", ")")
145 | value = value.replace("-LSB-", "[")
146 | value = value.replace("-RSB-", "]")
147 | value = value.replace("-LCB-", "{")
148 | value = value.replace("-RCB-", "}")
149 | value = value.replace("-lrb-", "(")
150 | value = value.replace("-rrb-", ")")
151 | value = value.replace("-lsb-", "[")
152 | value = value.replace("-rsb-", "]")
153 | value = value.replace("``", "\"")
154 | value = value.replace("''", "\"")
155 | value = value.replace("`", "'")
156 | return value
157 |
158 |
159 | def convert_dg(dependency_graph, text, offset, ann_index=0, rel_index=0, has_lemmas=True):
160 | """
161 |     Convert a dependency graph to BioC annotations and relations; returns None if the tokens cannot be aligned with the text
162 | """
163 | annotations = []
164 | relations = []
165 | annotation_id_map = {}
166 | start = 0
167 | for node in dependency_graph:
168 | if node.index in annotation_id_map:
169 | continue
170 | node_form = node.form
171 | index = text.find(node_form, start)
172 | if index == -1:
173 | node_form = adapt_value(node.form)
174 | index = text.find(node_form, start)
175 | if index == -1:
176 | logging.debug('Cannot convert parse tree to dependency graph at %d\n%d\n%s',
177 | start, offset, str(dependency_graph))
178 | return
179 |
180 | ann = bioc.BioCAnnotation()
181 | ann.id = 'T{}'.format(ann_index)
182 | ann.text = node_form
183 | ann.infons['tag'] = node.pos
184 | if has_lemmas:
185 | ann.infons['lemma'] = node.lemma.lower()
186 |
187 | start = index
188 |
189 | ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
190 | annotations.append(ann)
191 | annotation_id_map[node.index] = ann_index
192 | ann_index += 1
193 | start += len(node_form)
194 |
195 | for node in dependency_graph:
196 | if node.head == 0:
197 | ann = annotations[annotation_id_map[node.index]]
198 | ann.infons['ROOT'] = True
199 | continue
200 | relation = bioc.BioCRelation()
201 | relation.id = 'R{}'.format(rel_index)
202 | relation.infons['dependency'] = node.deprel
203 | if node.extra:
204 | relation.infons['extra'] = node.extra
205 | relation.add_node(bioc.BioCNode('T{}'.format(
206 | annotation_id_map[node.index]), 'dependant'))
207 | relation.add_node(bioc.BioCNode('T{}'.format(
208 | annotation_id_map[node.head]), 'governor'))
209 | relations.append(relation)
210 | rel_index += 1
211 |
212 | return annotations, relations
213 |
--------------------------------------------------------------------------------
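
A self-contained sketch of convert_dg using stand-in token objects; the attribute names (index, form, pos, lemma, head, deprel, extra) mirror the tokens PyStanfordDependencies returns, with head == 0 marking the root:

    import collections

    from negbio.pipeline.ptb2ud import convert_dg

    Token = collections.namedtuple('Token', 'index form pos lemma head deprel extra')
    dg = [Token(1, 'hello', 'UH', 'hello', 2, 'discourse', None),
          Token(2, 'world', 'NN', 'world', 0, 'root', None)]

    anns, rels = convert_dg(dg, 'hello world', offset=0)
    assert anns[1].infons.get('ROOT') is True
    assert rels[0].infons['dependency'] == 'discourse'
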
/negbio/pipeline/scan.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 |
5 | import bioc
6 | import tqdm
7 |
8 |
9 | def scan_document(*_, **kwargs):
10 |     """
11 |     Scan each document in a list of BioC source files, apply fn, and
12 |     write each resulting collection to the output directory.
13 | 
14 |     Args:
15 |         kwargs:
16 |             source(list): a list of source pathnames
17 |             directory(str): output directory
18 |             suffix(str): suffix appended to each output file name,
19 |                 e.g. '.secsplit.xml'
20 |             fn: a callable invoked as fn(document, *non_sequences);
21 |                 its return value replaces the original document in
22 |                 the collection
23 |             non_sequences(list): extra arguments passed to fn after
24 |                 the document; defaults to []
25 |             verbose(boolean): if True, show a progress bar
26 |     """
27 | source = kwargs.pop('source')
28 | verbose = kwargs.pop('verbose', True)
29 | directory = os.path.expanduser(kwargs.pop('directory'))
30 | suffix = kwargs.pop('suffix')
31 | fn = kwargs.pop('fn')
32 | non_sequences = kwargs.pop('non_sequences', [])
33 |
34 | if not os.path.exists(directory):
35 | os.makedirs(directory)
36 |
37 |     def catch(document, non_sequences):
38 |         try:
39 |             return fn(document, *non_sequences)
40 |         except Exception:
41 |             logging.exception('Cannot process %s', document.id)
42 |             return document  # keep the unprocessed document on failure
43 | for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
44 | basename = os.path.splitext(os.path.basename(pathname))[0]
45 | dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
46 | with io.open(pathname, encoding='utf8') as fp:
47 | collection = bioc.load(fp)
48 | collection.documents = [catch(doc, non_sequences) for doc in collection.documents]
49 | with io.open(dstname, 'w', encoding='utf8') as fp:
50 | bioc.dump(collection, fp)
51 |
52 |
53 | def scan_collection(*_, **kwargs):
54 |     """
55 |     Scan each collection in a list of BioC source files, apply fn, and
56 |     write each resulting collection to the output directory.
57 | 
58 |     Args:
59 |         kwargs:
60 |             source(list): a list of source pathnames
61 |             directory(str): output directory
62 |             suffix(str): suffix appended to each output file name,
63 |                 e.g. '.neg.xml'
64 |             fn: a callable invoked as fn(collection, *non_sequences);
65 |                 it should modify the collection in place; the
66 |                 collection is written out even if fn raises
67 |             non_sequences(list): extra arguments passed to fn after
68 |                 the collection; defaults to []
69 |             verbose(boolean): if True, show a progress bar
70 |     """
71 | source = kwargs.pop('source')
72 | verbose = kwargs.pop('verbose', True)
73 | directory = os.path.expanduser(kwargs.pop('directory'))
74 | suffix = kwargs.pop('suffix')
75 | fn = kwargs.pop('fn')
76 | non_sequences = kwargs.pop('non_sequences', [])
77 |
78 | if not os.path.exists(directory):
79 | os.makedirs(directory)
80 |
81 | for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
82 | basename = os.path.splitext(os.path.basename(pathname))[0]
83 | dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
84 | with io.open(pathname, encoding='utf8') as fp:
85 | collection = bioc.load(fp)
86 | try:
87 | args = [collection] + non_sequences
88 | fn(*args)
89 |         except Exception:
90 | logging.exception('Cannot process %s', collection.source)
91 | with io.open(dstname, 'w', encoding='utf8') as fp:
92 | bioc.dump(collection, fp)
93 |
--------------------------------------------------------------------------------
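
A usage sketch for scan_document; the file and directory names are hypothetical. Each BioC file in source is loaded, every document is passed through fn with the non_sequences appended, and the collection is written to directory with the given suffix:

    from negbio.pipeline.scan import scan_document
    from negbio.pipeline.section_split import split_document

    scan_document(source=['reports/1.xml', 'reports/2.xml'],  # hypothetical inputs
                  directory='sections',
                  suffix='.secsplit.xml',
                  fn=split_document,
                  non_sequences=[None])  # pattern=None -> default SECTION_TITLES
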
/negbio/pipeline/section_split.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 |
4 | import bioc
5 |
6 |
7 | SECTION_TITLES = re.compile(r'('
8 | r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
9 | r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
10 | r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
11 | r'|TECHNIQUE'
12 | r'):|FINAL REPORT',
13 | re.IGNORECASE | re.MULTILINE)
14 |
15 |
16 | def is_empty(passage):
17 | return len(passage.text) == 0
18 |
19 |
20 | def strip(passage):
21 | start = 0
22 | while start < len(passage.text) and passage.text[start].isspace():
23 | start += 1
24 |
25 | end = len(passage.text)
26 | while end > start and passage.text[end - 1].isspace():
27 | end -= 1
28 |
29 | passage.offset += start
30 |     logging.debug('before: %r', passage.text)
31 |     passage.text = passage.text[start:end]
32 |     logging.debug('after: %r', passage.text)
33 | return passage
34 |
35 |
36 | def split_document(document, pattern=None):
37 |     """
38 |     Split one report into sections delimited by the section title pattern.
39 | 
40 |     Args:
41 |         document(BioCDocument): one document whose first passage holds the full report text.
42 |         pattern: compiled regular expression matching section titles; defaults to SECTION_TITLES.
43 | 
44 |     Returns:
45 |         BioCDocument: a new BioCDocument instance
46 |     """
47 | if pattern is None:
48 | pattern = SECTION_TITLES
49 |
50 | new_document = bioc.BioCDocument()
51 | new_document.id = document.id
52 | new_document.infons = document.infons
53 |
54 | text = document.passages[0].text
55 | offset = document.passages[0].offset
56 |
57 | def create_passage(start, end, title=None):
58 | passage = bioc.BioCPassage()
59 | passage.offset = start + offset
60 | passage.text = text[start:end]
61 | if title is not None:
62 | passage.infons['title'] = title[:-1].strip() if title[-1] == ':' else title.strip()
63 | passage.infons['type'] = 'title_1'
64 | strip(passage)
65 | return passage
66 |
67 | start = 0
68 | for matcher in pattern.finditer(text):
69 | logging.debug('Match: %s', matcher.group())
70 | # add last
71 | end = matcher.start()
72 | if end != start:
73 | passage = create_passage(start, end)
74 | if not is_empty(passage):
75 | new_document.add_passage(passage)
76 |
77 | start = end
78 |
79 | # add title
80 | end = matcher.end()
81 | passage = create_passage(start, end, text[start:end])
82 | if not is_empty(passage):
83 | new_document.add_passage(passage)
84 |
85 | start = end
86 |
87 | # add last piece
88 | end = len(text)
89 | if start < end:
90 | passage = create_passage(start, end)
91 | if not is_empty(passage):
92 | new_document.add_passage(passage)
93 | return new_document
94 |
--------------------------------------------------------------------------------
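
A minimal sketch: wrap raw text in a one-passage document with text2document (defined in negbio/pipeline/text2bioc.py below) and split it on the default titles:

    from negbio.pipeline.section_split import split_document
    from negbio.pipeline.text2bioc import text2document

    text = 'INDICATION: cough.\nFINDINGS: no effusion.\nIMPRESSION: normal.'
    document = split_document(text2document('example', text))
    for passage in document.passages:
        print(passage.offset, passage.infons.get('title'), repr(passage.text))
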
/negbio/pipeline/ssplit.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import bioc
4 |
5 |
6 | class NltkSSplitter(object):
7 | """NLTK sentence splitter"""
8 |
9 | def __init__(self, **kwargs):
10 | self.newline = kwargs.pop('newline', False)
11 |
12 | def split(self, text, **kwargs):
13 | import nltk
14 | if not text:
15 | return
16 |
17 | if self.newline:
18 | line_splitter = self.split_line
19 | else:
20 | line_splitter = self.no_split
21 |
22 | for line, line_offset in line_splitter(text):
23 | sent_list = nltk.sent_tokenize(line)
24 | offset = 0
25 | for sent in sent_list:
26 |                 offset = line.find(sent, offset)
27 |                 if offset == -1:
28 |                     logging.debug('Cannot find %s in %s', sent, text)
29 |                     continue
30 |                 yield sent, offset + line_offset
31 |                 offset += len(sent)
32 | @classmethod
33 | def split_line(cls, text, sep='\n'):
34 | lines = text.split(sep)
35 | offset = 0
36 | for line in lines:
37 | offset = text.index(line, offset)
38 | yield line, offset
39 |
40 | @classmethod
41 | def no_split(cls, text, **kwargs):
42 | yield text, 0
43 |
44 | def __repr__(self):
45 | return 'NLTK SSplitter'
46 |
47 |
48 | class NegBioSSplitter(NltkSSplitter):
49 | def split_doc(self, document):
50 | """
51 | Split text into sentences with offsets.
52 |
53 |         Args:
54 | document(BioCDocument): one document
55 |
56 | Returns:
57 | BioCDocument
58 | """
59 | for passage in document.passages:
60 | for text, offset in self.split(passage.text):
61 | sentence = bioc.BioCSentence()
62 | sentence.offset = offset + passage.offset
63 | sentence.text = text
64 | passage.add_sentence(sentence)
65 | # passage.text = None
66 | return document
67 |
--------------------------------------------------------------------------------
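
A minimal sentence-splitting sketch; it assumes the NLTK 'punkt' model is installed (see the downloader step in setup.py below):

    from negbio.pipeline.ssplit import NegBioSSplitter
    from negbio.pipeline.text2bioc import text2document

    splitter = NegBioSSplitter(newline=True)
    document = splitter.split_doc(
        text2document('example', 'No effusion. Heart size is normal.'))
    for sentence in document.passages[0].sentences:
        print(sentence.offset, sentence.text)
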
/negbio/pipeline/text2bioc.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import string
3 | from pathlib2 import Path
4 |
5 | import bioc
6 |
7 |
8 | def printable(s, func=None):
 9 |     """
10 |     Return the printable-ASCII version of s.
11 | 
12 |     Args:
13 |         s(str): string
14 |         func: function used to convert non-printable characters; if None, they are dropped with a warning
15 |     """
16 | out = ''
17 | for c in s:
18 | if c in string.printable:
19 | out += c
20 | elif func is not None:
21 | out += func(c)
22 | else:
23 | logging.warning('Cannot convert char: %s', c)
24 | return out
25 |
26 |
27 | def text2document(id, text):
28 | """
29 | Convert text to a BioCDocument instance
30 |
31 | Args:
32 | id (str): BioCDocument id
33 | text (str): text
34 |
35 | Returns:
36 | BioCDocument: a BioCDocument instance
37 | """
38 | document = bioc.BioCDocument()
39 | document.id = id
40 | text = printable(text).replace('\r\n', '\n')
41 |
42 | passage = bioc.BioCPassage()
43 | passage.offset = 0
44 | passage.text = text
45 | document.add_passage(passage)
46 |
47 | return document
48 |
49 |
50 | def text2collection(*sources):
51 | """
52 | Returns a BioCCollection containing documents specified in sources.
53 |
54 | Args:
55 |         sources: pathnames of the input text files
56 | """
57 |
58 | collection = bioc.BioCCollection()
59 |     for pathname in sources:
60 | logging.debug('Process %s', pathname)
61 | try:
62 | with open(pathname) as fp:
63 | text = fp.read()
64 | id = Path(pathname).stem
65 | document = text2document(id, text)
66 | collection.add_document(document)
67 |         except Exception:
68 | logging.exception('Cannot convert %s', pathname)
69 | return collection
70 |
71 |
--------------------------------------------------------------------------------
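
A small sketch of printable: characters outside string.printable are dropped with a warning unless a converter function is supplied:

    from negbio.pipeline.text2bioc import printable

    assert printable(u'50\u00b0 fever') == '50 fever'
    assert printable(u'50\u00b0 fever', func=lambda c: '?') == '50? fever'
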
/requirements.txt:
--------------------------------------------------------------------------------
1 | future==0.16.0
2 | docutils==0.14
3 | docopt==0.6.2
4 | pytest==4.4.1
5 | networkx==1.11
6 | ply==3.10
7 | tqdm==4.19.5
8 | nltk==3.6.6
9 | bioc==1.3.1
10 | pystanforddependencies==0.3.1
11 | bllipparser==2016.9.11
12 | pymetamap==0.1
13 | JPype1>=0.6.3
14 | pathlib2==2.3.3
15 | numpy==1.21.0
16 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Always prefer setuptools over distutils.
2 | # Use codecs.open to read files with a consistent encoding.
3 | from __future__ import print_function
4 | from codecs import open
5 | import os
6 | from subprocess import check_call
7 |
8 | from setuptools import setup, find_packages
9 | from setuptools.command.develop import develop
10 | from setuptools.command.egg_info import egg_info
11 | from setuptools.command.install import install
12 |
13 | here = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
14 |
15 |
16 | def readme():
17 | # Get the long description from the README file
18 | with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
19 | return f.read()
20 |
21 |
22 | def read_requirements():
23 | """parses requirements from requirements.txt"""
24 | reqs_path = os.path.join(here, 'requirements.txt')
25 | with open(reqs_path, encoding='utf8') as f:
26 | reqs = [line.strip() for line in f if not line.strip().startswith('#')]
27 |
28 | names = []
29 | links = []
30 | for req in reqs:
31 | if '://' in req:
32 | links.append(req)
33 | else:
34 | names.append(req)
35 | return {'install_requires': names, 'dependency_links': links}
36 |
37 |
38 | def custom_command():
39 | check_call("python -m nltk.downloader universal_tagset punkt wordnet".split())
40 |
41 |
42 | class CustomInstallCommand(install):
43 | def run(self):
44 | custom_command()
45 | install.run(self)
46 |
47 |
48 | class CustomDevelopCommand(develop):
49 | def run(self):
50 | custom_command()
51 | develop.run(self)
52 |
53 |
54 | class CustomEggInfoCommand(egg_info):
55 | def run(self):
56 | custom_command()
57 | egg_info.run(self)
58 |
59 |
60 | setup(
61 | name='negbio',
62 |
63 | # Versions should comply with PEP440. For a discussion on single-sourcing
64 | # the version across setup.py and the project code, see
65 | # https://packaging.python.org/en/latest/single_source_version.html
66 | version='0.9.4',
67 |
68 | description='NegBio: a tool for negation and uncertainty detection',
69 | long_description=readme(),
70 |
71 | # The project's main homepage.
72 | url='https://github.com/ncbi-nlp/NegBio.git',
73 |
74 | # Author details
75 | author='Yifan Peng',
76 | author_email='yifan.peng@nih.gov',
77 |
78 | license='Public Domain',
79 |
80 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
81 | classifiers=[
82 | 'Development Status :: 3 - Alpha',
83 | # Indicate who your project is intended for
84 | 'Intended Audience :: Developers',
85 | 'Intended Audience :: Science/Research',
86 |
87 | # Pick your license as you wish (should match "license" above)
88 | 'License :: Public Domain',
89 |
90 | 'Operating System :: MacOS',
91 | 'Operating System :: POSIX',
92 | 'Operating System :: POSIX :: Linux',
93 |
94 | # Specify the Python versions you support here.
95 | 'Programming Language :: Python',
96 | 'Topic :: Software Development',
97 | 'Topic :: Software Development :: Libraries :: Application Frameworks',
98 | ],
99 |
100 | keywords='negbio',
101 |
102 | packages=find_packages(exclude=["tests.*", "tests", "backup", "docs"]),
103 | include_package_data=True,
104 |
105 | cmdclass={
106 | 'install': CustomInstallCommand,
107 | 'develop': CustomDevelopCommand,
108 | 'egg_info': CustomEggInfoCommand
109 | },
110 |
111 |     entry_points={
112 | 'console_scripts': ['negbio_pipeline=negbio.negbio_pipeline:main',
113 | 'main_chexpert=negbio.main_chexpert:main',
114 | 'main_mm=negbio.main_mm:main'],
115 | },
116 |
117 | **read_requirements()
118 | )
119 |
--------------------------------------------------------------------------------
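
The custom install/develop/egg_info commands above shell out to the NLTK downloader; the equivalent in-process calls, should the subprocess route be unavailable, are:

    import nltk

    for package in ('universal_tagset', 'punkt', 'wordnet'):
        nltk.download(package)
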
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/__init__.py
--------------------------------------------------------------------------------
/tests/context.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import os
4 | import sys
5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6 |
7 | import bioc
--------------------------------------------------------------------------------
/tests/negbio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/ngrex/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/ngrex/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/ngrex/test_parser.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from negbio import ngrex
4 | from negbio.ngrex import parser
5 | from ply.lex import LexToken
6 |
7 |
8 | def test_lex():
9 | _test_lex('{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}')
10 | _test_lex('{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}')
11 | _test_lex('{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key')
12 | with pytest.raises(TypeError):
13 | _test_yacc("xxx")
14 |
15 |
16 | def _test_lex(s):
17 | parser.lexer.input(s)
18 | for tok in parser.lexer:
19 | print(tok)
20 |
21 |
22 | def test_yacc():
23 | # _test_yacc("{lemma:/xxx/} <{dependency:/nmod:without|x/} {lemma:/yyy/}")
24 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} {lemma:/yyy/}")
25 | # _test_yacc("{lemma:/xxx/} >{dependency:/nmod:without/} ({lemma:/yyy/} >{} {lemma:/zzz/})")
26 | # _test_yacc("{} >{} {lemma:/left/} <{} {lemma:/question/}")
27 | # _test_yacc("{} <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}")
28 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=key")
29 | with pytest.raises(KeyError):
30 | _test_yacc("{}=t <{dependency:/nsubj/} {lemma:/suspect/,tag:/VBN/}=t")
31 |
32 |
33 | def _test_yacc(s):
34 | pattern = ngrex.compile(s)
35 | print(pattern)
36 |
37 |
38 | if __name__ == '__main__':
39 | test_lex()
40 | test_yacc()
41 |
--------------------------------------------------------------------------------
/tests/negbio/ngrex/test_pattern.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from negbio import ngrex
3 |
4 |
5 | def get_graph():
6 | G = nx.DiGraph()
7 | G.add_node('xxx', attr_dict={'lemma': 'xxx'})
8 | G.add_node('yyy', attr_dict={'lemma': 'yyy'})
9 | G.add_node('zzz', attr_dict={'lemma': 'zzz'})
10 | G.add_edge('xxx', 'yyy', attr_dict={'dependency': 'aaa'})
11 | G.add_edge('yyy', 'zzz', attr_dict={'dependency': 'bbb'})
12 | G.add_edge('xxx', 'zzz', attr_dict={'dependency': 'ccc'})
13 | return G
14 |
15 |
16 | def helper(G, p, expected):
17 | pattern = ngrex.compile(p)
18 | print(pattern.pattern)
19 | # actual = {m.group(0) for m in pattern.finditer(G)}
20 | actual = set()
21 | for m in pattern.finditer(G):
22 | actual.add(m.group(0))
23 | assert actual == expected, '{} vs {}'.format(actual, expected)
24 |
25 |
26 | def test_regex():
27 | G = get_graph()
28 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'})
29 |
30 |
31 | def test_attribute():
32 | G = get_graph()
33 | helper(G, '{} >{dependency:/aaa|bbb/} {}', {'xxx', 'yyy'})
34 | helper(G, '{} >{tag:/aaa|bbb/} {}', set())
35 |
36 |
37 | def test_relation():
38 | G = get_graph()
39 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} {lemma:/yyy/}', {'xxx'})
40 | helper(G, '{lemma:/yyy/} <{dependency:/aaa/} {lemma:/xxx/}', {'yyy'})
41 | helper(G, '{} >{} {}', {'xxx', 'yyy'})
42 |
43 |
44 | def test_relation_next():
45 | G = get_graph()
46 | helper(G, '{lemma:/xxx/} >{dependency:/aaa/} ({lemma:/yyy/} >{dependency:/bbb/} {lemma:/zzz/})',
47 | {'xxx'})
48 |
49 |
50 | def test_relation_conj():
51 | G = get_graph()
52 | helper(G, '{} >{} {lemma:/yyy/} >{} {lemma:/zzz/}', {'xxx'})
53 | helper(G, '{} >{} {lemma:/yyy/} <{} {lemma:/zzz/}', set())
54 |
55 |
56 | def test_relation_disj():
57 | G = get_graph()
58 | helper(G, '{} >{dependency:/aaa/} {} | >{dependency:/bbb/} {}', {'xxx', 'yyy'})
59 |
60 |
61 | def test_variables():
62 | G = get_graph()
63 | pattern = ngrex.compile('{}=t >{dependency:/aaa|bbb/} {}')
64 | print(pattern.pattern)
65 | actual = {m.get('t') for m in pattern.finditer(G)}
66 | assert actual == {'xxx', 'yyy'}
67 |
68 |
69 | if __name__ == '__main__':
70 | # test_relation()
71 | # test_relation_next()
72 | test_relation_conj()
73 | # test_relation_disj()
74 | # test_regex()
75 | # test_attribute()
76 | # test_variables()
77 |
--------------------------------------------------------------------------------
/tests/negbio/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/NegBio/073199e2792824740e89844a59c13d3d40ce4d23/tests/negbio/pipeline/__init__.py
--------------------------------------------------------------------------------
/tests/negbio/pipeline/test_parse.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from negbio.pipeline.parse import Bllip
4 |
5 |
6 | def test_Bllip():
7 | b = Bllip()
8 | t = b.parse('hello world!')
9 | assert str(t) == '(S1 (S (NP (NN hello) (NN world) (NN !))))'
10 |
11 |
12 | if __name__ == '__main__':
13 | logging.basicConfig(level=logging.WARNING)
14 | test_Bllip()
15 |
--------------------------------------------------------------------------------
/tests/negbio/test_cli.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from docopt import docopt
4 |
5 | from negbio import negbio_pipeline, negbio_text2bioc, negbio_ssplit, negbio_section_split, negbio_parse
6 |
7 |
8 | def test_negbio():
9 | doc = negbio_pipeline.__doc__
10 | args = docopt(doc, 'text2bioc a b c'.split())
11 |     assert args['<command>'] == 'text2bioc'
12 |     assert args['<argv>'] == ['a', 'b', 'c']
13 |
14 |
15 | def test_text2bioc():
16 | doc = negbio_text2bioc.__doc__
17 | args = docopt(doc, 'text2bioc --verbose --output=out a b c'.split())
18 | assert args['--verbose']
19 | assert args['--output'] == 'out'
20 |     assert args['<file>'] == ['a', 'b', 'c']
21 | args = docopt(doc, 'text2bioc --output=out a b c'.split())
22 | assert not args['--verbose']
23 |
24 |
25 | def test_ssplit():
26 | doc = negbio_ssplit.__doc__
27 | args = docopt(doc, 'ssplit --suffix suffix --newline_is_sentence_break --output out a b c'.split())
28 | assert args['--newline_is_sentence_break']
29 | assert args['--output'] == 'out'
30 | assert args['--suffix'] == 'suffix'
31 |     assert args['<file>'] == ['a', 'b', 'c']
32 |
33 |
34 | def test_section_split():
35 | doc = negbio_section_split.__doc__
36 | args = docopt(doc, 'section_split --pattern pattern --output out a b c'.split())
37 | assert args['--output'] == 'out'
38 | assert args['--pattern'] == 'pattern'
39 |     assert args['<file>'] == ['a', 'b', 'c']
40 |
41 |
42 | def test_parse():
43 | doc = negbio_parse.__doc__
44 | args = docopt(doc, 'parse --model model --output out a b c'.split())
45 | assert args['--output'] == 'out'
46 | assert args['--model'] == 'model'
47 |     assert args['<file>'] == ['a', 'b', 'c']
48 |
49 |
50 | if __name__ == '__main__':
51 | logging.basicConfig(level=logging.WARNING)
52 | test_ssplit()
53 |
--------------------------------------------------------------------------------