├── deid
│   ├── tools
│   │   ├── __init__.py
│   │   ├── i2b2
│   │   │   ├── __init__.py
│   │   │   ├── requirements.txt
│   │   │   └── README.md
│   │   ├── fix_xml_texts.py
│   │   ├── fix_180-03.py
│   │   ├── find_differences.py
│   │   ├── dataset.py
│   │   ├── i2b2_xml_to_csv_tests.py
│   │   ├── embeddings.py
│   │   ├── find_good_amount_of_noise.py
│   │   ├── config.py
│   │   └── i2b2_xml_to_csv.py
│   ├── fixtures
│   │   ├── deid_work
│   │   │   └── .gitkeep
│   │   └── deid_resources
│   │       ├── config
│   │       │   └── generated
│   │       │       └── .gitkeep
│   │       └── i2b2_data
│   │           ├── train
│   │           │   ├── 999-99.txt
│   │           │   └── 999-99.csv
│   │           └── train_xml
│   │               ├── 999-96.xml
│   │               ├── 999-97.xml
│   │               ├── 999-98.xml
│   │               └── 999-99.xml
│   ├── model
│   │   ├── losses
│   │   │   ├── __init__.py
│   │   │   ├── discriminator.py
│   │   │   └── crf.py
│   │   ├── layers
│   │   │   ├── __init__.py
│   │   │   ├── gradient_reversal.py
│   │   │   └── noise.py
│   │   ├── __init__.py
│   │   ├── optimizer.py
│   │   ├── deidentifier.py
│   │   ├── adversary.py
│   │   ├── representer.py
│   │   └── adversarial.py
│   ├── data
│   │   ├── types.py
│   │   ├── dataset_tests.py
│   │   ├── augment
│   │   │   ├── __init__.py
│   │   │   ├── strategy_tests.py
│   │   │   ├── get.py
│   │   │   ├── augment_tests.py
│   │   │   ├── strategy.py
│   │   │   └── augment.py
│   │   ├── __init__.py
│   │   ├── class_weight.py
│   │   ├── feature_tests.py
│   │   ├── token.py
│   │   ├── feature.py
│   │   ├── util.py
│   │   ├── tokenizer_tests.py
│   │   ├── batch_tests.py
│   │   ├── postprocess.py
│   │   ├── tokenizer.py
│   │   ├── read.py
│   │   └── batch.py
│   ├── experiment
│   │   ├── run.py
│   │   ├── random.py
│   │   ├── __main__.py
│   │   ├── __init__.py
│   │   ├── config_tests.py
│   │   ├── directory.py
│   │   ├── get.py
│   │   ├── evaluation_tests.py
│   │   ├── config.py
│   │   ├── alternating_evaluation.py
│   │   ├── dummy.py
│   │   ├── fake_sentences.py
│   │   ├── evaluation.py
│   │   ├── mtn_evaluation.py
│   │   └── basic.py
│   ├── __init__.py
│   ├── embeddings
│   │   ├── dummy.py
│   │   ├── util.py
│   │   ├── __init__.py
│   │   ├── util_tests.py
│   │   ├── noise.py
│   │   ├── embeddings.py
│   │   ├── glove.py
│   │   ├── elmo.py
│   │   ├── fasttext.py
│   │   └── matrix.py
│   ├── config_template.yaml.example
│   ├── config_template_alternating.yaml.example
│   └── env.py
├── adversary1.png
├── adversary2.png
├── architecture.png
├── .gitmodules
├── scripts
│   ├── xml_to_csv
│   └── queue
├── environment.yml
├── .travis.yml
├── LICENSE
├── .gitignore
└── README.md

/deid/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/fixtures/deid_work/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/config/generated/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/tools/i2b2/__init__.py:
--------------------------------------------------------------------------------
1 | # Modified by Max Friedrich, 2018
2 | 
--------------------------------------------------------------------------------
/deid/tools/i2b2/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml==3.3.1
2 | numpy==1.8.0
--------------------------------------------------------------------------------
/adversary1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary1.png
--------------------------------------------------------------------------------
/adversary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary2.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/architecture.png -------------------------------------------------------------------------------- /deid/model/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .crf import crf_loss 2 | from .discriminator import discriminator_loss 3 | -------------------------------------------------------------------------------- /deid/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_reversal import GradientReversal 2 | from .noise import Noise, AdditiveNoise, MultiplicativeNoise 3 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train/999-99.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Record date: 2018-06-15 5 | 6 | Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany. 7 | 8 | -------------------------------------------------------------------------------- /deid/data/types.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Union 2 | import numpy as np 3 | from .token import Token 4 | 5 | Sentence = Sequence[Union[Token, np.ndarray]] 6 | SentenceLabels = Sequence[Sequence[int]] 7 | -------------------------------------------------------------------------------- /deid/data/dataset_tests.py: -------------------------------------------------------------------------------- 1 | from . import TrainingSet 2 | from ..embeddings import DummyEmbeddings 3 | 4 | 5 | def test_training_set(): 6 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings()) 7 | assert len(tr.X) == len(tr.y) 8 | -------------------------------------------------------------------------------- /deid/data/augment/__init__.py: -------------------------------------------------------------------------------- 1 | from .strategy import AugmentStrategy, AugmentEmbedding, AugmentWord, Zeros, RandomEmbedding, RandomDigits, \ 2 | AdditiveNoise, MoveToNeighbor 3 | from .get import get 4 | from .augment import Augment, AugmentedSentence 5 | 6 | 7 | -------------------------------------------------------------------------------- /deid/experiment/run.py: -------------------------------------------------------------------------------- 1 | from . 
import get_config, get as get_experiment 2 | 3 | 4 | def run_experiment(config_name_or_path): 5 | config = get_config(config_name_or_path) 6 | experiment = get_experiment(config['experiment']['type']) 7 | experiment(config) 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dependencies/keras-contrib"] 2 | path = dependencies/keras-contrib 3 | url = git://github.com/keras-team/keras-contrib 4 | [submodule "dependencies/fastText"] 5 | path = dependencies/fastText 6 | url = git://github.com/facebookresearch/fastText.git 7 | -------------------------------------------------------------------------------- /deid/experiment/random.py: -------------------------------------------------------------------------------- 1 | def setup_random(): 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | os.environ['PYTHONHASHSEED'] = '0' 9 | np.random.seed(1) 10 | random.seed(2) 11 | tf.set_random_seed(3) 12 | -------------------------------------------------------------------------------- /deid/__init__.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/a/40846742/2623170 2 | # https://github.com/numpy/numpy/pull/432/commits/170ed4e3 3 | import warnings 4 | 5 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 6 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 7 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-96.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /deid/experiment/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from . 
import run_experiment 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('config', help='the config filename') 9 | args = parser.parse_args() 10 | 11 | run_experiment(args.config) 12 | 13 | 14 | if __name__ == '__main__': 15 | main() 16 | -------------------------------------------------------------------------------- /deid/model/__init__.py: -------------------------------------------------------------------------------- 1 | def get(identifier): 2 | if identifier == 'lstm': 3 | return make_lstm_crf 4 | elif identifier.startswith('adversarial'): 5 | return AdversarialModel 6 | else: 7 | raise ValueError('unknown identifier:', identifier) 8 | 9 | 10 | from .adversarial import AdversarialModel 11 | from .deidentifier import make_lstm_crf 12 | -------------------------------------------------------------------------------- /deid/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .token import Token, TOKEN_TYPE, BINARY_LABEL 2 | from .tokenizer import tokenize 3 | from .types import Sentence, SentenceLabels 4 | from .batch import BatchGenerator, StratifiedSampling, BatchGeneratorWithExtraFeatures, \ 5 | StratifiedSamplingWithExtraFeatures, fake_sentences_batch 6 | from .dataset import DataSet, TrainingSet, ValidationSet, TestSet, is_phi_sentence 7 | from .postprocess import prediction_to_xml 8 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-97.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /deid/model/optimizer.py: -------------------------------------------------------------------------------- 1 | from keras.optimizers import Adam, Nadam, RMSprop, SGD 2 | 3 | # We want to pass custom args to the adversaries. Passing a Keras optimizer string to the compile method won't let us 4 | # select custom args, so we make a subset of optimizers available by string keys here. 
5 | optimizers = {'adam': Adam, 'nadam': Nadam, 'rmsprop': RMSprop, 'sgd': SGD} 6 | 7 | 8 | def get(identifier): 9 | if identifier in optimizers.keys(): 10 | return optimizers[identifier] 11 | raise ValueError(f'Unknown optimizer: {identifier}') 12 | -------------------------------------------------------------------------------- /deid/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_config 2 | from .random import setup_random 3 | from .directory import experiment_directory 4 | from .evaluation import evaluate_deid_performance, DeidentificationEvaluationCallback 5 | 6 | from .basic import basic_experiment 7 | from .alternating import alternating_experiment 8 | from .alternating_evaluation import alternating_evaluation_experiment 9 | from .mtn_evaluation import mtn_evaluation_experiment 10 | from .fake_sentences import fake_sentences_experiment 11 | 12 | from .get import get 13 | from .run import run_experiment 14 | -------------------------------------------------------------------------------- /deid/experiment/config_tests.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | 3 | 4 | def example_config(): 5 | return Config({'a': 0, 'b': 1, 'c': {'d': 2}}) 6 | 7 | 8 | def test_config_behaves_like_a_dict(): 9 | config = example_config() 10 | assert config['a'] == 0 11 | assert config['b'] == 1 12 | assert config['c']['d'] == 2 13 | 14 | config['c']['d'] = 3 15 | assert config['c']['d'] == 3 16 | 17 | 18 | def test_config_returns_none_for_missing_values(): 19 | config = example_config() 20 | assert config['x'] is None 21 | assert config['c']['y'] is None 22 | -------------------------------------------------------------------------------- /deid/data/class_weight.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | 4 | def get(identifier): 5 | if identifier == 'balanced': 6 | return balanced 7 | 8 | raise ValueError(f'Unknown class weight: {identifier}') 9 | 10 | 11 | def balanced(output_size, y): 12 | y = list(itertools.chain.from_iterable([[label[0] for label in sent] for sent in y])) 13 | 14 | o_weight = len(y) / y.count(1) 15 | phi_weight = len(y) / (len(y) - y.count(1)) 16 | 17 | class_weight = [0, o_weight] 18 | for i in range(2, output_size): 19 | class_weight.append(phi_weight) 20 | 21 | return class_weight 22 | -------------------------------------------------------------------------------- /scripts/xml_to_csv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | ma_path = os.path.dirname(os.path.dirname(__file__)) 7 | if os.path.abspath(ma_path) not in sys.path: 8 | sys.path.append(ma_path) 9 | 10 | from deid.env import env 11 | 12 | for t in ['test', 'train', 'validation']: 13 | print(f'Converting {t} xmls...') 14 | command = ' '.join(['python -m deid.tools.i2b2_xml_to_csv --check', 15 | f"{os.path.join(env.data_dir, t + '_xml')}", 16 | f"{os.path.join(env.data_dir, t)}"]) 17 | 18 | print(command) 19 | os.system(command) 20 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-98.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- 
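The string-keyed optimizer registry in /deid/model/optimizer.py above exists so that optimizer keyword arguments (e.g. the clipnorm: 1. entries in the config templates) can be applied when the optimizer is constructed. A minimal usage sketch follows; the call site and the config access shown here are assumptions for illustration, not the repository's actual wiring:

# Sketch: resolve an optimizer class by its string key, then instantiate it with
# custom keyword arguments -- something a plain string passed to model.compile()
# (e.g. optimizer='nadam') would not allow.
from deid.model.optimizer import get as get_optimizer

optimizer_cls = get_optimizer('nadam')   # -> keras.optimizers.Nadam
optimizer = optimizer_cls(clipnorm=1.0)  # e.g. from config['training']['optimizer_args']
# model.compile(optimizer=optimizer, loss=..., metrics=...)  # assumed downstream call
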
/environment.yml: -------------------------------------------------------------------------------- 1 | name: deid-training-data 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.6 7 | - libgcc 8 | - pip 9 | - cython 10 | - jupyter 11 | - pathlib 12 | - numpy==1.14.5 13 | - scipy 14 | - pandas 15 | - matplotlib 16 | - beautifulsoup4 17 | - nose 18 | - h5py 19 | - spacy==2.0.12 20 | - tqdm 21 | - scikit-learn 22 | - lxml 23 | - pylint 24 | - mypy 25 | - msgpack-python 26 | - pip: 27 | - keras==2.2.2 28 | - tensorflow==1.10.0 # or tensorflow-gpu 29 | - tensorflow-hub==0.1.1 30 | - terminaltables 31 | - pybind11 32 | - -e ./dependencies/keras-contrib 33 | - -e ./dependencies/fastText 34 | -------------------------------------------------------------------------------- /deid/model/losses/discriminator.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.losses import binary_crossentropy 3 | 4 | 5 | def discriminator_loss(y_true, y_pred): 6 | """ Compares the actual binary crossentropy loss to the random guessing loss (0.6931..., accuracy 0.5) and returns 7 | the maximum. This is motivated by the fact that our adversarial discriminators should not be worse than random 8 | guessing, otherwise we could just flip every prediction and get a better discriminator. 9 | """ 10 | loss = binary_crossentropy(y_true, y_pred) 11 | random_guessing = -K.log(0.5) 12 | return K.maximum(loss, random_guessing) 13 | -------------------------------------------------------------------------------- /deid/embeddings/dummy.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import numpy as np 4 | 5 | from . import Embeddings 6 | 7 | 8 | class DummyEmbeddings(Embeddings): 9 | @property 10 | def size(self): 11 | return 5 12 | 13 | @property 14 | def std(self): 15 | return 0.5 16 | 17 | def is_unknown(self, word: str) -> bool: 18 | return False 19 | 20 | def lookup(self, word: str): 21 | hashed = int(hashlib.sha256(word.encode('utf-8')).hexdigest(), 16) 22 | five_digits = [int(digit) for digit in str(hashed)[1:6]] # omitting a possible - at the first index 23 | return np.array([-0.5 if digit < 5 else 0.5 for digit in five_digits]) 24 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train/999-99.csv: -------------------------------------------------------------------------------- 1 | text,type,start,end 2 | ,O,0,0 3 | ,O,0,0 4 | ,O,3,3 5 | Record,O,3,9 6 | date,O,10,14 7 | :,O,14,15 8 | 2018,B-DATE,16,20 9 | -,I-DATE,20,21 10 | 06,I-DATE,21,23 11 | -,I-DATE,23,24 12 | 15,I-DATE,24,26 13 | ,O,26,26 14 | ,O,28,28 15 | Max,B-PATIENT,28,31 16 | Friedrich,I-PATIENT,32,41 17 | is,O,42,44 18 | a,O,45,46 19 | 25,B-AGE,47,49 20 | -,O,49,50 21 | year,O,50,54 22 | -,O,54,55 23 | old,O,55,58 24 | Computer,O,59,67 25 | science,O,68,75 26 | student,O,76,83 27 | living,O,84,90 28 | in,O,91,93 29 | Hamburg,B-CITY,94,101 30 | ",",O,101,102 31 | Germany,B-COUNTRY,103,110 32 | .,O,110,111 33 | ,O,111,111 34 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-99.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # https://conda.io/docs/user-guide/tasks/use-conda-with-travis-ci.html 2 | 3 | sudo: required 4 | dist: trusty 5 | group: travis_latest 6 | language: python 7 | python: 8 | - '3.6' 9 | git: 10 | depth: false 11 | install: 12 | - sudo apt-get update 13 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 14 | - bash miniconda.sh -b -p $HOME/miniconda 15 | - export PATH="$HOME/miniconda/bin:$PATH" 16 | - hash -r 17 | - conda config --set always_yes yes --set changeps1 no 18 | - conda update -q conda 19 | - conda info -a 20 | - conda env create -q 21 | - source activate deid-training-data 22 | - conda list 23 | - python -m spacy download en 24 | script: 25 | - nosetests --with-doctest 26 | -------------------------------------------------------------------------------- /deid/experiment/directory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import socket 4 | from datetime import datetime 5 | 6 | from ..env import env 7 | 8 | 9 | def experiment_directory(name, config_path=None, work_dir=env.work_dir): 10 | """ Creates a directory for the experiment 11 | 12 | :param name: 13 | :param config_path: 14 | :param work_dir: 15 | :return: 16 | """ 17 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S') 18 | directory = os.path.join(work_dir, name + '_' + socket.gethostname() + '_' + date_str) 19 | if env.experiment_dir_postfix is not None: 20 | directory += '_' + env.experiment_dir_postfix 21 | os.mkdir(directory) 22 | if config_path is not None: 23 | shutil.copy2(config_path, directory) 24 | 25 | return directory 26 | -------------------------------------------------------------------------------- /scripts/queue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ -z "$DEID_CONFIG_DIR" ]; then 6 | echo "Please set the DEID_CONFIG_DIR variable to the config directory." 7 | exit 1 8 | fi 9 | 10 | TODO="${DEID_CONFIG_DIR}/todo" 11 | IN_PROGRESS="${DEID_CONFIG_DIR}/in_progress" 12 | DONE="${DEID_CONFIG_DIR}/done" 13 | EXECUTED=0 14 | STOP=0 15 | 16 | find "${TODO}" -type f -name '*.yaml' -print0 | 17 | while IFS= read -r -d '' f; do 18 | config="$(basename $f)"; 19 | echo "Next config is ${f}, basename ${config}"; 20 | mv "${f}" "${IN_PROGRESS}"; 21 | python3 -m deid.experiment "${IN_PROGRESS}/${config}"; 22 | echo "Moving to done"; 23 | mv "${IN_PROGRESS}/${config}" "$DONE"; 24 | echo "OK"; 25 | EXECUTED=${EXECUTED}+1; 26 | done 27 | echo "Executed ${EXECUTED} configs." 
28 | -------------------------------------------------------------------------------- /deid/experiment/get.py: -------------------------------------------------------------------------------- 1 | from .basic import basic_experiment 2 | from .alternating import alternating_experiment 3 | from .alternating_evaluation import alternating_evaluation_experiment 4 | from .mtn_evaluation import mtn_evaluation_experiment 5 | from .fake_sentences import fake_sentences_experiment 6 | 7 | 8 | def get(identifier): 9 | if identifier == 'basic': 10 | return basic_experiment 11 | elif identifier == 'alternating': 12 | return alternating_experiment 13 | elif identifier == 'alternating_evaluation': 14 | return alternating_evaluation_experiment 15 | elif identifier == 'mtn_evaluation': 16 | return mtn_evaluation_experiment 17 | elif identifier == 'fake_sentences': 18 | return fake_sentences_experiment 19 | else: 20 | raise ValueError('unknown identifier:', identifier) 21 | -------------------------------------------------------------------------------- /deid/data/augment/strategy_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .strategy import NeighborsCache 4 | from ...embeddings import EmbeddingSimilarity 5 | 6 | 7 | def test_neighbors_cache(): 8 | cache = NeighborsCache('selected') 9 | assert cache.lookup('test') is None 10 | 11 | neighbors = [EmbeddingSimilarity(1, 'tests', 0.95, np.zeros(10)), 12 | EmbeddingSimilarity(1, 'testing', 0.93, np.zeros(10)), 13 | EmbeddingSimilarity(1, 'tester', 0.91, np.zeros(10))] 14 | 15 | cache.store('test', neighbors, neighbors[0]) 16 | assert cache.lookup('test') == cache.lookup('test') == neighbors[0] 17 | 18 | cache = NeighborsCache('neighbors') 19 | assert cache.lookup('test') is None 20 | 21 | cache.store('test', neighbors, neighbors[0]) 22 | assert cache.lookup('test') in neighbors 23 | assert cache.lookup('test') in neighbors 24 | -------------------------------------------------------------------------------- /deid/experiment/evaluation_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from . import evaluation 5 | from ..env import Test 6 | from .evaluation import _run_official_evaluation 7 | 8 | config = evaluation.env = Test() 9 | 10 | 11 | def test_run_official_evaluation(): 12 | with tempfile.NamedTemporaryFile() as f: 13 | # testing the fixtures train_xml directory against itself, resulting in perfect score 14 | results = _run_official_evaluation(predictions_dir=os.path.join(config.data_dir, 'train_xml'), 15 | test_set='train', 16 | output_file=f.name) 17 | assert len(f.read().strip()) != 0 # something was written to the evaluation file 18 | 19 | assert results['Token']['precision'] == 1.0 20 | assert results['Token']['recall'] == 1.0 21 | assert results['Token']['f1'] == 1.0 22 | -------------------------------------------------------------------------------- /deid/embeddings/util.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List, Any 2 | 3 | 4 | def pad_string_sequences(seq: Sequence[Sequence[str]]) -> Tuple[List[List[str]], Sequence[int]]: 5 | """ Like keras.preprocessing.sequence.pad_string_sequences but for strings, and it also returns seq_length. 
""" 6 | 7 | seq_length = [len(item) for item in seq] 8 | maxlen = max(seq_length) 9 | 10 | result = [] 11 | for i, item in enumerate(seq): 12 | result.append(list(item) + [''] * (maxlen - seq_length[i])) 13 | return result, seq_length 14 | 15 | 16 | def unpad_sequences(padded: Sequence[Any], seq_length: Sequence[int]): 17 | """ The reverse operation of `keras.preprocessing.sequence.pad_sequences`. """ 18 | assert len(padded) == len(seq_length) 19 | return [padded[i][:seq_length[i]] for i in range(len(padded))] 20 | 21 | 22 | # https://stackoverflow.com/a/434328/2623170 23 | def chunks(seq, size): 24 | return (seq[pos:pos + size] for pos in range(0, len(seq), size)) 25 | -------------------------------------------------------------------------------- /deid/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import Embeddings, PrecomputedEmbeddings 2 | from .dummy import DummyEmbeddings 3 | from .elmo import ElmoEmbeddings, TensorFlowElmoEmbeddings, CachedElmoEmbeddings 4 | from .fasttext import FastTextEmbeddings, PreloadFastTextEmbeddings, CachedFastTextEmbeddings 5 | from .glove import GloveEmbeddings 6 | from .matrix import Matrix, EmbeddingSimilarity 7 | from .noise import Noise, GaussianNoise, DropoutNoise, NoiseWrapper 8 | 9 | 10 | def get(identifier, *args, **kwargs): 11 | if identifier == 'dummy': 12 | return DummyEmbeddings() 13 | elif identifier == 'elmo': 14 | return ElmoEmbeddings(*args) 15 | elif identifier == 'elmo-tf': 16 | return TensorFlowElmoEmbeddings(*args, **kwargs) 17 | elif identifier == 'glove': 18 | return GloveEmbeddings(*args, **kwargs) 19 | elif identifier == 'fasttext': 20 | return FastTextEmbeddings(*args, **kwargs) 21 | else: 22 | raise ValueError('unknown identifier:', identifier) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Max Friedrich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /deid/embeddings/util_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .util import pad_string_sequences, unpad_sequences 4 | from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences 5 | 6 | 7 | def test_pad_string_sequences(): 8 | test_seq = [['apple', 'banana', 'cherry'], ['d', 'e', 'f', 'g'], ['h', 'i', 'j', 'k', 'l', 'q'], ['r']] 9 | padded, seq_length = pad_string_sequences(test_seq) 10 | assert len(padded) == 4 11 | assert len(padded[0]) == 6 12 | assert padded[0][0] == 'apple' 13 | assert padded[0][3] == '' 14 | assert seq_length == [3, 4, 6, 1] 15 | 16 | 17 | def test_unpad_sequences(): 18 | test_seq = [['apple', 'banana', 'cherry', '', ''], ['d', 'e', 'f', 'g', 'h'], ['i', '', '', '', '', ]] 19 | seq = unpad_sequences(test_seq, [3, 5, 1]) 20 | assert len(seq) == 3 21 | assert seq[0] == ['apple', 'banana', 'cherry'] 22 | 23 | 24 | def test_is_reverse_operation(): 25 | test_seq = [[0, 1, 2, 3], [4], [5, 6]] 26 | padded = keras_pad_sequences(test_seq, padding='post') 27 | unpadded = unpad_sequences(padded, [4, 1, 2]) 28 | assert [list(item) for item in unpadded] == test_seq 29 | -------------------------------------------------------------------------------- /deid/data/augment/get.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .strategy import AugmentStrategy, Zeros, RandomEmbedding, RandomDigits, AdditiveNoise, MoveToNeighbor 4 | 5 | 6 | def get(identifier: Optional[str], *args, **kwargs) -> Optional[AugmentStrategy]: 7 | if identifier is None: 8 | return None 9 | elif identifier == 'zeros': 10 | return Zeros() 11 | elif identifier.startswith('random_embedding'): 12 | if '-' in identifier: 13 | scale = float(identifier.split('-')[1]) 14 | return RandomEmbedding(scale, l2_normalize='l2' in identifier) 15 | else: 16 | return RandomEmbedding() 17 | elif identifier == 'random_digits': 18 | return RandomDigits(*args, **kwargs) 19 | elif identifier.startswith('additive_noise'): 20 | scale = float(identifier.split('-')[1]) 21 | return AdditiveNoise(scale) 22 | elif identifier.startswith('move_to_neighbor'): 23 | n_neighbors = int(identifier.split('-')[1]) 24 | return MoveToNeighbor(n_neighbors=n_neighbors, *args, **kwargs) # type: ignore 25 | else: 26 | raise ValueError('unknown identifier:', identifier) 27 | -------------------------------------------------------------------------------- /deid/config_template.yaml.example: -------------------------------------------------------------------------------- 1 | --- 2 | experiment: 3 | type: basic # see config_template_alternating.yaml.example for alternating experiment 4 | binary_classification: false 5 | hipaa_only: false 6 | model: lstm # or adversarial 7 | embeddings: fasttext # or glove, elmo 8 | train_set: train 9 | validation_set: validation 10 | model_args: 11 | hidden_size: # add options for multiple runs like this 12 | choice: 13 | - 64 14 | - 128 15 | - 256 16 | # ... 17 | num_hidden: 18 | choice: 19 | - 1 20 | - 2 21 | input_dropout: 22 | choice: 23 | - 0. 24 | - 0.05 25 | - 0.1 26 | - 0.25 27 | - 0.5 28 | after_hidden_dropout: 0.5 29 | recurrent_dropout: 0.25 30 | training: 31 | optimizer: adam 32 | optimizer_args: 33 | clipnorm: 1. 
34 | train_epochs: 10 35 | early_stopping_patience: 2 36 | batch_size: 32 37 | i2b2_evaluate_every: 2 38 | augment: 39 | strategy: move_to_neighbor-50 # or additive_noise-0.1, etc. 40 | digit_strategy: random_digits 41 | include_original: false 42 | augment_args: 43 | augment_all: false 44 | n_augmentations: 10 45 | test: 46 | run_test: false 47 | -------------------------------------------------------------------------------- /deid/data/feature_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .feature import CaseFeature, apply_features 4 | from .token import Token 5 | 6 | 7 | def test_case(): 8 | feature = CaseFeature() 9 | assert np.all(feature.apply(Token.with_text('1234')) == np.array([0, 1, 0, 0, 0, 0, 0])) # all numeric 10 | assert np.all(feature.apply(Token.with_text('123a')) == np.array([0, 0, 1, 0, 0, 0, 0])) # mainly numeric 11 | assert np.all(feature.apply(Token.with_text('ok4y')) == np.array([0, 0, 0, 1, 0, 0, 0])) # all lower 12 | assert np.all(feature.apply(Token.with_text('OKAY')) == np.array([0, 0, 0, 0, 1, 0, 0])) # all upper 13 | # ... 14 | 15 | 16 | def test_apply_features(): 17 | features = [CaseFeature()] 18 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')]) 19 | assert len(case_features) == 2 20 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0])) 21 | 22 | features = [CaseFeature(), CaseFeature()] 23 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')]) 24 | print(case_features) 25 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0])) 26 | -------------------------------------------------------------------------------- /deid/tools/fix_xml_texts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | # noinspection PyProtectedMember 5 | from bs4 import BeautifulSoup, CData 6 | 7 | 8 | def fix_xml(pred_xml, gold_xml): 9 | print(pred_xml, gold_xml) 10 | gold_soup = BeautifulSoup(open(gold_xml, 'r').read(), features='xml') 11 | gold_text = gold_soup.find('TEXT').string 12 | 13 | print(gold_text.count('\n')) 14 | 15 | pred_soup = BeautifulSoup(open(pred_xml, 'r').read(), features='xml') 16 | pred_soup.find('TEXT').string = CData(gold_text) 17 | with open(pred_xml, 'w') as f: 18 | f.write(str(pred_soup)) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('pred', 24 | help='the predictions file or directory') 25 | parser.add_argument('gold', help='the gold file or directory') 26 | 27 | args = parser.parse_args() 28 | 29 | if os.path.isdir(args.pred): 30 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]: 31 | pred_xml = os.path.join(args.pred, xml) 32 | gold_xml = os.path.join(args.gold, xml) 33 | 34 | fix_xml(pred_xml, gold_xml) 35 | else: 36 | fix_xml(args.pred, args.gold) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /deid/tools/fix_180-03.py: -------------------------------------------------------------------------------- 1 | # Fixes a shift in start/end coordinates that is caused by the special characters in "O’neil’s Court" 2 | 3 | import argparse 4 | import os 5 | import re 6 | 7 | 8 | def fixed_contents(contents): 9 | result = '' 10 | edit_here = False 11 | increment_start = False 12 | for line in contents: 13 | if ' 0: 21 | print(' 
false positives:') 22 | for fp in false_positives: 23 | print(' -', fp) 24 | 25 | false_negatives = sets[1] - sets[0] 26 | if len(false_negatives) > 0: 27 | print(' false negatives:') 28 | for fn in false_negatives: 29 | print(' -', fn) 30 | 31 | print('-' * 100) 32 | 33 | 34 | def main(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('pred', 37 | help='the predictions file or directory') 38 | parser.add_argument('gold', help='the gold file or directory') 39 | 40 | args = parser.parse_args() 41 | 42 | if os.path.isdir(args.pred): 43 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]: 44 | pred_xml = os.path.join(args.pred, xml) 45 | gold_xml = os.path.join(args.gold, xml) 46 | 47 | print_differences(pred_xml, gold_xml) 48 | else: 49 | print_differences(args.pred, args.gold) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /deid/data/token.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | from ..tools.i2b2.classes import PHITrackEvaluation 3 | 4 | 5 | class Token(NamedTuple): 6 | text: str 7 | type: str 8 | start: int 9 | end: int 10 | 11 | @classmethod 12 | def with_text(cls, text, label='O'): 13 | """ Mostly useful for unit tests """ 14 | return Token(text, label, 0, 0) 15 | 16 | 17 | # noinspection SpellCheckingInspection 18 | TOKEN_TYPE = { 19 | 'PATIENT': 'NAME', 20 | 'DOCTOR': 'NAME', 21 | 'USERNAME': 'NAME', 22 | 'PROFESSION': 'PROFESSION', 23 | 'ROOM': 'LOCATION', 24 | 'DEPARTMENT': 'LOCATION', 25 | 'HOSPITAL': 'LOCATION', 26 | 'ORGANIZATION': 'LOCATION', 27 | 'STREET': 'LOCATION', 28 | 'CITY': 'LOCATION', 29 | 'STATE': 'LOCATION', 30 | 'COUNTRY': 'LOCATION', 31 | 'ZIP': 'LOCATION', 32 | 'LOCATION-OTHER': 'LOCATION', 33 | 'AGE': 'AGE', 34 | 'DATE': 'DATE', 35 | 'PHONE': 'CONTACT', 36 | 'FAX': 'CONTACT', 37 | 'EMAIL': 'CONTACT', 38 | 'URL': 'CONTACT', 39 | 'IPADDR': 'CONTACT', 40 | 'SSN': 'ID', 41 | 'MEDICALRECORD': 'ID', 42 | 'HEALTHPLAN': 'ID', 43 | 'ACCOUNT': 'ID', 44 | 'LICENSE': 'ID', 45 | 'VEHICLE': 'ID', 46 | 'DEVICE': 'ID', 47 | 'BIOID': 'ID', 48 | 'IDNUM': 'ID', 49 | 'OTHER': 'OTHER' 50 | } 51 | 52 | HIPAA_TOKEN_TYPE = {tag: type for tag, type in TOKEN_TYPE.items() if any([n_re.match(type) and t_re.match(tag) 53 | for n_re, t_re in 54 | PHITrackEvaluation.HIPAA_regexes])} 55 | 56 | BINARY_LABEL = 'PATIENT' 57 | -------------------------------------------------------------------------------- /deid/config_template_alternating.yaml.example: -------------------------------------------------------------------------------- 1 | --- 2 | experiment: 3 | type: alternating 4 | binary_classification: false 5 | hipaa_only: false 6 | model: adversarial 7 | embeddings: 8 | choice: 9 | - fasttext 10 | - glove 11 | extra_features: 12 | - case 13 | train_set: train 14 | validation_set: validation 15 | model_args: 16 | representation_type: lstm 17 | representation_size: 18 | choice: 19 | - 50 20 | - 100 21 | - 300 22 | representation_args: 23 | single_stddev: false 24 | adversaries: 25 | - discriminate-representations 26 | - discriminate-representation-embedding-pair 27 | adversary_args: 28 | input_dropout: 0.0 29 | lstm_size: 300 30 | recurrent_dropout: 0.0 31 | reverse_gradient: false 32 | deidentifier_args: 33 | hidden_size: 128 34 | num_hidden: 2 35 | input_dropout: 0.1 36 | after_hidden_dropout: 0.5 37 | recurrent_dropout: 0.5 38 | use_crf: true 39 | training: 40 | optimizer: nadam 41 | 
optimizer_args: 42 | clipnorm: 1. 43 | pretrain_deidentifier_epochs: 20 44 | pretrain_adversary_epochs: 20 45 | train_epochs: 40 46 | early_stopping_patience: 10 47 | batch_size: 32 48 | batch_size_compound: 0 49 | i2b2_evaluate_every: 100 50 | class_weight: balanced 51 | augment: 52 | strategy: 53 | choice: 54 | - move_to_neighbor-5 55 | - move_to_neighbor-10 56 | - move_to_neighbor-20 57 | - move_to_neighbor-50 58 | - move_to_neighbor-100 59 | - move_to_neighbor-200 60 | - move_to_neighbor-500 61 | augment_args: 62 | n_augmentations: 10 63 | augment_all: false 64 | augment_max: 1 65 | test: 66 | run_test: true 67 | -------------------------------------------------------------------------------- /deid/model/layers/gradient_reversal.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/keras-team/keras/pull/4031 2 | 3 | import tensorflow as tf 4 | from keras import backend as K 5 | from keras.engine import Layer 6 | 7 | 8 | def reverse_gradient(X, hp_lambda): 9 | """Flips the sign of the incoming gradient during training.""" 10 | try: 11 | reverse_gradient.num_calls += 1 12 | except AttributeError: 13 | reverse_gradient.num_calls = 1 14 | 15 | grad_name = "GradientReversal%d" % reverse_gradient.num_calls 16 | 17 | @tf.RegisterGradient(grad_name) 18 | def _flip_gradients(_, grad): 19 | return [tf.negative(grad) * hp_lambda] 20 | 21 | g = K.get_session().graph 22 | with g.gradient_override_map({'Identity': grad_name}): 23 | y = tf.identity(X) 24 | 25 | return y 26 | 27 | 28 | class GradientReversal(Layer): 29 | """Flip the sign of gradient during training.""" 30 | 31 | def __init__(self, hp_lambda=1.0, **kwargs): 32 | super(GradientReversal, self).__init__(**kwargs) 33 | assert hp_lambda > 0, f'hp_lambda is {hp_lambda} -- it should be > 0 to actually flip the gradient' 34 | self.hp_lambda = hp_lambda 35 | self.supports_masking = False 36 | 37 | def build(self, input_shape): 38 | self.trainable_weights = [] 39 | 40 | def call(self, x, mask=None): 41 | return reverse_gradient(x, self.hp_lambda) 42 | 43 | def compute_output_shape(self, input_shape): 44 | return input_shape 45 | 46 | def get_config(self): 47 | config = {'hp_lambda': self.hp_lambda} 48 | base_config = super(GradientReversal, self).get_config() 49 | return {**base_config, **config} 50 | -------------------------------------------------------------------------------- /deid/embeddings/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from . 
import Embeddings 4 | 5 | 6 | class Noise: 7 | def noise(self, size: int) -> None: 8 | raise NotImplementedError 9 | 10 | 11 | class GaussianNoise(Noise): 12 | def __init__(self, scale: float, loc=0., clip=None) -> None: 13 | self.loc = loc 14 | self.scale = scale 15 | self.clip = clip 16 | 17 | def noise(self, size): 18 | result = np.random.normal(self.loc, self.scale, size) 19 | if self.clip is not None: 20 | result = np.clip(result, self.clip[0], self.clip[1]) 21 | return result 22 | 23 | 24 | class DropoutNoise(Noise): 25 | def __init__(self, dropout_prob) -> None: 26 | self.dropout_prob = dropout_prob 27 | 28 | def noise(self, size): 29 | return np.random.choice(2, size, p=[self.dropout_prob, 1 - self.dropout_prob]) 30 | 31 | 32 | class NoiseWrapper(Embeddings): 33 | def __init__(self, embeddings: Embeddings, op, noise: Noise) -> None: 34 | self.wrapped_embeddings = embeddings 35 | self.noise = noise 36 | 37 | if type(op) == str: 38 | if op == 'add' or op == '+': 39 | self.op = lambda x, y: x + y 40 | elif op == 'mul' or op == '*': 41 | self.op = lambda x, y: x * y 42 | else: 43 | raise ValueError(f'Unrecognized op: {op}') 44 | else: 45 | self.op = op 46 | 47 | @property 48 | def size(self): 49 | return self.wrapped_embeddings.size 50 | 51 | def lookup(self, word): 52 | return self.op(self.wrapped_embeddings.lookup(word), self.noise.noise(self.size)) 53 | 54 | def __str__(self): 55 | return f'<{self.__class__.__name__} wrapper of {self.wrapped_embeddings} {vars(self)}>' 56 | -------------------------------------------------------------------------------- /deid/experiment/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import yaml 5 | 6 | from ..env import env 7 | 8 | config_dir = os.path.join(env.resources_dir, 'config') 9 | 10 | 11 | class Config(dict): 12 | """ A dict that returns None for missing items instead of raising an exception, including for child dicts """ 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | for k, v in self.items(): 17 | if k == 'choice': 18 | raise ValueError('This is a config template, not an experiment config. 
Please generate configs from ' 19 | 'it with python -m deid.tools.config') 20 | # please don't put a dict into itself (can't happen when importing from yaml anyway) 21 | if isinstance(v, dict): 22 | self[k] = Config(v) 23 | 24 | def __getitem__(self, key): 25 | if key.endswith('_args'): 26 | return self.get(key, {}) 27 | return self.get(key) 28 | 29 | 30 | def get_config(name): 31 | if os.path.isfile(name): 32 | return load_config_yaml(name) 33 | 34 | for parent in [config_dir, os.path.join(config_dir, 'generated')]: 35 | filename = os.path.join(parent, name) 36 | if os.path.isfile(filename): 37 | return load_config_yaml(filename) 38 | 39 | filename = filename + '.yaml' 40 | if os.path.isfile(filename): 41 | return load_config_yaml(filename) 42 | 43 | raise ValueError(f'Could not locate config "{name}" in config dir') 44 | 45 | 46 | def load_config_yaml(path): 47 | config = Config(yaml.load(open(path))) 48 | config['name'] = '.'.join(os.path.basename(path).split('.')[:-1]) 49 | config['path'] = path 50 | sys.stderr.write(f"Using {config['name']} config.\n") 51 | return config 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | temp 3 | deid/temp 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | .dmypy.json 115 | dmypy.json 116 | 117 | # Pyre type checker 118 | .pyre/ 119 | -------------------------------------------------------------------------------- /deid/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Optional 4 | 5 | deid_dir = os.path.dirname(__file__) 6 | 7 | 8 | # Defining the attributes as static variables isn't super smart as they are all created even if the config is not used. 9 | # This means os.environ['SOME_SPECIFIC_VAR'] will crash in other environments, so we have to use os.environ.get(). 10 | class Environment: 11 | name: str 12 | deid_dir: str = deid_dir 13 | data_dir: str 14 | work_dir: str 15 | resources_dir: str 16 | results_dir: str 17 | limit_training_documents: Optional[int] 18 | limit_validation_documents: Optional[int] 19 | use_short_sentences: bool 20 | keras_verbose: int 21 | save_model: int 22 | embeddings_cache: bool 23 | experiment_dir_postfix: Optional[str] = None 24 | 25 | unk_token: str = '' 26 | sent_start = '' 27 | sent_end = '' 28 | 29 | 30 | class Development(Environment): 31 | name = 'development' 32 | work_dir = os.path.join(os.environ['HOME'], 'deid_work') 33 | resources_dir = os.path.join(os.environ['HOME'], 'deid_resources') 34 | data_dir = os.path.join(resources_dir, 'i2b2_data') 35 | limit_training_documents = None # set this to e.g. 
10 for faster experimentation 36 | limit_validation_documents = None 37 | use_short_sentences = False 38 | keras_verbose = 1 39 | save_model = True 40 | embeddings_cache = True 41 | 42 | 43 | class Test(Environment): 44 | name = 'unit test' 45 | work_dir = os.path.join(deid_dir, 'fixtures', 'deid_work') 46 | resources_dir = os.path.join(deid_dir, 'fixtures', 'deid_resources') 47 | data_dir = os.path.join(resources_dir, 'i2b2_data') 48 | limit_training_documents = 4 49 | limit_validation_documents = 2 50 | use_short_sentences = True 51 | keras_verbose = 1 52 | save_model = False 53 | embeddings_cache = True 54 | 55 | 56 | env: Environment 57 | if 'DEID_TEST_CONFIG' in os.environ.keys() and os.environ['DEID_TEST_CONFIG']: 58 | env = Test() 59 | else: 60 | env = Development() 61 | sys.stderr.write(f'Using {env.name} environment.\n') 62 | -------------------------------------------------------------------------------- /deid/data/feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .token import Token 4 | from .util import one_hot 5 | 6 | 7 | def get(identifier): 8 | if identifier == 'case': 9 | return CaseFeature() 10 | if identifier == 'one': 11 | return UselessOneFeature() 12 | raise ValueError(f'unknown feature identifier: {identifier}') 13 | 14 | 15 | class Feature: 16 | def apply(self, token) -> np.ndarray: 17 | raise NotImplementedError 18 | 19 | @property 20 | def dimension(self): 21 | return NotImplementedError 22 | 23 | 24 | class CaseFeature(Feature): 25 | """ Casing feature from Reimers and Gurevych (2017) https://arxiv.org/abs/1707.06799 """ 26 | OTHER = 0 27 | NUMERIC = 1 28 | MAINLY_NUMERIC = 2 29 | ALL_LOWER = 3 30 | ALL_UPPER = 4 31 | INITIAL_UPPER = 5 32 | CONTAINS_DIGIT = 6 33 | 34 | def apply(self, token: Token) -> np.ndarray: 35 | token = token.text 36 | 37 | num_digits = len([char for char in token if char.isdigit()]) 38 | digit_fraction = num_digits / len(token) 39 | 40 | if token.isdigit(): 41 | casing = self.NUMERIC 42 | elif digit_fraction > 0.5: 43 | casing = self.MAINLY_NUMERIC 44 | elif token.islower(): 45 | casing = self.ALL_LOWER 46 | elif token.isupper(): 47 | casing = self.ALL_UPPER 48 | elif token[0].isupper(): 49 | casing = self.INITIAL_UPPER 50 | elif num_digits > 0: 51 | casing = self.CONTAINS_DIGIT 52 | else: 53 | casing = self.OTHER 54 | 55 | return one_hot(casing, 7) 56 | 57 | @property 58 | def dimension(self): 59 | return 7 60 | 61 | 62 | class UselessOneFeature(Feature): 63 | def apply(self, token) -> np.ndarray: 64 | return np.array([1]) 65 | 66 | @property 67 | def dimension(self): 68 | return 1 69 | 70 | 71 | def apply_features(features, sent): 72 | if len(features) == 0: 73 | return np.array([np.array([]) for _ in sent]) 74 | return np.array([np.concatenate([feature.apply(word) for feature in features]) for word in sent]) 75 | -------------------------------------------------------------------------------- /deid/data/util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Sequence, Optional, Any 3 | from spacy.util import compounding as spacy_compounding 4 | import numpy as np 5 | 6 | 7 | def one_hot(x: int, n: int) -> np.ndarray: 8 | result = np.zeros(n) 9 | result[x] = 1 10 | return result 11 | 12 | 13 | def compounding(start, stop, compound): 14 | """ Wraps spaCy's compounding utility to always return ints. 15 | 16 | >>> sizes = compounding(1., 10., 1.5) 17 | >>> assert next(sizes) == 1. 
18 | >>> assert next(sizes) == int(1 * 1.5) 19 | >>> assert next(sizes) == int(1.5 * 1.5) 20 | """ 21 | return (int(result) for result in spacy_compounding(start, stop, compound)) 22 | 23 | 24 | def peek(iterator): 25 | item = next(iterator) 26 | return item, itertools.chain([item], iterator) 27 | 28 | 29 | def pad_2d_sequences(seq: Sequence[Any], maxlen: Optional[int] = None, 30 | embedding_size: Optional[int] = None) -> np.ndarray: 31 | """ Like keras.preprocessing.sequence.pad_sequences but for 2d (already embedded) sequences. 32 | 33 | Caveat: this function does not truncate inputs. An error will be raised if the specified maxlen is smaller than the 34 | actual maximum length in the sequence. 35 | 36 | :param seq: the input sequence 37 | :param maxlen: the length to which the result will be padded, may be None 38 | :param embedding_size: the embedding dimension of the input, may be None 39 | :return: a padded array 40 | """ 41 | 42 | # find the maximum length by looking through the sequence 43 | if maxlen is None: 44 | maxlen = -1 45 | for item in seq: 46 | maxlen = max(maxlen, len(item)) 47 | 48 | # find the embedding dimension by looking through the sequence until there is a non-empty item 49 | if embedding_size is None: 50 | for item in seq: 51 | if len(item) != 0: 52 | embedding_size = len(item[0]) 53 | break 54 | 55 | result = np.zeros((len(seq), maxlen, embedding_size)) 56 | for i, item in enumerate(seq): 57 | assert len(item) > 0 58 | result[i, -len(item):] = item 59 | return result 60 | -------------------------------------------------------------------------------- /deid/experiment/alternating_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import numpy as np 7 | from keras.callbacks import EarlyStopping, LambdaCallback 8 | from keras.utils.generic_utils import Progbar 9 | 10 | from .alternating import alternating_experiment 11 | from ..env import env 12 | 13 | 14 | def make_progress_bar(target): 15 | return Progbar(target=target, verbose=env.keras_verbose) 16 | 17 | 18 | def alternating_evaluation_experiment(config): 19 | weights = config['test']['test_weights'] 20 | model, tr, train_gen, val, valid_gen, experiment_dir = alternating_experiment(config, run_experiment=False) 21 | 22 | model.complete_model.load_weights(weights) 23 | 24 | batch_size = config['training']['batch_size'] 25 | test_batch_size = config['training']['test_batch_size'] 26 | if test_batch_size is None: 27 | test_batch_size = batch_size 28 | 29 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 30 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 31 | 32 | before_fine_tuning_weights = model.train_representer.get_weights() 33 | 34 | def assert_fixed_weights(): 35 | after_fine_tuning_weights = model.train_representer.get_weights() 36 | for i in range(len(before_fine_tuning_weights)): 37 | assert np.all(before_fine_tuning_weights[i] == after_fine_tuning_weights[i]) 38 | 39 | assert_fixed_representer = LambdaCallback(on_epoch_end=lambda epoch, logs: assert_fixed_weights()) 40 | callbacks = [early_stopping, flush, assert_fixed_representer] 41 | 42 | print('Training adversary') 43 | history = model.pretrain_adversary.fit_generator(train_gen, 44 | epochs=config['training']['train_epochs'], 45 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 46 | validation_data=valid_gen, 47 | 
validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 48 | callbacks=callbacks, 49 | verbose=env.keras_verbose) 50 | 51 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 52 | print('Saving history to', history_pickle_path) 53 | with open(history_pickle_path, 'wb') as f: 54 | pickle.dump(history.history, f) 55 | -------------------------------------------------------------------------------- /deid/embeddings/embeddings.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Dict 2 | 3 | import numpy as np 4 | 5 | 6 | class Embeddings: 7 | """ Flexible base class for embeddings that doesn't necessarily use a matrix """ 8 | 9 | @property 10 | def size(self) -> int: 11 | raise NotImplementedError 12 | 13 | @property 14 | def mean(self) -> float: 15 | return 0. 16 | 17 | @property 18 | def std(self) -> float: 19 | raise NotImplementedError 20 | 21 | def is_unknown(self, word: str) -> bool: 22 | raise NotImplementedError 23 | 24 | def lookup(self, word: str) -> np.ndarray: 25 | """ Looks up the vector representation of one word. 26 | 27 | :param word: an input string 28 | :return: a vector representation of size `size` 29 | """ 30 | raise NotImplementedError 31 | 32 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]: 33 | """ Looks up the vector representation of multiple words. Override this if there is a more efficient way to get 34 | a batch of embeddings than looking them up one by one. 35 | 36 | :param words: a sequence of input strings 37 | :return: a vector representation of size `(len(words), size)` 38 | """ 39 | return np.array([self.lookup(word) for word in words]) 40 | 41 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]: 42 | """ Looks up the vector representation of an entire sentence. Override this if there is a more efficient way to 43 | get a batch of embeddings sequences than looking them up one by one. 44 | 45 | :param sentences: a sequence of sequences of input strings 46 | :return: a sequence of arrays that have size `(len(sentence), size)` for the corresponding sentence 47 | """ 48 | 49 | return [self.lookup_sentence(sentence) for sentence in sentences] 50 | 51 | 52 | class PrecomputedEmbeddings(Embeddings): 53 | """ Base class for embeddings that provide a precomputed matrix in addition to the lookup """ 54 | 55 | @property 56 | def size(self) -> int: 57 | raise NotImplementedError 58 | 59 | @property 60 | def std(self) -> float: 61 | raise NotImplementedError 62 | 63 | def is_unknown(self, word: str) -> bool: 64 | raise NotImplementedError 65 | 66 | def lookup(self, word: str) -> np.ndarray: 67 | raise NotImplementedError 68 | 69 | @property 70 | def precomputed_word2ind(self) -> Dict[str, int]: 71 | raise NotImplementedError 72 | 73 | @property 74 | def precomputed_matrix(self) -> np.ndarray: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /deid/data/tokenizer_tests.py: -------------------------------------------------------------------------------- 1 | from . 
import tokenize 2 | 3 | 4 | def assert_number_of_tokens(doc, count): 5 | assert len(doc) == count, f'token sequence {[str(t) for t in doc]} has length {len(doc)}, expected {count}' 6 | 7 | 8 | def assert_number_of_sentences(doc, count): 9 | sents = list(str(sent) for sent in doc.sents) 10 | assert len(sents) == count, f'doc {sents} has {len(sents)} sentences, expected {count}' 11 | 12 | 13 | def test_tokenize_one_sentence(): 14 | doc = tokenize('A sentence that is simple to tokenize.') 15 | assert_number_of_tokens(doc, 8) 16 | assert doc[-1].text == '.' 17 | 18 | 19 | def test_tokenize_multiple_sentences(): 20 | doc = tokenize('One sentence. And another sentence.') 21 | assert_number_of_sentences(doc, 2) 22 | 23 | 24 | def test_tokenize_phone_number(): 25 | doc = tokenize('555-2394-72-01') 26 | assert_number_of_tokens(doc, 7) 27 | 28 | 29 | def test_tokenize_custom_infixes(): 30 | doc = tokenize('a/b') 31 | assert_number_of_tokens(doc, 3) 32 | 33 | doc = tokenize('a_b_c') 34 | assert_number_of_tokens(doc, 5) 35 | 36 | doc = tokenize('81-year-old') 37 | tokens = [str(t) for t in doc] 38 | assert tokens == ['81', '-', 'year', '-', 'old'] 39 | 40 | doc = tokenize('a^b') 41 | tokens = [str(t) for t in doc] 42 | assert tokens == ['a', '^', 'b'] 43 | 44 | doc = tokenize('25yo') 45 | tokens = [str(t) for t in doc] 46 | assert tokens == ['25', 'yo'] 47 | 48 | 49 | def test_tokenize_sentences(): 50 | doc = tokenize('Here is some text that is followed by many newlines\n \n \n \n \nAnd here is some other text.') 51 | assert_number_of_sentences(doc, 2) 52 | 53 | doc = tokenize("""- First list item 54 | - and the second list item, which does not necessarily look like a sentence start.""") 55 | assert_number_of_sentences(doc, 2) 56 | 57 | doc = tokenize("""1. test 58 | 2: ok""") 59 | assert_number_of_sentences(doc, 2) 60 | 61 | doc = tokenize("""----list with unusual format 62 | ----starting with some dashes, no space between dashes and first word 63 | ---sometimes it's a different number of dashes""") 64 | assert_number_of_sentences(doc, 6) 65 | 66 | 67 | def test_tokenize_html(): 68 | doc = tokenize('NASA & SpaceX') 69 | tokens = [t for t in doc] 70 | assert [str(t) for t in tokens] == ['NASA', '&', 'SpaceX'] 71 | assert tokens[0]._.unescaped_html is None 72 | assert tokens[1]._.unescaped_html == '&' 73 | 74 | doc = tokenize('NASA > SpaceX') 75 | tokens = [t for t in doc] 76 | assert [str(t) for t in tokens] == ['NASA', '>', 'SpaceX'] 77 | assert tokens[0]._.unescaped_html is None 78 | assert tokens[1]._.unescaped_html == '>' 79 | 80 | doc = tokenize('Nasa
SpaceX') 81 | tokens = [t for t in doc] 82 | assert tokens[1]._.unescaped_html == '\n' 83 | -------------------------------------------------------------------------------- /deid/tools/dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | from datetime import datetime 6 | 7 | from typing import NamedTuple 8 | 9 | from ..env import env 10 | 11 | NUM_TRAIN_VALID_DOCS = 790 # the total number of train + validation docs in the i2b2 dataset 12 | 13 | 14 | class Document(NamedTuple): 15 | csv: str 16 | txt: str 17 | xml: str 18 | 19 | 20 | def make_dataset(train_split: float, valid_split: float): 21 | data_dir = os.path.join(env.data_dir, 'generated') 22 | if not os.path.isdir(data_dir): 23 | os.mkdir(data_dir) 24 | 25 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S') 26 | config = f'{train_split}-{valid_split}-{date_str}' 27 | 28 | dataset_train_dir = os.path.join(data_dir, f'train-{config}') 29 | os.mkdir(dataset_train_dir) 30 | 31 | dataset_train_xml_dir = os.path.join(data_dir, f'train-{config}_xml') 32 | os.mkdir(dataset_train_xml_dir) 33 | 34 | dataset_valid_dir = os.path.join(data_dir, f'validation-{config}') 35 | os.mkdir(dataset_valid_dir) 36 | 37 | dataset_valid_xml_dir = os.path.join(data_dir, f'validation-{config}_xml') 38 | os.mkdir(dataset_valid_xml_dir) 39 | 40 | all_documents = [] 41 | for dataset in ['train', 'validation']: 42 | dataset_dir = os.path.join(env.data_dir, dataset) 43 | dataset_xml_dir = os.path.join(env.data_dir, dataset + '_xml') 44 | for filename in [filename for filename in os.listdir(dataset_dir) if filename.endswith('csv')]: 45 | csv_filename = os.path.join(dataset_dir, filename) 46 | txt_filename = os.path.join(dataset_dir, filename[:-3] + 'txt') 47 | xml_filename = os.path.join(dataset_xml_dir, filename[:-3] + 'xml') 48 | all_documents.append(Document(csv=csv_filename, xml=xml_filename, txt=txt_filename)) 49 | 50 | size = min(max(int(train_split * NUM_TRAIN_VALID_DOCS), 2), NUM_TRAIN_VALID_DOCS) 51 | train_documents = random.sample(all_documents, size) 52 | valid_size = max(int(valid_split * len(train_documents)), 1) 53 | valid_documents = random.sample(train_documents, valid_size) 54 | print(f'Using {size-valid_size} train documents and {valid_size} validation documents.') 55 | 56 | for document in train_documents: 57 | target = dataset_valid_dir if document in valid_documents else dataset_train_dir 58 | shutil.copy2(document.csv, target) 59 | shutil.copy2(document.txt, target) 60 | shutil.copy2(document.xml, target + '_xml') 61 | 62 | print(f'Made dataset at {dataset_train_dir}, {dataset_valid_dir}') 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.description = 'Make train and validation sets of a specified size' 68 | parser.add_argument('train_split', type=float) 69 | parser.add_argument('--valid_split', type=float, default=0.2) 70 | args = parser.parse_args() 71 | 72 | make_dataset(args.train_split, args.valid_split) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /deid/model/layers/noise.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.engine.topology import Layer 3 | 4 | 5 | class Noise(Layer): 6 | """ Abstract Gaussian Noise layer with trainable mean and standard deviation """ 7 | 8 | def __init__(self, operation, single_stddev: 
bool, apply_noise: bool = True, **kwargs) -> None: 9 | """ Initializes the Noise layer. 10 | 11 | :param operation: the operation to apply to the inputs and noise, may be '+'/'add' or '*'/'mult'. The mean of 12 | the noise will be set according to this operator. 13 | :param single_stddev: whether to learn a matrix of noise stddev values instead of only one stddev value that is 14 | applied to all dimensions of the data 15 | :param apply_noise: set this to False to only apply the mean instead of noise 16 | :param kwargs: other Layer arguments 17 | """ 18 | super().__init__(**kwargs) 19 | if operation == '+' or operation == 'add': 20 | self.operation = lambda x, y: x + y 21 | self.mean = 0. 22 | elif operation == '*' or operation == 'mult': 23 | self.operation = lambda x, y: x * y 24 | self.mean = 1. 25 | else: 26 | raise ValueError(f'unknown operation: {operation}') 27 | 28 | self.apply_noise = K.constant(value=apply_noise) 29 | self.single_stddev = single_stddev 30 | self.k = self.stddev = None # will be initialized in the build method 31 | 32 | self.supports_masking = True 33 | 34 | def build(self, input_shape): 35 | self.k = self.add_weight(name='k', 36 | shape=(1,), 37 | initializer='ones', 38 | trainable=True) 39 | self.stddev = self.add_weight(name='stddev', 40 | shape=(1,) if self.single_stddev else (input_shape[-1],), 41 | initializer='normal', 42 | trainable=True) 43 | super().build(input_shape) 44 | 45 | def compute_output_shape(self, input_shape): 46 | return input_shape 47 | 48 | def call(self, inputs, **kwargs): 49 | def noise(): 50 | noise_matrix = K.random_normal(shape=K.shape(inputs), mean=self.mean, stddev=self.stddev) 51 | return self.operation(inputs, self.k * noise_matrix) 52 | 53 | return K.switch(self.apply_noise, noise, inputs) 54 | 55 | def get_config(self): 56 | config = {'apply_noise': self.apply_noise, 57 | 'mean': self.mean, 58 | 'single_stddev': self.single_stddev, 59 | 'k': self.k} 60 | base_config = super().get_config() 61 | return {**base_config, **config} 62 | 63 | 64 | class AdditiveNoise(Noise): 65 | def __init__(self, **kwargs): 66 | super().__init__('+', **kwargs) 67 | 68 | 69 | class MultiplicativeNoise(Noise): 70 | def __init__(self, **kwargs): 71 | super().__init__('*', **kwargs) 72 | -------------------------------------------------------------------------------- /deid/model/deidentifier.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Input, Dense, LSTM, Bidirectional, TimeDistributed, Masking, Dropout, concatenate, Lambda 3 | from keras.models import Model 4 | from keras_contrib.layers import CRF 5 | 6 | 7 | def make_lstm_crf(input_size, hidden_size, output_size, name='deidentifier', extra_input_size=0, num_hidden=1, 8 | input_dropout=0., recurrent_dropout=0., after_hidden_dropout=0., use_crf=False, optimizer=None, 9 | l2_normalize=False): 10 | """ Make a BiLSTM(-CRF) model that can be used for de-identification. 
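    An illustrative call (a sketch only; the 300/128/9 sizes and the Nadam optimizer are
    placeholder choices, not values fixed by this function):

        from keras.optimizers import Nadam
        model = make_lstm_crf(input_size=300, hidden_size=128, output_size=9,
                              num_hidden=1, use_crf=False, optimizer=Nadam())
        # compiled single-input model: expects batches of shape (batch, timesteps, 300)
        # and predicts per-token label distributions of shape (batch, timesteps, 9)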
11 | 12 | :param input_size: the embedding/representation input size 13 | :param hidden_size: the number of LSTM units per direction 14 | :param output_size: the number of output labels 15 | :param name: a name for the model 16 | :param extra_input_size: size for an additional input, if it is 0, this returns a single-input model 17 | :param num_hidden: the number of LSTM layers 18 | :param input_dropout: dropout probability for the input layer 19 | :param recurrent_dropout: recurrent (variational) dropout probability 20 | :param after_hidden_dropout: dropout probability for the LSTM outputs 21 | :param use_crf: whether to use a CRF to optimize the output sequences 22 | :param optimizer: a Keras optimizer, or None if the model should not be compiled 23 | :param l2_normalize: whether to L2 normalize the embedding/representation input 24 | :return: a tuple (model, loss), or a compiled Keras model if an optimizer was specified 25 | """ 26 | embedding_input = Input(shape=(None, input_size)) 27 | x = Masking()(embedding_input) 28 | if l2_normalize: 29 | x = Lambda(lambda x: K.l2_normalize(x, axis=-1))(x) 30 | x = Dropout(input_dropout)(x) 31 | 32 | extra_input = Input(shape=(None, extra_input_size)) 33 | if extra_input_size > 0: 34 | x2 = Masking()(extra_input) 35 | x = concatenate([x, x2]) 36 | 37 | for _ in range(num_hidden): 38 | x = Bidirectional(LSTM(hidden_size, return_sequences=True, dropout=after_hidden_dropout, 39 | recurrent_dropout=recurrent_dropout))(x) 40 | if use_crf: 41 | # CRF learn mode 'join' does not work at the moment, this GitHub issue contains a minimal example showing 42 | # the problem: https://github.com/keras-team/keras-contrib/issues/271 43 | x = TimeDistributed(Dense(output_size, activation=None))(x) 44 | crf = CRF(output_size, sparse_target=True, learn_mode='marginal', name='deid_output') 45 | x = crf(x) 46 | loss = crf.loss_function 47 | else: 48 | x = TimeDistributed(Dense(output_size, activation='softmax'), name='deid_output')(x) 49 | loss = 'sparse_categorical_crossentropy' 50 | 51 | if extra_input_size > 0: 52 | model = Model([embedding_input, extra_input], x, name=name) 53 | else: 54 | model = Model(embedding_input, x, name=name) 55 | 56 | if optimizer is not None: 57 | model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) 58 | return model 59 | return model, loss 60 | -------------------------------------------------------------------------------- /deid/embeddings/glove.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import Optional, Dict 4 | 5 | import numpy as np 6 | 7 | from . import PrecomputedEmbeddings 8 | from ..env import env 9 | 10 | glove_dir = os.path.join(env.resources_dir, 'glove.6B') 11 | 12 | 13 | class GloveEmbeddings(PrecomputedEmbeddings): 14 | """ Pre-trained GloVe embeddings, see https://nlp.stanford.edu/projects/glove/ """ 15 | 16 | def __init__(self, dims: int = 300, vocab_size: Optional[int] = None) -> None: 17 | """ Initialize a GloveEmbeddings object. 
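        A hedged usage sketch (assumes the glove.6B.300d.txt file is available under the
        resources directory; the vocab_size cap of 100000 is only an example value):

            emb = GloveEmbeddings(dims=300, vocab_size=100000)
            vec = emb.lookup('hospital')   # L2-normalized vector of shape (300,)
            emb.is_unknown('zzxqwv')       # typically True: the word falls back to the UNK index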
18 | 19 | :param dims: the GloVe variant to use (50, 100, 200, or 300 dimensions) 20 | :param vocab_size: limits the size of the embedding matrix 21 | """ 22 | self._dims = dims 23 | filename = os.path.join(glove_dir, f'glove.6B.{dims}d.txt') 24 | if not os.path.isfile(filename): 25 | raise ValueError(f"Can't find GloVe embeddings with {dims} dims in {glove_dir}.") 26 | 27 | embeddings = [np.zeros(dims), np.random.normal(0., scale=1e-6, size=dims)] # Padding and UNK 28 | self._word2ind = {env.unk_token: 1} 29 | self._ind2word = {1: env.unk_token} 30 | 31 | with open(filename) as f: 32 | for i, line in enumerate(f, start=2): 33 | values = line.split() 34 | word = values[0] 35 | embedding = np.asarray(values[1:], dtype='float32') 36 | self._word2ind[word] = i 37 | self._ind2word[i] = word 38 | embeddings.append(embedding / np.linalg.norm(embedding)) 39 | if i == vocab_size: 40 | break 41 | 42 | self._embeddings = np.array(embeddings) 43 | 44 | @property 45 | def precomputed_word2ind(self) -> Dict[str, int]: 46 | return self._word2ind 47 | 48 | @property 49 | def precomputed_matrix(self) -> np.ndarray: 50 | return self._embeddings 51 | 52 | @property 53 | def size(self) -> int: 54 | return self._dims 55 | 56 | @property 57 | def std(self): 58 | return 0.37 59 | 60 | def word2ind(self, word: str) -> int: 61 | result = self._word2ind.get(word) 62 | if result is not None: 63 | return result 64 | 65 | word = word.lower() 66 | result = self._word2ind.get(word) 67 | if result is not None: 68 | return result 69 | 70 | word = re.sub(r'\W', '', word) 71 | result = self._word2ind.get(word) 72 | if result is not None: 73 | return result 74 | 75 | # replace every digit with a 0 76 | result = self._word2ind.get(re.sub(r'\d', '0', word)) 77 | if result is not None: 78 | return result 79 | 80 | # replace all connected digits with a single 0 81 | result = self._word2ind.get(re.sub(r'\d*', '0', word)) 82 | if result is not None: 83 | return result 84 | 85 | return self._word2ind[env.unk_token] 86 | 87 | def lookup(self, word: str) -> np.ndarray: 88 | return self._embeddings[self.word2ind(word)] 89 | 90 | def is_unknown(self, word: str): 91 | return np.all(self.word2ind(word) == self._word2ind[env.unk_token]) 92 | 93 | def __str__(self) -> str: 94 | return '' 95 | -------------------------------------------------------------------------------- /deid/tools/i2b2_xml_to_csv_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from deid.env import Test 4 | from .i2b2_xml_to_csv import xml_to_annotated_tokens_and_text 5 | 6 | 7 | def find_token(tokens, text): 8 | return next((index, token) for index, token in enumerate(tokens) if token.text.startswith(text)) 9 | 10 | 11 | def test_xml_to_annotated_tokens_and_text(): 12 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-99.xml'), 13 | check_alignment=True) 14 | 15 | _, date_token = find_token(tokens, '2018') 16 | assert date_token.type == 'B-DATE' 17 | assert date_token.start == 16 18 | 19 | max_index, max_token = find_token(tokens, 'Max') 20 | assert max_token.type == 'B-PATIENT' 21 | assert max_token.start == 28 22 | assert max_token.end == 31 23 | 24 | assert tokens[max_index + 1].type == 'I-PATIENT' 25 | assert tokens[max_index + 2].type == 'O' 26 | 27 | lines = text.strip().split('\n') 28 | assert lines[0] == 'Record date: 2018-06-15' 29 | assert lines[2] == 'Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany.' 
30 | 31 | 32 | def test_tags_right_after_each_other(): 33 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-98.xml'), 34 | check_alignment=True) 35 | one_index, one_token = find_token(tokens, 'one') 36 | assert one_token.type == 'B-DATE' 37 | 38 | two_token = tokens[one_index + 1] 39 | assert two_token.type == 'B-AGE' 40 | 41 | three_token = tokens[one_index + 2] 42 | assert three_token.type == 'I-AGE' 43 | 44 | four_token = tokens[one_index + 3] 45 | assert four_token.type == 'B-DATE' 46 | 47 | medical_record_token = tokens[one_index + 4] 48 | assert medical_record_token.type == 'B-MEDICALRECORD' 49 | 50 | hospital_token = tokens[one_index + 5] 51 | print(hospital_token) 52 | assert hospital_token.type == 'B-HOSPITAL' 53 | 54 | 55 | def test_uses_start_tag_even_with_wrong_alignment(): 56 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-97.xml'), 57 | check_alignment=True) 58 | print(tokens) 59 | zero_index, zero_token = find_token(tokens, 'zero') 60 | assert zero_token.type == 'O' 61 | 62 | one_token = tokens[zero_index + 1] 63 | assert one_token.type == 'B-DATE' 64 | 65 | two_token = tokens[zero_index + 2] 66 | assert two_token.type == 'B-AGE' # not I-AGE 67 | 68 | three_token = tokens[zero_index + 3] 69 | assert three_token.type == 'O' 70 | 71 | four_token = tokens[zero_index + 4] 72 | assert four_token.type == 'B-DATE' 73 | 74 | 75 | def test_escape_html(): 76 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-96.xml'), 77 | check_alignment=True) 78 | print(tokens) 79 | zero_index, zero_token = find_token(tokens, 'zero') 80 | assert zero_token.type == 'O' 81 | 82 | lt_token = tokens[zero_index + 1] 83 | assert lt_token.text == '<' 84 | assert lt_token.type == 'O' 85 | assert lt_token.start == 8 86 | 87 | one_token = tokens[zero_index + 2] 88 | assert one_token.type == 'B-DATE' 89 | assert one_token.start == 13 90 | -------------------------------------------------------------------------------- /deid/data/batch_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .batch import BatchGenerator, StratifiedSampling 4 | from .util import compounding 5 | from ..data import TrainingSet 6 | from ..embeddings import DummyEmbeddings 7 | 8 | 9 | def test_generator(): 10 | batch_size = 2 11 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings()) 12 | 13 | generator = BatchGenerator(tr.X, tr.y, batch_size) 14 | x, y = next(generator) 15 | assert x.shape[0] == y.shape[0] == batch_size 16 | 17 | 18 | def test_generator_yields_incomplete_batches(): 19 | def make_array(): 20 | return np.array([[[i for _ in range(10)] for _ in range(3)] for i in range(3)]) 21 | 22 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=True) 23 | assert generator.epoch_length == 2 24 | x, y = next(generator) 25 | assert x.shape[0] == y.shape[0] == 2 26 | 27 | x, y = next(generator) 28 | assert x.shape[0] == y.shape[0] == 1 29 | 30 | x, y = next(generator) 31 | assert x.shape[0] == y.shape[0] == 2 32 | 33 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=False) 34 | assert generator.epoch_length == 1 35 | x, y = next(generator) 36 | assert x.shape[0] == y.shape[0] == 2 37 | 38 | x, y = next(generator) 39 | assert x.shape[0] == y.shape[0] == 2 40 | 41 | 42 | def test_generator_compounding_batch_size(): 43 | def 
make_array(): 44 | return np.ones((100, 10, 1)) 45 | 46 | generator = BatchGenerator(make_array(), make_array(), batch_size=compounding(1, 20, 1.1), 47 | yield_incomplete_batches=False) 48 | compounding_value = 1 49 | 50 | sum = 0 51 | print('batch sizes:', generator.epoch_batch_sizes) 52 | print('epoch length:', generator.epoch_length) 53 | for i in range(40): # 1 * 1.1**40 ≈ 45, so it's testing the maximum size as well 54 | compounding_value = min(20, int(1.1 ** i)) 55 | x, y = next(generator) 56 | sum += x.shape[0] 57 | print(f'({i})', x.shape[0], '=', compounding_value, sum) 58 | assert x.shape[0] == y.shape[0] == int(compounding_value) 59 | 60 | assert compounding_value == 20 61 | 62 | 63 | def test_generator_yields_permutation(): 64 | def make_array(): 65 | return np.arange(0, 100).reshape((10, 10, 1)) 66 | 67 | x, y = make_array(), make_array() 68 | generator = BatchGenerator(x, y, batch_size=5, yield_indices=True) 69 | 70 | for _ in range(5): # so we shuffle a couple of times 71 | batch_x, batch_y, batch_ind = next(generator) 72 | assert np.all(batch_x[0] == x[batch_ind[0]]) 73 | 74 | 75 | def test_stratified_sampling(): 76 | def make_array(): 77 | arr = np.zeros((100, 10, 1)) 78 | for i in range(100): 79 | arr[i] = np.ones((10, 1)) * i 80 | return arr 81 | 82 | x, y = make_array(), make_array() 83 | generator = StratifiedSampling(x, y, split_condition=lambda x, _: x[-1] >= 20, batch_size=6, yield_indices=True) 84 | 85 | assert generator.epoch_length == 7 86 | 87 | batch_x, batch_y, batch_ind = next(generator) 88 | assert np.all(batch_x[0] == x[batch_ind[0]]) 89 | assert batch_x.size == 60 90 | assert batch_x[batch_x >= 20].size == 30 # half of them 91 | 92 | for _ in range(1, generator.epoch_length): 93 | batch_x, batch_y, batch_ind = next(generator) 94 | assert batch_x.size == 40 # last batch should be incomplete 95 | 96 | batch_x, batch_y, batch_ind = next(generator) 97 | assert batch_x.size == 60 # next epoch 98 | -------------------------------------------------------------------------------- /deid/tools/embeddings.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import pickle 4 | 5 | import numpy as np 6 | 7 | 8 | def _create_cache(embeddings_class, lookup_sentences): 9 | from ..data import DataSet, TrainingSet, ValidationSet, TestSet 10 | 11 | def sentences_from_dataset(dataset: DataSet): 12 | return [[token.text for token in sentence] for sentence in dataset.X] 13 | 14 | def words_from_dataset(dataset: DataSet): 15 | return list(itertools.chain.from_iterable(sentences_from_dataset(dataset))) 16 | 17 | read_dataset = sentences_from_dataset if lookup_sentences else words_from_dataset 18 | 19 | print('Loading the vocabulary...') 20 | tr = TrainingSet() 21 | vocab = read_dataset(tr) 22 | vocab += read_dataset(ValidationSet(validation_set='validation', embeddings=None, label2ind=tr.label2ind)) 23 | vocab += read_dataset(ValidationSet(validation_set='test', embeddings=None, label2ind=tr.label2ind)) 24 | 25 | print('Loading embeddings...') 26 | embeddings_class(vocab) 27 | print('Done.') 28 | 29 | 30 | def create_fasttext_cache(): 31 | from ..embeddings import CachedFastTextEmbeddings 32 | _create_cache(CachedFastTextEmbeddings, lookup_sentences=False) 33 | 34 | 35 | def create_elmo_cache(): 36 | from ..embeddings import CachedElmoEmbeddings 37 | _create_cache(CachedElmoEmbeddings, lookup_sentences=True) 38 | 39 | 40 | def convert_precomputed_fasttext_embeddings(): 41 | from ..embeddings.fasttext import 
fasttext_dir, fasttext_embeddings_name 42 | 43 | print('Loading precomputed embeddings...') 44 | vec_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec') 45 | 46 | precomputed_vocab = np.loadtxt(vec_filename, usecols=0, dtype=object, skiprows=2, comments=None) 47 | precomputed_word2ind = {word: i for i, word in enumerate(precomputed_vocab)} 48 | 49 | # make sure there are no duplicate words 50 | assert len(precomputed_vocab) == len(precomputed_word2ind) 51 | 52 | precomputed_matrix = np.loadtxt(vec_filename, usecols=range(1, 301), skiprows=2, comments=None) 53 | 54 | print('L2 normalizing the embedding matrix...') 55 | normalized_matrix = precomputed_matrix / np.sqrt((precomputed_matrix ** 2).sum(-1))[..., np.newaxis] 56 | 57 | print('Saving the dictionary...') 58 | pickle.dump(precomputed_word2ind, open(vec_filename + '.vocab.pickle', 'wb')) 59 | print('Saving the matrix...') 60 | np.save(vec_filename + '.matrix.npy', normalized_matrix) 61 | print('Done.') 62 | 63 | 64 | def main(): 65 | import argparse 66 | 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--fasttext-cache', help='Initialize a fasttext embeddings cache with the i2b2 vocabulary', 69 | action='store_true') 70 | parser.add_argument('--fasttext-precomputed', help='Convert precomputed fasttext embeddings to matrix/dict', 71 | action='store_true') 72 | parser.add_argument('--elmo-cache', help='Initialize an elmo embeddings cache with the i2b2 vocabulary', 73 | action='store_true') 74 | args = parser.parse_args() 75 | 76 | if not any([args.fasttext_cache, args.fasttext_precomputed, args.elmo_cache]): 77 | print('Specify at least one of --fasttext-cache, --fasttext-precomputed, --elmo-cache') 78 | 79 | if args.fasttext_cache: 80 | create_fasttext_cache() 81 | 82 | if args.elmo_cache: 83 | create_elmo_cache() 84 | 85 | if args.fasttext_precomputed: 86 | convert_precomputed_fasttext_embeddings() 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /deid/data/postprocess.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, NamedTuple, Sequence, Tuple, Optional 3 | 4 | import numpy as np 5 | # noinspection PyProtectedMember 6 | from bs4 import CData, BeautifulSoup 7 | 8 | from .token import Token, TOKEN_TYPE 9 | 10 | 11 | class TaggedTokens(NamedTuple): 12 | type: str 13 | tokens: List[Token] 14 | start: int 15 | end: int 16 | 17 | 18 | class TagAssembler: 19 | def __init__(self, sent_tokens: Sequence[Token]) -> None: 20 | self.input = sent_tokens 21 | self.result: List[TaggedTokens] = [] 22 | self.current_tag: Optional[str] = None 23 | self.current_tag_tokens: List[Token] = [] 24 | 25 | def close_current_tag(self) -> None: 26 | if self.current_tag is not None: 27 | self.result.append(TaggedTokens(self.current_tag, 28 | self.current_tag_tokens, 29 | self.current_tag_tokens[0].start, 30 | self.current_tag_tokens[-1].end)) 31 | self.current_tag = None 32 | self.current_tag_tokens = [] 33 | 34 | def assemble(self) -> Sequence[TaggedTokens]: 35 | for t in self.input: 36 | if t.type == 'O': 37 | self.close_current_tag() 38 | elif t.type.startswith('I') and self.current_tag == t.type[2:]: 39 | self.current_tag_tokens.append(t) 40 | else: # B tag or a stray I tag that should be normalized to a B 41 | self.close_current_tag() 42 | self.current_tag = t.type[2:] 43 | self.current_tag_tokens.append(t) 44 | 45 | self.close_current_tag() 46 | 
return self.result 47 | 48 | 49 | def unpad(X, preds) -> Tuple[List, List]: 50 | assert len(X) == len(preds), f'X and preds have different lengths: {len(X)} != {len(preds)} ' 51 | unpadded_X, unpadded_preds = [], [] 52 | for i in range(len(X)): 53 | if isinstance(X[i], np.ndarray): 54 | actual_length = np.sum(X[i].any(axis=1)) 55 | X_start = preds_start = len(X[i]) - actual_length 56 | else: 57 | X_start = 0 58 | preds_start = len(preds[i]) - len(X[i]) 59 | unpadded_X.append(list(X[i][X_start:])) 60 | unpadded_preds.append(list(preds[i][preds_start:])) 61 | assert len(unpadded_X[i]) == len(unpadded_preds[i]) 62 | return unpadded_X, unpadded_preds 63 | 64 | 65 | def postprocess_prediction(X, preds, sents, ind2label_lookup) -> Sequence[Sequence[TaggedTokens]]: 66 | X, preds = unpad(X, preds) 67 | 68 | result = [] 69 | for i in range(len(X)): 70 | sent_tokens = [] 71 | for j in range(len(X[i])): 72 | sent_tokens.append( 73 | Token(sents[i][j].text, ind2label_lookup(preds[i][j]), sents[i][j].start, sents[i][j].end)) 74 | result.append(sent_tokens) 75 | 76 | return [TagAssembler(sent_tokens).assemble() for sent_tokens in result] 77 | 78 | 79 | def prediction_to_xml(X, preds, text, sents, ind2label_lookup) -> str: 80 | preds = postprocess_prediction(X, preds, sents, ind2label_lookup) 81 | 82 | soup = BeautifulSoup('', features='xml') 83 | soup.find('TEXT').string = CData(text) 84 | tags = soup.find('TAGS') 85 | for i, tagged_tokens in enumerate(itertools.chain.from_iterable(preds)): 86 | tags.append(soup.new_tag(TOKEN_TYPE[tagged_tokens.type], 87 | id=f'P{i}', 88 | start=tagged_tokens.start, 89 | end=tagged_tokens.end, 90 | TYPE=tagged_tokens.type, 91 | text=text[tagged_tokens.start:tagged_tokens.end])) 92 | 93 | return str(soup) 94 | -------------------------------------------------------------------------------- /deid/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | import html 2 | 3 | import spacy 4 | from spacy.matcher import Matcher 5 | from spacy.tokens import Token 6 | 7 | 8 | def _deid_tokenizer(): 9 | prefixes = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) 10 | 11 | my_infix = ['\.\.\.+', 12 | '(?<=[0-9])-(?=[0-9])', 13 | '(?<=[0-9])(?=[A-Za-z])', 14 | '[!&:;#,()/_\\-\\^~%{}=\'<>@]'] 15 | infixes = spacy.util.compile_infix_regex(list(nlp.Defaults.infixes) + my_infix) 16 | 17 | suffixes = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) 18 | 19 | return spacy.tokenizer.Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions, 20 | prefix_search=prefixes.search, 21 | infix_finditer=infixes.finditer, suffix_search=suffixes.search, 22 | token_match=None) 23 | 24 | 25 | def _new_sentence_after_three_newlines_component(doc): 26 | def has_newlines(text): 27 | return text.count('\n') > 2 28 | 29 | for i in range(len(doc[:-2])): 30 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text): 31 | doc[i + 1].sent_start = True 32 | return doc 33 | 34 | 35 | def _new_sentence_for_bulleted_lists_component(doc): 36 | def has_newlines(text): 37 | return text.count('\n') > 0 38 | 39 | def is_bullet(text): 40 | return text.startswith('-') or text.startswith('*') or text.startswith('.') or text == 'o' or text[0].isdigit() 41 | 42 | for i in range(len(doc[:-2])): 43 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text) and is_bullet(doc[i + 1].text): 44 | doc[i + 1].sent_start = True 45 | return doc 46 | 47 | 48 | def _new_sentence_after_three_dashes_component(doc): 49 | for i in range(3, len(doc[:-3])): 50 | if 
all(token.text == '-' for token in doc[i - 3:i]) and doc[i].text != '-': 51 | doc[i].sent_start = True 52 | 53 | return doc 54 | 55 | 56 | # https://spacy.io/usage/linguistic-features#section-rule-based-matching 57 | class _HTMLMerger(object): 58 | def __init__(self, nlp): 59 | Token.set_extension('unescaped_html', default=None) 60 | self.matcher = Matcher(nlp.vocab) 61 | self.matcher.add('BAD_HTML', None, 62 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], 63 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '/'}, {'ORTH': '>'}], 64 | [{'ORTH': '&'}, {'SHAPE': 'xx'}, {'ORTH': ';'}], # < 65 | [{'ORTH': '&'}, {'SHAPE': 'xxx'}, {'ORTH': ';'}], # & 66 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dd'}, {'ORTH': ';'}], # 67 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'ddd'}, {'ORTH': ';'}], 68 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dddd'}, {'ORTH': ';'}]) 69 | 70 | def __call__(self, doc): 71 | matches = self.matcher(doc) 72 | spans = [] 73 | for match_id, start, end in matches: 74 | spans.append(doc[start:end]) 75 | for span in spans: 76 | span.merge() 77 | for token in span: 78 | if '= threshold]) / len(similarities) 17 | 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser() 21 | parser.description = 'try different amounts of noise to find a balance' 22 | parser.add_argument('embeddings', type=str, help='the embeddings to use, either glove or fasttext') 23 | parser.add_argument('noises', nargs='+', type=float, help='the noises to try') 24 | args = parser.parse_args() 25 | 26 | noises = args.noises 27 | if len(noises) == 0: 28 | raise argparse.ArgumentTypeError('Please provide a list of noises') 29 | 30 | if args.embeddings == 'fasttext': 31 | emb = FastTextEmbeddings() 32 | lower = False 33 | elif args.embeddings == 'glove': 34 | emb = GloveEmbeddings() 35 | lower = True 36 | else: 37 | raise argparse.ArgumentTypeError(f'Unknown embeddings: {args.embeddings}') 38 | 39 | mat = Matrix(lookup_embeddings=emb, precomputed_word2ind=emb.precomputed_word2ind, 40 | precomputed_matrix=emb.precomputed_matrix) 41 | 42 | tr = TrainingSet(limit_documents=env.limit_training_documents) 43 | 44 | phi_tokens = set([token.text for token in itertools.chain.from_iterable(tr.X) if token.type != 'O']) 45 | phi_tokens = [word.lower() if lower else word for word in phi_tokens 46 | if len(word) > 2 47 | and (word.lower() if lower else word) in emb.precomputed_word2ind.keys() 48 | and not any([c.isdigit() for c in word])] 49 | 50 | tokens_to_check = random.sample(phi_tokens, 1_000) 51 | # print(tokens_to_check) 52 | 53 | print('Similarity to closest neighbors:') 54 | closest_neighbor_similarities = [] 55 | for token in random.sample(tokens_to_check, 10): 56 | closest_neighbor_similarities.append(mat.most_similar_cosine(token, n=2)[1].similarity) 57 | 58 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}', 59 | f'std: {np.std(closest_neighbor_similarities)}') 60 | 61 | for noise in noises: 62 | ranks = [] 63 | similarities = [] 64 | closest_neighbor_similarities = [] 65 | 66 | for token in tokens_to_check: 67 | looked_up = emb.lookup(token) 68 | noisy = looked_up + np.random.normal(0., noise, emb.size) 69 | ranks.append(mat.cosine_distance_rank(noisy, token)) 70 | similarities.append(mat.cosine_distance(noisy, token)) 71 | closest_neighbor_similarities.append(mat.most_similar_cosine(noisy, n=1)[0].similarity) 72 | 73 | print('---') 74 | print(f'Report for scale {noise}:') 75 | print(f'rank mean: {np.mean(ranks)},', 76 | f'std: {np.std(ranks)},', 77 | f'%top1: {top_perc(1, 
ranks)},', 78 | f'%top5: {top_perc(5, ranks)},', 79 | f'%top10: {top_perc(10, ranks)}') 80 | print(f'similarity with original mean: {np.mean(similarities)}', 81 | f'std: {np.std(similarities)}', 82 | f'%0.9+: {sim_perc(0.9, similarities)}', 83 | f'%0.8+: {sim_perc(0.8, similarities)}', 84 | f'%0.7+: {sim_perc(0.7, similarities)}', 85 | f'%0.6+: {sim_perc(0.6, similarities)}') 86 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}', 87 | f'std: {np.std(closest_neighbor_similarities)}', 88 | f'%0.9+: {sim_perc(0.9, closest_neighbor_similarities)}', 89 | f'%0.8+: {sim_perc(0.8, closest_neighbor_similarities)}', 90 | f'%0.7+: {sim_perc(0.7, closest_neighbor_similarities)}', 91 | f'%0.6+: {sim_perc(0.6, closest_neighbor_similarities)}') 92 | 93 | print() 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /deid/embeddings/elmo.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | from typing import Sequence 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow_hub as hub 9 | from tqdm import tqdm 10 | 11 | from . import Embeddings 12 | from .util import pad_string_sequences, unpad_sequences, chunks 13 | from ..env import env 14 | 15 | elmo_dir = os.path.join(env.resources_dir, 'elmo') 16 | 17 | 18 | class ElmoEmbeddings(Embeddings): 19 | def __new__(cls, *args, **kwargs): 20 | if env.embeddings_cache: 21 | return CachedElmoEmbeddings(*args, **kwargs) 22 | return TensorFlowElmoEmbeddings(*args, **kwargs) 23 | 24 | def __init__(self, *_, **__): 25 | raise NotImplementedError('this should not happen') 26 | 27 | @property 28 | def size(self) -> int: 29 | raise NotImplementedError 30 | 31 | @property 32 | def std(self): 33 | raise NotImplementedError 34 | 35 | def lookup(self, word: str) -> np.ndarray: 36 | raise NotImplementedError 37 | 38 | def is_unknown(self, word: str) -> bool: 39 | raise NotImplementedError 40 | 41 | 42 | class ElmoEmbeddingsImpl(Embeddings): 43 | @property 44 | def size(self) -> int: 45 | return 1024 46 | 47 | @property 48 | def std(self) -> float: 49 | return 0.47 50 | 51 | def lookup(self, word: str) -> np.ndarray: 52 | raise RuntimeError("Don't lookup single words in ELMo") 53 | 54 | def is_unknown(self, word: str): 55 | return False 56 | 57 | 58 | class TensorFlowElmoEmbeddings(ElmoEmbeddingsImpl): 59 | def __init__(self, *_, **__): 60 | graph = tf.Graph() 61 | with graph.as_default(): 62 | self.tokens = tf.placeholder(tf.string, shape=[None, None]) 63 | self.sequence_len = tf.placeholder(tf.int32, shape=[None]) 64 | self.elmo = hub.Module('https://tfhub.dev/google/elmo/2') 65 | self.embed = self.elmo({'tokens': self.tokens, 'sequence_len': self.sequence_len}, signature='tokens', 66 | as_dict=True) 67 | init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()]) 68 | graph.finalize() 69 | self.sess = tf.Session(graph=graph) 70 | self.sess.run(init_op) 71 | 72 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]: 73 | return self.sess.run(self.embed, {self.tokens: [words], self.sequence_len: [len(words)]})['elmo'][0] 74 | 75 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]: 76 | sentences, seq_length = pad_string_sequences(sentences) 77 | result = self.sess.run(self.embed, {self.tokens: sentences, self.sequence_len: seq_length})['elmo'] 78 | return 
unpad_sequences(result, seq_length) 79 | 80 | 81 | class CachedElmoEmbeddings(ElmoEmbeddingsImpl): 82 | def __init__(self, sentences=None, lookup_batch_size=64, *_, **__): 83 | if sentences is None: 84 | self.sent2vec = {} 85 | for chunk_name in [filename for filename in os.listdir(elmo_dir) if 'chunk' in filename]: 86 | self.sent2vec.update(pickle.load(open(os.path.join(elmo_dir, chunk_name), 'rb'))) 87 | else: 88 | if not os.path.isdir(elmo_dir): 89 | os.mkdir(elmo_dir) 90 | 91 | embeddings = TensorFlowElmoEmbeddings() 92 | self.sent2vec = {} 93 | sentence_chunks = chunks(sentences, lookup_batch_size) 94 | for i, sentence_chunk in tqdm(enumerate(sentence_chunks), desc='Looking up sentence batches', 95 | total=math.ceil(len(sentences) / lookup_batch_size)): 96 | chunk_sent2vec = {} 97 | result = embeddings.lookup_sentences(sentence_chunk) 98 | for j, sentence in enumerate(sentence_chunk): 99 | chunk_sent2vec[' '.join(sentence)] = result[j] 100 | self.sent2vec[' '.join(sentence)] = result[j] 101 | chunk_filename = os.path.join(elmo_dir, f'elmo_chunk{i:04}.pickle') 102 | pickle.dump(chunk_sent2vec, open(chunk_filename, 'wb')) 103 | 104 | def lookup_sentence(self, words: Sequence[str]): 105 | result = self.sent2vec.get(' '.join(words)) 106 | if result is not None: 107 | return result 108 | raise RuntimeError(f'Cache lookup failed for "{words}". Please rebuild the embedding cache.') 109 | -------------------------------------------------------------------------------- /deid/data/augment/augment_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .augment import Augment 4 | from .strategy import AugmentWord, AugmentEmbedding 5 | from ...data import Token 6 | from ...embeddings import DummyEmbeddings 7 | 8 | 9 | class Ones(AugmentEmbedding): 10 | def augment(self, word_embedding): 11 | return np.ones(len(word_embedding)) 12 | 13 | 14 | def test_augment_embeddings(): 15 | embeddings = DummyEmbeddings() 16 | augment = Augment(embeddings, Ones(), exclude=None, n_augmentations=2) 17 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a'), Token.with_text('test')] 18 | 19 | result = augment.lookup_sentence(sent) 20 | assert len(result.original) == 4 21 | assert len(result.original[0]) == embeddings.size 22 | assert len(result.augmented) == 0 23 | 24 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a', 'B-NAME'), 25 | Token.with_text('name', 'I-NAME')] 26 | result = augment.lookup_sentence(sent) 27 | augmented = result.augmented[0] 28 | 29 | assert len(augmented) == 4 30 | assert len(augmented[0]) == embeddings.size 31 | assert np.all(augmented[2] == np.ones(embeddings.size)) 32 | assert np.all(augmented[3] == np.ones(embeddings.size)) 33 | assert len(result.augmented) == 2 34 | 35 | 36 | class ReplaceWithFixed(AugmentWord): 37 | def augment(self, word): 38 | return 'REPLACED' 39 | 40 | 41 | def test_augment_words(): 42 | embeddings = DummyEmbeddings() 43 | augment = Augment(embeddings, ReplaceWithFixed(), exclude=None) 44 | sent = [Token.with_text('replace'), Token.with_text('these', 'B-NAME'), Token.with_text('words', 'I-NAME')] 45 | result = augment.lookup_sentence(sent).augmented[0] 46 | 47 | assert np.all(result[0] == embeddings.lookup('replace')) 48 | 49 | assert np.any(result[1] != embeddings.lookup('these')) 50 | assert np.all(result[1] == embeddings.lookup('REPLACED')) 51 | 52 | assert np.any(result[2] != embeddings.lookup('words')) 53 | assert np.all(result[2] == 
embeddings.lookup('REPLACED')) 54 | 55 | 56 | def test_augment_exclude(): 57 | embeddings = DummyEmbeddings() 58 | augment = Augment(embeddings, Ones()) 59 | sent = [Token.with_text('Please'), Token.with_text('ignore'), Token.with_text('this', 'B-NAME'), 60 | Token.with_text(':', 'I-NAME'), Token.with_text('stopword', 'I-NAME')] 61 | 62 | result = augment.lookup_sentence(sent).augmented[0] 63 | assert np.all(result[2] != np.ones(embeddings.size)) 64 | assert np.all(result[3] != np.ones(embeddings.size)) 65 | assert np.all(result[4] == np.ones(embeddings.size)) 66 | 67 | 68 | def test_augment_all(): 69 | embeddings = DummyEmbeddings() 70 | augment = Augment(embeddings, Ones(), augment_all=True, exclude=None) 71 | sent = [Token.with_text('Augment'), Token.with_text('all'), Token.with_text('of', 'B-NAME'), 72 | Token.with_text('these', 'I-NAME')] 73 | 74 | result = augment.lookup_sentence(sent).augmented[0] 75 | assert np.all(result[0] == np.ones(embeddings.size)) 76 | assert np.all(result[1] == np.ones(embeddings.size)) 77 | assert np.all(result[2] == np.ones(embeddings.size)) 78 | assert np.all(result[3] == np.ones(embeddings.size)) 79 | 80 | 81 | def test_augment_does_not_touch_unknown(): 82 | class DummyEmbeddingsWithUnknownTestWord(DummyEmbeddings): 83 | def is_unknown(self, word: str): 84 | return word == 'test' 85 | 86 | def lookup(self, word): 87 | if word == 'test': 88 | return np.zeros(self.size) 89 | return super().lookup(word) 90 | 91 | embeddings = DummyEmbeddingsWithUnknownTestWord() 92 | augment = Augment(embeddings, Ones(), exclude=None) 93 | sent = [Token.with_text('This', 'B-NAME'), Token.with_text('is', 'I-NAME'), Token.with_text('another', 'I-NAME'), 94 | Token.with_text('test', 'I-NAME')] 95 | result = augment.lookup_sentence(sent).augmented[0] 96 | assert np.any(result[2] == np.ones(embeddings.size)) 97 | assert np.all(result[3] == np.zeros(embeddings.size)) 98 | 99 | 100 | def test_augment_max(): 101 | embeddings = DummyEmbeddings() 102 | augment = Augment(embeddings, ReplaceWithFixed(), augment_max=1, exclude=None) 103 | sent = [Token.with_text('Augment'), Token.with_text('only'), Token.with_text('one', 'B-NAME'), 104 | Token.with_text('please', 'I-NAME')] 105 | result = augment.lookup_sentence(sent).augmented[0] 106 | assert len([r for r in result if np.all(r == embeddings.lookup('REPLACED'))]) == 1 107 | -------------------------------------------------------------------------------- /deid/experiment/dummy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | from typing import Sequence 4 | import random 5 | 6 | import numpy as np 7 | from keras.preprocessing.sequence import pad_sequences 8 | 9 | from .directory import experiment_directory 10 | from .evaluation import evaluate_deid_performance 11 | from ..data import TrainingSet, ValidationSet, Token 12 | from ..env import env 13 | 14 | 15 | class DummyDeidentifier: 16 | def guess(self, sentence: Sequence[str]): 17 | raise NotImplementedError 18 | 19 | def predict(self, X, **_): 20 | if len(X) == 2 and isinstance(X[0][0], list) and isinstance(X[0][0][0], Token): # extra features provided 21 | X, _ = X 22 | y = [self.guess([token.text for token in sentence]) for sentence in X] 23 | y = pad_sequences(y) 24 | return y 25 | 26 | 27 | class UpperBoundDeidentifier(DummyDeidentifier): 28 | def __init__(self, X, y): 29 | self.solutions = {} 30 | for sentence, labels in zip(X, y): 31 | self.solutions[' '.join([token.text for token in sentence])] = 
[l[0] for l in labels] 32 | 33 | def guess(self, sentence): 34 | return self.solutions[' '.join(sentence)] 35 | 36 | 37 | class RandomGuessingDeidentifier(DummyDeidentifier): 38 | def __init__(self, X, y): 39 | label_counts = collections.defaultdict(int) 40 | for sentence, labels in zip(X, y): 41 | for label in labels: 42 | label_counts[label[0]] += 1 43 | n_labels = sum(label_counts.values()) 44 | self.labels = sorted(label_counts.keys()) 45 | self.probabilities = [label_counts[label] / n_labels for label in self.labels] 46 | 47 | def guess(self, sentence): 48 | return np.random.choice(self.labels, size=len(sentence), p=self.probabilities) 49 | 50 | 51 | class WordListDeidentifier(DummyDeidentifier): 52 | def __init__(self, X, y): 53 | self.memory = collections.defaultdict(lambda: [1]) 54 | for sentence, labels in zip(X, y): 55 | for word, label in zip(sentence, labels): 56 | self.memory[word.text].append(label[0]) 57 | 58 | def guess(self, sentence): 59 | def most_common(lst): 60 | return max(set(lst), key=lst.count) 61 | 62 | return [most_common(self.memory[word]) for word in sentence] 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.description = 'different dummy predictors' 68 | parser.add_argument('--upper-bound', help='the embeddings to use, either glove or fasttext', action='store_true') 69 | parser.add_argument('--random-guessing', help='the embeddings to use, either glove or fasttext', 70 | action='store_true') 71 | parser.add_argument('--word-list', help='the embeddings to use, either glove or fasttext', action='store_true') 72 | args = parser.parse_args() 73 | 74 | if not any([args.upper_bound, args.random_guessing, args.word_list]): 75 | raise ValueError('please select at least one of --upper-bound, --random-guessing, --word-list') 76 | 77 | tr = TrainingSet(limit_documents=env.limit_training_documents) 78 | val = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='validation') 79 | 80 | if args.upper_bound: 81 | # needs its own special case because the model is initialized with the test set! 
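        # (this baseline replays the gold labels of each test sentence, so its score is an
        # upper bound for the rest of the evaluation pipeline)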
82 | test = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='test') 83 | experiment_dir = experiment_directory('upper_bound') 84 | model = UpperBoundDeidentifier(test.X, test.y) 85 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind, 86 | ind2label=tr.ind2label, 87 | batch_size=8, experiment_dir=experiment_dir, require_argmax=False) 88 | 89 | if args.random_guessing: 90 | test_baseline('random_guessing', RandomGuessingDeidentifier, tr, val) 91 | 92 | if args.word_list: 93 | test_baseline('word_list', WordListDeidentifier, tr, val) 94 | 95 | 96 | def test_baseline(identifier, model_class, tr, val): 97 | experiment_dir = experiment_directory(identifier) 98 | model = model_class(tr.X + val.X, tr.y + val.y) 99 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind, 100 | ind2label=tr.ind2label, batch_size=8, experiment_dir=experiment_dir, 101 | require_argmax=False) 102 | 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /deid/experiment/fake_sentences.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | from keras import Sequential 8 | from keras.layers import Bidirectional, LSTM, Dense 9 | 10 | from . import experiment_directory 11 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence 12 | from ..data.augment import Augment, get as get_strategy 13 | from ..data.batch import IteratorWithEpochLength 14 | from ..data.util import pad_2d_sequences 15 | from ..embeddings import Matrix, get as get_embeddings 16 | from ..env import env 17 | 18 | 19 | def real_and_fake_sentences(X, y, indices, alternatives, split_condition): 20 | indices = [i for i in indices if split_condition(X[i], y[i])] 21 | real_sentences = [X[i] for i in indices] 22 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices] 23 | 24 | X = [] 25 | y = [] 26 | for real, fake in zip(real_sentences, fake_sentences): 27 | X += [real, fake] 28 | y += [1, 0] 29 | 30 | return pad_2d_sequences(X), np.array(y) 31 | 32 | 33 | class FakeSentencesGenerator(IteratorWithEpochLength): 34 | def __init__(self, generator: IteratorWithEpochLength, dataset): 35 | self.generator = generator 36 | self.dataset = dataset 37 | 38 | def __next__(self): 39 | _, _, indices = next(self.generator) 40 | X, y = real_and_fake_sentences(self.dataset.X, self.dataset.y, indices, self.dataset.augmented, 41 | split_condition=is_phi_sentence) 42 | return X, y 43 | 44 | @property 45 | def epoch_length(self) -> int: 46 | return self.generator.epoch_length 47 | 48 | 49 | def fake_sentences_experiment(config): 50 | print('Loading embeddings...') 51 | embeddings = get_embeddings(config['experiment']['embeddings']) 52 | 53 | name = config['name'] 54 | experiment_dir = experiment_directory(name, config['path']) 55 | 56 | print('Loading matrix...') 57 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 58 | precomputed_matrix=embeddings.precomputed_matrix) 59 | 60 | strategy = get_strategy(config['augment']['strategy'], matrix) 61 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix) 62 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy, 63 | **config['augment']['augment_args']) 64 | 65 | print('Augmenting training 
set...', flush=True) 66 | tr = TrainingSet(embeddings=embeddings, 67 | train_set=config['experiment']['train_set'], 68 | use_short_sentences=env.use_short_sentences, 69 | limit_documents=env.limit_training_documents, 70 | augment=augment) 71 | 72 | print('Augmenting validation set...', flush=True) 73 | val = ValidationSet(embeddings=embeddings, 74 | validation_set=config['experiment']['validation_set'], 75 | label2ind=tr.label2ind, 76 | use_short_sentences=env.use_short_sentences, 77 | limit_documents=env.limit_validation_documents, 78 | augment=augment) 79 | 80 | model = Sequential() 81 | model.add(Bidirectional(LSTM(embeddings.size), input_shape=(None, embeddings.size))) 82 | model.add(Dense(1, activation='sigmoid')) 83 | model.summary() 84 | model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy']) 85 | 86 | batch_size = test_batch_size = 32 87 | train_gen = FakeSentencesGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence, 88 | batch_size=batch_size, yield_indices=True, shuffle=True), tr) 89 | valid_gen = FakeSentencesGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence, 90 | batch_size=batch_size, yield_indices=True, shuffle=False), 91 | val) 92 | 93 | history = model.fit_generator(train_gen, 94 | epochs=config['training']['train_epochs'], 95 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 96 | validation_data=valid_gen, 97 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 98 | verbose=env.keras_verbose) 99 | 100 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 101 | print('Saving history to', history_pickle_path) 102 | with open(history_pickle_path, 'wb') as f: 103 | pickle.dump(history.history, f) 104 | -------------------------------------------------------------------------------- /deid/model/adversary.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Input, Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Dropout, concatenate 3 | from keras.models import Model, Sequential 4 | 5 | from .layers import GradientReversal 6 | 7 | discriminator_loss = 'binary_crossentropy' 8 | 9 | 10 | def get(identifier): 11 | if identifier == 'reconstruct': 12 | return Reidentifier 13 | elif identifier == 'discriminate-representations': 14 | return TwoRepresentationsAreSameOriginalDiscriminator 15 | elif identifier == 'discriminate-representation-embedding-pair': 16 | return OriginalAndRepresentationAreMatchingDiscriminator 17 | else: 18 | raise ValueError(f'Unknown adversary: "{identifier}"') 19 | 20 | 21 | class Adversary: 22 | """ An adversary is a model with a gradient reversal layer. It can chose its inputs from a dictionary that contains 23 | entries for 'train_representation', 'fake_representation', and 'original_embeddings'. 
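    A hedged construction sketch (the 128/300 sizes and the Input placeholders are
    illustrative, not values defined in this file):

        from keras.layers import Input
        inputs = {'train_representation': Input(shape=(None, 128)),
                  'fake_representation': Input(shape=(None, 128)),
                  'original_embeddings': Input(shape=(None, 300))}
        adversary = Reidentifier(inputs, representation_size=128, embedding_size=300, lstm_size=128)
        # adversary.model, adversary.loss and adversary.inputs are then available to the
        # surrounding adversarial training code.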
24 | """ 25 | 26 | def __init__(self, model, loss, inputs, **compile_kwargs): 27 | self.model = model 28 | self.loss = loss 29 | self.inputs = inputs 30 | self.compile_kwargs = compile_kwargs 31 | 32 | 33 | class Reidentifier(Adversary): 34 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0., 35 | reverse_gradient=True, **_): 36 | model = Sequential(name='reidentifier') 37 | model.add(Dropout(input_dropout, input_shape=(None, representation_size))) 38 | if reverse_gradient: 39 | model.add(GradientReversal()) 40 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True, recurrent_dropout=recurrent_dropout))) 41 | model.add(TimeDistributed(Dense(embedding_size))) 42 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 43 | super().__init__(model, inputs=[inputs['train_representation']], loss='mse', sample_weight_mode='temporal', 44 | metrics=['cosine_proximity']) 45 | 46 | 47 | class TwoRepresentationsAreSameOriginalDiscriminator(Adversary): 48 | def __init__(self, inputs, representation_size, lstm_size, input_dropout=0., recurrent_dropout=0., 49 | reverse_gradient=True, **_): 50 | """ LSTM size should be at least the representation size for this to converge quickly. """ 51 | representation_input1 = Input(shape=(None, representation_size)) 52 | representation_input2 = Input(shape=(None, representation_size)) 53 | 54 | # (batch_size, maxlen, repr_size) -> (batch_size, maxlen, 1) -- the dot layer doesn't do this 55 | normalized_1 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input1) 56 | normalized_2 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input2) 57 | dot_product = Lambda(lambda x: K.sum(x[0] * x[1], axis=-1, keepdims=True))([normalized_1, normalized_2]) 58 | 59 | both_inputs = concatenate([representation_input1, representation_input2], axis=-1) 60 | both_inputs = Dropout(input_dropout)(both_inputs) 61 | 62 | inputs_and_dot_product = concatenate([both_inputs, dot_product], axis=-1) 63 | if reverse_gradient: 64 | inputs_and_dot_product = GradientReversal()(inputs_and_dot_product) 65 | 66 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(inputs_and_dot_product) 67 | output = Dense(1, activation='sigmoid')(summary) 68 | 69 | model = Model([representation_input1, representation_input2], output, name='rr-adv') 70 | super().__init__(model, inputs=[inputs['train_representation'], inputs['fake_representation']], 71 | loss=discriminator_loss, metrics=['accuracy']) 72 | 73 | 74 | class OriginalAndRepresentationAreMatchingDiscriminator(Adversary): 75 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0., 76 | reverse_gradient=True, **_): 77 | embedding_input = Input(shape=(None, embedding_size)) 78 | representation_input = Input(shape=(None, representation_size)) 79 | 80 | both_inputs = concatenate([embedding_input, representation_input], axis=-1) 81 | if reverse_gradient: 82 | both_inputs = GradientReversal()(both_inputs) 83 | both_inputs = Dropout(input_dropout)(both_inputs) 84 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(both_inputs) 85 | 86 | output = Dense(1, activation='sigmoid')(summary) 87 | 88 | model = Model([embedding_input, representation_input], output, name='er-adv') 89 | super().__init__(model, inputs=[inputs['original_embeddings'], inputs['fake_representation']], 90 | loss=discriminator_loss, metrics=['accuracy']) 91 | 
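# Hedged usage sketch for the get() dispatcher above (illustrative only; adversary_inputs
# is a placeholder for the inputs dictionary described in the Adversary docstring):
#
#     adversary_cls = get('discriminate-representations')
#     adversary = adversary_cls(adversary_inputs, representation_size=128, lstm_size=128)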
-------------------------------------------------------------------------------- /deid/data/augment/strategy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | from typing import Any, Optional, Sequence, Dict 5 | 6 | import numpy as np 7 | 8 | from ...embeddings import Matrix, EmbeddingSimilarity 9 | 10 | logger = logging.getLogger() 11 | digit_pattern = '^[0-9]*$' 12 | 13 | 14 | class AugmentStrategy: 15 | augments_words: bool 16 | 17 | @property 18 | def description(self) -> Optional[str]: 19 | return None 20 | 21 | def augment(self, word_or_embedding: Any) -> Any: 22 | raise NotImplementedError 23 | 24 | def __str__(self) -> str: 25 | options = '' if self.description is None else ' ' + self.description 26 | return f'<{self.__class__.__name__}{options}>' 27 | 28 | 29 | class AugmentWord(AugmentStrategy): 30 | augments_words = True 31 | 32 | def augment(self, word: str) -> str: 33 | raise NotImplementedError 34 | 35 | 36 | class AugmentEmbedding(AugmentStrategy): 37 | augments_words = False 38 | 39 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 40 | raise NotImplementedError 41 | 42 | 43 | class Zeros(AugmentEmbedding): 44 | """ Not actually zeros to distinguish from masking. """ 45 | 46 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 47 | return np.random.normal(0., scale=1e-6, size=len(word_embedding)) 48 | 49 | 50 | class RandomEmbedding(AugmentEmbedding): 51 | """ A random normal embedding, optionally L2 normalized """ 52 | 53 | def __init__(self, scale=None, l2_normalize=True): 54 | self.scale = 1. if scale is None else scale 55 | self.l2_normalize = l2_normalize 56 | 57 | @property 58 | def description(self) -> Optional[str]: 59 | return f'scale={self.scale}, l2_normalize={self.l2_normalize}' 60 | 61 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 62 | embedding = np.random.normal(0., scale=self.scale, size=len(word_embedding)) 63 | if self.l2_normalize: 64 | embedding = embedding / np.linalg.norm(embedding) 65 | return embedding 66 | 67 | 68 | class RandomDigits(AugmentWord): 69 | def __init__(self, matrix: Matrix) -> None: 70 | self.matrix = matrix 71 | logger.info('getting digit indices') 72 | self.digit_ind = [ind for word, ind in matrix.word2ind.items() if re.match(digit_pattern, str(word))] 73 | logger.info('found %d indices', len(self.digit_ind)) 74 | 75 | def augment(self, word: str) -> str: 76 | ind = random.choice(self.digit_ind) 77 | return self.matrix.ind2word[ind] 78 | 79 | 80 | class AdditiveNoise(AugmentEmbedding): 81 | def __init__(self, scale: float) -> None: 82 | self.scale = scale 83 | 84 | @property 85 | def description(self) -> Optional[str]: 86 | return f'scale={self.scale}' 87 | 88 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 89 | noisy = word_embedding + np.random.normal(0, scale=self.scale, size=len(word_embedding)) 90 | return noisy / np.linalg.norm(noisy) 91 | 92 | 93 | class MoveToNeighbor(AugmentWord): 94 | """ Only makes sense for embeddings like GloVE and fastText that have a fixed word->embedding lookup """ 95 | 96 | def __init__(self, matrix: Matrix, n_neighbors: int, cache_mode: str = 'neighbors') -> None: 97 | self.matrix = matrix 98 | self.n_neighbors = n_neighbors 99 | self.cache = NeighborsCache(cache_mode) 100 | 101 | @property 102 | def description(self) -> Optional[str]: 103 | return f'n_neighbors={self.n_neighbors}' 104 | 105 | def augment(self, word: str) -> str: 106 | cache_result = 
self.cache.lookup(word) 107 | if cache_result is None: 108 | neighbors = self.matrix.most_similar_cosine(word, n=self.n_neighbors) 109 | selected = random.choice(neighbors) 110 | self.cache.store(word, neighbors, selected) 111 | else: 112 | selected = cache_result 113 | return selected.word 114 | 115 | 116 | class NeighborsCache: 117 | def __init__(self, mode: Optional[str]) -> None: 118 | if mode not in [None, 'neighbors', 'selected']: 119 | raise ValueError("Cache mode must be either None, 'neighbors' or 'selected'") 120 | self.mode = mode 121 | self.cache: Dict[str, Sequence[EmbeddingSimilarity]] = {} 122 | 123 | def lookup(self, word: str) -> Optional[EmbeddingSimilarity]: 124 | if self.mode is None: 125 | return None 126 | 127 | result = self.cache.get(word) 128 | return result if result is None else random.choice(result) 129 | 130 | def store(self, word: str, neighbors: Sequence[EmbeddingSimilarity], selected: EmbeddingSimilarity) -> None: 131 | if self.mode == 'neighbors': 132 | self.cache[word] = neighbors 133 | if self.mode == 'selected': 134 | self.cache[word] = [selected] 135 | -------------------------------------------------------------------------------- /deid/embeddings/fasttext.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Dict 4 | 5 | import fastText 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | from . import PrecomputedEmbeddings 10 | from ..env import env 11 | 12 | fasttext_dir = os.path.join(env.resources_dir, 'fastText') 13 | fasttext_embeddings_name = 'wiki-news-300d-1M-subword' 14 | 15 | 16 | class FastTextEmbeddings(PrecomputedEmbeddings): 17 | def __new__(cls, *args, **kwargs): 18 | if env.embeddings_cache: 19 | return CachedFastTextEmbeddings() 20 | return PreloadFastTextEmbeddings() 21 | 22 | def __init__(self, *_, **__): 23 | raise NotImplementedError('this should not happen') 24 | 25 | @property 26 | def size(self) -> int: 27 | raise NotImplementedError 28 | 29 | @property 30 | def std(self): 31 | raise NotImplementedError 32 | 33 | def lookup(self, word: str) -> np.ndarray: 34 | raise NotImplementedError 35 | 36 | def is_unknown(self, word: str) -> bool: 37 | raise NotImplementedError 38 | 39 | @property 40 | def precomputed_word2ind(self) -> Dict[str, int]: 41 | raise NotImplementedError 42 | 43 | @property 44 | def precomputed_matrix(self) -> np.ndarray: 45 | raise NotImplementedError 46 | 47 | 48 | class FastTextEmbeddingsImpl(PrecomputedEmbeddings): 49 | def __init__(self, size, *_, **__): 50 | self._size = size 51 | self._precomputed_word2ind = None 52 | self._precomputed_matrix = None 53 | 54 | @property 55 | def precomputed_word2ind(self) -> Dict[str, int]: 56 | if self._precomputed_word2ind is None: 57 | vocab_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.vocab.pickle') 58 | self._precomputed_word2ind = pickle.load(open(vocab_filename, 'rb')) 59 | return self._precomputed_word2ind 60 | 61 | @property 62 | def precomputed_matrix(self) -> np.ndarray: 63 | if self._precomputed_matrix is None: 64 | matrix_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.matrix.npy') 65 | self._precomputed_matrix = np.load(matrix_filename) 66 | return self._precomputed_matrix 67 | 68 | @staticmethod 69 | def l2_normalize_if_needed(vec: np.ndarray, l2_normalize: bool) -> np.ndarray: 70 | if l2_normalize: 71 | vec /= np.linalg.norm(vec)  # all-zero embeddings shouldn't exist 72 | return vec 73 | 74 | @property 75 | def size(self) ->
int: 76 | return self._size 77 | 78 | @property 79 | def std(self) -> float: 80 | return 0.05 81 | 82 | def lookup(self, word: str) -> np.ndarray: 83 | raise NotImplementedError 84 | 85 | def is_unknown(self, word: str) -> bool: 86 | return False 87 | 88 | 89 | class PreloadFastTextEmbeddings(FastTextEmbeddingsImpl): 90 | def __init__(self) -> None: 91 | self.model = fastText.load_model(os.path.join(fasttext_dir, fasttext_embeddings_name + '.bin')) 92 | super().__init__(self.model.get_dimension()) 93 | 94 | def lookup(self, word: str, l2_normalize: bool = True) -> np.ndarray: 95 | vec = self.model.get_word_vector(word) 96 | if np.count_nonzero(vec) == 0: 97 | # add small amount of noise to all-zero embeddings to make them work with masking / CRF 98 | vec += np.random.normal(0., scale=1e-6, size=len(vec)) 99 | 100 | return self.l2_normalize_if_needed(vec, l2_normalize) 101 | 102 | def __str__(self) -> str: 103 | return '' 104 | 105 | 106 | class CachedFastTextEmbeddings(FastTextEmbeddingsImpl): # always L2 normalized! 107 | def __init__(self, vocab=None): 108 | cache_path = os.path.join(fasttext_dir, fasttext_embeddings_name + '.pickle') 109 | if vocab is None: 110 | self.word2ind, self.matrix = pickle.load(open(cache_path, 'rb')) 111 | else: 112 | vocab = set(vocab) 113 | embeddings = PreloadFastTextEmbeddings() 114 | self.word2ind = {word: i + 1 for i, word in enumerate(vocab)} 115 | self.matrix = np.zeros((len(vocab) + 1, embeddings.size)) 116 | for i, word in tqdm(enumerate(vocab, start=1), desc='Looking up words', total=len(vocab)): 117 | self.matrix[i] = embeddings.lookup(word, l2_normalize=True) 118 | 119 | pickle.dump((self.word2ind, self.matrix), open(cache_path, 'wb')) 120 | super().__init__(self.matrix.shape[1]) 121 | 122 | def lookup(self, word: str, include_precomputed: bool = True) -> np.ndarray: 123 | index = self.word2ind.get(word) 124 | if index is not None: 125 | return self.matrix[index] 126 | 127 | index = self.precomputed_word2ind.get(word) 128 | if index is not None: 129 | return self.precomputed_matrix[index] 130 | 131 | raise RuntimeError(f'Cache/precomputed lookup failed for "{word}". Please rebuild the embedding cache.') 132 | -------------------------------------------------------------------------------- /deid/data/read.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import Sequence, Optional, Set 4 | 5 | from .token import Token, HIPAA_TOKEN_TYPE, BINARY_LABEL 6 | from ..env import env 7 | 8 | 9 | def _add_data_dir_if_needed(path: str) -> str: 10 | """ Adds the data directory to a path if it's not already a sub-path. 11 | 12 | >>> _add_data_dir_if_needed('train') == os.path.join(env.data_dir, 'train') 13 | True 14 | 15 | :param path: the input path 16 | :return: a path containing the data directory 17 | """ 18 | if os.path.realpath(env.data_dir) not in os.path.realpath(path): 19 | path = os.path.join(env.data_dir, path) 20 | return path 21 | 22 | 23 | def full_text_for_csv(filename: str) -> str: 24 | """ Returns the full text for a csv file that is saved to a .txt file with the same stem name. 
25 | 26 | :param filename: the csv filename 27 | :return: a string that is read from the corresponding txt file 28 | """ 29 | filename = _add_data_dir_if_needed(filename) 30 | 31 | if not filename.endswith('.csv'): 32 | raise ValueError(f'{filename} is not a csv file') 33 | 34 | return open(filename[:-4] + '.txt').read() 35 | 36 | 37 | def tokens_from_csv(file_or_dir: str, 38 | limit: Optional[int] = None, 39 | binary_classification: bool = False, 40 | hipaa_only: bool = False) -> Sequence[Token]: 41 | """ Parses a directory of csv files or a single csv file for tokens. 42 | 43 | :param file_or_dir: the csv file or directory to parse 44 | :param limit: upper limit for the number of csv files to parse 45 | :param binary_classification: set to True to skip the classes and use only generic BIO labels 46 | :param hipaa_only: set to True to skip all non-HIPAA tags 47 | 48 | :return: a list of Token objects 49 | """ 50 | 51 | def label_string(bio_string): 52 | if hipaa_only: 53 | if bio_string == 'O' or bio_string[2:] not in HIPAA_TOKEN_TYPE.keys(): 54 | return 'O' 55 | 56 | if binary_classification: 57 | # Not really binary: there is still a B, I, and O label (and the padding label). I tried using true binary 58 | # labels and there was no real difference, so I'm deciding to keep it like this. 59 | return 'O' if bio_string == 'O' else f'{bio_string[0]}-{BINARY_LABEL}' 60 | return bio_string 61 | 62 | file_or_dir = _add_data_dir_if_needed(file_or_dir) 63 | 64 | if os.path.isdir(file_or_dir): 65 | filenames = sorted([os.path.join(file_or_dir, f) for f in os.listdir(file_or_dir) if f.endswith('.csv')]) 66 | if len(filenames) == 0: 67 | raise ValueError(f'{file_or_dir} does not contain any csv files') 68 | elif file_or_dir.endswith('.csv'): 69 | filenames = [file_or_dir] 70 | else: 71 | raise ValueError(f'{file_or_dir} is not a csv file') 72 | 73 | tokens = [] 74 | for i, filename in enumerate(filenames): 75 | with open(filename) as f: 76 | reader = csv.reader(f) 77 | next(reader) # skip header 78 | for row in reader: 79 | tokens.append(Token(row[0], 80 | label_string(row[1]), 81 | *map(int, row[2:]))) 82 | if i == limit: 83 | break 84 | 85 | return tokens 86 | 87 | 88 | def split_sentences(tokens: Sequence[Token]) -> Sequence[Sequence[Token]]: 89 | """ Breaks a list of Token objects into sentence chunks delimited by sent_start and sent_end. Incomplete sentences 90 | are not included in the result. 91 | 92 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test'), Token.with_text(env.sent_end)])) 93 | 1 94 | 95 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test')])) 96 | 0 97 | 98 | :param tokens: the tokens to break into sentences 99 | :return: a list of sentences (i.e. a list of lists of tokens) 100 | """ 101 | sents = [] 102 | current_sent = [] 103 | for token in tokens: 104 | if token.text not in [env.sent_start, env.sent_end]: 105 | current_sent.append(token) 106 | if token.text == env.sent_end: 107 | if len(current_sent) > 0: 108 | sents.append(current_sent) 109 | current_sent = [] 110 | return sents 111 | 112 | 113 | def vocab_from_tokens(tokens: Sequence[Token]) -> Set[str]: 114 | """ Returns a set of words from a token sequence, excluding the special sent_start and sent_end tokens. 
115 | 116 | >>> sorted(vocab_from_tokens([Token.with_text(env.sent_start), Token.with_text('test'), \ 117 | Token.with_text('some'), Token.with_text('test'), Token.with_text('words'), Token.with_text(env.sent_end)])) 118 | ['some', 'test', 'words'] 119 | 120 | :param tokens: the tokens to convert to a vocabulary set 121 | :return: a set of words 122 | """ 123 | return set(token.text for token in tokens) - {env.sent_start, env.sent_end} 124 | -------------------------------------------------------------------------------- /deid/tools/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import os 4 | import random 5 | 6 | import yaml 7 | 8 | from ..env import env 9 | 10 | config_dir = os.path.join(env.resources_dir, 'config') 11 | generated_dir = os.path.join(config_dir, 'generated') 12 | if not os.path.isdir(generated_dir): 13 | os.mkdir(generated_dir) 14 | 15 | 16 | def generate_config(config): 17 | result = {} 18 | for key, value in config.items(): 19 | if key == 'choice': 20 | if isinstance(value, list): 21 | return random.choice(value) 22 | else: 23 | raise ValueError('does not support other inputs than lists at the moment') 24 | elif isinstance(value, dict): 25 | result[key] = generate_config(value) 26 | else: 27 | result[key] = value 28 | return result 29 | 30 | 31 | def generate_random_configs(config, name, n, start, output_path): 32 | for i in range(n): 33 | config_num = i + start 34 | result = generate_config(config) 35 | with open(os.path.join(output_path, name) + f'_{config_num:03d}.yaml', 'w') as f: 36 | f.write(yaml.dump(result)) 37 | print(f'Generated {n} configs.') 38 | 39 | 40 | def flatten_config(config, sep='--', prefix=None): 41 | """ 42 | >>> flatten_config({'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}}) 43 | {'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]} 44 | """ 45 | result = {} 46 | for key, value in config.items(): 47 | key = key if prefix is None else f'{prefix}{sep}{key}' 48 | if isinstance(value, dict): 49 | result.update(flatten_config(value, sep, prefix=key)) 50 | else: 51 | result[key] = value 52 | return result 53 | 54 | 55 | def unflatten_config(config, sep='--'): 56 | """ 57 | >>> unflatten_config({'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]}) 58 | {'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}} 59 | """ 60 | result = {} 61 | for key, value in config.items(): 62 | parts = key.split(sep) 63 | parent = result 64 | for child in parts[:-1]: 65 | if child not in parent.keys(): 66 | parent[child] = {} 67 | parent = parent[child] 68 | parent[parts[-1]] = value 69 | return result 70 | 71 | 72 | def remove_choices(config, sep='--'): 73 | """ 74 | >>> remove_choices({'a': 1, 'b--c--choice': 2}) 75 | {'a': 1, 'b--c': 2} 76 | """ 77 | result = {} 78 | for key, value in config.items(): 79 | if key.endswith(f'{sep}choice'): 80 | result[key[:-len(f'{sep}choice')]] = value 81 | else: 82 | result[key] = value 83 | return result 84 | 85 | 86 | def generate_grid_configs(config, name, output_path): 87 | flattened = flatten_config(config) 88 | choice_keys = [key for key in flattened.keys() if key.endswith('choice')] 89 | 90 | for i, choices in enumerate(itertools.product(*[flattened[key] for key in choice_keys]), start=1): 91 | for choice_key, choice in zip(choice_keys, choices): 92 | flattened[choice_key] = choice 93 | 94 | result = unflatten_config(remove_choices(flattened)) 95 | with open(os.path.join(output_path, name) + f'_grid_{i:03d}.yaml', 'w') as f: 96 | f.write(yaml.dump(result)) 97 | print(f'Generated 
{i} configs.') 98 | 99 | 100 | def find_config(name): 101 | filename = name 102 | if os.path.isfile(filename): 103 | return filename 104 | 105 | filename = os.path.join(config_dir, filename) 106 | if os.path.isfile(filename): 107 | return filename 108 | 109 | filename = filename + '.yaml' 110 | if os.path.isfile(filename): 111 | return filename 112 | 113 | raise argparse.ArgumentTypeError(f'{name} is not a valid config name or path') 114 | 115 | 116 | def main(): 117 | def ensure_dir(arg) -> str: 118 | if type(arg) == str and os.path.isdir(arg): 119 | return arg 120 | raise argparse.ArgumentTypeError(f'{arg} is not a directory') 121 | 122 | parser = argparse.ArgumentParser() 123 | parser.description = 'Create experiment configs from a config template.' 124 | parser.add_argument('input_config', help='the input config template') 125 | parser.add_argument('-o', '--output_path', help='the path to store the results', type=ensure_dir, 126 | default=generated_dir) 127 | parser.add_argument('-n', '--n', help='the number of configs to generate', type=int, default=10) 128 | parser.add_argument('-a', '--all', help='generate all configs (grid), overrides --n', action='store_true') 129 | parser.add_argument('-s', '--start', help='the starting number for config filenames', type=int, default=0) 130 | 131 | args = parser.parse_args() 132 | 133 | filename = find_config(args.input_config) 134 | config = yaml.load(open(filename)) 135 | name = '.'.join(os.path.basename(filename).split('.')[:-1]) 136 | name = name.replace('_template', '') 137 | if args.all: 138 | generate_grid_configs(config, name, args.output_path) 139 | else: 140 | generate_random_configs(config, name, args.n, args.start, args.output_path) 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | -------------------------------------------------------------------------------- /deid/model/representer.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Masking 3 | from keras.models import Sequential 4 | 5 | from .layers import Noise 6 | 7 | 8 | def get(identifier): 9 | if identifier == 'noisy': 10 | return build_noise_representer 11 | elif identifier == 'dense': 12 | return build_dense_representer 13 | elif identifier == 'lstm': 14 | return build_lstm_representer 15 | else: 16 | raise ValueError(f'Unknown representation type: "{identifier}"') 17 | 18 | 19 | def build_noise_representer(embedding_size, representation_size, noises, single_stddev, apply_noise, 20 | l2_normalize=False, **_): 21 | """ Build a representer that applies a series of noise steps. 22 | 23 | :param embedding_size: the embedding (input) size 24 | :param representation_size: the representation (output) size 25 | :param noises: the types of noise to add if using the 'noisy' representation. 
Must be a single 26 | identifier or sequence of identifiers, allowed identifiers are '+'/'add' or '*'/'mult' 27 | :param single_stddev: whether to use a single stddev for all embedding dimensions 28 | :param apply_noise: whether to apply noise or the mean in this model 29 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 30 | :return: a noisy representer model 31 | """ 32 | if type(noises) == str: 33 | noises = [noises] 34 | 35 | model = Sequential(name='representer') 36 | model.add(Masking(input_shape=(None, embedding_size))) 37 | if l2_normalize: 38 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 39 | for i, noise_operation in enumerate(noises): 40 | model.add(Noise(noise_operation, apply_noise=apply_noise, single_stddev=single_stddev, 41 | input_shape=(None, embedding_size))) 42 | 43 | model.add(TimeDistributed(Dense(representation_size))) 44 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 45 | return model 46 | 47 | 48 | def build_dense_representer(embedding_size, representation_size, apply_noise, num_hidden=2, hidden_size=None, 49 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_): 50 | """ Build a dense representer that applies the same dense weights to each element in the input sequence. 51 | 52 | :param embedding_size: the embedding (input) size 53 | :param representation_size: the representation (output) size 54 | :param apply_noise: whether to apply noise or the mean in this model 55 | :param num_hidden: the number of hidden layers in the dense model 56 | :param hidden_size: the number of units per hidden layer in the dense model 57 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 58 | :param noise_before: whether to add noise with trainable stddev to the inputs 59 | :param noise_after: whether to add noise with trainable stddev to the outputs 60 | :param single_stddev: whether to use a single stddev for all embedding dimensions 61 | :param _: ignored kwargs 62 | :return: a dense representer model 63 | """ 64 | if hidden_size is None: 65 | hidden_size = embedding_size 66 | 67 | model = Sequential(name='representer') 68 | model.add(Masking(input_shape=(None, embedding_size))) 69 | if l2_normalize: 70 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 71 | if noise_before: 72 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 73 | 74 | for _ in range(num_hidden): 75 | model.add(TimeDistributed(Dense(hidden_size, activation='relu'))) 76 | model.add(TimeDistributed(Dense(representation_size))) 77 | 78 | if noise_after: 79 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 80 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 81 | return model 82 | 83 | 84 | def build_lstm_representer(embedding_size, representation_size, apply_noise, num_hidden=1, lstm_size=128, 85 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_): 86 | """ Build an LSTM representer. 
87 | 88 | :param embedding_size: the embedding (input) size 89 | :param representation_size: the representation (output) size 90 | :param apply_noise: whether to apply noise or the mean in this model 91 | :param num_hidden: the number of LSTM layers 92 | :param lstm_size: the number of LSTM units per direction and layer 93 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 94 | :param noise_before: whether to add noise with trainable stddev to the inputs 95 | :param noise_after: whether to add noise with trainable stddev to the outputs 96 | :param single_stddev: whether to use a single stddev for all embedding dimensions 97 | :param _: ignored kwargs 98 | :return: an LSTM representer model 99 | """ 100 | model = Sequential(name='representer') 101 | model.add(Masking(input_shape=(None, embedding_size))) 102 | if l2_normalize: 103 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 104 | if noise_before: 105 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 106 | 107 | for _ in range(num_hidden): 108 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True))) 109 | model.add(TimeDistributed(Dense(representation_size))) 110 | 111 | if noise_after: 112 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 113 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 114 | return model 115 | -------------------------------------------------------------------------------- /deid/tools/i2b2_xml_to_csv.py: -------------------------------------------------------------------------------- 1 | # Call this using `python -m deid.tools.i2b2_xml_to_csv [params]` 2 | 3 | import argparse 4 | import csv 5 | import html 6 | import os 7 | from typing import Tuple, Sequence 8 | 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | from ..data import Token, tokenize 13 | 14 | 15 | def xml_to_csv(filename: str, output_dir: str, check_alignment) -> None: 16 | tokens, text = xml_to_annotated_tokens_and_text(filename, check_alignment) 17 | 18 | path_without_ext = os.path.join(output_dir, os.path.basename(filename)[:-4]) 19 | 20 | with open(path_without_ext + '.csv', 'w') as f: 21 | writer = csv.writer(f) 22 | writer.writerow(['text', 'type', 'start', 'end']) 23 | writer.writerows(tokens) 24 | 25 | with open(path_without_ext + '.txt', 'w') as f: 26 | f.write(text) 27 | 28 | 29 | def xml_to_annotated_tokens_and_text(filename, check_alignment) -> Tuple[Sequence[Token], str]: 30 | soup = BeautifulSoup(open(filename).read(), features='xml') 31 | 32 | text = str(soup.find('TEXT').contents[0]) 33 | tags = soup.find('TAGS').findChildren() 34 | 35 | if check_alignment: 36 | # Sanity check: compare the tag text with the above text. 37 | # Ignoring differences where only a '\n' is missing from the tag text because this occurs often in the data 38 | # and does not seem to matter for us.
39 | for tag in tags: 40 | tag_text, original_text = tag.get('text'), text[int(tag.get('start')):int(tag.get('end'))] 41 | if tag_text != original_text and tag_text != original_text.replace('\n', ' '): 42 | location = f"{os.path.basename(filename)}[{tag.get('start')}:{tag.get('end')}]" 43 | tqdm.write(f"{location} (tag) {tag_text.__repr__()} ≠ (original) {original_text.__repr__()}") 44 | 45 | # TODO check here if the start and end tags actually fall on tokens 46 | 47 | doc = tokenize(text) 48 | return annotate_with_tags(doc, tags), text 49 | 50 | 51 | def annotate_with_tags(doc, tags) -> Sequence[Token]: 52 | def tag_start(i): 53 | return int(tags[i].get('start')) 54 | 55 | def tag_end(i): 56 | return int(tags[i].get('end')) 57 | 58 | def is_current_tag(token): 59 | if token.idx == tag_start(current_tag): 60 | return True 61 | if token.idx >= tag_start(current_tag) and token.idx + len(token.text) <= tag_end(current_tag): 62 | return True 63 | if token.idx < tag_start(current_tag) < token.idx + len(token.text): 64 | return True 65 | return False 66 | 67 | current_tag = 0 68 | 69 | result = [] 70 | for sentence in doc.sents: 71 | continue_tag_type = None # set to the tag type string if the tag is not yet processed fully 72 | result.append(Token('', 'O', sentence[0].idx, sentence[0].idx)) 73 | 74 | for token in sentence: 75 | if continue_tag_type and token.idx < tag_end(current_tag): 76 | tag = f'I-{continue_tag_type}' 77 | else: 78 | if token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1: 79 | current_tag += 1 80 | 81 | # make sure we did not skip an entire tag 82 | while token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1: 83 | print('Skipping a tag:', tags[current_tag].get('TYPE'), tags[current_tag].get('text')) 84 | current_tag += 1 85 | 86 | if is_current_tag(token): 87 | continue_tag_type = tags[current_tag].get('TYPE') 88 | tag = f'B-{continue_tag_type}' 89 | else: 90 | tag = 'O' 91 | continue_tag_type = None 92 | 93 | token_text = token._.unescaped_html if token._.unescaped_html is not None else token.text 94 | token_text = token_text.strip() 95 | if len(token_text) == 0 and tag.startswith('B'): 96 | continue_tag_type = None 97 | 98 | if len(token_text) != 0: 99 | result.append(Token(token_text, tag, token.idx, token.idx + len(token))) 100 | 101 | result.append(Token('', 'O', sentence[-1].idx, sentence[-1].idx)) 102 | 103 | return result 104 | 105 | 106 | def main() -> None: 107 | def ensure_file_or_dir(arg) -> str: 108 | if type(arg) == str and (os.path.isfile(arg) or os.path.isdir(arg)): 109 | return arg 110 | raise argparse.ArgumentTypeError(f'{arg} is not a file or directory') 111 | 112 | def ensure_dir(arg) -> str: 113 | if type(arg) == str and os.path.isdir(arg): 114 | return arg 115 | raise argparse.ArgumentTypeError(f'{arg} is not a directory') 116 | 117 | parser = argparse.ArgumentParser() 118 | parser.add_argument('file_or_dir', help='the input file(s)', type=ensure_file_or_dir) 119 | parser.add_argument('output_dir', help='save the csv file(s) here', type=ensure_dir) 120 | parser.add_argument('--check', help='check the tag/text alignment', action='store_true') 121 | args = parser.parse_args() 122 | 123 | if os.path.isdir(args.file_or_dir): 124 | filenames = sorted([file for file in os.listdir(args.file_or_dir) if file.endswith('.xml')]) 125 | if len(filenames) == 0: 126 | print('No XML files found.') 127 | 128 | pbar = tqdm(filenames) 129 | for filename in pbar: 130 | pbar.set_description(filename) 131 | path = os.path.join(args.file_or_dir, 
filename) 132 | xml_to_csv(path, output_dir=args.output_dir, check_alignment=args.check) 133 | else: 134 | xml_to_csv(args.file_or_dir, output_dir=args.output_dir, check_alignment=args.check) 135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /deid/data/augment/augment.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | from typing import Optional, Callable, Sequence, NamedTuple, Tuple, Dict, Union 5 | 6 | from spacy.lang.en.stop_words import STOP_WORDS 7 | 8 | from . import AugmentStrategy, get as get_strategy 9 | from .. import Token, Sentence 10 | from ...embeddings import Embeddings 11 | 12 | logger = logging.getLogger() 13 | digit_pattern = '^[0-9]*$' 14 | 15 | 16 | def default_exclude(word: str) -> bool: 17 | return word.lower() in STOP_WORDS or bool(re.match('^[.,:;/+\-*=\\\\]*$', word)) 18 | 19 | 20 | def exclude_nothing(_: str) -> bool: 21 | return False 22 | 23 | 24 | class AugmentedSentence(NamedTuple): 25 | original: Sentence 26 | augmented: Sequence[Sentence] 27 | 28 | 29 | class Augment: 30 | def __init__(self, embeddings: Embeddings, 31 | strategy: Union[AugmentStrategy, str], 32 | digit_strategy: Optional[Union[AugmentStrategy, str]] = None, 33 | n_augmentations: int = 1, 34 | augment_all: bool = False, 35 | augment_max: Optional[int] = None, 36 | exclude_unknown: bool = True, 37 | exclude: Optional[Callable[[str], bool]] = default_exclude) -> None: 38 | self.embeddings = embeddings 39 | self.augment_all = augment_all 40 | self.exclude_unknown = exclude_unknown 41 | if isinstance(strategy, str): 42 | self.strategy = get_strategy(strategy) 43 | else: 44 | self.strategy = strategy 45 | 46 | if digit_strategy is None: 47 | self.digit_strategy = self.strategy 48 | elif isinstance(digit_strategy, str): 49 | self.digit_strategy = get_strategy(digit_strategy) 50 | else: 51 | self.digit_strategy = digit_strategy 52 | 53 | self.n_augmentations = n_augmentations 54 | self.augment_max = augment_max if augment_max is not None else 10_000 55 | self.exclude = exclude if exclude is not None else exclude_nothing 56 | 57 | def __str__(self) -> str: 58 | return f'' 61 | 62 | def _strategy_or_digit_strategy(self, word: str) -> AugmentStrategy: 63 | if re.match(digit_pattern, word): 64 | return self.digit_strategy 65 | return self.strategy 66 | 67 | def _should_be_excluded(self, word, label): 68 | exclude_because_o = not self.augment_all and label == 'O' 69 | exclude_because_unknown = self.exclude_unknown and self.embeddings.is_unknown(word) 70 | return self.exclude(word) or exclude_because_o or exclude_because_unknown 71 | 72 | def lookup_sentence(self, sentence: Sequence[Token]) -> AugmentedSentence: 73 | """ If the sentence is only O, just look it up. 
Otherwise: 74 | - apply the word strategies and keep track of the embedding strategies that need to be applied later 75 | - look up the sentence 76 | - apply the embedding strategies 77 | 78 | :param sentence: the input sentence 79 | :return: an AugmentedSentence object 80 | """ 81 | original = self.embeddings.lookup_sentence([token.text for token in sentence]) 82 | if not self.augment_all and all([token.type == 'O' for token in sentence]): 83 | return AugmentedSentence(original, []) 84 | 85 | apply_word_strategies_result = [self.apply_word_strategies(sentence) for _ in range(self.n_augmentations)] 86 | augment_embeddings, sentences_for_lookup = zip(*apply_word_strategies_result) 87 | embedded_sentences = self.embeddings.lookup_sentences(sentences_for_lookup) 88 | augmented = [self.apply_embedding_strategies(augment_embedding, embedded_sent) for 89 | augment_embedding, embedded_sent in zip(augment_embeddings, embedded_sentences)] 90 | return AugmentedSentence(original, augmented) 91 | 92 | def apply_embedding_strategies(self, augment_embedding: Dict[int, AugmentStrategy], 93 | sentence_embeddings: Sentence) -> Sentence: 94 | sentence_embeddings = list(sentence_embeddings) 95 | for i, strategy in augment_embedding.items(): 96 | augmented = strategy.augment(sentence_embeddings[i]) 97 | assert len(augmented) == self.embeddings.size 98 | sentence_embeddings[i] = augmented 99 | return sentence_embeddings 100 | 101 | def apply_word_strategies(self, sentence: Sequence[Token]) -> Tuple[Dict[int, AugmentStrategy], Sequence[str]]: 102 | sentence_for_lookup = [] 103 | augment_embedding = {} 104 | 105 | augment_word_ind = [] 106 | for i, token in enumerate(sentence): 107 | word, label = token.text, token.type 108 | if not self._should_be_excluded(word, label): 109 | strategy = self._strategy_or_digit_strategy(word) 110 | if strategy.augments_words: 111 | augment_word_ind.append(i) 112 | else: 113 | augment_embedding[i] = strategy 114 | logger.info('deferring strategy %s to augment "%s"', strategy, word) 115 | 116 | if len(augment_word_ind) > self.augment_max: 117 | augment_word_ind = random.sample(augment_word_ind, self.augment_max) 118 | 119 | for i, token in enumerate(sentence): 120 | word, label = token.text, token.type 121 | if i in augment_word_ind: 122 | strategy = self._strategy_or_digit_strategy(word) 123 | augmented = strategy.augment(word) 124 | logger.info('using strategy %s to augment "%s" to "%s"', strategy, word, augmented) 125 | sentence_for_lookup.append(augmented) 126 | else: 127 | sentence_for_lookup.append(word) 128 | 129 | return augment_embedding, sentence_for_lookup 130 | -------------------------------------------------------------------------------- /deid/experiment/evaluation.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | from keras.callbacks import Callback 7 | from keras.utils.generic_utils import Progbar 8 | from terminaltables import SingleTable as TerminalTable 9 | 10 | from ..data import TestSet, prediction_to_xml 11 | from ..env import env 12 | from ..tools.i2b2.classes import PHITrackEvaluation, Evaluate 13 | from ..tools.i2b2.evaluate import evaluate as i2b2_evaluate 14 | 15 | 16 | def _save_predictions_to_xmls(model, batch_size, embeddings, label2ind, ind2label, test_set, predictions_dir, 17 | binary_classification, hipaa_only, extra_features, require_argmax): 18 | if not os.path.isdir(predictions_dir): 19 | 
os.mkdir(predictions_dir) 20 | 21 | print('Saving test XMLs to', predictions_dir) 22 | progress_bar = Progbar(target=TestSet.number_of_test_sets(test_set), verbose=env.keras_verbose) 23 | 24 | for i, te in enumerate(TestSet.test_sets(embeddings, 25 | test_set=test_set, 26 | label2ind=label2ind, 27 | binary_classification=binary_classification, 28 | hipaa_only=hipaa_only, 29 | extra_features=extra_features), start=1): 30 | preds = model.predict([te.X, te.X_extra], batch_size=batch_size) 31 | if require_argmax: 32 | preds = np.argmax(preds, axis=-1) 33 | xml = prediction_to_xml(te.X, preds, te.text, te.sents, ind2label) 34 | filename = os.path.basename(te.filename)[:-4] + '.xml' 35 | with open(os.path.join(predictions_dir, filename), 'w') as f: 36 | f.write(xml) 37 | 38 | progress_bar.update(i) 39 | 40 | 41 | def _run_official_evaluation(predictions_dir, test_set, output_file, binary_classification=False, hipaa_only=False, 42 | print_summary=True): 43 | xml_test_dir = os.path.join(env.data_dir, test_set + '_xml') 44 | 45 | def call_i2b2_evaluate(): 46 | return i2b2_evaluate([predictions_dir], xml_test_dir, PHITrackEvaluation, verbose=False) 47 | 48 | if output_file is not None: 49 | with open(output_file, 'w') as f: 50 | with contextlib.redirect_stdout(f): 51 | evaluations = call_i2b2_evaluate() 52 | else: 53 | evaluations = call_i2b2_evaluate() 54 | 55 | result = OrderedDict() 56 | for evaluation in evaluations.evaluations: 57 | mp = evaluation.micro_precision() 58 | mr = evaluation.micro_recall() 59 | f1 = Evaluate.F_beta(mr, mp) 60 | result[evaluation.sys_id] = {'precision': mp, 'recall': mr, 'f1': f1} 61 | 62 | if print_summary: 63 | print('Evaluating', predictions_dir, xml_test_dir) 64 | print('Evaluation summary:') 65 | table_data = [['Evaluation', 'Precision', 'Recall', 'F1 (micro)']] 66 | for name, values in result.items(): 67 | if binary_classification and 'Binary' not in name: 68 | continue 69 | if hipaa_only and 'HIPAA' not in name: 70 | continue 71 | if binary_classification and not hipaa_only and 'HIPAA' in name: 72 | continue # evaluation is wrong for these because all tags get mapped to a HIPAA (name) tag 73 | table_data.append([name] + [round(values[key], 5) for key in ['precision', 'recall', 'f1']]) 74 | 75 | table = TerminalTable(table_data) 76 | print(table.table) 77 | print(f'(see complete evaluation at {output_file})') 78 | 79 | return result 80 | 81 | 82 | def evaluate_deid_performance(model, batch_size, embeddings, label2ind, ind2label, experiment_dir, epoch=1, 83 | test_set='validation', binary_classification=False, 84 | hipaa_only=False, extra_features=(), require_argmax=True): 85 | predictions_dir = os.path.join(experiment_dir, f'predictions_epoch_{epoch:02d}') 86 | _save_predictions_to_xmls(model=model, batch_size=batch_size, embeddings=embeddings, label2ind=label2ind, 87 | ind2label=ind2label, test_set=test_set, predictions_dir=predictions_dir, 88 | binary_classification=binary_classification, hipaa_only=hipaa_only, 89 | extra_features=extra_features, require_argmax=require_argmax) 90 | 91 | output_file = predictions_dir + '.txt' 92 | return _run_official_evaluation(predictions_dir=predictions_dir, test_set=test_set, output_file=output_file, 93 | print_summary=True, binary_classification=binary_classification, 94 | hipaa_only=hipaa_only) 95 | 96 | 97 | class DeidentificationEvaluationCallback(Callback): 98 | def __init__(self, deid_model, batch_size, embeddings, label2ind, ind2label, test_set, experiment_dir, 99 | evaluate_every, binary_classification, hipaa_only, 
extra_features, call_model=False): 100 | super().__init__() 101 | self.deid_model = deid_model 102 | self.batch_size = batch_size 103 | self.embeddings = embeddings 104 | self.label2ind = label2ind 105 | self.ind2label = ind2label 106 | self.test_set = test_set 107 | self.experiment_dir = experiment_dir 108 | self.evaluate_every = evaluate_every 109 | self.binary_classification = binary_classification 110 | self.hipaa_only = hipaa_only 111 | self.extra_features = extra_features 112 | self.call_model = call_model 113 | 114 | def on_epoch_end(self, epoch, logs=None): 115 | deid_model = self.deid_model() if self.call_model else self.deid_model 116 | epoch = epoch + 1 # keras uses 0-indexed epochs 117 | if epoch % self.evaluate_every == 0: 118 | evaluate_deid_performance(model=deid_model, batch_size=self.batch_size, embeddings=self.embeddings, 119 | label2ind=self.label2ind, ind2label=self.ind2label, epoch=epoch, 120 | test_set=self.test_set, experiment_dir=self.experiment_dir, 121 | binary_classification=self.binary_classification, 122 | hipaa_only=self.hipaa_only, 123 | extra_features=self.extra_features) 124 | -------------------------------------------------------------------------------- /deid/model/adversarial.py: -------------------------------------------------------------------------------- 1 | from types import MappingProxyType 2 | 3 | from keras import backend as K 4 | from keras.layers import Input, Lambda, concatenate 5 | from keras.losses import binary_crossentropy 6 | from keras.models import Model 7 | 8 | from . import get as get_deidentifier 9 | from .adversary import get as get_adversary 10 | from .optimizer import get as get_optimizer 11 | from .representer import get as get_representer 12 | 13 | 14 | class AdversarialModel: 15 | def __init__(self, 16 | *_, # don't allow any positional arguments 17 | embedding_size, 18 | output_size, 19 | representation_size=None, 20 | representation_type='lstm', 21 | representation_args=MappingProxyType({}), 22 | deidentifier_type='lstm', 23 | deidentifier_args=MappingProxyType({}), 24 | extra_input_size=0, 25 | adversaries=('discriminate-representations', 'discriminate-representation-embedding-pair'), 26 | adversary_args=MappingProxyType({}), 27 | optimizer='adam', 28 | optimizer_args=MappingProxyType({})): 29 | """ Initialize the adversarial model. 
Its components are 30 | - a representation model that transforms embeddings into a (noisy) representation 31 | - a deidentifier model that performs the deidentification task from the representation 32 | - an adversary model that tries to reconstruct information from the representation 33 | 34 | :param embedding_size: the representation input size 35 | :param output_size: the deidentifier output size 36 | :param representation_size: the representation size (or None to use the embedding size) 37 | :param representation_type: the type of representation model to use (see representer.py) 38 | :param representation_args: the kwargs for the representation model 39 | :param deidentifier_type: the type of deidentifier model to use (see deidentifier.py) 40 | :param deidentifier_args: the kwargs for the deidentifier model 41 | :param adversaries: a sequence of adversary type strings (see adversary.py) 42 | :param adversary_args: a dictionary of adversary args or a list of dictionaries (if every adversary should get 43 | its own args) 44 | :param optimizer: the type of optimizer to use (see optimizer.py) 45 | :param optimizer_args: the args passed to the optimizer 46 | """ 47 | 48 | if representation_size is None: 49 | representation_size = embedding_size 50 | 51 | original_embeddings = Input(shape=(None, embedding_size)) 52 | 53 | build_representer = get_representer(representation_type) 54 | self.train_representer = build_representer(embedding_size=embedding_size, 55 | representation_size=representation_size, 56 | apply_noise=True, 57 | **representation_args) 58 | 59 | train_representation = self.train_representer(original_embeddings) 60 | 61 | deidentifier, deidentifier_loss = get_deidentifier(deidentifier_type)( 62 | name='deidentifier', 63 | input_size=representation_size, 64 | output_size=output_size, 65 | extra_input_size=extra_input_size, 66 | **deidentifier_args) 67 | 68 | extra_input = Input(shape=(None, extra_input_size)) 69 | if extra_input_size > 0: 70 | train_deidentifier_input = [train_representation, extra_input] 71 | else: 72 | train_deidentifier_input = train_representation 73 | 74 | train_deidentifier_output = deidentifier(train_deidentifier_input) 75 | self.pretrain_deidentifier = Model([original_embeddings, extra_input], train_deidentifier_output) 76 | self.pretrain_deidentifier.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), loss=deidentifier_loss, 77 | metrics=['accuracy']) 78 | 79 | self.train_representer.trainable = False 80 | 81 | adv_embeddings = Input(shape=(None, embedding_size)) 82 | adv_representation = self.train_representer(adv_embeddings) 83 | 84 | adv_fake_embeddings = Input(shape=(None, embedding_size)) 85 | adv_fake_representation = self.train_representer(adv_fake_embeddings) 86 | 87 | adversary_models = [] 88 | adversary_outputs = [] 89 | if isinstance(adversary_args, dict): 90 | adversary_args = [adversary_args for _ in adversaries] 91 | 92 | for adversary_type, args in zip(adversaries, adversary_args): 93 | adversary = get_adversary(adversary_type)(inputs={'train_representation': adv_representation, 94 | 'original_embeddings': adv_embeddings, 95 | 'fake_representation': adv_fake_representation}, 96 | representation_size=representation_size, 97 | embedding_size=embedding_size, 98 | **args) 99 | adversary_models.append(adversary.model) 100 | adversary_outputs.append(adversary.model(adversary.inputs)) 101 | adversary.model.summary() 102 | if len(adversary_outputs) > 1: 103 | adversary_output = concatenate(adversary_outputs, axis=-1) 104 | else: 105 |
adversary_output = adversary_outputs[0] 106 | adversary_output = Lambda(lambda x: K.mean(x, axis=-1, keepdims=True), name='adversary')(adversary_output) 107 | 108 | self.pretrain_adversary = Model([adv_embeddings, adv_fake_embeddings], adversary_output) 109 | self.pretrain_adversary.summary() 110 | self.pretrain_adversary.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 111 | loss='binary_crossentropy', 112 | metrics=['accuracy']) 113 | 114 | self.fine_tune_branches = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings], 115 | [train_deidentifier_output, adversary_output]) 116 | self.fine_tune_branches.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 117 | loss=[deidentifier_loss, 'binary_crossentropy'], 118 | metrics=['accuracy']) 119 | 120 | self.train_representer.trainable = True 121 | deidentifier.trainable = False 122 | for adversary in adversary_models: 123 | adversary.trainable = False 124 | self.fine_tune_representer = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings], 125 | [train_deidentifier_output, adversary_output]) 126 | self.fine_tune_representer.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 127 | loss=[deidentifier_loss, adversarial_objective], 128 | loss_weights=[1, 1], metrics=['accuracy']) 129 | 130 | @property 131 | def complete_model(self): 132 | return self.fine_tune_branches 133 | 134 | 135 | def adversarial_objective(y_true, y_pred): 136 | loss = binary_crossentropy(y_true, y_pred) 137 | random_guessing = -K.log(0.5) 138 | return K.abs(loss - random_guessing) 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sharing Training Data for De-Identification 2 | 3 | [![Build Status](https://www.travis-ci.org/maxfriedrich/deid-training-data.svg?branch=master)](https://www.travis-ci.org/maxfriedrich/deid-training-data) 4 | 5 | **Update 2019-08-11:** Our paper ["Adversarial Learning of Privacy-Preserving Text Representations for De-Identification of Medical Records"](https://www.aclweb.org/anthology/papers/P/P19/P19-1584/) was published at ACL 2019. 6 | 7 | This is the code for my [Master's thesis](https://www.inf.uni-hamburg.de/en/inst/ab/lt/teaching/theses/completed-theses/2018-ma-friedrich.pdf). It's about automatic transformations that can be applied to medical text data that… 8 | 9 | - allow training a de-identification model (i.e. finding all protected information in text) 10 | - do not allow attackers to infer any protected information. 11 | 12 | ## Main Contribution 13 | 14 | An adversarial deep learning architecture that learns a private representation of medical text. The representation model is an LSTM model that adds Gaussian noise of a trainable scale to its inputs and outputs. 15 | 16 | Adversarial architecture 17 | 18 | The representation fulfills two invariance criteria that are both enforced by binary classifier LSTM adversary models that receive sequence pairs as inputs. 19 | 20 | Left: Representations should be invariant to *any* protected information token being replaced with a neighbor in an embedding space (e.g. substituting a name or date). 21 | 22 | Right: Looking up the same token sequence multiple times should result in a representation that is randomly different by a high enough degree that it could be the representation of a neighboring sequence. 
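During fine-tuning of the representation model, the adversaries are not simply maximized against; the representer is trained with the `adversarial_objective` defined in [`deid/model/adversarial.py`](deid/model/adversarial.py), which pushes the adversaries' binary cross-entropy toward -log(0.5) ≈ 0.693, i.e. toward random guessing. A NumPy re-statement for illustration (the function name below is only for this sketch, not part of the package):

```python
import numpy as np


def adversarial_objective_reference(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Distance of the adversary's binary cross-entropy from chance level, -log(0.5)."""
    eps = 1e-7  # avoid log(0), similar to the clipping Keras applies internally
    y_pred = np.clip(y_pred, eps, 1 - eps)
    bce = np.mean(-(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)))
    return abs(bce - (-np.log(0.5)))


# An adversary that always answers 0.5 gives the representer the minimum objective value of 0.
print(adversarial_objective_reference(np.array([1.0, 0.0]), np.array([0.5, 0.5])))  # ~0.0
```

The minimum is reached exactly when the adversary cannot do better than chance on the sequence pairs, which is how the two invariance criteria above are enforced.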
23 | 24 | First adversary   Second adversary 25 | 26 | ## Installation 27 | 28 | - Check out the repository including submodules. If you're doing a new clone: 29 | 30 | ```bash 31 | git clone --recurse-submodules git@github.com:maxfriedrich/deid-training-data.git 32 | ``` 33 | 34 | - Or, if you already cloned the repository: 35 | 36 | ```bash 37 | git submodule update --init 38 | ``` 39 | 40 | - Create a Conda environment for the project. If you want the environment name to be something other than `deid-training-data` or use `tensorflow-gpu` instead of `tensorflow`, adapt the `environment.yml` file before running this command. Then activate the environment. 41 | 42 | ```bash 43 | cd deid-training-data 44 | conda env create 45 | conda activate deid-training-data 46 | ``` 47 | 48 | - Download the English language model for spaCy: 49 | 50 | ```bash 51 | python -m spacy download en 52 | ``` 53 | 54 | - Verify that the environment is working by running the tests: 55 | 56 | ```bash 57 | DEID_TEST_CONFIG=1 nosetests --with-doctest 58 | ``` 59 | 60 | - Adapt the [environment file](deid/env.py). 61 | 62 | - Decide which embeddings you want to use: 63 | 64 | - For **FastText**, get a [fastText embeddings binary](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.bin.zip) (4.5 GB download) as well as the [corresponding `.vec` file of precomputed embeddings](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip) (590 MB download) and put them in the resources directory. Adapt the path [here](deid/embeddings/fasttext.py) if necessary. Then convert the precomputed fastText embeddings to a `{word: ind}` dictionary and numpy matrix file: 65 | 66 | ```bash 67 | python -m deid.tools.embeddings --fasttext-precomputed 68 | ``` 69 | 70 | - For **GloVe**, download [a set of pre-trained word vectors](https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors) and put it into the resources directory. Adapt the path and dimension [here](deid/embeddings/glove.py) if you're not using the Wikipedia-pretrained 300d embeddings. 71 | 72 | - For **ELMo**, you don't need to download anything. 73 | 74 | - Get the [i2b2 data](https://www.i2b2.org/NLP/DataSets/) and extract `training-PHI-Gold-Set1` into `train_xml`, `training-PHI-Gold-Set2` into `validation_xml`, and `testing-PHI-Gold-fixed` into a `test_xml` directory. 75 | 76 | - Fix one of the xml files where indices are offset after a special character: 77 | 78 | ```bash 79 | python -m deid.tools.fix_180-03 /path/to/validation_xml 80 | ``` 81 | 82 | - Convert the xml files with standoff annotations to an IOB2 format csv and a txt file containing the raw text: 83 | 84 | ```bash 85 | ./scripts/xml_to_csv 86 | ``` 87 | 88 | The `xml_to_csv` script calls the `deid.tools.i2b2_xml_to_csv` module with the `train_xml`, `validation_xml` and `test_xml` directories. It will output some inconsistencies in the data (standoff annotation texts differ from original text), but we'll ignore those for now. 89 | 90 | - Create an embeddings cache, again depending on your choice(s) of embeddings: 91 | 92 | - For **FastText**, this command writes all words from the train, test, and validation set to a pickle cache (5 minutes on my machine). 93 | 94 | ```bash 95 | python -m deid.tools.embeddings --fasttext-cache 96 | ``` 97 | 98 | - For **ELMo**, this command looks up all sentences from the train, test, and validation set and writes them to many pickle files. This is slow, taking up to 3 hours.
99 | 100 | ```bash 101 | python -m deid.tools.embeddings --elmo-cache 102 | ``` 103 | 104 | ## Experiments 105 | 106 | You can find these experiments in the [`deid/experiment`](deid/experiment) directory: 107 | 108 | - A [basic experiment](deid/experiment/basic.py) that can be used for training models on raw as well as augmented data 109 | - An implementation of [alternating adversarial training](deid/experiment/alternating.py) similar to [Feutry et al. (2018)](https://arxiv.org/abs/1802.09386) 110 | - Evaluation experiments for [automatic pseudonymization](deid/experiment/mtn_evaluation.py), discriminating [real from automatically pseudonymized sentences](deid/experiment/fake_sentences.py), and the [alternating training](deid/experiment/alternating_evaluation.py) 111 | 112 | To run an experiment: 113 | 114 | - Modify the [example config template](deid/config_template.yaml.example) and rename it to `.yaml`. Generate configs from it using the `config` tool: 115 | 116 | ```bash 117 | python -m deid.tools.config /path/to/config_template.yaml 118 | ``` 119 | 120 | Specify the number of configs with the `-n` option. For a grid search instead of random samples, use the `-a` option (careful, this might generate thousands of configs depending on the hyperparameter space!). 121 | 122 | - Run a single experiment from a config: 123 | 124 | ```bash 125 | python -m deid.experiment /path/to/config.yaml 126 | ``` 127 | 128 | This will output predictions and save a history pickle to an experiment directory inside `env.work_dir`. 129 | 130 | - Or set the `DEID_CONFIG_DIR` variable to the config directory and use the `queue` script to run all experiment configs from the `${DEID_CONFIG_DIR}/todo` directory (they will be processed sequentially and moved to the `${DEID_CONFIG_DIR}/done` directory). 131 | 132 | ```bash 133 | DEID_CONFIG_DIR=/path/to/config/dir ./scripts/queue 134 | ``` 135 | 136 | ## Evaluation 137 | 138 | The evaluation using a [modified version](deid/tools/i2b2) (`2to3`, minor fixes) of the [official evaluation script](https://github.com/kotfic/i2b2_evaluation_scripts) is run automatically in the experiments. You can also call it like this to evaluate a directory of XML predictions: 139 | 140 | ```bash 141 | python -m deid.tools.i2b2.evaluate phi /path/to/predictions /path/to/i2b2_data/validation_xml/ 142 | ``` 143 | -------------------------------------------------------------------------------- /deid/embeddings/matrix.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Tuple, NamedTuple, Union, Optional, Dict 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from . import Embeddings 7 | 8 | 9 | class EmbeddingSimilarity(NamedTuple): 10 | rank: int 11 | word: str 12 | similarity: float 13 | vec: np.ndarray 14 | 15 | 16 | MostSimilarResult = List[EmbeddingSimilarity] 17 | WordOrVec = Union[str, np.ndarray] 18 | 19 | 20 | class Matrix: 21 | """ Transforms a lookup-based Embeddings object into a classical embedding matrix by looking up a fixed vocabulary 22 | and storing the results. The matrix can then be used for distance measuring. 23 | """ 24 | 25 | def __init__(self, lookup_embeddings: Embeddings, vocab: Optional[Iterable[str]] = None, 26 | precomputed_word2ind: Optional[Dict[str, int]] = None, precomputed_matrix: Optional[np.ndarray] = None, 27 | verbose: bool = False) -> None: 28 | """ Initialize the Matrix object. 
29 | 30 | :param lookup_embeddings: the embeddings object used for lookup 31 | :param vocab: an iterable containing the words that should be stored in the matrix 32 | :param precomputed_word2ind: a precomputed word2ind dict, e.g. from the fastText .vec file 33 | :param precomputed_matrix: a precomputed embedding matrix, e.g. from the fastText .vec file 34 | :param verbose: setting this to True will show a progress bar when first looking up embeddings as well as output 35 | means when computing distances 36 | """ 37 | self.verbose = verbose 38 | self.lookup_embeddings = lookup_embeddings 39 | 40 | if vocab is not None: 41 | self._init_from_vocab(lookup_embeddings, vocab=vocab) 42 | elif precomputed_word2ind is not None and precomputed_matrix is not None: 43 | self._init_from_word2ind_and_matrix(precomputed_word2ind, precomputed_matrix) 44 | else: 45 | raise ValueError('The Matrix needs to be initialized either with vocab or word2ind+matrix') 46 | 47 | def _init_from_vocab(self, lookup_embeddings, vocab): 48 | vocab = set(vocab) 49 | self.vocab_size = len(vocab) 50 | self.word2ind = {word: i for i, word in enumerate(vocab)} 51 | self.ind2word = {i: word for i, word in enumerate(vocab)} 52 | self.embedding_matrix = np.zeros((self.vocab_size, lookup_embeddings.size)) 53 | self.is_norm = False 54 | 55 | items: Iterable[Tuple[str, int]] = self.word2ind.items() 56 | if self.verbose: 57 | items = tqdm(items, desc='Looking up embeddings') 58 | for word, ind in items: 59 | looked_up = lookup_embeddings.lookup(word) 60 | if np.count_nonzero(looked_up) > 0: 61 | self.embedding_matrix[ind] = looked_up 62 | else: 63 | # this shouldn't happen anymore 64 | raise RuntimeError(f'Embedding vector for {word} is all zeros') 65 | 66 | def _init_from_word2ind_and_matrix(self, word2ind, matrix): 67 | self.vocab_size = len(word2ind) 68 | self.word2ind = word2ind 69 | self.ind2word = {i: word for word, i in self.word2ind.items()} 70 | self.embedding_matrix = matrix 71 | self.is_norm = True 72 | 73 | def init_norms(self, force: bool = False) -> None: 74 | """ Initializes self.norms with pre-computed L2 normalized vectors for cosine distance computation. 75 | 76 | :param force: setting this to True will update the norms even if they were already computed 77 | :return: None 78 | """ 79 | if not self.is_norm or force: 80 | # noinspection PyAttributeOutsideInit 81 | self.embedding_matrix = self.embedding_matrix / np.sqrt((self.embedding_matrix ** 2).sum(-1))[ 82 | ..., np.newaxis] 83 | self.is_norm = True 84 | 85 | def _most_similar_cosine_measurement(self, vec): 86 | self.init_norms() 87 | normalized_vec = vec / np.linalg.norm(vec) 88 | return np.dot(self.embedding_matrix, normalized_vec) 89 | 90 | def most_similar_cosine(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult: 91 | """ Calculate the cosine distance of the input vector to all vectors in the embedding matrix and return the 92 | most similar ones. 
93 | 94 | :param word_or_vec: the input word or vector 95 | :param n: the number of results to return, or None if all should be returned 96 | :return: a list of MostSimilarResult objects 97 | """ 98 | return self._generic_most_similar(word_or_vec, self._most_similar_cosine_measurement, 99 | higher_is_more_similar=True, n=n) 100 | 101 | def cosine_distance_rank(self, word_or_vec: WordOrVec, word): 102 | return self._generic_rank(word_or_vec, word, self._most_similar_cosine_measurement, higher_is_more_similar=True) 103 | 104 | def cosine_distance(self, vec: np.ndarray, word: str) -> float: 105 | """ Returns the cosine distance between an input word and vector. 106 | 107 | :param vec: the input vector 108 | :param word: the input word 109 | :return: a float between -1 and 1 110 | """ 111 | self.init_norms() 112 | normalized_vec = vec / np.linalg.norm(vec) 113 | return float(np.dot(self.embedding_matrix[self.word2ind[word]], normalized_vec)) 114 | 115 | def most_similar_l2(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult: 116 | """ Calculate the L2 norm distance of the input vector to all vectors in the embedding matrix and return the 117 | most similar ones. 118 | 119 | :param word_or_vec: the input word or vector 120 | :param n: the number of results to return, or None if all should be returned 121 | :return: a list of (word, distance) pairs, with lower distance meaning more similar 122 | """ 123 | 124 | def measurement(vec): 125 | distances = np.zeros(self.vocab_size) 126 | for i, emb in enumerate(self.embedding_matrix): 127 | distances[i] = np.linalg.norm(vec - emb) 128 | return distances 129 | 130 | return self._generic_most_similar(word_or_vec, measurement, higher_is_more_similar=False, n=n) 131 | 132 | def _lookup_if_needed(self, word_or_vec: WordOrVec) -> np.ndarray: 133 | if type(word_or_vec) == str: 134 | return self.lookup_embeddings.lookup(word_or_vec) 135 | else: 136 | return word_or_vec 137 | 138 | def _generic_most_similar(self, word_or_vec: WordOrVec, measurement, higher_is_more_similar, n: int = 20): 139 | self.init_norms() 140 | vec = self._lookup_if_needed(word_or_vec) 141 | distances = measurement(vec) 142 | assert len(distances) == len(self.embedding_matrix) 143 | if self.verbose: 144 | print('mean distance', np.mean(distances)) 145 | 146 | distances_for_sorting = -distances if higher_is_more_similar else distances 147 | 148 | if n is None or n == len(self.embedding_matrix): 149 | sorted_most_similar_ind = np.argsort(distances_for_sorting) 150 | else: 151 | most_similar_ind = np.argpartition(distances_for_sorting, n)[:n] 152 | sorted_most_similar_ind = most_similar_ind[np.argsort(distances_for_sorting[most_similar_ind])] 153 | 154 | return [EmbeddingSimilarity(rank=rank, 155 | word=self.ind2word[ind], 156 | similarity=distances[ind], 157 | vec=self.embedding_matrix[ind]) 158 | for rank, ind in enumerate(sorted_most_similar_ind, start=1)] 159 | 160 | def _generic_rank(self, word_or_vec: WordOrVec, word, measurement, higher_is_more_similar): 161 | self.init_norms() 162 | vec = self._lookup_if_needed(word_or_vec) 163 | distances = measurement(vec) 164 | distances = -distances if higher_is_more_similar else distances 165 | 166 | word_distance = distances[self.word2ind[word]] 167 | return np.count_nonzero(distances[distances < word_distance]) + 1 168 | -------------------------------------------------------------------------------- /deid/experiment/mtn_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 
2 | import os 3 | import pickle 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint 9 | from keras.layers import Input 10 | 11 | from . import experiment_directory 12 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence 13 | from ..data.augment import Augment, get as get_strategy 14 | from ..data.batch import IteratorWithEpochLength 15 | from ..data.util import pad_2d_sequences 16 | from ..embeddings import Matrix, get as get_embeddings 17 | from ..env import env 18 | from ..model.adversary import TwoRepresentationsAreSameOriginalDiscriminator 19 | 20 | 21 | def fake_augmented_sentences_batch(X, y, indices, augm_alternatives, fake_alternatives, split_condition): 22 | indices = [i for i in indices if split_condition(X[i], y[i])] 23 | real_sentences = [X[i] for i in indices] 24 | augmented_sentences = [augm_alternatives[ind][0] for ind in indices] 25 | fake_sentences = [random.choice(fake_alternatives[ind]) for ind in indices] 26 | 27 | X_1 = [] 28 | X_2 = [] 29 | y = [] 30 | for real, augm, fake in zip(real_sentences, augmented_sentences, fake_sentences): 31 | X_1 += [augm, augm] 32 | X_2 += [real, fake] 33 | y += [1, 0] 34 | 35 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y) 36 | 37 | 38 | class MTNGenerator(IteratorWithEpochLength): 39 | def __init__(self, generator: IteratorWithEpochLength, dataset, dataset2): 40 | self.generator = generator 41 | self.dataset = dataset 42 | self.dataset2 = dataset2 43 | 44 | def __next__(self): 45 | _, _, indices = next(self.generator) 46 | X_1, X_2, adv_y = fake_augmented_sentences_batch(self.dataset.X, self.dataset.y, indices, 47 | self.dataset.augmented, self.dataset2.augmented, 48 | split_condition=is_phi_sentence) 49 | return [X_1, X_2], adv_y 50 | 51 | @property 52 | def epoch_length(self) -> int: 53 | return self.generator.epoch_length 54 | 55 | 56 | def mtn_evaluation_experiment(config): 57 | print('Loading embeddings...') 58 | embeddings = get_embeddings(config['experiment']['embeddings']) 59 | 60 | name = config['name'] 61 | experiment_dir = experiment_directory(name, config['path']) 62 | 63 | print('Loading matrix...') 64 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 65 | precomputed_matrix=embeddings.precomputed_matrix) 66 | 67 | strategy = get_strategy(config['augment']['strategy'], matrix) 68 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix) 69 | adv_strategy = get_strategy('move_to_neighbor-5', matrix) 70 | 71 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy, n_augmentations=1) 72 | 73 | augment2 = Augment(embeddings, strategy=adv_strategy, digit_strategy=digit_strategy, 74 | n_augmentations=config['augment']['n_augmentations'], augment_max=1) 75 | 76 | print('Augmenting training set...', flush=True) 77 | tr = TrainingSet(train_set=config['experiment']['train_set'], 78 | embeddings=embeddings, 79 | use_short_sentences=env.use_short_sentences, 80 | limit_documents=env.limit_training_documents, 81 | augment=augment) 82 | 83 | tr2 = TrainingSet(train_set=config['experiment']['train_set'], 84 | embeddings=embeddings, 85 | use_short_sentences=env.use_short_sentences, 86 | limit_documents=env.limit_training_documents, 87 | augment=augment2) 88 | 89 | assert np.all(tr.X[100] == tr2.X[100]) # making sure that the training sets have the same order 90 | 91 | print('Augmenting validation set...', flush=True) 92 | val = 
ValidationSet(validation_set=config['experiment']['validation_set'], 93 | embeddings=embeddings, 94 | label2ind=tr.label2ind, 95 | use_short_sentences=env.use_short_sentences, 96 | limit_documents=env.limit_validation_documents, 97 | augment=augment) 98 | 99 | val2 = ValidationSet(validation_set=config['experiment']['validation_set'], 100 | embeddings=embeddings, 101 | label2ind=tr.label2ind, 102 | use_short_sentences=env.use_short_sentences, 103 | limit_documents=env.limit_validation_documents, 104 | augment=augment2) 105 | 106 | inputs = {'train_representation': Input(shape=(None, embeddings.size)), 107 | 'fake_representation': Input(shape=(None, embeddings.size))} 108 | adversary = TwoRepresentationsAreSameOriginalDiscriminator(inputs, representation_size=embeddings.size, 109 | lstm_size=embeddings.size) 110 | adversary.model.compile(loss=adversary.loss, optimizer='nadam', metrics=['accuracy']) 111 | 112 | batch_size = test_batch_size = 32 113 | train_gen = MTNGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence, 114 | batch_size=batch_size, yield_indices=True, shuffle=True), tr, tr2) 115 | valid_gen = MTNGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence, 116 | batch_size=test_batch_size, yield_indices=True, shuffle=False), val, 117 | val2) 118 | 119 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 120 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 121 | callbacks = [early_stopping, flush] 122 | if env.save_model: 123 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True) 124 | callbacks.append(checkpoint) 125 | 126 | history = adversary.model.fit_generator(train_gen, 127 | epochs=config['training']['train_epochs'], 128 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 129 | validation_data=valid_gen, 130 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 131 | callbacks=callbacks, 132 | verbose=env.keras_verbose) 133 | 134 | if config['test']['run_test']: 135 | label2ind = tr.label2ind 136 | del tr, tr2, val, val2, train_gen, valid_gen 137 | 138 | if env.save_model: 139 | print('Restoring best weights') 140 | adversary.model.load_weights(os.path.join(experiment_dir, 'model.hdf5')) 141 | 142 | print('Augmenting test set...', flush=True) 143 | 144 | test = ValidationSet(validation_set='test', 145 | embeddings=embeddings, 146 | label2ind=label2ind, 147 | use_short_sentences=env.use_short_sentences, 148 | limit_documents=env.limit_validation_documents, 149 | augment=augment) 150 | 151 | test2 = ValidationSet(validation_set='test', 152 | embeddings=embeddings, 153 | label2ind=label2ind, 154 | use_short_sentences=env.use_short_sentences, 155 | limit_documents=env.limit_validation_documents, 156 | augment=augment2) 157 | test_gen = MTNGenerator(StratifiedSampling(test.X, test.y, split_condition=is_phi_sentence, 158 | batch_size=test_batch_size, yield_indices=True, shuffle=False), test, 159 | test2) 160 | 161 | loss, acc = adversary.model.evaluate_generator(test_gen, int(math.ceil(len(test.X) / test_batch_size))) 162 | print(f'Test loss: {loss}, test acc: {acc}') 163 | history.history['test_loss'] = loss 164 | history.history['test_acc'] = acc 165 | 166 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 167 | print('Saving history to', history_pickle_path) 168 | with open(history_pickle_path, 'wb') as f: 169 | pickle.dump(history.history, f) 170 | 
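
To make the adversary's pairing above concrete, here is a minimal, self-contained sketch of what `fake_augmented_sentences_batch` builds: the augmented view of each sentence appears twice, once paired with its original (label 1) and once with a fake alternative (label 0). This is an editorial illustration, not code from this module; the helper name `toy_pairs` and the fixed sentence shapes are invented, and the real generator pads variable-length sentences with `pad_2d_sequences` and samples fakes with `random.choice`.

```
# Editorial sketch: the adversary must decide whether the second input comes
# from the same original sentence (1) or from a fake alternative (0).
import numpy as np

def toy_pairs(real_sentences, augmented_sentences, fake_sentences):
    x_1, x_2, labels = [], [], []
    for real, augm, fake in zip(real_sentences, augmented_sentences, fake_sentences):
        x_1 += [augm, augm]    # the augmented view is always the first input
        x_2 += [real, fake]    # paired with the original ...
        labels += [1, 0]       # ... and with a fake alternative
    return np.stack(x_1), np.stack(x_2), np.array(labels)

# three toy "sentences" of 4 tokens with 5-dimensional embeddings each
real = [np.random.rand(4, 5) for _ in range(3)]
augm = [s + 0.01 * np.random.randn(4, 5) for s in real]   # lightly perturbed views
fake = [np.random.rand(4, 5) for _ in range(3)]           # unrelated alternatives
x_1, x_2, y = toy_pairs(real, augm, fake)
print(x_1.shape, x_2.shape, y)   # (6, 4, 5) (6, 4, 5) [1 0 1 0 1 0]
```
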
-------------------------------------------------------------------------------- /deid/experiment/basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import numpy as np 7 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint 8 | 9 | from . import DeidentificationEvaluationCallback, evaluate_deid_performance, experiment_directory 10 | from ..data import TrainingSet, ValidationSet, BatchGeneratorWithExtraFeatures, StratifiedSamplingWithExtraFeatures, \ 11 | is_phi_sentence 12 | from ..data.augment import Augment, get as get_strategy 13 | from ..data.class_weight import get as get_class_weight 14 | from ..data.feature import get as get_feature 15 | from ..data.util import compounding 16 | from ..embeddings import PrecomputedEmbeddings, Matrix, get as get_embeddings 17 | from ..env import env 18 | from ..model import get as get_model 19 | from ..model.optimizer import get as get_optimizer 20 | 21 | 22 | def basic_experiment(config): 23 | name = config['name'] 24 | batch_size = config['training']['batch_size'] 25 | test_batch_size = config['training']['test_batch_size'] 26 | if test_batch_size is None: 27 | test_batch_size = batch_size 28 | test_weights = config['test']['test_weights'] 29 | 30 | experiment_dir = experiment_directory(name, config['path']) 31 | 32 | print('Loading embeddings...') 33 | embeddings = get_embeddings(config['experiment']['embeddings']) 34 | print('Done.') 35 | 36 | if config['augment'] is not None and test_weights is None: 37 | if isinstance(embeddings, PrecomputedEmbeddings): 38 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 39 | precomputed_matrix=embeddings.precomputed_matrix) 40 | strategy_kwargs = {'matrix': matrix} 41 | else: 42 | strategy_kwargs = {} 43 | 44 | strategy = get_strategy(config['augment']['strategy'], **strategy_kwargs) 45 | digit_strategy = get_strategy(config['augment']['digit_strategy'], **strategy_kwargs) 46 | augment = Augment(embeddings=embeddings, strategy=strategy, digit_strategy=digit_strategy, 47 | **config['augment']['augment_args']) 48 | else: 49 | augment = None 50 | 51 | if config['experiment']['extra_features'] is None or len(config['experiment']['extra_features']) == 0: 52 | extra_features = [] 53 | else: 54 | extra_features = [get_feature(identifier) for identifier in config['experiment']['extra_features']] 55 | 56 | tr = TrainingSet(train_set=config['experiment']['train_set'], 57 | embeddings=embeddings, 58 | use_short_sentences=env.use_short_sentences, 59 | limit_documents=env.limit_training_documents, 60 | binary_classification=config['experiment']['binary_classification'], 61 | hipaa_only=config['experiment']['hipaa_only'], 62 | augment=augment, 63 | extra_features=extra_features) 64 | 65 | model = get_model(config['experiment']['model'])(name=name, 66 | input_size=embeddings.size, 67 | extra_input_size=tr.X_extra_size, 68 | output_size=tr.output_size, 69 | optimizer=get_optimizer(config['training']['optimizer'])( 70 | **config['training']['optimizer_args']), 71 | **config['model_args']) 72 | 73 | if test_weights is None: 74 | train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir) 75 | else: 76 | model.load_weights(test_weights) 77 | 78 | if config['test']['run_test']: 79 | test_set = config['test']['test_set'] 80 | if test_set is None: 81 | test_set = 'test' 82 | evaluate_deid_performance(model=model, 
batch_size=test_batch_size, embeddings=embeddings, 83 | label2ind=tr.label2ind, ind2label=tr.ind2label, 84 | test_set=test_set, experiment_dir=experiment_dir, 85 | binary_classification=config['experiment']['binary_classification'], 86 | hipaa_only=config['experiment']['hipaa_only'], 87 | extra_features=extra_features, epoch=99) 88 | 89 | 90 | def train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir): 91 | val = ValidationSet(validation_set=config['experiment']['validation_set'], 92 | embeddings=embeddings, 93 | label2ind=tr.label2ind, 94 | use_short_sentences=env.use_short_sentences, 95 | limit_documents=env.limit_validation_documents, 96 | binary_classification=config['experiment']['binary_classification'], 97 | hipaa_only=config['experiment']['hipaa_only'], 98 | extra_features=extra_features) 99 | 100 | if config['augment'] is not None and config['augment']['include_original']: 101 | tr_X, tr_y, tr_X_extra = tr.data_with_augmented 102 | augment_training_generator = None 103 | else: 104 | tr_X, tr_y, tr_X_extra = tr.X, tr.y, tr.X_extra 105 | augment_training_generator = tr.augmented 106 | 107 | print('Size of the training set:', len(tr_X), 'with maxlen:', tr.maxlen) 108 | compound = config['training']['batch_size_compound'] 109 | if compound is not None and compound != 0 and compound < batch_size: 110 | training_batch_size = compounding(1, batch_size, compound) 111 | else: 112 | training_batch_size = batch_size 113 | 114 | if config['training']['batch_mode'] == 'stratified': 115 | train_gen_class, train_gen_args = StratifiedSamplingWithExtraFeatures, {'split_condition': is_phi_sentence} 116 | else: 117 | train_gen_class, train_gen_args = BatchGeneratorWithExtraFeatures, {} 118 | 119 | training_generator = train_gen_class(tr_X, tr_y, tr_X_extra, 120 | batch_size=training_batch_size, 121 | augment=augment_training_generator, **train_gen_args) 122 | 123 | validation_generator = BatchGeneratorWithExtraFeatures(val.X, val.y, val.X_extra, test_batch_size, 124 | shuffle=False) 125 | 126 | if config['experiment']['class_weight'] is not None: 127 | class_weight = get_class_weight(config['experiment']['class_weight'])(tr.output_size, tr_y) 128 | else: 129 | class_weight = None 130 | 131 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 132 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 133 | evaluation = DeidentificationEvaluationCallback(model, batch_size=test_batch_size, embeddings=embeddings, 134 | label2ind=tr.label2ind, ind2label=tr.ind2label, 135 | test_set=config['experiment']['validation_set'], 136 | experiment_dir=experiment_dir, 137 | evaluate_every=config['training']['i2b2_evaluate_every'], 138 | binary_classification=config['experiment'][ 139 | 'binary_classification'], 140 | hipaa_only=config['experiment']['hipaa_only'], 141 | extra_features=extra_features) 142 | 143 | callbacks = [early_stopping, evaluation, flush] 144 | if env.save_model: 145 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True) 146 | callbacks.append(checkpoint) 147 | 148 | history = model.fit_generator(training_generator, 149 | epochs=config['training']['train_epochs'], 150 | steps_per_epoch=int(math.ceil(len(tr_X) / batch_size)), 151 | validation_data=validation_generator, 152 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 153 | class_weight=class_weight, 154 | callbacks=callbacks, 155 | 
verbose=env.keras_verbose, 156 | use_multiprocessing=True) 157 | if env.save_model: 158 | best_epoch = np.argmin(history.history['val_loss']) + 1 # epoch numbering is 1-based 159 | print(f'Resetting to weights from epoch {best_epoch:02d}') 160 | model.load_weights(os.path.join(experiment_dir, 'model.hdf5')) 161 | 162 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 163 | print('Saving history to', history_pickle_path) 164 | with open(history_pickle_path, 'wb') as f: 165 | pickle.dump(history.history, f) 166 | -------------------------------------------------------------------------------- /deid/tools/i2b2/README.md: -------------------------------------------------------------------------------- 1 | **i2b2 2014 Evaluation Script** 2 | 3 | This script is distributed as a part of the i2b2 2014 Cardiac Risk and 4 | Protected Health Information (PHI) tasks. 5 | 6 | If you would like to contribute to this project, pull requests are welcome. 7 | Please see: https://help.github.com/articles/fork-a-repo for instructions 8 | on how to make a fork of this repository, and 9 | https://help.github.com/articles/using-pull-requests for instructions on 10 | making a pull request. Suggestions for improvements, bugs or feature requests 11 | may be directed to the i2b2 evaluation scripts' issues page located at: 12 | https://github.com/kotfic/i2b2_evaluation_scripts/issues 13 | 14 | _Setup_ 15 | 16 | This script also requires the following Python packages: 17 | lxml version 3.3.1 18 | numpy version 1.8.0 19 | 20 | If you get an error when running the script, please make sure that these 21 | are installed and accessible to your Python installation. 22 | 23 | 24 | _Running the script_ 25 | 26 | This script is intended to be used via the 27 | command line: 28 | python evaluate.py [cr|phi] [FLAGS] SYSTEM GOLD 29 | 30 | Where 'cr' produces Precision, Recall and F1 (P/R/F1) measures for the 31 | cardiac risk task and 'phi' produces P/R/F1 for the PHI task. SYSTEM and GOLD 32 | may be individual files representing system output in the case of SYSTEM and 33 | the gold standard in the case of GOLD. SYSTEM and GOLD may also be 34 | directories, in which case all files in SYSTEM will be compared to files in the 35 | GOLD directory based on their file names. See below for more information 36 | on the different output the cr/phi flag produces. 37 | 38 | 39 | 40 | _File name restrictions_ 41 | 42 | File names MUST be of the form: 43 | XXX-YY.xml where XXX is the patient id, and YY is the document id. The 44 | files from your system runs are matched to the gold standard file by 45 | file name alone. If your system outputs file names in a different format, 46 | you will need to either modify your system or this script.
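
As an illustration of this matching convention, SYSTEM and GOLD files can be paired purely by their XXX-YY.xml names. This is an editorial sketch, not code from evaluate.py; the function name and the directory arguments are placeholders.

```
# Editorial sketch of the file-name matching convention described above;
# not the evaluation script's own implementation.
from pathlib import Path

def pair_by_name(system_dir, gold_dir):
    gold = {p.name: p for p in Path(gold_dir).glob('*.xml')}
    pairs = []
    for sys_file in sorted(Path(system_dir).glob('*.xml')):
        if sys_file.name in gold:        # match by file name alone, as required above
            pairs.append((sys_file, gold[sys_file.name]))
        else:
            print('no gold file for', sys_file.name)
    return pairs

# pairs = pair_by_name('system/', 'gold/')   # placeholder directory names
```
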
47 | 48 | 49 | _Output for Risk Factor Track_ 50 | 51 | To compare your system output for the Risk Factor track, run the following 52 | command for individual files: 53 | 54 | python evaluate.py cr {system.xml} {gold.xml} 55 | (replace the file names in {}s with the names of your actual files) 56 | 57 | or, to run the script on directories of files: 58 | python evaluate.py cr {system}/ {gold}/ 59 | (again, replace the folder names in {}s with the names of your actual folders) 60 | 61 | Running one of these versions will produce output in this format: 62 | 63 | ``` 64 | (# of files) Measure Macro (SD) Micro 65 | --------------------------------------------------------------------------- 66 | Total Precision 1.0 (0.0) 1.0 67 | Recall 1.0 (0.0) 1.0 68 | F1 1.0 1.0 69 | ``` 70 | 71 | The script evaluates the accuracy of your tags based on tag type 72 | and all the attributes (except ID). If you want to get more details 73 | about the output of your system, such as which attributes it is 74 | getting right/wrong, you can use the more experimental flags. Please see 75 | the evaluate.py script itself for more information on the flags. 76 | 77 | 78 | _Output for De-identification Track_ 79 | 80 | To compare your system output for the de-identification track, run the following 81 | command on individual files: 82 | 83 | python evaluate.py phi {system.xml} {gold.xml} 84 | (replace the file names in {}s with the names of your actual files) 85 | 86 | or, to run the script on directories of files: 87 | python evaluate.py phi {system}/ {gold}/ 88 | (again, replace the folder names in {}s with the names of your actual folders) 89 | 90 | 91 | Running one of these versions will produce output that looks like this: 92 | 93 | ``` 94 | Strict (521) Measure Macro (SD) Micro 95 | --------------------------------------------------------------------------- 96 | Total Precision 0.6635 (0.11) 0.6537 97 | Recall 0.4906 (0.12) 0.4988 98 | F1 0.5641 0.5658 99 | 100 | 101 | Relaxed (521) Measure Macro (SD) Micro 102 | --------------------------------------------------------------------------- 103 | Total Precision 0.897 (0.086) 0.9047 104 | Recall 0.6663 (0.15) 0.6903 105 | F1 0.7646 0.7831 106 | 107 | 108 | HIPAA Strict (521) Measure Macro (SD) Micro 109 | --------------------------------------------------------------------------- 110 | Total Precision 0.7406 (0.098) 0.7225 111 | Recall 0.7406 (0.098) 0.7225 112 | F1 0.7406 0.7225 113 | 114 | 115 | HIPAA Relaxed (521) Measure Macro (SD) Micro 116 | --------------------------------------------------------------------------- 117 | Total Precision 1.0 (0.0) 1.0 118 | Recall 1.0 (0.0) 1.0 119 | F1 1.0 1.0 120 | ``` 121 | 122 | A few notes to explain this output: 123 | - The "(521)" represents the number of files the script was run on 124 | - "Strict" evaluations require that the offsets for the system outputs match *exactly* 125 | - "Relaxed" evaluations allow for the "end" part of the offsets to be off by 2--this allows for variations in including "'s" and other endings that many systems will ignore due to tokenization 126 | - "HIPAA" evaluations include only the tags required by a strict interpretation of the HIPAA guidelines.
See the list below for which tags are included in this evaluation 127 | 128 | 129 | 130 | _HIPAA-compliant PHI_ 131 | 132 | - NAME/PATIENT 133 | - AGE 134 | - LOCATION/CITY 135 | - LOCATION/STREET 136 | - LOCATION/ZIP 137 | - LOCATION/ORGANIZATION 138 | - DATE 139 | - CONTACT/PHONE 140 | - CONTACT/FAX 141 | - CONTACT/EMAIL 142 | - ID/SSN 143 | - ID/MEDICALRECORD 144 | - ID/HEALTHPLAN 145 | - ID/ACCOUNT 146 | - ID/LICENSE 147 | - ID/VEHICLE 148 | - ID/DEVICE 149 | - ID/BIOID 150 | - ID/IDNUM 151 | 152 | 153 | _Verbose flag_ 154 | 155 | To get document-by-document information about the accuracy of your tags, you can use the 156 | "-v" or "--verbose" flag. For example: 157 | 158 | python evaluate.py cr -v system/ gold/ 159 | 160 | 161 | _Advanced usage_ 162 | 163 | Some additional functionality is made available for testing and error 164 | analysis. This functionality is provided AS IS in the hope that it will 165 | be useful. It should be considered 'experimental' at best, may be bug-prone, 166 | and will not be explicitly supported; bug reports and pull requests 167 | are nevertheless welcome. 168 | 169 | Advanced Flags: 170 | 171 | --filter [TAG ATTRIBUTES] :: run P/R/F1 measures in either summary or verbose 172 | mode (see -v) for the list of attributes defined 173 | by TAG ATTRIBUTES. This may be a comma separated 174 | list of tag names and attribute values. For more 175 | see Advanced Examples. 176 | --conjunctive :: If multiple values are passed to filter as a comma separated 177 | list, treat them as a series of AND based filters instead of 178 | a series of OR based filters 179 | --invert :: run P/R/F1 on the inverted set of tags defined by TAG ATTRIBUTES 180 | in the --filter tag (see --filter). 181 | 182 | Advanced Examples: 183 | 184 | python evaluate.py cr --filter MEDICATION system/ gold/ 185 | 186 | Evaluate system output in system/ folder against gold/ folder considering 187 | only MEDICATION tags 188 | 189 | python evaluate.py cr --filter CAD,OBESE system/ gold/ 190 | 191 | Evaluate system output in system/ folder against gold/ folder considering 192 | only CAD or OBESE tags. Comma-separated lists to the --filter flag are 193 | conjoined via OR. 194 | 195 | python evaluate.py cr --filter "CAD,before DCT" system/ gold/ 196 | 197 | Evaluate system output in system/ folder against gold/ folder considering 198 | only CAD *OR* tags with a time attribute of before DCT. This is probably 199 | not what you want when filtering; see the next example. 200 | 201 | python evaluate.py cr --conjunctive \ 202 | --filter "CAD,before DCT" system/ gold/ 203 | 204 | Evaluate system output in system/ folder against gold/ folder considering 205 | CAD tags *AND* tags with a time attribute of before DCT. 206 | 207 | python evaluate.py cr --invert \ 208 | --filter MEDICATION system/ gold/ 209 | 210 | Evaluate system output in system/ folder against gold/ folder considering 211 | any tag which is NOT a MEDICATION tag.
212 | 213 | python evaluate.py cr --invert \ 214 | --conjunctive \ 215 | --filter "CAD,before DCT" system/ gold/ 216 | 217 | Evaluate system output in system/ folder against gold/ folder considering 218 | any tag which is NOT CAD and with a time attribute of 'before DCT' 219 | -------------------------------------------------------------------------------- /deid/data/batch.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import random 4 | from typing import Generic, TypeVar, Optional 5 | from typing import Sequence, Union, Tuple, Callable, Dict, Iterator, List 6 | 7 | import numpy as np 8 | 9 | from .util import pad_2d_sequences, peek 10 | 11 | X_type = TypeVar('X_type') 12 | y_type = TypeVar('y_type') 13 | 14 | TwoArrays = Tuple[np.ndarray, np.ndarray] 15 | ThreeArrays = Tuple[np.ndarray, np.ndarray, np.ndarray] 16 | 17 | 18 | class IteratorWithEpochLength(Iterator): 19 | def __next__(self) -> Union[TwoArrays, ThreeArrays]: 20 | raise NotImplementedError 21 | 22 | @property 23 | def epoch_length(self) -> int: 24 | raise NotImplementedError 25 | 26 | 27 | class IteratorWithEpochLengthImpl(Generic[X_type, y_type], IteratorWithEpochLength): 28 | def __init__(self, 29 | X: Sequence[X_type], 30 | y: Sequence[y_type], 31 | total_size: int, 32 | batch_size_iter: Iterator[int], 33 | yield_incomplete_batches: bool = True, 34 | yield_indices: bool = False, 35 | augment: Optional[Dict[int, Sequence[X_type]]] = None, 36 | augment_include_original: bool = True) -> None: 37 | assert len(X) == len(y) 38 | self.X, self.y = X, y 39 | self.augment = augment 40 | self.augment_include_original = augment_include_original 41 | 42 | self.total_size = total_size 43 | self.batch_size_iter = batch_size_iter 44 | self.yield_indices = yield_indices 45 | self.yield_incomplete_batches = yield_incomplete_batches 46 | self.init_epoch() 47 | 48 | def __next__(self) -> Union[TwoArrays, ThreeArrays]: 49 | if self.batch_number == self.epoch_length: 50 | self.init_epoch() 51 | 52 | current_batch_size = self.epoch_batch_sizes[self.batch_number] 53 | end = min(self.cursor + current_batch_size, self.total_size) 54 | batch_ind = self.select_batch_ind(self.cursor, end) 55 | 56 | if self.augment is not None: 57 | if self.augment_include_original: 58 | batch_X = [random.choice(self.augment[i] + [self.X[i]]) for i in batch_ind] 59 | else: 60 | batch_X = [random.choice(self.augment[i]) if len(self.augment[i]) > 0 else self.X[i] for i in batch_ind] 61 | else: 62 | batch_X = [self.X[i] for i in batch_ind] 63 | batch_y = [self.y[i] for i in batch_ind] 64 | self.cursor += current_batch_size 65 | self.batch_number += 1 66 | 67 | batch_X, batch_y = pad_2d_sequences(batch_X), pad_2d_sequences(batch_y) 68 | if self.yield_indices: 69 | return batch_X, batch_y, batch_ind 70 | else: 71 | return batch_X, batch_y 72 | 73 | def select_batch_ind(self, cursor, end) -> np.ndarray: 74 | raise NotImplementedError 75 | 76 | def __iter__(self): 77 | return self 78 | 79 | @property 80 | def epoch_length(self) -> int: 81 | return len(self.epoch_batch_sizes) 82 | 83 | # noinspection PyAttributeOutsideInit 84 | def init_epoch(self): 85 | self.batch_number = self.cursor = 0 86 | self.epoch_batch_sizes = self._make_epoch_batch_sizes(self.total_size) 87 | 88 | def _make_epoch_batch_sizes(self, total_size): 89 | """ Take items from the batch size iter until they make an epoch.""" 90 | result = [] 91 | seen = 0 92 | while seen < total_size: 93 | if self.yield_incomplete_batches: 94 | 
size = min(next(self.batch_size_iter), total_size - seen) 95 | seen += size 96 | result.append(size) 97 | else: 98 | size, self.batch_size_iter = peek(self.batch_size_iter) 99 | if seen + size > total_size: 100 | break 101 | size = next(self.batch_size_iter) 102 | seen += size 103 | result.append(size) 104 | 105 | assert seen == total_size if self.yield_incomplete_batches else seen <= total_size 106 | return result 107 | 108 | 109 | class BatchGenerator(IteratorWithEpochLengthImpl): 110 | def __init__(self, 111 | X: Sequence[X_type], 112 | y: Sequence[y_type], 113 | batch_size: Union[int, Iterator[int]], 114 | shuffle: bool = True, 115 | **kwargs) -> None: 116 | 117 | self.shuffle = shuffle 118 | 119 | if isinstance(batch_size, int): 120 | batch_size_iter = itertools.repeat(batch_size) 121 | else: 122 | batch_size_iter = batch_size 123 | super().__init__(X, y, total_size=len(X), batch_size_iter=batch_size_iter, **kwargs) 124 | 125 | # noinspection PyAttributeOutsideInit 126 | def init_epoch(self): 127 | super().init_epoch() 128 | if self.shuffle: 129 | self.shuffled_ind = np.random.permutation(np.arange(len(self.X))) 130 | else: 131 | self.shuffled_ind = np.arange(len(self.X)) 132 | 133 | def select_batch_ind(self, cursor, end): 134 | return self.shuffled_ind[cursor:end] 135 | 136 | 137 | class BatchGeneratorWithExtraFeatures(BatchGenerator): 138 | def __init__(self, 139 | X: Sequence[X_type], 140 | y: Sequence[y_type], 141 | X_extra, 142 | batch_size: Union[int, Iterator[int]], 143 | **kwargs) -> None: 144 | self.X_extra = X_extra 145 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs) 146 | 147 | def __next__(self): 148 | X, y, ind = super().__next__() 149 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y 150 | 151 | 152 | class StratifiedSampling(IteratorWithEpochLengthImpl): 153 | def __init__(self, 154 | X: Sequence[X_type], 155 | y: Sequence[y_type], 156 | batch_size: Union[int, Iterator[int]], 157 | split_condition: Callable[[X_type, y_type], bool], 158 | shuffle: bool = False, 159 | **kwargs) -> None: 160 | self.X_pos_ind, self.X_neg_ind = self.split_indices(X, y, split_condition) 161 | self.shorter_partition_size = min(len(self.X_pos_ind), len(self.X_neg_ind)) 162 | 163 | self.shuffle = shuffle 164 | 165 | if isinstance(batch_size, int): 166 | batch_size_iter = itertools.repeat(math.ceil(batch_size / 2)) 167 | else: 168 | double_batch_size_iter: Iterator[int] = batch_size 169 | batch_size_iter = (math.ceil(size / 2) for size in double_batch_size_iter) 170 | 171 | super().__init__(X, y, total_size=self.shorter_partition_size, batch_size_iter=batch_size_iter, **kwargs) 172 | 173 | # noinspection PyAttributeOutsideInit 174 | def init_epoch(self): 175 | super().init_epoch() 176 | if self.shuffle: 177 | self.shuffled_pos = np.random.permutation(self.X_pos_ind) 178 | self.shuffled_neg = np.random.permutation(self.X_neg_ind) 179 | else: 180 | self.shuffled_pos, self.shuffled_neg = self.X_pos_ind, self.X_neg_ind 181 | 182 | def select_batch_ind(self, cursor, end): 183 | return np.concatenate((self.shuffled_pos[cursor:end], self.shuffled_neg[cursor:end]), axis=0) 184 | 185 | @staticmethod 186 | def split_indices(X: Sequence[X_type], 187 | y: Sequence[y_type], 188 | split_condition: Callable[[X_type, y_type], bool]) -> Tuple[Sequence[int], Sequence[int]]: 189 | pos: List[int] = [] 190 | neg: List[int] = [] 191 | for i in range(len(X)): 192 | (pos if split_condition(X[i], y[i]) else neg).append(i) 193 | return pos, neg 194 | 195 | 196 | class 
StratifiedSamplingWithExtraFeatures(StratifiedSampling): 197 | def __init__(self, 198 | X: Sequence[X_type], 199 | y: Sequence[y_type], 200 | X_extra, 201 | batch_size: Union[int, Iterator[int]], 202 | **kwargs) -> None: 203 | self.X_extra = X_extra 204 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs) 205 | 206 | def __next__(self): 207 | X, y, ind = super().__next__() 208 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y 209 | 210 | 211 | def fake_sentences_batch(X: np.ndarray, 212 | y: np.ndarray, 213 | indices: np.ndarray, 214 | alternatives: Dict[int, Sequence[np.ndarray]], 215 | split_condition: Callable[[np.ndarray, np.ndarray], bool]) -> ThreeArrays: 216 | """ Generate a batch of real and fake/augmented sentence pairs. 217 | 218 | :param X: the complete X array 219 | :param y: the complete y array 220 | :param indices: the indices of this batch 221 | :param alternatives: a dictionary (index -> sequence of alternatives) providing fake alternatives for each index 222 | :param split_condition: a condition determining if the sentence should be used 223 | :return: A batch `X_1, X_2, y` 224 | """ 225 | 226 | indices = [i for i in indices if split_condition(X[i], y[i])] 227 | real_sentences = [X[i] for i in indices] 228 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices] 229 | 230 | X_1: List[np.ndarray] = [] 231 | X_2: List[np.ndarray] = [] 232 | y = [] 233 | for real, fake in zip(real_sentences, fake_sentences): 234 | X_1 += [real, real] 235 | X_2 += [real, fake] 236 | y += [1, 0] 237 | 238 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y) 239 | --------------------------------------------------------------------------------
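
A stripped-down illustration of the idea behind the StratifiedSampling generator above: each batch draws half of its indices from the partition that satisfies the split condition (for example, PHI-containing sentences) and half from the rest, and an epoch ends when the shorter partition is exhausted. This is an editorial sketch, not the class itself; the function name `stratified_batches` and the boolean-label setup are invented for the example.

```
# Editorial sketch of stratified batching, not deid.data.batch.StratifiedSampling.
import numpy as np

def stratified_batches(labels, batch_size, seed=0):
    rng = np.random.RandomState(seed)
    pos = rng.permutation(np.flatnonzero(labels))    # indices meeting the split condition
    neg = rng.permutation(np.flatnonzero(~labels))   # everything else
    half = batch_size // 2
    n_batches = min(len(pos), len(neg)) // half      # epoch length follows the shorter partition
    for b in range(n_batches):
        yield np.concatenate([pos[b * half:(b + 1) * half],
                              neg[b * half:(b + 1) * half]])

labels = np.array([True] * 4 + [False] * 20)   # e.g. 4 PHI sentences, 20 without
for batch in stratified_batches(labels, batch_size=4):
    print(batch)   # each batch holds two "positive" and two "negative" indices
```

In the experiments above this style of sampling appears to be used to keep PHI and non-PHI sentences balanced within each batch, even though PHI-bearing sentences are a minority of the corpus.
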