├── deid
│ ├── tools
│ │ ├── __init__.py
│ │ ├── i2b2
│ │ │ ├── __init__.py
│ │ │ ├── requirements.txt
│ │ │ └── README.md
│ │ ├── fix_xml_texts.py
│ │ ├── fix_180-03.py
│ │ ├── find_differences.py
│ │ ├── dataset.py
│ │ ├── i2b2_xml_to_csv_tests.py
│ │ ├── embeddings.py
│ │ ├── find_good_amount_of_noise.py
│ │ ├── config.py
│ │ └── i2b2_xml_to_csv.py
│ ├── fixtures
│ │ ├── deid_work
│ │ │ └── .gitkeep
│ │ └── deid_resources
│ │   ├── config
│ │   │ └── generated
│ │   │   └── .gitkeep
│ │   └── i2b2_data
│ │     ├── train
│ │     │ ├── 999-99.txt
│ │     │ └── 999-99.csv
│ │     └── train_xml
│ │       ├── 999-96.xml
│ │       ├── 999-97.xml
│ │       ├── 999-98.xml
│ │       └── 999-99.xml
│ ├── model
│ │ ├── losses
│ │ │ ├── __init__.py
│ │ │ ├── discriminator.py
│ │ │ └── crf.py
│ │ ├── layers
│ │ │ ├── __init__.py
│ │ │ ├── gradient_reversal.py
│ │ │ └── noise.py
│ │ ├── __init__.py
│ │ ├── optimizer.py
│ │ ├── deidentifier.py
│ │ ├── adversary.py
│ │ ├── representer.py
│ │ └── adversarial.py
│ ├── data
│ │ ├── types.py
│ │ ├── dataset_tests.py
│ │ ├── augment
│ │ │ ├── __init__.py
│ │ │ ├── strategy_tests.py
│ │ │ ├── get.py
│ │ │ ├── augment_tests.py
│ │ │ ├── strategy.py
│ │ │ └── augment.py
│ │ ├── __init__.py
│ │ ├── class_weight.py
│ │ ├── feature_tests.py
│ │ ├── token.py
│ │ ├── feature.py
│ │ ├── util.py
│ │ ├── tokenizer_tests.py
│ │ ├── batch_tests.py
│ │ ├── postprocess.py
│ │ ├── tokenizer.py
│ │ ├── read.py
│ │ └── batch.py
│ ├── experiment
│ │ ├── run.py
│ │ ├── random.py
│ │ ├── __main__.py
│ │ ├── __init__.py
│ │ ├── config_tests.py
│ │ ├── directory.py
│ │ ├── get.py
│ │ ├── evaluation_tests.py
│ │ ├── config.py
│ │ ├── alternating_evaluation.py
│ │ ├── dummy.py
│ │ ├── fake_sentences.py
│ │ ├── evaluation.py
│ │ ├── mtn_evaluation.py
│ │ └── basic.py
│ ├── __init__.py
│ ├── embeddings
│ │ ├── dummy.py
│ │ ├── util.py
│ │ ├── __init__.py
│ │ ├── util_tests.py
│ │ ├── noise.py
│ │ ├── embeddings.py
│ │ ├── glove.py
│ │ ├── elmo.py
│ │ ├── fasttext.py
│ │ └── matrix.py
│ ├── config_template.yaml.example
│ ├── config_template_alternating.yaml.example
│ └── env.py
├── adversary1.png
├── adversary2.png
├── architecture.png
├── .gitmodules
├── scripts
│ ├── xml_to_csv
│ └── queue
├── environment.yml
├── .travis.yml
├── LICENSE
├── .gitignore
└── README.md
/deid/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_work/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/config/generated/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/deid/tools/i2b2/__init__.py:
--------------------------------------------------------------------------------
1 | # Modified by Max Friedrich, 2018
2 |
--------------------------------------------------------------------------------
/deid/tools/i2b2/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml==3.3.1
2 | numpy==1.8.0
3 |
--------------------------------------------------------------------------------
/adversary1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary1.png
--------------------------------------------------------------------------------
/adversary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary2.png
--------------------------------------------------------------------------------
/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/architecture.png
--------------------------------------------------------------------------------
/deid/model/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from .crf import crf_loss
2 | from .discriminator import discriminator_loss
3 |
--------------------------------------------------------------------------------
/deid/model/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .gradient_reversal import GradientReversal
2 | from .noise import Noise, AdditiveNoise, MultiplicativeNoise
3 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train/999-99.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Record date: 2018-06-15
5 |
6 | Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany.
7 |
8 |
--------------------------------------------------------------------------------
/deid/data/types.py:
--------------------------------------------------------------------------------
1 | from typing import Sequence, Union
2 | import numpy as np
3 | from .token import Token
4 |
5 | Sentence = Sequence[Union[Token, np.ndarray]]
6 | SentenceLabels = Sequence[Sequence[int]]
7 |
--------------------------------------------------------------------------------
/deid/data/dataset_tests.py:
--------------------------------------------------------------------------------
1 | from . import TrainingSet
2 | from ..embeddings import DummyEmbeddings
3 |
4 |
5 | def test_training_set():
6 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings())
7 | assert len(tr.X) == len(tr.y)
8 |
--------------------------------------------------------------------------------
/deid/data/augment/__init__.py:
--------------------------------------------------------------------------------
1 | from .strategy import AugmentStrategy, AugmentEmbedding, AugmentWord, Zeros, RandomEmbedding, RandomDigits, \
2 | AdditiveNoise, MoveToNeighbor
3 | from .get import get
4 | from .augment import Augment, AugmentedSentence
5 |
6 |
7 |
--------------------------------------------------------------------------------
/deid/experiment/run.py:
--------------------------------------------------------------------------------
1 | from . import get_config, get as get_experiment
2 |
3 |
4 | def run_experiment(config_name_or_path):
5 | config = get_config(config_name_or_path)
6 | experiment = get_experiment(config['experiment']['type'])
7 | experiment(config)
8 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "dependencies/keras-contrib"]
2 | path = dependencies/keras-contrib
3 | url = git://github.com/keras-team/keras-contrib
4 | [submodule "dependencies/fastText"]
5 | path = dependencies/fastText
6 | url = git://github.com/facebookresearch/fastText.git
7 |
--------------------------------------------------------------------------------
/deid/experiment/random.py:
--------------------------------------------------------------------------------
1 | def setup_random():
2 | import os
3 | import random
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 |
8 | os.environ['PYTHONHASHSEED'] = '0'
9 | np.random.seed(1)
10 | random.seed(2)
11 | tf.set_random_seed(3)
12 |
--------------------------------------------------------------------------------
/deid/__init__.py:
--------------------------------------------------------------------------------
1 | # https://stackoverflow.com/a/40846742/2623170
2 | # https://github.com/numpy/numpy/pull/432/commits/170ed4e3
3 | import warnings
4 |
5 | warnings.filterwarnings("ignore", message="numpy.dtype size changed")
6 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
7 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train_xml/999-96.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/deid/experiment/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from . import run_experiment
4 |
5 |
6 | def main():
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('config', help='the config filename')
9 | args = parser.parse_args()
10 |
11 | run_experiment(args.config)
12 |
13 |
14 | if __name__ == '__main__':
15 | main()
16 |
--------------------------------------------------------------------------------
/deid/model/__init__.py:
--------------------------------------------------------------------------------
1 | def get(identifier):
2 | if identifier == 'lstm':
3 | return make_lstm_crf
4 | elif identifier.startswith('adversarial'):
5 | return AdversarialModel
6 | else:
7 | raise ValueError('unknown identifier:', identifier)
8 |
9 |
10 | from .adversarial import AdversarialModel
11 | from .deidentifier import make_lstm_crf
12 |
--------------------------------------------------------------------------------
/deid/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .token import Token, TOKEN_TYPE, BINARY_LABEL
2 | from .tokenizer import tokenize
3 | from .types import Sentence, SentenceLabels
4 | from .batch import BatchGenerator, StratifiedSampling, BatchGeneratorWithExtraFeatures, \
5 | StratifiedSamplingWithExtraFeatures, fake_sentences_batch
6 | from .dataset import DataSet, TrainingSet, ValidationSet, TestSet, is_phi_sentence
7 | from .postprocess import prediction_to_xml
8 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train_xml/999-97.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/deid/model/optimizer.py:
--------------------------------------------------------------------------------
1 | from keras.optimizers import Adam, Nadam, RMSprop, SGD
2 |
3 | # We want to pass custom args to the adversaries. Passing a Keras optimizer string to the compile method won't let us
4 | # select custom args, so we make a subset of optimizers available by string keys here.
5 | optimizers = {'adam': Adam, 'nadam': Nadam, 'rmsprop': RMSprop, 'sgd': SGD}
6 |
7 |
8 | def get(identifier):
9 | if identifier in optimizers.keys():
10 | return optimizers[identifier]
11 | raise ValueError(f'Unknown optimizer: {identifier}')
12 |
--------------------------------------------------------------------------------
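A usage sketch (not a file from the repository) of why this string-to-class mapping exists: the class is looked up by the config's `optimizer` key and then instantiated with the config's `optimizer_args`, which a plain Keras optimizer string passed to `compile` would not allow.

    from deid.model import optimizer

    opt_class = optimizer.get('adam')   # -> keras.optimizers.Adam
    opt = opt_class(clipnorm=1.0)       # custom args, e.g. optimizer_args from the config
    # model.compile(optimizer=opt, loss=...)  instead of  model.compile(optimizer='adam', loss=...)
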
/deid/experiment/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import get_config
2 | from .random import setup_random
3 | from .directory import experiment_directory
4 | from .evaluation import evaluate_deid_performance, DeidentificationEvaluationCallback
5 |
6 | from .basic import basic_experiment
7 | from .alternating import alternating_experiment
8 | from .alternating_evaluation import alternating_evaluation_experiment
9 | from .mtn_evaluation import mtn_evaluation_experiment
10 | from .fake_sentences import fake_sentences_experiment
11 |
12 | from .get import get
13 | from .run import run_experiment
14 |
--------------------------------------------------------------------------------
/deid/experiment/config_tests.py:
--------------------------------------------------------------------------------
1 | from .config import Config
2 |
3 |
4 | def example_config():
5 | return Config({'a': 0, 'b': 1, 'c': {'d': 2}})
6 |
7 |
8 | def test_config_behaves_like_a_dict():
9 | config = example_config()
10 | assert config['a'] == 0
11 | assert config['b'] == 1
12 | assert config['c']['d'] == 2
13 |
14 | config['c']['d'] = 3
15 | assert config['c']['d'] == 3
16 |
17 |
18 | def test_config_returns_none_for_missing_values():
19 | config = example_config()
20 | assert config['x'] is None
21 | assert config['c']['y'] is None
22 |
--------------------------------------------------------------------------------
/deid/data/class_weight.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 |
4 | def get(identifier):
5 | if identifier == 'balanced':
6 | return balanced
7 |
8 | raise ValueError(f'Unknown class weight: {identifier}')
9 |
10 |
11 | def balanced(output_size, y):
12 | y = list(itertools.chain.from_iterable([[label[0] for label in sent] for sent in y]))
13 |
14 | o_weight = len(y) / y.count(1)
15 | phi_weight = len(y) / (len(y) - y.count(1))
16 |
17 | class_weight = [0, o_weight]
18 | for i in range(2, output_size):
19 | class_weight.append(phi_weight)
20 |
21 | return class_weight
22 |
--------------------------------------------------------------------------------
/scripts/xml_to_csv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import sys
5 |
6 | ma_path = os.path.dirname(os.path.dirname(__file__))
7 | if os.path.abspath(ma_path) not in sys.path:
8 | sys.path.append(ma_path)
9 |
10 | from deid.env import env
11 |
12 | for t in ['test', 'train', 'validation']:
13 | print(f'Converting {t} xmls...')
14 | command = ' '.join(['python -m deid.tools.i2b2_xml_to_csv --check',
15 | f"{os.path.join(env.data_dir, t + '_xml')}",
16 | f"{os.path.join(env.data_dir, t)}"])
17 |
18 | print(command)
19 | os.system(command)
20 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train_xml/999-98.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: deid-training-data
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.6
7 | - libgcc
8 | - pip
9 | - cython
10 | - jupyter
11 | - pathlib
12 | - numpy==1.14.5
13 | - scipy
14 | - pandas
15 | - matplotlib
16 | - beautifulsoup4
17 | - nose
18 | - h5py
19 | - spacy==2.0.12
20 | - tqdm
21 | - scikit-learn
22 | - lxml
23 | - pylint
24 | - mypy
25 | - msgpack-python
26 | - pip:
27 | - keras==2.2.2
28 | - tensorflow==1.10.0 # or tensorflow-gpu
29 | - tensorflow-hub==0.1.1
30 | - terminaltables
31 | - pybind11
32 | - -e ./dependencies/keras-contrib
33 | - -e ./dependencies/fastText
34 |
--------------------------------------------------------------------------------
/deid/model/losses/discriminator.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.losses import binary_crossentropy
3 |
4 |
5 | def discriminator_loss(y_true, y_pred):
6 | """ Compares the actual binary crossentropy loss to the random guessing loss (0.6931..., accuracy 0.5) and returns
7 | the maximum. This is motivated by the fact that our adversarial discriminators should not be worse than random
8 | guessing, otherwise we could just flip every prediction and get a better discriminator.
9 | """
10 | loss = binary_crossentropy(y_true, y_pred)
11 | random_guessing = -K.log(0.5)
12 | return K.maximum(loss, random_guessing)
13 |
--------------------------------------------------------------------------------
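A small NumPy illustration of the clamping (not part of the repository; Keras' `binary_crossentropy` is replaced by a hand-rolled mean BCE for the sketch):

    import numpy as np

    def bce(y_true, y_pred):
        return float(-np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)))

    y_true = np.array([1., 0., 1., 0.])
    good = np.array([0.9, 0.1, 0.8, 0.2])   # discriminator better than random guessing
    bad = np.array([0.2, 0.8, 0.3, 0.7])    # discriminator worse than random guessing

    floor = -np.log(0.5)                     # 0.6931..., the loss of always predicting 0.5
    print(max(bce(y_true, good), floor))     # clamped up to 0.6931...
    print(max(bce(y_true, bad), floor))      # ~1.41, passes through unchanged
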
/deid/embeddings/dummy.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 |
3 | import numpy as np
4 |
5 | from . import Embeddings
6 |
7 |
8 | class DummyEmbeddings(Embeddings):
9 | @property
10 | def size(self):
11 | return 5
12 |
13 | @property
14 | def std(self):
15 | return 0.5
16 |
17 | def is_unknown(self, word: str) -> bool:
18 | return False
19 |
20 | def lookup(self, word: str):
21 | hashed = int(hashlib.sha256(word.encode('utf-8')).hexdigest(), 16)
22 | five_digits = [int(digit) for digit in str(hashed)[1:6]] # omitting a possible - at the first index
23 | return np.array([-0.5 if digit < 5 else 0.5 for digit in five_digits])
24 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train/999-99.csv:
--------------------------------------------------------------------------------
1 | text,type,start,end
2 | ,O,0,0
3 | ,O,0,0
4 | ,O,3,3
5 | Record,O,3,9
6 | date,O,10,14
7 | :,O,14,15
8 | 2018,B-DATE,16,20
9 | -,I-DATE,20,21
10 | 06,I-DATE,21,23
11 | -,I-DATE,23,24
12 | 15,I-DATE,24,26
13 | ,O,26,26
14 | ,O,28,28
15 | Max,B-PATIENT,28,31
16 | Friedrich,I-PATIENT,32,41
17 | is,O,42,44
18 | a,O,45,46
19 | 25,B-AGE,47,49
20 | -,O,49,50
21 | year,O,50,54
22 | -,O,54,55
23 | old,O,55,58
24 | Computer,O,59,67
25 | science,O,68,75
26 | student,O,76,83
27 | living,O,84,90
28 | in,O,91,93
29 | Hamburg,B-CITY,94,101
30 | ",",O,101,102
31 | Germany,B-COUNTRY,103,110
32 | .,O,110,111
33 | ,O,111,111
34 |
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/i2b2_data/train_xml/999-99.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # https://conda.io/docs/user-guide/tasks/use-conda-with-travis-ci.html
2 |
3 | sudo: required
4 | dist: trusty
5 | group: travis_latest
6 | language: python
7 | python:
8 | - '3.6'
9 | git:
10 | depth: false
11 | install:
12 | - sudo apt-get update
13 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
14 | - bash miniconda.sh -b -p $HOME/miniconda
15 | - export PATH="$HOME/miniconda/bin:$PATH"
16 | - hash -r
17 | - conda config --set always_yes yes --set changeps1 no
18 | - conda update -q conda
19 | - conda info -a
20 | - conda env create -q
21 | - source activate deid-training-data
22 | - conda list
23 | - python -m spacy download en
24 | script:
25 | - nosetests --with-doctest
26 |
--------------------------------------------------------------------------------
/deid/experiment/directory.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import socket
4 | from datetime import datetime
5 |
6 | from ..env import env
7 |
8 |
9 | def experiment_directory(name, config_path=None, work_dir=env.work_dir):
10 | """ Creates a directory for the experiment
11 |
12 | :param name: short experiment name, used as the prefix of the directory name
13 | :param config_path: optional path to a config file that is copied into the new directory
14 | :param work_dir: parent directory in which the experiment directory is created
15 | :return: the path of the created directory
16 | """
17 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S')
18 | directory = os.path.join(work_dir, name + '_' + socket.gethostname() + '_' + date_str)
19 | if env.experiment_dir_postfix is not None:
20 | directory += '_' + env.experiment_dir_postfix
21 | os.mkdir(directory)
22 | if config_path is not None:
23 | shutil.copy2(config_path, directory)
24 |
25 | return directory
26 |
--------------------------------------------------------------------------------
/scripts/queue:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | if [ -z "$DEID_CONFIG_DIR" ]; then
6 | echo "Please set the DEID_CONFIG_DIR variable to the config directory."
7 | exit 1
8 | fi
9 |
10 | TODO="${DEID_CONFIG_DIR}/todo"
11 | IN_PROGRESS="${DEID_CONFIG_DIR}/in_progress"
12 | DONE="${DEID_CONFIG_DIR}/done"
13 | EXECUTED=0
14 | STOP=0
15 |
16 | # process substitution keeps the loop in the current shell so EXECUTED survives the loop
17 | while IFS= read -r -d '' f; do
18 | config="$(basename "$f")";
19 | echo "Next config is ${f}, basename ${config}";
20 | mv "${f}" "${IN_PROGRESS}";
21 | python3 -m deid.experiment "${IN_PROGRESS}/${config}";
22 | echo "Moving to done";
23 | mv "${IN_PROGRESS}/${config}" "$DONE";
24 | echo "OK";
25 | EXECUTED=$((EXECUTED + 1));
26 | done < <(find "${TODO}" -type f -name '*.yaml' -print0)
27 | echo "Executed ${EXECUTED} configs."
28 |
--------------------------------------------------------------------------------
/deid/experiment/get.py:
--------------------------------------------------------------------------------
1 | from .basic import basic_experiment
2 | from .alternating import alternating_experiment
3 | from .alternating_evaluation import alternating_evaluation_experiment
4 | from .mtn_evaluation import mtn_evaluation_experiment
5 | from .fake_sentences import fake_sentences_experiment
6 |
7 |
8 | def get(identifier):
9 | if identifier == 'basic':
10 | return basic_experiment
11 | elif identifier == 'alternating':
12 | return alternating_experiment
13 | elif identifier == 'alternating_evaluation':
14 | return alternating_evaluation_experiment
15 | elif identifier == 'mtn_evaluation':
16 | return mtn_evaluation_experiment
17 | elif identifier == 'fake_sentences':
18 | return fake_sentences_experiment
19 | else:
20 | raise ValueError('unknown identifier:', identifier)
21 |
--------------------------------------------------------------------------------
/deid/data/augment/strategy_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .strategy import NeighborsCache
4 | from ...embeddings import EmbeddingSimilarity
5 |
6 |
7 | def test_neighbors_cache():
8 | cache = NeighborsCache('selected')
9 | assert cache.lookup('test') is None
10 |
11 | neighbors = [EmbeddingSimilarity(1, 'tests', 0.95, np.zeros(10)),
12 | EmbeddingSimilarity(1, 'testing', 0.93, np.zeros(10)),
13 | EmbeddingSimilarity(1, 'tester', 0.91, np.zeros(10))]
14 |
15 | cache.store('test', neighbors, neighbors[0])
16 | assert cache.lookup('test') == cache.lookup('test') == neighbors[0]
17 |
18 | cache = NeighborsCache('neighbors')
19 | assert cache.lookup('test') is None
20 |
21 | cache.store('test', neighbors, neighbors[0])
22 | assert cache.lookup('test') in neighbors
23 | assert cache.lookup('test') in neighbors
24 |
--------------------------------------------------------------------------------
/deid/experiment/evaluation_tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 |
4 | from . import evaluation
5 | from ..env import Test
6 | from .evaluation import _run_official_evaluation
7 |
8 | config = evaluation.env = Test()
9 |
10 |
11 | def test_run_official_evaluation():
12 | with tempfile.NamedTemporaryFile() as f:
13 | # testing the fixtures train_xml directory against itself, resulting in perfect score
14 | results = _run_official_evaluation(predictions_dir=os.path.join(config.data_dir, 'train_xml'),
15 | test_set='train',
16 | output_file=f.name)
17 | assert len(f.read().strip()) != 0 # something was written to the evaluation file
18 |
19 | assert results['Token']['precision'] == 1.0
20 | assert results['Token']['recall'] == 1.0
21 | assert results['Token']['f1'] == 1.0
22 |
--------------------------------------------------------------------------------
/deid/embeddings/util.py:
--------------------------------------------------------------------------------
1 | from typing import Sequence, Tuple, List, Any
2 |
3 |
4 | def pad_string_sequences(seq: Sequence[Sequence[str]]) -> Tuple[List[List[str]], Sequence[int]]:
5 | """ Like keras.preprocessing.sequence.pad_string_sequences but for strings, and it also returns seq_length. """
6 |
7 | seq_length = [len(item) for item in seq]
8 | maxlen = max(seq_length)
9 |
10 | result = []
11 | for i, item in enumerate(seq):
12 | result.append(list(item) + [''] * (maxlen - seq_length[i]))
13 | return result, seq_length
14 |
15 |
16 | def unpad_sequences(padded: Sequence[Any], seq_length: Sequence[int]):
17 | """ The reverse operation of `keras.preprocessing.sequence.pad_sequences`. """
18 | assert len(padded) == len(seq_length)
19 | return [padded[i][:seq_length[i]] for i in range(len(padded))]
20 |
21 |
22 | # https://stackoverflow.com/a/434328/2623170
23 | def chunks(seq, size):
24 | return (seq[pos:pos + size] for pos in range(0, len(seq), size))
25 |
--------------------------------------------------------------------------------
/deid/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | from .embeddings import Embeddings, PrecomputedEmbeddings
2 | from .dummy import DummyEmbeddings
3 | from .elmo import ElmoEmbeddings, TensorFlowElmoEmbeddings, CachedElmoEmbeddings
4 | from .fasttext import FastTextEmbeddings, PreloadFastTextEmbeddings, CachedFastTextEmbeddings
5 | from .glove import GloveEmbeddings
6 | from .matrix import Matrix, EmbeddingSimilarity
7 | from .noise import Noise, GaussianNoise, DropoutNoise, NoiseWrapper
8 |
9 |
10 | def get(identifier, *args, **kwargs):
11 | if identifier == 'dummy':
12 | return DummyEmbeddings()
13 | elif identifier == 'elmo':
14 | return ElmoEmbeddings(*args)
15 | elif identifier == 'elmo-tf':
16 | return TensorFlowElmoEmbeddings(*args, **kwargs)
17 | elif identifier == 'glove':
18 | return GloveEmbeddings(*args, **kwargs)
19 | elif identifier == 'fasttext':
20 | return FastTextEmbeddings(*args, **kwargs)
21 | else:
22 | raise ValueError('unknown identifier:', identifier)
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Max Friedrich
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/deid/embeddings/util_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .util import pad_string_sequences, unpad_sequences
4 | from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences
5 |
6 |
7 | def test_pad_string_sequences():
8 | test_seq = [['apple', 'banana', 'cherry'], ['d', 'e', 'f', 'g'], ['h', 'i', 'j', 'k', 'l', 'q'], ['r']]
9 | padded, seq_length = pad_string_sequences(test_seq)
10 | assert len(padded) == 4
11 | assert len(padded[0]) == 6
12 | assert padded[0][0] == 'apple'
13 | assert padded[0][3] == ''
14 | assert seq_length == [3, 4, 6, 1]
15 |
16 |
17 | def test_unpad_sequences():
18 | test_seq = [['apple', 'banana', 'cherry', '', ''], ['d', 'e', 'f', 'g', 'h'], ['i', '', '', '', '', ]]
19 | seq = unpad_sequences(test_seq, [3, 5, 1])
20 | assert len(seq) == 3
21 | assert seq[0] == ['apple', 'banana', 'cherry']
22 |
23 |
24 | def test_is_reverse_operation():
25 | test_seq = [[0, 1, 2, 3], [4], [5, 6]]
26 | padded = keras_pad_sequences(test_seq, padding='post')
27 | unpadded = unpad_sequences(padded, [4, 1, 2])
28 | assert [list(item) for item in unpadded] == test_seq
29 |
--------------------------------------------------------------------------------
/deid/data/augment/get.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from .strategy import AugmentStrategy, Zeros, RandomEmbedding, RandomDigits, AdditiveNoise, MoveToNeighbor
4 |
5 |
6 | def get(identifier: Optional[str], *args, **kwargs) -> Optional[AugmentStrategy]:
7 | if identifier is None:
8 | return None
9 | elif identifier == 'zeros':
10 | return Zeros()
11 | elif identifier.startswith('random_embedding'):
12 | if '-' in identifier:
13 | scale = float(identifier.split('-')[1])
14 | return RandomEmbedding(scale, l2_normalize='l2' in identifier)
15 | else:
16 | return RandomEmbedding()
17 | elif identifier == 'random_digits':
18 | return RandomDigits(*args, **kwargs)
19 | elif identifier.startswith('additive_noise'):
20 | scale = float(identifier.split('-')[1])
21 | return AdditiveNoise(scale)
22 | elif identifier.startswith('move_to_neighbor'):
23 | n_neighbors = int(identifier.split('-')[1])
24 | return MoveToNeighbor(n_neighbors=n_neighbors, *args, **kwargs) # type: ignore
25 | else:
26 | raise ValueError('unknown identifier:', identifier)
27 |
--------------------------------------------------------------------------------
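The identifier strings mirror the `augment.strategy` values in the config templates; a sketch of the mapping (any extra positional or keyword arguments are forwarded to the strategy constructors):

    from deid.data.augment import get

    get(None)                      # no augmentation
    get('zeros')                   # Zeros()
    get('random_embedding-0.5')    # RandomEmbedding(scale=0.5); append '-l2' for l2-normalized vectors
    get('additive_noise-0.1')      # AdditiveNoise(scale=0.1)
    get('move_to_neighbor-50')     # MoveToNeighbor(n_neighbors=50), as used in the config templates
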
/deid/config_template.yaml.example:
--------------------------------------------------------------------------------
1 | ---
2 | experiment:
3 | type: basic # see config_template_alternating.yaml.example for alternating experiment
4 | binary_classification: false
5 | hipaa_only: false
6 | model: lstm # or adversarial
7 | embeddings: fasttext # or glove, elmo
8 | train_set: train
9 | validation_set: validation
10 | model_args:
11 | hidden_size: # add options for multiple runs like this
12 | choice:
13 | - 64
14 | - 128
15 | - 256
16 | # ...
17 | num_hidden:
18 | choice:
19 | - 1
20 | - 2
21 | input_dropout:
22 | choice:
23 | - 0.
24 | - 0.05
25 | - 0.1
26 | - 0.25
27 | - 0.5
28 | after_hidden_dropout: 0.5
29 | recurrent_dropout: 0.25
30 | training:
31 | optimizer: adam
32 | optimizer_args:
33 | clipnorm: 1.
34 | train_epochs: 10
35 | early_stopping_patience: 2
36 | batch_size: 32
37 | i2b2_evaluate_every: 2
38 | augment:
39 | strategy: move_to_neighbor-50 # or additive_noise-0.1, etc.
40 | digit_strategy: random_digits
41 | include_original: false
42 | augment_args:
43 | augment_all: false
44 | n_augmentations: 10
45 | test:
46 | run_test: false
47 |
--------------------------------------------------------------------------------
/deid/data/feature_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .feature import CaseFeature, apply_features
4 | from .token import Token
5 |
6 |
7 | def test_case():
8 | feature = CaseFeature()
9 | assert np.all(feature.apply(Token.with_text('1234')) == np.array([0, 1, 0, 0, 0, 0, 0])) # all numeric
10 | assert np.all(feature.apply(Token.with_text('123a')) == np.array([0, 0, 1, 0, 0, 0, 0])) # mainly numeric
11 | assert np.all(feature.apply(Token.with_text('ok4y')) == np.array([0, 0, 0, 1, 0, 0, 0])) # all lower
12 | assert np.all(feature.apply(Token.with_text('OKAY')) == np.array([0, 0, 0, 0, 1, 0, 0])) # all upper
13 | # ...
14 |
15 |
16 | def test_apply_features():
17 | features = [CaseFeature()]
18 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')])
19 | assert len(case_features) == 2
20 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0]))
21 |
22 | features = [CaseFeature(), CaseFeature()]
23 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')])
24 | print(case_features)
25 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
26 |
--------------------------------------------------------------------------------
/deid/tools/fix_xml_texts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | # noinspection PyProtectedMember
5 | from bs4 import BeautifulSoup, CData
6 |
7 |
8 | def fix_xml(pred_xml, gold_xml):
9 | print(pred_xml, gold_xml)
10 | gold_soup = BeautifulSoup(open(gold_xml, 'r').read(), features='xml')
11 | gold_text = gold_soup.find('TEXT').string
12 |
13 | print(gold_text.count('\n'))
14 |
15 | pred_soup = BeautifulSoup(open(pred_xml, 'r').read(), features='xml')
16 | pred_soup.find('TEXT').string = CData(gold_text)
17 | with open(pred_xml, 'w') as f:
18 | f.write(str(pred_soup))
19 |
20 |
21 | def main():
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('pred',
24 | help='the predictions file or directory')
25 | parser.add_argument('gold', help='the gold file or directory')
26 |
27 | args = parser.parse_args()
28 |
29 | if os.path.isdir(args.pred):
30 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]:
31 | pred_xml = os.path.join(args.pred, xml)
32 | gold_xml = os.path.join(args.gold, xml)
33 |
34 | fix_xml(pred_xml, gold_xml)
35 | else:
36 | fix_xml(args.pred, args.gold)
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/deid/tools/fix_180-03.py:
--------------------------------------------------------------------------------
1 | # Fixes a shift in start/end coordinates that is caused by the special characters in "O’neil’s Court"
2 |
3 | import argparse
4 | import os
5 | import re
6 |
7 |
8 | def fixed_contents(contents):
9 | result = ''
10 | edit_here = False
11 | increment_start = False
12 | for line in contents:
13 | if ' 0:
21 | print(' false positives:')
22 | for fp in false_positives:
23 | print(' -', fp)
24 |
25 | false_negatives = sets[1] - sets[0]
26 | if len(false_negatives) > 0:
27 | print(' false negatives:')
28 | for fn in false_negatives:
29 | print(' -', fn)
30 |
31 | print('-' * 100)
32 |
33 |
34 | def main():
35 | parser = argparse.ArgumentParser()
36 | parser.add_argument('pred',
37 | help='the predictions file or directory')
38 | parser.add_argument('gold', help='the gold file or directory')
39 |
40 | args = parser.parse_args()
41 |
42 | if os.path.isdir(args.pred):
43 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]:
44 | pred_xml = os.path.join(args.pred, xml)
45 | gold_xml = os.path.join(args.gold, xml)
46 |
47 | print_differences(pred_xml, gold_xml)
48 | else:
49 | print_differences(args.pred, args.gold)
50 |
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
/deid/data/token.py:
--------------------------------------------------------------------------------
1 | from typing import NamedTuple
2 | from ..tools.i2b2.classes import PHITrackEvaluation
3 |
4 |
5 | class Token(NamedTuple):
6 | text: str
7 | type: str
8 | start: int
9 | end: int
10 |
11 | @classmethod
12 | def with_text(cls, text, label='O'):
13 | """ Mostly useful for unit tests """
14 | return Token(text, label, 0, 0)
15 |
16 |
17 | # noinspection SpellCheckingInspection
18 | TOKEN_TYPE = {
19 | 'PATIENT': 'NAME',
20 | 'DOCTOR': 'NAME',
21 | 'USERNAME': 'NAME',
22 | 'PROFESSION': 'PROFESSION',
23 | 'ROOM': 'LOCATION',
24 | 'DEPARTMENT': 'LOCATION',
25 | 'HOSPITAL': 'LOCATION',
26 | 'ORGANIZATION': 'LOCATION',
27 | 'STREET': 'LOCATION',
28 | 'CITY': 'LOCATION',
29 | 'STATE': 'LOCATION',
30 | 'COUNTRY': 'LOCATION',
31 | 'ZIP': 'LOCATION',
32 | 'LOCATION-OTHER': 'LOCATION',
33 | 'AGE': 'AGE',
34 | 'DATE': 'DATE',
35 | 'PHONE': 'CONTACT',
36 | 'FAX': 'CONTACT',
37 | 'EMAIL': 'CONTACT',
38 | 'URL': 'CONTACT',
39 | 'IPADDR': 'CONTACT',
40 | 'SSN': 'ID',
41 | 'MEDICALRECORD': 'ID',
42 | 'HEALTHPLAN': 'ID',
43 | 'ACCOUNT': 'ID',
44 | 'LICENSE': 'ID',
45 | 'VEHICLE': 'ID',
46 | 'DEVICE': 'ID',
47 | 'BIOID': 'ID',
48 | 'IDNUM': 'ID',
49 | 'OTHER': 'OTHER'
50 | }
51 |
52 | HIPAA_TOKEN_TYPE = {tag: type for tag, type in TOKEN_TYPE.items() if any([n_re.match(type) and t_re.match(tag)
53 | for n_re, t_re in
54 | PHITrackEvaluation.HIPAA_regexes])}
55 |
56 | BINARY_LABEL = 'PATIENT'
57 |
--------------------------------------------------------------------------------
/deid/config_template_alternating.yaml.example:
--------------------------------------------------------------------------------
1 | ---
2 | experiment:
3 | type: alternating
4 | binary_classification: false
5 | hipaa_only: false
6 | model: adversarial
7 | embeddings:
8 | choice:
9 | - fasttext
10 | - glove
11 | extra_features:
12 | - case
13 | train_set: train
14 | validation_set: validation
15 | model_args:
16 | representation_type: lstm
17 | representation_size:
18 | choice:
19 | - 50
20 | - 100
21 | - 300
22 | representation_args:
23 | single_stddev: false
24 | adversaries:
25 | - discriminate-representations
26 | - discriminate-representation-embedding-pair
27 | adversary_args:
28 | input_dropout: 0.0
29 | lstm_size: 300
30 | recurrent_dropout: 0.0
31 | reverse_gradient: false
32 | deidentifier_args:
33 | hidden_size: 128
34 | num_hidden: 2
35 | input_dropout: 0.1
36 | after_hidden_dropout: 0.5
37 | recurrent_dropout: 0.5
38 | use_crf: true
39 | training:
40 | optimizer: nadam
41 | optimizer_args:
42 | clipnorm: 1.
43 | pretrain_deidentifier_epochs: 20
44 | pretrain_adversary_epochs: 20
45 | train_epochs: 40
46 | early_stopping_patience: 10
47 | batch_size: 32
48 | batch_size_compound: 0
49 | i2b2_evaluate_every: 100
50 | class_weight: balanced
51 | augment:
52 | strategy:
53 | choice:
54 | - move_to_neighbor-5
55 | - move_to_neighbor-10
56 | - move_to_neighbor-20
57 | - move_to_neighbor-50
58 | - move_to_neighbor-100
59 | - move_to_neighbor-200
60 | - move_to_neighbor-500
61 | augment_args:
62 | n_augmentations: 10
63 | augment_all: false
64 | augment_max: 1
65 | test:
66 | run_test: true
67 |
--------------------------------------------------------------------------------
/deid/model/layers/gradient_reversal.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/keras-team/keras/pull/4031
2 |
3 | import tensorflow as tf
4 | from keras import backend as K
5 | from keras.engine import Layer
6 |
7 |
8 | def reverse_gradient(X, hp_lambda):
9 | """Flips the sign of the incoming gradient during training."""
10 | try:
11 | reverse_gradient.num_calls += 1
12 | except AttributeError:
13 | reverse_gradient.num_calls = 1
14 |
15 | grad_name = "GradientReversal%d" % reverse_gradient.num_calls
16 |
17 | @tf.RegisterGradient(grad_name)
18 | def _flip_gradients(_, grad):
19 | return [tf.negative(grad) * hp_lambda]
20 |
21 | g = K.get_session().graph
22 | with g.gradient_override_map({'Identity': grad_name}):
23 | y = tf.identity(X)
24 |
25 | return y
26 |
27 |
28 | class GradientReversal(Layer):
29 | """Flip the sign of gradient during training."""
30 |
31 | def __init__(self, hp_lambda=1.0, **kwargs):
32 | super(GradientReversal, self).__init__(**kwargs)
33 | assert hp_lambda > 0, f'hp_lambda is {hp_lambda} -- it should be > 0 to actually flip the gradient'
34 | self.hp_lambda = hp_lambda
35 | self.supports_masking = False
36 |
37 | def build(self, input_shape):
38 | self.trainable_weights = []
39 |
40 | def call(self, x, mask=None):
41 | return reverse_gradient(x, self.hp_lambda)
42 |
43 | def compute_output_shape(self, input_shape):
44 | return input_shape
45 |
46 | def get_config(self):
47 | config = {'hp_lambda': self.hp_lambda}
48 | base_config = super(GradientReversal, self).get_config()
49 | return {**base_config, **config}
50 |
--------------------------------------------------------------------------------
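A minimal Keras sketch (illustrative only, not necessarily how `adversarial.py` wires it) of placing the layer between a shared representation and an adversary head:

    from keras.layers import Dense, Input
    from keras.models import Model

    from deid.model.layers import GradientReversal

    inputs = Input(shape=(100,))
    representation = Dense(50, activation='tanh')(inputs)
    flipped = GradientReversal(hp_lambda=1.0)(representation)  # identity forward, negated gradient backward
    adversary = Dense(1, activation='sigmoid')(flipped)

    model = Model(inputs, adversary)  # training the adversary now pushes the representation the other way
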
/deid/embeddings/noise.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from . import Embeddings
4 |
5 |
6 | class Noise:
7 | def noise(self, size: int) -> np.ndarray:
8 | raise NotImplementedError
9 |
10 |
11 | class GaussianNoise(Noise):
12 | def __init__(self, scale: float, loc=0., clip=None) -> None:
13 | self.loc = loc
14 | self.scale = scale
15 | self.clip = clip
16 |
17 | def noise(self, size):
18 | result = np.random.normal(self.loc, self.scale, size)
19 | if self.clip is not None:
20 | result = np.clip(result, self.clip[0], self.clip[1])
21 | return result
22 |
23 |
24 | class DropoutNoise(Noise):
25 | def __init__(self, dropout_prob) -> None:
26 | self.dropout_prob = dropout_prob
27 |
28 | def noise(self, size):
29 | return np.random.choice(2, size, p=[self.dropout_prob, 1 - self.dropout_prob])
30 |
31 |
32 | class NoiseWrapper(Embeddings):
33 | def __init__(self, embeddings: Embeddings, op, noise: Noise) -> None:
34 | self.wrapped_embeddings = embeddings
35 | self.noise = noise
36 |
37 | if type(op) == str:
38 | if op == 'add' or op == '+':
39 | self.op = lambda x, y: x + y
40 | elif op == 'mul' or op == '*':
41 | self.op = lambda x, y: x * y
42 | else:
43 | raise ValueError(f'Unrecognized op: {op}')
44 | else:
45 | self.op = op
46 |
47 | @property
48 | def size(self):
49 | return self.wrapped_embeddings.size
50 |
51 | def lookup(self, word):
52 | return self.op(self.wrapped_embeddings.lookup(word), self.noise.noise(self.size))
53 |
54 | def __str__(self):
55 | return f'<{self.__class__.__name__} wrapper of {self.wrapped_embeddings} {vars(self)}>'
56 |
--------------------------------------------------------------------------------
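A small composition sketch (not from the repository) that adds Gaussian noise to every lookup of the dummy embeddings:

    from deid.embeddings import DummyEmbeddings, GaussianNoise, NoiseWrapper

    base = DummyEmbeddings()
    noisy = NoiseWrapper(base, op='add', noise=GaussianNoise(scale=0.1))

    vector = noisy.lookup('patient')   # base vector plus N(0, 0.1) noise
    assert vector.shape == (base.size,)
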
/deid/experiment/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import yaml
5 |
6 | from ..env import env
7 |
8 | config_dir = os.path.join(env.resources_dir, 'config')
9 |
10 |
11 | class Config(dict):
12 | """ A dict that returns None for missing items instead of raising an exception, including for child dicts """
13 |
14 | def __init__(self, *args, **kwargs):
15 | super().__init__(*args, **kwargs)
16 | for k, v in self.items():
17 | if k == 'choice':
18 | raise ValueError('This is a config template, not an experiment config. Please generate configs from '
19 | 'it with python -m deid.tools.config')
20 | # please don't put a dict into itself (can't happen when importing from yaml anyway)
21 | if isinstance(v, dict):
22 | self[k] = Config(v)
23 |
24 | def __getitem__(self, key):
25 | if key.endswith('_args'):
26 | return self.get(key, {})
27 | return self.get(key)
28 |
29 |
30 | def get_config(name):
31 | if os.path.isfile(name):
32 | return load_config_yaml(name)
33 |
34 | for parent in [config_dir, os.path.join(config_dir, 'generated')]:
35 | filename = os.path.join(parent, name)
36 | if os.path.isfile(filename):
37 | return load_config_yaml(filename)
38 |
39 | filename = filename + '.yaml'
40 | if os.path.isfile(filename):
41 | return load_config_yaml(filename)
42 |
43 | raise ValueError(f'Could not locate config "{name}" in config dir')
44 |
45 |
46 | def load_config_yaml(path):
47 | config = Config(yaml.safe_load(open(path)))
48 | config['name'] = '.'.join(os.path.basename(path).split('.')[:-1])
49 | config['path'] = path
50 | sys.stderr.write(f"Using {config['name']} config.\n")
51 | return config
52 |
--------------------------------------------------------------------------------
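A usage sketch (not part of the repository) of the two lookup conveniences: missing keys read as `None`, and `*_args` keys default to an empty dict so they can be splatted into constructors:

    from deid.experiment.config import Config

    config = Config({'training': {'optimizer': 'adam'}})

    assert config['experiment'] is None                  # missing key -> None instead of KeyError
    assert config['training']['optimizer_args'] == {}    # missing *_args key -> {}
    # e.g. optimizer_class(**config['training']['optimizer_args']) is always safe
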
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | temp
3 | deid/temp
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # celery beat schedule file
88 | celerybeat-schedule
89 |
90 | # SageMath parsed files
91 | *.sage.py
92 |
93 | # Environments
94 | .env
95 | .venv
96 | env/
97 | venv/
98 | ENV/
99 | env.bak/
100 | venv.bak/
101 |
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 |
106 | # Rope project settings
107 | .ropeproject
108 |
109 | # mkdocs documentation
110 | /site
111 |
112 | # mypy
113 | .mypy_cache/
114 | .dmypy.json
115 | dmypy.json
116 |
117 | # Pyre type checker
118 | .pyre/
119 |
--------------------------------------------------------------------------------
/deid/env.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from typing import Optional
4 |
5 | deid_dir = os.path.dirname(__file__)
6 |
7 |
8 | # Defining the attributes as static variables isn't super smart as they are all created even if the config is not used.
9 | # This means os.environ['SOME_SPECIFIC_VAR'] will crash in other environments, so we have to use os.environ.get().
10 | class Environment:
11 | name: str
12 | deid_dir: str = deid_dir
13 | data_dir: str
14 | work_dir: str
15 | resources_dir: str
16 | results_dir: str
17 | limit_training_documents: Optional[int]
18 | limit_validation_documents: Optional[int]
19 | use_short_sentences: bool
20 | keras_verbose: int
21 | save_model: int
22 | embeddings_cache: bool
23 | experiment_dir_postfix: Optional[str] = None
24 |
25 | unk_token: str = ''
26 | sent_start = ''
27 | sent_end = ''
28 |
29 |
30 | class Development(Environment):
31 | name = 'development'
32 | work_dir = os.path.join(os.environ['HOME'], 'deid_work')
33 | resources_dir = os.path.join(os.environ['HOME'], 'deid_resources')
34 | data_dir = os.path.join(resources_dir, 'i2b2_data')
35 | limit_training_documents = None # set this to e.g. 10 for faster experimentation
36 | limit_validation_documents = None
37 | use_short_sentences = False
38 | keras_verbose = 1
39 | save_model = True
40 | embeddings_cache = True
41 |
42 |
43 | class Test(Environment):
44 | name = 'unit test'
45 | work_dir = os.path.join(deid_dir, 'fixtures', 'deid_work')
46 | resources_dir = os.path.join(deid_dir, 'fixtures', 'deid_resources')
47 | data_dir = os.path.join(resources_dir, 'i2b2_data')
48 | limit_training_documents = 4
49 | limit_validation_documents = 2
50 | use_short_sentences = True
51 | keras_verbose = 1
52 | save_model = False
53 | embeddings_cache = True
54 |
55 |
56 | env: Environment
57 | if 'DEID_TEST_CONFIG' in os.environ.keys() and os.environ['DEID_TEST_CONFIG']:
58 | env = Test()
59 | else:
60 | env = Development()
61 | sys.stderr.write(f'Using {env.name} environment.\n')
62 |
--------------------------------------------------------------------------------
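A quick sketch of switching to the unit-test environment; the variable has to be set before anything from `deid` is imported:

    import os
    os.environ['DEID_TEST_CONFIG'] = '1'   # any non-empty value selects the Test environment

    from deid.env import env
    assert env.name == 'unit test'
    assert env.limit_training_documents == 4
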
/deid/data/feature.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .token import Token
4 | from .util import one_hot
5 |
6 |
7 | def get(identifier):
8 | if identifier == 'case':
9 | return CaseFeature()
10 | if identifier == 'one':
11 | return UselessOneFeature()
12 | raise ValueError(f'unknown feature identifier: {identifier}')
13 |
14 |
15 | class Feature:
16 | def apply(self, token) -> np.ndarray:
17 | raise NotImplementedError
18 |
19 | @property
20 | def dimension(self):
21 | raise NotImplementedError
22 |
23 |
24 | class CaseFeature(Feature):
25 | """ Casing feature from Reimers and Gurevych (2017) https://arxiv.org/abs/1707.06799 """
26 | OTHER = 0
27 | NUMERIC = 1
28 | MAINLY_NUMERIC = 2
29 | ALL_LOWER = 3
30 | ALL_UPPER = 4
31 | INITIAL_UPPER = 5
32 | CONTAINS_DIGIT = 6
33 |
34 | def apply(self, token: Token) -> np.ndarray:
35 | token = token.text
36 |
37 | num_digits = len([char for char in token if char.isdigit()])
38 | digit_fraction = num_digits / len(token)
39 |
40 | if token.isdigit():
41 | casing = self.NUMERIC
42 | elif digit_fraction > 0.5:
43 | casing = self.MAINLY_NUMERIC
44 | elif token.islower():
45 | casing = self.ALL_LOWER
46 | elif token.isupper():
47 | casing = self.ALL_UPPER
48 | elif token[0].isupper():
49 | casing = self.INITIAL_UPPER
50 | elif num_digits > 0:
51 | casing = self.CONTAINS_DIGIT
52 | else:
53 | casing = self.OTHER
54 |
55 | return one_hot(casing, 7)
56 |
57 | @property
58 | def dimension(self):
59 | return 7
60 |
61 |
62 | class UselessOneFeature(Feature):
63 | def apply(self, token) -> np.ndarray:
64 | return np.array([1])
65 |
66 | @property
67 | def dimension(self):
68 | return 1
69 |
70 |
71 | def apply_features(features, sent):
72 | if len(features) == 0:
73 | return np.array([np.array([]) for _ in sent])
74 | return np.array([np.concatenate([feature.apply(word) for feature in features]) for word in sent])
75 |
--------------------------------------------------------------------------------
/deid/data/util.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import Sequence, Optional, Any
3 | from spacy.util import compounding as spacy_compounding
4 | import numpy as np
5 |
6 |
7 | def one_hot(x: int, n: int) -> np.ndarray:
8 | result = np.zeros(n)
9 | result[x] = 1
10 | return result
11 |
12 |
13 | def compounding(start, stop, compound):
14 | """ Wraps spaCy's compounding utility to always return ints.
15 |
16 | >>> sizes = compounding(1., 10., 1.5)
17 | >>> assert next(sizes) == 1.
18 | >>> assert next(sizes) == int(1 * 1.5)
19 | >>> assert next(sizes) == int(1.5 * 1.5)
20 | """
21 | return (int(result) for result in spacy_compounding(start, stop, compound))
22 |
23 |
24 | def peek(iterator):
25 | item = next(iterator)
26 | return item, itertools.chain([item], iterator)
27 |
28 |
29 | def pad_2d_sequences(seq: Sequence[Any], maxlen: Optional[int] = None,
30 | embedding_size: Optional[int] = None) -> np.ndarray:
31 | """ Like keras.preprocessing.sequence.pad_sequences but for 2d (already embedded) sequences.
32 |
33 | Caveat: this function does not truncate inputs. An error will be raised if the specified maxlen is smaller than the
34 | actual maximum length in the sequence.
35 |
36 | :param seq: the input sequence
37 | :param maxlen: the length to which the result will be padded, may be None
38 | :param embedding_size: the embedding dimension of the input, may be None
39 | :return: a padded array
40 | """
41 |
42 | # find the maximum length by looking through the sequence
43 | if maxlen is None:
44 | maxlen = -1
45 | for item in seq:
46 | maxlen = max(maxlen, len(item))
47 |
48 | # find the embedding dimension by looking through the sequence until there is a non-empty item
49 | if embedding_size is None:
50 | for item in seq:
51 | if len(item) != 0:
52 | embedding_size = len(item[0])
53 | break
54 |
55 | result = np.zeros((len(seq), maxlen, embedding_size))
56 | for i, item in enumerate(seq):
57 | assert len(item) > 0
58 | result[i, -len(item):] = item
59 | return result
60 |
--------------------------------------------------------------------------------
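A small sketch (not in the repository) of `pad_2d_sequences`; note that items are zero-padded at the front, matching Keras' default `padding='pre'`:

    import numpy as np
    from deid.data.util import pad_2d_sequences

    sentences = [np.ones((3, 5)), np.ones((1, 5))]   # two "sentences" of 5-dimensional embeddings
    padded = pad_2d_sequences(sentences)

    assert padded.shape == (2, 3, 5)
    assert np.all(padded[1, :2] == 0)                # the short sentence is zero-padded at the front
    assert np.all(padded[1, 2] == 1)
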
/deid/experiment/alternating_evaluation.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import pickle
4 | import sys
5 |
6 | import numpy as np
7 | from keras.callbacks import EarlyStopping, LambdaCallback
8 | from keras.utils.generic_utils import Progbar
9 |
10 | from .alternating import alternating_experiment
11 | from ..env import env
12 |
13 |
14 | def make_progress_bar(target):
15 | return Progbar(target=target, verbose=env.keras_verbose)
16 |
17 |
18 | def alternating_evaluation_experiment(config):
19 | weights = config['test']['test_weights']
20 | model, tr, train_gen, val, valid_gen, experiment_dir = alternating_experiment(config, run_experiment=False)
21 |
22 | model.complete_model.load_weights(weights)
23 |
24 | batch_size = config['training']['batch_size']
25 | test_batch_size = config['training']['test_batch_size']
26 | if test_batch_size is None:
27 | test_batch_size = batch_size
28 |
29 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience'])
30 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush())
31 |
32 | before_fine_tuning_weights = model.train_representer.get_weights()
33 |
34 | def assert_fixed_weights():
35 | after_fine_tuning_weights = model.train_representer.get_weights()
36 | for i in range(len(before_fine_tuning_weights)):
37 | assert np.all(before_fine_tuning_weights[i] == after_fine_tuning_weights[i])
38 |
39 | assert_fixed_representer = LambdaCallback(on_epoch_end=lambda epoch, logs: assert_fixed_weights())
40 | callbacks = [early_stopping, flush, assert_fixed_representer]
41 |
42 | print('Training adversary')
43 | history = model.pretrain_adversary.fit_generator(train_gen,
44 | epochs=config['training']['train_epochs'],
45 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)),
46 | validation_data=valid_gen,
47 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)),
48 | callbacks=callbacks,
49 | verbose=env.keras_verbose)
50 |
51 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle')
52 | print('Saving history to', history_pickle_path)
53 | with open(history_pickle_path, 'wb') as f:
54 | pickle.dump(history.history, f)
55 |
--------------------------------------------------------------------------------
/deid/embeddings/embeddings.py:
--------------------------------------------------------------------------------
1 | from typing import Sequence, Dict
2 |
3 | import numpy as np
4 |
5 |
6 | class Embeddings:
7 | """ Flexible base class for embeddings that doesn't necessarily use a matrix """
8 |
9 | @property
10 | def size(self) -> int:
11 | raise NotImplementedError
12 |
13 | @property
14 | def mean(self) -> float:
15 | return 0.
16 |
17 | @property
18 | def std(self) -> float:
19 | raise NotImplementedError
20 |
21 | def is_unknown(self, word: str) -> bool:
22 | raise NotImplementedError
23 |
24 | def lookup(self, word: str) -> np.ndarray:
25 | """ Looks up the vector representation of one word.
26 |
27 | :param word: an input string
28 | :return: a vector representation of size `size`
29 | """
30 | raise NotImplementedError
31 |
32 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]:
33 | """ Looks up the vector representation of multiple words. Override this if there is a more efficient way to get
34 | a batch of embeddings than looking them up one by one.
35 |
36 | :param words: a sequence of input strings
37 | :return: a vector representation of size `(len(words), size)`
38 | """
39 | return np.array([self.lookup(word) for word in words])
40 |
41 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]:
42 | """ Looks up the vector representation of an entire sentence. Override this if there is a more efficient way to
43 | get a batch of embeddings sequences than looking them up one by one.
44 |
45 | :param sentences: a sequence of sequences of input strings
46 | :return: a sequence of arrays that have size `(len(sentence), size)` for the corresponding sentence
47 | """
48 |
49 | return [self.lookup_sentence(sentence) for sentence in sentences]
50 |
51 |
52 | class PrecomputedEmbeddings(Embeddings):
53 | """ Base class for embeddings that provide a precomputed matrix in addition to the lookup """
54 |
55 | @property
56 | def size(self) -> int:
57 | raise NotImplementedError
58 |
59 | @property
60 | def std(self) -> float:
61 | raise NotImplementedError
62 |
63 | def is_unknown(self, word: str) -> bool:
64 | raise NotImplementedError
65 |
66 | def lookup(self, word: str) -> np.ndarray:
67 | raise NotImplementedError
68 |
69 | @property
70 | def precomputed_word2ind(self) -> Dict[str, int]:
71 | raise NotImplementedError
72 |
73 | @property
74 | def precomputed_matrix(self) -> np.ndarray:
75 | raise NotImplementedError
76 |
--------------------------------------------------------------------------------
/deid/data/tokenizer_tests.py:
--------------------------------------------------------------------------------
1 | from . import tokenize
2 |
3 |
4 | def assert_number_of_tokens(doc, count):
5 | assert len(doc) == count, f'token sequence {[str(t) for t in doc]} has length {len(doc)}, expected {count}'
6 |
7 |
8 | def assert_number_of_sentences(doc, count):
9 | sents = list(str(sent) for sent in doc.sents)
10 | assert len(sents) == count, f'doc {sents} has {len(sents)} sentences, expected {count}'
11 |
12 |
13 | def test_tokenize_one_sentence():
14 | doc = tokenize('A sentence that is simple to tokenize.')
15 | assert_number_of_tokens(doc, 8)
16 | assert doc[-1].text == '.'
17 |
18 |
19 | def test_tokenize_multiple_sentences():
20 | doc = tokenize('One sentence. And another sentence.')
21 | assert_number_of_sentences(doc, 2)
22 |
23 |
24 | def test_tokenize_phone_number():
25 | doc = tokenize('555-2394-72-01')
26 | assert_number_of_tokens(doc, 7)
27 |
28 |
29 | def test_tokenize_custom_infixes():
30 | doc = tokenize('a/b')
31 | assert_number_of_tokens(doc, 3)
32 |
33 | doc = tokenize('a_b_c')
34 | assert_number_of_tokens(doc, 5)
35 |
36 | doc = tokenize('81-year-old')
37 | tokens = [str(t) for t in doc]
38 | assert tokens == ['81', '-', 'year', '-', 'old']
39 |
40 | doc = tokenize('a^b')
41 | tokens = [str(t) for t in doc]
42 | assert tokens == ['a', '^', 'b']
43 |
44 | doc = tokenize('25yo')
45 | tokens = [str(t) for t in doc]
46 | assert tokens == ['25', 'yo']
47 |
48 |
49 | def test_tokenize_sentences():
50 | doc = tokenize('Here is some text that is followed by many newlines\n \n \n \n \nAnd here is some other text.')
51 | assert_number_of_sentences(doc, 2)
52 |
53 | doc = tokenize("""- First list item
54 | - and the second list item, which does not necessarily look like a sentence start.""")
55 | assert_number_of_sentences(doc, 2)
56 |
57 | doc = tokenize("""1. test
58 | 2: ok""")
59 | assert_number_of_sentences(doc, 2)
60 |
61 | doc = tokenize("""----list with unusual format
62 | ----starting with some dashes, no space between dashes and first word
63 | ---sometimes it's a different number of dashes""")
64 | assert_number_of_sentences(doc, 6)
65 |
66 |
67 | def test_tokenize_html():
68 | doc = tokenize('NASA & SpaceX')
69 | tokens = [t for t in doc]
70 | assert [str(t) for t in tokens] == ['NASA', '&', 'SpaceX']
71 | assert tokens[0]._.unescaped_html is None
72 | assert tokens[1]._.unescaped_html == '&'
73 |
74 | doc = tokenize('NASA > SpaceX')
75 | tokens = [t for t in doc]
76 | assert [str(t) for t in tokens] == ['NASA', '>', 'SpaceX']
77 | assert tokens[0]._.unescaped_html is None
78 | assert tokens[1]._.unescaped_html == '>'
79 |
80 |     doc = tokenize('Nasa&#10;SpaceX')
81 | tokens = [t for t in doc]
82 | assert tokens[1]._.unescaped_html == '\n'
83 |
--------------------------------------------------------------------------------
/deid/tools/dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import random
4 | import shutil
5 | from datetime import datetime
6 |
7 | from typing import NamedTuple
8 |
9 | from ..env import env
10 |
11 | NUM_TRAIN_VALID_DOCS = 790 # the total number of train + validation docs in the i2b2 dataset
12 |
13 |
14 | class Document(NamedTuple):
15 | csv: str
16 | txt: str
17 | xml: str
18 |
19 |
20 | def make_dataset(train_split: float, valid_split: float):
21 | data_dir = os.path.join(env.data_dir, 'generated')
22 | if not os.path.isdir(data_dir):
23 | os.mkdir(data_dir)
24 |
25 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S')
26 | config = f'{train_split}-{valid_split}-{date_str}'
27 |
28 | dataset_train_dir = os.path.join(data_dir, f'train-{config}')
29 | os.mkdir(dataset_train_dir)
30 |
31 | dataset_train_xml_dir = os.path.join(data_dir, f'train-{config}_xml')
32 | os.mkdir(dataset_train_xml_dir)
33 |
34 | dataset_valid_dir = os.path.join(data_dir, f'validation-{config}')
35 | os.mkdir(dataset_valid_dir)
36 |
37 | dataset_valid_xml_dir = os.path.join(data_dir, f'validation-{config}_xml')
38 | os.mkdir(dataset_valid_xml_dir)
39 |
40 | all_documents = []
41 | for dataset in ['train', 'validation']:
42 | dataset_dir = os.path.join(env.data_dir, dataset)
43 | dataset_xml_dir = os.path.join(env.data_dir, dataset + '_xml')
44 | for filename in [filename for filename in os.listdir(dataset_dir) if filename.endswith('csv')]:
45 | csv_filename = os.path.join(dataset_dir, filename)
46 | txt_filename = os.path.join(dataset_dir, filename[:-3] + 'txt')
47 | xml_filename = os.path.join(dataset_xml_dir, filename[:-3] + 'xml')
48 | all_documents.append(Document(csv=csv_filename, xml=xml_filename, txt=txt_filename))
49 |
50 | size = min(max(int(train_split * NUM_TRAIN_VALID_DOCS), 2), NUM_TRAIN_VALID_DOCS)
51 | train_documents = random.sample(all_documents, size)
52 | valid_size = max(int(valid_split * len(train_documents)), 1)
53 | valid_documents = random.sample(train_documents, valid_size)
54 | print(f'Using {size-valid_size} train documents and {valid_size} validation documents.')
55 |
56 | for document in train_documents:
57 | target = dataset_valid_dir if document in valid_documents else dataset_train_dir
58 | shutil.copy2(document.csv, target)
59 | shutil.copy2(document.txt, target)
60 | shutil.copy2(document.xml, target + '_xml')
61 |
62 | print(f'Made dataset at {dataset_train_dir}, {dataset_valid_dir}')
63 |
64 |
65 | def main():
66 | parser = argparse.ArgumentParser()
67 | parser.description = 'Make train and validation sets of a specified size'
68 | parser.add_argument('train_split', type=float)
69 | parser.add_argument('--valid_split', type=float, default=0.2)
70 | args = parser.parse_args()
71 |
72 | make_dataset(args.train_split, args.valid_split)
73 |
74 |
75 | if __name__ == '__main__':
76 | main()
77 |
--------------------------------------------------------------------------------
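
For orientation, a hedged example of calling make_dataset directly instead of going through the argument parser. The split values are arbitrary, the import path is assumed from the tree, and env.data_dir must already contain the prepared train/validation folders.

from deid.tools.dataset import make_dataset  # import path assumed from the tree

# copy a random 50% of the train+validation documents; 20% of those become the new validation set
make_dataset(train_split=0.5, valid_split=0.2)
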
/deid/model/layers/noise.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.engine.topology import Layer
3 |
4 |
5 | class Noise(Layer):
6 | """ Abstract Gaussian Noise layer with trainable mean and standard deviation """
7 |
8 | def __init__(self, operation, single_stddev: bool, apply_noise: bool = True, **kwargs) -> None:
9 | """ Initializes the Noise layer.
10 |
11 | :param operation: the operation to apply to the inputs and noise, may be '+'/'add' or '*'/'mult'. The mean of
12 | the noise will be set according to this operator.
13 | :param single_stddev: whether to learn a matrix of noise stddev values instead of only one stddev value that is
14 | applied to all dimensions of the data
15 | :param apply_noise: set this to False to only apply the mean instead of noise
16 | :param kwargs: other Layer arguments
17 | """
18 | super().__init__(**kwargs)
19 | if operation == '+' or operation == 'add':
20 | self.operation = lambda x, y: x + y
21 | self.mean = 0.
22 | elif operation == '*' or operation == 'mult':
23 | self.operation = lambda x, y: x * y
24 | self.mean = 1.
25 | else:
26 | raise ValueError(f'unknown operation: {operation}')
27 |
28 | self.apply_noise = K.constant(value=apply_noise)
29 | self.single_stddev = single_stddev
30 | self.k = self.stddev = None # will be initialized in the build method
31 |
32 | self.supports_masking = True
33 |
34 | def build(self, input_shape):
35 | self.k = self.add_weight(name='k',
36 | shape=(1,),
37 | initializer='ones',
38 | trainable=True)
39 | self.stddev = self.add_weight(name='stddev',
40 | shape=(1,) if self.single_stddev else (input_shape[-1],),
41 | initializer='normal',
42 | trainable=True)
43 | super().build(input_shape)
44 |
45 | def compute_output_shape(self, input_shape):
46 | return input_shape
47 |
48 | def call(self, inputs, **kwargs):
49 | def noise():
50 | noise_matrix = K.random_normal(shape=K.shape(inputs), mean=self.mean, stddev=self.stddev)
51 | return self.operation(inputs, self.k * noise_matrix)
52 |
53 | return K.switch(self.apply_noise, noise, inputs)
54 |
55 | def get_config(self):
56 | config = {'apply_noise': self.apply_noise,
57 | 'mean': self.mean,
58 | 'single_stddev': self.single_stddev,
59 | 'k': self.k}
60 | base_config = super().get_config()
61 | return {**base_config, **config}
62 |
63 |
64 | class AdditiveNoise(Noise):
65 | def __init__(self, **kwargs):
66 | super().__init__('+', **kwargs)
67 |
68 |
69 | class MultiplicativeNoise(Noise):
70 | def __init__(self, **kwargs):
71 | super().__init__('*', **kwargs)
72 |
--------------------------------------------------------------------------------
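
A small sketch, assuming the same Keras 2.x stack as the imports above, of dropping the AdditiveNoise layer into a model. The shapes, label count, and optimizer are illustrative, and the import path is assumed from the tree.

from keras.layers import Dense, Input, TimeDistributed
from keras.models import Model

from deid.model.layers.noise import AdditiveNoise  # import path assumed from the tree

embedding_input = Input(shape=(None, 300))                  # (batch, time, embedding)
noisy = AdditiveNoise(single_stddev=True)(embedding_input)  # zero-mean noise with a learned stddev and scale k
output = TimeDistributed(Dense(9, activation='softmax'))(noisy)

model = Model(embedding_input, output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
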
/deid/model/deidentifier.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.layers import Input, Dense, LSTM, Bidirectional, TimeDistributed, Masking, Dropout, concatenate, Lambda
3 | from keras.models import Model
4 | from keras_contrib.layers import CRF
5 |
6 |
7 | def make_lstm_crf(input_size, hidden_size, output_size, name='deidentifier', extra_input_size=0, num_hidden=1,
8 | input_dropout=0., recurrent_dropout=0., after_hidden_dropout=0., use_crf=False, optimizer=None,
9 | l2_normalize=False):
10 | """ Make a BiLSTM(-CRF) model that can be used for de-identification.
11 |
12 | :param input_size: the embedding/representation input size
13 | :param hidden_size: the number of LSTM units per direction
14 | :param output_size: the number of output labels
15 | :param name: a name for the model
16 |     :param extra_input_size: size of an additional input; if it is 0, a single-input model is returned
17 | :param num_hidden: the number of LSTM layers
18 | :param input_dropout: dropout probability for the input layer
19 | :param recurrent_dropout: recurrent (variational) dropout probability
20 | :param after_hidden_dropout: dropout probability for the LSTM outputs
21 | :param use_crf: whether to use a CRF to optimize the output sequences
22 | :param optimizer: a Keras optimizer, or None if the model should not be compiled
23 | :param l2_normalize: whether to L2 normalize the embedding/representation input
24 | :return: a tuple (model, loss), or a compiled Keras model if an optimizer was specified
25 | """
26 | embedding_input = Input(shape=(None, input_size))
27 | x = Masking()(embedding_input)
28 | if l2_normalize:
29 | x = Lambda(lambda x: K.l2_normalize(x, axis=-1))(x)
30 | x = Dropout(input_dropout)(x)
31 |
32 | extra_input = Input(shape=(None, extra_input_size))
33 | if extra_input_size > 0:
34 | x2 = Masking()(extra_input)
35 | x = concatenate([x, x2])
36 |
37 | for _ in range(num_hidden):
38 | x = Bidirectional(LSTM(hidden_size, return_sequences=True, dropout=after_hidden_dropout,
39 | recurrent_dropout=recurrent_dropout))(x)
40 | if use_crf:
41 | # CRF learn mode 'join' does not work at the moment, this GitHub issue contains a minimal example showing
42 | # the problem: https://github.com/keras-team/keras-contrib/issues/271
43 | x = TimeDistributed(Dense(output_size, activation=None))(x)
44 | crf = CRF(output_size, sparse_target=True, learn_mode='marginal', name='deid_output')
45 | x = crf(x)
46 | loss = crf.loss_function
47 | else:
48 | x = TimeDistributed(Dense(output_size, activation='softmax'), name='deid_output')(x)
49 | loss = 'sparse_categorical_crossentropy'
50 |
51 | if extra_input_size > 0:
52 | model = Model([embedding_input, extra_input], x, name=name)
53 | else:
54 | model = Model(embedding_input, x, name=name)
55 |
56 | if optimizer is not None:
57 | model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
58 | return model
59 | return model, loss
60 |
--------------------------------------------------------------------------------
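
A hedged usage sketch for make_lstm_crf; the sizes and optimizer are illustrative rather than taken from the project's configuration files, and the import path is assumed from the tree.

from keras.optimizers import Adam

from deid.model.deidentifier import make_lstm_crf  # import path assumed from the tree

# compiled single-input BiLSTM: 300-dim inputs, 128 units per direction, 9 output labels
deidentifier = make_lstm_crf(input_size=300, hidden_size=128, output_size=9, optimizer=Adam())

# uncompiled variant: returns (model, loss) so the loss can be combined with others later
deidentifier, loss = make_lstm_crf(input_size=300, hidden_size=128, output_size=9, use_crf=True)
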
/deid/embeddings/glove.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import Optional, Dict
4 |
5 | import numpy as np
6 |
7 | from . import PrecomputedEmbeddings
8 | from ..env import env
9 |
10 | glove_dir = os.path.join(env.resources_dir, 'glove.6B')
11 |
12 |
13 | class GloveEmbeddings(PrecomputedEmbeddings):
14 | """ Pre-trained GloVe embeddings, see https://nlp.stanford.edu/projects/glove/ """
15 |
16 | def __init__(self, dims: int = 300, vocab_size: Optional[int] = None) -> None:
17 | """ Initialize a GloveEmbeddings object.
18 |
19 | :param dims: the GloVe variant to use (50, 100, 200, or 300 dimensions)
20 | :param vocab_size: limits the size of the embedding matrix
21 | """
22 | self._dims = dims
23 | filename = os.path.join(glove_dir, f'glove.6B.{dims}d.txt')
24 | if not os.path.isfile(filename):
25 | raise ValueError(f"Can't find GloVe embeddings with {dims} dims in {glove_dir}.")
26 |
27 | embeddings = [np.zeros(dims), np.random.normal(0., scale=1e-6, size=dims)] # Padding and UNK
28 | self._word2ind = {env.unk_token: 1}
29 | self._ind2word = {1: env.unk_token}
30 |
31 | with open(filename) as f:
32 | for i, line in enumerate(f, start=2):
33 | values = line.split()
34 | word = values[0]
35 | embedding = np.asarray(values[1:], dtype='float32')
36 | self._word2ind[word] = i
37 | self._ind2word[i] = word
38 | embeddings.append(embedding / np.linalg.norm(embedding))
39 | if i == vocab_size:
40 | break
41 |
42 | self._embeddings = np.array(embeddings)
43 |
44 | @property
45 | def precomputed_word2ind(self) -> Dict[str, int]:
46 | return self._word2ind
47 |
48 | @property
49 | def precomputed_matrix(self) -> np.ndarray:
50 | return self._embeddings
51 |
52 | @property
53 | def size(self) -> int:
54 | return self._dims
55 |
56 | @property
57 | def std(self):
58 | return 0.37
59 |
60 | def word2ind(self, word: str) -> int:
61 | result = self._word2ind.get(word)
62 | if result is not None:
63 | return result
64 |
65 | word = word.lower()
66 | result = self._word2ind.get(word)
67 | if result is not None:
68 | return result
69 |
70 | word = re.sub(r'\W', '', word)
71 | result = self._word2ind.get(word)
72 | if result is not None:
73 | return result
74 |
75 | # replace every digit with a 0
76 | result = self._word2ind.get(re.sub(r'\d', '0', word))
77 | if result is not None:
78 | return result
79 |
80 | # replace all connected digits with a single 0
81 |         result = self._word2ind.get(re.sub(r'\d+', '0', word))
82 | if result is not None:
83 | return result
84 |
85 | return self._word2ind[env.unk_token]
86 |
87 | def lookup(self, word: str) -> np.ndarray:
88 | return self._embeddings[self.word2ind(word)]
89 |
90 | def is_unknown(self, word: str):
91 | return np.all(self.word2ind(word) == self._word2ind[env.unk_token])
92 |
93 | def __str__(self) -> str:
94 |         return '<GloveEmbeddings>'
95 |
--------------------------------------------------------------------------------
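
A short sketch of the lookup fallbacks in GloveEmbeddings.word2ind above. It assumes the glove.6B files are present and that the example words behave as the comments describe, which depends on the actual GloVe vocabulary.

from deid.embeddings.glove import GloveEmbeddings  # import path assumed from the tree

emb = GloveEmbeddings(dims=50)
emb.lookup('Patient')     # exact miss -> lowercased 'patient'
emb.lookup('(patient)')   # -> non-word characters stripped
emb.lookup('12mg')        # -> each digit replaced by 0 ('00mg'), then connected digits collapsed ('0mg')
emb.is_unknown('zqxjkv')  # True when every fallback misses and the UNK vector is returned
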
/deid/tools/i2b2_xml_to_csv_tests.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from deid.env import Test
4 | from .i2b2_xml_to_csv import xml_to_annotated_tokens_and_text
5 |
6 |
7 | def find_token(tokens, text):
8 | return next((index, token) for index, token in enumerate(tokens) if token.text.startswith(text))
9 |
10 |
11 | def test_xml_to_annotated_tokens_and_text():
12 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-99.xml'),
13 | check_alignment=True)
14 |
15 | _, date_token = find_token(tokens, '2018')
16 | assert date_token.type == 'B-DATE'
17 | assert date_token.start == 16
18 |
19 | max_index, max_token = find_token(tokens, 'Max')
20 | assert max_token.type == 'B-PATIENT'
21 | assert max_token.start == 28
22 | assert max_token.end == 31
23 |
24 | assert tokens[max_index + 1].type == 'I-PATIENT'
25 | assert tokens[max_index + 2].type == 'O'
26 |
27 | lines = text.strip().split('\n')
28 | assert lines[0] == 'Record date: 2018-06-15'
29 | assert lines[2] == 'Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany.'
30 |
31 |
32 | def test_tags_right_after_each_other():
33 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-98.xml'),
34 | check_alignment=True)
35 | one_index, one_token = find_token(tokens, 'one')
36 | assert one_token.type == 'B-DATE'
37 |
38 | two_token = tokens[one_index + 1]
39 | assert two_token.type == 'B-AGE'
40 |
41 | three_token = tokens[one_index + 2]
42 | assert three_token.type == 'I-AGE'
43 |
44 | four_token = tokens[one_index + 3]
45 | assert four_token.type == 'B-DATE'
46 |
47 | medical_record_token = tokens[one_index + 4]
48 | assert medical_record_token.type == 'B-MEDICALRECORD'
49 |
50 | hospital_token = tokens[one_index + 5]
51 | print(hospital_token)
52 | assert hospital_token.type == 'B-HOSPITAL'
53 |
54 |
55 | def test_uses_start_tag_even_with_wrong_alignment():
56 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-97.xml'),
57 | check_alignment=True)
58 | print(tokens)
59 | zero_index, zero_token = find_token(tokens, 'zero')
60 | assert zero_token.type == 'O'
61 |
62 | one_token = tokens[zero_index + 1]
63 | assert one_token.type == 'B-DATE'
64 |
65 | two_token = tokens[zero_index + 2]
66 | assert two_token.type == 'B-AGE' # not I-AGE
67 |
68 | three_token = tokens[zero_index + 3]
69 | assert three_token.type == 'O'
70 |
71 | four_token = tokens[zero_index + 4]
72 | assert four_token.type == 'B-DATE'
73 |
74 |
75 | def test_escape_html():
76 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-96.xml'),
77 | check_alignment=True)
78 | print(tokens)
79 | zero_index, zero_token = find_token(tokens, 'zero')
80 | assert zero_token.type == 'O'
81 |
82 | lt_token = tokens[zero_index + 1]
83 | assert lt_token.text == '<'
84 | assert lt_token.type == 'O'
85 | assert lt_token.start == 8
86 |
87 | one_token = tokens[zero_index + 2]
88 | assert one_token.type == 'B-DATE'
89 | assert one_token.start == 13
90 |
--------------------------------------------------------------------------------
/deid/data/batch_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .batch import BatchGenerator, StratifiedSampling
4 | from .util import compounding
5 | from ..data import TrainingSet
6 | from ..embeddings import DummyEmbeddings
7 |
8 |
9 | def test_generator():
10 | batch_size = 2
11 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings())
12 |
13 | generator = BatchGenerator(tr.X, tr.y, batch_size)
14 | x, y = next(generator)
15 | assert x.shape[0] == y.shape[0] == batch_size
16 |
17 |
18 | def test_generator_yields_incomplete_batches():
19 | def make_array():
20 | return np.array([[[i for _ in range(10)] for _ in range(3)] for i in range(3)])
21 |
22 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=True)
23 | assert generator.epoch_length == 2
24 | x, y = next(generator)
25 | assert x.shape[0] == y.shape[0] == 2
26 |
27 | x, y = next(generator)
28 | assert x.shape[0] == y.shape[0] == 1
29 |
30 | x, y = next(generator)
31 | assert x.shape[0] == y.shape[0] == 2
32 |
33 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=False)
34 | assert generator.epoch_length == 1
35 | x, y = next(generator)
36 | assert x.shape[0] == y.shape[0] == 2
37 |
38 | x, y = next(generator)
39 | assert x.shape[0] == y.shape[0] == 2
40 |
41 |
42 | def test_generator_compounding_batch_size():
43 | def make_array():
44 | return np.ones((100, 10, 1))
45 |
46 | generator = BatchGenerator(make_array(), make_array(), batch_size=compounding(1, 20, 1.1),
47 | yield_incomplete_batches=False)
48 | compounding_value = 1
49 |
50 | sum = 0
51 | print('batch sizes:', generator.epoch_batch_sizes)
52 | print('epoch length:', generator.epoch_length)
53 | for i in range(40): # 1 * 1.1**40 ≈ 45, so it's testing the maximum size as well
54 | compounding_value = min(20, int(1.1 ** i))
55 | x, y = next(generator)
56 | sum += x.shape[0]
57 | print(f'({i})', x.shape[0], '=', compounding_value, sum)
58 | assert x.shape[0] == y.shape[0] == int(compounding_value)
59 |
60 | assert compounding_value == 20
61 |
62 |
63 | def test_generator_yields_permutation():
64 | def make_array():
65 | return np.arange(0, 100).reshape((10, 10, 1))
66 |
67 | x, y = make_array(), make_array()
68 | generator = BatchGenerator(x, y, batch_size=5, yield_indices=True)
69 |
70 | for _ in range(5): # so we shuffle a couple of times
71 | batch_x, batch_y, batch_ind = next(generator)
72 | assert np.all(batch_x[0] == x[batch_ind[0]])
73 |
74 |
75 | def test_stratified_sampling():
76 | def make_array():
77 | arr = np.zeros((100, 10, 1))
78 | for i in range(100):
79 | arr[i] = np.ones((10, 1)) * i
80 | return arr
81 |
82 | x, y = make_array(), make_array()
83 | generator = StratifiedSampling(x, y, split_condition=lambda x, _: x[-1] >= 20, batch_size=6, yield_indices=True)
84 |
85 | assert generator.epoch_length == 7
86 |
87 | batch_x, batch_y, batch_ind = next(generator)
88 | assert np.all(batch_x[0] == x[batch_ind[0]])
89 | assert batch_x.size == 60
90 | assert batch_x[batch_x >= 20].size == 30 # half of them
91 |
92 | for _ in range(1, generator.epoch_length):
93 | batch_x, batch_y, batch_ind = next(generator)
94 | assert batch_x.size == 40 # last batch should be incomplete
95 |
96 | batch_x, batch_y, batch_ind = next(generator)
97 | assert batch_x.size == 60 # next epoch
98 |
--------------------------------------------------------------------------------
/deid/tools/embeddings.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import os
3 | import pickle
4 |
5 | import numpy as np
6 |
7 |
8 | def _create_cache(embeddings_class, lookup_sentences):
9 | from ..data import DataSet, TrainingSet, ValidationSet, TestSet
10 |
11 | def sentences_from_dataset(dataset: DataSet):
12 | return [[token.text for token in sentence] for sentence in dataset.X]
13 |
14 | def words_from_dataset(dataset: DataSet):
15 | return list(itertools.chain.from_iterable(sentences_from_dataset(dataset)))
16 |
17 | read_dataset = sentences_from_dataset if lookup_sentences else words_from_dataset
18 |
19 | print('Loading the vocabulary...')
20 | tr = TrainingSet()
21 | vocab = read_dataset(tr)
22 | vocab += read_dataset(ValidationSet(validation_set='validation', embeddings=None, label2ind=tr.label2ind))
23 | vocab += read_dataset(ValidationSet(validation_set='test', embeddings=None, label2ind=tr.label2ind))
24 |
25 | print('Loading embeddings...')
26 | embeddings_class(vocab)
27 | print('Done.')
28 |
29 |
30 | def create_fasttext_cache():
31 | from ..embeddings import CachedFastTextEmbeddings
32 | _create_cache(CachedFastTextEmbeddings, lookup_sentences=False)
33 |
34 |
35 | def create_elmo_cache():
36 | from ..embeddings import CachedElmoEmbeddings
37 | _create_cache(CachedElmoEmbeddings, lookup_sentences=True)
38 |
39 |
40 | def convert_precomputed_fasttext_embeddings():
41 | from ..embeddings.fasttext import fasttext_dir, fasttext_embeddings_name
42 |
43 | print('Loading precomputed embeddings...')
44 | vec_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec')
45 |
46 | precomputed_vocab = np.loadtxt(vec_filename, usecols=0, dtype=object, skiprows=2, comments=None)
47 | precomputed_word2ind = {word: i for i, word in enumerate(precomputed_vocab)}
48 |
49 | # make sure there are no duplicate words
50 | assert len(precomputed_vocab) == len(precomputed_word2ind)
51 |
52 | precomputed_matrix = np.loadtxt(vec_filename, usecols=range(1, 301), skiprows=2, comments=None)
53 |
54 | print('L2 normalizing the embedding matrix...')
55 | normalized_matrix = precomputed_matrix / np.sqrt((precomputed_matrix ** 2).sum(-1))[..., np.newaxis]
56 |
57 | print('Saving the dictionary...')
58 | pickle.dump(precomputed_word2ind, open(vec_filename + '.vocab.pickle', 'wb'))
59 | print('Saving the matrix...')
60 | np.save(vec_filename + '.matrix.npy', normalized_matrix)
61 | print('Done.')
62 |
63 |
64 | def main():
65 | import argparse
66 |
67 | parser = argparse.ArgumentParser()
68 | parser.add_argument('--fasttext-cache', help='Initialize a fasttext embeddings cache with the i2b2 vocabulary',
69 | action='store_true')
70 | parser.add_argument('--fasttext-precomputed', help='Convert precomputed fasttext embeddings to matrix/dict',
71 | action='store_true')
72 | parser.add_argument('--elmo-cache', help='Initialize an elmo embeddings cache with the i2b2 vocabulary',
73 | action='store_true')
74 | args = parser.parse_args()
75 |
76 | if not any([args.fasttext_cache, args.fasttext_precomputed, args.elmo_cache]):
77 | print('Specify at least one of --fasttext-cache, --fasttext-precomputed, --elmo-cache')
78 |
79 | if args.fasttext_cache:
80 | create_fasttext_cache()
81 |
82 | if args.elmo_cache:
83 | create_elmo_cache()
84 |
85 | if args.fasttext_precomputed:
86 | convert_precomputed_fasttext_embeddings()
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
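
The helpers above can also be driven directly from Python instead of the command-line flags; a hedged example, assuming the fastText and ELMo resources are already downloaded.

from deid.tools.embeddings import create_fasttext_cache, create_elmo_cache

create_fasttext_cache()  # caches fastText vectors for the i2b2 vocabulary
create_elmo_cache()      # caches ELMo sentence embeddings (slow; needs the TF Hub module)
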
/deid/data/postprocess.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import List, NamedTuple, Sequence, Tuple, Optional
3 |
4 | import numpy as np
5 | # noinspection PyProtectedMember
6 | from bs4 import CData, BeautifulSoup
7 |
8 | from .token import Token, TOKEN_TYPE
9 |
10 |
11 | class TaggedTokens(NamedTuple):
12 | type: str
13 | tokens: List[Token]
14 | start: int
15 | end: int
16 |
17 |
18 | class TagAssembler:
19 | def __init__(self, sent_tokens: Sequence[Token]) -> None:
20 | self.input = sent_tokens
21 | self.result: List[TaggedTokens] = []
22 | self.current_tag: Optional[str] = None
23 | self.current_tag_tokens: List[Token] = []
24 |
25 | def close_current_tag(self) -> None:
26 | if self.current_tag is not None:
27 | self.result.append(TaggedTokens(self.current_tag,
28 | self.current_tag_tokens,
29 | self.current_tag_tokens[0].start,
30 | self.current_tag_tokens[-1].end))
31 | self.current_tag = None
32 | self.current_tag_tokens = []
33 |
34 | def assemble(self) -> Sequence[TaggedTokens]:
35 | for t in self.input:
36 | if t.type == 'O':
37 | self.close_current_tag()
38 | elif t.type.startswith('I') and self.current_tag == t.type[2:]:
39 | self.current_tag_tokens.append(t)
40 | else: # B tag or a stray I tag that should be normalized to a B
41 | self.close_current_tag()
42 | self.current_tag = t.type[2:]
43 | self.current_tag_tokens.append(t)
44 |
45 | self.close_current_tag()
46 | return self.result
47 |
48 |
49 | def unpad(X, preds) -> Tuple[List, List]:
50 | assert len(X) == len(preds), f'X and preds have different lengths: {len(X)} != {len(preds)} '
51 | unpadded_X, unpadded_preds = [], []
52 | for i in range(len(X)):
53 | if isinstance(X[i], np.ndarray):
54 | actual_length = np.sum(X[i].any(axis=1))
55 | X_start = preds_start = len(X[i]) - actual_length
56 | else:
57 | X_start = 0
58 | preds_start = len(preds[i]) - len(X[i])
59 | unpadded_X.append(list(X[i][X_start:]))
60 | unpadded_preds.append(list(preds[i][preds_start:]))
61 | assert len(unpadded_X[i]) == len(unpadded_preds[i])
62 | return unpadded_X, unpadded_preds
63 |
64 |
65 | def postprocess_prediction(X, preds, sents, ind2label_lookup) -> Sequence[Sequence[TaggedTokens]]:
66 | X, preds = unpad(X, preds)
67 |
68 | result = []
69 | for i in range(len(X)):
70 | sent_tokens = []
71 | for j in range(len(X[i])):
72 | sent_tokens.append(
73 | Token(sents[i][j].text, ind2label_lookup(preds[i][j]), sents[i][j].start, sents[i][j].end))
74 | result.append(sent_tokens)
75 |
76 | return [TagAssembler(sent_tokens).assemble() for sent_tokens in result]
77 |
78 |
79 | def prediction_to_xml(X, preds, text, sents, ind2label_lookup) -> str:
80 | preds = postprocess_prediction(X, preds, sents, ind2label_lookup)
81 |
82 |     soup = BeautifulSoup('<deIdi2b2><TEXT></TEXT><TAGS></TAGS></deIdi2b2>', features='xml')
83 | soup.find('TEXT').string = CData(text)
84 | tags = soup.find('TAGS')
85 | for i, tagged_tokens in enumerate(itertools.chain.from_iterable(preds)):
86 | tags.append(soup.new_tag(TOKEN_TYPE[tagged_tokens.type],
87 | id=f'P{i}',
88 | start=tagged_tokens.start,
89 | end=tagged_tokens.end,
90 | TYPE=tagged_tokens.type,
91 | text=text[tagged_tokens.start:tagged_tokens.end]))
92 |
93 | return str(soup)
94 |
--------------------------------------------------------------------------------
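
A small worked example (tokens invented, not from the repository's fixtures) of how TagAssembler groups BIO-labelled tokens into TaggedTokens spans. The Token(text, type, start, end) constructor and import path are assumed from its use in postprocess_prediction above.

from deid.data.postprocess import TagAssembler
from deid.data.token import Token  # import path assumed from the tree

tokens = [Token('Max', 'B-PATIENT', 0, 3),
          Token('Friedrich', 'I-PATIENT', 4, 13),
          Token('lives', 'O', 14, 19),
          Token('in', 'O', 20, 22),
          Token('Hamburg', 'B-CITY', 23, 30)]

spans = TagAssembler(tokens).assemble()
# -> [TaggedTokens(type='PATIENT', tokens=[Max, Friedrich], start=0, end=13),
#     TaggedTokens(type='CITY', tokens=[Hamburg], start=23, end=30)]
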
/deid/data/tokenizer.py:
--------------------------------------------------------------------------------
1 | import html
2 |
3 | import spacy
4 | from spacy.matcher import Matcher
5 | from spacy.tokens import Token
6 |
7 |
8 | def _deid_tokenizer():
9 | prefixes = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
10 |
11 | my_infix = ['\.\.\.+',
12 | '(?<=[0-9])-(?=[0-9])',
13 | '(?<=[0-9])(?=[A-Za-z])',
14 | '[!&:;#,()/_\\-\\^~%{}=\'<>@]']
15 | infixes = spacy.util.compile_infix_regex(list(nlp.Defaults.infixes) + my_infix)
16 |
17 | suffixes = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
18 |
19 | return spacy.tokenizer.Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions,
20 | prefix_search=prefixes.search,
21 | infix_finditer=infixes.finditer, suffix_search=suffixes.search,
22 | token_match=None)
23 |
24 |
25 | def _new_sentence_after_three_newlines_component(doc):
26 | def has_newlines(text):
27 | return text.count('\n') > 2
28 |
29 | for i in range(len(doc[:-2])):
30 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text):
31 | doc[i + 1].sent_start = True
32 | return doc
33 |
34 |
35 | def _new_sentence_for_bulleted_lists_component(doc):
36 | def has_newlines(text):
37 | return text.count('\n') > 0
38 |
39 | def is_bullet(text):
40 | return text.startswith('-') or text.startswith('*') or text.startswith('.') or text == 'o' or text[0].isdigit()
41 |
42 | for i in range(len(doc[:-2])):
43 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text) and is_bullet(doc[i + 1].text):
44 | doc[i + 1].sent_start = True
45 | return doc
46 |
47 |
48 | def _new_sentence_after_three_dashes_component(doc):
49 | for i in range(3, len(doc[:-3])):
50 | if all(token.text == '-' for token in doc[i - 3:i]) and doc[i].text != '-':
51 | doc[i].sent_start = True
52 |
53 | return doc
54 |
55 |
56 | # https://spacy.io/usage/linguistic-features#section-rule-based-matching
57 | class _HTMLMerger(object):
58 | def __init__(self, nlp):
59 | Token.set_extension('unescaped_html', default=None)
60 | self.matcher = Matcher(nlp.vocab)
61 | self.matcher.add('BAD_HTML', None,
62 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
63 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '/'}, {'ORTH': '>'}],
64 |                          [{'ORTH': '&'}, {'SHAPE': 'xx'}, {'ORTH': ';'}],  # &lt;
65 |                          [{'ORTH': '&'}, {'SHAPE': 'xxx'}, {'ORTH': ';'}],  # &amp;
66 |                          [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dd'}, {'ORTH': ';'}],  # &#10;
67 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'ddd'}, {'ORTH': ';'}],
68 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dddd'}, {'ORTH': ';'}])
69 |
70 | def __call__(self, doc):
71 | matches = self.matcher(doc)
72 | spans = []
73 | for match_id, start, end in matches:
74 | spans.append(doc[start:end])
75 | for span in spans:
76 | span.merge()
77 | for token in span:
78 |                 if '&' in token.text:
79 |                     token._.unescaped_html = html.unescape(token.text)

--------------------------------------------------------------------------------
/deid/tools/find_good_amount_of_noise.py:
--------------------------------------------------------------------------------
15 | def sim_perc(threshold, similarities):
16 |     return len([s for s in similarities if s >= threshold]) / len(similarities)
17 |
18 |
19 | def main():
20 | parser = argparse.ArgumentParser()
21 | parser.description = 'try different amounts of noise to find a balance'
22 | parser.add_argument('embeddings', type=str, help='the embeddings to use, either glove or fasttext')
23 | parser.add_argument('noises', nargs='+', type=float, help='the noises to try')
24 | args = parser.parse_args()
25 |
26 | noises = args.noises
27 | if len(noises) == 0:
28 | raise argparse.ArgumentTypeError('Please provide a list of noises')
29 |
30 | if args.embeddings == 'fasttext':
31 | emb = FastTextEmbeddings()
32 | lower = False
33 | elif args.embeddings == 'glove':
34 | emb = GloveEmbeddings()
35 | lower = True
36 | else:
37 | raise argparse.ArgumentTypeError(f'Unknown embeddings: {args.embeddings}')
38 |
39 | mat = Matrix(lookup_embeddings=emb, precomputed_word2ind=emb.precomputed_word2ind,
40 | precomputed_matrix=emb.precomputed_matrix)
41 |
42 | tr = TrainingSet(limit_documents=env.limit_training_documents)
43 |
44 | phi_tokens = set([token.text for token in itertools.chain.from_iterable(tr.X) if token.type != 'O'])
45 | phi_tokens = [word.lower() if lower else word for word in phi_tokens
46 | if len(word) > 2
47 | and (word.lower() if lower else word) in emb.precomputed_word2ind.keys()
48 | and not any([c.isdigit() for c in word])]
49 |
50 | tokens_to_check = random.sample(phi_tokens, 1_000)
51 | # print(tokens_to_check)
52 |
53 | print('Similarity to closest neighbors:')
54 | closest_neighbor_similarities = []
55 | for token in random.sample(tokens_to_check, 10):
56 | closest_neighbor_similarities.append(mat.most_similar_cosine(token, n=2)[1].similarity)
57 |
58 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}',
59 | f'std: {np.std(closest_neighbor_similarities)}')
60 |
61 | for noise in noises:
62 | ranks = []
63 | similarities = []
64 | closest_neighbor_similarities = []
65 |
66 | for token in tokens_to_check:
67 | looked_up = emb.lookup(token)
68 | noisy = looked_up + np.random.normal(0., noise, emb.size)
69 | ranks.append(mat.cosine_distance_rank(noisy, token))
70 | similarities.append(mat.cosine_distance(noisy, token))
71 | closest_neighbor_similarities.append(mat.most_similar_cosine(noisy, n=1)[0].similarity)
72 |
73 | print('---')
74 | print(f'Report for scale {noise}:')
75 | print(f'rank mean: {np.mean(ranks)},',
76 | f'std: {np.std(ranks)},',
77 | f'%top1: {top_perc(1, ranks)},',
78 | f'%top5: {top_perc(5, ranks)},',
79 | f'%top10: {top_perc(10, ranks)}')
80 | print(f'similarity with original mean: {np.mean(similarities)}',
81 | f'std: {np.std(similarities)}',
82 | f'%0.9+: {sim_perc(0.9, similarities)}',
83 | f'%0.8+: {sim_perc(0.8, similarities)}',
84 | f'%0.7+: {sim_perc(0.7, similarities)}',
85 | f'%0.6+: {sim_perc(0.6, similarities)}')
86 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}',
87 | f'std: {np.std(closest_neighbor_similarities)}',
88 | f'%0.9+: {sim_perc(0.9, closest_neighbor_similarities)}',
89 | f'%0.8+: {sim_perc(0.8, closest_neighbor_similarities)}',
90 | f'%0.7+: {sim_perc(0.7, closest_neighbor_similarities)}',
91 | f'%0.6+: {sim_perc(0.6, closest_neighbor_similarities)}')
92 |
93 | print()
94 |
95 |
96 | if __name__ == '__main__':
97 | main()
98 |
--------------------------------------------------------------------------------
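
A condensed NumPy sketch of the per-token measurement in the loop above: add Gaussian noise to an embedding and check how similar it stays to the original. The dimensionality and noise scale are arbitrary stand-ins.

import numpy as np

rng = np.random.RandomState(0)
original = rng.normal(size=300)
original /= np.linalg.norm(original)

noisy = original + rng.normal(0., 0.1, size=300)   # 0.1 is one candidate noise scale
cosine = float(np.dot(original, noisy / np.linalg.norm(noisy)))
print(f'cosine similarity after noise: {cosine:.3f}')
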
/deid/embeddings/elmo.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import pickle
4 | from typing import Sequence
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 | import tensorflow_hub as hub
9 | from tqdm import tqdm
10 |
11 | from . import Embeddings
12 | from .util import pad_string_sequences, unpad_sequences, chunks
13 | from ..env import env
14 |
15 | elmo_dir = os.path.join(env.resources_dir, 'elmo')
16 |
17 |
18 | class ElmoEmbeddings(Embeddings):
19 | def __new__(cls, *args, **kwargs):
20 | if env.embeddings_cache:
21 | return CachedElmoEmbeddings(*args, **kwargs)
22 | return TensorFlowElmoEmbeddings(*args, **kwargs)
23 |
24 | def __init__(self, *_, **__):
25 | raise NotImplementedError('this should not happen')
26 |
27 | @property
28 | def size(self) -> int:
29 | raise NotImplementedError
30 |
31 | @property
32 | def std(self):
33 | raise NotImplementedError
34 |
35 | def lookup(self, word: str) -> np.ndarray:
36 | raise NotImplementedError
37 |
38 | def is_unknown(self, word: str) -> bool:
39 | raise NotImplementedError
40 |
41 |
42 | class ElmoEmbeddingsImpl(Embeddings):
43 | @property
44 | def size(self) -> int:
45 | return 1024
46 |
47 | @property
48 | def std(self) -> float:
49 | return 0.47
50 |
51 | def lookup(self, word: str) -> np.ndarray:
52 | raise RuntimeError("Don't lookup single words in ELMo")
53 |
54 | def is_unknown(self, word: str):
55 | return False
56 |
57 |
58 | class TensorFlowElmoEmbeddings(ElmoEmbeddingsImpl):
59 | def __init__(self, *_, **__):
60 | graph = tf.Graph()
61 | with graph.as_default():
62 | self.tokens = tf.placeholder(tf.string, shape=[None, None])
63 | self.sequence_len = tf.placeholder(tf.int32, shape=[None])
64 | self.elmo = hub.Module('https://tfhub.dev/google/elmo/2')
65 | self.embed = self.elmo({'tokens': self.tokens, 'sequence_len': self.sequence_len}, signature='tokens',
66 | as_dict=True)
67 | init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
68 | graph.finalize()
69 | self.sess = tf.Session(graph=graph)
70 | self.sess.run(init_op)
71 |
72 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]:
73 | return self.sess.run(self.embed, {self.tokens: [words], self.sequence_len: [len(words)]})['elmo'][0]
74 |
75 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]:
76 | sentences, seq_length = pad_string_sequences(sentences)
77 | result = self.sess.run(self.embed, {self.tokens: sentences, self.sequence_len: seq_length})['elmo']
78 | return unpad_sequences(result, seq_length)
79 |
80 |
81 | class CachedElmoEmbeddings(ElmoEmbeddingsImpl):
82 | def __init__(self, sentences=None, lookup_batch_size=64, *_, **__):
83 | if sentences is None:
84 | self.sent2vec = {}
85 | for chunk_name in [filename for filename in os.listdir(elmo_dir) if 'chunk' in filename]:
86 | self.sent2vec.update(pickle.load(open(os.path.join(elmo_dir, chunk_name), 'rb')))
87 | else:
88 | if not os.path.isdir(elmo_dir):
89 | os.mkdir(elmo_dir)
90 |
91 | embeddings = TensorFlowElmoEmbeddings()
92 | self.sent2vec = {}
93 | sentence_chunks = chunks(sentences, lookup_batch_size)
94 | for i, sentence_chunk in tqdm(enumerate(sentence_chunks), desc='Looking up sentence batches',
95 | total=math.ceil(len(sentences) / lookup_batch_size)):
96 | chunk_sent2vec = {}
97 | result = embeddings.lookup_sentences(sentence_chunk)
98 | for j, sentence in enumerate(sentence_chunk):
99 | chunk_sent2vec[' '.join(sentence)] = result[j]
100 | self.sent2vec[' '.join(sentence)] = result[j]
101 | chunk_filename = os.path.join(elmo_dir, f'elmo_chunk{i:04}.pickle')
102 | pickle.dump(chunk_sent2vec, open(chunk_filename, 'wb'))
103 |
104 | def lookup_sentence(self, words: Sequence[str]):
105 | result = self.sent2vec.get(' '.join(words))
106 | if result is not None:
107 | return result
108 | raise RuntimeError(f'Cache lookup failed for "{words}". Please rebuild the embedding cache.')
109 |
--------------------------------------------------------------------------------
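
A hedged sketch of the two-step workflow implied by CachedElmoEmbeddings: build the chunked cache once from all corpus sentences, then reload it and look sentences up by their joined text. The sentences are invented, the import path is assumed from the tree, and the first step needs access to the TF Hub module.

from deid.embeddings.elmo import CachedElmoEmbeddings  # import path assumed from the tree

sentences = [['Max', 'visited', 'the', 'clinic'],
             ['Follow', 'up', 'in', 'two', 'weeks']]

CachedElmoEmbeddings(sentences)                # step 1: compute and pickle chunk files
cache = CachedElmoEmbeddings()                 # step 2 (later): load every chunk from disk
vectors = cache.lookup_sentence(sentences[0])  # (4, 1024) contextual embeddings
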
/deid/data/augment/augment_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .augment import Augment
4 | from .strategy import AugmentWord, AugmentEmbedding
5 | from ...data import Token
6 | from ...embeddings import DummyEmbeddings
7 |
8 |
9 | class Ones(AugmentEmbedding):
10 | def augment(self, word_embedding):
11 | return np.ones(len(word_embedding))
12 |
13 |
14 | def test_augment_embeddings():
15 | embeddings = DummyEmbeddings()
16 | augment = Augment(embeddings, Ones(), exclude=None, n_augmentations=2)
17 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a'), Token.with_text('test')]
18 |
19 | result = augment.lookup_sentence(sent)
20 | assert len(result.original) == 4
21 | assert len(result.original[0]) == embeddings.size
22 | assert len(result.augmented) == 0
23 |
24 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a', 'B-NAME'),
25 | Token.with_text('name', 'I-NAME')]
26 | result = augment.lookup_sentence(sent)
27 | augmented = result.augmented[0]
28 |
29 | assert len(augmented) == 4
30 | assert len(augmented[0]) == embeddings.size
31 | assert np.all(augmented[2] == np.ones(embeddings.size))
32 | assert np.all(augmented[3] == np.ones(embeddings.size))
33 | assert len(result.augmented) == 2
34 |
35 |
36 | class ReplaceWithFixed(AugmentWord):
37 | def augment(self, word):
38 | return 'REPLACED'
39 |
40 |
41 | def test_augment_words():
42 | embeddings = DummyEmbeddings()
43 | augment = Augment(embeddings, ReplaceWithFixed(), exclude=None)
44 | sent = [Token.with_text('replace'), Token.with_text('these', 'B-NAME'), Token.with_text('words', 'I-NAME')]
45 | result = augment.lookup_sentence(sent).augmented[0]
46 |
47 | assert np.all(result[0] == embeddings.lookup('replace'))
48 |
49 | assert np.any(result[1] != embeddings.lookup('these'))
50 | assert np.all(result[1] == embeddings.lookup('REPLACED'))
51 |
52 | assert np.any(result[2] != embeddings.lookup('words'))
53 | assert np.all(result[2] == embeddings.lookup('REPLACED'))
54 |
55 |
56 | def test_augment_exclude():
57 | embeddings = DummyEmbeddings()
58 | augment = Augment(embeddings, Ones())
59 | sent = [Token.with_text('Please'), Token.with_text('ignore'), Token.with_text('this', 'B-NAME'),
60 | Token.with_text(':', 'I-NAME'), Token.with_text('stopword', 'I-NAME')]
61 |
62 | result = augment.lookup_sentence(sent).augmented[0]
63 | assert np.all(result[2] != np.ones(embeddings.size))
64 | assert np.all(result[3] != np.ones(embeddings.size))
65 | assert np.all(result[4] == np.ones(embeddings.size))
66 |
67 |
68 | def test_augment_all():
69 | embeddings = DummyEmbeddings()
70 | augment = Augment(embeddings, Ones(), augment_all=True, exclude=None)
71 | sent = [Token.with_text('Augment'), Token.with_text('all'), Token.with_text('of', 'B-NAME'),
72 | Token.with_text('these', 'I-NAME')]
73 |
74 | result = augment.lookup_sentence(sent).augmented[0]
75 | assert np.all(result[0] == np.ones(embeddings.size))
76 | assert np.all(result[1] == np.ones(embeddings.size))
77 | assert np.all(result[2] == np.ones(embeddings.size))
78 | assert np.all(result[3] == np.ones(embeddings.size))
79 |
80 |
81 | def test_augment_does_not_touch_unknown():
82 | class DummyEmbeddingsWithUnknownTestWord(DummyEmbeddings):
83 | def is_unknown(self, word: str):
84 | return word == 'test'
85 |
86 | def lookup(self, word):
87 | if word == 'test':
88 | return np.zeros(self.size)
89 | return super().lookup(word)
90 |
91 | embeddings = DummyEmbeddingsWithUnknownTestWord()
92 | augment = Augment(embeddings, Ones(), exclude=None)
93 | sent = [Token.with_text('This', 'B-NAME'), Token.with_text('is', 'I-NAME'), Token.with_text('another', 'I-NAME'),
94 | Token.with_text('test', 'I-NAME')]
95 | result = augment.lookup_sentence(sent).augmented[0]
96 | assert np.any(result[2] == np.ones(embeddings.size))
97 | assert np.all(result[3] == np.zeros(embeddings.size))
98 |
99 |
100 | def test_augment_max():
101 | embeddings = DummyEmbeddings()
102 | augment = Augment(embeddings, ReplaceWithFixed(), augment_max=1, exclude=None)
103 | sent = [Token.with_text('Augment'), Token.with_text('only'), Token.with_text('one', 'B-NAME'),
104 | Token.with_text('please', 'I-NAME')]
105 | result = augment.lookup_sentence(sent).augmented[0]
106 | assert len([r for r in result if np.all(r == embeddings.lookup('REPLACED'))]) == 1
107 |
--------------------------------------------------------------------------------
/deid/experiment/dummy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import collections
3 | from typing import Sequence
4 | import random
5 |
6 | import numpy as np
7 | from keras.preprocessing.sequence import pad_sequences
8 |
9 | from .directory import experiment_directory
10 | from .evaluation import evaluate_deid_performance
11 | from ..data import TrainingSet, ValidationSet, Token
12 | from ..env import env
13 |
14 |
15 | class DummyDeidentifier:
16 | def guess(self, sentence: Sequence[str]):
17 | raise NotImplementedError
18 |
19 | def predict(self, X, **_):
20 | if len(X) == 2 and isinstance(X[0][0], list) and isinstance(X[0][0][0], Token): # extra features provided
21 | X, _ = X
22 | y = [self.guess([token.text for token in sentence]) for sentence in X]
23 | y = pad_sequences(y)
24 | return y
25 |
26 |
27 | class UpperBoundDeidentifier(DummyDeidentifier):
28 | def __init__(self, X, y):
29 | self.solutions = {}
30 | for sentence, labels in zip(X, y):
31 | self.solutions[' '.join([token.text for token in sentence])] = [l[0] for l in labels]
32 |
33 | def guess(self, sentence):
34 | return self.solutions[' '.join(sentence)]
35 |
36 |
37 | class RandomGuessingDeidentifier(DummyDeidentifier):
38 | def __init__(self, X, y):
39 | label_counts = collections.defaultdict(int)
40 | for sentence, labels in zip(X, y):
41 | for label in labels:
42 | label_counts[label[0]] += 1
43 | n_labels = sum(label_counts.values())
44 | self.labels = sorted(label_counts.keys())
45 | self.probabilities = [label_counts[label] / n_labels for label in self.labels]
46 |
47 | def guess(self, sentence):
48 | return np.random.choice(self.labels, size=len(sentence), p=self.probabilities)
49 |
50 |
51 | class WordListDeidentifier(DummyDeidentifier):
52 | def __init__(self, X, y):
53 | self.memory = collections.defaultdict(lambda: [1])
54 | for sentence, labels in zip(X, y):
55 | for word, label in zip(sentence, labels):
56 | self.memory[word.text].append(label[0])
57 |
58 | def guess(self, sentence):
59 | def most_common(lst):
60 | return max(set(lst), key=lst.count)
61 |
62 | return [most_common(self.memory[word]) for word in sentence]
63 |
64 |
65 | def main():
66 | parser = argparse.ArgumentParser()
67 | parser.description = 'different dummy predictors'
68 |     parser.add_argument('--upper-bound', help='evaluate an upper bound that memorizes the test set labels', action='store_true')
69 |     parser.add_argument('--random-guessing', help='evaluate a baseline that guesses from the training label distribution',
70 |                         action='store_true')
71 |     parser.add_argument('--word-list', help='evaluate a baseline that predicts the most common training label per word', action='store_true')
72 | args = parser.parse_args()
73 |
74 | if not any([args.upper_bound, args.random_guessing, args.word_list]):
75 | raise ValueError('please select at least one of --upper-bound, --random-guessing, --word-list')
76 |
77 | tr = TrainingSet(limit_documents=env.limit_training_documents)
78 | val = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='validation')
79 |
80 | if args.upper_bound:
81 | # needs its own special case because the model is initialized with the test set!
82 | test = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='test')
83 | experiment_dir = experiment_directory('upper_bound')
84 | model = UpperBoundDeidentifier(test.X, test.y)
85 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind,
86 | ind2label=tr.ind2label,
87 | batch_size=8, experiment_dir=experiment_dir, require_argmax=False)
88 |
89 | if args.random_guessing:
90 | test_baseline('random_guessing', RandomGuessingDeidentifier, tr, val)
91 |
92 | if args.word_list:
93 | test_baseline('word_list', WordListDeidentifier, tr, val)
94 |
95 |
96 | def test_baseline(identifier, model_class, tr, val):
97 | experiment_dir = experiment_directory(identifier)
98 | model = model_class(tr.X + val.X, tr.y + val.y)
99 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind,
100 | ind2label=tr.ind2label, batch_size=8, experiment_dir=experiment_dir,
101 | require_argmax=False)
102 |
103 |
104 | if __name__ == '__main__':
105 | main()
106 |
--------------------------------------------------------------------------------
/deid/experiment/fake_sentences.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import pickle
4 | import random
5 |
6 | import numpy as np
7 | from keras import Sequential
8 | from keras.layers import Bidirectional, LSTM, Dense
9 |
10 | from . import experiment_directory
11 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence
12 | from ..data.augment import Augment, get as get_strategy
13 | from ..data.batch import IteratorWithEpochLength
14 | from ..data.util import pad_2d_sequences
15 | from ..embeddings import Matrix, get as get_embeddings
16 | from ..env import env
17 |
18 |
19 | def real_and_fake_sentences(X, y, indices, alternatives, split_condition):
20 | indices = [i for i in indices if split_condition(X[i], y[i])]
21 | real_sentences = [X[i] for i in indices]
22 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices]
23 |
24 | X = []
25 | y = []
26 | for real, fake in zip(real_sentences, fake_sentences):
27 | X += [real, fake]
28 | y += [1, 0]
29 |
30 | return pad_2d_sequences(X), np.array(y)
31 |
32 |
33 | class FakeSentencesGenerator(IteratorWithEpochLength):
34 | def __init__(self, generator: IteratorWithEpochLength, dataset):
35 | self.generator = generator
36 | self.dataset = dataset
37 |
38 | def __next__(self):
39 | _, _, indices = next(self.generator)
40 | X, y = real_and_fake_sentences(self.dataset.X, self.dataset.y, indices, self.dataset.augmented,
41 | split_condition=is_phi_sentence)
42 | return X, y
43 |
44 | @property
45 | def epoch_length(self) -> int:
46 | return self.generator.epoch_length
47 |
48 |
49 | def fake_sentences_experiment(config):
50 | print('Loading embeddings...')
51 | embeddings = get_embeddings(config['experiment']['embeddings'])
52 |
53 | name = config['name']
54 | experiment_dir = experiment_directory(name, config['path'])
55 |
56 | print('Loading matrix...')
57 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind,
58 | precomputed_matrix=embeddings.precomputed_matrix)
59 |
60 | strategy = get_strategy(config['augment']['strategy'], matrix)
61 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix)
62 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy,
63 | **config['augment']['augment_args'])
64 |
65 | print('Augmenting training set...', flush=True)
66 | tr = TrainingSet(embeddings=embeddings,
67 | train_set=config['experiment']['train_set'],
68 | use_short_sentences=env.use_short_sentences,
69 | limit_documents=env.limit_training_documents,
70 | augment=augment)
71 |
72 | print('Augmenting validation set...', flush=True)
73 | val = ValidationSet(embeddings=embeddings,
74 | validation_set=config['experiment']['validation_set'],
75 | label2ind=tr.label2ind,
76 | use_short_sentences=env.use_short_sentences,
77 | limit_documents=env.limit_validation_documents,
78 | augment=augment)
79 |
80 | model = Sequential()
81 | model.add(Bidirectional(LSTM(embeddings.size), input_shape=(None, embeddings.size)))
82 | model.add(Dense(1, activation='sigmoid'))
83 | model.summary()
84 | model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
85 |
86 | batch_size = test_batch_size = 32
87 | train_gen = FakeSentencesGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence,
88 | batch_size=batch_size, yield_indices=True, shuffle=True), tr)
89 | valid_gen = FakeSentencesGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence,
90 | batch_size=batch_size, yield_indices=True, shuffle=False),
91 | val)
92 |
93 | history = model.fit_generator(train_gen,
94 | epochs=config['training']['train_epochs'],
95 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)),
96 | validation_data=valid_gen,
97 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)),
98 | verbose=env.keras_verbose)
99 |
100 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle')
101 | print('Saving history to', history_pickle_path)
102 | with open(history_pickle_path, 'wb') as f:
103 | pickle.dump(history.history, f)
104 |
--------------------------------------------------------------------------------
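
A tiny illustration (with strings standing in for embedded sentences) of the interleaving that real_and_fake_sentences produces for the discriminator above: each real sentence is followed by one augmented alternative, labelled 1 and 0 respectively.

real_sentences = ['real A', 'real B']   # stand-ins for embedded PHI sentences
fake_sentences = ['fake A', 'fake B']   # one randomly chosen augmented alternative each

X, y = [], []
for real, fake in zip(real_sentences, fake_sentences):
    X += [real, fake]
    y += [1, 0]

# X == ['real A', 'fake A', 'real B', 'fake B']
# y == [1, 0, 1, 0]
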
/deid/model/adversary.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.layers import Input, Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Dropout, concatenate
3 | from keras.models import Model, Sequential
4 |
5 | from .layers import GradientReversal
6 |
7 | discriminator_loss = 'binary_crossentropy'
8 |
9 |
10 | def get(identifier):
11 | if identifier == 'reconstruct':
12 | return Reidentifier
13 | elif identifier == 'discriminate-representations':
14 | return TwoRepresentationsAreSameOriginalDiscriminator
15 | elif identifier == 'discriminate-representation-embedding-pair':
16 | return OriginalAndRepresentationAreMatchingDiscriminator
17 | else:
18 | raise ValueError(f'Unknown adversary: "{identifier}"')
19 |
20 |
21 | class Adversary:
22 | """ An adversary is a model with a gradient reversal layer. It can chose its inputs from a dictionary that contains
23 | entries for 'train_representation', 'fake_representation', and 'original_embeddings'.
24 | """
25 |
26 | def __init__(self, model, loss, inputs, **compile_kwargs):
27 | self.model = model
28 | self.loss = loss
29 | self.inputs = inputs
30 | self.compile_kwargs = compile_kwargs
31 |
32 |
33 | class Reidentifier(Adversary):
34 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0.,
35 | reverse_gradient=True, **_):
36 | model = Sequential(name='reidentifier')
37 | model.add(Dropout(input_dropout, input_shape=(None, representation_size)))
38 | if reverse_gradient:
39 | model.add(GradientReversal())
40 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True, recurrent_dropout=recurrent_dropout)))
41 | model.add(TimeDistributed(Dense(embedding_size)))
42 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
43 | super().__init__(model, inputs=[inputs['train_representation']], loss='mse', sample_weight_mode='temporal',
44 | metrics=['cosine_proximity'])
45 |
46 |
47 | class TwoRepresentationsAreSameOriginalDiscriminator(Adversary):
48 | def __init__(self, inputs, representation_size, lstm_size, input_dropout=0., recurrent_dropout=0.,
49 | reverse_gradient=True, **_):
50 | """ LSTM size should be at least the representation size for this to converge quickly. """
51 | representation_input1 = Input(shape=(None, representation_size))
52 | representation_input2 = Input(shape=(None, representation_size))
53 |
54 | # (batch_size, maxlen, repr_size) -> (batch_size, maxlen, 1) -- the dot layer doesn't do this
55 | normalized_1 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input1)
56 | normalized_2 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input2)
57 | dot_product = Lambda(lambda x: K.sum(x[0] * x[1], axis=-1, keepdims=True))([normalized_1, normalized_2])
58 |
59 | both_inputs = concatenate([representation_input1, representation_input2], axis=-1)
60 | both_inputs = Dropout(input_dropout)(both_inputs)
61 |
62 | inputs_and_dot_product = concatenate([both_inputs, dot_product], axis=-1)
63 | if reverse_gradient:
64 | inputs_and_dot_product = GradientReversal()(inputs_and_dot_product)
65 |
66 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(inputs_and_dot_product)
67 | output = Dense(1, activation='sigmoid')(summary)
68 |
69 | model = Model([representation_input1, representation_input2], output, name='rr-adv')
70 | super().__init__(model, inputs=[inputs['train_representation'], inputs['fake_representation']],
71 | loss=discriminator_loss, metrics=['accuracy'])
72 |
73 |
74 | class OriginalAndRepresentationAreMatchingDiscriminator(Adversary):
75 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0.,
76 | reverse_gradient=True, **_):
77 | embedding_input = Input(shape=(None, embedding_size))
78 | representation_input = Input(shape=(None, representation_size))
79 |
80 | both_inputs = concatenate([embedding_input, representation_input], axis=-1)
81 | if reverse_gradient:
82 | both_inputs = GradientReversal()(both_inputs)
83 | both_inputs = Dropout(input_dropout)(both_inputs)
84 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(both_inputs)
85 |
86 | output = Dense(1, activation='sigmoid')(summary)
87 |
88 | model = Model([embedding_input, representation_input], output, name='er-adv')
89 | super().__init__(model, inputs=[inputs['original_embeddings'], inputs['fake_representation']],
90 | loss=discriminator_loss, metrics=['accuracy'])
91 |
--------------------------------------------------------------------------------
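
A hedged sketch of constructing one of the adversaries above via get. The keys of the inputs dictionary follow the Adversary docstring, the sizes are illustrative, and compiling against adv.loss and adv.compile_kwargs is left to the surrounding training code.

from keras.layers import Input

from deid.model.adversary import get as get_adversary  # import path assumed from the tree

inputs = {'train_representation': Input(shape=(None, 128)),
          'fake_representation': Input(shape=(None, 128)),
          'original_embeddings': Input(shape=(None, 300))}

adversary_cls = get_adversary('discriminate-representations')
adversary = adversary_cls(inputs, representation_size=128, embedding_size=300, lstm_size=128)
adversary.model.summary()
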
/deid/data/augment/strategy.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | from typing import Any, Optional, Sequence, Dict
5 |
6 | import numpy as np
7 |
8 | from ...embeddings import Matrix, EmbeddingSimilarity
9 |
10 | logger = logging.getLogger()
11 | digit_pattern = '^[0-9]*$'
12 |
13 |
14 | class AugmentStrategy:
15 | augments_words: bool
16 |
17 | @property
18 | def description(self) -> Optional[str]:
19 | return None
20 |
21 | def augment(self, word_or_embedding: Any) -> Any:
22 | raise NotImplementedError
23 |
24 | def __str__(self) -> str:
25 | options = '' if self.description is None else ' ' + self.description
26 | return f'<{self.__class__.__name__}{options}>'
27 |
28 |
29 | class AugmentWord(AugmentStrategy):
30 | augments_words = True
31 |
32 | def augment(self, word: str) -> str:
33 | raise NotImplementedError
34 |
35 |
36 | class AugmentEmbedding(AugmentStrategy):
37 | augments_words = False
38 |
39 | def augment(self, word_embedding: np.ndarray) -> np.ndarray:
40 | raise NotImplementedError
41 |
42 |
43 | class Zeros(AugmentEmbedding):
44 | """ Not actually zeros to distinguish from masking. """
45 |
46 | def augment(self, word_embedding: np.ndarray) -> np.ndarray:
47 | return np.random.normal(0., scale=1e-6, size=len(word_embedding))
48 |
49 |
50 | class RandomEmbedding(AugmentEmbedding):
51 | """ A random normal embedding, optionally L2 normalized """
52 |
53 | def __init__(self, scale=None, l2_normalize=True):
54 | self.scale = 1. if scale is None else scale
55 | self.l2_normalize = l2_normalize
56 |
57 | @property
58 | def description(self) -> Optional[str]:
59 | return f'scale={self.scale}, l2_normalize={self.l2_normalize}'
60 |
61 | def augment(self, word_embedding: np.ndarray) -> np.ndarray:
62 | embedding = np.random.normal(0., scale=self.scale, size=len(word_embedding))
63 | if self.l2_normalize:
64 | embedding = embedding / np.linalg.norm(embedding)
65 | return embedding
66 |
67 |
68 | class RandomDigits(AugmentWord):
69 | def __init__(self, matrix: Matrix) -> None:
70 | self.matrix = matrix
71 | logger.info('getting digit indices')
72 | self.digit_ind = [ind for word, ind in matrix.word2ind.items() if re.match(digit_pattern, str(word))]
73 | logger.info('found %d indices', len(self.digit_ind))
74 |
75 | def augment(self, word: str) -> str:
76 | ind = random.choice(self.digit_ind)
77 | return self.matrix.ind2word[ind]
78 |
79 |
80 | class AdditiveNoise(AugmentEmbedding):
81 | def __init__(self, scale: float) -> None:
82 | self.scale = scale
83 |
84 | @property
85 | def description(self) -> Optional[str]:
86 | return f'scale={self.scale}'
87 |
88 | def augment(self, word_embedding: np.ndarray) -> np.ndarray:
89 | noisy = word_embedding + np.random.normal(0, scale=self.scale, size=len(word_embedding))
90 | return noisy / np.linalg.norm(noisy)
91 |
92 |
93 | class MoveToNeighbor(AugmentWord):
94 | """ Only makes sense for embeddings like GloVE and fastText that have a fixed word->embedding lookup """
95 |
96 | def __init__(self, matrix: Matrix, n_neighbors: int, cache_mode: str = 'neighbors') -> None:
97 | self.matrix = matrix
98 | self.n_neighbors = n_neighbors
99 | self.cache = NeighborsCache(cache_mode)
100 |
101 | @property
102 | def description(self) -> Optional[str]:
103 | return f'n_neighbors={self.n_neighbors}'
104 |
105 | def augment(self, word: str) -> str:
106 | cache_result = self.cache.lookup(word)
107 | if cache_result is None:
108 | neighbors = self.matrix.most_similar_cosine(word, n=self.n_neighbors)
109 | selected = random.choice(neighbors)
110 | self.cache.store(word, neighbors, selected)
111 | else:
112 | selected = cache_result
113 | return selected.word
114 |
115 |
116 | class NeighborsCache:
117 | def __init__(self, mode: Optional[str]) -> None:
118 | if mode not in [None, 'neighbors', 'selected']:
119 | raise ValueError("Cache mode must be either None, 'neighbors' or 'selected'")
120 | self.mode = mode
121 | self.cache: Dict[str, Sequence[EmbeddingSimilarity]] = {}
122 |
123 | def lookup(self, word: str) -> Optional[EmbeddingSimilarity]:
124 | if self.mode is None:
125 | return None
126 |
127 | result = self.cache.get(word)
128 | return result if result is None else random.choice(result)
129 |
130 | def store(self, word: str, neighbors: Sequence[EmbeddingSimilarity], selected: EmbeddingSimilarity) -> None:
131 | if self.mode == 'neighbors':
132 | self.cache[word] = neighbors
133 | if self.mode == 'selected':
134 | self.cache[word] = [selected]
135 |
--------------------------------------------------------------------------------
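
A minimal usage sketch for the embedding-level strategies above (not part of the repository); it only needs NumPy, and the import path is assumed from the tree:

```python
# Sketch only: apply two embedding-level strategies to a made-up 300-d vector.
import numpy as np
from deid.data.augment.strategy import AdditiveNoise, Zeros

embedding = np.random.normal(0., scale=0.05, size=300)
embedding /= np.linalg.norm(embedding)

noisy = AdditiveNoise(scale=0.1).augment(embedding)  # re-normalized after adding noise
masked = Zeros().augment(embedding)                  # tiny random values, not true zeros

print(round(float(np.linalg.norm(noisy)), 3))  # ~1.0
print(float(np.abs(masked).max()))             # on the order of 1e-6
```
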
/deid/embeddings/fasttext.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | from typing import Dict
4 |
5 | import fastText
6 | import numpy as np
7 | from tqdm import tqdm
8 |
9 | from . import PrecomputedEmbeddings
10 | from ..env import env
11 |
12 | fasttext_dir = os.path.join(env.resources_dir, 'fastText')
13 | fasttext_embeddings_name = 'wiki-news-300d-1M-subword'
14 |
15 |
16 | class FastTextEmbeddings(PrecomputedEmbeddings):
17 | def __new__(cls, *args, **kwargs):
18 | if env.embeddings_cache:
19 | return CachedFastTextEmbeddings()
20 | return PreloadFastTextEmbeddings()
21 |
22 | def __init__(self, *_, **__):
23 | raise NotImplementedError('this should not happen')
24 |
25 | @property
26 | def size(self) -> int:
27 | raise NotImplementedError
28 |
29 | @property
30 | def std(self):
31 | raise NotImplementedError
32 |
33 | def lookup(self, word: str) -> np.ndarray:
34 | raise NotImplementedError
35 |
 36 |     def is_unknown(self, word: str) -> bool:
 37 |         raise NotImplementedError
38 |
39 | @property
40 | def precomputed_word2ind(self) -> Dict[str, int]:
41 | raise NotImplementedError
42 |
43 | @property
44 | def precomputed_matrix(self) -> np.ndarray:
45 | raise NotImplementedError
46 |
47 |
48 | class FastTextEmbeddingsImpl(PrecomputedEmbeddings):
49 | def __init__(self, size, *_, **__):
50 | self._size = size
51 | self._precomputed_word2ind = None
52 | self._precomputed_matrix = None
53 |
54 | @property
55 | def precomputed_word2ind(self) -> Dict[str, int]:
56 | if self._precomputed_word2ind is None:
57 | vocab_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.vocab.pickle')
58 | self._precomputed_word2ind = pickle.load(open(vocab_filename, 'rb'))
59 | return self._precomputed_word2ind
60 |
61 | @property
62 | def precomputed_matrix(self) -> np.ndarray:
63 | if self._precomputed_matrix is None:
64 | matrix_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.matrix.npy')
65 | self._precomputed_matrix = np.load(matrix_filename)
66 | return self._precomputed_matrix
67 |
68 | @staticmethod
69 | def l2_normalize_if_needed(vec: np.ndarray, l2_normalize: bool) -> np.ndarray:
70 | if l2_normalize:
71 | vec /= np.linalg.norm(vec) # all-zero embeddings shouldn't exist
72 | return vec
73 |
74 | @property
75 | def size(self) -> int:
76 | return self._size
77 |
78 | @property
79 | def std(self) -> float:
80 | return 0.05
81 |
82 | def lookup(self, word: str) -> np.ndarray:
83 | raise NotImplementedError
84 |
85 | def is_unknown(self, word: str) -> bool:
86 | return False
87 |
88 |
89 | class PreloadFastTextEmbeddings(FastTextEmbeddingsImpl):
90 | def __init__(self) -> None:
91 | self.model = fastText.load_model(os.path.join(fasttext_dir, fasttext_embeddings_name + '.bin'))
92 | super().__init__(self.model.get_dimension())
93 |
94 | def lookup(self, word: str, l2_normalize: bool = True) -> np.ndarray:
95 | vec = self.model.get_word_vector(word)
96 | if np.count_nonzero(vec) == 0:
97 | # add small amount of noise to all-zero embeddings to make them work with masking / CRF
98 | vec += np.random.normal(0., scale=1e-6, size=len(vec))
99 |
100 | return self.l2_normalize_if_needed(vec, l2_normalize)
101 |
102 | def __str__(self) -> str:
103 |         return f'<{self.__class__.__name__}>'
104 |
105 |
106 | class CachedFastTextEmbeddings(FastTextEmbeddingsImpl): # always L2 normalized!
107 | def __init__(self, vocab=None):
108 | cache_path = os.path.join(fasttext_dir, fasttext_embeddings_name + '.pickle')
109 | if vocab is None:
110 | self.word2ind, self.matrix = pickle.load(open(cache_path, 'rb'))
111 | else:
112 | vocab = set(vocab)
113 | embeddings = PreloadFastTextEmbeddings()
114 | self.word2ind = {word: i + 1 for i, word in enumerate(vocab)}
115 | self.matrix = np.zeros((len(vocab) + 1, embeddings.size))
116 | for i, word in tqdm(enumerate(vocab, start=1), desc='Looking up words', total=len(vocab)):
117 | self.matrix[i] = embeddings.lookup(word, l2_normalize=True)
118 |
119 | pickle.dump((self.word2ind, self.matrix), open(cache_path, 'wb'))
120 | super().__init__(self.matrix.shape[1])
121 |
122 | def lookup(self, word: str, include_precomputed: bool = True) -> np.ndarray:
123 | index = self.word2ind.get(word)
124 | if index is not None:
125 | return self.matrix[index]
126 |
127 | index = self.precomputed_word2ind.get(word)
128 | if index is not None:
129 | return self.precomputed_matrix[index]
130 |
131 | raise RuntimeError(f'Cache/precomputed lookup failed for "{word}". Please rebuild the embedding cache.')
132 |
--------------------------------------------------------------------------------
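
A hypothetical lookup sketch for the facade above; it assumes the fastText resources described in the README's Installation section are already in place, with `env.embeddings_cache` deciding which implementation is returned:

```python
# Sketch only: FastTextEmbeddings() dispatches to the cached or the
# full-binary implementation; either way, lookups return L2-normalized vectors.
from deid.embeddings.fasttext import FastTextEmbeddings

embeddings = FastTextEmbeddings()      # CachedFastTextEmbeddings or PreloadFastTextEmbeddings
vector = embeddings.lookup('penicillin')
print(embeddings.size, vector.shape)   # e.g. 300 (300,)
```
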
/deid/data/read.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from typing import Sequence, Optional, Set
4 |
5 | from .token import Token, HIPAA_TOKEN_TYPE, BINARY_LABEL
6 | from ..env import env
7 |
8 |
9 | def _add_data_dir_if_needed(path: str) -> str:
10 | """ Adds the data directory to a path if it's not already a sub-path.
11 |
12 | >>> _add_data_dir_if_needed('train') == os.path.join(env.data_dir, 'train')
13 | True
14 |
15 | :param path: the input path
16 | :return: a path containing the data directory
17 | """
18 | if os.path.realpath(env.data_dir) not in os.path.realpath(path):
19 | path = os.path.join(env.data_dir, path)
20 | return path
21 |
22 |
23 | def full_text_for_csv(filename: str) -> str:
24 | """ Returns the full text for a csv file that is saved to a .txt file with the same stem name.
25 |
26 | :param filename: the csv filename
27 | :return: a string that is read from the corresponding txt file
28 | """
29 | filename = _add_data_dir_if_needed(filename)
30 |
31 | if not filename.endswith('.csv'):
32 | raise ValueError(f'{filename} is not a csv file')
33 |
34 | return open(filename[:-4] + '.txt').read()
35 |
36 |
37 | def tokens_from_csv(file_or_dir: str,
38 | limit: Optional[int] = None,
39 | binary_classification: bool = False,
40 | hipaa_only: bool = False) -> Sequence[Token]:
41 | """ Parses a directory of csv files or a single csv file for tokens.
42 |
43 | :param file_or_dir: the csv file or directory to parse
44 | :param limit: upper limit for the number of csv files to parse
45 | :param binary_classification: set to True to skip the classes and use only generic BIO labels
46 | :param hipaa_only: set to True to skip all non-HIPAA tags
47 |
48 | :return: a list of Token objects
49 | """
50 |
51 | def label_string(bio_string):
52 | if hipaa_only:
53 | if bio_string == 'O' or bio_string[2:] not in HIPAA_TOKEN_TYPE.keys():
54 | return 'O'
55 |
56 | if binary_classification:
 57 |             # Not really binary: there are still B, I, and O labels (plus the padding label). I tried using true
 58 |             # binary labels and there was no real difference, so I decided to keep it like this.
59 | return 'O' if bio_string == 'O' else f'{bio_string[0]}-{BINARY_LABEL}'
60 | return bio_string
61 |
62 | file_or_dir = _add_data_dir_if_needed(file_or_dir)
63 |
64 | if os.path.isdir(file_or_dir):
65 | filenames = sorted([os.path.join(file_or_dir, f) for f in os.listdir(file_or_dir) if f.endswith('.csv')])
66 | if len(filenames) == 0:
67 | raise ValueError(f'{file_or_dir} does not contain any csv files')
68 | elif file_or_dir.endswith('.csv'):
69 | filenames = [file_or_dir]
70 | else:
71 | raise ValueError(f'{file_or_dir} is not a csv file')
72 |
73 | tokens = []
74 | for i, filename in enumerate(filenames):
75 | with open(filename) as f:
76 | reader = csv.reader(f)
77 | next(reader) # skip header
78 | for row in reader:
79 | tokens.append(Token(row[0],
80 | label_string(row[1]),
81 | *map(int, row[2:])))
82 | if i == limit:
83 | break
84 |
85 | return tokens
86 |
87 |
88 | def split_sentences(tokens: Sequence[Token]) -> Sequence[Sequence[Token]]:
89 | """ Breaks a list of Token objects into sentence chunks delimited by sent_start and sent_end. Incomplete sentences
90 | are not included in the result.
91 |
92 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test'), Token.with_text(env.sent_end)]))
93 | 1
94 |
95 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test')]))
96 | 0
97 |
98 | :param tokens: the tokens to break into sentences
99 | :return: a list of sentences (i.e. a list of lists of tokens)
100 | """
101 | sents = []
102 | current_sent = []
103 | for token in tokens:
104 | if token.text not in [env.sent_start, env.sent_end]:
105 | current_sent.append(token)
106 | if token.text == env.sent_end:
107 | if len(current_sent) > 0:
108 | sents.append(current_sent)
109 | current_sent = []
110 | return sents
111 |
112 |
113 | def vocab_from_tokens(tokens: Sequence[Token]) -> Set[str]:
114 | """ Returns a set of words from a token sequence, excluding the special sent_start and sent_end tokens.
115 |
116 | >>> sorted(vocab_from_tokens([Token.with_text(env.sent_start), Token.with_text('test'), \
117 | Token.with_text('some'), Token.with_text('test'), Token.with_text('words'), Token.with_text(env.sent_end)]))
118 | ['some', 'test', 'words']
119 |
120 | :param tokens: the tokens to convert to a vocabulary set
121 | :return: a set of words
122 | """
123 | return set(token.text for token in tokens) - {env.sent_start, env.sent_end}
124 |
--------------------------------------------------------------------------------
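
A minimal sketch of the reading helpers above (not from the repository). It assumes the test environment (`DEID_TEST_CONFIG=1`), so that `env.data_dir` points at the bundled fixtures, and that the fixture csv contains the sentence-boundary markers that `split_sentences` relies on:

```python
# Sketch only: read the fixture csv, collapse labels to binary BIO, and
# inspect the token/vocabulary counts.
from deid.data.read import tokens_from_csv, split_sentences, vocab_from_tokens

tokens = tokens_from_csv('train/999-99.csv', binary_classification=True)
sentences = split_sentences(tokens)  # empty if the csv has no sent_start/sent_end markers
print(len(tokens), 'tokens,', len(sentences), 'sentences,',
      len(vocab_from_tokens(tokens)), 'distinct words')
```
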
/deid/tools/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import itertools
3 | import os
4 | import random
5 |
6 | import yaml
7 |
8 | from ..env import env
9 |
10 | config_dir = os.path.join(env.resources_dir, 'config')
11 | generated_dir = os.path.join(config_dir, 'generated')
12 | if not os.path.isdir(generated_dir):
13 | os.mkdir(generated_dir)
14 |
15 |
16 | def generate_config(config):
17 | result = {}
18 | for key, value in config.items():
19 | if key == 'choice':
20 | if isinstance(value, list):
21 | return random.choice(value)
22 | else:
 23 |                 raise ValueError("'choice' values must be lists at the moment")
24 | elif isinstance(value, dict):
25 | result[key] = generate_config(value)
26 | else:
27 | result[key] = value
28 | return result
29 |
30 |
31 | def generate_random_configs(config, name, n, start, output_path):
32 | for i in range(n):
33 | config_num = i + start
34 | result = generate_config(config)
35 | with open(os.path.join(output_path, name) + f'_{config_num:03d}.yaml', 'w') as f:
36 | f.write(yaml.dump(result))
37 | print(f'Generated {n} configs.')
38 |
39 |
40 | def flatten_config(config, sep='--', prefix=None):
41 | """
42 | >>> flatten_config({'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}})
43 | {'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]}
44 | """
45 | result = {}
46 | for key, value in config.items():
47 | key = key if prefix is None else f'{prefix}{sep}{key}'
48 | if isinstance(value, dict):
49 | result.update(flatten_config(value, sep, prefix=key))
50 | else:
51 | result[key] = value
52 | return result
53 |
54 |
55 | def unflatten_config(config, sep='--'):
56 | """
57 | >>> unflatten_config({'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]})
58 | {'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}}
59 | """
60 | result = {}
61 | for key, value in config.items():
62 | parts = key.split(sep)
63 | parent = result
64 | for child in parts[:-1]:
65 | if child not in parent.keys():
66 | parent[child] = {}
67 | parent = parent[child]
68 | parent[parts[-1]] = value
69 | return result
70 |
71 |
72 | def remove_choices(config, sep='--'):
73 | """
74 | >>> remove_choices({'a': 1, 'b--c--choice': 2})
75 | {'a': 1, 'b--c': 2}
76 | """
77 | result = {}
78 | for key, value in config.items():
79 | if key.endswith(f'{sep}choice'):
80 | result[key[:-len(f'{sep}choice')]] = value
81 | else:
82 | result[key] = value
83 | return result
84 |
85 |
86 | def generate_grid_configs(config, name, output_path):
87 | flattened = flatten_config(config)
88 | choice_keys = [key for key in flattened.keys() if key.endswith('choice')]
89 |
90 | for i, choices in enumerate(itertools.product(*[flattened[key] for key in choice_keys]), start=1):
91 | for choice_key, choice in zip(choice_keys, choices):
92 | flattened[choice_key] = choice
93 |
94 | result = unflatten_config(remove_choices(flattened))
95 | with open(os.path.join(output_path, name) + f'_grid_{i:03d}.yaml', 'w') as f:
96 | f.write(yaml.dump(result))
97 | print(f'Generated {i} configs.')
98 |
99 |
100 | def find_config(name):
101 | filename = name
102 | if os.path.isfile(filename):
103 | return filename
104 |
105 | filename = os.path.join(config_dir, filename)
106 | if os.path.isfile(filename):
107 | return filename
108 |
109 | filename = filename + '.yaml'
110 | if os.path.isfile(filename):
111 | return filename
112 |
113 | raise argparse.ArgumentTypeError(f'{name} is not a valid config name or path')
114 |
115 |
116 | def main():
117 | def ensure_dir(arg) -> str:
118 | if type(arg) == str and os.path.isdir(arg):
119 | return arg
120 | raise argparse.ArgumentTypeError(f'{arg} is not a directory')
121 |
122 | parser = argparse.ArgumentParser()
123 | parser.description = 'Create experiment configs from a config template.'
124 | parser.add_argument('input_config', help='the input config template')
125 | parser.add_argument('-o', '--output_path', help='the path to store the results', type=ensure_dir,
126 | default=generated_dir)
127 | parser.add_argument('-n', '--n', help='the number of configs to generate', type=int, default=10)
128 | parser.add_argument('-a', '--all', help='generate all configs (grid), overrides --n', action='store_true')
129 | parser.add_argument('-s', '--start', help='the starting number for config filenames', type=int, default=0)
130 |
131 | args = parser.parse_args()
132 |
133 | filename = find_config(args.input_config)
134 | config = yaml.load(open(filename))
135 | name = '.'.join(os.path.basename(filename).split('.')[:-1])
136 | name = name.replace('_template', '')
137 | if args.all:
138 | generate_grid_configs(config, name, args.output_path)
139 | else:
140 | generate_random_configs(config, name, args.n, args.start, args.output_path)
141 |
142 |
143 | if __name__ == '__main__':
144 | main()
145 |
--------------------------------------------------------------------------------
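
To illustrate how the `choice` keys in a config template are resolved, here is a small sketch (not from the repository); the template dict is a made-up example, and importing the module assumes the `resources/config` directory from `env.py` exists:

```python
# Sketch only: sample a concrete config from a template with 'choice' keys
# and round-trip it through flatten/unflatten.
from deid.tools.config import generate_config, flatten_config, unflatten_config

template = {'optimizer': 'adam',
            'representer': {'lstm_size': {'choice': [64, 128, 256]},
                            'noise_before': {'choice': [True, False]}}}

sampled = generate_config(template)   # e.g. {'optimizer': 'adam', 'representer': {'lstm_size': 128, 'noise_before': True}}
flat = flatten_config(sampled)        # {'optimizer': 'adam', 'representer--lstm_size': 128, ...}
assert unflatten_config(flat) == sampled
```
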
/deid/model/representer.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.layers import Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Masking
3 | from keras.models import Sequential
4 |
5 | from .layers import Noise
6 |
7 |
8 | def get(identifier):
9 | if identifier == 'noisy':
10 | return build_noise_representer
11 | elif identifier == 'dense':
12 | return build_dense_representer
13 | elif identifier == 'lstm':
14 | return build_lstm_representer
15 | else:
16 | raise ValueError(f'Unknown representation type: "{identifier}"')
17 |
18 |
19 | def build_noise_representer(embedding_size, representation_size, noises, single_stddev, apply_noise,
20 | l2_normalize=False, **_):
21 | """ Build a representer that applies a series of noise steps.
22 |
23 | :param embedding_size: the embedding (input) size
24 | :param representation_size: the representation (output) size
 25 |     :param noises: the types of noise to add if using the 'noisy' representation. Must be a single
 26 |     identifier or a sequence of identifiers; allowed identifiers are '+'/'add' and '*'/'mult'
27 | :param single_stddev: whether to use a single stddev for all embedding dimensions
28 | :param apply_noise: whether to apply noise or the mean in this model
29 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized)
30 | :return: a noisy representer model
31 | """
32 | if type(noises) == str:
33 | noises = [noises]
34 |
35 | model = Sequential(name='representer')
36 | model.add(Masking(input_shape=(None, embedding_size)))
37 | if l2_normalize:
38 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
39 | for i, noise_operation in enumerate(noises):
40 | model.add(Noise(noise_operation, apply_noise=apply_noise, single_stddev=single_stddev,
41 | input_shape=(None, embedding_size)))
42 |
43 | model.add(TimeDistributed(Dense(representation_size)))
44 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
45 | return model
46 |
47 |
48 | def build_dense_representer(embedding_size, representation_size, apply_noise, num_hidden=2, hidden_size=None,
49 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_):
50 | """ Build a dense representer that applies the same dense weights to each element in the input sequence.
51 |
52 | :param embedding_size: the embedding (input) size
53 | :param representation_size: the representation (output) size
54 | :param apply_noise: whether to apply noise or the mean in this model
55 | :param num_hidden: the number of hidden layers in the dense model
56 | :param hidden_size: the number of units per hidden layer in the dense model
57 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized)
58 | :param noise_before: whether to add noise with trainable stddev to the inputs
59 | :param noise_after: whether to add noise with trainable stddev to the outputs
60 | :param single_stddev: whether to use a single stddev for all embedding dimensions
61 | :param _: ignored kwargs
62 | :return: a dense representer model
63 | """
64 | if hidden_size is None:
65 | hidden_size = embedding_size
66 |
67 | model = Sequential(name='representer')
68 | model.add(Masking(input_shape=(None, embedding_size)))
69 | if l2_normalize:
70 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
71 | if noise_before:
72 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise))
73 |
74 | for _ in range(num_hidden):
75 | model.add(TimeDistributed(Dense(hidden_size, activation='relu')))
76 | model.add(TimeDistributed(Dense(representation_size)))
77 |
78 | if noise_after:
79 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise))
80 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
81 | return model
82 |
83 |
84 | def build_lstm_representer(embedding_size, representation_size, apply_noise, num_hidden=1, lstm_size=128,
85 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_):
86 | """ Build an LSTM representer.
87 |
88 | :param embedding_size: the embedding (input) size
89 | :param representation_size: the representation (output) size
90 | :param apply_noise: whether to apply noise or the mean in this model
91 | :param num_hidden: the number of LSTM layers
92 | :param lstm_size: the number of LSTM units per direction and layer
93 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized)
94 | :param noise_before: whether to add noise with trainable stddev to the inputs
95 | :param noise_after: whether to add noise with trainable stddev to the outputs
96 | :param single_stddev: whether to use a single stddev for all embedding dimensions
97 | :param _: ignored kwargs
98 | :return: an LSTM representer model
99 | """
100 | model = Sequential(name='representer')
101 | model.add(Masking(input_shape=(None, embedding_size)))
102 | if l2_normalize:
103 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
104 | if noise_before:
105 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise))
106 |
107 | for _ in range(num_hidden):
108 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True)))
109 | model.add(TimeDistributed(Dense(representation_size)))
110 |
111 | if noise_after:
112 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise))
113 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1)))
114 | return model
115 |
--------------------------------------------------------------------------------
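
A shape-check sketch for the representer builders above (not from the repository); the sizes are arbitrary example values, and running it needs the same Keras setup as the rest of the project:

```python
# Sketch only: build the 'dense' representer and push a dummy batch through it.
import numpy as np
from deid.model.representer import get as get_representer

representer = get_representer('dense')(embedding_size=300, representation_size=50,
                                       apply_noise=True, num_hidden=1, single_stddev=True)

dummy = np.random.normal(0., 0.05, size=(2, 7, 300))  # (batch, timesteps, embedding)
print(representer.predict(dummy).shape)               # (2, 7, 50), each timestep L2-normalized
```
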
/deid/tools/i2b2_xml_to_csv.py:
--------------------------------------------------------------------------------
  1 | # Call this using `python -m deid.tools.i2b2_xml_to_csv [params]`
2 |
3 | import argparse
4 | import csv
5 | import html
6 | import os
7 | from typing import Tuple, Sequence
8 |
9 | from bs4 import BeautifulSoup
10 | from tqdm import tqdm
11 |
12 | from ..data import Token, tokenize
13 |
14 |
15 | def xml_to_csv(filename: str, output_dir: str, check_alignment) -> None:
16 | tokens, text = xml_to_annotated_tokens_and_text(filename, check_alignment)
17 |
18 | path_without_ext = os.path.join(output_dir, os.path.basename(filename)[:-4])
19 |
20 | with open(path_without_ext + '.csv', 'w') as f:
21 | writer = csv.writer(f)
22 | writer.writerow(['text', 'type', 'start', 'end'])
23 | writer.writerows(tokens)
24 |
25 | with open(path_without_ext + '.txt', 'w') as f:
26 | f.write(text)
27 |
28 |
29 | def xml_to_annotated_tokens_and_text(filename, check_alignment) -> Tuple[Sequence[Token], str]:
30 | soup = BeautifulSoup(open(filename).read(), features='xml')
31 |
32 | text = str(soup.find('TEXT').contents[0])
33 | tags = soup.find('TAGS').findChildren()
34 |
35 | if check_alignment:
 36 |         # Sanity check: compare the tag text with the above text.
37 | # Ignoring differences where only a '\n' is missing from the tag text because this occurs often in the data
38 | # and does not seem to matter for us.
39 | for tag in tags:
40 | tag_text, original_text = tag.get('text'), text[int(tag.get('start')):int(tag.get('end'))]
41 | if tag_text != original_text and tag_text != original_text.replace('\n', ' '):
42 | location = f"{os.path.basename(filename)}[{tag.get('start')}:{tag.get('end')}]"
43 | tqdm.write(f"{location} (tag) {tag_text.__repr__()} ≠ (original) {original_text.__repr__()}")
44 |
45 | # TODO check here if the start and end tags actually fall on tokens
46 |
47 | doc = tokenize(text)
48 | return annotate_with_tags(doc, tags), text
49 |
50 |
51 | def annotate_with_tags(doc, tags) -> Sequence[Token]:
52 | def tag_start(i):
53 | return int(tags[i].get('start'))
54 |
55 | def tag_end(i):
56 | return int(tags[i].get('end'))
57 |
58 | def is_current_tag(token):
59 | if token.idx == tag_start(current_tag):
60 | return True
61 | if token.idx >= tag_start(current_tag) and token.idx + len(token.text) <= tag_end(current_tag):
62 | return True
63 | if token.idx < tag_start(current_tag) < token.idx + len(token.text):
64 | return True
65 | return False
66 |
67 | current_tag = 0
68 |
69 | result = []
70 | for sentence in doc.sents:
71 | continue_tag_type = None # set to the tag type string if the tag is not yet processed fully
72 | result.append(Token('', 'O', sentence[0].idx, sentence[0].idx))
73 |
74 | for token in sentence:
75 | if continue_tag_type and token.idx < tag_end(current_tag):
76 | tag = f'I-{continue_tag_type}'
77 | else:
78 | if token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1:
79 | current_tag += 1
80 |
81 | # make sure we did not skip an entire tag
82 | while token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1:
83 | print('Skipping a tag:', tags[current_tag].get('TYPE'), tags[current_tag].get('text'))
84 | current_tag += 1
85 |
86 | if is_current_tag(token):
87 | continue_tag_type = tags[current_tag].get('TYPE')
88 | tag = f'B-{continue_tag_type}'
89 | else:
90 | tag = 'O'
91 | continue_tag_type = None
92 |
93 | token_text = token._.unescaped_html if token._.unescaped_html is not None else token.text
94 | token_text = token_text.strip()
95 | if len(token_text) == 0 and tag.startswith('B'):
96 | continue_tag_type = None
97 |
98 | if len(token_text) != 0:
99 | result.append(Token(token_text, tag, token.idx, token.idx + len(token)))
100 |
101 | result.append(Token('', 'O', sentence[-1].idx, sentence[-1].idx))
102 |
103 | return result
104 |
105 |
106 | def main() -> None:
107 | def ensure_file_or_dir(arg) -> str:
108 | if type(arg) == str and (os.path.isfile(arg) or os.path.isdir(arg)):
109 | return arg
110 | raise argparse.ArgumentTypeError(f'{arg} is not a file or directory')
111 |
112 | def ensure_dir(arg) -> str:
113 | if type(arg) == str and os.path.isdir(arg):
114 | return arg
115 | raise argparse.ArgumentTypeError(f'{arg} is not a directory')
116 |
117 | parser = argparse.ArgumentParser()
118 | parser.add_argument('file_or_dir', help='the input file(s)', type=ensure_file_or_dir)
119 | parser.add_argument('output_dir', help='save the csv file(s) here', type=ensure_dir)
120 | parser.add_argument('--check', help='check the tag/text alignment', action='store_true')
121 | args = parser.parse_args()
122 |
123 | if os.path.isdir(args.file_or_dir):
124 | filenames = sorted([file for file in os.listdir(args.file_or_dir) if file.endswith('.xml')])
125 | if len(filenames) == 0:
126 | print('No XML files found.')
127 |
128 | pbar = tqdm(filenames)
129 | for filename in pbar:
130 | pbar.set_description(filename)
131 | path = os.path.join(args.file_or_dir, filename)
132 | xml_to_csv(path, output_dir=args.output_dir, check_alignment=args.check)
133 | else:
134 | xml_to_csv(args.file_or_dir, output_dir=args.output_dir, check_alignment=args.check)
135 |
136 |
137 | if __name__ == '__main__':
138 | main()
139 |
--------------------------------------------------------------------------------
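
A small end-to-end sketch of the annotation helpers above (not from the repository). The snippet, offsets, and tag types are made up, and it assumes the spaCy English model from the README is installed:

```python
# Sketch only: turn a tiny standoff-annotated snippet into BIO-tagged tokens.
from bs4 import BeautifulSoup
from deid.data import tokenize
from deid.tools.i2b2_xml_to_csv import annotate_with_tags

text = 'John Smith was admitted on 2010-01-02.'
xml = ('<TAGS>'
       '<NAME start="0" end="10" text="John Smith" TYPE="PATIENT"/>'
       '<DATE start="27" end="37" text="2010-01-02" TYPE="DATE"/>'
       '</TAGS>')
tags = BeautifulSoup(xml, features='xml').find('TAGS').findChildren()

for token in annotate_with_tags(tokenize(text), tags):
    print(token)  # includes empty boundary tokens plus e.g. ('John', 'B-PATIENT', 0, 4)
```
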
/deid/data/augment/augment.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import re
4 | from typing import Optional, Callable, Sequence, NamedTuple, Tuple, Dict, Union
5 |
6 | from spacy.lang.en.stop_words import STOP_WORDS
7 |
8 | from . import AugmentStrategy, get as get_strategy
9 | from .. import Token, Sentence
10 | from ...embeddings import Embeddings
11 |
12 | logger = logging.getLogger()
13 | digit_pattern = '^[0-9]*$'
14 |
15 |
16 | def default_exclude(word: str) -> bool:
17 | return word.lower() in STOP_WORDS or bool(re.match('^[.,:;/+\-*=\\\\]*$', word))
18 |
19 |
20 | def exclude_nothing(_: str) -> bool:
21 | return False
22 |
23 |
24 | class AugmentedSentence(NamedTuple):
25 | original: Sentence
26 | augmented: Sequence[Sentence]
27 |
28 |
29 | class Augment:
30 | def __init__(self, embeddings: Embeddings,
31 | strategy: Union[AugmentStrategy, str],
32 | digit_strategy: Optional[Union[AugmentStrategy, str]] = None,
33 | n_augmentations: int = 1,
34 | augment_all: bool = False,
35 | augment_max: Optional[int] = None,
36 | exclude_unknown: bool = True,
37 | exclude: Optional[Callable[[str], bool]] = default_exclude) -> None:
38 | self.embeddings = embeddings
39 | self.augment_all = augment_all
40 | self.exclude_unknown = exclude_unknown
41 | if isinstance(strategy, str):
42 | self.strategy = get_strategy(strategy)
43 | else:
44 | self.strategy = strategy
45 |
46 | if digit_strategy is None:
47 | self.digit_strategy = self.strategy
48 | elif isinstance(digit_strategy, str):
49 | self.digit_strategy = get_strategy(digit_strategy)
50 | else:
51 | self.digit_strategy = digit_strategy
52 |
53 | self.n_augmentations = n_augmentations
54 | self.augment_max = augment_max if augment_max is not None else 10_000
55 | self.exclude = exclude if exclude is not None else exclude_nothing
56 |
57 | def __str__(self) -> str:
 58 |         return f'<{self.__class__.__name__} strategy={self.strategy}, digit_strategy={self.digit_strategy}, ' \
 59 |                f'n_augmentations={self.n_augmentations}, augment_max={self.augment_max}, ' \
 60 |                f'augment_all={self.augment_all}>'
 61 |
62 | def _strategy_or_digit_strategy(self, word: str) -> AugmentStrategy:
63 | if re.match(digit_pattern, word):
64 | return self.digit_strategy
65 | return self.strategy
66 |
67 | def _should_be_excluded(self, word, label):
68 | exclude_because_o = not self.augment_all and label == 'O'
69 | exclude_because_unknown = self.exclude_unknown and self.embeddings.is_unknown(word)
70 | return self.exclude(word) or exclude_because_o or exclude_because_unknown
71 |
72 | def lookup_sentence(self, sentence: Sequence[Token]) -> AugmentedSentence:
73 | """ If the sentence is only O, just look it up. Otherwise:
74 | - apply the word strategies and keep track of the embedding strategies that need to be applied later
75 | - look up the sentence
76 | - apply the embedding strategies
77 |
78 | :param sentence: the input sentence
79 | :return: an AugmentedSentence object
80 | """
81 | original = self.embeddings.lookup_sentence([token.text for token in sentence])
82 | if not self.augment_all and all([token.type == 'O' for token in sentence]):
83 | return AugmentedSentence(original, [])
84 |
85 | apply_word_strategies_result = [self.apply_word_strategies(sentence) for _ in range(self.n_augmentations)]
86 | augment_embeddings, sentences_for_lookup = zip(*apply_word_strategies_result)
87 | embedded_sentences = self.embeddings.lookup_sentences(sentences_for_lookup)
88 | augmented = [self.apply_embedding_strategies(augment_embedding, embedded_sent) for
89 | augment_embedding, embedded_sent in zip(augment_embeddings, embedded_sentences)]
90 | return AugmentedSentence(original, augmented)
91 |
92 | def apply_embedding_strategies(self, augment_embedding: Dict[int, AugmentStrategy],
93 | sentence_embeddings: Sentence) -> Sentence:
94 | sentence_embeddings = list(sentence_embeddings)
95 | for i, strategy in augment_embedding.items():
96 | augmented = strategy.augment(sentence_embeddings[i])
97 | assert len(augmented) == self.embeddings.size
98 | sentence_embeddings[i] = augmented
99 | return sentence_embeddings
100 |
101 | def apply_word_strategies(self, sentence: Sequence[Token]) -> Tuple[Dict[int, AugmentStrategy], Sequence[str]]:
102 | sentence_for_lookup = []
103 | augment_embedding = {}
104 |
105 | augment_word_ind = []
106 | for i, token in enumerate(sentence):
107 | word, label = token.text, token.type
108 | if not self._should_be_excluded(word, label):
109 | strategy = self._strategy_or_digit_strategy(word)
110 | if strategy.augments_words:
111 | augment_word_ind.append(i)
112 | else:
113 | augment_embedding[i] = strategy
114 | logger.info('deferring strategy %s to augment "%s"', strategy, word)
115 |
116 | if len(augment_word_ind) > self.augment_max:
117 | augment_word_ind = random.sample(augment_word_ind, self.augment_max)
118 |
119 | for i, token in enumerate(sentence):
120 | word, label = token.text, token.type
121 | if i in augment_word_ind:
122 | strategy = self._strategy_or_digit_strategy(word)
123 | augmented = strategy.augment(word)
124 | logger.info('using strategy %s to augment "%s" to "%s"', strategy, word, augmented)
125 | sentence_for_lookup.append(augmented)
126 | else:
127 | sentence_for_lookup.append(word)
128 |
129 | return augment_embedding, sentence_for_lookup
130 |
--------------------------------------------------------------------------------
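
A self-contained sketch of the `Augment` pipeline above (not from the repository). The stand-in embeddings class and the token values are made up for illustration; real experiments use the embeddings classes from `deid/embeddings`:

```python
# Sketch only: augment a sentence's PHI token with an embedding-level strategy.
import numpy as np
from deid.data import Token
from deid.data.augment.augment import Augment
from deid.data.augment.strategy import AdditiveNoise

class RandomEmbeddings:  # hypothetical stand-in with the interface Augment expects
    size = 50
    def is_unknown(self, word): return False
    def lookup_sentence(self, words):
        return [np.random.normal(0., 0.05, self.size) for _ in words]
    def lookup_sentences(self, sentences):
        return [self.lookup_sentence(words) for words in sentences]

sentence = [Token('John', 'B-NAME', 0, 4), Token('was', 'O', 5, 8), Token('admitted', 'O', 9, 17)]
augment = Augment(RandomEmbeddings(), strategy=AdditiveNoise(scale=0.1), n_augmentations=2)
result = augment.lookup_sentence(sentence)
print(len(result.augmented))  # 2 augmented copies; only the non-'O' token is perturbed
```
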
/deid/experiment/evaluation.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import os
3 | from collections import OrderedDict
4 |
5 | import numpy as np
6 | from keras.callbacks import Callback
7 | from keras.utils.generic_utils import Progbar
8 | from terminaltables import SingleTable as TerminalTable
9 |
10 | from ..data import TestSet, prediction_to_xml
11 | from ..env import env
12 | from ..tools.i2b2.classes import PHITrackEvaluation, Evaluate
13 | from ..tools.i2b2.evaluate import evaluate as i2b2_evaluate
14 |
15 |
16 | def _save_predictions_to_xmls(model, batch_size, embeddings, label2ind, ind2label, test_set, predictions_dir,
17 | binary_classification, hipaa_only, extra_features, require_argmax):
18 | if not os.path.isdir(predictions_dir):
19 | os.mkdir(predictions_dir)
20 |
21 | print('Saving test XMLs to', predictions_dir)
22 | progress_bar = Progbar(target=TestSet.number_of_test_sets(test_set), verbose=env.keras_verbose)
23 |
24 | for i, te in enumerate(TestSet.test_sets(embeddings,
25 | test_set=test_set,
26 | label2ind=label2ind,
27 | binary_classification=binary_classification,
28 | hipaa_only=hipaa_only,
29 | extra_features=extra_features), start=1):
30 | preds = model.predict([te.X, te.X_extra], batch_size=batch_size)
31 | if require_argmax:
32 | preds = np.argmax(preds, axis=-1)
33 | xml = prediction_to_xml(te.X, preds, te.text, te.sents, ind2label)
34 | filename = os.path.basename(te.filename)[:-4] + '.xml'
35 | with open(os.path.join(predictions_dir, filename), 'w') as f:
36 | f.write(xml)
37 |
38 | progress_bar.update(i)
39 |
40 |
41 | def _run_official_evaluation(predictions_dir, test_set, output_file, binary_classification=False, hipaa_only=False,
42 | print_summary=True):
43 | xml_test_dir = os.path.join(env.data_dir, test_set + '_xml')
44 |
45 | def call_i2b2_evaluate():
46 | return i2b2_evaluate([predictions_dir], xml_test_dir, PHITrackEvaluation, verbose=False)
47 |
48 | if output_file is not None:
49 | with open(output_file, 'w') as f:
50 | with contextlib.redirect_stdout(f):
51 | evaluations = call_i2b2_evaluate()
52 | else:
53 | evaluations = call_i2b2_evaluate()
54 |
55 | result = OrderedDict()
56 | for evaluation in evaluations.evaluations:
57 | mp = evaluation.micro_precision()
58 | mr = evaluation.micro_recall()
59 | f1 = Evaluate.F_beta(mr, mp)
60 | result[evaluation.sys_id] = {'precision': mp, 'recall': mr, 'f1': f1}
61 |
62 | if print_summary:
63 | print('Evaluating', predictions_dir, xml_test_dir)
64 | print('Evaluation summary:')
65 | table_data = [['Evaluation', 'Precision', 'Recall', 'F1 (micro)']]
66 | for name, values in result.items():
67 | if binary_classification and 'Binary' not in name:
68 | continue
69 | if hipaa_only and 'HIPAA' not in name:
70 | continue
71 | if binary_classification and not hipaa_only and 'HIPAA' in name:
72 | continue # evaluation is wrong for these because all tags get mapped to a HIPAA (name) tag
73 | table_data.append([name] + [round(values[key], 5) for key in ['precision', 'recall', 'f1']])
74 |
75 | table = TerminalTable(table_data)
76 | print(table.table)
77 | print(f'(see complete evaluation at {output_file})')
78 |
79 | return result
80 |
81 |
82 | def evaluate_deid_performance(model, batch_size, embeddings, label2ind, ind2label, experiment_dir, epoch=1,
83 | test_set='validation', binary_classification=False,
84 | hipaa_only=False, extra_features=(), require_argmax=True):
85 | predictions_dir = os.path.join(experiment_dir, f'predictions_epoch_{epoch:02d}')
86 | _save_predictions_to_xmls(model=model, batch_size=batch_size, embeddings=embeddings, label2ind=label2ind,
87 | ind2label=ind2label, test_set=test_set, predictions_dir=predictions_dir,
88 | binary_classification=binary_classification, hipaa_only=hipaa_only,
89 | extra_features=extra_features, require_argmax=require_argmax)
90 |
91 | output_file = predictions_dir + '.txt'
92 | return _run_official_evaluation(predictions_dir=predictions_dir, test_set=test_set, output_file=output_file,
93 | print_summary=True, binary_classification=binary_classification,
94 | hipaa_only=hipaa_only)
95 |
96 |
97 | class DeidentificationEvaluationCallback(Callback):
98 | def __init__(self, deid_model, batch_size, embeddings, label2ind, ind2label, test_set, experiment_dir,
99 | evaluate_every, binary_classification, hipaa_only, extra_features, call_model=False):
100 | super().__init__()
101 | self.deid_model = deid_model
102 | self.batch_size = batch_size
103 | self.embeddings = embeddings
104 | self.label2ind = label2ind
105 | self.ind2label = ind2label
106 | self.test_set = test_set
107 | self.experiment_dir = experiment_dir
108 | self.evaluate_every = evaluate_every
109 | self.binary_classification = binary_classification
110 | self.hipaa_only = hipaa_only
111 | self.extra_features = extra_features
112 | self.call_model = call_model
113 |
114 | def on_epoch_end(self, epoch, logs=None):
115 | deid_model = self.deid_model() if self.call_model else self.deid_model
116 | epoch = epoch + 1 # keras uses 0-indexed epochs
117 | if epoch % self.evaluate_every == 0:
118 | evaluate_deid_performance(model=deid_model, batch_size=self.batch_size, embeddings=self.embeddings,
119 | label2ind=self.label2ind, ind2label=self.ind2label, epoch=epoch,
120 | test_set=self.test_set, experiment_dir=self.experiment_dir,
121 | binary_classification=self.binary_classification,
122 | hipaa_only=self.hipaa_only,
123 | extra_features=self.extra_features)
124 |
--------------------------------------------------------------------------------
/deid/model/adversarial.py:
--------------------------------------------------------------------------------
1 | from types import MappingProxyType
2 |
3 | from keras import backend as K
4 | from keras.layers import Input, Lambda, concatenate
5 | from keras.losses import binary_crossentropy
6 | from keras.models import Model
7 |
8 | from . import get as get_deidentifier
9 | from .adversary import get as get_adversary
10 | from .optimizer import get as get_optimizer
11 | from .representer import get as get_representer
12 |
13 |
14 | class AdversarialModel:
15 | def __init__(self,
16 | *_, # don't allow any positional arguments
17 | embedding_size,
18 | output_size,
19 | representation_size=None,
20 | representation_type='lstm',
21 | representation_args=MappingProxyType({}),
22 | deidentifier_type='lstm',
23 | deidentifier_args=MappingProxyType({}),
24 | extra_input_size=0,
25 | adversaries=('discriminate-representations', 'discriminate-representation-embedding-pair'),
26 | adversary_args=MappingProxyType({}),
27 | optimizer='adam',
28 | optimizer_args=MappingProxyType({})):
29 | """ Initialize the adversarial model. It's components are
30 | - a representation model that transforms embeddings into a (noisy) representation
31 | - a deidentifier model that performs the deidentification task from the representation
32 | - an adversary model that tries to reconstruct information from the representation
33 |
34 | :param embedding_size: the representation input size
35 | :param output_size: the deidentifier output size
36 | :param representation_size: the representation size (or None to use the embedding size)
37 | :param representation_type: the type of representation model to use (see representer.py)
38 | :param representation_args: the kwargs for the representation model
39 | :param deidentifier_type: the type of deidentifier model to use (see deidentifier.py)
40 | :param deidentifier_args: the kwargs for the deidentifier model
41 | :param adversaries: a sequence of adversary type strings (see adversary.py)
42 | :param adversary_args: a dictionary of adversary args or a list of dictionaries (if every adversary should get
43 | its own args)
44 | :param optimizer: the type of optimizer to use (see optimizer.py)
45 | :param optimizer_args: the args passed to the optimizer
46 | """
47 |
48 | if representation_size is None:
49 | representation_size = embedding_size
50 |
51 | original_embeddings = Input(shape=(None, embedding_size))
52 |
53 | build_representer = get_representer(representation_type)
54 | self.train_representer = build_representer(embedding_size=embedding_size,
55 | representation_size=representation_size,
56 | apply_noise=True,
57 | **representation_args)
58 |
59 | train_representation = self.train_representer(original_embeddings)
60 |
61 | deidentifier, deidentifier_loss = get_deidentifier(deidentifier_type)(
62 | name='deidentifier',
63 | input_size=representation_size,
64 | output_size=output_size,
65 | extra_input_size=extra_input_size,
66 | **deidentifier_args)
67 |
68 | extra_input = Input(shape=(None, extra_input_size))
69 | if extra_input_size > 0:
70 | train_deidentifier_input = [train_representation, extra_input]
71 | else:
72 | train_deidentifier_input = train_representation
73 |
74 | train_deidentifier_output = deidentifier(train_deidentifier_input)
75 | self.pretrain_deidentifier = Model([original_embeddings, extra_input], train_deidentifier_output)
76 | self.pretrain_deidentifier.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), loss=deidentifier_loss,
77 | metrics=['accuracy'])
78 |
79 | self.train_representer.trainable = False
80 |
81 | adv_embeddings = Input(shape=(None, embedding_size))
82 | adv_representation = self.train_representer(adv_embeddings)
83 |
84 | adv_fake_embeddings = Input(shape=(None, embedding_size))
85 | adv_fake_representation = self.train_representer(adv_fake_embeddings)
86 |
87 | adversary_models = []
88 | adversary_outputs = []
89 | if isinstance(adversary_args, dict):
90 | adversary_args = [adversary_args for _ in adversaries]
91 |
92 | for adversary_type, args in zip(adversaries, adversary_args):
93 | adversary = get_adversary(adversary_type)(inputs={'train_representation': adv_representation,
94 | 'original_embeddings': adv_embeddings,
95 | 'fake_representation': adv_fake_representation},
96 | representation_size=representation_size,
97 | embedding_size=embedding_size,
98 | **args)
99 | adversary_models.append(adversary.model)
100 | adversary_outputs.append(adversary.model(adversary.inputs))
101 | adversary.model.summary()
102 | if len(adversary_outputs) > 1:
103 | adversary_output = concatenate(adversary_outputs, axis=-1)
104 | else:
105 | adversary_output = adversary_outputs[0]
106 | adversary_output = Lambda(lambda x: K.mean(x, axis=-1, keepdims=True), name='adversary')(adversary_output)
107 |
108 | self.pretrain_adversary = Model([adv_embeddings, adv_fake_embeddings], adversary_output)
109 | self.pretrain_adversary.summary()
110 | self.pretrain_adversary.compile(optimizer=get_optimizer(optimizer)(**optimizer_args),
111 | loss='binary_crossentropy',
112 | metrics=['accuracy'])
113 |
114 | self.fine_tune_branches = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings],
115 | [train_deidentifier_output, adversary_output])
116 | self.fine_tune_branches.compile(optimizer=get_optimizer(optimizer)(**optimizer_args),
117 | loss=[deidentifier_loss, 'binary_crossentropy'],
118 | metrics=['accuracy'])
119 |
120 | self.train_representer.trainable = True
121 | deidentifier.trainable = False
122 | for adversary in adversary_models:
123 | adversary.trainable = False
124 | self.fine_tune_representer = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings],
125 | [train_deidentifier_output, adversary_output])
126 | self.fine_tune_representer.compile(optimizer=get_optimizer(optimizer)(**optimizer_args),
127 | loss=[deidentifier_loss, adversarial_objective],
128 | loss_weights=[1, 1], metrics=['accuracy'])
129 |
130 | @property
131 | def complete_model(self):
132 | return self.fine_tune_branches
133 |
134 |
135 | def adversarial_objective(y_true, y_pred):
136 | loss = binary_crossentropy(y_true, y_pred)
137 | random_guessing = -K.log(0.5)
138 | return K.abs(loss - random_guessing)
139 |
--------------------------------------------------------------------------------
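
The `adversarial_objective` above does not simply maximize the adversary's loss; it pulls the loss toward that of random guessing. A small NumPy illustration of this behavior (not the repository's Keras code):

```python
# Sketch only: the objective is zero exactly when the adversary is at chance
# (binary cross-entropy == -log(0.5)) and grows as it gets better or worse.
import numpy as np

def adversarial_objective(y_true, y_pred):
    bce = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return abs(bce - np.log(2))  # -log(0.5) == log(2)

y_true = np.array([1., 0., 1., 0.])
print(adversarial_objective(y_true, np.full(4, 0.5)))             # 0.0
print(adversarial_objective(y_true, np.array([.9, .1, .9, .1])))  # ~0.59, adversary too accurate
```
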
/README.md:
--------------------------------------------------------------------------------
1 | # Sharing Training Data for De-Identification
2 |
3 | [](https://www.travis-ci.org/maxfriedrich/deid-training-data)
4 |
5 | **Update 2019-08-11:** Our paper ["Adversarial Learning of Privacy-Preserving Text Representations for De-Identification of Medical Records"](https://www.aclweb.org/anthology/papers/P/P19/P19-1584/) was published at ACL 2019.
6 |
7 | This is the code for my [Master's thesis](https://www.inf.uni-hamburg.de/en/inst/ab/lt/teaching/theses/completed-theses/2018-ma-friedrich.pdf). It's about automatic transformations that can be applied to medical text data that…
8 |
9 | - allow training a de-identification model (i.e. finding all protected information in text)
10 | - do not allow attackers to infer any protected information.
11 |
12 | ## Main Contribution
13 |
14 | An adversarial deep learning architecture that learns a private representation of medical text. The representation model is an LSTM model that adds Gaussian noise of a trainable scale to its inputs and outputs.
15 |
 16 | ![Adversarial architecture](architecture.png)
17 |
18 | The representation fulfills two invariance criteria that are both enforced by binary classifier LSTM adversary models that receive sequence pairs as inputs.
19 |
20 | Left: Representations should be invariant to *any* protected information token being replaced with a neighbor in an embedding space (e.g. substituting a name or date).
21 |
22 | Right: Looking up the same token sequence multiple times should result in a representation that is randomly different by a high enough degree that it could be the representation of a neighboring sequence.
23 |
 24 | ![First adversary](adversary1.png) ![Second adversary](adversary2.png)
25 |
26 | ## Installation
27 |
28 | - Checkout the repository including submodules. If you're doing a new clone:
29 |
30 | ```bash
31 | git clone --recurse-submodules git@github.com:maxfriedrich/deid-training-data.git
32 | ```
33 |
34 | - Or, if you already cloned the repository:
35 |
36 | ```bash
37 | git submodule update --init
38 | ```
39 |
 40 | - Create a Conda environment for the project. If you want the environment name to be something other than `deid-training-data` or want to use `tensorflow-gpu` instead of `tensorflow`, adapt the `environment.yml` file before running this command. Then activate the environment.
41 |
42 | ```bash
43 | cd deid-training-data
44 | conda env create
45 | conda activate deid-training-data
46 | ```
47 |
48 | - Download the English language model for spaCy:
49 |
50 | ```bash
51 | python -m spacy download en
52 | ```
53 |
54 | - Verify that the environment is working by running the tests:
55 |
56 | ```bash
57 | DEID_TEST_CONFIG=1 nosetests --with-doctest
58 | ```
59 |
60 | - Adapt the [environment file](deid/env.py).
61 |
 62 | - Decide which embeddings you want to use:
63 |
 64 |   - For **FastText**, get a [fastText embeddings binary](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.bin.zip) (4.5 GB download) as well as the [corresponding `.vec` file of precomputed embeddings](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip) (590 MB download) and put them in the resources directory. Adapt the path [here](deid/embeddings/fasttext.py) if necessary. Then convert the precomputed fastText embeddings to a `{word: ind}` dictionary and numpy matrix file:
65 |
66 | ```bash
67 | python -m deid.tools.embeddings --fasttext-precomputed
68 | ```
69 |
70 | - For **GloVe**, download [a set of pre-trained word vectors](https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors) and put it into the resources directory. Adapt the path and dimension [here](deid/embeddings/glove.py) if you're not using the Wikipedia-pretrained 300d embeddings.
71 |
72 | - For **ELMo**, you don't need to download anything.
73 |
74 | - Get the [i2b2 data](https://www.i2b2.org/NLP/DataSets/) and extract `training-PHI-Gold-Set1` into `train_xml`, `training-PHI-Gold-Set2` into `validation_xml`, and `testing-PHI-Gold-fixed` into a `test_xml` directory.
75 |
76 | - Fix one of the xml files where indices are offset after a special character:
77 |
78 | ```bash
79 | python -m deid.tools.fix_180-03 /path/to/validation_xml
80 | ```
81 |
82 | - Convert the xml files with standoff annotations to an IOB2 format csv and a txt file containing the raw text:
83 |
84 | ```bash
85 | ./scripts/xml_to_csv
86 | ```
87 |
 88 |   The `xml_to_csv` script calls the `deid.tools.i2b2_xml_to_csv` module with the `train_xml`, `validation_xml` and `test_xml` directories. It will report some inconsistencies in the data (standoff annotation texts that differ from the original text), but we'll ignore those for now.
89 |
90 | - Create an embeddings cache, again depending on your choice(s) of embeddings:
91 |
92 | - For **FastText**, this command writes all words from the train, test, and validation set to a pickle cache (5 minutes on my machine).
93 |
94 | ```bash
95 | python -m deid.tools.embeddings --fasttext-cache
96 | ```
97 |
98 | - For **ELMo**, this command looks up all sentences from the train, test, and validation set and writes them to many pickle files. This is slow, taking up to 3 hours.
99 |
100 | ```bash
101 | python -m deid.tools.embeddings --elmo-cache
102 | ```
103 |
104 | ## Experiments
105 |
106 | You can find these experiments in the [`deid/experiment`](deid/experiment) directory:
107 |
108 | - A [basic experiment](deid/experiment/basic.py) that can be used for training models on raw as well as augmented data
109 | - An implementation of [alternating adversarial training](deid/experiment/alternating.py) similar to [Feutry et al. (2018)](https://arxiv.org/abs/1802.09386)
110 | - Evaluation experiments for [automatic pseudonymization](deid/experiment/mtn_evaluation.py), discriminating [real from automatically pseudonymized sentences](deid/experiment/fake_sentences.py), and the [alternating training](deid/experiment/alternating_evaluation.py)
111 |
112 | To run an experiment:
113 |
114 | - Modify the [example config template](deid/config_template.yaml.example) and rename it to `.yaml`. Generate configs from it using the `config` tool:
115 |
116 | ```bash
117 | python -m deid.tools.config /path/to/config_template.yaml
118 | ```
119 |
120 | Specify the number of configs with the `-n` option. For a grid search instead of random samples, use the `-a` option (careful, this might generate thousands of configs depending on the hyperparameter space!).
121 |
122 | - Run a single experiment from a config:
123 |
124 | ```bash
125 | python -m deid.experiment /path/to/config.yaml
126 | ```
127 |
128 | This will output predictions and save a history pickle to an experiment directory inside `env.work_dir`.
129 |
130 | - Or set the `DEID_CONFIG_DIR` variable to the config directory and use the `queue` script to run all experiment configs from the `${DEID_CONFIG_DIR}/todo` directory (they will be processed sequentially and moved to the `${DEID_CONFIG_DIR}/done` directory).
131 |
132 | ```bash
133 | DEID_CONFIG_DIR=/path/to/config/dir ./scripts/queue
134 | ```
135 |
136 | ## Evaluation
137 |
138 | The evaluation uses a [modified version](deid/tools/i2b2) (`2to3`, minor fixes) of the [official evaluation script](https://github.com/kotfic/i2b2_evaluation_scripts) and is run automatically in the experiments. You can also call it directly to evaluate a directory of XML predictions:
139 |
140 | ```bash
141 | python -m deid.tools.i2b2.evaluate phi /path/to/predictions /path/to/i2b2_data/validation_xml/
142 | ```
143 |
--------------------------------------------------------------------------------
/deid/embeddings/matrix.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, List, Tuple, NamedTuple, Union, Optional, Dict
2 |
3 | import numpy as np
4 | from tqdm import tqdm
5 |
6 | from . import Embeddings
7 |
8 |
9 | class EmbeddingSimilarity(NamedTuple):
10 | rank: int
11 | word: str
12 | similarity: float
13 | vec: np.ndarray
14 |
15 |
16 | MostSimilarResult = List[EmbeddingSimilarity]
17 | WordOrVec = Union[str, np.ndarray]
18 |
19 |
20 | class Matrix:
21 | """ Transforms a lookup-based Embeddings object into a classical embedding matrix by looking up a fixed vocabulary
22 | and storing the results. The matrix can then be used for distance measuring.
23 | """
24 |
25 | def __init__(self, lookup_embeddings: Embeddings, vocab: Optional[Iterable[str]] = None,
26 | precomputed_word2ind: Optional[Dict[str, int]] = None, precomputed_matrix: Optional[np.ndarray] = None,
27 | verbose: bool = False) -> None:
28 | """ Initialize the Matrix object.
29 |
30 | :param lookup_embeddings: the embeddings object used for lookup
31 | :param vocab: an iterable containing the words that should be stored in the matrix
32 | :param precomputed_word2ind: a precomputed word2ind dict, e.g. from the fastText .vec file
33 | :param precomputed_matrix: a precomputed embedding matrix, e.g. from the fastText .vec file
34 | :param verbose: setting this to True will show a progress bar when first looking up embeddings as well as output
35 | means when computing distances
36 | """
37 | self.verbose = verbose
38 | self.lookup_embeddings = lookup_embeddings
39 |
40 | if vocab is not None:
41 | self._init_from_vocab(lookup_embeddings, vocab=vocab)
42 | elif precomputed_word2ind is not None and precomputed_matrix is not None:
43 | self._init_from_word2ind_and_matrix(precomputed_word2ind, precomputed_matrix)
44 | else:
45 | raise ValueError('The Matrix needs to be initialized either with vocab or word2ind+matrix')
46 |
47 | def _init_from_vocab(self, lookup_embeddings, vocab):
48 | vocab = set(vocab)
49 | self.vocab_size = len(vocab)
50 | self.word2ind = {word: i for i, word in enumerate(vocab)}
51 | self.ind2word = {i: word for i, word in enumerate(vocab)}
52 | self.embedding_matrix = np.zeros((self.vocab_size, lookup_embeddings.size))
53 | self.is_norm = False
54 |
55 | items: Iterable[Tuple[str, int]] = self.word2ind.items()
56 | if self.verbose:
57 | items = tqdm(items, desc='Looking up embeddings')
58 | for word, ind in items:
59 | looked_up = lookup_embeddings.lookup(word)
60 | if np.count_nonzero(looked_up) > 0:
61 | self.embedding_matrix[ind] = looked_up
62 | else:
63 | # this shouldn't happen anymore
64 | raise RuntimeError(f'Embedding vector for {word} is all zeros')
65 |
66 | def _init_from_word2ind_and_matrix(self, word2ind, matrix):
67 | self.vocab_size = len(word2ind)
68 | self.word2ind = word2ind
69 | self.ind2word = {i: word for word, i in self.word2ind.items()}
70 | self.embedding_matrix = matrix
71 | self.is_norm = True
72 |
73 | def init_norms(self, force: bool = False) -> None:
74 | """ Initializes self.norms with pre-computed L2 normalized vectors for cosine distance computation.
75 |
76 | :param force: setting this to True will update the norms even if they were already computed
77 | :return: None
78 | """
79 | if not self.is_norm or force:
80 | # noinspection PyAttributeOutsideInit
81 | self.embedding_matrix = self.embedding_matrix / np.sqrt((self.embedding_matrix ** 2).sum(-1))[
82 | ..., np.newaxis]
83 | self.is_norm = True
84 |
85 | def _most_similar_cosine_measurement(self, vec):
86 | self.init_norms()
87 | normalized_vec = vec / np.linalg.norm(vec)
88 | return np.dot(self.embedding_matrix, normalized_vec)
89 |
90 | def most_similar_cosine(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult:
91 | """ Calculate the cosine distance of the input vector to all vectors in the embedding matrix and return the
92 | most similar ones.
93 |
94 | :param word_or_vec: the input word or vector
95 | :param n: the number of results to return, or None if all should be returned
96 |         :return: a list of EmbeddingSimilarity results, most similar first
97 | """
98 | return self._generic_most_similar(word_or_vec, self._most_similar_cosine_measurement,
99 | higher_is_more_similar=True, n=n)
100 |
101 | def cosine_distance_rank(self, word_or_vec: WordOrVec, word):
102 | return self._generic_rank(word_or_vec, word, self._most_similar_cosine_measurement, higher_is_more_similar=True)
103 |
104 | def cosine_distance(self, vec: np.ndarray, word: str) -> float:
105 |         """ Returns the cosine similarity between an input vector and a word in the matrix.
106 |
107 | :param vec: the input vector
108 | :param word: the input word
109 | :return: a float between -1 and 1
110 | """
111 | self.init_norms()
112 | normalized_vec = vec / np.linalg.norm(vec)
113 | return float(np.dot(self.embedding_matrix[self.word2ind[word]], normalized_vec))
114 |
115 | def most_similar_l2(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult:
116 | """ Calculate the L2 norm distance of the input vector to all vectors in the embedding matrix and return the
117 | most similar ones.
118 |
119 | :param word_or_vec: the input word or vector
120 | :param n: the number of results to return, or None if all should be returned
121 |         :return: a list of EmbeddingSimilarity results, with lower distance meaning more similar
122 | """
123 |
124 | def measurement(vec):
125 | distances = np.zeros(self.vocab_size)
126 | for i, emb in enumerate(self.embedding_matrix):
127 | distances[i] = np.linalg.norm(vec - emb)
128 | return distances
129 |
130 | return self._generic_most_similar(word_or_vec, measurement, higher_is_more_similar=False, n=n)
131 |
132 | def _lookup_if_needed(self, word_or_vec: WordOrVec) -> np.ndarray:
133 |         if isinstance(word_or_vec, str):
134 | return self.lookup_embeddings.lookup(word_or_vec)
135 | else:
136 | return word_or_vec
137 |
138 | def _generic_most_similar(self, word_or_vec: WordOrVec, measurement, higher_is_more_similar, n: int = 20):
139 | self.init_norms()
140 | vec = self._lookup_if_needed(word_or_vec)
141 | distances = measurement(vec)
142 | assert len(distances) == len(self.embedding_matrix)
143 | if self.verbose:
144 | print('mean distance', np.mean(distances))
145 |
146 | distances_for_sorting = -distances if higher_is_more_similar else distances
147 |
148 | if n is None or n == len(self.embedding_matrix):
149 | sorted_most_similar_ind = np.argsort(distances_for_sorting)
150 | else:
151 | most_similar_ind = np.argpartition(distances_for_sorting, n)[:n]
152 | sorted_most_similar_ind = most_similar_ind[np.argsort(distances_for_sorting[most_similar_ind])]
153 |
154 | return [EmbeddingSimilarity(rank=rank,
155 | word=self.ind2word[ind],
156 | similarity=distances[ind],
157 | vec=self.embedding_matrix[ind])
158 | for rank, ind in enumerate(sorted_most_similar_ind, start=1)]
159 |
160 | def _generic_rank(self, word_or_vec: WordOrVec, word, measurement, higher_is_more_similar):
161 | self.init_norms()
162 | vec = self._lookup_if_needed(word_or_vec)
163 | distances = measurement(vec)
164 | distances = -distances if higher_is_more_similar else distances
165 |
166 | word_distance = distances[self.word2ind[word]]
167 |         return np.count_nonzero(distances < word_distance) + 1  # rank = 1 + number of strictly closer words
168 |
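A minimal usage sketch for the `Matrix` class above, assuming the `deid` package and its dependencies are importable. `ToyEmbeddings` is a hypothetical stand-in for a lookup-based `Embeddings` object; the class is duck-typed, so anything exposing `size` and `lookup` works:

```python
import numpy as np

from deid.embeddings import Matrix


class ToyEmbeddings:
    """Hypothetical stand-in for a lookup-based Embeddings object."""
    size = 4
    _vectors = {
        'aspirin': np.array([0.9, 0.1, 0.0, 0.0]),
        'ibuprofen': np.array([0.8, 0.2, 0.0, 0.0]),
        'boston': np.array([0.0, 0.0, 1.0, 0.2]),
    }

    def lookup(self, word):
        return self._vectors[word]


# Build the matrix from a fixed vocabulary, then query neighbors by cosine similarity.
matrix = Matrix(ToyEmbeddings(), vocab=['aspirin', 'ibuprofen', 'boston'])
for result in matrix.most_similar_cosine('aspirin', n=2):
    print(result.rank, result.word, round(float(result.similarity), 3))

# Rank of 'boston' among all words when sorted by similarity to the 'aspirin' vector.
print(matrix.cosine_distance_rank('aspirin', 'boston'))
```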
--------------------------------------------------------------------------------
/deid/experiment/mtn_evaluation.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import pickle
4 | import random
5 | import sys
6 |
7 | import numpy as np
8 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint
9 | from keras.layers import Input
10 |
11 | from . import experiment_directory
12 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence
13 | from ..data.augment import Augment, get as get_strategy
14 | from ..data.batch import IteratorWithEpochLength
15 | from ..data.util import pad_2d_sequences
16 | from ..embeddings import Matrix, get as get_embeddings
17 | from ..env import env
18 | from ..model.adversary import TwoRepresentationsAreSameOriginalDiscriminator
19 |
20 |
21 | def fake_augmented_sentences_batch(X, y, indices, augm_alternatives, fake_alternatives, split_condition):
22 | indices = [i for i in indices if split_condition(X[i], y[i])]
23 | real_sentences = [X[i] for i in indices]
24 | augmented_sentences = [augm_alternatives[ind][0] for ind in indices]
25 | fake_sentences = [random.choice(fake_alternatives[ind]) for ind in indices]
26 |
27 | X_1 = []
28 | X_2 = []
29 | y = []
30 | for real, augm, fake in zip(real_sentences, augmented_sentences, fake_sentences):
31 | X_1 += [augm, augm]
32 | X_2 += [real, fake]
33 | y += [1, 0]
34 |
35 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y)
36 |
37 |
38 | class MTNGenerator(IteratorWithEpochLength):
39 | def __init__(self, generator: IteratorWithEpochLength, dataset, dataset2):
40 | self.generator = generator
41 | self.dataset = dataset
42 | self.dataset2 = dataset2
43 |
44 | def __next__(self):
45 | _, _, indices = next(self.generator)
46 | X_1, X_2, adv_y = fake_augmented_sentences_batch(self.dataset.X, self.dataset.y, indices,
47 | self.dataset.augmented, self.dataset2.augmented,
48 | split_condition=is_phi_sentence)
49 | return [X_1, X_2], adv_y
50 |
51 | @property
52 | def epoch_length(self) -> int:
53 | return self.generator.epoch_length
54 |
55 |
56 | def mtn_evaluation_experiment(config):
57 | print('Loading embeddings...')
58 | embeddings = get_embeddings(config['experiment']['embeddings'])
59 |
60 | name = config['name']
61 | experiment_dir = experiment_directory(name, config['path'])
62 |
63 | print('Loading matrix...')
64 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind,
65 | precomputed_matrix=embeddings.precomputed_matrix)
66 |
67 | strategy = get_strategy(config['augment']['strategy'], matrix)
68 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix)
69 | adv_strategy = get_strategy('move_to_neighbor-5', matrix)
70 |
71 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy, n_augmentations=1)
72 |
73 | augment2 = Augment(embeddings, strategy=adv_strategy, digit_strategy=digit_strategy,
74 | n_augmentations=config['augment']['n_augmentations'], augment_max=1)
75 |
76 | print('Augmenting training set...', flush=True)
77 | tr = TrainingSet(train_set=config['experiment']['train_set'],
78 | embeddings=embeddings,
79 | use_short_sentences=env.use_short_sentences,
80 | limit_documents=env.limit_training_documents,
81 | augment=augment)
82 |
83 | tr2 = TrainingSet(train_set=config['experiment']['train_set'],
84 | embeddings=embeddings,
85 | use_short_sentences=env.use_short_sentences,
86 | limit_documents=env.limit_training_documents,
87 | augment=augment2)
88 |
89 | assert np.all(tr.X[100] == tr2.X[100]) # making sure that the training sets have the same order
90 |
91 | print('Augmenting validation set...', flush=True)
92 | val = ValidationSet(validation_set=config['experiment']['validation_set'],
93 | embeddings=embeddings,
94 | label2ind=tr.label2ind,
95 | use_short_sentences=env.use_short_sentences,
96 | limit_documents=env.limit_validation_documents,
97 | augment=augment)
98 |
99 | val2 = ValidationSet(validation_set=config['experiment']['validation_set'],
100 | embeddings=embeddings,
101 | label2ind=tr.label2ind,
102 | use_short_sentences=env.use_short_sentences,
103 | limit_documents=env.limit_validation_documents,
104 | augment=augment2)
105 |
106 | inputs = {'train_representation': Input(shape=(None, embeddings.size)),
107 | 'fake_representation': Input(shape=(None, embeddings.size))}
108 | adversary = TwoRepresentationsAreSameOriginalDiscriminator(inputs, representation_size=embeddings.size,
109 | lstm_size=embeddings.size)
110 | adversary.model.compile(loss=adversary.loss, optimizer='nadam', metrics=['accuracy'])
111 |
112 | batch_size = test_batch_size = 32
113 | train_gen = MTNGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence,
114 | batch_size=batch_size, yield_indices=True, shuffle=True), tr, tr2)
115 | valid_gen = MTNGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence,
116 | batch_size=test_batch_size, yield_indices=True, shuffle=False), val,
117 | val2)
118 |
119 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience'])
120 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush())
121 | callbacks = [early_stopping, flush]
122 | if env.save_model:
123 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True)
124 | callbacks.append(checkpoint)
125 |
126 | history = adversary.model.fit_generator(train_gen,
127 | epochs=config['training']['train_epochs'],
128 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)),
129 | validation_data=valid_gen,
130 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)),
131 | callbacks=callbacks,
132 | verbose=env.keras_verbose)
133 |
134 | if config['test']['run_test']:
135 | label2ind = tr.label2ind
136 | del tr, tr2, val, val2, train_gen, valid_gen
137 |
138 | if env.save_model:
139 | print('Restoring best weights')
140 | adversary.model.load_weights(os.path.join(experiment_dir, 'model.hdf5'))
141 |
142 | print('Augmenting test set...', flush=True)
143 |
144 | test = ValidationSet(validation_set='test',
145 | embeddings=embeddings,
146 | label2ind=label2ind,
147 | use_short_sentences=env.use_short_sentences,
148 | limit_documents=env.limit_validation_documents,
149 | augment=augment)
150 |
151 | test2 = ValidationSet(validation_set='test',
152 | embeddings=embeddings,
153 | label2ind=label2ind,
154 | use_short_sentences=env.use_short_sentences,
155 | limit_documents=env.limit_validation_documents,
156 | augment=augment2)
157 | test_gen = MTNGenerator(StratifiedSampling(test.X, test.y, split_condition=is_phi_sentence,
158 | batch_size=test_batch_size, yield_indices=True, shuffle=False), test,
159 | test2)
160 |
161 | loss, acc = adversary.model.evaluate_generator(test_gen, int(math.ceil(len(test.X) / test_batch_size)))
162 | print(f'Test loss: {loss}, test acc: {acc}')
163 | history.history['test_loss'] = loss
164 | history.history['test_acc'] = acc
165 |
166 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle')
167 | print('Saving history to', history_pickle_path)
168 | with open(history_pickle_path, 'wb') as f:
169 | pickle.dump(history.history, f)
170 |
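A toy illustration of the pairing that `fake_augmented_sentences_batch` above produces, assuming the module and its Keras dependencies are importable; the sentence arrays and alternatives are hypothetical placeholders. Each augmented sentence is paired once with its real counterpart (adversary label 1) and once with a randomly chosen fake alternative (label 0):

```python
import numpy as np

from deid.experiment.mtn_evaluation import fake_augmented_sentences_batch

# Two toy "sentences" of 2 tokens each with embedding size 3; labels are one-hot per token.
X = [np.ones((2, 3)), np.zeros((2, 3))]
y = [np.array([[0, 1]] * 2), np.array([[1, 0]] * 2)]

# One augmented and one fake alternative per sentence index (placeholder values).
augmented = {0: [np.full((2, 3), 0.9)], 1: [np.full((2, 3), 0.1)]}
fake = {0: [np.full((2, 3), 0.5)], 1: [np.full((2, 3), 0.4)]}

X_1, X_2, adv_y = fake_augmented_sentences_batch(
    X, y, indices=[0, 1], augm_alternatives=augmented, fake_alternatives=fake,
    split_condition=lambda sentence, labels: True)

# X_1 repeats each augmented sentence twice, X_2 alternates real/fake, adv_y is [1, 0, 1, 0].
print(X_1.shape, X_2.shape, adv_y)
```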
--------------------------------------------------------------------------------
/deid/experiment/basic.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import pickle
4 | import sys
5 |
6 | import numpy as np
7 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint
8 |
9 | from . import DeidentificationEvaluationCallback, evaluate_deid_performance, experiment_directory
10 | from ..data import TrainingSet, ValidationSet, BatchGeneratorWithExtraFeatures, StratifiedSamplingWithExtraFeatures, \
11 | is_phi_sentence
12 | from ..data.augment import Augment, get as get_strategy
13 | from ..data.class_weight import get as get_class_weight
14 | from ..data.feature import get as get_feature
15 | from ..data.util import compounding
16 | from ..embeddings import PrecomputedEmbeddings, Matrix, get as get_embeddings
17 | from ..env import env
18 | from ..model import get as get_model
19 | from ..model.optimizer import get as get_optimizer
20 |
21 |
22 | def basic_experiment(config):
23 | name = config['name']
24 | batch_size = config['training']['batch_size']
25 | test_batch_size = config['training']['test_batch_size']
26 | if test_batch_size is None:
27 | test_batch_size = batch_size
28 | test_weights = config['test']['test_weights']
29 |
30 | experiment_dir = experiment_directory(name, config['path'])
31 |
32 | print('Loading embeddings...')
33 | embeddings = get_embeddings(config['experiment']['embeddings'])
34 | print('Done.')
35 |
36 | if config['augment'] is not None and test_weights is None:
37 | if isinstance(embeddings, PrecomputedEmbeddings):
38 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind,
39 | precomputed_matrix=embeddings.precomputed_matrix)
40 | strategy_kwargs = {'matrix': matrix}
41 | else:
42 | strategy_kwargs = {}
43 |
44 | strategy = get_strategy(config['augment']['strategy'], **strategy_kwargs)
45 | digit_strategy = get_strategy(config['augment']['digit_strategy'], **strategy_kwargs)
46 | augment = Augment(embeddings=embeddings, strategy=strategy, digit_strategy=digit_strategy,
47 | **config['augment']['augment_args'])
48 | else:
49 | augment = None
50 |
51 | if config['experiment']['extra_features'] is None or len(config['experiment']['extra_features']) == 0:
52 | extra_features = []
53 | else:
54 | extra_features = [get_feature(identifier) for identifier in config['experiment']['extra_features']]
55 |
56 | tr = TrainingSet(train_set=config['experiment']['train_set'],
57 | embeddings=embeddings,
58 | use_short_sentences=env.use_short_sentences,
59 | limit_documents=env.limit_training_documents,
60 | binary_classification=config['experiment']['binary_classification'],
61 | hipaa_only=config['experiment']['hipaa_only'],
62 | augment=augment,
63 | extra_features=extra_features)
64 |
65 | model = get_model(config['experiment']['model'])(name=name,
66 | input_size=embeddings.size,
67 | extra_input_size=tr.X_extra_size,
68 | output_size=tr.output_size,
69 | optimizer=get_optimizer(config['training']['optimizer'])(
70 | **config['training']['optimizer_args']),
71 | **config['model_args'])
72 |
73 | if test_weights is None:
74 | train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir)
75 | else:
76 | model.load_weights(test_weights)
77 |
78 | if config['test']['run_test']:
79 | test_set = config['test']['test_set']
80 | if test_set is None:
81 | test_set = 'test'
82 | evaluate_deid_performance(model=model, batch_size=test_batch_size, embeddings=embeddings,
83 | label2ind=tr.label2ind, ind2label=tr.ind2label,
84 | test_set=test_set, experiment_dir=experiment_dir,
85 | binary_classification=config['experiment']['binary_classification'],
86 | hipaa_only=config['experiment']['hipaa_only'],
87 | extra_features=extra_features, epoch=99)
88 |
89 |
90 | def train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir):
91 | val = ValidationSet(validation_set=config['experiment']['validation_set'],
92 | embeddings=embeddings,
93 | label2ind=tr.label2ind,
94 | use_short_sentences=env.use_short_sentences,
95 | limit_documents=env.limit_validation_documents,
96 | binary_classification=config['experiment']['binary_classification'],
97 | hipaa_only=config['experiment']['hipaa_only'],
98 | extra_features=extra_features)
99 |
100 | if config['augment'] is not None and config['augment']['include_original']:
101 | tr_X, tr_y, tr_X_extra = tr.data_with_augmented
102 | augment_training_generator = None
103 | else:
104 | tr_X, tr_y, tr_X_extra = tr.X, tr.y, tr.X_extra
105 | augment_training_generator = tr.augmented
106 |
107 | print('Size of the training set:', len(tr_X), 'with maxlen:', tr.maxlen)
108 | compound = config['training']['batch_size_compound']
109 | if compound is not None and compound != 0 and compound < batch_size:
110 | training_batch_size = compounding(1, batch_size, compound)
111 | else:
112 | training_batch_size = batch_size
113 |
114 | if config['training']['batch_mode'] == 'stratified':
115 | train_gen_class, train_gen_args = StratifiedSamplingWithExtraFeatures, {'split_condition': is_phi_sentence}
116 | else:
117 | train_gen_class, train_gen_args = BatchGeneratorWithExtraFeatures, {}
118 |
119 | training_generator = train_gen_class(tr_X, tr_y, tr_X_extra,
120 | batch_size=training_batch_size,
121 | augment=augment_training_generator, **train_gen_args)
122 |
123 | validation_generator = BatchGeneratorWithExtraFeatures(val.X, val.y, val.X_extra, test_batch_size,
124 | shuffle=False)
125 |
126 | if config['experiment']['class_weight'] is not None:
127 | class_weight = get_class_weight(config['experiment']['class_weight'])(tr.output_size, tr_y)
128 | else:
129 | class_weight = None
130 |
131 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience'])
132 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush())
133 | evaluation = DeidentificationEvaluationCallback(model, batch_size=test_batch_size, embeddings=embeddings,
134 | label2ind=tr.label2ind, ind2label=tr.ind2label,
135 | test_set=config['experiment']['validation_set'],
136 | experiment_dir=experiment_dir,
137 | evaluate_every=config['training']['i2b2_evaluate_every'],
138 | binary_classification=config['experiment'][
139 | 'binary_classification'],
140 | hipaa_only=config['experiment']['hipaa_only'],
141 | extra_features=extra_features)
142 |
143 | callbacks = [early_stopping, evaluation, flush]
144 | if env.save_model:
145 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True)
146 | callbacks.append(checkpoint)
147 |
148 | history = model.fit_generator(training_generator,
149 | epochs=config['training']['train_epochs'],
150 | steps_per_epoch=int(math.ceil(len(tr_X) / batch_size)),
151 | validation_data=validation_generator,
152 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)),
153 | class_weight=class_weight,
154 | callbacks=callbacks,
155 | verbose=env.keras_verbose,
156 | use_multiprocessing=True)
157 | if env.save_model:
158 | best_epoch = np.argmin(history.history['val_loss']) + 1 # epoch numbering is 1-based
159 | print(f'Resetting to weights from epoch {best_epoch:02d}')
160 | model.load_weights(os.path.join(experiment_dir, 'model.hdf5'))
161 |
162 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle')
163 | print('Saving history to', history_pickle_path)
164 | with open(history_pickle_path, 'wb') as f:
165 | pickle.dump(history.history, f)
166 |
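For reference, a sketch of the config dictionary that `basic_experiment` and `train_and_validate` read, with the field names taken from the accesses above. All values below are placeholders, not the project's defaults; see `config_template.yaml.example` for the actual format:

```python
# Hypothetical config mirroring the keys accessed in basic.py; values are placeholders.
config = {
    'name': 'my-experiment',
    'path': '/path/to/experiments',
    'experiment': {
        'embeddings': '...',           # identifier resolved by embeddings.get
        'model': '...',                # identifier resolved by model.get
        'train_set': '...',
        'validation_set': '...',
        'binary_classification': False,
        'hipaa_only': False,
        'extra_features': [],          # identifiers resolved by feature.get
        'class_weight': None,          # or an identifier resolved by class_weight.get
    },
    'model_args': {},
    'augment': {                       # or None to disable augmentation
        'strategy': '...',
        'digit_strategy': '...',
        'augment_args': {},
        'include_original': True,
    },
    'training': {
        'batch_size': 32,
        'test_batch_size': None,       # falls back to batch_size
        'batch_size_compound': None,   # enables a compounding batch size when set
        'batch_mode': 'stratified',    # anything else uses the plain batch generator
        'optimizer': '...',            # identifier resolved by optimizer.get
        'optimizer_args': {},
        'train_epochs': 100,
        'early_stopping_patience': 5,
        'i2b2_evaluate_every': 1,
    },
    'test': {
        'run_test': True,
        'test_set': None,              # defaults to 'test'
        'test_weights': None,          # set to a weights path to skip training
    },
}
```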
--------------------------------------------------------------------------------
/deid/tools/i2b2/README.md:
--------------------------------------------------------------------------------
1 | **i2b2 2014 Evaluation Script**
2 |
3 | This script is distributed as a part of the i2b2 2014 Cardiac Risk and
4 | Protected Health Information (PHI) tasks.
5 |
6 | If you would like to contribute to this project, pull requests are welcome.
7 | Please see: https://help.github.com/articles/fork-a-repo for instructions
8 | on how to make a fork of this repository, and
9 | https://help.github.com/articles/using-pull-requests for instructions on
10 | making a pull request. Suggestions for improvements, bugs or feature requests
11 | may be directed to the i2b2 evaluation scripts' issues page located at:
12 | https://github.com/kotfic/i2b2_evaluation_scripts/issues
13 |
14 | _Setup_
15 |
16 | This script requires the following Python packages:
17 | lxml version 3.3.1
18 | numpy version 1.8.0
19 |
20 | If you get an error when running the script, please make sure that these
21 | are installed and accessible to your Python installation.
22 |
23 |
24 | _Running the script_
25 |
26 | This script is intended to be used via the
27 | command line:
28 | python evaluate.py [cr|phi] [FLAGS] SYSTEM GOLD
29 |
30 | Where 'cr' produces Precision, Recall and F1 (P/R/F1) measure for the
31 | cardiac risk task and 'phi' produces P/R/F1 for the PHI task. SYSTEM and GOLD
32 | may be individual files representing system output in the case of SYSTEM and
33 | the gold standard in the case of GOLD. SYSTEM and GOLD may also be
34 | directories, in which case all files in SYSTEM will be compared to files in the
35 | GOLD directory based on their file names. See below for more information
36 | on the different output the cr/phi flag produces.
37 |
38 |
39 |
40 | _File name restrictions_
41 |
42 | File names MUST be of the form:
43 | XXX-YY.xml where XXX is the patient id, and YY is the document id. The
44 | files from your system runs are matched to the gold standard file by
45 | file name alone. If your system outputs file names in a different format,
46 | you will need to either modify your system or this script.
47 |
48 |
49 | _Output for Risk Factor Track_
50 |
51 | To compare your system output for the Risk Factor track, run the following
52 | command for individual files:
53 |
54 | python evaluate.py cr {system.xml} {gold.xml}
55 | (replace the file names in {}s with the names of your actual files)
56 |
57 | or, to run the script on directories of files:
58 | python evaluate.py cr {system}/ {gold}/
59 | (again, replace the folder names in {}s with the names of your actual folders)
60 |
61 | Running one of these versions will produce output in this format:
62 |
63 | ```
64 | (# of files) Measure Macro (SD) Micro
65 | ---------------------------------------------------------------------------
66 | Total Precision 1.0 (0.0) 1.0
67 | Recall 1.0 (0.0) 1.0
68 | F1 1.0 1.0
69 | ```
70 |
71 | The script evaluates the accuracy of your tags based on tag type
72 | and all the attributes (except ID). If you want to get more details
73 | about the output of your system, such as which attributes it is
74 | getting right/wrong, you can use the more experimental flags. Please see
75 | the evaluate.py script itself for more information on the flags.
76 |
77 |
78 | _Output for De-identification Track_
79 |
80 | To compare your system output for the de-identification track, run the following
81 | command on individual files:
82 |
83 | python evaluate.py phi {system.xml} {gold.xml}
84 | (replace the file names in {}s with the names of your actual files)
85 |
86 | or, to run the script on directories of files:
87 | python evaluate.py phi {system}/ {gold}/
88 | (again, replace the folder names in {}s with the names of your actual folders)
89 |
90 |
91 | Running one of these versions will produce output that looks like this:
92 |
93 | ```
94 | Strict (521) Measure Macro (SD) Micro
95 | ---------------------------------------------------------------------------
96 | Total Precision 0.6635 (0.11) 0.6537
97 | Recall 0.4906 (0.12) 0.4988
98 | F1 0.5641 0.5658
99 |
100 |
101 | Relaxed (521) Measure Macro (SD) Micro
102 | ---------------------------------------------------------------------------
103 | Total Precision 0.897 (0.086) 0.9047
104 | Recall 0.6663 (0.15) 0.6903
105 | F1 0.7646 0.7831
106 |
107 |
108 | HIPAA Strict (521) Measure Macro (SD) Micro
109 | ---------------------------------------------------------------------------
110 | Total Precision 0.7406 (0.098) 0.7225
111 | Recall 0.7406 (0.098) 0.7225
112 | F1 0.7406 0.7225
113 |
114 |
115 | HIPAA Relaxed (521) Measure Macro (SD) Micro
116 | ---------------------------------------------------------------------------
117 | Total Precision 1.0 (0.0) 1.0
118 | Recall 1.0 (0.0) 1.0
119 | F1 1.0 1.0
120 | ```
121 |
122 | A few notes to explain this output:
123 | - The "(521)" represents the number of files the script was run on
124 | - "Strict" evaluations require that the offsets for the system outputs match *exactly*
125 | - "Relaxed" evaluations allow the "end" part of the offsets to be off by 2; this allows for variations in including "'s" and other endings that many systems will ignore due to tokenization
126 | - "HIPAA" evaluations include only the tags required by a strict interpretation of the HIPAA guidelines. See the list below for which tags are included in this evaluation
127 |
128 |
129 |
130 | _HIPAA-compliant PHI_
131 |
132 | - NAME/PATIENT
133 | - AGE
134 | - LOCATION/CITY
135 | - LOCATION/STREET
136 | - LOCATION/ZIP
137 | - LOCATION/ORGANIZATION
138 | - DATE
139 | - CONTACT/PHONE
140 | - CONTACT/FAX
141 | - CONTACT/EMAIL
142 | - ID/SSN
143 | - ID/MEDICALRECORD
144 | - ID/HEALTHPLAN
145 | - ID/ACCOUNT
146 | - ID/LICENSE
147 | - ID/VEHICLE
148 | - ID/DEVICE
149 | - ID/BIOID
150 | - ID/IDNUM
151 |
152 |
153 | _Verbose flag_
154 |
155 | To get document-by-document information about the accuracy of your tags, you can use the
156 | "-v" or "--verbose" flag. For example:
157 |
158 | python evaluate.py cr -v system/ gold/
159 |
160 |
161 | _Advanced usage_
162 |
163 | Some additional functionality is made available for testing and error
164 | analysis. This functionality is provided AS IS with the hopes that it will
165 | be useful. It should be considered 'experimental' at best, may be bug-prone,
166 | and will not be explicitly supported; however, bug reports and pull requests
167 | are welcome.
168 |
169 | Advanced Flags:
170 |
171 | --filter [TAG ATTRIBUTES] :: run P/R/F1 measures in either summary or verbose
172 | mode (see -v) for the list of attributes defined
173 | by TAG ATTRIBUTES. This may be a comma separated
174 | list of tag names and attribute values. For more
175 | see Advanced Examples.
176 | --conjunctive :: If multiple values are passed to filter as a comma separated
177 | list, treat them as a series of AND based filters instead of
178 | a series of OR based filters
179 | --invert :: run P/R/F1 on the inverted set of tags defined by TAG ATTRIBUTES
180 | in the --filter tag (see --filter).
181 |
182 | Advanced Examples:
183 |
184 | python evaluate.py cr --filter MEDICATION system/ gold/
185 |
186 | Evaluate system output in system/ folder against gold/ folder considering
187 | only MEDICATION tags
188 |
189 | python evaluate.py cr --filter CAD,OBESE system/ gold/
190 |
191 | Evaluate system output in system/ folder against gold/ folder considering
192 | only CAD or OBESE tags. Comma-separated lists passed to the --filter flag are
193 | conjoined via OR.
194 |
195 | python evaluate.py cr --filter "CAD,before DCT" system/ gold/
196 |
197 | Evaluate system output in system/ folder against gold/ folder considering
198 | only CAD *OR* tags with a time attribute of before DCT. This is probably
199 | not what you want when filtering, see the next example
200 |
201 | python evaluate.py cr --conjunctive \
202 | --filter "CAD,before DCT" system/ gold/
203 |
204 | Evaluate system output in system/ folder against gold/ folder considering
205 | CAD tags *AND* tags with a time attribute of before DCT.
206 |
207 | python evaluate.py cr --invert \
208 | --filter MEDICATION system/ gold/
209 |
210 | Evaluate system output in system/ folder against gold/ folder considering
211 | any tag which is NOT a MEDICATION tag.
212 |
213 | python evaluate.py cr --invert \
214 | --conjunctive \
215 | --filter "CAD,before DCT" system/ gold/
216 |
217 | Evaluate system output in system/ folder against gold/ folder considering
218 | any tag which is NOT CAD and with a time attribute of 'before DCT'
219 |
--------------------------------------------------------------------------------
/deid/data/batch.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import math
3 | import random
4 | from typing import Generic, TypeVar, Optional
5 | from typing import Sequence, Union, Tuple, Callable, Dict, Iterator, List
6 |
7 | import numpy as np
8 |
9 | from .util import pad_2d_sequences, peek
10 |
11 | X_type = TypeVar('X_type')
12 | y_type = TypeVar('y_type')
13 |
14 | TwoArrays = Tuple[np.ndarray, np.ndarray]
15 | ThreeArrays = Tuple[np.ndarray, np.ndarray, np.ndarray]
16 |
17 |
18 | class IteratorWithEpochLength(Iterator):
19 | def __next__(self) -> Union[TwoArrays, ThreeArrays]:
20 | raise NotImplementedError
21 |
22 | @property
23 | def epoch_length(self) -> int:
24 | raise NotImplementedError
25 |
26 |
27 | class IteratorWithEpochLengthImpl(Generic[X_type, y_type], IteratorWithEpochLength):
28 | def __init__(self,
29 | X: Sequence[X_type],
30 | y: Sequence[y_type],
31 | total_size: int,
32 | batch_size_iter: Iterator[int],
33 | yield_incomplete_batches: bool = True,
34 | yield_indices: bool = False,
35 | augment: Optional[Dict[int, Sequence[X_type]]] = None,
36 | augment_include_original: bool = True) -> None:
37 | assert len(X) == len(y)
38 | self.X, self.y = X, y
39 | self.augment = augment
40 | self.augment_include_original = augment_include_original
41 |
42 | self.total_size = total_size
43 | self.batch_size_iter = batch_size_iter
44 | self.yield_indices = yield_indices
45 | self.yield_incomplete_batches = yield_incomplete_batches
46 | self.init_epoch()
47 |
48 | def __next__(self) -> Union[TwoArrays, ThreeArrays]:
49 | if self.batch_number == self.epoch_length:
50 | self.init_epoch()
51 |
52 | current_batch_size = self.epoch_batch_sizes[self.batch_number]
53 | end = min(self.cursor + current_batch_size, self.total_size)
54 | batch_ind = self.select_batch_ind(self.cursor, end)
55 |
56 | if self.augment is not None:
57 | if self.augment_include_original:
58 | batch_X = [random.choice(self.augment[i] + [self.X[i]]) for i in batch_ind]
59 | else:
60 | batch_X = [random.choice(self.augment[i]) if len(self.augment[i]) > 0 else self.X[i] for i in batch_ind]
61 | else:
62 | batch_X = [self.X[i] for i in batch_ind]
63 | batch_y = [self.y[i] for i in batch_ind]
64 | self.cursor += current_batch_size
65 | self.batch_number += 1
66 |
67 | batch_X, batch_y = pad_2d_sequences(batch_X), pad_2d_sequences(batch_y)
68 | if self.yield_indices:
69 | return batch_X, batch_y, batch_ind
70 | else:
71 | return batch_X, batch_y
72 |
73 | def select_batch_ind(self, cursor, end) -> np.ndarray:
74 | raise NotImplementedError
75 |
76 | def __iter__(self):
77 | return self
78 |
79 | @property
80 | def epoch_length(self) -> int:
81 | return len(self.epoch_batch_sizes)
82 |
83 | # noinspection PyAttributeOutsideInit
84 | def init_epoch(self):
85 | self.batch_number = self.cursor = 0
86 | self.epoch_batch_sizes = self._make_epoch_batch_sizes(self.total_size)
87 |
88 | def _make_epoch_batch_sizes(self, total_size):
89 | """ Take items from the batch size iter until they make an epoch."""
90 | result = []
91 | seen = 0
92 | while seen < total_size:
93 | if self.yield_incomplete_batches:
94 | size = min(next(self.batch_size_iter), total_size - seen)
95 | seen += size
96 | result.append(size)
97 | else:
98 | size, self.batch_size_iter = peek(self.batch_size_iter)
99 | if seen + size > total_size:
100 | break
101 | size = next(self.batch_size_iter)
102 | seen += size
103 | result.append(size)
104 |
105 | assert seen == total_size if self.yield_incomplete_batches else seen <= total_size
106 | return result
107 |
108 |
109 | class BatchGenerator(IteratorWithEpochLengthImpl):
110 | def __init__(self,
111 | X: Sequence[X_type],
112 | y: Sequence[y_type],
113 | batch_size: Union[int, Iterator[int]],
114 | shuffle: bool = True,
115 | **kwargs) -> None:
116 |
117 | self.shuffle = shuffle
118 |
119 | if isinstance(batch_size, int):
120 | batch_size_iter = itertools.repeat(batch_size)
121 | else:
122 | batch_size_iter = batch_size
123 | super().__init__(X, y, total_size=len(X), batch_size_iter=batch_size_iter, **kwargs)
124 |
125 | # noinspection PyAttributeOutsideInit
126 | def init_epoch(self):
127 | super().init_epoch()
128 | if self.shuffle:
129 | self.shuffled_ind = np.random.permutation(np.arange(len(self.X)))
130 | else:
131 | self.shuffled_ind = np.arange(len(self.X))
132 |
133 | def select_batch_ind(self, cursor, end):
134 | return self.shuffled_ind[cursor:end]
135 |
136 |
137 | class BatchGeneratorWithExtraFeatures(BatchGenerator):
138 | def __init__(self,
139 | X: Sequence[X_type],
140 | y: Sequence[y_type],
141 | X_extra,
142 | batch_size: Union[int, Iterator[int]],
143 | **kwargs) -> None:
144 | self.X_extra = X_extra
145 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs)
146 |
147 | def __next__(self):
148 | X, y, ind = super().__next__()
149 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y
150 |
151 |
152 | class StratifiedSampling(IteratorWithEpochLengthImpl):
153 | def __init__(self,
154 | X: Sequence[X_type],
155 | y: Sequence[y_type],
156 | batch_size: Union[int, Iterator[int]],
157 | split_condition: Callable[[X_type, y_type], bool],
158 | shuffle: bool = False,
159 | **kwargs) -> None:
160 | self.X_pos_ind, self.X_neg_ind = self.split_indices(X, y, split_condition)
161 | self.shorter_partition_size = min(len(self.X_pos_ind), len(self.X_neg_ind))
162 |
163 | self.shuffle = shuffle
164 |
165 | if isinstance(batch_size, int):
166 | batch_size_iter = itertools.repeat(math.ceil(batch_size / 2))
167 | else:
168 | double_batch_size_iter: Iterator[int] = batch_size
169 | batch_size_iter = (math.ceil(size / 2) for size in double_batch_size_iter)
170 |
171 | super().__init__(X, y, total_size=self.shorter_partition_size, batch_size_iter=batch_size_iter, **kwargs)
172 |
173 | # noinspection PyAttributeOutsideInit
174 | def init_epoch(self):
175 | super().init_epoch()
176 | if self.shuffle:
177 | self.shuffled_pos = np.random.permutation(self.X_pos_ind)
178 | self.shuffled_neg = np.random.permutation(self.X_neg_ind)
179 | else:
180 | self.shuffled_pos, self.shuffled_neg = self.X_pos_ind, self.X_neg_ind
181 |
182 | def select_batch_ind(self, cursor, end):
183 | return np.concatenate((self.shuffled_pos[cursor:end], self.shuffled_neg[cursor:end]), axis=0)
184 |
185 | @staticmethod
186 | def split_indices(X: Sequence[X_type],
187 | y: Sequence[y_type],
188 | split_condition: Callable[[X_type, y_type], bool]) -> Tuple[Sequence[int], Sequence[int]]:
189 | pos: List[int] = []
190 | neg: List[int] = []
191 | for i in range(len(X)):
192 | (pos if split_condition(X[i], y[i]) else neg).append(i)
193 | return pos, neg
194 |
195 |
196 | class StratifiedSamplingWithExtraFeatures(StratifiedSampling):
197 | def __init__(self,
198 | X: Sequence[X_type],
199 | y: Sequence[y_type],
200 | X_extra,
201 | batch_size: Union[int, Iterator[int]],
202 | **kwargs) -> None:
203 | self.X_extra = X_extra
204 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs)
205 |
206 | def __next__(self):
207 | X, y, ind = super().__next__()
208 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y
209 |
210 |
211 | def fake_sentences_batch(X: np.ndarray,
212 | y: np.ndarray,
213 | indices: np.ndarray,
214 | alternatives: Dict[int, Sequence[np.ndarray]],
215 | split_condition: Callable[[np.ndarray, np.ndarray], bool]) -> ThreeArrays:
216 | """ Generate a batch of real and fake/augmented sentence pairs.
217 |
218 | :param X: the complete X array
219 | :param y: the complete y array
220 | :param indices: the indices of this batch
221 | :param alternatives: a dictionary (index -> sequence of alternatives) providing fake alternatives for each index
222 | :param split_condition: a condition determining if the sentence should be used
223 | :return: A batch `X_1, X_2, y`
224 | """
225 |
226 | indices = [i for i in indices if split_condition(X[i], y[i])]
227 | real_sentences = [X[i] for i in indices]
228 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices]
229 |
230 | X_1: List[np.ndarray] = []
231 | X_2: List[np.ndarray] = []
232 | y = []
233 | for real, fake in zip(real_sentences, fake_sentences):
234 | X_1 += [real, real]
235 | X_2 += [real, fake]
236 | y += [1, 0]
237 |
238 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y)
239 |
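A minimal sketch of `StratifiedSampling` on toy data, assuming `deid.data.batch` can be imported on its own; `is_positive` is a hypothetical stand-in for `is_phi_sentence`:

```python
import numpy as np

from deid.data.batch import StratifiedSampling

# Six toy "sentences" of 3 tokens with embedding size 2; every third sentence is "positive".
X = [np.full((3, 2), float(i)) for i in range(6)]
y = [np.array([[1, 0]] * 3) if i % 3 == 0 else np.array([[0, 1]] * 3) for i in range(6)]


def is_positive(sentence, labels):
    # Hypothetical stand-in for is_phi_sentence: the first token's label decides the partition.
    return bool(labels[0][0])


generator = StratifiedSampling(X, y, batch_size=4, split_condition=is_positive, shuffle=True)
print(generator.epoch_length)  # batches per epoch, driven by the smaller partition

# Each batch draws half of its indices from the positive partition and half from the negative one.
batch_X, batch_y = next(generator)
print(batch_X.shape, batch_y.shape)
```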
--------------------------------------------------------------------------------