├── deid
│   ├── tools
│   │   ├── __init__.py
│   │   ├── i2b2
│   │   │   ├── __init__.py
│   │   │   ├── requirements.txt
│   │   │   └── README.md
│   │   ├── fix_xml_texts.py
│   │   ├── fix_180-03.py
│   │   ├── find_differences.py
│   │   ├── dataset.py
│   │   ├── i2b2_xml_to_csv_tests.py
│   │   ├── embeddings.py
│   │   ├── find_good_amount_of_noise.py
│   │   ├── config.py
│   │   └── i2b2_xml_to_csv.py
│   ├── fixtures
│   │   ├── deid_work
│   │   │   └── .gitkeep
│   │   └── deid_resources
│   │       ├── config
│   │       │   └── generated
│   │       │       └── .gitkeep
│   │       └── i2b2_data
│   │           ├── train
│   │           │   ├── 999-99.txt
│   │           │   └── 999-99.csv
│   │           └── train_xml
│   │               ├── 999-96.xml
│   │               ├── 999-97.xml
│   │               ├── 999-98.xml
│   │               └── 999-99.xml
│   ├── model
│   │   ├── losses
│   │   │   ├── __init__.py
│   │   │   ├── discriminator.py
│   │   │   └── crf.py
│   │   ├── layers
│   │   │   ├── __init__.py
│   │   │   ├── gradient_reversal.py
│   │   │   └── noise.py
│   │   ├── __init__.py
│   │   ├── optimizer.py
│   │   ├── deidentifier.py
│   │   ├── adversary.py
│   │   ├── representer.py
│   │   └── adversarial.py
│   ├── data
│   │   ├── types.py
│   │   ├── dataset_tests.py
│   │   ├── augment
│   │   │   ├── __init__.py
│   │   │   ├── strategy_tests.py
│   │   │   ├── get.py
│   │   │   ├── augment_tests.py
│   │   │   ├── strategy.py
│   │   │   └── augment.py
│   │   ├── __init__.py
│   │   ├── class_weight.py
│   │   ├── feature_tests.py
│   │   ├── token.py
│   │   ├── feature.py
│   │   ├── util.py
│   │   ├── tokenizer_tests.py
│   │   ├── batch_tests.py
│   │   ├── postprocess.py
│   │   ├── tokenizer.py
│   │   ├── read.py
│   │   └── batch.py
│   ├── experiment
│   │   ├── run.py
│   │   ├── random.py
│   │   ├── __main__.py
│   │   ├── __init__.py
│   │   ├── config_tests.py
│   │   ├── directory.py
│   │   ├── get.py
│   │   ├── evaluation_tests.py
│   │   ├── config.py
│   │   ├── alternating_evaluation.py
│   │   ├── dummy.py
│   │   ├── fake_sentences.py
│   │   ├── evaluation.py
│   │   ├── mtn_evaluation.py
│   │   └── basic.py
│   ├── __init__.py
│   ├── embeddings
│   │   ├── dummy.py
│   │   ├── util.py
│   │   ├── __init__.py
│   │   ├── util_tests.py
│   │   ├── noise.py
│   │   ├── embeddings.py
│   │   ├── glove.py
│   │   ├── elmo.py
│   │   ├── fasttext.py
│   │   └── matrix.py
│   ├── config_template.yaml.example
│   ├── config_template_alternating.yaml.example
│   └── env.py
├── adversary1.png
├── adversary2.png
├── architecture.png
├── .gitmodules
├── scripts
│   ├── xml_to_csv
│   └── queue
├── environment.yml
├── .travis.yml
├── LICENSE
├── .gitignore
└── README.md

/deid/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/fixtures/deid_work/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/fixtures/deid_resources/config/generated/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/deid/tools/i2b2/__init__.py:
--------------------------------------------------------------------------------
1 | # Modified by Max Friedrich, 2018
2 | 
--------------------------------------------------------------------------------
/deid/tools/i2b2/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml==3.3.1
2 | numpy==1.8.0
--------------------------------------------------------------------------------
/adversary1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary1.png
--------------------------------------------------------------------------------
/adversary2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/adversary2.png -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxfriedrich/deid-training-data/HEAD/architecture.png -------------------------------------------------------------------------------- /deid/model/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .crf import crf_loss 2 | from .discriminator import discriminator_loss 3 | -------------------------------------------------------------------------------- /deid/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .gradient_reversal import GradientReversal 2 | from .noise import Noise, AdditiveNoise, MultiplicativeNoise 3 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train/999-99.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Record date: 2018-06-15 5 | 6 | Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany. 7 | 8 | -------------------------------------------------------------------------------- /deid/data/types.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Union 2 | import numpy as np 3 | from .token import Token 4 | 5 | Sentence = Sequence[Union[Token, np.ndarray]] 6 | SentenceLabels = Sequence[Sequence[int]] 7 | -------------------------------------------------------------------------------- /deid/data/dataset_tests.py: -------------------------------------------------------------------------------- 1 | from . import TrainingSet 2 | from ..embeddings import DummyEmbeddings 3 | 4 | 5 | def test_training_set(): 6 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings()) 7 | assert len(tr.X) == len(tr.y) 8 | -------------------------------------------------------------------------------- /deid/data/augment/__init__.py: -------------------------------------------------------------------------------- 1 | from .strategy import AugmentStrategy, AugmentEmbedding, AugmentWord, Zeros, RandomEmbedding, RandomDigits, \ 2 | AdditiveNoise, MoveToNeighbor 3 | from .get import get 4 | from .augment import Augment, AugmentedSentence 5 | 6 | 7 | -------------------------------------------------------------------------------- /deid/experiment/run.py: -------------------------------------------------------------------------------- 1 | from . 
import get_config, get as get_experiment 2 | 3 | 4 | def run_experiment(config_name_or_path): 5 | config = get_config(config_name_or_path) 6 | experiment = get_experiment(config['experiment']['type']) 7 | experiment(config) 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dependencies/keras-contrib"] 2 | path = dependencies/keras-contrib 3 | url = git://github.com/keras-team/keras-contrib 4 | [submodule "dependencies/fastText"] 5 | path = dependencies/fastText 6 | url = git://github.com/facebookresearch/fastText.git 7 | -------------------------------------------------------------------------------- /deid/experiment/random.py: -------------------------------------------------------------------------------- 1 | def setup_random(): 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | os.environ['PYTHONHASHSEED'] = '0' 9 | np.random.seed(1) 10 | random.seed(2) 11 | tf.set_random_seed(3) 12 | -------------------------------------------------------------------------------- /deid/__init__.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/a/40846742/2623170 2 | # https://github.com/numpy/numpy/pull/432/commits/170ed4e3 3 | import warnings 4 | 5 | warnings.filterwarnings("ignore", message="numpy.dtype size changed") 6 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed") 7 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-96.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /deid/experiment/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from . 
import run_experiment 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('config', help='the config filename') 9 | args = parser.parse_args() 10 | 11 | run_experiment(args.config) 12 | 13 | 14 | if __name__ == '__main__': 15 | main() 16 | -------------------------------------------------------------------------------- /deid/model/__init__.py: -------------------------------------------------------------------------------- 1 | def get(identifier): 2 | if identifier == 'lstm': 3 | return make_lstm_crf 4 | elif identifier.startswith('adversarial'): 5 | return AdversarialModel 6 | else: 7 | raise ValueError('unknown identifier:', identifier) 8 | 9 | 10 | from .adversarial import AdversarialModel 11 | from .deidentifier import make_lstm_crf 12 | -------------------------------------------------------------------------------- /deid/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .token import Token, TOKEN_TYPE, BINARY_LABEL 2 | from .tokenizer import tokenize 3 | from .types import Sentence, SentenceLabels 4 | from .batch import BatchGenerator, StratifiedSampling, BatchGeneratorWithExtraFeatures, \ 5 | StratifiedSamplingWithExtraFeatures, fake_sentences_batch 6 | from .dataset import DataSet, TrainingSet, ValidationSet, TestSet, is_phi_sentence 7 | from .postprocess import prediction_to_xml 8 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-97.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /deid/model/optimizer.py: -------------------------------------------------------------------------------- 1 | from keras.optimizers import Adam, Nadam, RMSprop, SGD 2 | 3 | # We want to pass custom args to the adversaries. Passing a Keras optimizer string to the compile method won't let us 4 | # select custom args, so we make a subset of optimizers available by string keys here. 
5 | optimizers = {'adam': Adam, 'nadam': Nadam, 'rmsprop': RMSprop, 'sgd': SGD} 6 | 7 | 8 | def get(identifier): 9 | if identifier in optimizers.keys(): 10 | return optimizers[identifier] 11 | raise ValueError(f'Unknown optimizer: {identifier}') 12 | -------------------------------------------------------------------------------- /deid/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import get_config 2 | from .random import setup_random 3 | from .directory import experiment_directory 4 | from .evaluation import evaluate_deid_performance, DeidentificationEvaluationCallback 5 | 6 | from .basic import basic_experiment 7 | from .alternating import alternating_experiment 8 | from .alternating_evaluation import alternating_evaluation_experiment 9 | from .mtn_evaluation import mtn_evaluation_experiment 10 | from .fake_sentences import fake_sentences_experiment 11 | 12 | from .get import get 13 | from .run import run_experiment 14 | -------------------------------------------------------------------------------- /deid/experiment/config_tests.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | 3 | 4 | def example_config(): 5 | return Config({'a': 0, 'b': 1, 'c': {'d': 2}}) 6 | 7 | 8 | def test_config_behaves_like_a_dict(): 9 | config = example_config() 10 | assert config['a'] == 0 11 | assert config['b'] == 1 12 | assert config['c']['d'] == 2 13 | 14 | config['c']['d'] = 3 15 | assert config['c']['d'] == 3 16 | 17 | 18 | def test_config_returns_none_for_missing_values(): 19 | config = example_config() 20 | assert config['x'] is None 21 | assert config['c']['y'] is None 22 | -------------------------------------------------------------------------------- /deid/data/class_weight.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | 4 | def get(identifier): 5 | if identifier == 'balanced': 6 | return balanced 7 | 8 | raise ValueError(f'Unknown class weight: {identifier}') 9 | 10 | 11 | def balanced(output_size, y): 12 | y = list(itertools.chain.from_iterable([[label[0] for label in sent] for sent in y])) 13 | 14 | o_weight = len(y) / y.count(1) 15 | phi_weight = len(y) / (len(y) - y.count(1)) 16 | 17 | class_weight = [0, o_weight] 18 | for i in range(2, output_size): 19 | class_weight.append(phi_weight) 20 | 21 | return class_weight 22 | -------------------------------------------------------------------------------- /scripts/xml_to_csv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | ma_path = os.path.dirname(os.path.dirname(__file__)) 7 | if os.path.abspath(ma_path) not in sys.path: 8 | sys.path.append(ma_path) 9 | 10 | from deid.env import env 11 | 12 | for t in ['test', 'train', 'validation']: 13 | print(f'Converting {t} xmls...') 14 | command = ' '.join(['python -m deid.tools.i2b2_xml_to_csv --check', 15 | f"{os.path.join(env.data_dir, t + '_xml')}", 16 | f"{os.path.join(env.data_dir, t)}"]) 17 | 18 | print(command) 19 | os.system(command) 20 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-98.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- 
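The string-keyed optimizer registry in /deid/model/optimizer.py above exists so that optimizer keyword arguments (e.g. the clipnorm: 1. entries in the config templates) can be applied when the optimizer is constructed. A minimal usage sketch follows; the call site and the config access shown here are assumptions for illustration, not the repository's actual wiring:

# Sketch: resolve an optimizer class by its string key, then instantiate it with
# custom keyword arguments -- something a plain string passed to model.compile()
# (e.g. optimizer='nadam') would not allow.
from deid.model.optimizer import get as get_optimizer

optimizer_cls = get_optimizer('nadam')   # -> keras.optimizers.Nadam
optimizer = optimizer_cls(clipnorm=1.0)  # e.g. from config['training']['optimizer_args']
# model.compile(optimizer=optimizer, loss=..., metrics=...)  # assumed downstream call
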
/environment.yml: -------------------------------------------------------------------------------- 1 | name: deid-training-data 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.6 7 | - libgcc 8 | - pip 9 | - cython 10 | - jupyter 11 | - pathlib 12 | - numpy==1.14.5 13 | - scipy 14 | - pandas 15 | - matplotlib 16 | - beautifulsoup4 17 | - nose 18 | - h5py 19 | - spacy==2.0.12 20 | - tqdm 21 | - scikit-learn 22 | - lxml 23 | - pylint 24 | - mypy 25 | - msgpack-python 26 | - pip: 27 | - keras==2.2.2 28 | - tensorflow==1.10.0 # or tensorflow-gpu 29 | - tensorflow-hub==0.1.1 30 | - terminaltables 31 | - pybind11 32 | - -e ./dependencies/keras-contrib 33 | - -e ./dependencies/fastText 34 | -------------------------------------------------------------------------------- /deid/model/losses/discriminator.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.losses import binary_crossentropy 3 | 4 | 5 | def discriminator_loss(y_true, y_pred): 6 | """ Compares the actual binary crossentropy loss to the random guessing loss (0.6931..., accuracy 0.5) and returns 7 | the maximum. This is motivated by the fact that our adversarial discriminators should not be worse than random 8 | guessing, otherwise we could just flip every prediction and get a better discriminator. 9 | """ 10 | loss = binary_crossentropy(y_true, y_pred) 11 | random_guessing = -K.log(0.5) 12 | return K.maximum(loss, random_guessing) 13 | -------------------------------------------------------------------------------- /deid/embeddings/dummy.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import numpy as np 4 | 5 | from . import Embeddings 6 | 7 | 8 | class DummyEmbeddings(Embeddings): 9 | @property 10 | def size(self): 11 | return 5 12 | 13 | @property 14 | def std(self): 15 | return 0.5 16 | 17 | def is_unknown(self, word: str) -> bool: 18 | return False 19 | 20 | def lookup(self, word: str): 21 | hashed = int(hashlib.sha256(word.encode('utf-8')).hexdigest(), 16) 22 | five_digits = [int(digit) for digit in str(hashed)[1:6]] # omitting a possible - at the first index 23 | return np.array([-0.5 if digit < 5 else 0.5 for digit in five_digits]) 24 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train/999-99.csv: -------------------------------------------------------------------------------- 1 | text,type,start,end 2 | ,O,0,0 3 | ,O,0,0 4 | ,O,3,3 5 | Record,O,3,9 6 | date,O,10,14 7 | :,O,14,15 8 | 2018,B-DATE,16,20 9 | -,I-DATE,20,21 10 | 06,I-DATE,21,23 11 | -,I-DATE,23,24 12 | 15,I-DATE,24,26 13 | ,O,26,26 14 | ,O,28,28 15 | Max,B-PATIENT,28,31 16 | Friedrich,I-PATIENT,32,41 17 | is,O,42,44 18 | a,O,45,46 19 | 25,B-AGE,47,49 20 | -,O,49,50 21 | year,O,50,54 22 | -,O,54,55 23 | old,O,55,58 24 | Computer,O,59,67 25 | science,O,68,75 26 | student,O,76,83 27 | living,O,84,90 28 | in,O,91,93 29 | Hamburg,B-CITY,94,101 30 | ",",O,101,102 31 | Germany,B-COUNTRY,103,110 32 | .,O,110,111 33 | ,O,111,111 34 | -------------------------------------------------------------------------------- /deid/fixtures/deid_resources/i2b2_data/train_xml/999-99.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # https://conda.io/docs/user-guide/tasks/use-conda-with-travis-ci.html 2 | 3 | sudo: required 4 | dist: trusty 5 | group: travis_latest 6 | language: python 7 | python: 8 | - '3.6' 9 | git: 10 | depth: false 11 | install: 12 | - sudo apt-get update 13 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 14 | - bash miniconda.sh -b -p $HOME/miniconda 15 | - export PATH="$HOME/miniconda/bin:$PATH" 16 | - hash -r 17 | - conda config --set always_yes yes --set changeps1 no 18 | - conda update -q conda 19 | - conda info -a 20 | - conda env create -q 21 | - source activate deid-training-data 22 | - conda list 23 | - python -m spacy download en 24 | script: 25 | - nosetests --with-doctest 26 | -------------------------------------------------------------------------------- /deid/experiment/directory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import socket 4 | from datetime import datetime 5 | 6 | from ..env import env 7 | 8 | 9 | def experiment_directory(name, config_path=None, work_dir=env.work_dir): 10 | """ Creates a directory for the experiment 11 | 12 | :param name: 13 | :param config_path: 14 | :param work_dir: 15 | :return: 16 | """ 17 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S') 18 | directory = os.path.join(work_dir, name + '_' + socket.gethostname() + '_' + date_str) 19 | if env.experiment_dir_postfix is not None: 20 | directory += '_' + env.experiment_dir_postfix 21 | os.mkdir(directory) 22 | if config_path is not None: 23 | shutil.copy2(config_path, directory) 24 | 25 | return directory 26 | -------------------------------------------------------------------------------- /scripts/queue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [ -z "$DEID_CONFIG_DIR" ]; then 6 | echo "Please set the DEID_CONFIG_DIR variable to the config directory." 7 | exit 1 8 | fi 9 | 10 | TODO="${DEID_CONFIG_DIR}/todo" 11 | IN_PROGRESS="${DEID_CONFIG_DIR}/in_progress" 12 | DONE="${DEID_CONFIG_DIR}/done" 13 | EXECUTED=0 14 | STOP=0 15 | 16 | find "${TODO}" -type f -name '*.yaml' -print0 | 17 | while IFS= read -r -d '' f; do 18 | config="$(basename $f)"; 19 | echo "Next config is ${f}, basename ${config}"; 20 | mv "${f}" "${IN_PROGRESS}"; 21 | python3 -m deid.experiment "${IN_PROGRESS}/${config}"; 22 | echo "Moving to done"; 23 | mv "${IN_PROGRESS}/${config}" "$DONE"; 24 | echo "OK"; 25 | EXECUTED=${EXECUTED}+1; 26 | done 27 | echo "Executed ${EXECUTED} configs." 
28 | -------------------------------------------------------------------------------- /deid/experiment/get.py: -------------------------------------------------------------------------------- 1 | from .basic import basic_experiment 2 | from .alternating import alternating_experiment 3 | from .alternating_evaluation import alternating_evaluation_experiment 4 | from .mtn_evaluation import mtn_evaluation_experiment 5 | from .fake_sentences import fake_sentences_experiment 6 | 7 | 8 | def get(identifier): 9 | if identifier == 'basic': 10 | return basic_experiment 11 | elif identifier == 'alternating': 12 | return alternating_experiment 13 | elif identifier == 'alternating_evaluation': 14 | return alternating_evaluation_experiment 15 | elif identifier == 'mtn_evaluation': 16 | return mtn_evaluation_experiment 17 | elif identifier == 'fake_sentences': 18 | return fake_sentences_experiment 19 | else: 20 | raise ValueError('unknown identifier:', identifier) 21 | -------------------------------------------------------------------------------- /deid/data/augment/strategy_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .strategy import NeighborsCache 4 | from ...embeddings import EmbeddingSimilarity 5 | 6 | 7 | def test_neighbors_cache(): 8 | cache = NeighborsCache('selected') 9 | assert cache.lookup('test') is None 10 | 11 | neighbors = [EmbeddingSimilarity(1, 'tests', 0.95, np.zeros(10)), 12 | EmbeddingSimilarity(1, 'testing', 0.93, np.zeros(10)), 13 | EmbeddingSimilarity(1, 'tester', 0.91, np.zeros(10))] 14 | 15 | cache.store('test', neighbors, neighbors[0]) 16 | assert cache.lookup('test') == cache.lookup('test') == neighbors[0] 17 | 18 | cache = NeighborsCache('neighbors') 19 | assert cache.lookup('test') is None 20 | 21 | cache.store('test', neighbors, neighbors[0]) 22 | assert cache.lookup('test') in neighbors 23 | assert cache.lookup('test') in neighbors 24 | -------------------------------------------------------------------------------- /deid/experiment/evaluation_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from . import evaluation 5 | from ..env import Test 6 | from .evaluation import _run_official_evaluation 7 | 8 | config = evaluation.env = Test() 9 | 10 | 11 | def test_run_official_evaluation(): 12 | with tempfile.NamedTemporaryFile() as f: 13 | # testing the fixtures train_xml directory against itself, resulting in perfect score 14 | results = _run_official_evaluation(predictions_dir=os.path.join(config.data_dir, 'train_xml'), 15 | test_set='train', 16 | output_file=f.name) 17 | assert len(f.read().strip()) != 0 # something was written to the evaluation file 18 | 19 | assert results['Token']['precision'] == 1.0 20 | assert results['Token']['recall'] == 1.0 21 | assert results['Token']['f1'] == 1.0 22 | -------------------------------------------------------------------------------- /deid/embeddings/util.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, List, Any 2 | 3 | 4 | def pad_string_sequences(seq: Sequence[Sequence[str]]) -> Tuple[List[List[str]], Sequence[int]]: 5 | """ Like keras.preprocessing.sequence.pad_string_sequences but for strings, and it also returns seq_length. 
""" 6 | 7 | seq_length = [len(item) for item in seq] 8 | maxlen = max(seq_length) 9 | 10 | result = [] 11 | for i, item in enumerate(seq): 12 | result.append(list(item) + [''] * (maxlen - seq_length[i])) 13 | return result, seq_length 14 | 15 | 16 | def unpad_sequences(padded: Sequence[Any], seq_length: Sequence[int]): 17 | """ The reverse operation of `keras.preprocessing.sequence.pad_sequences`. """ 18 | assert len(padded) == len(seq_length) 19 | return [padded[i][:seq_length[i]] for i in range(len(padded))] 20 | 21 | 22 | # https://stackoverflow.com/a/434328/2623170 23 | def chunks(seq, size): 24 | return (seq[pos:pos + size] for pos in range(0, len(seq), size)) 25 | -------------------------------------------------------------------------------- /deid/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import Embeddings, PrecomputedEmbeddings 2 | from .dummy import DummyEmbeddings 3 | from .elmo import ElmoEmbeddings, TensorFlowElmoEmbeddings, CachedElmoEmbeddings 4 | from .fasttext import FastTextEmbeddings, PreloadFastTextEmbeddings, CachedFastTextEmbeddings 5 | from .glove import GloveEmbeddings 6 | from .matrix import Matrix, EmbeddingSimilarity 7 | from .noise import Noise, GaussianNoise, DropoutNoise, NoiseWrapper 8 | 9 | 10 | def get(identifier, *args, **kwargs): 11 | if identifier == 'dummy': 12 | return DummyEmbeddings() 13 | elif identifier == 'elmo': 14 | return ElmoEmbeddings(*args) 15 | elif identifier == 'elmo-tf': 16 | return TensorFlowElmoEmbeddings(*args, **kwargs) 17 | elif identifier == 'glove': 18 | return GloveEmbeddings(*args, **kwargs) 19 | elif identifier == 'fasttext': 20 | return FastTextEmbeddings(*args, **kwargs) 21 | else: 22 | raise ValueError('unknown identifier:', identifier) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Max Friedrich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /deid/embeddings/util_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .util import pad_string_sequences, unpad_sequences 4 | from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences 5 | 6 | 7 | def test_pad_string_sequences(): 8 | test_seq = [['apple', 'banana', 'cherry'], ['d', 'e', 'f', 'g'], ['h', 'i', 'j', 'k', 'l', 'q'], ['r']] 9 | padded, seq_length = pad_string_sequences(test_seq) 10 | assert len(padded) == 4 11 | assert len(padded[0]) == 6 12 | assert padded[0][0] == 'apple' 13 | assert padded[0][3] == '' 14 | assert seq_length == [3, 4, 6, 1] 15 | 16 | 17 | def test_unpad_sequences(): 18 | test_seq = [['apple', 'banana', 'cherry', '', ''], ['d', 'e', 'f', 'g', 'h'], ['i', '', '', '', '', ]] 19 | seq = unpad_sequences(test_seq, [3, 5, 1]) 20 | assert len(seq) == 3 21 | assert seq[0] == ['apple', 'banana', 'cherry'] 22 | 23 | 24 | def test_is_reverse_operation(): 25 | test_seq = [[0, 1, 2, 3], [4], [5, 6]] 26 | padded = keras_pad_sequences(test_seq, padding='post') 27 | unpadded = unpad_sequences(padded, [4, 1, 2]) 28 | assert [list(item) for item in unpadded] == test_seq 29 | -------------------------------------------------------------------------------- /deid/data/augment/get.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .strategy import AugmentStrategy, Zeros, RandomEmbedding, RandomDigits, AdditiveNoise, MoveToNeighbor 4 | 5 | 6 | def get(identifier: Optional[str], *args, **kwargs) -> Optional[AugmentStrategy]: 7 | if identifier is None: 8 | return None 9 | elif identifier == 'zeros': 10 | return Zeros() 11 | elif identifier.startswith('random_embedding'): 12 | if '-' in identifier: 13 | scale = float(identifier.split('-')[1]) 14 | return RandomEmbedding(scale, l2_normalize='l2' in identifier) 15 | else: 16 | return RandomEmbedding() 17 | elif identifier == 'random_digits': 18 | return RandomDigits(*args, **kwargs) 19 | elif identifier.startswith('additive_noise'): 20 | scale = float(identifier.split('-')[1]) 21 | return AdditiveNoise(scale) 22 | elif identifier.startswith('move_to_neighbor'): 23 | n_neighbors = int(identifier.split('-')[1]) 24 | return MoveToNeighbor(n_neighbors=n_neighbors, *args, **kwargs) # type: ignore 25 | else: 26 | raise ValueError('unknown identifier:', identifier) 27 | -------------------------------------------------------------------------------- /deid/config_template.yaml.example: -------------------------------------------------------------------------------- 1 | --- 2 | experiment: 3 | type: basic # see config_template_alternating.yaml.example for alternating experiment 4 | binary_classification: false 5 | hipaa_only: false 6 | model: lstm # or adversarial 7 | embeddings: fasttext # or glove, elmo 8 | train_set: train 9 | validation_set: validation 10 | model_args: 11 | hidden_size: # add options for multiple runs like this 12 | choice: 13 | - 64 14 | - 128 15 | - 256 16 | # ... 17 | num_hidden: 18 | choice: 19 | - 1 20 | - 2 21 | input_dropout: 22 | choice: 23 | - 0. 24 | - 0.05 25 | - 0.1 26 | - 0.25 27 | - 0.5 28 | after_hidden_dropout: 0.5 29 | recurrent_dropout: 0.25 30 | training: 31 | optimizer: adam 32 | optimizer_args: 33 | clipnorm: 1. 
34 | train_epochs: 10 35 | early_stopping_patience: 2 36 | batch_size: 32 37 | i2b2_evaluate_every: 2 38 | augment: 39 | strategy: move_to_neighbor-50 # or additive_noise-0.1, etc. 40 | digit_strategy: random_digits 41 | include_original: false 42 | augment_args: 43 | augment_all: false 44 | n_augmentations: 10 45 | test: 46 | run_test: false 47 | -------------------------------------------------------------------------------- /deid/data/feature_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .feature import CaseFeature, apply_features 4 | from .token import Token 5 | 6 | 7 | def test_case(): 8 | feature = CaseFeature() 9 | assert np.all(feature.apply(Token.with_text('1234')) == np.array([0, 1, 0, 0, 0, 0, 0])) # all numeric 10 | assert np.all(feature.apply(Token.with_text('123a')) == np.array([0, 0, 1, 0, 0, 0, 0])) # mainly numeric 11 | assert np.all(feature.apply(Token.with_text('ok4y')) == np.array([0, 0, 0, 1, 0, 0, 0])) # all lower 12 | assert np.all(feature.apply(Token.with_text('OKAY')) == np.array([0, 0, 0, 0, 1, 0, 0])) # all upper 13 | # ... 14 | 15 | 16 | def test_apply_features(): 17 | features = [CaseFeature()] 18 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')]) 19 | assert len(case_features) == 2 20 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0])) 21 | 22 | features = [CaseFeature(), CaseFeature()] 23 | case_features = apply_features(features, [Token.with_text('UPPER'), Token.with_text('CASE')]) 24 | print(case_features) 25 | assert np.all(case_features[0] == np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0])) 26 | -------------------------------------------------------------------------------- /deid/tools/fix_xml_texts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | # noinspection PyProtectedMember 5 | from bs4 import BeautifulSoup, CData 6 | 7 | 8 | def fix_xml(pred_xml, gold_xml): 9 | print(pred_xml, gold_xml) 10 | gold_soup = BeautifulSoup(open(gold_xml, 'r').read(), features='xml') 11 | gold_text = gold_soup.find('TEXT').string 12 | 13 | print(gold_text.count('\n')) 14 | 15 | pred_soup = BeautifulSoup(open(pred_xml, 'r').read(), features='xml') 16 | pred_soup.find('TEXT').string = CData(gold_text) 17 | with open(pred_xml, 'w') as f: 18 | f.write(str(pred_soup)) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('pred', 24 | help='the predictions file or directory') 25 | parser.add_argument('gold', help='the gold file or directory') 26 | 27 | args = parser.parse_args() 28 | 29 | if os.path.isdir(args.pred): 30 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]: 31 | pred_xml = os.path.join(args.pred, xml) 32 | gold_xml = os.path.join(args.gold, xml) 33 | 34 | fix_xml(pred_xml, gold_xml) 35 | else: 36 | fix_xml(args.pred, args.gold) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /deid/tools/fix_180-03.py: -------------------------------------------------------------------------------- 1 | # Fixes a shift in start/end coordinates that is caused by the special characters in "O’neil’s Court" 2 | 3 | import argparse 4 | import os 5 | import re 6 | 7 | 8 | def fixed_contents(contents): 9 | result = '' 10 | edit_here = False 11 | increment_start = False 12 | for line in contents: 13 | if ' 0: 21 | print(' 
false positives:') 22 | for fp in false_positives: 23 | print(' -', fp) 24 | 25 | false_negatives = sets[1] - sets[0] 26 | if len(false_negatives) > 0: 27 | print(' false negatives:') 28 | for fn in false_negatives: 29 | print(' -', fn) 30 | 31 | print('-' * 100) 32 | 33 | 34 | def main(): 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('pred', 37 | help='the predictions file or directory') 38 | parser.add_argument('gold', help='the gold file or directory') 39 | 40 | args = parser.parse_args() 41 | 42 | if os.path.isdir(args.pred): 43 | for xml in [f for f in os.listdir(args.pred) if f.endswith('.xml')]: 44 | pred_xml = os.path.join(args.pred, xml) 45 | gold_xml = os.path.join(args.gold, xml) 46 | 47 | print_differences(pred_xml, gold_xml) 48 | else: 49 | print_differences(args.pred, args.gold) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /deid/data/token.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | from ..tools.i2b2.classes import PHITrackEvaluation 3 | 4 | 5 | class Token(NamedTuple): 6 | text: str 7 | type: str 8 | start: int 9 | end: int 10 | 11 | @classmethod 12 | def with_text(cls, text, label='O'): 13 | """ Mostly useful for unit tests """ 14 | return Token(text, label, 0, 0) 15 | 16 | 17 | # noinspection SpellCheckingInspection 18 | TOKEN_TYPE = { 19 | 'PATIENT': 'NAME', 20 | 'DOCTOR': 'NAME', 21 | 'USERNAME': 'NAME', 22 | 'PROFESSION': 'PROFESSION', 23 | 'ROOM': 'LOCATION', 24 | 'DEPARTMENT': 'LOCATION', 25 | 'HOSPITAL': 'LOCATION', 26 | 'ORGANIZATION': 'LOCATION', 27 | 'STREET': 'LOCATION', 28 | 'CITY': 'LOCATION', 29 | 'STATE': 'LOCATION', 30 | 'COUNTRY': 'LOCATION', 31 | 'ZIP': 'LOCATION', 32 | 'LOCATION-OTHER': 'LOCATION', 33 | 'AGE': 'AGE', 34 | 'DATE': 'DATE', 35 | 'PHONE': 'CONTACT', 36 | 'FAX': 'CONTACT', 37 | 'EMAIL': 'CONTACT', 38 | 'URL': 'CONTACT', 39 | 'IPADDR': 'CONTACT', 40 | 'SSN': 'ID', 41 | 'MEDICALRECORD': 'ID', 42 | 'HEALTHPLAN': 'ID', 43 | 'ACCOUNT': 'ID', 44 | 'LICENSE': 'ID', 45 | 'VEHICLE': 'ID', 46 | 'DEVICE': 'ID', 47 | 'BIOID': 'ID', 48 | 'IDNUM': 'ID', 49 | 'OTHER': 'OTHER' 50 | } 51 | 52 | HIPAA_TOKEN_TYPE = {tag: type for tag, type in TOKEN_TYPE.items() if any([n_re.match(type) and t_re.match(tag) 53 | for n_re, t_re in 54 | PHITrackEvaluation.HIPAA_regexes])} 55 | 56 | BINARY_LABEL = 'PATIENT' 57 | -------------------------------------------------------------------------------- /deid/config_template_alternating.yaml.example: -------------------------------------------------------------------------------- 1 | --- 2 | experiment: 3 | type: alternating 4 | binary_classification: false 5 | hipaa_only: false 6 | model: adversarial 7 | embeddings: 8 | choice: 9 | - fasttext 10 | - glove 11 | extra_features: 12 | - case 13 | train_set: train 14 | validation_set: validation 15 | model_args: 16 | representation_type: lstm 17 | representation_size: 18 | choice: 19 | - 50 20 | - 100 21 | - 300 22 | representation_args: 23 | single_stddev: false 24 | adversaries: 25 | - discriminate-representations 26 | - discriminate-representation-embedding-pair 27 | adversary_args: 28 | input_dropout: 0.0 29 | lstm_size: 300 30 | recurrent_dropout: 0.0 31 | reverse_gradient: false 32 | deidentifier_args: 33 | hidden_size: 128 34 | num_hidden: 2 35 | input_dropout: 0.1 36 | after_hidden_dropout: 0.5 37 | recurrent_dropout: 0.5 38 | use_crf: true 39 | training: 40 | optimizer: nadam 41 | 
optimizer_args: 42 | clipnorm: 1. 43 | pretrain_deidentifier_epochs: 20 44 | pretrain_adversary_epochs: 20 45 | train_epochs: 40 46 | early_stopping_patience: 10 47 | batch_size: 32 48 | batch_size_compound: 0 49 | i2b2_evaluate_every: 100 50 | class_weight: balanced 51 | augment: 52 | strategy: 53 | choice: 54 | - move_to_neighbor-5 55 | - move_to_neighbor-10 56 | - move_to_neighbor-20 57 | - move_to_neighbor-50 58 | - move_to_neighbor-100 59 | - move_to_neighbor-200 60 | - move_to_neighbor-500 61 | augment_args: 62 | n_augmentations: 10 63 | augment_all: false 64 | augment_max: 1 65 | test: 66 | run_test: true 67 | -------------------------------------------------------------------------------- /deid/model/layers/gradient_reversal.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/keras-team/keras/pull/4031 2 | 3 | import tensorflow as tf 4 | from keras import backend as K 5 | from keras.engine import Layer 6 | 7 | 8 | def reverse_gradient(X, hp_lambda): 9 | """Flips the sign of the incoming gradient during training.""" 10 | try: 11 | reverse_gradient.num_calls += 1 12 | except AttributeError: 13 | reverse_gradient.num_calls = 1 14 | 15 | grad_name = "GradientReversal%d" % reverse_gradient.num_calls 16 | 17 | @tf.RegisterGradient(grad_name) 18 | def _flip_gradients(_, grad): 19 | return [tf.negative(grad) * hp_lambda] 20 | 21 | g = K.get_session().graph 22 | with g.gradient_override_map({'Identity': grad_name}): 23 | y = tf.identity(X) 24 | 25 | return y 26 | 27 | 28 | class GradientReversal(Layer): 29 | """Flip the sign of gradient during training.""" 30 | 31 | def __init__(self, hp_lambda=1.0, **kwargs): 32 | super(GradientReversal, self).__init__(**kwargs) 33 | assert hp_lambda > 0, f'hp_lambda is {hp_lambda} -- it should be > 0 to actually flip the gradient' 34 | self.hp_lambda = hp_lambda 35 | self.supports_masking = False 36 | 37 | def build(self, input_shape): 38 | self.trainable_weights = [] 39 | 40 | def call(self, x, mask=None): 41 | return reverse_gradient(x, self.hp_lambda) 42 | 43 | def compute_output_shape(self, input_shape): 44 | return input_shape 45 | 46 | def get_config(self): 47 | config = {'hp_lambda': self.hp_lambda} 48 | base_config = super(GradientReversal, self).get_config() 49 | return {**base_config, **config} 50 | -------------------------------------------------------------------------------- /deid/embeddings/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from . 
import Embeddings 4 | 5 | 6 | class Noise: 7 | def noise(self, size: int) -> None: 8 | raise NotImplementedError 9 | 10 | 11 | class GaussianNoise(Noise): 12 | def __init__(self, scale: float, loc=0., clip=None) -> None: 13 | self.loc = loc 14 | self.scale = scale 15 | self.clip = clip 16 | 17 | def noise(self, size): 18 | result = np.random.normal(self.loc, self.scale, size) 19 | if self.clip is not None: 20 | result = np.clip(result, self.clip[0], self.clip[1]) 21 | return result 22 | 23 | 24 | class DropoutNoise(Noise): 25 | def __init__(self, dropout_prob) -> None: 26 | self.dropout_prob = dropout_prob 27 | 28 | def noise(self, size): 29 | return np.random.choice(2, size, p=[self.dropout_prob, 1 - self.dropout_prob]) 30 | 31 | 32 | class NoiseWrapper(Embeddings): 33 | def __init__(self, embeddings: Embeddings, op, noise: Noise) -> None: 34 | self.wrapped_embeddings = embeddings 35 | self.noise = noise 36 | 37 | if type(op) == str: 38 | if op == 'add' or op == '+': 39 | self.op = lambda x, y: x + y 40 | elif op == 'mul' or op == '*': 41 | self.op = lambda x, y: x * y 42 | else: 43 | raise ValueError(f'Unrecognized op: {op}') 44 | else: 45 | self.op = op 46 | 47 | @property 48 | def size(self): 49 | return self.wrapped_embeddings.size 50 | 51 | def lookup(self, word): 52 | return self.op(self.wrapped_embeddings.lookup(word), self.noise.noise(self.size)) 53 | 54 | def __str__(self): 55 | return f'<{self.__class__.__name__} wrapper of {self.wrapped_embeddings} {vars(self)}>' 56 | -------------------------------------------------------------------------------- /deid/experiment/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import yaml 5 | 6 | from ..env import env 7 | 8 | config_dir = os.path.join(env.resources_dir, 'config') 9 | 10 | 11 | class Config(dict): 12 | """ A dict that returns None for missing items instead of raising an exception, including for child dicts """ 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | for k, v in self.items(): 17 | if k == 'choice': 18 | raise ValueError('This is a config template, not an experiment config. 
Please generate configs from ' 19 | 'it with python -m deid.tools.config') 20 | # please don't put a dict into itself (can't happen when importing from yaml anyway) 21 | if isinstance(v, dict): 22 | self[k] = Config(v) 23 | 24 | def __getitem__(self, key): 25 | if key.endswith('_args'): 26 | return self.get(key, {}) 27 | return self.get(key) 28 | 29 | 30 | def get_config(name): 31 | if os.path.isfile(name): 32 | return load_config_yaml(name) 33 | 34 | for parent in [config_dir, os.path.join(config_dir, 'generated')]: 35 | filename = os.path.join(parent, name) 36 | if os.path.isfile(filename): 37 | return load_config_yaml(filename) 38 | 39 | filename = filename + '.yaml' 40 | if os.path.isfile(filename): 41 | return load_config_yaml(filename) 42 | 43 | raise ValueError(f'Could not locate config "{name}" in config dir') 44 | 45 | 46 | def load_config_yaml(path): 47 | config = Config(yaml.load(open(path))) 48 | config['name'] = '.'.join(os.path.basename(path).split('.')[:-1]) 49 | config['path'] = path 50 | sys.stderr.write(f"Using {config['name']} config.\n") 51 | return config 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | temp 3 | deid/temp 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | .dmypy.json 115 | dmypy.json 116 | 117 | # Pyre type checker 118 | .pyre/ 119 | -------------------------------------------------------------------------------- /deid/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Optional 4 | 5 | deid_dir = os.path.dirname(__file__) 6 | 7 | 8 | # Defining the attributes as static variables isn't super smart as they are all created even if the config is not used. 9 | # This means os.environ['SOME_SPECIFIC_VAR'] will crash in other environments, so we have to use os.environ.get(). 10 | class Environment: 11 | name: str 12 | deid_dir: str = deid_dir 13 | data_dir: str 14 | work_dir: str 15 | resources_dir: str 16 | results_dir: str 17 | limit_training_documents: Optional[int] 18 | limit_validation_documents: Optional[int] 19 | use_short_sentences: bool 20 | keras_verbose: int 21 | save_model: int 22 | embeddings_cache: bool 23 | experiment_dir_postfix: Optional[str] = None 24 | 25 | unk_token: str = '' 26 | sent_start = '' 27 | sent_end = '' 28 | 29 | 30 | class Development(Environment): 31 | name = 'development' 32 | work_dir = os.path.join(os.environ['HOME'], 'deid_work') 33 | resources_dir = os.path.join(os.environ['HOME'], 'deid_resources') 34 | data_dir = os.path.join(resources_dir, 'i2b2_data') 35 | limit_training_documents = None # set this to e.g. 
10 for faster experimentation 36 | limit_validation_documents = None 37 | use_short_sentences = False 38 | keras_verbose = 1 39 | save_model = True 40 | embeddings_cache = True 41 | 42 | 43 | class Test(Environment): 44 | name = 'unit test' 45 | work_dir = os.path.join(deid_dir, 'fixtures', 'deid_work') 46 | resources_dir = os.path.join(deid_dir, 'fixtures', 'deid_resources') 47 | data_dir = os.path.join(resources_dir, 'i2b2_data') 48 | limit_training_documents = 4 49 | limit_validation_documents = 2 50 | use_short_sentences = True 51 | keras_verbose = 1 52 | save_model = False 53 | embeddings_cache = True 54 | 55 | 56 | env: Environment 57 | if 'DEID_TEST_CONFIG' in os.environ.keys() and os.environ['DEID_TEST_CONFIG']: 58 | env = Test() 59 | else: 60 | env = Development() 61 | sys.stderr.write(f'Using {env.name} environment.\n') 62 | -------------------------------------------------------------------------------- /deid/data/feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .token import Token 4 | from .util import one_hot 5 | 6 | 7 | def get(identifier): 8 | if identifier == 'case': 9 | return CaseFeature() 10 | if identifier == 'one': 11 | return UselessOneFeature() 12 | raise ValueError(f'unknown feature identifier: {identifier}') 13 | 14 | 15 | class Feature: 16 | def apply(self, token) -> np.ndarray: 17 | raise NotImplementedError 18 | 19 | @property 20 | def dimension(self): 21 | return NotImplementedError 22 | 23 | 24 | class CaseFeature(Feature): 25 | """ Casing feature from Reimers and Gurevych (2017) https://arxiv.org/abs/1707.06799 """ 26 | OTHER = 0 27 | NUMERIC = 1 28 | MAINLY_NUMERIC = 2 29 | ALL_LOWER = 3 30 | ALL_UPPER = 4 31 | INITIAL_UPPER = 5 32 | CONTAINS_DIGIT = 6 33 | 34 | def apply(self, token: Token) -> np.ndarray: 35 | token = token.text 36 | 37 | num_digits = len([char for char in token if char.isdigit()]) 38 | digit_fraction = num_digits / len(token) 39 | 40 | if token.isdigit(): 41 | casing = self.NUMERIC 42 | elif digit_fraction > 0.5: 43 | casing = self.MAINLY_NUMERIC 44 | elif token.islower(): 45 | casing = self.ALL_LOWER 46 | elif token.isupper(): 47 | casing = self.ALL_UPPER 48 | elif token[0].isupper(): 49 | casing = self.INITIAL_UPPER 50 | elif num_digits > 0: 51 | casing = self.CONTAINS_DIGIT 52 | else: 53 | casing = self.OTHER 54 | 55 | return one_hot(casing, 7) 56 | 57 | @property 58 | def dimension(self): 59 | return 7 60 | 61 | 62 | class UselessOneFeature(Feature): 63 | def apply(self, token) -> np.ndarray: 64 | return np.array([1]) 65 | 66 | @property 67 | def dimension(self): 68 | return 1 69 | 70 | 71 | def apply_features(features, sent): 72 | if len(features) == 0: 73 | return np.array([np.array([]) for _ in sent]) 74 | return np.array([np.concatenate([feature.apply(word) for feature in features]) for word in sent]) 75 | -------------------------------------------------------------------------------- /deid/data/util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Sequence, Optional, Any 3 | from spacy.util import compounding as spacy_compounding 4 | import numpy as np 5 | 6 | 7 | def one_hot(x: int, n: int) -> np.ndarray: 8 | result = np.zeros(n) 9 | result[x] = 1 10 | return result 11 | 12 | 13 | def compounding(start, stop, compound): 14 | """ Wraps spaCy's compounding utility to always return ints. 15 | 16 | >>> sizes = compounding(1., 10., 1.5) 17 | >>> assert next(sizes) == 1. 
18 | >>> assert next(sizes) == int(1 * 1.5) 19 | >>> assert next(sizes) == int(1.5 * 1.5) 20 | """ 21 | return (int(result) for result in spacy_compounding(start, stop, compound)) 22 | 23 | 24 | def peek(iterator): 25 | item = next(iterator) 26 | return item, itertools.chain([item], iterator) 27 | 28 | 29 | def pad_2d_sequences(seq: Sequence[Any], maxlen: Optional[int] = None, 30 | embedding_size: Optional[int] = None) -> np.ndarray: 31 | """ Like keras.preprocessing.sequence.pad_sequences but for 2d (already embedded) sequences. 32 | 33 | Caveat: this function does not truncate inputs. An error will be raised if the specified maxlen is smaller than the 34 | actual maximum length in the sequence. 35 | 36 | :param seq: the input sequence 37 | :param maxlen: the length to which the result will be padded, may be None 38 | :param embedding_size: the embedding dimension of the input, may be None 39 | :return: a padded array 40 | """ 41 | 42 | # find the maximum length by looking through the sequence 43 | if maxlen is None: 44 | maxlen = -1 45 | for item in seq: 46 | maxlen = max(maxlen, len(item)) 47 | 48 | # find the embedding dimension by looking through the sequence until there is a non-empty item 49 | if embedding_size is None: 50 | for item in seq: 51 | if len(item) != 0: 52 | embedding_size = len(item[0]) 53 | break 54 | 55 | result = np.zeros((len(seq), maxlen, embedding_size)) 56 | for i, item in enumerate(seq): 57 | assert len(item) > 0 58 | result[i, -len(item):] = item 59 | return result 60 | -------------------------------------------------------------------------------- /deid/experiment/alternating_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import numpy as np 7 | from keras.callbacks import EarlyStopping, LambdaCallback 8 | from keras.utils.generic_utils import Progbar 9 | 10 | from .alternating import alternating_experiment 11 | from ..env import env 12 | 13 | 14 | def make_progress_bar(target): 15 | return Progbar(target=target, verbose=env.keras_verbose) 16 | 17 | 18 | def alternating_evaluation_experiment(config): 19 | weights = config['test']['test_weights'] 20 | model, tr, train_gen, val, valid_gen, experiment_dir = alternating_experiment(config, run_experiment=False) 21 | 22 | model.complete_model.load_weights(weights) 23 | 24 | batch_size = config['training']['batch_size'] 25 | test_batch_size = config['training']['test_batch_size'] 26 | if test_batch_size is None: 27 | test_batch_size = batch_size 28 | 29 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 30 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 31 | 32 | before_fine_tuning_weights = model.train_representer.get_weights() 33 | 34 | def assert_fixed_weights(): 35 | after_fine_tuning_weights = model.train_representer.get_weights() 36 | for i in range(len(before_fine_tuning_weights)): 37 | assert np.all(before_fine_tuning_weights[i] == after_fine_tuning_weights[i]) 38 | 39 | assert_fixed_representer = LambdaCallback(on_epoch_end=lambda epoch, logs: assert_fixed_weights()) 40 | callbacks = [early_stopping, flush, assert_fixed_representer] 41 | 42 | print('Training adversary') 43 | history = model.pretrain_adversary.fit_generator(train_gen, 44 | epochs=config['training']['train_epochs'], 45 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 46 | validation_data=valid_gen, 47 | 
validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 48 | callbacks=callbacks, 49 | verbose=env.keras_verbose) 50 | 51 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 52 | print('Saving history to', history_pickle_path) 53 | with open(history_pickle_path, 'wb') as f: 54 | pickle.dump(history.history, f) 55 | -------------------------------------------------------------------------------- /deid/embeddings/embeddings.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Dict 2 | 3 | import numpy as np 4 | 5 | 6 | class Embeddings: 7 | """ Flexible base class for embeddings that doesn't necessarily use a matrix """ 8 | 9 | @property 10 | def size(self) -> int: 11 | raise NotImplementedError 12 | 13 | @property 14 | def mean(self) -> float: 15 | return 0. 16 | 17 | @property 18 | def std(self) -> float: 19 | raise NotImplementedError 20 | 21 | def is_unknown(self, word: str) -> bool: 22 | raise NotImplementedError 23 | 24 | def lookup(self, word: str) -> np.ndarray: 25 | """ Looks up the vector representation of one word. 26 | 27 | :param word: an input string 28 | :return: a vector representation of size `size` 29 | """ 30 | raise NotImplementedError 31 | 32 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]: 33 | """ Looks up the vector representation of multiple words. Override this if there is a more efficient way to get 34 | a batch of embeddings than looking them up one by one. 35 | 36 | :param words: a sequence of input strings 37 | :return: a vector representation of size `(len(words), size)` 38 | """ 39 | return np.array([self.lookup(word) for word in words]) 40 | 41 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]: 42 | """ Looks up the vector representation of an entire sentence. Override this if there is a more efficient way to 43 | get a batch of embeddings sequences than looking them up one by one. 44 | 45 | :param sentences: a sequence of sequences of input strings 46 | :return: a sequence of arrays that have size `(len(sentence), size)` for the corresponding sentence 47 | """ 48 | 49 | return [self.lookup_sentence(sentence) for sentence in sentences] 50 | 51 | 52 | class PrecomputedEmbeddings(Embeddings): 53 | """ Base class for embeddings that provide a precomputed matrix in addition to the lookup """ 54 | 55 | @property 56 | def size(self) -> int: 57 | raise NotImplementedError 58 | 59 | @property 60 | def std(self) -> float: 61 | raise NotImplementedError 62 | 63 | def is_unknown(self, word: str) -> bool: 64 | raise NotImplementedError 65 | 66 | def lookup(self, word: str) -> np.ndarray: 67 | raise NotImplementedError 68 | 69 | @property 70 | def precomputed_word2ind(self) -> Dict[str, int]: 71 | raise NotImplementedError 72 | 73 | @property 74 | def precomputed_matrix(self) -> np.ndarray: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /deid/data/tokenizer_tests.py: -------------------------------------------------------------------------------- 1 | from . 
import tokenize 2 | 3 | 4 | def assert_number_of_tokens(doc, count): 5 | assert len(doc) == count, f'token sequence {[str(t) for t in doc]} has length {len(doc)}, expected {count}' 6 | 7 | 8 | def assert_number_of_sentences(doc, count): 9 | sents = list(str(sent) for sent in doc.sents) 10 | assert len(sents) == count, f'doc {sents} has {len(sents)} sentences, expected {count}' 11 | 12 | 13 | def test_tokenize_one_sentence(): 14 | doc = tokenize('A sentence that is simple to tokenize.') 15 | assert_number_of_tokens(doc, 8) 16 | assert doc[-1].text == '.' 17 | 18 | 19 | def test_tokenize_multiple_sentences(): 20 | doc = tokenize('One sentence. And another sentence.') 21 | assert_number_of_sentences(doc, 2) 22 | 23 | 24 | def test_tokenize_phone_number(): 25 | doc = tokenize('555-2394-72-01') 26 | assert_number_of_tokens(doc, 7) 27 | 28 | 29 | def test_tokenize_custom_infixes(): 30 | doc = tokenize('a/b') 31 | assert_number_of_tokens(doc, 3) 32 | 33 | doc = tokenize('a_b_c') 34 | assert_number_of_tokens(doc, 5) 35 | 36 | doc = tokenize('81-year-old') 37 | tokens = [str(t) for t in doc] 38 | assert tokens == ['81', '-', 'year', '-', 'old'] 39 | 40 | doc = tokenize('a^b') 41 | tokens = [str(t) for t in doc] 42 | assert tokens == ['a', '^', 'b'] 43 | 44 | doc = tokenize('25yo') 45 | tokens = [str(t) for t in doc] 46 | assert tokens == ['25', 'yo'] 47 | 48 | 49 | def test_tokenize_sentences(): 50 | doc = tokenize('Here is some text that is followed by many newlines\n \n \n \n \nAnd here is some other text.') 51 | assert_number_of_sentences(doc, 2) 52 | 53 | doc = tokenize("""- First list item 54 | - and the second list item, which does not necessarily look like a sentence start.""") 55 | assert_number_of_sentences(doc, 2) 56 | 57 | doc = tokenize("""1. test 58 | 2: ok""") 59 | assert_number_of_sentences(doc, 2) 60 | 61 | doc = tokenize("""----list with unusual format 62 | ----starting with some dashes, no space between dashes and first word 63 | ---sometimes it's a different number of dashes""") 64 | assert_number_of_sentences(doc, 6) 65 | 66 | 67 | def test_tokenize_html(): 68 | doc = tokenize('NASA & SpaceX') 69 | tokens = [t for t in doc] 70 | assert [str(t) for t in tokens] == ['NASA', '&', 'SpaceX'] 71 | assert tokens[0]._.unescaped_html is None 72 | assert tokens[1]._.unescaped_html == '&' 73 | 74 | doc = tokenize('NASA > SpaceX') 75 | tokens = [t for t in doc] 76 | assert [str(t) for t in tokens] == ['NASA', '>', 'SpaceX'] 77 | assert tokens[0]._.unescaped_html is None 78 | assert tokens[1]._.unescaped_html == '>' 79 | 80 | doc = tokenize('Nasa
SpaceX') 81 | tokens = [t for t in doc] 82 | assert tokens[1]._.unescaped_html == '\n' 83 | -------------------------------------------------------------------------------- /deid/tools/dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import shutil 5 | from datetime import datetime 6 | 7 | from typing import NamedTuple 8 | 9 | from ..env import env 10 | 11 | NUM_TRAIN_VALID_DOCS = 790 # the total number of train + validation docs in the i2b2 dataset 12 | 13 | 14 | class Document(NamedTuple): 15 | csv: str 16 | txt: str 17 | xml: str 18 | 19 | 20 | def make_dataset(train_split: float, valid_split: float): 21 | data_dir = os.path.join(env.data_dir, 'generated') 22 | if not os.path.isdir(data_dir): 23 | os.mkdir(data_dir) 24 | 25 | date_str = datetime.now().strftime('%Y%m%d-%H%M%S') 26 | config = f'{train_split}-{valid_split}-{date_str}' 27 | 28 | dataset_train_dir = os.path.join(data_dir, f'train-{config}') 29 | os.mkdir(dataset_train_dir) 30 | 31 | dataset_train_xml_dir = os.path.join(data_dir, f'train-{config}_xml') 32 | os.mkdir(dataset_train_xml_dir) 33 | 34 | dataset_valid_dir = os.path.join(data_dir, f'validation-{config}') 35 | os.mkdir(dataset_valid_dir) 36 | 37 | dataset_valid_xml_dir = os.path.join(data_dir, f'validation-{config}_xml') 38 | os.mkdir(dataset_valid_xml_dir) 39 | 40 | all_documents = [] 41 | for dataset in ['train', 'validation']: 42 | dataset_dir = os.path.join(env.data_dir, dataset) 43 | dataset_xml_dir = os.path.join(env.data_dir, dataset + '_xml') 44 | for filename in [filename for filename in os.listdir(dataset_dir) if filename.endswith('csv')]: 45 | csv_filename = os.path.join(dataset_dir, filename) 46 | txt_filename = os.path.join(dataset_dir, filename[:-3] + 'txt') 47 | xml_filename = os.path.join(dataset_xml_dir, filename[:-3] + 'xml') 48 | all_documents.append(Document(csv=csv_filename, xml=xml_filename, txt=txt_filename)) 49 | 50 | size = min(max(int(train_split * NUM_TRAIN_VALID_DOCS), 2), NUM_TRAIN_VALID_DOCS) 51 | train_documents = random.sample(all_documents, size) 52 | valid_size = max(int(valid_split * len(train_documents)), 1) 53 | valid_documents = random.sample(train_documents, valid_size) 54 | print(f'Using {size-valid_size} train documents and {valid_size} validation documents.') 55 | 56 | for document in train_documents: 57 | target = dataset_valid_dir if document in valid_documents else dataset_train_dir 58 | shutil.copy2(document.csv, target) 59 | shutil.copy2(document.txt, target) 60 | shutil.copy2(document.xml, target + '_xml') 61 | 62 | print(f'Made dataset at {dataset_train_dir}, {dataset_valid_dir}') 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.description = 'Make train and validation sets of a specified size' 68 | parser.add_argument('train_split', type=float) 69 | parser.add_argument('--valid_split', type=float, default=0.2) 70 | args = parser.parse_args() 71 | 72 | make_dataset(args.train_split, args.valid_split) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /deid/model/layers/noise.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.engine.topology import Layer 3 | 4 | 5 | class Noise(Layer): 6 | """ Abstract Gaussian Noise layer with trainable mean and standard deviation """ 7 | 8 | def __init__(self, operation, single_stddev: 
bool, apply_noise: bool = True, **kwargs) -> None: 9 | """ Initializes the Noise layer. 10 | 11 | :param operation: the operation to apply to the inputs and noise, may be '+'/'add' or '*'/'mult'. The mean of 12 | the noise will be set according to this operator. 13 | :param single_stddev: whether to learn a matrix of noise stddev values instead of only one stddev value that is 14 | applied to all dimensions of the data 15 | :param apply_noise: set this to False to only apply the mean instead of noise 16 | :param kwargs: other Layer arguments 17 | """ 18 | super().__init__(**kwargs) 19 | if operation == '+' or operation == 'add': 20 | self.operation = lambda x, y: x + y 21 | self.mean = 0. 22 | elif operation == '*' or operation == 'mult': 23 | self.operation = lambda x, y: x * y 24 | self.mean = 1. 25 | else: 26 | raise ValueError(f'unknown operation: {operation}') 27 | 28 | self.apply_noise = K.constant(value=apply_noise) 29 | self.single_stddev = single_stddev 30 | self.k = self.stddev = None # will be initialized in the build method 31 | 32 | self.supports_masking = True 33 | 34 | def build(self, input_shape): 35 | self.k = self.add_weight(name='k', 36 | shape=(1,), 37 | initializer='ones', 38 | trainable=True) 39 | self.stddev = self.add_weight(name='stddev', 40 | shape=(1,) if self.single_stddev else (input_shape[-1],), 41 | initializer='normal', 42 | trainable=True) 43 | super().build(input_shape) 44 | 45 | def compute_output_shape(self, input_shape): 46 | return input_shape 47 | 48 | def call(self, inputs, **kwargs): 49 | def noise(): 50 | noise_matrix = K.random_normal(shape=K.shape(inputs), mean=self.mean, stddev=self.stddev) 51 | return self.operation(inputs, self.k * noise_matrix) 52 | 53 | return K.switch(self.apply_noise, noise, inputs) 54 | 55 | def get_config(self): 56 | config = {'apply_noise': self.apply_noise, 57 | 'mean': self.mean, 58 | 'single_stddev': self.single_stddev, 59 | 'k': self.k} 60 | base_config = super().get_config() 61 | return {**base_config, **config} 62 | 63 | 64 | class AdditiveNoise(Noise): 65 | def __init__(self, **kwargs): 66 | super().__init__('+', **kwargs) 67 | 68 | 69 | class MultiplicativeNoise(Noise): 70 | def __init__(self, **kwargs): 71 | super().__init__('*', **kwargs) 72 | -------------------------------------------------------------------------------- /deid/model/deidentifier.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Input, Dense, LSTM, Bidirectional, TimeDistributed, Masking, Dropout, concatenate, Lambda 3 | from keras.models import Model 4 | from keras_contrib.layers import CRF 5 | 6 | 7 | def make_lstm_crf(input_size, hidden_size, output_size, name='deidentifier', extra_input_size=0, num_hidden=1, 8 | input_dropout=0., recurrent_dropout=0., after_hidden_dropout=0., use_crf=False, optimizer=None, 9 | l2_normalize=False): 10 | """ Make a BiLSTM(-CRF) model that can be used for de-identification. 
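    An illustrative call (a sketch only; the 300/128/9 sizes and the Nadam optimizer are
    placeholder choices, not values fixed by this function):

        from keras.optimizers import Nadam
        model = make_lstm_crf(input_size=300, hidden_size=128, output_size=9,
                              num_hidden=1, use_crf=False, optimizer=Nadam())
        # compiled single-input model: expects batches of shape (batch, timesteps, 300)
        # and predicts per-token label distributions of shape (batch, timesteps, 9)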
11 | 12 | :param input_size: the embedding/representation input size 13 | :param hidden_size: the number of LSTM units per direction 14 | :param output_size: the number of output labels 15 | :param name: a name for the model 16 | :param extra_input_size: size for an additional input, if it is 0, this returns a single-input model 17 | :param num_hidden: the number of LSTM layers 18 | :param input_dropout: dropout probability for the input layer 19 | :param recurrent_dropout: recurrent (variational) dropout probability 20 | :param after_hidden_dropout: dropout probability for the LSTM outputs 21 | :param use_crf: whether to use a CRF to optimize the output sequences 22 | :param optimizer: a Keras optimizer, or None if the model should not be compiled 23 | :param l2_normalize: whether to L2 normalize the embedding/representation input 24 | :return: a tuple (model, loss), or a compiled Keras model if an optimizer was specified 25 | """ 26 | embedding_input = Input(shape=(None, input_size)) 27 | x = Masking()(embedding_input) 28 | if l2_normalize: 29 | x = Lambda(lambda x: K.l2_normalize(x, axis=-1))(x) 30 | x = Dropout(input_dropout)(x) 31 | 32 | extra_input = Input(shape=(None, extra_input_size)) 33 | if extra_input_size > 0: 34 | x2 = Masking()(extra_input) 35 | x = concatenate([x, x2]) 36 | 37 | for _ in range(num_hidden): 38 | x = Bidirectional(LSTM(hidden_size, return_sequences=True, dropout=after_hidden_dropout, 39 | recurrent_dropout=recurrent_dropout))(x) 40 | if use_crf: 41 | # CRF learn mode 'join' does not work at the moment, this GitHub issue contains a minimal example showing 42 | # the problem: https://github.com/keras-team/keras-contrib/issues/271 43 | x = TimeDistributed(Dense(output_size, activation=None))(x) 44 | crf = CRF(output_size, sparse_target=True, learn_mode='marginal', name='deid_output') 45 | x = crf(x) 46 | loss = crf.loss_function 47 | else: 48 | x = TimeDistributed(Dense(output_size, activation='softmax'), name='deid_output')(x) 49 | loss = 'sparse_categorical_crossentropy' 50 | 51 | if extra_input_size > 0: 52 | model = Model([embedding_input, extra_input], x, name=name) 53 | else: 54 | model = Model(embedding_input, x, name=name) 55 | 56 | if optimizer is not None: 57 | model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) 58 | return model 59 | return model, loss 60 | -------------------------------------------------------------------------------- /deid/embeddings/glove.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import Optional, Dict 4 | 5 | import numpy as np 6 | 7 | from . import PrecomputedEmbeddings 8 | from ..env import env 9 | 10 | glove_dir = os.path.join(env.resources_dir, 'glove.6B') 11 | 12 | 13 | class GloveEmbeddings(PrecomputedEmbeddings): 14 | """ Pre-trained GloVe embeddings, see https://nlp.stanford.edu/projects/glove/ """ 15 | 16 | def __init__(self, dims: int = 300, vocab_size: Optional[int] = None) -> None: 17 | """ Initialize a GloveEmbeddings object. 
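        A hedged usage sketch (assumes the glove.6B.300d.txt file is available under the
        resources directory; the vocab_size cap of 100000 is only an example value):

            emb = GloveEmbeddings(dims=300, vocab_size=100000)
            vec = emb.lookup('hospital')   # L2-normalized vector of shape (300,)
            emb.is_unknown('zzxqwv')       # typically True: the word falls back to the UNK index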
18 | 19 | :param dims: the GloVe variant to use (50, 100, 200, or 300 dimensions) 20 | :param vocab_size: limits the size of the embedding matrix 21 | """ 22 | self._dims = dims 23 | filename = os.path.join(glove_dir, f'glove.6B.{dims}d.txt') 24 | if not os.path.isfile(filename): 25 | raise ValueError(f"Can't find GloVe embeddings with {dims} dims in {glove_dir}.") 26 | 27 | embeddings = [np.zeros(dims), np.random.normal(0., scale=1e-6, size=dims)] # Padding and UNK 28 | self._word2ind = {env.unk_token: 1} 29 | self._ind2word = {1: env.unk_token} 30 | 31 | with open(filename) as f: 32 | for i, line in enumerate(f, start=2): 33 | values = line.split() 34 | word = values[0] 35 | embedding = np.asarray(values[1:], dtype='float32') 36 | self._word2ind[word] = i 37 | self._ind2word[i] = word 38 | embeddings.append(embedding / np.linalg.norm(embedding)) 39 | if i == vocab_size: 40 | break 41 | 42 | self._embeddings = np.array(embeddings) 43 | 44 | @property 45 | def precomputed_word2ind(self) -> Dict[str, int]: 46 | return self._word2ind 47 | 48 | @property 49 | def precomputed_matrix(self) -> np.ndarray: 50 | return self._embeddings 51 | 52 | @property 53 | def size(self) -> int: 54 | return self._dims 55 | 56 | @property 57 | def std(self): 58 | return 0.37 59 | 60 | def word2ind(self, word: str) -> int: 61 | result = self._word2ind.get(word) 62 | if result is not None: 63 | return result 64 | 65 | word = word.lower() 66 | result = self._word2ind.get(word) 67 | if result is not None: 68 | return result 69 | 70 | word = re.sub(r'\W', '', word) 71 | result = self._word2ind.get(word) 72 | if result is not None: 73 | return result 74 | 75 | # replace every digit with a 0 76 | result = self._word2ind.get(re.sub(r'\d', '0', word)) 77 | if result is not None: 78 | return result 79 | 80 | # replace all connected digits with a single 0 81 | result = self._word2ind.get(re.sub(r'\d*', '0', word)) 82 | if result is not None: 83 | return result 84 | 85 | return self._word2ind[env.unk_token] 86 | 87 | def lookup(self, word: str) -> np.ndarray: 88 | return self._embeddings[self.word2ind(word)] 89 | 90 | def is_unknown(self, word: str): 91 | return np.all(self.word2ind(word) == self._word2ind[env.unk_token]) 92 | 93 | def __str__(self) -> str: 94 | return '' 95 | -------------------------------------------------------------------------------- /deid/tools/i2b2_xml_to_csv_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from deid.env import Test 4 | from .i2b2_xml_to_csv import xml_to_annotated_tokens_and_text 5 | 6 | 7 | def find_token(tokens, text): 8 | return next((index, token) for index, token in enumerate(tokens) if token.text.startswith(text)) 9 | 10 | 11 | def test_xml_to_annotated_tokens_and_text(): 12 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-99.xml'), 13 | check_alignment=True) 14 | 15 | _, date_token = find_token(tokens, '2018') 16 | assert date_token.type == 'B-DATE' 17 | assert date_token.start == 16 18 | 19 | max_index, max_token = find_token(tokens, 'Max') 20 | assert max_token.type == 'B-PATIENT' 21 | assert max_token.start == 28 22 | assert max_token.end == 31 23 | 24 | assert tokens[max_index + 1].type == 'I-PATIENT' 25 | assert tokens[max_index + 2].type == 'O' 26 | 27 | lines = text.strip().split('\n') 28 | assert lines[0] == 'Record date: 2018-06-15' 29 | assert lines[2] == 'Max Friedrich is a 25-year-old Computer science student living in Hamburg, Germany.' 
30 | 31 | 32 | def test_tags_right_after_each_other(): 33 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-98.xml'), 34 | check_alignment=True) 35 | one_index, one_token = find_token(tokens, 'one') 36 | assert one_token.type == 'B-DATE' 37 | 38 | two_token = tokens[one_index + 1] 39 | assert two_token.type == 'B-AGE' 40 | 41 | three_token = tokens[one_index + 2] 42 | assert three_token.type == 'I-AGE' 43 | 44 | four_token = tokens[one_index + 3] 45 | assert four_token.type == 'B-DATE' 46 | 47 | medical_record_token = tokens[one_index + 4] 48 | assert medical_record_token.type == 'B-MEDICALRECORD' 49 | 50 | hospital_token = tokens[one_index + 5] 51 | print(hospital_token) 52 | assert hospital_token.type == 'B-HOSPITAL' 53 | 54 | 55 | def test_uses_start_tag_even_with_wrong_alignment(): 56 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-97.xml'), 57 | check_alignment=True) 58 | print(tokens) 59 | zero_index, zero_token = find_token(tokens, 'zero') 60 | assert zero_token.type == 'O' 61 | 62 | one_token = tokens[zero_index + 1] 63 | assert one_token.type == 'B-DATE' 64 | 65 | two_token = tokens[zero_index + 2] 66 | assert two_token.type == 'B-AGE' # not I-AGE 67 | 68 | three_token = tokens[zero_index + 3] 69 | assert three_token.type == 'O' 70 | 71 | four_token = tokens[zero_index + 4] 72 | assert four_token.type == 'B-DATE' 73 | 74 | 75 | def test_escape_html(): 76 | tokens, text = xml_to_annotated_tokens_and_text(os.path.join(Test().data_dir, 'train_xml', '999-96.xml'), 77 | check_alignment=True) 78 | print(tokens) 79 | zero_index, zero_token = find_token(tokens, 'zero') 80 | assert zero_token.type == 'O' 81 | 82 | lt_token = tokens[zero_index + 1] 83 | assert lt_token.text == '<' 84 | assert lt_token.type == 'O' 85 | assert lt_token.start == 8 86 | 87 | one_token = tokens[zero_index + 2] 88 | assert one_token.type == 'B-DATE' 89 | assert one_token.start == 13 90 | -------------------------------------------------------------------------------- /deid/data/batch_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .batch import BatchGenerator, StratifiedSampling 4 | from .util import compounding 5 | from ..data import TrainingSet 6 | from ..embeddings import DummyEmbeddings 7 | 8 | 9 | def test_generator(): 10 | batch_size = 2 11 | tr = TrainingSet(limit_documents=1, embeddings=DummyEmbeddings()) 12 | 13 | generator = BatchGenerator(tr.X, tr.y, batch_size) 14 | x, y = next(generator) 15 | assert x.shape[0] == y.shape[0] == batch_size 16 | 17 | 18 | def test_generator_yields_incomplete_batches(): 19 | def make_array(): 20 | return np.array([[[i for _ in range(10)] for _ in range(3)] for i in range(3)]) 21 | 22 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=True) 23 | assert generator.epoch_length == 2 24 | x, y = next(generator) 25 | assert x.shape[0] == y.shape[0] == 2 26 | 27 | x, y = next(generator) 28 | assert x.shape[0] == y.shape[0] == 1 29 | 30 | x, y = next(generator) 31 | assert x.shape[0] == y.shape[0] == 2 32 | 33 | generator = BatchGenerator(make_array(), make_array(), batch_size=2, yield_incomplete_batches=False) 34 | assert generator.epoch_length == 1 35 | x, y = next(generator) 36 | assert x.shape[0] == y.shape[0] == 2 37 | 38 | x, y = next(generator) 39 | assert x.shape[0] == y.shape[0] == 2 40 | 41 | 42 | def test_generator_compounding_batch_size(): 43 | def 
make_array(): 44 | return np.ones((100, 10, 1)) 45 | 46 | generator = BatchGenerator(make_array(), make_array(), batch_size=compounding(1, 20, 1.1), 47 | yield_incomplete_batches=False) 48 | compounding_value = 1 49 | 50 | sum = 0 51 | print('batch sizes:', generator.epoch_batch_sizes) 52 | print('epoch length:', generator.epoch_length) 53 | for i in range(40): # 1 * 1.1**40 ≈ 45, so it's testing the maximum size as well 54 | compounding_value = min(20, int(1.1 ** i)) 55 | x, y = next(generator) 56 | sum += x.shape[0] 57 | print(f'({i})', x.shape[0], '=', compounding_value, sum) 58 | assert x.shape[0] == y.shape[0] == int(compounding_value) 59 | 60 | assert compounding_value == 20 61 | 62 | 63 | def test_generator_yields_permutation(): 64 | def make_array(): 65 | return np.arange(0, 100).reshape((10, 10, 1)) 66 | 67 | x, y = make_array(), make_array() 68 | generator = BatchGenerator(x, y, batch_size=5, yield_indices=True) 69 | 70 | for _ in range(5): # so we shuffle a couple of times 71 | batch_x, batch_y, batch_ind = next(generator) 72 | assert np.all(batch_x[0] == x[batch_ind[0]]) 73 | 74 | 75 | def test_stratified_sampling(): 76 | def make_array(): 77 | arr = np.zeros((100, 10, 1)) 78 | for i in range(100): 79 | arr[i] = np.ones((10, 1)) * i 80 | return arr 81 | 82 | x, y = make_array(), make_array() 83 | generator = StratifiedSampling(x, y, split_condition=lambda x, _: x[-1] >= 20, batch_size=6, yield_indices=True) 84 | 85 | assert generator.epoch_length == 7 86 | 87 | batch_x, batch_y, batch_ind = next(generator) 88 | assert np.all(batch_x[0] == x[batch_ind[0]]) 89 | assert batch_x.size == 60 90 | assert batch_x[batch_x >= 20].size == 30 # half of them 91 | 92 | for _ in range(1, generator.epoch_length): 93 | batch_x, batch_y, batch_ind = next(generator) 94 | assert batch_x.size == 40 # last batch should be incomplete 95 | 96 | batch_x, batch_y, batch_ind = next(generator) 97 | assert batch_x.size == 60 # next epoch 98 | -------------------------------------------------------------------------------- /deid/tools/embeddings.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import pickle 4 | 5 | import numpy as np 6 | 7 | 8 | def _create_cache(embeddings_class, lookup_sentences): 9 | from ..data import DataSet, TrainingSet, ValidationSet, TestSet 10 | 11 | def sentences_from_dataset(dataset: DataSet): 12 | return [[token.text for token in sentence] for sentence in dataset.X] 13 | 14 | def words_from_dataset(dataset: DataSet): 15 | return list(itertools.chain.from_iterable(sentences_from_dataset(dataset))) 16 | 17 | read_dataset = sentences_from_dataset if lookup_sentences else words_from_dataset 18 | 19 | print('Loading the vocabulary...') 20 | tr = TrainingSet() 21 | vocab = read_dataset(tr) 22 | vocab += read_dataset(ValidationSet(validation_set='validation', embeddings=None, label2ind=tr.label2ind)) 23 | vocab += read_dataset(ValidationSet(validation_set='test', embeddings=None, label2ind=tr.label2ind)) 24 | 25 | print('Loading embeddings...') 26 | embeddings_class(vocab) 27 | print('Done.') 28 | 29 | 30 | def create_fasttext_cache(): 31 | from ..embeddings import CachedFastTextEmbeddings 32 | _create_cache(CachedFastTextEmbeddings, lookup_sentences=False) 33 | 34 | 35 | def create_elmo_cache(): 36 | from ..embeddings import CachedElmoEmbeddings 37 | _create_cache(CachedElmoEmbeddings, lookup_sentences=True) 38 | 39 | 40 | def convert_precomputed_fasttext_embeddings(): 41 | from ..embeddings.fasttext import 
fasttext_dir, fasttext_embeddings_name 42 | 43 | print('Loading precomputed embeddings...') 44 | vec_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec') 45 | 46 | precomputed_vocab = np.loadtxt(vec_filename, usecols=0, dtype=object, skiprows=2, comments=None) 47 | precomputed_word2ind = {word: i for i, word in enumerate(precomputed_vocab)} 48 | 49 | # make sure there are no duplicate words 50 | assert len(precomputed_vocab) == len(precomputed_word2ind) 51 | 52 | precomputed_matrix = np.loadtxt(vec_filename, usecols=range(1, 301), skiprows=2, comments=None) 53 | 54 | print('L2 normalizing the embedding matrix...') 55 | normalized_matrix = precomputed_matrix / np.sqrt((precomputed_matrix ** 2).sum(-1))[..., np.newaxis] 56 | 57 | print('Saving the dictionary...') 58 | pickle.dump(precomputed_word2ind, open(vec_filename + '.vocab.pickle', 'wb')) 59 | print('Saving the matrix...') 60 | np.save(vec_filename + '.matrix.npy', normalized_matrix) 61 | print('Done.') 62 | 63 | 64 | def main(): 65 | import argparse 66 | 67 | parser = argparse.ArgumentParser() 68 | parser.add_argument('--fasttext-cache', help='Initialize a fasttext embeddings cache with the i2b2 vocabulary', 69 | action='store_true') 70 | parser.add_argument('--fasttext-precomputed', help='Convert precomputed fasttext embeddings to matrix/dict', 71 | action='store_true') 72 | parser.add_argument('--elmo-cache', help='Initialize an elmo embeddings cache with the i2b2 vocabulary', 73 | action='store_true') 74 | args = parser.parse_args() 75 | 76 | if not any([args.fasttext_cache, args.fasttext_precomputed, args.elmo_cache]): 77 | print('Specify at least one of --fasttext-cache, --fasttext-precomputed, --elmo-cache') 78 | 79 | if args.fasttext_cache: 80 | create_fasttext_cache() 81 | 82 | if args.elmo_cache: 83 | create_elmo_cache() 84 | 85 | if args.fasttext_precomputed: 86 | convert_precomputed_fasttext_embeddings() 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /deid/data/postprocess.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, NamedTuple, Sequence, Tuple, Optional 3 | 4 | import numpy as np 5 | # noinspection PyProtectedMember 6 | from bs4 import CData, BeautifulSoup 7 | 8 | from .token import Token, TOKEN_TYPE 9 | 10 | 11 | class TaggedTokens(NamedTuple): 12 | type: str 13 | tokens: List[Token] 14 | start: int 15 | end: int 16 | 17 | 18 | class TagAssembler: 19 | def __init__(self, sent_tokens: Sequence[Token]) -> None: 20 | self.input = sent_tokens 21 | self.result: List[TaggedTokens] = [] 22 | self.current_tag: Optional[str] = None 23 | self.current_tag_tokens: List[Token] = [] 24 | 25 | def close_current_tag(self) -> None: 26 | if self.current_tag is not None: 27 | self.result.append(TaggedTokens(self.current_tag, 28 | self.current_tag_tokens, 29 | self.current_tag_tokens[0].start, 30 | self.current_tag_tokens[-1].end)) 31 | self.current_tag = None 32 | self.current_tag_tokens = [] 33 | 34 | def assemble(self) -> Sequence[TaggedTokens]: 35 | for t in self.input: 36 | if t.type == 'O': 37 | self.close_current_tag() 38 | elif t.type.startswith('I') and self.current_tag == t.type[2:]: 39 | self.current_tag_tokens.append(t) 40 | else: # B tag or a stray I tag that should be normalized to a B 41 | self.close_current_tag() 42 | self.current_tag = t.type[2:] 43 | self.current_tag_tokens.append(t) 44 | 45 | self.close_current_tag() 46 | 
return self.result 47 | 48 | 49 | def unpad(X, preds) -> Tuple[List, List]: 50 | assert len(X) == len(preds), f'X and preds have different lengths: {len(X)} != {len(preds)} ' 51 | unpadded_X, unpadded_preds = [], [] 52 | for i in range(len(X)): 53 | if isinstance(X[i], np.ndarray): 54 | actual_length = np.sum(X[i].any(axis=1)) 55 | X_start = preds_start = len(X[i]) - actual_length 56 | else: 57 | X_start = 0 58 | preds_start = len(preds[i]) - len(X[i]) 59 | unpadded_X.append(list(X[i][X_start:])) 60 | unpadded_preds.append(list(preds[i][preds_start:])) 61 | assert len(unpadded_X[i]) == len(unpadded_preds[i]) 62 | return unpadded_X, unpadded_preds 63 | 64 | 65 | def postprocess_prediction(X, preds, sents, ind2label_lookup) -> Sequence[Sequence[TaggedTokens]]: 66 | X, preds = unpad(X, preds) 67 | 68 | result = [] 69 | for i in range(len(X)): 70 | sent_tokens = [] 71 | for j in range(len(X[i])): 72 | sent_tokens.append( 73 | Token(sents[i][j].text, ind2label_lookup(preds[i][j]), sents[i][j].start, sents[i][j].end)) 74 | result.append(sent_tokens) 75 | 76 | return [TagAssembler(sent_tokens).assemble() for sent_tokens in result] 77 | 78 | 79 | def prediction_to_xml(X, preds, text, sents, ind2label_lookup) -> str: 80 | preds = postprocess_prediction(X, preds, sents, ind2label_lookup) 81 | 82 | soup = BeautifulSoup('', features='xml') 83 | soup.find('TEXT').string = CData(text) 84 | tags = soup.find('TAGS') 85 | for i, tagged_tokens in enumerate(itertools.chain.from_iterable(preds)): 86 | tags.append(soup.new_tag(TOKEN_TYPE[tagged_tokens.type], 87 | id=f'P{i}', 88 | start=tagged_tokens.start, 89 | end=tagged_tokens.end, 90 | TYPE=tagged_tokens.type, 91 | text=text[tagged_tokens.start:tagged_tokens.end])) 92 | 93 | return str(soup) 94 | -------------------------------------------------------------------------------- /deid/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | import html 2 | 3 | import spacy 4 | from spacy.matcher import Matcher 5 | from spacy.tokens import Token 6 | 7 | 8 | def _deid_tokenizer(): 9 | prefixes = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) 10 | 11 | my_infix = ['\.\.\.+', 12 | '(?<=[0-9])-(?=[0-9])', 13 | '(?<=[0-9])(?=[A-Za-z])', 14 | '[!&:;#,()/_\\-\\^~%{}=\'<>@]'] 15 | infixes = spacy.util.compile_infix_regex(list(nlp.Defaults.infixes) + my_infix) 16 | 17 | suffixes = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) 18 | 19 | return spacy.tokenizer.Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions, 20 | prefix_search=prefixes.search, 21 | infix_finditer=infixes.finditer, suffix_search=suffixes.search, 22 | token_match=None) 23 | 24 | 25 | def _new_sentence_after_three_newlines_component(doc): 26 | def has_newlines(text): 27 | return text.count('\n') > 2 28 | 29 | for i in range(len(doc[:-2])): 30 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text): 31 | doc[i + 1].sent_start = True 32 | return doc 33 | 34 | 35 | def _new_sentence_for_bulleted_lists_component(doc): 36 | def has_newlines(text): 37 | return text.count('\n') > 0 38 | 39 | def is_bullet(text): 40 | return text.startswith('-') or text.startswith('*') or text.startswith('.') or text == 'o' or text[0].isdigit() 41 | 42 | for i in range(len(doc[:-2])): 43 | if has_newlines(doc[i].text) and not has_newlines(doc[i + 1].text) and is_bullet(doc[i + 1].text): 44 | doc[i + 1].sent_start = True 45 | return doc 46 | 47 | 48 | def _new_sentence_after_three_dashes_component(doc): 49 | for i in range(3, len(doc[:-3])): 50 | if 
all(token.text == '-' for token in doc[i - 3:i]) and doc[i].text != '-': 51 | doc[i].sent_start = True 52 | 53 | return doc 54 | 55 | 56 | # https://spacy.io/usage/linguistic-features#section-rule-based-matching 57 | class _HTMLMerger(object): 58 | def __init__(self, nlp): 59 | Token.set_extension('unescaped_html', default=None) 60 | self.matcher = Matcher(nlp.vocab) 61 | self.matcher.add('BAD_HTML', None, 62 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}], 63 | [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '/'}, {'ORTH': '>'}], 64 | [{'ORTH': '&'}, {'SHAPE': 'xx'}, {'ORTH': ';'}], # < 65 | [{'ORTH': '&'}, {'SHAPE': 'xxx'}, {'ORTH': ';'}], # & 66 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dd'}, {'ORTH': ';'}], # 67 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'ddd'}, {'ORTH': ';'}], 68 | [{'ORTH': '&'}, {'ORTH': '#'}, {'SHAPE': 'dddd'}, {'ORTH': ';'}]) 69 | 70 | def __call__(self, doc): 71 | matches = self.matcher(doc) 72 | spans = [] 73 | for match_id, start, end in matches: 74 | spans.append(doc[start:end]) 75 | for span in spans: 76 | span.merge() 77 | for token in span: 78 | if '= threshold]) / len(similarities) 17 | 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser() 21 | parser.description = 'try different amounts of noise to find a balance' 22 | parser.add_argument('embeddings', type=str, help='the embeddings to use, either glove or fasttext') 23 | parser.add_argument('noises', nargs='+', type=float, help='the noises to try') 24 | args = parser.parse_args() 25 | 26 | noises = args.noises 27 | if len(noises) == 0: 28 | raise argparse.ArgumentTypeError('Please provide a list of noises') 29 | 30 | if args.embeddings == 'fasttext': 31 | emb = FastTextEmbeddings() 32 | lower = False 33 | elif args.embeddings == 'glove': 34 | emb = GloveEmbeddings() 35 | lower = True 36 | else: 37 | raise argparse.ArgumentTypeError(f'Unknown embeddings: {args.embeddings}') 38 | 39 | mat = Matrix(lookup_embeddings=emb, precomputed_word2ind=emb.precomputed_word2ind, 40 | precomputed_matrix=emb.precomputed_matrix) 41 | 42 | tr = TrainingSet(limit_documents=env.limit_training_documents) 43 | 44 | phi_tokens = set([token.text for token in itertools.chain.from_iterable(tr.X) if token.type != 'O']) 45 | phi_tokens = [word.lower() if lower else word for word in phi_tokens 46 | if len(word) > 2 47 | and (word.lower() if lower else word) in emb.precomputed_word2ind.keys() 48 | and not any([c.isdigit() for c in word])] 49 | 50 | tokens_to_check = random.sample(phi_tokens, 1_000) 51 | # print(tokens_to_check) 52 | 53 | print('Similarity to closest neighbors:') 54 | closest_neighbor_similarities = [] 55 | for token in random.sample(tokens_to_check, 10): 56 | closest_neighbor_similarities.append(mat.most_similar_cosine(token, n=2)[1].similarity) 57 | 58 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}', 59 | f'std: {np.std(closest_neighbor_similarities)}') 60 | 61 | for noise in noises: 62 | ranks = [] 63 | similarities = [] 64 | closest_neighbor_similarities = [] 65 | 66 | for token in tokens_to_check: 67 | looked_up = emb.lookup(token) 68 | noisy = looked_up + np.random.normal(0., noise, emb.size) 69 | ranks.append(mat.cosine_distance_rank(noisy, token)) 70 | similarities.append(mat.cosine_distance(noisy, token)) 71 | closest_neighbor_similarities.append(mat.most_similar_cosine(noisy, n=1)[0].similarity) 72 | 73 | print('---') 74 | print(f'Report for scale {noise}:') 75 | print(f'rank mean: {np.mean(ranks)},', 76 | f'std: {np.std(ranks)},', 77 | f'%top1: {top_perc(1, 
ranks)},', 78 | f'%top5: {top_perc(5, ranks)},', 79 | f'%top10: {top_perc(10, ranks)}') 80 | print(f'similarity with original mean: {np.mean(similarities)}', 81 | f'std: {np.std(similarities)}', 82 | f'%0.9+: {sim_perc(0.9, similarities)}', 83 | f'%0.8+: {sim_perc(0.8, similarities)}', 84 | f'%0.7+: {sim_perc(0.7, similarities)}', 85 | f'%0.6+: {sim_perc(0.6, similarities)}') 86 | print(f'closest neighbor similarity mean: {np.mean(closest_neighbor_similarities)}', 87 | f'std: {np.std(closest_neighbor_similarities)}', 88 | f'%0.9+: {sim_perc(0.9, closest_neighbor_similarities)}', 89 | f'%0.8+: {sim_perc(0.8, closest_neighbor_similarities)}', 90 | f'%0.7+: {sim_perc(0.7, closest_neighbor_similarities)}', 91 | f'%0.6+: {sim_perc(0.6, closest_neighbor_similarities)}') 92 | 93 | print() 94 | 95 | 96 | if __name__ == '__main__': 97 | main() 98 | -------------------------------------------------------------------------------- /deid/embeddings/elmo.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | from typing import Sequence 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow_hub as hub 9 | from tqdm import tqdm 10 | 11 | from . import Embeddings 12 | from .util import pad_string_sequences, unpad_sequences, chunks 13 | from ..env import env 14 | 15 | elmo_dir = os.path.join(env.resources_dir, 'elmo') 16 | 17 | 18 | class ElmoEmbeddings(Embeddings): 19 | def __new__(cls, *args, **kwargs): 20 | if env.embeddings_cache: 21 | return CachedElmoEmbeddings(*args, **kwargs) 22 | return TensorFlowElmoEmbeddings(*args, **kwargs) 23 | 24 | def __init__(self, *_, **__): 25 | raise NotImplementedError('this should not happen') 26 | 27 | @property 28 | def size(self) -> int: 29 | raise NotImplementedError 30 | 31 | @property 32 | def std(self): 33 | raise NotImplementedError 34 | 35 | def lookup(self, word: str) -> np.ndarray: 36 | raise NotImplementedError 37 | 38 | def is_unknown(self, word: str) -> bool: 39 | raise NotImplementedError 40 | 41 | 42 | class ElmoEmbeddingsImpl(Embeddings): 43 | @property 44 | def size(self) -> int: 45 | return 1024 46 | 47 | @property 48 | def std(self) -> float: 49 | return 0.47 50 | 51 | def lookup(self, word: str) -> np.ndarray: 52 | raise RuntimeError("Don't lookup single words in ELMo") 53 | 54 | def is_unknown(self, word: str): 55 | return False 56 | 57 | 58 | class TensorFlowElmoEmbeddings(ElmoEmbeddingsImpl): 59 | def __init__(self, *_, **__): 60 | graph = tf.Graph() 61 | with graph.as_default(): 62 | self.tokens = tf.placeholder(tf.string, shape=[None, None]) 63 | self.sequence_len = tf.placeholder(tf.int32, shape=[None]) 64 | self.elmo = hub.Module('https://tfhub.dev/google/elmo/2') 65 | self.embed = self.elmo({'tokens': self.tokens, 'sequence_len': self.sequence_len}, signature='tokens', 66 | as_dict=True) 67 | init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()]) 68 | graph.finalize() 69 | self.sess = tf.Session(graph=graph) 70 | self.sess.run(init_op) 71 | 72 | def lookup_sentence(self, words: Sequence[str]) -> Sequence[np.ndarray]: 73 | return self.sess.run(self.embed, {self.tokens: [words], self.sequence_len: [len(words)]})['elmo'][0] 74 | 75 | def lookup_sentences(self, sentences: Sequence[Sequence[str]]) -> Sequence[Sequence[np.ndarray]]: 76 | sentences, seq_length = pad_string_sequences(sentences) 77 | result = self.sess.run(self.embed, {self.tokens: sentences, self.sequence_len: seq_length})['elmo'] 78 | return 
unpad_sequences(result, seq_length) 79 | 80 | 81 | class CachedElmoEmbeddings(ElmoEmbeddingsImpl): 82 | def __init__(self, sentences=None, lookup_batch_size=64, *_, **__): 83 | if sentences is None: 84 | self.sent2vec = {} 85 | for chunk_name in [filename for filename in os.listdir(elmo_dir) if 'chunk' in filename]: 86 | self.sent2vec.update(pickle.load(open(os.path.join(elmo_dir, chunk_name), 'rb'))) 87 | else: 88 | if not os.path.isdir(elmo_dir): 89 | os.mkdir(elmo_dir) 90 | 91 | embeddings = TensorFlowElmoEmbeddings() 92 | self.sent2vec = {} 93 | sentence_chunks = chunks(sentences, lookup_batch_size) 94 | for i, sentence_chunk in tqdm(enumerate(sentence_chunks), desc='Looking up sentence batches', 95 | total=math.ceil(len(sentences) / lookup_batch_size)): 96 | chunk_sent2vec = {} 97 | result = embeddings.lookup_sentences(sentence_chunk) 98 | for j, sentence in enumerate(sentence_chunk): 99 | chunk_sent2vec[' '.join(sentence)] = result[j] 100 | self.sent2vec[' '.join(sentence)] = result[j] 101 | chunk_filename = os.path.join(elmo_dir, f'elmo_chunk{i:04}.pickle') 102 | pickle.dump(chunk_sent2vec, open(chunk_filename, 'wb')) 103 | 104 | def lookup_sentence(self, words: Sequence[str]): 105 | result = self.sent2vec.get(' '.join(words)) 106 | if result is not None: 107 | return result 108 | raise RuntimeError(f'Cache lookup failed for "{words}". Please rebuild the embedding cache.') 109 | -------------------------------------------------------------------------------- /deid/data/augment/augment_tests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .augment import Augment 4 | from .strategy import AugmentWord, AugmentEmbedding 5 | from ...data import Token 6 | from ...embeddings import DummyEmbeddings 7 | 8 | 9 | class Ones(AugmentEmbedding): 10 | def augment(self, word_embedding): 11 | return np.ones(len(word_embedding)) 12 | 13 | 14 | def test_augment_embeddings(): 15 | embeddings = DummyEmbeddings() 16 | augment = Augment(embeddings, Ones(), exclude=None, n_augmentations=2) 17 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a'), Token.with_text('test')] 18 | 19 | result = augment.lookup_sentence(sent) 20 | assert len(result.original) == 4 21 | assert len(result.original[0]) == embeddings.size 22 | assert len(result.augmented) == 0 23 | 24 | sent = [Token.with_text('this'), Token.with_text('is'), Token.with_text('a', 'B-NAME'), 25 | Token.with_text('name', 'I-NAME')] 26 | result = augment.lookup_sentence(sent) 27 | augmented = result.augmented[0] 28 | 29 | assert len(augmented) == 4 30 | assert len(augmented[0]) == embeddings.size 31 | assert np.all(augmented[2] == np.ones(embeddings.size)) 32 | assert np.all(augmented[3] == np.ones(embeddings.size)) 33 | assert len(result.augmented) == 2 34 | 35 | 36 | class ReplaceWithFixed(AugmentWord): 37 | def augment(self, word): 38 | return 'REPLACED' 39 | 40 | 41 | def test_augment_words(): 42 | embeddings = DummyEmbeddings() 43 | augment = Augment(embeddings, ReplaceWithFixed(), exclude=None) 44 | sent = [Token.with_text('replace'), Token.with_text('these', 'B-NAME'), Token.with_text('words', 'I-NAME')] 45 | result = augment.lookup_sentence(sent).augmented[0] 46 | 47 | assert np.all(result[0] == embeddings.lookup('replace')) 48 | 49 | assert np.any(result[1] != embeddings.lookup('these')) 50 | assert np.all(result[1] == embeddings.lookup('REPLACED')) 51 | 52 | assert np.any(result[2] != embeddings.lookup('words')) 53 | assert np.all(result[2] == 
embeddings.lookup('REPLACED')) 54 | 55 | 56 | def test_augment_exclude(): 57 | embeddings = DummyEmbeddings() 58 | augment = Augment(embeddings, Ones()) 59 | sent = [Token.with_text('Please'), Token.with_text('ignore'), Token.with_text('this', 'B-NAME'), 60 | Token.with_text(':', 'I-NAME'), Token.with_text('stopword', 'I-NAME')] 61 | 62 | result = augment.lookup_sentence(sent).augmented[0] 63 | assert np.all(result[2] != np.ones(embeddings.size)) 64 | assert np.all(result[3] != np.ones(embeddings.size)) 65 | assert np.all(result[4] == np.ones(embeddings.size)) 66 | 67 | 68 | def test_augment_all(): 69 | embeddings = DummyEmbeddings() 70 | augment = Augment(embeddings, Ones(), augment_all=True, exclude=None) 71 | sent = [Token.with_text('Augment'), Token.with_text('all'), Token.with_text('of', 'B-NAME'), 72 | Token.with_text('these', 'I-NAME')] 73 | 74 | result = augment.lookup_sentence(sent).augmented[0] 75 | assert np.all(result[0] == np.ones(embeddings.size)) 76 | assert np.all(result[1] == np.ones(embeddings.size)) 77 | assert np.all(result[2] == np.ones(embeddings.size)) 78 | assert np.all(result[3] == np.ones(embeddings.size)) 79 | 80 | 81 | def test_augment_does_not_touch_unknown(): 82 | class DummyEmbeddingsWithUnknownTestWord(DummyEmbeddings): 83 | def is_unknown(self, word: str): 84 | return word == 'test' 85 | 86 | def lookup(self, word): 87 | if word == 'test': 88 | return np.zeros(self.size) 89 | return super().lookup(word) 90 | 91 | embeddings = DummyEmbeddingsWithUnknownTestWord() 92 | augment = Augment(embeddings, Ones(), exclude=None) 93 | sent = [Token.with_text('This', 'B-NAME'), Token.with_text('is', 'I-NAME'), Token.with_text('another', 'I-NAME'), 94 | Token.with_text('test', 'I-NAME')] 95 | result = augment.lookup_sentence(sent).augmented[0] 96 | assert np.any(result[2] == np.ones(embeddings.size)) 97 | assert np.all(result[3] == np.zeros(embeddings.size)) 98 | 99 | 100 | def test_augment_max(): 101 | embeddings = DummyEmbeddings() 102 | augment = Augment(embeddings, ReplaceWithFixed(), augment_max=1, exclude=None) 103 | sent = [Token.with_text('Augment'), Token.with_text('only'), Token.with_text('one', 'B-NAME'), 104 | Token.with_text('please', 'I-NAME')] 105 | result = augment.lookup_sentence(sent).augmented[0] 106 | assert len([r for r in result if np.all(r == embeddings.lookup('REPLACED'))]) == 1 107 | -------------------------------------------------------------------------------- /deid/experiment/dummy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | from typing import Sequence 4 | import random 5 | 6 | import numpy as np 7 | from keras.preprocessing.sequence import pad_sequences 8 | 9 | from .directory import experiment_directory 10 | from .evaluation import evaluate_deid_performance 11 | from ..data import TrainingSet, ValidationSet, Token 12 | from ..env import env 13 | 14 | 15 | class DummyDeidentifier: 16 | def guess(self, sentence: Sequence[str]): 17 | raise NotImplementedError 18 | 19 | def predict(self, X, **_): 20 | if len(X) == 2 and isinstance(X[0][0], list) and isinstance(X[0][0][0], Token): # extra features provided 21 | X, _ = X 22 | y = [self.guess([token.text for token in sentence]) for sentence in X] 23 | y = pad_sequences(y) 24 | return y 25 | 26 | 27 | class UpperBoundDeidentifier(DummyDeidentifier): 28 | def __init__(self, X, y): 29 | self.solutions = {} 30 | for sentence, labels in zip(X, y): 31 | self.solutions[' '.join([token.text for token in sentence])] = 
[l[0] for l in labels] 32 | 33 | def guess(self, sentence): 34 | return self.solutions[' '.join(sentence)] 35 | 36 | 37 | class RandomGuessingDeidentifier(DummyDeidentifier): 38 | def __init__(self, X, y): 39 | label_counts = collections.defaultdict(int) 40 | for sentence, labels in zip(X, y): 41 | for label in labels: 42 | label_counts[label[0]] += 1 43 | n_labels = sum(label_counts.values()) 44 | self.labels = sorted(label_counts.keys()) 45 | self.probabilities = [label_counts[label] / n_labels for label in self.labels] 46 | 47 | def guess(self, sentence): 48 | return np.random.choice(self.labels, size=len(sentence), p=self.probabilities) 49 | 50 | 51 | class WordListDeidentifier(DummyDeidentifier): 52 | def __init__(self, X, y): 53 | self.memory = collections.defaultdict(lambda: [1]) 54 | for sentence, labels in zip(X, y): 55 | for word, label in zip(sentence, labels): 56 | self.memory[word.text].append(label[0]) 57 | 58 | def guess(self, sentence): 59 | def most_common(lst): 60 | return max(set(lst), key=lst.count) 61 | 62 | return [most_common(self.memory[word]) for word in sentence] 63 | 64 | 65 | def main(): 66 | parser = argparse.ArgumentParser() 67 | parser.description = 'different dummy predictors' 68 | parser.add_argument('--upper-bound', help='the embeddings to use, either glove or fasttext', action='store_true') 69 | parser.add_argument('--random-guessing', help='the embeddings to use, either glove or fasttext', 70 | action='store_true') 71 | parser.add_argument('--word-list', help='the embeddings to use, either glove or fasttext', action='store_true') 72 | args = parser.parse_args() 73 | 74 | if not any([args.upper_bound, args.random_guessing, args.word_list]): 75 | raise ValueError('please select at least one of --upper-bound, --random-guessing, --word-list') 76 | 77 | tr = TrainingSet(limit_documents=env.limit_training_documents) 78 | val = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='validation') 79 | 80 | if args.upper_bound: 81 | # needs its own special case because the model is initialized with the test set! 
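        # (this baseline replays the gold labels of each test sentence, so its score is an
        # upper bound for the rest of the evaluation pipeline)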
82 | test = ValidationSet(tr.label2ind, limit_documents=env.limit_training_documents, validation_set='test') 83 | experiment_dir = experiment_directory('upper_bound') 84 | model = UpperBoundDeidentifier(test.X, test.y) 85 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind, 86 | ind2label=tr.ind2label, 87 | batch_size=8, experiment_dir=experiment_dir, require_argmax=False) 88 | 89 | if args.random_guessing: 90 | test_baseline('random_guessing', RandomGuessingDeidentifier, tr, val) 91 | 92 | if args.word_list: 93 | test_baseline('word_list', WordListDeidentifier, tr, val) 94 | 95 | 96 | def test_baseline(identifier, model_class, tr, val): 97 | experiment_dir = experiment_directory(identifier) 98 | model = model_class(tr.X + val.X, tr.y + val.y) 99 | evaluate_deid_performance(model, embeddings=None, test_set='test', label2ind=tr.label2ind, 100 | ind2label=tr.ind2label, batch_size=8, experiment_dir=experiment_dir, 101 | require_argmax=False) 102 | 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /deid/experiment/fake_sentences.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import random 5 | 6 | import numpy as np 7 | from keras import Sequential 8 | from keras.layers import Bidirectional, LSTM, Dense 9 | 10 | from . import experiment_directory 11 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence 12 | from ..data.augment import Augment, get as get_strategy 13 | from ..data.batch import IteratorWithEpochLength 14 | from ..data.util import pad_2d_sequences 15 | from ..embeddings import Matrix, get as get_embeddings 16 | from ..env import env 17 | 18 | 19 | def real_and_fake_sentences(X, y, indices, alternatives, split_condition): 20 | indices = [i for i in indices if split_condition(X[i], y[i])] 21 | real_sentences = [X[i] for i in indices] 22 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices] 23 | 24 | X = [] 25 | y = [] 26 | for real, fake in zip(real_sentences, fake_sentences): 27 | X += [real, fake] 28 | y += [1, 0] 29 | 30 | return pad_2d_sequences(X), np.array(y) 31 | 32 | 33 | class FakeSentencesGenerator(IteratorWithEpochLength): 34 | def __init__(self, generator: IteratorWithEpochLength, dataset): 35 | self.generator = generator 36 | self.dataset = dataset 37 | 38 | def __next__(self): 39 | _, _, indices = next(self.generator) 40 | X, y = real_and_fake_sentences(self.dataset.X, self.dataset.y, indices, self.dataset.augmented, 41 | split_condition=is_phi_sentence) 42 | return X, y 43 | 44 | @property 45 | def epoch_length(self) -> int: 46 | return self.generator.epoch_length 47 | 48 | 49 | def fake_sentences_experiment(config): 50 | print('Loading embeddings...') 51 | embeddings = get_embeddings(config['experiment']['embeddings']) 52 | 53 | name = config['name'] 54 | experiment_dir = experiment_directory(name, config['path']) 55 | 56 | print('Loading matrix...') 57 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 58 | precomputed_matrix=embeddings.precomputed_matrix) 59 | 60 | strategy = get_strategy(config['augment']['strategy'], matrix) 61 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix) 62 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy, 63 | **config['augment']['augment_args']) 64 | 65 | print('Augmenting training 
set...', flush=True) 66 | tr = TrainingSet(embeddings=embeddings, 67 | train_set=config['experiment']['train_set'], 68 | use_short_sentences=env.use_short_sentences, 69 | limit_documents=env.limit_training_documents, 70 | augment=augment) 71 | 72 | print('Augmenting validation set...', flush=True) 73 | val = ValidationSet(embeddings=embeddings, 74 | validation_set=config['experiment']['validation_set'], 75 | label2ind=tr.label2ind, 76 | use_short_sentences=env.use_short_sentences, 77 | limit_documents=env.limit_validation_documents, 78 | augment=augment) 79 | 80 | model = Sequential() 81 | model.add(Bidirectional(LSTM(embeddings.size), input_shape=(None, embeddings.size))) 82 | model.add(Dense(1, activation='sigmoid')) 83 | model.summary() 84 | model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy']) 85 | 86 | batch_size = test_batch_size = 32 87 | train_gen = FakeSentencesGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence, 88 | batch_size=batch_size, yield_indices=True, shuffle=True), tr) 89 | valid_gen = FakeSentencesGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence, 90 | batch_size=batch_size, yield_indices=True, shuffle=False), 91 | val) 92 | 93 | history = model.fit_generator(train_gen, 94 | epochs=config['training']['train_epochs'], 95 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 96 | validation_data=valid_gen, 97 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 98 | verbose=env.keras_verbose) 99 | 100 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 101 | print('Saving history to', history_pickle_path) 102 | with open(history_pickle_path, 'wb') as f: 103 | pickle.dump(history.history, f) 104 | -------------------------------------------------------------------------------- /deid/model/adversary.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Input, Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Dropout, concatenate 3 | from keras.models import Model, Sequential 4 | 5 | from .layers import GradientReversal 6 | 7 | discriminator_loss = 'binary_crossentropy' 8 | 9 | 10 | def get(identifier): 11 | if identifier == 'reconstruct': 12 | return Reidentifier 13 | elif identifier == 'discriminate-representations': 14 | return TwoRepresentationsAreSameOriginalDiscriminator 15 | elif identifier == 'discriminate-representation-embedding-pair': 16 | return OriginalAndRepresentationAreMatchingDiscriminator 17 | else: 18 | raise ValueError(f'Unknown adversary: "{identifier}"') 19 | 20 | 21 | class Adversary: 22 | """ An adversary is a model with a gradient reversal layer. It can chose its inputs from a dictionary that contains 23 | entries for 'train_representation', 'fake_representation', and 'original_embeddings'. 
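    A hedged construction sketch (the 128/300 sizes and the Input placeholders are
    illustrative, not values defined in this file):

        from keras.layers import Input
        inputs = {'train_representation': Input(shape=(None, 128)),
                  'fake_representation': Input(shape=(None, 128)),
                  'original_embeddings': Input(shape=(None, 300))}
        adversary = Reidentifier(inputs, representation_size=128, embedding_size=300, lstm_size=128)
        # adversary.model, adversary.loss and adversary.inputs are then available to the
        # surrounding adversarial training code.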
24 | """ 25 | 26 | def __init__(self, model, loss, inputs, **compile_kwargs): 27 | self.model = model 28 | self.loss = loss 29 | self.inputs = inputs 30 | self.compile_kwargs = compile_kwargs 31 | 32 | 33 | class Reidentifier(Adversary): 34 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0., 35 | reverse_gradient=True, **_): 36 | model = Sequential(name='reidentifier') 37 | model.add(Dropout(input_dropout, input_shape=(None, representation_size))) 38 | if reverse_gradient: 39 | model.add(GradientReversal()) 40 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True, recurrent_dropout=recurrent_dropout))) 41 | model.add(TimeDistributed(Dense(embedding_size))) 42 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 43 | super().__init__(model, inputs=[inputs['train_representation']], loss='mse', sample_weight_mode='temporal', 44 | metrics=['cosine_proximity']) 45 | 46 | 47 | class TwoRepresentationsAreSameOriginalDiscriminator(Adversary): 48 | def __init__(self, inputs, representation_size, lstm_size, input_dropout=0., recurrent_dropout=0., 49 | reverse_gradient=True, **_): 50 | """ LSTM size should be at least the representation size for this to converge quickly. """ 51 | representation_input1 = Input(shape=(None, representation_size)) 52 | representation_input2 = Input(shape=(None, representation_size)) 53 | 54 | # (batch_size, maxlen, repr_size) -> (batch_size, maxlen, 1) -- the dot layer doesn't do this 55 | normalized_1 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input1) 56 | normalized_2 = Lambda(lambda x: K.l2_normalize(x, axis=-1))(representation_input2) 57 | dot_product = Lambda(lambda x: K.sum(x[0] * x[1], axis=-1, keepdims=True))([normalized_1, normalized_2]) 58 | 59 | both_inputs = concatenate([representation_input1, representation_input2], axis=-1) 60 | both_inputs = Dropout(input_dropout)(both_inputs) 61 | 62 | inputs_and_dot_product = concatenate([both_inputs, dot_product], axis=-1) 63 | if reverse_gradient: 64 | inputs_and_dot_product = GradientReversal()(inputs_and_dot_product) 65 | 66 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(inputs_and_dot_product) 67 | output = Dense(1, activation='sigmoid')(summary) 68 | 69 | model = Model([representation_input1, representation_input2], output, name='rr-adv') 70 | super().__init__(model, inputs=[inputs['train_representation'], inputs['fake_representation']], 71 | loss=discriminator_loss, metrics=['accuracy']) 72 | 73 | 74 | class OriginalAndRepresentationAreMatchingDiscriminator(Adversary): 75 | def __init__(self, inputs, representation_size, embedding_size, lstm_size, input_dropout=0., recurrent_dropout=0., 76 | reverse_gradient=True, **_): 77 | embedding_input = Input(shape=(None, embedding_size)) 78 | representation_input = Input(shape=(None, representation_size)) 79 | 80 | both_inputs = concatenate([embedding_input, representation_input], axis=-1) 81 | if reverse_gradient: 82 | both_inputs = GradientReversal()(both_inputs) 83 | both_inputs = Dropout(input_dropout)(both_inputs) 84 | summary = Bidirectional(LSTM(lstm_size, recurrent_dropout=recurrent_dropout))(both_inputs) 85 | 86 | output = Dense(1, activation='sigmoid')(summary) 87 | 88 | model = Model([embedding_input, representation_input], output, name='er-adv') 89 | super().__init__(model, inputs=[inputs['original_embeddings'], inputs['fake_representation']], 90 | loss=discriminator_loss, metrics=['accuracy']) 91 | 
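# Hedged usage sketch for the get() dispatcher above (illustrative only; adversary_inputs
# is a placeholder for the inputs dictionary described in the Adversary docstring):
#
#     adversary_cls = get('discriminate-representations')
#     adversary = adversary_cls(adversary_inputs, representation_size=128, lstm_size=128)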
-------------------------------------------------------------------------------- /deid/data/augment/strategy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | from typing import Any, Optional, Sequence, Dict 5 | 6 | import numpy as np 7 | 8 | from ...embeddings import Matrix, EmbeddingSimilarity 9 | 10 | logger = logging.getLogger() 11 | digit_pattern = '^[0-9]*$' 12 | 13 | 14 | class AugmentStrategy: 15 | augments_words: bool 16 | 17 | @property 18 | def description(self) -> Optional[str]: 19 | return None 20 | 21 | def augment(self, word_or_embedding: Any) -> Any: 22 | raise NotImplementedError 23 | 24 | def __str__(self) -> str: 25 | options = '' if self.description is None else ' ' + self.description 26 | return f'<{self.__class__.__name__}{options}>' 27 | 28 | 29 | class AugmentWord(AugmentStrategy): 30 | augments_words = True 31 | 32 | def augment(self, word: str) -> str: 33 | raise NotImplementedError 34 | 35 | 36 | class AugmentEmbedding(AugmentStrategy): 37 | augments_words = False 38 | 39 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 40 | raise NotImplementedError 41 | 42 | 43 | class Zeros(AugmentEmbedding): 44 | """ Not actually zeros to distinguish from masking. """ 45 | 46 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 47 | return np.random.normal(0., scale=1e-6, size=len(word_embedding)) 48 | 49 | 50 | class RandomEmbedding(AugmentEmbedding): 51 | """ A random normal embedding, optionally L2 normalized """ 52 | 53 | def __init__(self, scale=None, l2_normalize=True): 54 | self.scale = 1. if scale is None else scale 55 | self.l2_normalize = l2_normalize 56 | 57 | @property 58 | def description(self) -> Optional[str]: 59 | return f'scale={self.scale}, l2_normalize={self.l2_normalize}' 60 | 61 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 62 | embedding = np.random.normal(0., scale=self.scale, size=len(word_embedding)) 63 | if self.l2_normalize: 64 | embedding = embedding / np.linalg.norm(embedding) 65 | return embedding 66 | 67 | 68 | class RandomDigits(AugmentWord): 69 | def __init__(self, matrix: Matrix) -> None: 70 | self.matrix = matrix 71 | logger.info('getting digit indices') 72 | self.digit_ind = [ind for word, ind in matrix.word2ind.items() if re.match(digit_pattern, str(word))] 73 | logger.info('found %d indices', len(self.digit_ind)) 74 | 75 | def augment(self, word: str) -> str: 76 | ind = random.choice(self.digit_ind) 77 | return self.matrix.ind2word[ind] 78 | 79 | 80 | class AdditiveNoise(AugmentEmbedding): 81 | def __init__(self, scale: float) -> None: 82 | self.scale = scale 83 | 84 | @property 85 | def description(self) -> Optional[str]: 86 | return f'scale={self.scale}' 87 | 88 | def augment(self, word_embedding: np.ndarray) -> np.ndarray: 89 | noisy = word_embedding + np.random.normal(0, scale=self.scale, size=len(word_embedding)) 90 | return noisy / np.linalg.norm(noisy) 91 | 92 | 93 | class MoveToNeighbor(AugmentWord): 94 | """ Only makes sense for embeddings like GloVE and fastText that have a fixed word->embedding lookup """ 95 | 96 | def __init__(self, matrix: Matrix, n_neighbors: int, cache_mode: str = 'neighbors') -> None: 97 | self.matrix = matrix 98 | self.n_neighbors = n_neighbors 99 | self.cache = NeighborsCache(cache_mode) 100 | 101 | @property 102 | def description(self) -> Optional[str]: 103 | return f'n_neighbors={self.n_neighbors}' 104 | 105 | def augment(self, word: str) -> str: 106 | cache_result = 
self.cache.lookup(word) 107 | if cache_result is None: 108 | neighbors = self.matrix.most_similar_cosine(word, n=self.n_neighbors) 109 | selected = random.choice(neighbors) 110 | self.cache.store(word, neighbors, selected) 111 | else: 112 | selected = cache_result 113 | return selected.word 114 | 115 | 116 | class NeighborsCache: 117 | def __init__(self, mode: Optional[str]) -> None: 118 | if mode not in [None, 'neighbors', 'selected']: 119 | raise ValueError("Cache mode must be either None, 'neighbors' or 'selected'") 120 | self.mode = mode 121 | self.cache: Dict[str, Sequence[EmbeddingSimilarity]] = {} 122 | 123 | def lookup(self, word: str) -> Optional[EmbeddingSimilarity]: 124 | if self.mode is None: 125 | return None 126 | 127 | result = self.cache.get(word) 128 | return result if result is None else random.choice(result) 129 | 130 | def store(self, word: str, neighbors: Sequence[EmbeddingSimilarity], selected: EmbeddingSimilarity) -> None: 131 | if self.mode == 'neighbors': 132 | self.cache[word] = neighbors 133 | if self.mode == 'selected': 134 | self.cache[word] = [selected] 135 | -------------------------------------------------------------------------------- /deid/embeddings/fasttext.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Dict 4 | 5 | import fastText 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | from . import PrecomputedEmbeddings 10 | from ..env import env 11 | 12 | fasttext_dir = os.path.join(env.resources_dir, 'fastText') 13 | fasttext_embeddings_name = 'wiki-news-300d-1M-subword' 14 | 15 | 16 | class FastTextEmbeddings(PrecomputedEmbeddings): 17 | def __new__(cls, *args, **kwargs): 18 | if env.embeddings_cache: 19 | return CachedFastTextEmbeddings() 20 | return PreloadFastTextEmbeddings() 21 | 22 | def __init__(self, *_, **__): 23 | raise NotImplementedError('this should not happen') 24 | 25 | @property 26 | def size(self) -> int: 27 | raise NotImplementedError 28 | 29 | @property 30 | def std(self): 31 | raise NotImplementedError 32 | 33 | def lookup(self, word: str) -> np.ndarray: 34 | raise NotImplementedError 35 | 36 | def is_unknown(self, word: str) -> bool: 37 | raise NotImplementedError 38 | 39 | @property 40 | def precomputed_word2ind(self) -> Dict[str, int]: 41 | raise NotImplementedError 42 | 43 | @property 44 | def precomputed_matrix(self) -> np.ndarray: 45 | raise NotImplementedError 46 | 47 | 48 | class FastTextEmbeddingsImpl(PrecomputedEmbeddings): 49 | def __init__(self, size, *_, **__): 50 | self._size = size 51 | self._precomputed_word2ind = None 52 | self._precomputed_matrix = None 53 | 54 | @property 55 | def precomputed_word2ind(self) -> Dict[str, int]: 56 | if self._precomputed_word2ind is None: 57 | vocab_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.vocab.pickle') 58 | self._precomputed_word2ind = pickle.load(open(vocab_filename, 'rb')) 59 | return self._precomputed_word2ind 60 | 61 | @property 62 | def precomputed_matrix(self) -> np.ndarray: 63 | if self._precomputed_matrix is None: 64 | matrix_filename = os.path.join(fasttext_dir, fasttext_embeddings_name + '.vec.matrix.npy') 65 | self._precomputed_matrix = np.load(matrix_filename) 66 | return self._precomputed_matrix 67 | 68 | @staticmethod 69 | def l2_normalize_if_needed(vec: np.ndarray, l2_normalize: bool) -> np.ndarray: 70 | if l2_normalize: 71 | vec /= np.linalg.norm(vec)  # all-zero embeddings shouldn't exist 72 | return vec 73 | 74 | @property 75 | def size(self) ->
int: 76 | return self._size 77 | 78 | @property 79 | def std(self) -> float: 80 | return 0.05 81 | 82 | def lookup(self, word: str) -> np.ndarray: 83 | raise NotImplementedError 84 | 85 | def is_unknown(self, word: str) -> bool: 86 | return False 87 | 88 | 89 | class PreloadFastTextEmbeddings(FastTextEmbeddingsImpl): 90 | def __init__(self) -> None: 91 | self.model = fastText.load_model(os.path.join(fasttext_dir, fasttext_embeddings_name + '.bin')) 92 | super().__init__(self.model.get_dimension()) 93 | 94 | def lookup(self, word: str, l2_normalize: bool = True) -> np.ndarray: 95 | vec = self.model.get_word_vector(word) 96 | if np.count_nonzero(vec) == 0: 97 | # add small amount of noise to all-zero embeddings to make them work with masking / CRF 98 | vec += np.random.normal(0., scale=1e-6, size=len(vec)) 99 | 100 | return self.l2_normalize_if_needed(vec, l2_normalize) 101 | 102 | def __str__(self) -> str: 103 | return '' 104 | 105 | 106 | class CachedFastTextEmbeddings(FastTextEmbeddingsImpl): # always L2 normalized! 107 | def __init__(self, vocab=None): 108 | cache_path = os.path.join(fasttext_dir, fasttext_embeddings_name + '.pickle') 109 | if vocab is None: 110 | self.word2ind, self.matrix = pickle.load(open(cache_path, 'rb')) 111 | else: 112 | vocab = set(vocab) 113 | embeddings = PreloadFastTextEmbeddings() 114 | self.word2ind = {word: i + 1 for i, word in enumerate(vocab)} 115 | self.matrix = np.zeros((len(vocab) + 1, embeddings.size)) 116 | for i, word in tqdm(enumerate(vocab, start=1), desc='Looking up words', total=len(vocab)): 117 | self.matrix[i] = embeddings.lookup(word, l2_normalize=True) 118 | 119 | pickle.dump((self.word2ind, self.matrix), open(cache_path, 'wb')) 120 | super().__init__(self.matrix.shape[1]) 121 | 122 | def lookup(self, word: str, include_precomputed: bool = True) -> np.ndarray: 123 | index = self.word2ind.get(word) 124 | if index is not None: 125 | return self.matrix[index] 126 | 127 | index = self.precomputed_word2ind.get(word) 128 | if index is not None: 129 | return self.precomputed_matrix[index] 130 | 131 | raise RuntimeError(f'Cache/precomputed lookup failed for "{word}". Please rebuild the embedding cache.') 132 | -------------------------------------------------------------------------------- /deid/data/read.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import Sequence, Optional, Set 4 | 5 | from .token import Token, HIPAA_TOKEN_TYPE, BINARY_LABEL 6 | from ..env import env 7 | 8 | 9 | def _add_data_dir_if_needed(path: str) -> str: 10 | """ Adds the data directory to a path if it's not already a sub-path. 11 | 12 | >>> _add_data_dir_if_needed('train') == os.path.join(env.data_dir, 'train') 13 | True 14 | 15 | :param path: the input path 16 | :return: a path containing the data directory 17 | """ 18 | if os.path.realpath(env.data_dir) not in os.path.realpath(path): 19 | path = os.path.join(env.data_dir, path) 20 | return path 21 | 22 | 23 | def full_text_for_csv(filename: str) -> str: 24 | """ Returns the full text for a csv file that is saved to a .txt file with the same stem name. 
25 | 26 | :param filename: the csv filename 27 | :return: a string that is read from the corresponding txt file 28 | """ 29 | filename = _add_data_dir_if_needed(filename) 30 | 31 | if not filename.endswith('.csv'): 32 | raise ValueError(f'{filename} is not a csv file') 33 | 34 | return open(filename[:-4] + '.txt').read() 35 | 36 | 37 | def tokens_from_csv(file_or_dir: str, 38 | limit: Optional[int] = None, 39 | binary_classification: bool = False, 40 | hipaa_only: bool = False) -> Sequence[Token]: 41 | """ Parses a directory of csv files or a single csv file for tokens. 42 | 43 | :param file_or_dir: the csv file or directory to parse 44 | :param limit: upper limit for the number of csv files to parse 45 | :param binary_classification: set to True to skip the classes and use only generic BIO labels 46 | :param hipaa_only: set to True to skip all non-HIPAA tags 47 | 48 | :return: a list of Token objects 49 | """ 50 | 51 | def label_string(bio_string): 52 | if hipaa_only: 53 | if bio_string == 'O' or bio_string[2:] not in HIPAA_TOKEN_TYPE.keys(): 54 | return 'O' 55 | 56 | if binary_classification: 57 | # Not really binary: there is still a B, I, and O label (and the padding label). I tried using true binary 58 | # labels and there was no real difference, so I'm deciding to keep it like this. 59 | return 'O' if bio_string == 'O' else f'{bio_string[0]}-{BINARY_LABEL}' 60 | return bio_string 61 | 62 | file_or_dir = _add_data_dir_if_needed(file_or_dir) 63 | 64 | if os.path.isdir(file_or_dir): 65 | filenames = sorted([os.path.join(file_or_dir, f) for f in os.listdir(file_or_dir) if f.endswith('.csv')]) 66 | if len(filenames) == 0: 67 | raise ValueError(f'{file_or_dir} does not contain any csv files') 68 | elif file_or_dir.endswith('.csv'): 69 | filenames = [file_or_dir] 70 | else: 71 | raise ValueError(f'{file_or_dir} is not a csv file') 72 | 73 | tokens = [] 74 | for i, filename in enumerate(filenames): 75 | with open(filename) as f: 76 | reader = csv.reader(f) 77 | next(reader) # skip header 78 | for row in reader: 79 | tokens.append(Token(row[0], 80 | label_string(row[1]), 81 | *map(int, row[2:]))) 82 | if i == limit: 83 | break 84 | 85 | return tokens 86 | 87 | 88 | def split_sentences(tokens: Sequence[Token]) -> Sequence[Sequence[Token]]: 89 | """ Breaks a list of Token objects into sentence chunks delimited by sent_start and sent_end. Incomplete sentences 90 | are not included in the result. 91 | 92 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test'), Token.with_text(env.sent_end)])) 93 | 1 94 | 95 | >>> len(split_sentences([Token.with_text(env.sent_start), Token.with_text('test')])) 96 | 0 97 | 98 | :param tokens: the tokens to break into sentences 99 | :return: a list of sentences (i.e. a list of lists of tokens) 100 | """ 101 | sents = [] 102 | current_sent = [] 103 | for token in tokens: 104 | if token.text not in [env.sent_start, env.sent_end]: 105 | current_sent.append(token) 106 | if token.text == env.sent_end: 107 | if len(current_sent) > 0: 108 | sents.append(current_sent) 109 | current_sent = [] 110 | return sents 111 | 112 | 113 | def vocab_from_tokens(tokens: Sequence[Token]) -> Set[str]: 114 | """ Returns a set of words from a token sequence, excluding the special sent_start and sent_end tokens. 
115 | 116 | >>> sorted(vocab_from_tokens([Token.with_text(env.sent_start), Token.with_text('test'), \ 117 | Token.with_text('some'), Token.with_text('test'), Token.with_text('words'), Token.with_text(env.sent_end)])) 118 | ['some', 'test', 'words'] 119 | 120 | :param tokens: the tokens to convert to a vocabulary set 121 | :return: a set of words 122 | """ 123 | return set(token.text for token in tokens) - {env.sent_start, env.sent_end} 124 | -------------------------------------------------------------------------------- /deid/tools/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import os 4 | import random 5 | 6 | import yaml 7 | 8 | from ..env import env 9 | 10 | config_dir = os.path.join(env.resources_dir, 'config') 11 | generated_dir = os.path.join(config_dir, 'generated') 12 | if not os.path.isdir(generated_dir): 13 | os.mkdir(generated_dir) 14 | 15 | 16 | def generate_config(config): 17 | result = {} 18 | for key, value in config.items(): 19 | if key == 'choice': 20 | if isinstance(value, list): 21 | return random.choice(value) 22 | else: 23 | raise ValueError('does not support other inputs than lists at the moment') 24 | elif isinstance(value, dict): 25 | result[key] = generate_config(value) 26 | else: 27 | result[key] = value 28 | return result 29 | 30 | 31 | def generate_random_configs(config, name, n, start, output_path): 32 | for i in range(n): 33 | config_num = i + start 34 | result = generate_config(config) 35 | with open(os.path.join(output_path, name) + f'_{config_num:03d}.yaml', 'w') as f: 36 | f.write(yaml.dump(result)) 37 | print(f'Generated {n} configs.') 38 | 39 | 40 | def flatten_config(config, sep='--', prefix=None): 41 | """ 42 | >>> flatten_config({'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}}) 43 | {'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]} 44 | """ 45 | result = {} 46 | for key, value in config.items(): 47 | key = key if prefix is None else f'{prefix}{sep}{key}' 48 | if isinstance(value, dict): 49 | result.update(flatten_config(value, sep, prefix=key)) 50 | else: 51 | result[key] = value 52 | return result 53 | 54 | 55 | def unflatten_config(config, sep='--'): 56 | """ 57 | >>> unflatten_config({'a': 1, 'b--c--d': 4, 'b--c--e': [5, 6]}) 58 | {'a': 1, 'b': {'c': {'d': 4, 'e': [5, 6]}}} 59 | """ 60 | result = {} 61 | for key, value in config.items(): 62 | parts = key.split(sep) 63 | parent = result 64 | for child in parts[:-1]: 65 | if child not in parent.keys(): 66 | parent[child] = {} 67 | parent = parent[child] 68 | parent[parts[-1]] = value 69 | return result 70 | 71 | 72 | def remove_choices(config, sep='--'): 73 | """ 74 | >>> remove_choices({'a': 1, 'b--c--choice': 2}) 75 | {'a': 1, 'b--c': 2} 76 | """ 77 | result = {} 78 | for key, value in config.items(): 79 | if key.endswith(f'{sep}choice'): 80 | result[key[:-len(f'{sep}choice')]] = value 81 | else: 82 | result[key] = value 83 | return result 84 | 85 | 86 | def generate_grid_configs(config, name, output_path): 87 | flattened = flatten_config(config) 88 | choice_keys = [key for key in flattened.keys() if key.endswith('choice')] 89 | 90 | for i, choices in enumerate(itertools.product(*[flattened[key] for key in choice_keys]), start=1): 91 | for choice_key, choice in zip(choice_keys, choices): 92 | flattened[choice_key] = choice 93 | 94 | result = unflatten_config(remove_choices(flattened)) 95 | with open(os.path.join(output_path, name) + f'_grid_{i:03d}.yaml', 'w') as f: 96 | f.write(yaml.dump(result)) 97 | print(f'Generated 
{i} configs.') 98 | 99 | 100 | def find_config(name): 101 | filename = name 102 | if os.path.isfile(filename): 103 | return filename 104 | 105 | filename = os.path.join(config_dir, filename) 106 | if os.path.isfile(filename): 107 | return filename 108 | 109 | filename = filename + '.yaml' 110 | if os.path.isfile(filename): 111 | return filename 112 | 113 | raise argparse.ArgumentTypeError(f'{name} is not a valid config name or path') 114 | 115 | 116 | def main(): 117 | def ensure_dir(arg) -> str: 118 | if type(arg) == str and os.path.isdir(arg): 119 | return arg 120 | raise argparse.ArgumentTypeError(f'{arg} is not a directory') 121 | 122 | parser = argparse.ArgumentParser() 123 | parser.description = 'Create experiment configs from a config template.' 124 | parser.add_argument('input_config', help='the input config template') 125 | parser.add_argument('-o', '--output_path', help='the path to store the results', type=ensure_dir, 126 | default=generated_dir) 127 | parser.add_argument('-n', '--n', help='the number of configs to generate', type=int, default=10) 128 | parser.add_argument('-a', '--all', help='generate all configs (grid), overrides --n', action='store_true') 129 | parser.add_argument('-s', '--start', help='the starting number for config filenames', type=int, default=0) 130 | 131 | args = parser.parse_args() 132 | 133 | filename = find_config(args.input_config) 134 | config = yaml.load(open(filename)) 135 | name = '.'.join(os.path.basename(filename).split('.')[:-1]) 136 | name = name.replace('_template', '') 137 | if args.all: 138 | generate_grid_configs(config, name, args.output_path) 139 | else: 140 | generate_random_configs(config, name, args.n, args.start, args.output_path) 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | -------------------------------------------------------------------------------- /deid/model/representer.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Dense, Lambda, LSTM, Bidirectional, TimeDistributed, Masking 3 | from keras.models import Sequential 4 | 5 | from .layers import Noise 6 | 7 | 8 | def get(identifier): 9 | if identifier == 'noisy': 10 | return build_noise_representer 11 | elif identifier == 'dense': 12 | return build_dense_representer 13 | elif identifier == 'lstm': 14 | return build_lstm_representer 15 | else: 16 | raise ValueError(f'Unknown representation type: "{identifier}"') 17 | 18 | 19 | def build_noise_representer(embedding_size, representation_size, noises, single_stddev, apply_noise, 20 | l2_normalize=False, **_): 21 | """ Build a representer that applies a series of noise steps. 22 | 23 | :param embedding_size: the embedding (input) size 24 | :param representation_size: the representation (output) size 25 | :param noises: the types of noise to add if using the 'noisy' representation. 
Must be a single 26 | identifier or sequence of identifiers, allowed identifiers are '+'/'add' or '*'/'mult' 27 | :param single_stddev: whether to use a single stddev for all embedding dimensions 28 | :param apply_noise: whether to apply noise or the mean in this model 29 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 30 | :return: a noisy representer model 31 | """ 32 | if type(noises) == str: 33 | noises = [noises] 34 | 35 | model = Sequential(name='representer') 36 | model.add(Masking(input_shape=(None, embedding_size))) 37 | if l2_normalize: 38 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 39 | for i, noise_operation in enumerate(noises): 40 | model.add(Noise(noise_operation, apply_noise=apply_noise, single_stddev=single_stddev, 41 | input_shape=(None, embedding_size))) 42 | 43 | model.add(TimeDistributed(Dense(representation_size))) 44 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 45 | return model 46 | 47 | 48 | def build_dense_representer(embedding_size, representation_size, apply_noise, num_hidden=2, hidden_size=None, 49 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_): 50 | """ Build a dense representer that applies the same dense weights to each element in the input sequence. 51 | 52 | :param embedding_size: the embedding (input) size 53 | :param representation_size: the representation (output) size 54 | :param apply_noise: whether to apply noise or the mean in this model 55 | :param num_hidden: the number of hidden layers in the dense model 56 | :param hidden_size: the number of units per hidden layer in the dense model 57 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 58 | :param noise_before: whether to add noise with trainable stddev to the inputs 59 | :param noise_after: whether to add noise with trainable stddev to the outputs 60 | :param single_stddev: whether to use a single stddev for all embedding dimensions 61 | :param _: ignored kwargs 62 | :return: a dense representer model 63 | """ 64 | if hidden_size is None: 65 | hidden_size = embedding_size 66 | 67 | model = Sequential(name='representer') 68 | model.add(Masking(input_shape=(None, embedding_size))) 69 | if l2_normalize: 70 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 71 | if noise_before: 72 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 73 | 74 | for _ in range(num_hidden): 75 | model.add(TimeDistributed(Dense(hidden_size, activation='relu'))) 76 | model.add(TimeDistributed(Dense(representation_size))) 77 | 78 | if noise_after: 79 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 80 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 81 | return model 82 | 83 | 84 | def build_lstm_representer(embedding_size, representation_size, apply_noise, num_hidden=1, lstm_size=128, 85 | l2_normalize=False, noise_before=True, noise_after=True, single_stddev=False, **_): 86 | """ Build an LSTM representer. 
87 | 88 | :param embedding_size: the embedding (input) size 89 | :param representation_size: the representation (output) size 90 | :param apply_noise: whether to apply noise or the mean in this model 91 | :param num_hidden: the number of LSTM layers 92 | :param lstm_size: the number of LSTM units per direction and layer 93 | :param l2_normalize: whether to L2 normalize the inputs (outputs are always L2 normalized) 94 | :param noise_before: whether to add noise with trainable stddev to the inputs 95 | :param noise_after: whether to add noise with trainable stddev to the outputs 96 | :param single_stddev: whether to use a single stddev for all embedding dimensions 97 | :param _: ignored kwargs 98 | :return: an LSTM representer model 99 | """ 100 | model = Sequential(name='representer') 101 | model.add(Masking(input_shape=(None, embedding_size))) 102 | if l2_normalize: 103 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 104 | if noise_before: 105 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 106 | 107 | for _ in range(num_hidden): 108 | model.add(Bidirectional(LSTM(lstm_size, return_sequences=True))) 109 | model.add(TimeDistributed(Dense(representation_size))) 110 | 111 | if noise_after: 112 | model.add(Noise('add', single_stddev=single_stddev, apply_noise=apply_noise)) 113 | model.add(Lambda(lambda x: K.l2_normalize(x, axis=-1))) 114 | return model 115 | -------------------------------------------------------------------------------- /deid/tools/i2b2_xml_to_csv.py: -------------------------------------------------------------------------------- 1 | # Call this using `python -m deid.tools.i2b2_xml_to_csv [params]` 2 | 3 | import argparse 4 | import csv 5 | import html 6 | import os 7 | from typing import Tuple, Sequence 8 | 9 | from bs4 import BeautifulSoup 10 | from tqdm import tqdm 11 | 12 | from ..data import Token, tokenize 13 | 14 | 15 | def xml_to_csv(filename: str, output_dir: str, check_alignment) -> None: 16 | tokens, text = xml_to_annotated_tokens_and_text(filename, check_alignment) 17 | 18 | path_without_ext = os.path.join(output_dir, os.path.basename(filename)[:-4]) 19 | 20 | with open(path_without_ext + '.csv', 'w') as f: 21 | writer = csv.writer(f) 22 | writer.writerow(['text', 'type', 'start', 'end']) 23 | writer.writerows(tokens) 24 | 25 | with open(path_without_ext + '.txt', 'w') as f: 26 | f.write(text) 27 | 28 | 29 | def xml_to_annotated_tokens_and_text(filename, check_alignment) -> Tuple[Sequence[Token], str]: 30 | soup = BeautifulSoup(open(filename).read(), features='xml') 31 | 32 | text = str(soup.find('TEXT').contents[0]) 33 | tags = soup.find('TAGS').findChildren() 34 | 35 | if check_alignment: 36 | # Sanity check: compare the tag text with the above text. 37 | # Ignoring differences where only a '\n' is missing from the tag text because this occurs often in the data 38 | # and does not seem to matter for us.
39 | for tag in tags: 40 | tag_text, original_text = tag.get('text'), text[int(tag.get('start')):int(tag.get('end'))] 41 | if tag_text != original_text and tag_text != original_text.replace('\n', ' '): 42 | location = f"{os.path.basename(filename)}[{tag.get('start')}:{tag.get('end')}]" 43 | tqdm.write(f"{location} (tag) {tag_text.__repr__()} ≠ (original) {original_text.__repr__()}") 44 | 45 | # TODO check here if the start and end tags actually fall on tokens 46 | 47 | doc = tokenize(text) 48 | return annotate_with_tags(doc, tags), text 49 | 50 | 51 | def annotate_with_tags(doc, tags) -> Sequence[Token]: 52 | def tag_start(i): 53 | return int(tags[i].get('start')) 54 | 55 | def tag_end(i): 56 | return int(tags[i].get('end')) 57 | 58 | def is_current_tag(token): 59 | if token.idx == tag_start(current_tag): 60 | return True 61 | if token.idx >= tag_start(current_tag) and token.idx + len(token.text) <= tag_end(current_tag): 62 | return True 63 | if token.idx < tag_start(current_tag) < token.idx + len(token.text): 64 | return True 65 | return False 66 | 67 | current_tag = 0 68 | 69 | result = [] 70 | for sentence in doc.sents: 71 | continue_tag_type = None # set to the tag type string if the tag is not yet processed fully 72 | result.append(Token('', 'O', sentence[0].idx, sentence[0].idx)) 73 | 74 | for token in sentence: 75 | if continue_tag_type and token.idx < tag_end(current_tag): 76 | tag = f'I-{continue_tag_type}' 77 | else: 78 | if token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1: 79 | current_tag += 1 80 | 81 | # make sure we did not skip an entire tag 82 | while token.idx >= tag_end(current_tag) and current_tag < len(tags) - 1: 83 | print('Skipping a tag:', tags[current_tag].get('TYPE'), tags[current_tag].get('text')) 84 | current_tag += 1 85 | 86 | if is_current_tag(token): 87 | continue_tag_type = tags[current_tag].get('TYPE') 88 | tag = f'B-{continue_tag_type}' 89 | else: 90 | tag = 'O' 91 | continue_tag_type = None 92 | 93 | token_text = token._.unescaped_html if token._.unescaped_html is not None else token.text 94 | token_text = token_text.strip() 95 | if len(token_text) == 0 and tag.startswith('B'): 96 | continue_tag_type = None 97 | 98 | if len(token_text) != 0: 99 | result.append(Token(token_text, tag, token.idx, token.idx + len(token))) 100 | 101 | result.append(Token('', 'O', sentence[-1].idx, sentence[-1].idx)) 102 | 103 | return result 104 | 105 | 106 | def main() -> None: 107 | def ensure_file_or_dir(arg) -> str: 108 | if type(arg) == str and (os.path.isfile(arg) or os.path.isdir(arg)): 109 | return arg 110 | raise argparse.ArgumentTypeError(f'{arg} is not a file or directory') 111 | 112 | def ensure_dir(arg) -> str: 113 | if type(arg) == str and os.path.isdir(arg): 114 | return arg 115 | raise argparse.ArgumentTypeError(f'{arg} is not a directory') 116 | 117 | parser = argparse.ArgumentParser() 118 | parser.add_argument('file_or_dir', help='the input file(s)', type=ensure_file_or_dir) 119 | parser.add_argument('output_dir', help='save the csv file(s) here', type=ensure_dir) 120 | parser.add_argument('--check', help='check the tag/text alignment', action='store_true') 121 | args = parser.parse_args() 122 | 123 | if os.path.isdir(args.file_or_dir): 124 | filenames = sorted([file for file in os.listdir(args.file_or_dir) if file.endswith('.xml')]) 125 | if len(filenames) == 0: 126 | print('No XML files found.') 127 | 128 | pbar = tqdm(filenames) 129 | for filename in pbar: 130 | pbar.set_description(filename) 131 | path = os.path.join(args.file_or_dir, 
filename) 132 | xml_to_csv(path, output_dir=args.output_dir, check_alignment=args.check) 133 | else: 134 | xml_to_csv(args.file_or_dir, output_dir=args.output_dir, check_alignment=args.check) 135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /deid/data/augment/augment.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | from typing import Optional, Callable, Sequence, NamedTuple, Tuple, Dict, Union 5 | 6 | from spacy.lang.en.stop_words import STOP_WORDS 7 | 8 | from . import AugmentStrategy, get as get_strategy 9 | from .. import Token, Sentence 10 | from ...embeddings import Embeddings 11 | 12 | logger = logging.getLogger() 13 | digit_pattern = '^[0-9]*$' 14 | 15 | 16 | def default_exclude(word: str) -> bool: 17 | return word.lower() in STOP_WORDS or bool(re.match('^[.,:;/+\-*=\\\\]*$', word)) 18 | 19 | 20 | def exclude_nothing(_: str) -> bool: 21 | return False 22 | 23 | 24 | class AugmentedSentence(NamedTuple): 25 | original: Sentence 26 | augmented: Sequence[Sentence] 27 | 28 | 29 | class Augment: 30 | def __init__(self, embeddings: Embeddings, 31 | strategy: Union[AugmentStrategy, str], 32 | digit_strategy: Optional[Union[AugmentStrategy, str]] = None, 33 | n_augmentations: int = 1, 34 | augment_all: bool = False, 35 | augment_max: Optional[int] = None, 36 | exclude_unknown: bool = True, 37 | exclude: Optional[Callable[[str], bool]] = default_exclude) -> None: 38 | self.embeddings = embeddings 39 | self.augment_all = augment_all 40 | self.exclude_unknown = exclude_unknown 41 | if isinstance(strategy, str): 42 | self.strategy = get_strategy(strategy) 43 | else: 44 | self.strategy = strategy 45 | 46 | if digit_strategy is None: 47 | self.digit_strategy = self.strategy 48 | elif isinstance(digit_strategy, str): 49 | self.digit_strategy = get_strategy(digit_strategy) 50 | else: 51 | self.digit_strategy = digit_strategy 52 | 53 | self.n_augmentations = n_augmentations 54 | self.augment_max = augment_max if augment_max is not None else 10_000 55 | self.exclude = exclude if exclude is not None else exclude_nothing 56 | 57 | def __str__(self) -> str: 58 | return f'' 61 | 62 | def _strategy_or_digit_strategy(self, word: str) -> AugmentStrategy: 63 | if re.match(digit_pattern, word): 64 | return self.digit_strategy 65 | return self.strategy 66 | 67 | def _should_be_excluded(self, word, label): 68 | exclude_because_o = not self.augment_all and label == 'O' 69 | exclude_because_unknown = self.exclude_unknown and self.embeddings.is_unknown(word) 70 | return self.exclude(word) or exclude_because_o or exclude_because_unknown 71 | 72 | def lookup_sentence(self, sentence: Sequence[Token]) -> AugmentedSentence: 73 | """ If the sentence is only O, just look it up. 
Otherwise: 74 | - apply the word strategies and keep track of the embedding strategies that need to be applied later 75 | - look up the sentence 76 | - apply the embedding strategies 77 | 78 | :param sentence: the input sentence 79 | :return: an AugmentedSentence object 80 | """ 81 | original = self.embeddings.lookup_sentence([token.text for token in sentence]) 82 | if not self.augment_all and all([token.type == 'O' for token in sentence]): 83 | return AugmentedSentence(original, []) 84 | 85 | apply_word_strategies_result = [self.apply_word_strategies(sentence) for _ in range(self.n_augmentations)] 86 | augment_embeddings, sentences_for_lookup = zip(*apply_word_strategies_result) 87 | embedded_sentences = self.embeddings.lookup_sentences(sentences_for_lookup) 88 | augmented = [self.apply_embedding_strategies(augment_embedding, embedded_sent) for 89 | augment_embedding, embedded_sent in zip(augment_embeddings, embedded_sentences)] 90 | return AugmentedSentence(original, augmented) 91 | 92 | def apply_embedding_strategies(self, augment_embedding: Dict[int, AugmentStrategy], 93 | sentence_embeddings: Sentence) -> Sentence: 94 | sentence_embeddings = list(sentence_embeddings) 95 | for i, strategy in augment_embedding.items(): 96 | augmented = strategy.augment(sentence_embeddings[i]) 97 | assert len(augmented) == self.embeddings.size 98 | sentence_embeddings[i] = augmented 99 | return sentence_embeddings 100 | 101 | def apply_word_strategies(self, sentence: Sequence[Token]) -> Tuple[Dict[int, AugmentStrategy], Sequence[str]]: 102 | sentence_for_lookup = [] 103 | augment_embedding = {} 104 | 105 | augment_word_ind = [] 106 | for i, token in enumerate(sentence): 107 | word, label = token.text, token.type 108 | if not self._should_be_excluded(word, label): 109 | strategy = self._strategy_or_digit_strategy(word) 110 | if strategy.augments_words: 111 | augment_word_ind.append(i) 112 | else: 113 | augment_embedding[i] = strategy 114 | logger.info('deferring strategy %s to augment "%s"', strategy, word) 115 | 116 | if len(augment_word_ind) > self.augment_max: 117 | augment_word_ind = random.sample(augment_word_ind, self.augment_max) 118 | 119 | for i, token in enumerate(sentence): 120 | word, label = token.text, token.type 121 | if i in augment_word_ind: 122 | strategy = self._strategy_or_digit_strategy(word) 123 | augmented = strategy.augment(word) 124 | logger.info('using strategy %s to augment "%s" to "%s"', strategy, word, augmented) 125 | sentence_for_lookup.append(augmented) 126 | else: 127 | sentence_for_lookup.append(word) 128 | 129 | return augment_embedding, sentence_for_lookup 130 | -------------------------------------------------------------------------------- /deid/experiment/evaluation.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | from keras.callbacks import Callback 7 | from keras.utils.generic_utils import Progbar 8 | from terminaltables import SingleTable as TerminalTable 9 | 10 | from ..data import TestSet, prediction_to_xml 11 | from ..env import env 12 | from ..tools.i2b2.classes import PHITrackEvaluation, Evaluate 13 | from ..tools.i2b2.evaluate import evaluate as i2b2_evaluate 14 | 15 | 16 | def _save_predictions_to_xmls(model, batch_size, embeddings, label2ind, ind2label, test_set, predictions_dir, 17 | binary_classification, hipaa_only, extra_features, require_argmax): 18 | if not os.path.isdir(predictions_dir): 19 | 
os.mkdir(predictions_dir) 20 | 21 | print('Saving test XMLs to', predictions_dir) 22 | progress_bar = Progbar(target=TestSet.number_of_test_sets(test_set), verbose=env.keras_verbose) 23 | 24 | for i, te in enumerate(TestSet.test_sets(embeddings, 25 | test_set=test_set, 26 | label2ind=label2ind, 27 | binary_classification=binary_classification, 28 | hipaa_only=hipaa_only, 29 | extra_features=extra_features), start=1): 30 | preds = model.predict([te.X, te.X_extra], batch_size=batch_size) 31 | if require_argmax: 32 | preds = np.argmax(preds, axis=-1) 33 | xml = prediction_to_xml(te.X, preds, te.text, te.sents, ind2label) 34 | filename = os.path.basename(te.filename)[:-4] + '.xml' 35 | with open(os.path.join(predictions_dir, filename), 'w') as f: 36 | f.write(xml) 37 | 38 | progress_bar.update(i) 39 | 40 | 41 | def _run_official_evaluation(predictions_dir, test_set, output_file, binary_classification=False, hipaa_only=False, 42 | print_summary=True): 43 | xml_test_dir = os.path.join(env.data_dir, test_set + '_xml') 44 | 45 | def call_i2b2_evaluate(): 46 | return i2b2_evaluate([predictions_dir], xml_test_dir, PHITrackEvaluation, verbose=False) 47 | 48 | if output_file is not None: 49 | with open(output_file, 'w') as f: 50 | with contextlib.redirect_stdout(f): 51 | evaluations = call_i2b2_evaluate() 52 | else: 53 | evaluations = call_i2b2_evaluate() 54 | 55 | result = OrderedDict() 56 | for evaluation in evaluations.evaluations: 57 | mp = evaluation.micro_precision() 58 | mr = evaluation.micro_recall() 59 | f1 = Evaluate.F_beta(mr, mp) 60 | result[evaluation.sys_id] = {'precision': mp, 'recall': mr, 'f1': f1} 61 | 62 | if print_summary: 63 | print('Evaluating', predictions_dir, xml_test_dir) 64 | print('Evaluation summary:') 65 | table_data = [['Evaluation', 'Precision', 'Recall', 'F1 (micro)']] 66 | for name, values in result.items(): 67 | if binary_classification and 'Binary' not in name: 68 | continue 69 | if hipaa_only and 'HIPAA' not in name: 70 | continue 71 | if binary_classification and not hipaa_only and 'HIPAA' in name: 72 | continue # evaluation is wrong for these because all tags get mapped to a HIPAA (name) tag 73 | table_data.append([name] + [round(values[key], 5) for key in ['precision', 'recall', 'f1']]) 74 | 75 | table = TerminalTable(table_data) 76 | print(table.table) 77 | print(f'(see complete evaluation at {output_file})') 78 | 79 | return result 80 | 81 | 82 | def evaluate_deid_performance(model, batch_size, embeddings, label2ind, ind2label, experiment_dir, epoch=1, 83 | test_set='validation', binary_classification=False, 84 | hipaa_only=False, extra_features=(), require_argmax=True): 85 | predictions_dir = os.path.join(experiment_dir, f'predictions_epoch_{epoch:02d}') 86 | _save_predictions_to_xmls(model=model, batch_size=batch_size, embeddings=embeddings, label2ind=label2ind, 87 | ind2label=ind2label, test_set=test_set, predictions_dir=predictions_dir, 88 | binary_classification=binary_classification, hipaa_only=hipaa_only, 89 | extra_features=extra_features, require_argmax=require_argmax) 90 | 91 | output_file = predictions_dir + '.txt' 92 | return _run_official_evaluation(predictions_dir=predictions_dir, test_set=test_set, output_file=output_file, 93 | print_summary=True, binary_classification=binary_classification, 94 | hipaa_only=hipaa_only) 95 | 96 | 97 | class DeidentificationEvaluationCallback(Callback): 98 | def __init__(self, deid_model, batch_size, embeddings, label2ind, ind2label, test_set, experiment_dir, 99 | evaluate_every, binary_classification, hipaa_only, 
extra_features, call_model=False): 100 | super().__init__() 101 | self.deid_model = deid_model 102 | self.batch_size = batch_size 103 | self.embeddings = embeddings 104 | self.label2ind = label2ind 105 | self.ind2label = ind2label 106 | self.test_set = test_set 107 | self.experiment_dir = experiment_dir 108 | self.evaluate_every = evaluate_every 109 | self.binary_classification = binary_classification 110 | self.hipaa_only = hipaa_only 111 | self.extra_features = extra_features 112 | self.call_model = call_model 113 | 114 | def on_epoch_end(self, epoch, logs=None): 115 | deid_model = self.deid_model() if self.call_model else self.deid_model 116 | epoch = epoch + 1 # keras uses 0-indexed epochs 117 | if epoch % self.evaluate_every == 0: 118 | evaluate_deid_performance(model=deid_model, batch_size=self.batch_size, embeddings=self.embeddings, 119 | label2ind=self.label2ind, ind2label=self.ind2label, epoch=epoch, 120 | test_set=self.test_set, experiment_dir=self.experiment_dir, 121 | binary_classification=self.binary_classification, 122 | hipaa_only=self.hipaa_only, 123 | extra_features=self.extra_features) 124 | -------------------------------------------------------------------------------- /deid/model/adversarial.py: -------------------------------------------------------------------------------- 1 | from types import MappingProxyType 2 | 3 | from keras import backend as K 4 | from keras.layers import Input, Lambda, concatenate 5 | from keras.losses import binary_crossentropy 6 | from keras.models import Model 7 | 8 | from . import get as get_deidentifier 9 | from .adversary import get as get_adversary 10 | from .optimizer import get as get_optimizer 11 | from .representer import get as get_representer 12 | 13 | 14 | class AdversarialModel: 15 | def __init__(self, 16 | *_, # don't allow any positional arguments 17 | embedding_size, 18 | output_size, 19 | representation_size=None, 20 | representation_type='lstm', 21 | representation_args=MappingProxyType({}), 22 | deidentifier_type='lstm', 23 | deidentifier_args=MappingProxyType({}), 24 | extra_input_size=0, 25 | adversaries=('discriminate-representations', 'discriminate-representation-embedding-pair'), 26 | adversary_args=MappingProxyType({}), 27 | optimizer='adam', 28 | optimizer_args=MappingProxyType({})): 29 | """ Initialize the adversarial model. 
Its components are 30 | - a representation model that transforms embeddings into a (noisy) representation 31 | - a deidentifier model that performs the deidentification task from the representation 32 | - an adversary model that tries to reconstruct information from the representation 33 | 34 | :param embedding_size: the representation input size 35 | :param output_size: the deidentifier output size 36 | :param representation_size: the representation size (or None to use the embedding size) 37 | :param representation_type: the type of representation model to use (see representer.py) 38 | :param representation_args: the kwargs for the representation model 39 | :param deidentifier_type: the type of deidentifier model to use (see deidentifier.py) 40 | :param deidentifier_args: the kwargs for the deidentifier model 41 | :param adversaries: a sequence of adversary type strings (see adversary.py) 42 | :param adversary_args: a dictionary of adversary args or a list of dictionaries (if every adversary should get 43 | its own args) 44 | :param optimizer: the type of optimizer to use (see optimizer.py) 45 | :param optimizer_args: the args passed to the optimizer 46 | """ 47 | 48 | if representation_size is None: 49 | representation_size = embedding_size 50 | 51 | original_embeddings = Input(shape=(None, embedding_size)) 52 | 53 | build_representer = get_representer(representation_type) 54 | self.train_representer = build_representer(embedding_size=embedding_size, 55 | representation_size=representation_size, 56 | apply_noise=True, 57 | **representation_args) 58 | 59 | train_representation = self.train_representer(original_embeddings) 60 | 61 | deidentifier, deidentifier_loss = get_deidentifier(deidentifier_type)( 62 | name='deidentifier', 63 | input_size=representation_size, 64 | output_size=output_size, 65 | extra_input_size=extra_input_size, 66 | **deidentifier_args) 67 | 68 | extra_input = Input(shape=(None, extra_input_size)) 69 | if extra_input_size > 0: 70 | train_deidentifier_input = [train_representation, extra_input] 71 | else: 72 | train_deidentifier_input = train_representation 73 | 74 | train_deidentifier_output = deidentifier(train_deidentifier_input) 75 | self.pretrain_deidentifier = Model([original_embeddings, extra_input], train_deidentifier_output) 76 | self.pretrain_deidentifier.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), loss=deidentifier_loss, 77 | metrics=['accuracy']) 78 | 79 | self.train_representer.trainable = False 80 | 81 | adv_embeddings = Input(shape=(None, embedding_size)) 82 | adv_representation = self.train_representer(adv_embeddings) 83 | 84 | adv_fake_embeddings = Input(shape=(None, embedding_size)) 85 | adv_fake_representation = self.train_representer(adv_fake_embeddings) 86 | 87 | adversary_models = [] 88 | adversary_outputs = [] 89 | if isinstance(adversary_args, dict): 90 | adversary_args = [adversary_args for _ in adversaries] 91 | 92 | for adversary_type, args in zip(adversaries, adversary_args): 93 | adversary = get_adversary(adversary_type)(inputs={'train_representation': adv_representation, 94 | 'original_embeddings': adv_embeddings, 95 | 'fake_representation': adv_fake_representation}, 96 | representation_size=representation_size, 97 | embedding_size=embedding_size, 98 | **args) 99 | adversary_models.append(adversary.model) 100 | adversary_outputs.append(adversary.model(adversary.inputs)) 101 | adversary.model.summary() 102 | if len(adversary_outputs) > 1: 103 | adversary_output = concatenate(adversary_outputs, axis=-1) 104 | else: 105 |
adversary_output = adversary_outputs[0] 106 | adversary_output = Lambda(lambda x: K.mean(x, axis=-1, keepdims=True), name='adversary')(adversary_output) 107 | 108 | self.pretrain_adversary = Model([adv_embeddings, adv_fake_embeddings], adversary_output) 109 | self.pretrain_adversary.summary() 110 | self.pretrain_adversary.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 111 | loss='binary_crossentropy', 112 | metrics=['accuracy']) 113 | 114 | self.fine_tune_branches = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings], 115 | [train_deidentifier_output, adversary_output]) 116 | self.fine_tune_branches.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 117 | loss=[deidentifier_loss, 'binary_crossentropy'], 118 | metrics=['accuracy']) 119 | 120 | self.train_representer.trainable = True 121 | deidentifier.trainable = False 122 | for adversary in adversary_models: 123 | adversary.trainable = False 124 | self.fine_tune_representer = Model([original_embeddings, extra_input, adv_embeddings, adv_fake_embeddings], 125 | [train_deidentifier_output, adversary_output]) 126 | self.fine_tune_representer.compile(optimizer=get_optimizer(optimizer)(**optimizer_args), 127 | loss=[deidentifier_loss, adversarial_objective], 128 | loss_weights=[1, 1], metrics=['accuracy']) 129 | 130 | @property 131 | def complete_model(self): 132 | return self.fine_tune_branches 133 | 134 | 135 | def adversarial_objective(y_true, y_pred): 136 | loss = binary_crossentropy(y_true, y_pred) 137 | random_guessing = -K.log(0.5) 138 | return K.abs(loss - random_guessing) 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sharing Training Data for De-Identification 2 | 3 | [![Build Status](https://www.travis-ci.org/maxfriedrich/deid-training-data.svg?branch=master)](https://www.travis-ci.org/maxfriedrich/deid-training-data) 4 | 5 | **Update 2019-08-11:** Our paper ["Adversarial Learning of Privacy-Preserving Text Representations for De-Identification of Medical Records"](https://www.aclweb.org/anthology/papers/P/P19/P19-1584/) was published at ACL 2019. 6 | 7 | This is the code for my [Master's thesis](https://www.inf.uni-hamburg.de/en/inst/ab/lt/teaching/theses/completed-theses/2018-ma-friedrich.pdf). It's about automatic transformations that can be applied to medical text data that… 8 | 9 | - allow training a de-identification model (i.e. finding all protected information in text) 10 | - do not allow attackers to infer any protected information. 11 | 12 | ## Main Contribution 13 | 14 | An adversarial deep learning architecture that learns a private representation of medical text. The representation model is an LSTM model that adds Gaussian noise of a trainable scale to its inputs and outputs. 15 | 16 | Adversarial architecture 17 | 18 | The representation fulfills two invariance criteria that are both enforced by binary classifier LSTM adversary models that receive sequence pairs as inputs. 19 | 20 | Left: Representations should be invariant to *any* protected information token being replaced with a neighbor in an embedding space (e.g. substituting a name or date). 21 | 22 | Right: Looking up the same token sequence multiple times should result in a representation that is randomly different by a high enough degree that it could be the representation of a neighboring sequence. 
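During fine-tuning of the representation model, the adversaries are not simply maximized against; the representer is trained with the `adversarial_objective` defined in [`deid/model/adversarial.py`](deid/model/adversarial.py), which pushes the adversaries' binary cross-entropy toward -log(0.5) ≈ 0.693, i.e. toward random guessing. A NumPy re-statement for illustration (the function name below is only for this sketch, not part of the package):

```python
import numpy as np


def adversarial_objective_reference(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Distance of the adversary's binary cross-entropy from chance level, -log(0.5)."""
    eps = 1e-7  # avoid log(0), similar to the clipping Keras applies internally
    y_pred = np.clip(y_pred, eps, 1 - eps)
    bce = np.mean(-(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)))
    return abs(bce - (-np.log(0.5)))


# An adversary that always answers 0.5 gives the representer the minimum objective value of 0.
print(adversarial_objective_reference(np.array([1.0, 0.0]), np.array([0.5, 0.5])))  # ~0.0
```

The minimum is reached exactly when the adversary cannot do better than chance on the sequence pairs, which is how the two invariance criteria above are enforced.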
23 | 24 | First adversary   Second adversary 25 | 26 | ## Installation 27 | 28 | - Check out the repository including submodules. If you're doing a new clone: 29 | 30 | ```bash 31 | git clone --recurse-submodules git@github.com:maxfriedrich/deid-training-data.git 32 | ``` 33 | 34 | - Or, if you already cloned the repository: 35 | 36 | ```bash 37 | git submodule update --init 38 | ``` 39 | 40 | - Create a Conda environment for the project. If you want the environment name to be something other than `deid-training-data` or use `tensorflow-gpu` instead of `tensorflow`, adapt the `environment.yml` file before running this command. Then activate the environment. 41 | 42 | ```bash 43 | cd deid-training-data 44 | conda env create 45 | conda activate deid-training-data 46 | ``` 47 | 48 | - Download the English language model for spaCy: 49 | 50 | ```bash 51 | python -m spacy download en 52 | ``` 53 | 54 | - Verify that the environment is working by running the tests: 55 | 56 | ```bash 57 | DEID_TEST_CONFIG=1 nosetests --with-doctest 58 | ``` 59 | 60 | - Adapt the [environment file](deid/env.py). 61 | 62 | - Decide which embeddings you want to use: 63 | 64 | - For **FastText**, get a [fastText embeddings binary](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.bin.zip) (4.5 GB download) as well as the [corresponding `.vec` file of precomputed embeddings](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip) (590 MB download) and put them in the resources directory. Adapt the path [here](deid/embeddings/fasttext.py) if necessary. Then convert the precomputed fastText embeddings to a `{word: ind}` dictionary and numpy matrix file: 65 | 66 | ```bash 67 | python -m deid.tools.embeddings --fasttext-precomputed 68 | ``` 69 | 70 | - For **GloVe**, download [a set of pre-trained word vectors](https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors) and put it into the resources directory. Adapt the path and dimension [here](deid/embeddings/glove.py) if you're not using the Wikipedia-pretrained 300d embeddings. 71 | 72 | - For **ELMo**, you don't need to download anything. 73 | 74 | - Get the [i2b2 data](https://www.i2b2.org/NLP/DataSets/) and extract `training-PHI-Gold-Set1` into `train_xml`, `training-PHI-Gold-Set2` into `validation_xml`, and `testing-PHI-Gold-fixed` into a `test_xml` directory. 75 | 76 | - Fix one of the xml files where indices are offset after a special character: 77 | 78 | ```bash 79 | python -m deid.tools.fix_180-03 /path/to/validation_xml 80 | ``` 81 | 82 | - Convert the xml files with standoff annotations to an IOB2 format csv and a txt file containing the raw text: 83 | 84 | ```bash 85 | ./scripts/xml_to_csv 86 | ``` 87 | 88 | The `xml_to_csv` script calls the `deid.tools.i2b2_xml_to_csv` module with the `train_xml`, `validation_xml` and `test_xml` directories. It will output some inconsistencies in the data (standoff annotation texts differ from original text), but we'll ignore those for now. 89 | 90 | - Create an embeddings cache, again depending on your choice(s) of embeddings: 91 | 92 | - For **FastText**, this command writes all words from the train, test, and validation set to a pickle cache (5 minutes on my machine). 93 | 94 | ```bash 95 | python -m deid.tools.embeddings --fasttext-cache 96 | ``` 97 | 98 | - For **ELMo**, this command looks up all sentences from the train, test, and validation set and writes them to many pickle files. This is slow, taking up to 3 hours.
99 | 100 | ```bash 101 | python -m deid.tools.embeddings --elmo-cache 102 | ``` 103 | 104 | ## Experiments 105 | 106 | You can find these experiments in the [`deid/experiment`](deid/experiment) directory: 107 | 108 | - A [basic experiment](deid/experiment/basic.py) that can be used for training models on raw as well as augmented data 109 | - An implementation of [alternating adversarial training](deid/experiment/alternating.py) similar to [Feutry et al. (2018)](https://arxiv.org/abs/1802.09386) 110 | - Evaluation experiments for [automatic pseudonymization](deid/experiment/mtn_evaluation.py), discriminating [real from automatically pseudonymized sentences](deid/experiment/fake_sentences.py), and the [alternating training](deid/experiment/alternating_evaluation.py) 111 | 112 | To run an experiment: 113 | 114 | - Modify the [example config template](deid/config_template.yaml.example) and rename it to `.yaml`. Generate configs from it using the `config` tool: 115 | 116 | ```bash 117 | python -m deid.tools.config /path/to/config_template.yaml 118 | ``` 119 | 120 | Specify the number of configs with the `-n` option. For a grid search instead of random samples, use the `-a` option (careful, this might generate thousands of configs depending on the hyperparameter space!). 121 | 122 | - Run a single experiment from a config: 123 | 124 | ```bash 125 | python -m deid.experiment /path/to/config.yaml 126 | ``` 127 | 128 | This will output predictions and save a history pickle to an experiment directory inside `env.work_dir`. 129 | 130 | - Or set the `DEID_CONFIG_DIR` variable to the config directory and use the `queue` script to run all experiment configs from the `${DEID_CONFIG_DIR}/todo` directory (they will be processed sequentially and moved to the `${DEID_CONFIG_DIR}/done` directory). 131 | 132 | ```bash 133 | DEID_CONFIG_DIR=/path/to/config/dir ./scripts/queue 134 | ``` 135 | 136 | ## Evaluation 137 | 138 | The evaluation using a [modified version](deid/tools/i2b2) (`2to3`, minor fixes) of the [official evaluation script](https://github.com/kotfic/i2b2_evaluation_scripts) is run automatically in the experiments. You can also call it like this to evaluate a directory of XML predictions: 139 | 140 | ```bash 141 | python -m deid.tools.i2b2.evaluate phi /path/to/predictions /path/to/i2b2_data/validation_xml/ 142 | ``` 143 | -------------------------------------------------------------------------------- /deid/embeddings/matrix.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Tuple, NamedTuple, Union, Optional, Dict 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from . import Embeddings 7 | 8 | 9 | class EmbeddingSimilarity(NamedTuple): 10 | rank: int 11 | word: str 12 | similarity: float 13 | vec: np.ndarray 14 | 15 | 16 | MostSimilarResult = List[EmbeddingSimilarity] 17 | WordOrVec = Union[str, np.ndarray] 18 | 19 | 20 | class Matrix: 21 | """ Transforms a lookup-based Embeddings object into a classical embedding matrix by looking up a fixed vocabulary 22 | and storing the results. The matrix can then be used for distance measuring. 23 | """ 24 | 25 | def __init__(self, lookup_embeddings: Embeddings, vocab: Optional[Iterable[str]] = None, 26 | precomputed_word2ind: Optional[Dict[str, int]] = None, precomputed_matrix: Optional[np.ndarray] = None, 27 | verbose: bool = False) -> None: 28 | """ Initialize the Matrix object. 
29 | 30 | :param lookup_embeddings: the embeddings object used for lookup 31 | :param vocab: an iterable containing the words that should be stored in the matrix 32 | :param precomputed_word2ind: a precomputed word2ind dict, e.g. from the fastText .vec file 33 | :param precomputed_matrix: a precomputed embedding matrix, e.g. from the fastText .vec file 34 | :param verbose: setting this to True will show a progress bar when first looking up embeddings as well as output 35 | means when computing distances 36 | """ 37 | self.verbose = verbose 38 | self.lookup_embeddings = lookup_embeddings 39 | 40 | if vocab is not None: 41 | self._init_from_vocab(lookup_embeddings, vocab=vocab) 42 | elif precomputed_word2ind is not None and precomputed_matrix is not None: 43 | self._init_from_word2ind_and_matrix(precomputed_word2ind, precomputed_matrix) 44 | else: 45 | raise ValueError('The Matrix needs to be initialized either with vocab or word2ind+matrix') 46 | 47 | def _init_from_vocab(self, lookup_embeddings, vocab): 48 | vocab = set(vocab) 49 | self.vocab_size = len(vocab) 50 | self.word2ind = {word: i for i, word in enumerate(vocab)} 51 | self.ind2word = {i: word for i, word in enumerate(vocab)} 52 | self.embedding_matrix = np.zeros((self.vocab_size, lookup_embeddings.size)) 53 | self.is_norm = False 54 | 55 | items: Iterable[Tuple[str, int]] = self.word2ind.items() 56 | if self.verbose: 57 | items = tqdm(items, desc='Looking up embeddings') 58 | for word, ind in items: 59 | looked_up = lookup_embeddings.lookup(word) 60 | if np.count_nonzero(looked_up) > 0: 61 | self.embedding_matrix[ind] = looked_up 62 | else: 63 | # this shouldn't happen anymore 64 | raise RuntimeError(f'Embedding vector for {word} is all zeros') 65 | 66 | def _init_from_word2ind_and_matrix(self, word2ind, matrix): 67 | self.vocab_size = len(word2ind) 68 | self.word2ind = word2ind 69 | self.ind2word = {i: word for word, i in self.word2ind.items()} 70 | self.embedding_matrix = matrix 71 | self.is_norm = True 72 | 73 | def init_norms(self, force: bool = False) -> None: 74 | """ Initializes self.norms with pre-computed L2 normalized vectors for cosine distance computation. 75 | 76 | :param force: setting this to True will update the norms even if they were already computed 77 | :return: None 78 | """ 79 | if not self.is_norm or force: 80 | # noinspection PyAttributeOutsideInit 81 | self.embedding_matrix = self.embedding_matrix / np.sqrt((self.embedding_matrix ** 2).sum(-1))[ 82 | ..., np.newaxis] 83 | self.is_norm = True 84 | 85 | def _most_similar_cosine_measurement(self, vec): 86 | self.init_norms() 87 | normalized_vec = vec / np.linalg.norm(vec) 88 | return np.dot(self.embedding_matrix, normalized_vec) 89 | 90 | def most_similar_cosine(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult: 91 | """ Calculate the cosine distance of the input vector to all vectors in the embedding matrix and return the 92 | most similar ones. 
93 | 94 | :param word_or_vec: the input word or vector 95 | :param n: the number of results to return, or None if all should be returned 96 | :return: a list of MostSimilarResult objects 97 | """ 98 | return self._generic_most_similar(word_or_vec, self._most_similar_cosine_measurement, 99 | higher_is_more_similar=True, n=n) 100 | 101 | def cosine_distance_rank(self, word_or_vec: WordOrVec, word): 102 | return self._generic_rank(word_or_vec, word, self._most_similar_cosine_measurement, higher_is_more_similar=True) 103 | 104 | def cosine_distance(self, vec: np.ndarray, word: str) -> float: 105 | """ Returns the cosine distance between an input word and vector. 106 | 107 | :param vec: the input vector 108 | :param word: the input word 109 | :return: a float between -1 and 1 110 | """ 111 | self.init_norms() 112 | normalized_vec = vec / np.linalg.norm(vec) 113 | return float(np.dot(self.embedding_matrix[self.word2ind[word]], normalized_vec)) 114 | 115 | def most_similar_l2(self, word_or_vec: WordOrVec, n: int = 20) -> MostSimilarResult: 116 | """ Calculate the L2 norm distance of the input vector to all vectors in the embedding matrix and return the 117 | most similar ones. 118 | 119 | :param word_or_vec: the input word or vector 120 | :param n: the number of results to return, or None if all should be returned 121 | :return: a list of (word, distance) pairs, with lower distance meaning more similar 122 | """ 123 | 124 | def measurement(vec): 125 | distances = np.zeros(self.vocab_size) 126 | for i, emb in enumerate(self.embedding_matrix): 127 | distances[i] = np.linalg.norm(vec - emb) 128 | return distances 129 | 130 | return self._generic_most_similar(word_or_vec, measurement, higher_is_more_similar=False, n=n) 131 | 132 | def _lookup_if_needed(self, word_or_vec: WordOrVec) -> np.ndarray: 133 | if type(word_or_vec) == str: 134 | return self.lookup_embeddings.lookup(word_or_vec) 135 | else: 136 | return word_or_vec 137 | 138 | def _generic_most_similar(self, word_or_vec: WordOrVec, measurement, higher_is_more_similar, n: int = 20): 139 | self.init_norms() 140 | vec = self._lookup_if_needed(word_or_vec) 141 | distances = measurement(vec) 142 | assert len(distances) == len(self.embedding_matrix) 143 | if self.verbose: 144 | print('mean distance', np.mean(distances)) 145 | 146 | distances_for_sorting = -distances if higher_is_more_similar else distances 147 | 148 | if n is None or n == len(self.embedding_matrix): 149 | sorted_most_similar_ind = np.argsort(distances_for_sorting) 150 | else: 151 | most_similar_ind = np.argpartition(distances_for_sorting, n)[:n] 152 | sorted_most_similar_ind = most_similar_ind[np.argsort(distances_for_sorting[most_similar_ind])] 153 | 154 | return [EmbeddingSimilarity(rank=rank, 155 | word=self.ind2word[ind], 156 | similarity=distances[ind], 157 | vec=self.embedding_matrix[ind]) 158 | for rank, ind in enumerate(sorted_most_similar_ind, start=1)] 159 | 160 | def _generic_rank(self, word_or_vec: WordOrVec, word, measurement, higher_is_more_similar): 161 | self.init_norms() 162 | vec = self._lookup_if_needed(word_or_vec) 163 | distances = measurement(vec) 164 | distances = -distances if higher_is_more_similar else distances 165 | 166 | word_distance = distances[self.word2ind[word]] 167 | return np.count_nonzero(distances[distances < word_distance]) + 1 168 | -------------------------------------------------------------------------------- /deid/experiment/mtn_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 
2 | import os 3 | import pickle 4 | import random 5 | import sys 6 | 7 | import numpy as np 8 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint 9 | from keras.layers import Input 10 | 11 | from . import experiment_directory 12 | from ..data import TrainingSet, ValidationSet, StratifiedSampling, is_phi_sentence 13 | from ..data.augment import Augment, get as get_strategy 14 | from ..data.batch import IteratorWithEpochLength 15 | from ..data.util import pad_2d_sequences 16 | from ..embeddings import Matrix, get as get_embeddings 17 | from ..env import env 18 | from ..model.adversary import TwoRepresentationsAreSameOriginalDiscriminator 19 | 20 | 21 | def fake_augmented_sentences_batch(X, y, indices, augm_alternatives, fake_alternatives, split_condition): 22 | indices = [i for i in indices if split_condition(X[i], y[i])] 23 | real_sentences = [X[i] for i in indices] 24 | augmented_sentences = [augm_alternatives[ind][0] for ind in indices] 25 | fake_sentences = [random.choice(fake_alternatives[ind]) for ind in indices] 26 | 27 | X_1 = [] 28 | X_2 = [] 29 | y = [] 30 | for real, augm, fake in zip(real_sentences, augmented_sentences, fake_sentences): 31 | X_1 += [augm, augm] 32 | X_2 += [real, fake] 33 | y += [1, 0] 34 | 35 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y) 36 | 37 | 38 | class MTNGenerator(IteratorWithEpochLength): 39 | def __init__(self, generator: IteratorWithEpochLength, dataset, dataset2): 40 | self.generator = generator 41 | self.dataset = dataset 42 | self.dataset2 = dataset2 43 | 44 | def __next__(self): 45 | _, _, indices = next(self.generator) 46 | X_1, X_2, adv_y = fake_augmented_sentences_batch(self.dataset.X, self.dataset.y, indices, 47 | self.dataset.augmented, self.dataset2.augmented, 48 | split_condition=is_phi_sentence) 49 | return [X_1, X_2], adv_y 50 | 51 | @property 52 | def epoch_length(self) -> int: 53 | return self.generator.epoch_length 54 | 55 | 56 | def mtn_evaluation_experiment(config): 57 | print('Loading embeddings...') 58 | embeddings = get_embeddings(config['experiment']['embeddings']) 59 | 60 | name = config['name'] 61 | experiment_dir = experiment_directory(name, config['path']) 62 | 63 | print('Loading matrix...') 64 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 65 | precomputed_matrix=embeddings.precomputed_matrix) 66 | 67 | strategy = get_strategy(config['augment']['strategy'], matrix) 68 | digit_strategy = get_strategy(config['augment']['digit_strategy'], matrix) 69 | adv_strategy = get_strategy('move_to_neighbor-5', matrix) 70 | 71 | augment = Augment(embeddings, strategy=strategy, digit_strategy=digit_strategy, n_augmentations=1) 72 | 73 | augment2 = Augment(embeddings, strategy=adv_strategy, digit_strategy=digit_strategy, 74 | n_augmentations=config['augment']['n_augmentations'], augment_max=1) 75 | 76 | print('Augmenting training set...', flush=True) 77 | tr = TrainingSet(train_set=config['experiment']['train_set'], 78 | embeddings=embeddings, 79 | use_short_sentences=env.use_short_sentences, 80 | limit_documents=env.limit_training_documents, 81 | augment=augment) 82 | 83 | tr2 = TrainingSet(train_set=config['experiment']['train_set'], 84 | embeddings=embeddings, 85 | use_short_sentences=env.use_short_sentences, 86 | limit_documents=env.limit_training_documents, 87 | augment=augment2) 88 | 89 | assert np.all(tr.X[100] == tr2.X[100]) # making sure that the training sets have the same order 90 | 91 | print('Augmenting validation set...', flush=True) 92 | val = 
ValidationSet(validation_set=config['experiment']['validation_set'], 93 | embeddings=embeddings, 94 | label2ind=tr.label2ind, 95 | use_short_sentences=env.use_short_sentences, 96 | limit_documents=env.limit_validation_documents, 97 | augment=augment) 98 | 99 | val2 = ValidationSet(validation_set=config['experiment']['validation_set'], 100 | embeddings=embeddings, 101 | label2ind=tr.label2ind, 102 | use_short_sentences=env.use_short_sentences, 103 | limit_documents=env.limit_validation_documents, 104 | augment=augment2) 105 | 106 | inputs = {'train_representation': Input(shape=(None, embeddings.size)), 107 | 'fake_representation': Input(shape=(None, embeddings.size))} 108 | adversary = TwoRepresentationsAreSameOriginalDiscriminator(inputs, representation_size=embeddings.size, 109 | lstm_size=embeddings.size) 110 | adversary.model.compile(loss=adversary.loss, optimizer='nadam', metrics=['accuracy']) 111 | 112 | batch_size = test_batch_size = 32 113 | train_gen = MTNGenerator(StratifiedSampling(tr.X, tr.y, split_condition=is_phi_sentence, 114 | batch_size=batch_size, yield_indices=True, shuffle=True), tr, tr2) 115 | valid_gen = MTNGenerator(StratifiedSampling(val.X, val.y, split_condition=is_phi_sentence, 116 | batch_size=test_batch_size, yield_indices=True, shuffle=False), val, 117 | val2) 118 | 119 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 120 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 121 | callbacks = [early_stopping, flush] 122 | if env.save_model: 123 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True) 124 | callbacks.append(checkpoint) 125 | 126 | history = adversary.model.fit_generator(train_gen, 127 | epochs=config['training']['train_epochs'], 128 | steps_per_epoch=int(math.ceil(len(tr.X) / batch_size)), 129 | validation_data=valid_gen, 130 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 131 | callbacks=callbacks, 132 | verbose=env.keras_verbose) 133 | 134 | if config['test']['run_test']: 135 | label2ind = tr.label2ind 136 | del tr, tr2, val, val2, train_gen, valid_gen 137 | 138 | if env.save_model: 139 | print('Restoring best weights') 140 | adversary.model.load_weights(os.path.join(experiment_dir, 'model.hdf5')) 141 | 142 | print('Augmenting test set...', flush=True) 143 | 144 | test = ValidationSet(validation_set='test', 145 | embeddings=embeddings, 146 | label2ind=label2ind, 147 | use_short_sentences=env.use_short_sentences, 148 | limit_documents=env.limit_validation_documents, 149 | augment=augment) 150 | 151 | test2 = ValidationSet(validation_set='test', 152 | embeddings=embeddings, 153 | label2ind=label2ind, 154 | use_short_sentences=env.use_short_sentences, 155 | limit_documents=env.limit_validation_documents, 156 | augment=augment2) 157 | test_gen = MTNGenerator(StratifiedSampling(test.X, test.y, split_condition=is_phi_sentence, 158 | batch_size=test_batch_size, yield_indices=True, shuffle=False), test, 159 | test2) 160 | 161 | loss, acc = adversary.model.evaluate_generator(test_gen, int(math.ceil(len(test.X) / test_batch_size))) 162 | print(f'Test loss: {loss}, test acc: {acc}') 163 | history.history['test_loss'] = loss 164 | history.history['test_acc'] = acc 165 | 166 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 167 | print('Saving history to', history_pickle_path) 168 | with open(history_pickle_path, 'wb') as f: 169 | pickle.dump(history.history, f) 170 | 
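
To make the adversary's pairing above concrete, here is a minimal, self-contained sketch of what `fake_augmented_sentences_batch` builds: the augmented view of each sentence appears twice, once paired with its original (label 1) and once with a fake alternative (label 0). This is an editorial illustration, not code from this module; the helper name `toy_pairs` and the fixed sentence shapes are invented, and the real generator pads variable-length sentences with `pad_2d_sequences` and samples fakes with `random.choice`.

```
# Editorial sketch: the adversary must decide whether the second input comes
# from the same original sentence (1) or from a fake alternative (0).
import numpy as np

def toy_pairs(real_sentences, augmented_sentences, fake_sentences):
    x_1, x_2, labels = [], [], []
    for real, augm, fake in zip(real_sentences, augmented_sentences, fake_sentences):
        x_1 += [augm, augm]    # the augmented view is always the first input
        x_2 += [real, fake]    # paired with the original ...
        labels += [1, 0]       # ... and with a fake alternative
    return np.stack(x_1), np.stack(x_2), np.array(labels)

# three toy "sentences" of 4 tokens with 5-dimensional embeddings each
real = [np.random.rand(4, 5) for _ in range(3)]
augm = [s + 0.01 * np.random.randn(4, 5) for s in real]   # lightly perturbed views
fake = [np.random.rand(4, 5) for _ in range(3)]           # unrelated alternatives
x_1, x_2, y = toy_pairs(real, augm, fake)
print(x_1.shape, x_2.shape, y)   # (6, 4, 5) (6, 4, 5) [1 0 1 0 1 0]
```
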
-------------------------------------------------------------------------------- /deid/experiment/basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import numpy as np 7 | from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint 8 | 9 | from . import DeidentificationEvaluationCallback, evaluate_deid_performance, experiment_directory 10 | from ..data import TrainingSet, ValidationSet, BatchGeneratorWithExtraFeatures, StratifiedSamplingWithExtraFeatures, \ 11 | is_phi_sentence 12 | from ..data.augment import Augment, get as get_strategy 13 | from ..data.class_weight import get as get_class_weight 14 | from ..data.feature import get as get_feature 15 | from ..data.util import compounding 16 | from ..embeddings import PrecomputedEmbeddings, Matrix, get as get_embeddings 17 | from ..env import env 18 | from ..model import get as get_model 19 | from ..model.optimizer import get as get_optimizer 20 | 21 | 22 | def basic_experiment(config): 23 | name = config['name'] 24 | batch_size = config['training']['batch_size'] 25 | test_batch_size = config['training']['test_batch_size'] 26 | if test_batch_size is None: 27 | test_batch_size = batch_size 28 | test_weights = config['test']['test_weights'] 29 | 30 | experiment_dir = experiment_directory(name, config['path']) 31 | 32 | print('Loading embeddings...') 33 | embeddings = get_embeddings(config['experiment']['embeddings']) 34 | print('Done.') 35 | 36 | if config['augment'] is not None and test_weights is None: 37 | if isinstance(embeddings, PrecomputedEmbeddings): 38 | matrix = Matrix(embeddings, precomputed_word2ind=embeddings.precomputed_word2ind, 39 | precomputed_matrix=embeddings.precomputed_matrix) 40 | strategy_kwargs = {'matrix': matrix} 41 | else: 42 | strategy_kwargs = {} 43 | 44 | strategy = get_strategy(config['augment']['strategy'], **strategy_kwargs) 45 | digit_strategy = get_strategy(config['augment']['digit_strategy'], **strategy_kwargs) 46 | augment = Augment(embeddings=embeddings, strategy=strategy, digit_strategy=digit_strategy, 47 | **config['augment']['augment_args']) 48 | else: 49 | augment = None 50 | 51 | if config['experiment']['extra_features'] is None or len(config['experiment']['extra_features']) == 0: 52 | extra_features = [] 53 | else: 54 | extra_features = [get_feature(identifier) for identifier in config['experiment']['extra_features']] 55 | 56 | tr = TrainingSet(train_set=config['experiment']['train_set'], 57 | embeddings=embeddings, 58 | use_short_sentences=env.use_short_sentences, 59 | limit_documents=env.limit_training_documents, 60 | binary_classification=config['experiment']['binary_classification'], 61 | hipaa_only=config['experiment']['hipaa_only'], 62 | augment=augment, 63 | extra_features=extra_features) 64 | 65 | model = get_model(config['experiment']['model'])(name=name, 66 | input_size=embeddings.size, 67 | extra_input_size=tr.X_extra_size, 68 | output_size=tr.output_size, 69 | optimizer=get_optimizer(config['training']['optimizer'])( 70 | **config['training']['optimizer_args']), 71 | **config['model_args']) 72 | 73 | if test_weights is None: 74 | train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir) 75 | else: 76 | model.load_weights(test_weights) 77 | 78 | if config['test']['run_test']: 79 | test_set = config['test']['test_set'] 80 | if test_set is None: 81 | test_set = 'test' 82 | evaluate_deid_performance(model=model, 
batch_size=test_batch_size, embeddings=embeddings, 83 | label2ind=tr.label2ind, ind2label=tr.ind2label, 84 | test_set=test_set, experiment_dir=experiment_dir, 85 | binary_classification=config['experiment']['binary_classification'], 86 | hipaa_only=config['experiment']['hipaa_only'], 87 | extra_features=extra_features, epoch=99) 88 | 89 | 90 | def train_and_validate(model, config, tr, embeddings, extra_features, batch_size, test_batch_size, experiment_dir): 91 | val = ValidationSet(validation_set=config['experiment']['validation_set'], 92 | embeddings=embeddings, 93 | label2ind=tr.label2ind, 94 | use_short_sentences=env.use_short_sentences, 95 | limit_documents=env.limit_validation_documents, 96 | binary_classification=config['experiment']['binary_classification'], 97 | hipaa_only=config['experiment']['hipaa_only'], 98 | extra_features=extra_features) 99 | 100 | if config['augment'] is not None and config['augment']['include_original']: 101 | tr_X, tr_y, tr_X_extra = tr.data_with_augmented 102 | augment_training_generator = None 103 | else: 104 | tr_X, tr_y, tr_X_extra = tr.X, tr.y, tr.X_extra 105 | augment_training_generator = tr.augmented 106 | 107 | print('Size of the training set:', len(tr_X), 'with maxlen:', tr.maxlen) 108 | compound = config['training']['batch_size_compound'] 109 | if compound is not None and compound != 0 and compound < batch_size: 110 | training_batch_size = compounding(1, batch_size, compound) 111 | else: 112 | training_batch_size = batch_size 113 | 114 | if config['training']['batch_mode'] == 'stratified': 115 | train_gen_class, train_gen_args = StratifiedSamplingWithExtraFeatures, {'split_condition': is_phi_sentence} 116 | else: 117 | train_gen_class, train_gen_args = BatchGeneratorWithExtraFeatures, {} 118 | 119 | training_generator = train_gen_class(tr_X, tr_y, tr_X_extra, 120 | batch_size=training_batch_size, 121 | augment=augment_training_generator, **train_gen_args) 122 | 123 | validation_generator = BatchGeneratorWithExtraFeatures(val.X, val.y, val.X_extra, test_batch_size, 124 | shuffle=False) 125 | 126 | if config['experiment']['class_weight'] is not None: 127 | class_weight = get_class_weight(config['experiment']['class_weight'])(tr.output_size, tr_y) 128 | else: 129 | class_weight = None 130 | 131 | early_stopping = EarlyStopping(monitor='val_loss', patience=config['training']['early_stopping_patience']) 132 | flush = LambdaCallback(on_epoch_end=lambda epoch, logs: sys.stdout.flush()) 133 | evaluation = DeidentificationEvaluationCallback(model, batch_size=test_batch_size, embeddings=embeddings, 134 | label2ind=tr.label2ind, ind2label=tr.ind2label, 135 | test_set=config['experiment']['validation_set'], 136 | experiment_dir=experiment_dir, 137 | evaluate_every=config['training']['i2b2_evaluate_every'], 138 | binary_classification=config['experiment'][ 139 | 'binary_classification'], 140 | hipaa_only=config['experiment']['hipaa_only'], 141 | extra_features=extra_features) 142 | 143 | callbacks = [early_stopping, evaluation, flush] 144 | if env.save_model: 145 | checkpoint = ModelCheckpoint(os.path.join(experiment_dir, 'model.hdf5'), save_best_only=True) 146 | callbacks.append(checkpoint) 147 | 148 | history = model.fit_generator(training_generator, 149 | epochs=config['training']['train_epochs'], 150 | steps_per_epoch=int(math.ceil(len(tr_X) / batch_size)), 151 | validation_data=validation_generator, 152 | validation_steps=int(math.ceil(len(val.X) / test_batch_size)), 153 | class_weight=class_weight, 154 | callbacks=callbacks, 155 | 
verbose=env.keras_verbose, 156 | use_multiprocessing=True) 157 | if env.save_model: 158 | best_epoch = np.argmin(history.history['val_loss']) + 1 # epoch numbering is 1-based 159 | print(f'Resetting to weights from epoch {best_epoch:02d}') 160 | model.load_weights(os.path.join(experiment_dir, 'model.hdf5')) 161 | 162 | history_pickle_path = os.path.join(experiment_dir, 'history.pickle') 163 | print('Saving history to', history_pickle_path) 164 | with open(history_pickle_path, 'wb') as f: 165 | pickle.dump(history.history, f) 166 | -------------------------------------------------------------------------------- /deid/tools/i2b2/README.md: -------------------------------------------------------------------------------- 1 | **i2b2 2014 Evaluation Script** 2 | 3 | This script is distributed as a part of the i2b2 2014 Cardiac Risk and 4 | Protected Health Information (PHI) tasks. 5 | 6 | If you would like to contribute to this project, pull requests are welcome. 7 | Please see: https://help.github.com/articles/fork-a-repo for instructions 8 | on how to make a fork of this repository, and 9 | https://help.github.com/articles/using-pull-requests for instructions on 10 | making a pull request. Suggestions for improvements, bugs or feature requests 11 | may be directed to the i2b2 evaluation scripts' issues page located at: 12 | https://github.com/kotfic/i2b2_evaluation_scripts/issues 13 | 14 | _Setup_ 15 | 16 | This script also requires the following Python packages: 17 | lxml version 3.3.1 18 | numpy version 1.8.0 19 | 20 | If you get an error when running the script, please make sure that these 21 | are installed and accessible to your Python installation. 22 | 23 | 24 | _Running the script_ 25 | 26 | This script is intended to be used via the 27 | command line: 28 | python evaluate.py [cr|phi] [FLAGS] SYSTEM GOLD 29 | 30 | Where 'cr' produces Precision, Recall and F1 (P/R/F1) measures for the 31 | cardiac risk task and 'phi' produces P/R/F1 for the PHI task. SYSTEM and GOLD 32 | may be individual files representing system output in the case of SYSTEM and 33 | the gold standard in the case of GOLD. SYSTEM and GOLD may also be 34 | directories, in which case all files in SYSTEM will be compared to files in the 35 | GOLD directory based on their file names. See below for more information 36 | on the different output the cr/phi flag produces. 37 | 38 | 39 | 40 | _File name restrictions_ 41 | 42 | File names MUST be of the form: 43 | XXX-YY.xml where XXX is the patient id, and YY is the document id. The 44 | files from your system runs are matched to the gold standard file by 45 | file name alone. If your system outputs file names in a different format, 46 | you will need to either modify your system or this script.
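
As an illustration of this matching convention, SYSTEM and GOLD files can be paired purely by their XXX-YY.xml names. This is an editorial sketch, not code from evaluate.py; the function name and the directory arguments are placeholders.

```
# Editorial sketch of the file-name matching convention described above;
# not the evaluation script's own implementation.
from pathlib import Path

def pair_by_name(system_dir, gold_dir):
    gold = {p.name: p for p in Path(gold_dir).glob('*.xml')}
    pairs = []
    for sys_file in sorted(Path(system_dir).glob('*.xml')):
        if sys_file.name in gold:        # match by file name alone, as required above
            pairs.append((sys_file, gold[sys_file.name]))
        else:
            print('no gold file for', sys_file.name)
    return pairs

# pairs = pair_by_name('system/', 'gold/')   # placeholder directory names
```
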
47 | 48 | 49 | _Output for Risk Factor Track_ 50 | 51 | To compare your system output for the Risk Factor track, run the following 52 | command for individual files: 53 | 54 | python evaluate.py cr {system.xml} {gold.xml} 55 | (replace the file names in {}s with the names of your actual files) 56 | 57 | or, to run the script on directories of files: 58 | python evaluate.py cr {system}/ {gold}/ 59 | (again, replace the folder names in {}s with the names of your actual folders) 60 | 61 | Running one of these versions will produce output in this format: 62 | 63 | ``` 64 | (# of files) Measure Macro (SD) Micro 65 | --------------------------------------------------------------------------- 66 | Total Precision 1.0 (0.0) 1.0 67 | Recall 1.0 (0.0) 1.0 68 | F1 1.0 1.0 69 | ``` 70 | 71 | The script evaluates the accuracy of your tags based on tag type 72 | and all the attributes (except ID). If you want to get more details 73 | about the output of your system, such as which attributes it is 74 | getting right/wrong, you can use the more experimental flags. Please see 75 | the evaluate.py script itself for more information on the flags. 76 | 77 | 78 | _Output for De-identification Track_ 79 | 80 | To compare your system output for the de-identification track, run the following 81 | command on individual files: 82 | 83 | python evaluate.py phi {system.xml} {gold.xml} 84 | (replace the file names in {}s with the names of your actual files) 85 | 86 | or, to run the script on directories of files: 87 | python evaluate.py phi {system}/ {gold}/ 88 | (again, replace the folder names in {}s with the names of your actual folders) 89 | 90 | 91 | Running one of these versions will produce output that looks like this: 92 | 93 | ``` 94 | Strict (521) Measure Macro (SD) Micro 95 | --------------------------------------------------------------------------- 96 | Total Precision 0.6635 (0.11) 0.6537 97 | Recall 0.4906 (0.12) 0.4988 98 | F1 0.5641 0.5658 99 | 100 | 101 | Relaxed (521) Measure Macro (SD) Micro 102 | --------------------------------------------------------------------------- 103 | Total Precision 0.897 (0.086) 0.9047 104 | Recall 0.6663 (0.15) 0.6903 105 | F1 0.7646 0.7831 106 | 107 | 108 | HIPAA Strict (521) Measure Macro (SD) Micro 109 | --------------------------------------------------------------------------- 110 | Total Precision 0.7406 (0.098) 0.7225 111 | Recall 0.7406 (0.098) 0.7225 112 | F1 0.7406 0.7225 113 | 114 | 115 | HIPAA Relaxed (521) Measure Macro (SD) Micro 116 | --------------------------------------------------------------------------- 117 | Total Precision 1.0 (0.0) 1.0 118 | Recall 1.0 (0.0) 1.0 119 | F1 1.0 1.0 120 | ``` 121 | 122 | A few notes to explain this output: 123 | - The "(521)" represents the number of files the script was run on 124 | - "Strict" evaluations require that the offsets for the system outputs match *exactly* 125 | - "Relaxed" evaluations allow for the "end" part of the offsets to be off by 2--this allows for variations in including "'s" and other endings that many systems will ignore due to tokenization 126 | - "HIPAA" evaluations include only the tags required by a strict interpretation of the HIPAA guidelines.
See the list below for which tags are included in this evaluation 127 | 128 | 129 | 130 | _HIPAA-compliant PHI_ 131 | 132 | - NAME/PATIENT 133 | - AGE 134 | - LOCATION/CITY 135 | - LOCATION/STREET 136 | - LOCATION/ZIP 137 | - LOCATION/ORGANIZATION 138 | - DATE 139 | - CONTACT/PHONE 140 | - CONTACT/FAX 141 | - CONTACT/EMAIL 142 | - ID/SSN 143 | - ID/MEDICALRECORD 144 | - ID/HEALTHPLAN 145 | - ID/ACCOUNT 146 | - ID/LICENSE 147 | - ID/VEHICLE 148 | - ID/DEVICE 149 | - ID/BIOID 150 | - ID/IDNUM 151 | 152 | 153 | _Verbose flag_ 154 | 155 | To get document-by-document information about the accuracy of your tags, you can use the 156 | "-v" or "--verbose" flag. For example: 157 | 158 | python evaluate.py cr -v system/ gold/ 159 | 160 | 161 | _Advanced usage_ 162 | 163 | Some additional functionality is made available for testing and error 164 | analysis. This functionality is provided AS IS in the hope that it will 165 | be useful. It should be considered 'experimental' at best, may be bug-prone, 166 | and will not be explicitly supported; bug reports and pull requests 167 | are nevertheless welcome. 168 | 169 | Advanced Flags: 170 | 171 | --filter [TAG ATTRIBUTES] :: run P/R/F1 measures in either summary or verbose 172 | mode (see -v) for the list of attributes defined 173 | by TAG ATTRIBUTES. This may be a comma separated 174 | list of tag names and attribute values. For more 175 | see Advanced Examples. 176 | --conjunctive :: If multiple values are passed to filter as a comma separated 177 | list, treat them as a series of AND based filters instead of 178 | a series of OR based filters 179 | --invert :: run P/R/F1 on the inverted set of tags defined by TAG ATTRIBUTES 180 | in the --filter tag (see --filter). 181 | 182 | Advanced Examples: 183 | 184 | python evaluate.py cr --filter MEDICATION system/ gold/ 185 | 186 | Evaluate system output in system/ folder against gold/ folder considering 187 | only MEDICATION tags 188 | 189 | python evaluate.py cr --filter CAD,OBESE system/ gold/ 190 | 191 | Evaluate system output in system/ folder against gold/ folder considering 192 | only CAD or OBESE tags. Comma-separated lists to the --filter flag are 193 | conjoined via OR. 194 | 195 | python evaluate.py cr --filter "CAD,before DCT" system/ gold/ 196 | 197 | Evaluate system output in system/ folder against gold/ folder considering 198 | only CAD *OR* tags with a time attribute of before DCT. This is probably 199 | not what you want when filtering; see the next example. 200 | 201 | python evaluate.py cr --conjunctive \ 202 | --filter "CAD,before DCT" system/ gold/ 203 | 204 | Evaluate system output in system/ folder against gold/ folder considering 205 | CAD tags *AND* tags with a time attribute of before DCT. 206 | 207 | python evaluate.py cr --invert \ 208 | --filter MEDICATION system/ gold/ 209 | 210 | Evaluate system output in system/ folder against gold/ folder considering 211 | any tag which is NOT a MEDICATION tag.
212 | 213 | python evaluate.py cr --invert \ 214 | --conjunctive \ 215 | --filter "CAD,before DCT" system/ gold/ 216 | 217 | Evaluate system output in system/ folder against gold/ folder considering 218 | any tag which is NOT CAD and with a time attribute of 'before DCT' 219 | -------------------------------------------------------------------------------- /deid/data/batch.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import random 4 | from typing import Generic, TypeVar, Optional 5 | from typing import Sequence, Union, Tuple, Callable, Dict, Iterator, List 6 | 7 | import numpy as np 8 | 9 | from .util import pad_2d_sequences, peek 10 | 11 | X_type = TypeVar('X_type') 12 | y_type = TypeVar('y_type') 13 | 14 | TwoArrays = Tuple[np.ndarray, np.ndarray] 15 | ThreeArrays = Tuple[np.ndarray, np.ndarray, np.ndarray] 16 | 17 | 18 | class IteratorWithEpochLength(Iterator): 19 | def __next__(self) -> Union[TwoArrays, ThreeArrays]: 20 | raise NotImplementedError 21 | 22 | @property 23 | def epoch_length(self) -> int: 24 | raise NotImplementedError 25 | 26 | 27 | class IteratorWithEpochLengthImpl(Generic[X_type, y_type], IteratorWithEpochLength): 28 | def __init__(self, 29 | X: Sequence[X_type], 30 | y: Sequence[y_type], 31 | total_size: int, 32 | batch_size_iter: Iterator[int], 33 | yield_incomplete_batches: bool = True, 34 | yield_indices: bool = False, 35 | augment: Optional[Dict[int, Sequence[X_type]]] = None, 36 | augment_include_original: bool = True) -> None: 37 | assert len(X) == len(y) 38 | self.X, self.y = X, y 39 | self.augment = augment 40 | self.augment_include_original = augment_include_original 41 | 42 | self.total_size = total_size 43 | self.batch_size_iter = batch_size_iter 44 | self.yield_indices = yield_indices 45 | self.yield_incomplete_batches = yield_incomplete_batches 46 | self.init_epoch() 47 | 48 | def __next__(self) -> Union[TwoArrays, ThreeArrays]: 49 | if self.batch_number == self.epoch_length: 50 | self.init_epoch() 51 | 52 | current_batch_size = self.epoch_batch_sizes[self.batch_number] 53 | end = min(self.cursor + current_batch_size, self.total_size) 54 | batch_ind = self.select_batch_ind(self.cursor, end) 55 | 56 | if self.augment is not None: 57 | if self.augment_include_original: 58 | batch_X = [random.choice(self.augment[i] + [self.X[i]]) for i in batch_ind] 59 | else: 60 | batch_X = [random.choice(self.augment[i]) if len(self.augment[i]) > 0 else self.X[i] for i in batch_ind] 61 | else: 62 | batch_X = [self.X[i] for i in batch_ind] 63 | batch_y = [self.y[i] for i in batch_ind] 64 | self.cursor += current_batch_size 65 | self.batch_number += 1 66 | 67 | batch_X, batch_y = pad_2d_sequences(batch_X), pad_2d_sequences(batch_y) 68 | if self.yield_indices: 69 | return batch_X, batch_y, batch_ind 70 | else: 71 | return batch_X, batch_y 72 | 73 | def select_batch_ind(self, cursor, end) -> np.ndarray: 74 | raise NotImplementedError 75 | 76 | def __iter__(self): 77 | return self 78 | 79 | @property 80 | def epoch_length(self) -> int: 81 | return len(self.epoch_batch_sizes) 82 | 83 | # noinspection PyAttributeOutsideInit 84 | def init_epoch(self): 85 | self.batch_number = self.cursor = 0 86 | self.epoch_batch_sizes = self._make_epoch_batch_sizes(self.total_size) 87 | 88 | def _make_epoch_batch_sizes(self, total_size): 89 | """ Take items from the batch size iter until they make an epoch.""" 90 | result = [] 91 | seen = 0 92 | while seen < total_size: 93 | if self.yield_incomplete_batches: 94 | 
size = min(next(self.batch_size_iter), total_size - seen) 95 | seen += size 96 | result.append(size) 97 | else: 98 | size, self.batch_size_iter = peek(self.batch_size_iter) 99 | if seen + size > total_size: 100 | break 101 | size = next(self.batch_size_iter) 102 | seen += size 103 | result.append(size) 104 | 105 | assert seen == total_size if self.yield_incomplete_batches else seen <= total_size 106 | return result 107 | 108 | 109 | class BatchGenerator(IteratorWithEpochLengthImpl): 110 | def __init__(self, 111 | X: Sequence[X_type], 112 | y: Sequence[y_type], 113 | batch_size: Union[int, Iterator[int]], 114 | shuffle: bool = True, 115 | **kwargs) -> None: 116 | 117 | self.shuffle = shuffle 118 | 119 | if isinstance(batch_size, int): 120 | batch_size_iter = itertools.repeat(batch_size) 121 | else: 122 | batch_size_iter = batch_size 123 | super().__init__(X, y, total_size=len(X), batch_size_iter=batch_size_iter, **kwargs) 124 | 125 | # noinspection PyAttributeOutsideInit 126 | def init_epoch(self): 127 | super().init_epoch() 128 | if self.shuffle: 129 | self.shuffled_ind = np.random.permutation(np.arange(len(self.X))) 130 | else: 131 | self.shuffled_ind = np.arange(len(self.X)) 132 | 133 | def select_batch_ind(self, cursor, end): 134 | return self.shuffled_ind[cursor:end] 135 | 136 | 137 | class BatchGeneratorWithExtraFeatures(BatchGenerator): 138 | def __init__(self, 139 | X: Sequence[X_type], 140 | y: Sequence[y_type], 141 | X_extra, 142 | batch_size: Union[int, Iterator[int]], 143 | **kwargs) -> None: 144 | self.X_extra = X_extra 145 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs) 146 | 147 | def __next__(self): 148 | X, y, ind = super().__next__() 149 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y 150 | 151 | 152 | class StratifiedSampling(IteratorWithEpochLengthImpl): 153 | def __init__(self, 154 | X: Sequence[X_type], 155 | y: Sequence[y_type], 156 | batch_size: Union[int, Iterator[int]], 157 | split_condition: Callable[[X_type, y_type], bool], 158 | shuffle: bool = False, 159 | **kwargs) -> None: 160 | self.X_pos_ind, self.X_neg_ind = self.split_indices(X, y, split_condition) 161 | self.shorter_partition_size = min(len(self.X_pos_ind), len(self.X_neg_ind)) 162 | 163 | self.shuffle = shuffle 164 | 165 | if isinstance(batch_size, int): 166 | batch_size_iter = itertools.repeat(math.ceil(batch_size / 2)) 167 | else: 168 | double_batch_size_iter: Iterator[int] = batch_size 169 | batch_size_iter = (math.ceil(size / 2) for size in double_batch_size_iter) 170 | 171 | super().__init__(X, y, total_size=self.shorter_partition_size, batch_size_iter=batch_size_iter, **kwargs) 172 | 173 | # noinspection PyAttributeOutsideInit 174 | def init_epoch(self): 175 | super().init_epoch() 176 | if self.shuffle: 177 | self.shuffled_pos = np.random.permutation(self.X_pos_ind) 178 | self.shuffled_neg = np.random.permutation(self.X_neg_ind) 179 | else: 180 | self.shuffled_pos, self.shuffled_neg = self.X_pos_ind, self.X_neg_ind 181 | 182 | def select_batch_ind(self, cursor, end): 183 | return np.concatenate((self.shuffled_pos[cursor:end], self.shuffled_neg[cursor:end]), axis=0) 184 | 185 | @staticmethod 186 | def split_indices(X: Sequence[X_type], 187 | y: Sequence[y_type], 188 | split_condition: Callable[[X_type, y_type], bool]) -> Tuple[Sequence[int], Sequence[int]]: 189 | pos: List[int] = [] 190 | neg: List[int] = [] 191 | for i in range(len(X)): 192 | (pos if split_condition(X[i], y[i]) else neg).append(i) 193 | return pos, neg 194 | 195 | 196 | class 
StratifiedSamplingWithExtraFeatures(StratifiedSampling): 197 | def __init__(self, 198 | X: Sequence[X_type], 199 | y: Sequence[y_type], 200 | X_extra, 201 | batch_size: Union[int, Iterator[int]], 202 | **kwargs) -> None: 203 | self.X_extra = X_extra 204 | super().__init__(X, y, batch_size=batch_size, yield_indices=True, **kwargs) 205 | 206 | def __next__(self): 207 | X, y, ind = super().__next__() 208 | return [X, pad_2d_sequences([self.X_extra[i] for i in ind])], y 209 | 210 | 211 | def fake_sentences_batch(X: np.ndarray, 212 | y: np.ndarray, 213 | indices: np.ndarray, 214 | alternatives: Dict[int, Sequence[np.ndarray]], 215 | split_condition: Callable[[np.ndarray, np.ndarray], bool]) -> ThreeArrays: 216 | """ Generate a batch of real and fake/augmented sentence pairs. 217 | 218 | :param X: the complete X array 219 | :param y: the complete y array 220 | :param indices: the indices of this batch 221 | :param alternatives: a dictionary (index -> sequence of alternatives) providing fake alternatives for each index 222 | :param split_condition: a condition determining if the sentence should be used 223 | :return: A batch `X_1, X_2, y` 224 | """ 225 | 226 | indices = [i for i in indices if split_condition(X[i], y[i])] 227 | real_sentences = [X[i] for i in indices] 228 | fake_sentences = [random.choice(alternatives[ind]) for ind in indices] 229 | 230 | X_1: List[np.ndarray] = [] 231 | X_2: List[np.ndarray] = [] 232 | y = [] 233 | for real, fake in zip(real_sentences, fake_sentences): 234 | X_1 += [real, real] 235 | X_2 += [real, fake] 236 | y += [1, 0] 237 | 238 | return pad_2d_sequences(X_1), pad_2d_sequences(X_2), np.array(y) 239 | --------------------------------------------------------------------------------
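
A stripped-down illustration of the idea behind the StratifiedSampling generator above: each batch draws half of its indices from the partition that satisfies the split condition (for example, PHI-containing sentences) and half from the rest, and an epoch ends when the shorter partition is exhausted. This is an editorial sketch, not the class itself; the function name `stratified_batches` and the boolean-label setup are invented for the example.

```
# Editorial sketch of stratified batching, not deid.data.batch.StratifiedSampling.
import numpy as np

def stratified_batches(labels, batch_size, seed=0):
    rng = np.random.RandomState(seed)
    pos = rng.permutation(np.flatnonzero(labels))    # indices meeting the split condition
    neg = rng.permutation(np.flatnonzero(~labels))   # everything else
    half = batch_size // 2
    n_batches = min(len(pos), len(neg)) // half      # epoch length follows the shorter partition
    for b in range(n_batches):
        yield np.concatenate([pos[b * half:(b + 1) * half],
                              neg[b * half:(b + 1) * half]])

labels = np.array([True] * 4 + [False] * 20)   # e.g. 4 PHI sentences, 20 without
for batch in stratified_batches(labels, batch_size=4):
    print(batch)   # each batch holds two "positive" and two "negative" indices
```

In the experiments above this style of sampling appears to be used to keep PHI and non-PHI sentences balanced within each batch, even though PHI-bearing sentences are a minority of the corpus.
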