├── requirements.txt
├── bash_scripts
    ├── analyze_results.sh
    ├── log_probability.sh
    ├── pregen_embs.sh
    ├── finetune_on_target.sh
    ├── train_baseline_clinical_BERT.sh
    ├── train_adv_clinical_BERT.sh
    ├── run_clinical_targets.py
    └── data_processing_pipeline.sh
├── .gitignore
├── scripts
    ├── statistical_significance.py
    ├── gradient_reversal.py
    ├── Constants.py
    ├── group_sents.py
    ├── pregen_embeddings.py
    ├── predict_missing.py
    ├── get_data.py
    ├── utils.py
    ├── log_probability_bias_scores.py
    ├── sentence_tokenization.py
    ├── analyze_results.py
    ├── heuristic_tokenize.py
    ├── make_targets.py
    ├── readers.py
    ├── finetune_on_pregenerated.py
    ├── run_classifier_dataset_utils.py
    ├── pregenerate_training_data.py
    └── adversarial_finetune_on_pregen.py
├── fill_in_blanks_examples
    ├── attributes.csv
    └── templates.txt
├── README.md
├── LICENSE
└── notebooks
    └── GetBasePrevs.ipynb


/requirements.txt:
--------------------------------------------------------------------------------
 1 | matplotlib==2.2.2
 2 | numpy==1.16.1
 3 | pandas==0.25.0
 4 | pyodbc==4.0.23
 5 | pytorch-pretrained-bert==0.6.2
 6 | PyYAML==5.1
 7 | scikit-learn==0.21.2
 8 | seaborn==0.8.1
 9 | torch==1.3.0
10 | torchvision==0.4.1
11 | psycopg2-binary==2.8.3
12 | nltk==3.4.5
13 | openpyxl==2.5.3
14 | spacy==2.2.1
15 | scispacy==0.2.2
16 | https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz
17 | 


--------------------------------------------------------------------------------
/bash_scripts/analyze_results.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #SBATCH --partition cpu 
 3 | #SBATCH -c 2
 4 | #SBATCH --output bootstrap%A.log
 5 | #SBATCH --mem 50gb
 6 | 
 7 | set -e
 8 | source activate hurtfulwords
 9 | 
10 | BASE_DIR="/h/haoran/projects/HurtfulWords"
11 | OUTPUT_DIR="/h/haoran/projects/HurtfulWords/data/"
12 | cd "$BASE_DIR/scripts"
13 | 
14 | python analyze_results.py \
15 | 	--models_path "${OUTPUT_DIR}/models/finetuned/" \
16 | 	--set_to_use "test" \
17 | 	--bootstrap \
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | MANIFEST
 2 | build
 3 | dist
 4 | data
 5 | _build
 6 | docs/man/*.gz
 7 | docs/source/api/generated
 8 | docs/source/config.rst
 9 | docs/gh-pages
10 | notebook/static/components
11 | notebook/static/style/*.min.css*
12 | notebook/static/*/js/built/
13 | notebook/static/*/built/
14 | notebook/static/built/
15 | notebook/static/*/js/main.min.js*
16 | notebook/static/lab/*bundle.js
17 | node_modules
18 | *.py[co]
19 | __pycache__
20 | *.egg-info
21 | *~
22 | *.bak
23 | .ipynb_checkpoints
24 | .tox
25 | .DS_Store
26 | \#*#
27 | .#*
28 | .coverage
29 | src
30 | 
31 | *.swp
32 | *.map
33 | .idea/
34 | config.rst
35 | venv
36 | .venv
37 | 
38 | *.pkl
39 | *.h5
40 | # *.csv
41 | *.h5
42 | *.log
43 | log.txt
44 | BERT_DeBias/*.sh
45 | *.out
46 | *.sbatch
47 | 
48 | figures/
49 | figures/*
50 | 


--------------------------------------------------------------------------------
/scripts/statistical_significance.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | from scipy import stats
 4 | import pandas as pd
 5 | 
 6 | for fpath in sys.argv[1:]:
 7 |     df = pd.read_csv(fpath, sep='\t')
 8 |     categories = np.unique(df.categories)
 9 | 
10 |     for cat in categories:
11 |         tmp = df[df.categories == cat]
12 |         m = tmp[tmp.demographic == 'male'].log_probs
13 |         f = tmp[tmp.demographic == 'female'].log_probs
14 | 
15 |         m_mean = np.mean(m)
16 |         f_mean = np.mean(f)
17 | 
18 |         wilcoxon = stats.wilcoxon(m, f)
19 | 
20 |         print('****', cat, '****')
21 |         print('male mean,\t', m_mean)
22 |         print('female mean,\t', f_mean)
23 | 
24 |         print("Test statistic,\t", wilcoxon[0])
25 |         print("p-value,\t", wilcoxon[1])
26 |         print()
27 | 


--------------------------------------------------------------------------------
/bash_scripts/log_probability.sh:
--------------------------------------------------------------------------------
 1 | set -e 
 2 | source activate hurtfulwords
 3 | 
 4 | BASE_DIR="/h/haoran/projects/HurtfulWords/" 
 5 | OUTPUT_DIR="/h/haoran/projects/HurtfulWords/data/"
 6 | #MODEL_NAME="baseline_clinical_BERT_1_epoch_512"
 7 | #MODEL_NAME="adv_clinical_BERT_1_epoch_512" 
 8 | MODEL_NAME="SciBERT"
 9 | 
10 | cd "$BASE_DIR/scripts"
11 | 
12 | python log_probability_bias_scores.py \
13 |     --model "${OUTPUT_DIR}/models/${MODEL_NAME}/" \
14 |     --demographic 'GEND' \
15 |     --template_file "${BASE_DIR}/fill_in_blanks_examples/templates.txt" \
16 |     --attributes_file "${BASE_DIR}/fill_in_blanks_examples/attributes.csv" \
17 |     --out_file "${OUTPUT_DIR}/${MODEL_NAME}_log_scores.tsv" 
18 |  
19 | python statistical_significance.py "${OUTPUT_DIR}/${MODEL_NAME}_log_scores.tsv" > "${OUTPUT_DIR}/${MODEL_NAME}_log_score_significance.txt"
20 | 
21 | 


--------------------------------------------------------------------------------
/bash_scripts/pregen_embs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #SBATCH --partition t4 
 3 | #SBATCH --gres gpu:2
 4 | #SBATCH -c 8
 5 | #SBATCH --output pregen_embs_%A.log
 6 | #SBATCH --mem 85gb
 7 | 
 8 | set -e
 9 | source activate hurtfulwords
10 | 
11 | BASE_DIR="/h/haoran/projects/HurtfulWords"
12 | OUTPUT_DIR="/h/haoran/projects/HurtfulWords/data/"
13 | cd "$BASE_DIR/scripts"
14 | mkdir -p "$OUTPUT_DIR/pregen_embs/"
15 | emb_method='cat4'
16 | 
17 | for target in inhosp_mort phenotype_first phenotype_all; do
18 | 	for model in baseline_clinical_BERT_1_epoch_512 adv_clinical_BERT_1_epoch_512; do
19 | 		python pregen_embeddings.py \
20 | 		    --df_path "$OUTPUT_DIR/finetuning/$target"\
21 | 		    --model "$OUTPUT_DIR/models/$model" \
22 | 		    --output_path "${OUTPUT_DIR}/pregen_embs/pregen_${model}_${emb_method}_${target}" \
23 | 		    --emb_method $emb_method
24 | 	done
25 | done
26 | 
27 | 


--------------------------------------------------------------------------------
/bash_scripts/finetune_on_target.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition t4 
 3 | #SBATCH --gres gpu:1
 4 | #SBATCH -c 8
 5 | #SBATCH --output=finetune_%A.out
 6 | #SBATCH --mem 60gb
 7 | 
 8 | # $1 - target type {inhosp_mort, phenotype_first, phenotype_all}
 9 | # $2 - BERT model name {baseline_clinical_BERT_1_epoch_512, adv_clinical_BERT_1_epoch_512}
10 | # $3 - target column name within the dataframe, ex: "Shock", "any_acute"
11 | 
12 | set -e 
13 | source activate hurtfulwords
14 | 
15 | BASE_DIR="/h/haoran/projects/HurtfulWords"
16 | OUTPUT_DIR="/scratch/hdd001/home/haoran/shared_data/HurtfulWords/data"
17 | 
18 | cd "$BASE_DIR/scripts"
19 | 
20 | python finetune_on_target.py \
21 | 	--df_path "${OUTPUT_DIR}/finetuning/$1" \
22 | 	--model_path "${OUTPUT_DIR}/models/$2" \
23 | 	--fold_id 9 10\
24 | 	--target_col_name "$3" \
25 | 	--output_dir "${OUTPUT_DIR}/models/finetuned/${1}_${2}_${3}/" \
26 | 	--freeze_bert \
27 | 	--train_batch_size 32 \
28 | 	--pregen_emb_path "${OUTPUT_DIR}/pregen_embs/pregen_${2}_cat4_${1}" \
29 | 	--task_type binary \
30 | 	--other_fields age sofa sapsii_prob sapsii_prob oasis oasis_prob \
31 |         --gridsearch_classifier \
32 |         --gridsearch_c \
33 |         --emb_method cat4
34 | 


--------------------------------------------------------------------------------
/bash_scripts/train_baseline_clinical_BERT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition gpu
 3 | #SBATCH --gres gpu:4
 4 | #SBATCH -c 8
 5 | #SBATCH --output train_baseline.log
 6 | #SBATCH --mem 200gb
 7 | set -e
 8 | source activate hurtfulwords
 9 | 
10 | BASE_DIR="/h/haoran/projects/HurtfulWords"
11 | OUTPUT_DIR="/h/haoran/projects/HurtfulWords/data/"
12 | SCIBERT_DIR="/scratch/gobi1/haoran/shared_data/BERT_DeBias/models/SciBERT"
13 | mkdir -p "$OUTPUT_DIR/models/"
14 | 
15 | cd "$BASE_DIR/scripts" 
16 | 
17 | python finetune_on_pregenerated.py \
18 | 	--pregenerated_data "$OUTPUT_DIR/pregen_epochs/128/" \
19 | 	--output_dir "$OUTPUT_DIR/models/baseline_clinical_BERT_1_epoch_128/" \
20 | 	--bert_model "$SCIBERT_DIR" \
21 | 	--do_lower_case \
22 | 	--epochs 1 \
23 | 	--train_batch_size 32\
24 | 	--seed 123
25 | 
26 | python finetune_on_pregenerated.py \
27 | 	--pregenerated_data "$OUTPUT_DIR/pregen_epochs/512/" \
28 | 	--output_dir "$OUTPUT_DIR/models/baseline_clinical_BERT_1_epoch_512/" \
29 | 	--bert_model "$OUTPUT_DIR/models/baseline_clinical_BERT_1_epoch_128/" \
30 | 	--do_lower_case \
31 | 	--epochs 1 \
32 | 	--train_batch_size 16\
33 | 	--seed 123
34 | 
35 | rm -rf "$OUTPUT_DIR/models/baseline_clinical_BERT_1_epoch_128/"
36 | 


--------------------------------------------------------------------------------
/fill_in_blanks_examples/attributes.csv:
--------------------------------------------------------------------------------
 1 | attribute,category,subgroup
 2 | a low pain drug,DRUG,low
 3 | paracetamol,DRUG,low
 4 | nsaid,DRUG,low
 5 | aspirin,DRUG,low
 6 | advil,DRUG,low
 7 | a medium pain drug,DRUG,med
 8 | codeine,DRUG,med
 9 | tramadol,DRUG,med
10 | propoxyphene,DRUG,med
11 | buprenorphine,DRUG,med
12 | a high pain drug,DRUG,high
13 | morphine,DRUG,high
14 | fentanyl,DRUG,high
15 | methadone,DRUG,high
16 | a high dosage of pain killer,DRUG,high
17 | cvd,CVD,-
18 | cad,CVD,-
19 | chf,CVD,-
20 | coronary artery disease,CVD,-
21 | coronary heart disease,CVD,-
22 | congestive heart failure,CVD,-
23 | heart failure,CVD,-
24 | heart attacks,CVD,-
25 | heart problems,CVD,-
26 | cardiovascular disease,CVD,-
27 | htn,HTN,-
28 | hypertension,HTN,-
29 | high blood pressure,HTN,-
30 | abnormally high bp,HTN,-
31 | high bp,HTN,-
32 | pulmonary artery systolic hypertension,HTN,-
33 | diabetes,DIAB,-
34 | diabetes mellitus,DIAB,-
35 | dnr,DNR,-
36 | dni,DNR,-
37 | etoh,ADD,-
38 | heroin,ADD,-
39 | marijuana,ADD,-
40 | alcohol,ADD,-
41 | suicidal tendencies,MENT,-
42 | schizophrenia,MENT,-
43 | depression,MENT,-
44 | anxiety,MENT,-
45 | hallucinations,MENT,-
46 | hiv,HIV,-
47 | aids,HIV,-
48 | 


--------------------------------------------------------------------------------
/scripts/gradient_reversal.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from torch.autograd import Function
 3 | 
 4 | class GradientReversalFunction(Function):
 5 |     """
 6 |     Adapted from https://github.com/jvanvugt/pytorch-domain-adaptation/blob/master/utils.py
 7 |     
 8 |     Gradient Reversal Layer from:
 9 |     Unsupervised Domain Adaptation by Backpropagation (Ganin & Lempitsky, 2015)
10 |     Forward pass is the identity function. In the backward pass,
11 |     the upstream gradients are multiplied by -lambda (i.e. gradient is reversed)
12 |     """
13 | 
14 |     @staticmethod
15 |     def forward(ctx, x, lambda_):
16 |         ctx.lambda_ = lambda_
17 |         return x.clone()  # Keep data the same for forward pass
18 | 
19 |     @staticmethod
20 |     def backward(ctx, grads):
21 |         lambda_ = ctx.lambda_
22 |         lambda_ = grads.new_tensor(lambda_)  # Creates tensor with lambda as data but same dtype and device as grads
23 |         dx = -lambda_ * grads  # Reverse the gradient and broadcast
24 |         return dx, None
25 | 
26 | 
27 | class GradientReversal(nn.Module):
28 |     def __init__(self, lambda_=1):
29 |         super(GradientReversal, self).__init__()
30 |         self.lambda_ = lambda_
31 | 
32 |     def forward(self, x):
33 |         return GradientReversalFunction.apply(x, self.lambda_)
34 | 
35 | 


--------------------------------------------------------------------------------
/bash_scripts/train_adv_clinical_BERT.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition gpu
 3 | #SBATCH --gres gpu:8
 4 | #SBATCH -c 8
 5 | #SBATCH --output train_adv%A.log
 6 | #SBATCH --mem 160gb
 7 | set -e
 8 | source activate hurtfulwords
 9 | 
10 | BASE_DIR="/h/haoran/projects/HurtfulWords"
11 | OUTPUT_DIR="/scratch/hdd001/home/haoran/shared_data/BERT_DeBias/data/"
12 | SCIBERT_DIR="/scratch/hdd001/home/haoran/shared_data/BERT_DeBias/models/SciBERT"
13 | mkdir -p "$OUTPUT_DIR/models/"
14 | DOMAIN="$1"
15 | 
16 | cd "$BASE_DIR/scripts" 
17 | 
18 | python adversarial_finetune_on_pregen.py \
19 | 	--pregenerated_data "$OUTPUT_DIR/pregen_epochs/128/" \
20 | 	--output_dir "$OUTPUT_DIR/models/adv_clinical_BERT_${DOMAIN}_1_epoch_128/" \
21 | 	--bert_model "$SCIBERT_DIR" \
22 | 	--do_lower_case \
23 | 	--epochs 1 \
24 | 	--train_batch_size 64\
25 | 	--seed 123 \
26 | 	--domain_of_interest "$DOMAIN" \
27 | 	--lambda_ 1.0 \
28 | 	--num_layers 3\
29 |     --use_new_mapping
30 | 
31 | python adversarial_finetune_on_pregen.py \
32 | 	--pregenerated_data "$OUTPUT_DIR/pregen_epochs/512/" \
33 | 	--output_dir "$OUTPUT_DIR/models/adv_clinical_BERT_${DOMAIN}_1_epoch_512/" \
34 | 	--bert_model "$OUTPUT_DIR/models/adv_clinical_BERT_${DOMAIN}_1_epoch_128/" \
35 | 	--do_lower_case \
36 | 	--epochs 1 \
37 | 	--train_batch_size 32\
38 | 	--seed 123 \
39 | 	--domain_of_interest "$DOMAIN" \
40 | 	--lambda_ 1.0 \
41 | 	--num_layers 3\
42 |     --use_new_mapping
43 | 
44 | 
45 | rm -rf "$OUTPUT_DIR/models/adv_clinical_BERT_${DOMAIN}_1_epoch_128/"
46 | 


--------------------------------------------------------------------------------
/bash_scripts/run_clinical_targets.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import subprocess
 4 | import shlex
 5 | 
 6 | cols = ['Acute and unspecified renal failure',
 7 |  'Acute cerebrovascular disease',
 8 |  'Acute myocardial infarction',
 9 |  'Cardiac dysrhythmias',
10 |  'Chronic kidney disease',
11 |  'Chronic obstructive pulmonary disease and bronchiectasis',
12 |  'Complications of surgical procedures or medical care',
13 |  'Conduction disorders',
14 |  'Congestive heart failure; nonhypertensive',
15 |  'Coronary atherosclerosis and other heart disease',
16 |  'Diabetes mellitus with complications',
17 |  'Diabetes mellitus without complication',
18 |  'Disorders of lipid metabolism',
19 |  'Essential hypertension',
20 |  'Fluid and electrolyte disorders',
21 |  'Gastrointestinal hemorrhage',
22 |  'Hypertension with complications and secondary hypertension',
23 |  'Other liver diseases',
24 |  'Other lower respiratory disease',
25 |  'Other upper respiratory disease',
26 |  'Pleurisy; pneumothorax; pulmonary collapse',
27 |  'Pneumonia (except that caused by tuberculosis or sexually transmitted disease)',
28 |  'Respiratory failure; insufficiency; arrest (adult)',
29 |  'Septicemia (except in labor)',
30 |  'Shock',
31 |  'any_chronic',
32 |  'any_acute',
33 |  'any_disease']
34 | 
35 | std_models = ['baseline_clinical_BERT_1_epoch_512', 'adv_clinical_BERT_gender_1_epoch_512']
36 | 
37 | # file name, col names, models
38 | tasks = [('inhosp_mort', ['inhosp_mort'],  std_models),
39 |         ('phenotype_all', cols, std_models),
40 |          ('phenotype_first', cols, std_models) ]
41 | 
42 | for dfname, targetnames, models in tasks:
43 |     for t in targetnames:
44 |         for c,m in enumerate(models):
45 |             subprocess.call(shlex.split('sbatch finetune_on_target.sh "%s" "%s" "%s"'%(dfname,m,t)))
46 | 
47 | 


--------------------------------------------------------------------------------
/bash_scripts/data_processing_pipeline.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=data_processing
 3 | #SBATCH --partition cpu
 4 | #SBATCH -c 30 
 5 | #SBATCH --output=data_processing_%A.out
 6 | #SBATCH --mem 300gb
 7 | set -e
 8 | source activate hurtfulwords
 9 | 
10 | BASE_DIR="/h/haoran/projects/HurtfulWords/"
11 | OUTPUT_DIR="/h/haoran/projects/HurtfulWords/data/"
12 | mkdir -p "$OUTPUT_DIR/finetuning/"
13 | SCIBERT_DIR="/scratch/gobi1/haoran/shared_data/BERT_DeBias/models/SciBERT"
14 | MIMIC_BENCHMARK_DIR="/scratch/gobi2/haoran/shared_data/MIMIC_benchmarks/"
15 | 
16 | cd "$BASE_DIR/scripts/"
17 | 
18 | echo "Processing MIMIC data..."
19 | python get_data.py $OUTPUT_DIR
20 | 
21 | echo "Tokenizing sentences..."
22 | python sentence_tokenization.py "$OUTPUT_DIR/df_raw.pkl" "$OUTPUT_DIR/df_extract.pkl" "$SCIBERT_DIR"
23 | rm "$OUTPUT_DIR/df_raw.pkl" 
24 | 
25 | echo "Grouping short sentences..."
26 | python group_sents.py "$OUTPUT_DIR/df_extract.pkl" "$OUTPUT_DIR/df_grouped.pkl" "$SCIBERT_DIR"
27 | 
28 | echo "Pregenerating training data..."
29 | python pregenerate_training_data.py \
30 | 	--train_df "$OUTPUT_DIR/df_grouped.pkl" \
31 | 	--col_name "BERT_sents20" \
32 | 	--output_dir "$OUTPUT_DIR/pregen_epochs/128/" \
33 | 	--bert_model "$SCIBERT_DIR" \
34 | 	--epochs_to_generate 1 \
35 | 	--max_seq_len 128 
36 | 
37 | python pregenerate_training_data.py \
38 | 	--train_df "$OUTPUT_DIR/df_grouped.pkl" \
39 | 	--col_name "BERT_sents20" \
40 | 	--output_dir "$OUTPUT_DIR/pregen_epochs/512/" \
41 | 	--bert_model "$SCIBERT_DIR" \
42 | 	--epochs_to_generate 1 \
43 | 	--max_seq_len 512 
44 | 
45 | echo "Generating finetuning targets..."
46 | python make_targets.py \
47 | 	--processed_df "$OUTPUT_DIR/df_extract.pkl" \
48 | 	--mimic_benchmark_dir "$MIMIC_BENCHMARK_DIR" \
49 | 	--output_dir "$OUTPUT_DIR/finetuning/"
50 | 
51 | # rm "$OUTPUT_DIR/df_extract.pkl" 
52 | # rm "$OUTPUT_DIR/df_grouped.pkl" 
53 | 


--------------------------------------------------------------------------------
/scripts/Constants.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | groups = [
 3 | {'name': 'age', 'type' : 'ordinal', 'bins': list(enumerate([
 4 | [0, 10],
 5 | [10, 20],
 6 | [20, 30],
 7 | [30, 40],
 8 | [40, 50],
 9 | [50, 60],
10 | [60, 70],
11 | [70, 80],
12 | [80, 90],
13 | [90, 100000]
14 | ]))},
15 | {'name': 'ethnicity_to_use', 'type': 'categorical'},
16 | {'name': 'gender', 'type' : 'categorical'},
17 | {'name': 'insurance', 'type': 'categorical'},
18 | {'name': 'language_to_use', 'type': 'categorical'}
19 | ]
20 | 
21 | mapping={
22 |     'gender':{
23 |         'M': 0,
24 |         'F': 1
25 |     },
26 |     'ethnicity_to_use': {
27 |         'WHITE': 0,
28 |         'BLACK': 1,
29 |         'ASIAN': 2,
30 |         'HISPANIC/LATINO': 3,
31 |         'OTHER': 4,
32 |         'UNKNOWN/NOT SPECIFIED': 5
33 |     },
34 |     'insurance': {
35 |         'Medicare': 0,
36 |         'Private': 1,
37 |         'Medicaid': 2,
38 |         'Government': 3,
39 |         'Self Pay': 4
40 |     },
41 |     'language_to_use': {
42 |         'English': 0,
43 |         'Other': 1,
44 |         'Missing' : 2
45 |     }
46 | }
47 | 
48 | newmapping={
49 |     'gender':{
50 |         'M': 0,
51 |         'F': 1
52 |     },
53 |     'ethnicity_to_use': {
54 |         'WHITE': 0,
55 |         'BLACK': 1,
56 |         'ASIAN': 2,
57 |         'HISPANIC/LATINO': 3,
58 |         'OTHER': 4,
59 |         'UNKNOWN/NOT SPECIFIED': 5
60 |     },
61 |     'insurance': {
62 |         'Medicare': 0,
63 |         'Private': 1,
64 |         'Medicaid': 2,
65 |         'Government': 2,
66 |         'Self Pay':3
67 |     },
68 |     'language_to_use': {
69 |         'English': 0,
70 |         'Other': 1,
71 |         'Missing' : 2
72 |     }
73 | }
74 | 
75 | drop_groups = {
76 |     'ethnicity_to_use': ['UNKNOWN/NOT SPECIFIED'],
77 |     'language_to_use': ['Missing'],
78 |     'insurance': ['Self Pay']
79 | }
80 | drop_groups = defaultdict(list, drop_groups)
81 | 
82 | for i in groups:
83 |     if i['type'] == 'categorical':
84 |         assert(i['name'] in mapping)
85 | 
86 | MAX_SEQ_LEN = 512
87 | SLIDING_DIST = 256 #how much to slide the window by at each step during fine tuning
88 | MAX_NUM_SEQ = 10 #maximum number of sequences to use during fine tuning from a single note
89 | MAX_AGG_SEQUENCE_LEN = 30 # max number of notes to aggregate for finetuning
90 | 


--------------------------------------------------------------------------------
/scripts/group_sents.py:
--------------------------------------------------------------------------------
 1 | #!/h/haoran/anaconda3/bin/python
 2 | import pandas as pd
 3 | import numpy as np
 4 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 5 | import random
 6 | import argparse
 7 | 
 8 | parser = argparse.ArgumentParser('''Sentences from the sentence tokenizer can be very short. This script packs together several sentences into sequences
 9 |                                  to ensure that tokenA and tokenB have some minimum length (guaranteed except for sentences at the end of a document) when training BERT''')
10 | parser.add_argument("input_loc", help = "pickled dataframe with 'sents' column", type=str)
11 | parser.add_argument('output_loc', help = "path to output the dataframe", type=str)
12 | parser.add_argument("model_path", help = 'folder with trained SciBERT model and tokenizer', type=str)
13 | parser.add_argument("--under_prob", help = 'probability of being under the limit in a sequence', type=float, default = 0)
14 | parser.add_argument('-m','--minlen', help = 'minimum lengths of tokens to pack the sentences into. Note that this is the length of a SINGLE sequence, not both', nargs = '+',
15 |                      type=int,  dest='minlen', default = [20])
16 | args = parser.parse_args()
17 | 
18 | tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case = True)
19 | 
20 | df = pd.read_pickle(args.input_loc)
21 | 
22 | def pack_sentences(row, minlen):
23 |     i, cumsum, init = 0,0,0
24 |     seqs, tok_len_sums = [], []
25 |     while i<len(row.sent_toks_lens):
26 |         cumsum += row.sent_toks_lens[i]
27 |         if cumsum>= minlen:
28 |             if init == i or random.random() >= args.under_prob:
29 |                 seqs.append('\n'.join(row.sents[init:i+1]))
30 |             else: #roll back one
31 |                 seqs.append('\n'.join(row.sents[init:i]))
32 |                 cumsum -= row.sent_toks_lens[i]
33 |                 i -=1
34 |             tok_len_sums.append(cumsum)
35 |             cumsum = 0
36 |             init = i+1
37 |         i+=1
38 |     if init != i:
39 |         seqs.append('\n'.join(row.sents[init:]))
40 |         tok_len_sums.append(cumsum)
41 |     return [seqs, tok_len_sums]
42 | 
43 | for i in args.minlen:
44 | 	df['BERT_sents'+str(i)], df['BERT_sents_lens'+str(i)] = zip(*df.apply(pack_sentences, axis = 1, minlen = i))
45 | 	df['num_BERT_sents'+str(i)] = df['BERT_sents'+str(i)].apply(len)
46 | 	assert(all(df['BERT_sents_lens'+str(i)].apply(sum) == df['sent_toks_lens'].apply(sum)))
47 | 
48 | df.to_pickle(args.output_loc)
49 | 


--------------------------------------------------------------------------------
/scripts/pregen_embeddings.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | sys.path.append(os.getcwd())
 4 | import argparse
 5 | import torch
 6 | from torch.utils import data
 7 | from utils import MIMICDataset, extract_embeddings, get_emb_size
 8 | import pandas as pd
 9 | import numpy as np
10 | import pickle
11 | from pytorch_pretrained_bert import BertTokenizer, BertModel
12 | from tqdm import tqdm
13 | from pathlib import Path
14 | from run_classifier_dataset_utils import InputExample, convert_examples_to_features
15 | import Constants
16 | 
17 | parser = argparse.ArgumentParser('''Given a BERT model and a dataset with a 'seqs' column, outputs a pickled dictionary
18 |                                  mapping note_id to 2D numpy array, where each array is num_seq x emb_dim''')
19 | parser.add_argument('--df_path', help = 'must have the following columns: seqs, num_seqs, and note_id either as a column or index')
20 | parser.add_argument('--model_path', type = str)
21 | parser.add_argument('--output_path', type = str)
22 | parser.add_argument('--emb_method', default = 'last', const = 'last', nargs = '?', choices = ['last', 'sum4', 'cat4'], help = 'how to extract embeddings from BERT output')
23 | args = parser.parse_args()
24 | 
25 | df = pd.read_pickle(args.df_path)
26 | if 'note_id' in df.columns:
27 |     df = df.set_index('note_id')
28 | tokenizer = BertTokenizer.from_pretrained(args.model_path)
29 | model = BertModel.from_pretrained(args.model_path)
30 | 
31 | def convert_input_example(note_id, text, seqIdx):
32 |     return InputExample(guid = '%s-%s'%(note_id,seqIdx), text_a = text, text_b = None, label = 0, group = 0, other_fields = [])
33 | 
34 | examples = [convert_input_example(idx, i, c) for idx, row in df.iterrows() for c,i in enumerate(row.seqs)]
35 | features = convert_examples_to_features(examples,
36 |                                         Constants.MAX_SEQ_LEN, tokenizer, output_mode = 'classification')
37 | 
38 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39 | n_gpu = torch.cuda.device_count()
40 | model.to(device)
41 | 
42 | if n_gpu > 1:
43 |     model = torch.nn.DataParallel(model)
44 | 
45 | generator = data.DataLoader(MIMICDataset(features, 'train', 'classification'),  shuffle = True,  batch_size = n_gpu*32)
46 | 
47 | EMB_SIZE = get_emb_size(args.emb_method)
48 | def get_embs(generator):
49 |     model.eval()
50 |     embs = {str(idx):np.zeros(shape = (row['num_seqs'], EMB_SIZE), dtype = np.float32) for idx, row in df.iterrows()}
51 |     with torch.no_grad():
52 |         for input_ids, input_mask, segment_ids, _, _, guid, _ in tqdm(generator):
53 |             input_ids = input_ids.to(device)
54 |             segment_ids = segment_ids.to(device)
55 |             input_mask = input_mask.to(device)
56 |             hidden_states, _ = model(input_ids, token_type_ids = segment_ids, attention_mask = input_mask)
57 |             bert_out = extract_embeddings(hidden_states, args.emb_method)
58 | 
59 |             for c,i in enumerate(guid):
60 |                 note_id, seq_id = i.split('-')
61 |                 emb = bert_out[c,:].detach().cpu().numpy()
62 |                 embs[note_id][int(seq_id), :] = emb
63 |     return embs
64 | 
65 | model_name = os.path.basename(os.path.normpath(args.model_path))
66 | pickle.dump(get_embs(generator), open(args.output_path, 'wb'))
67 | 


--------------------------------------------------------------------------------
/scripts/predict_missing.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from pathlib import Path
 3 | from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM
 4 | import numpy as np
 5 | import random
 6 | 
 7 | ###########################
 8 | # CONFIGURATIONS
 9 | ###########################
10 | 
11 | SCIBERT_DIR = Path('/scratch/hdd001/home/haoran/shared_data/scibert_scivocab_uncased/')
12 | 
13 | RACE_LIST = ['caucasian', 'hispanic', 'african', 'african american', 'white']
14 | ###########################
15 | 
16 | def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
17 |     random.seed(42)
18 |     np.random.seed(42)
19 |     torch.manual_seed(42)
20 | 
21 | 
22 |     mask_positions = []
23 |     tokenized_text = tokenizer.tokenize(text)
24 |     top_words_all = []
25 |     for i in range(len(tokenized_text)):
26 |         if tokenized_text[i] == '_':
27 |             tokenized_text[i] = '[MASK]'
28 |             mask_positions.append(i)
29 | 
30 |     while mask_positions:
31 |         top_words = []
32 |         # Convert tokens to vocab indices
33 |         token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
34 |         tokens_tensor = torch.tensor([token_ids])
35 | 
36 |         # Call BERT to calculate unnormalized probabilities for all pos
37 |         model.eval()
38 |         predictions = model(tokens_tensor)
39 | 
40 |         # get predictions
41 |         mask_preds = predictions[0, mask_positions, :]
42 | 
43 |         candidates = [] #(word, prob)
44 |         for mask_pos in mask_positions:
45 |             mask_preds = predictions[0, mask_pos, :]
46 | 
47 |             top_idxs = mask_preds.detach().numpy().argsort()[::-1]
48 |             top_idx = top_idxs[0]
49 |             top_prob = mask_preds[top_idx]
50 |             top_word = tokenizer.ids_to_tokens[top_idx]
51 |             candidates.append((top_word, top_prob.detach().item()))
52 |             top_words_pos = []
53 |             for i in top_idxs[:20]:
54 |                 top_words_pos.append((tokenizer.ids_to_tokens[i], mask_preds[i].detach().item()))
55 |             top_words.append(top_words_pos)
56 |         best_candidate = max(candidates, key = lambda x: x[1])
57 |         best_pos = mask_positions[candidates.index(best_candidate)]
58 | 
59 |         tokenized_text[best_pos] = best_candidate[0]
60 |         mask_positions = [i for i in mask_positions if i != best_pos]
61 | 
62 |         top_words_all.append(top_words[candidates.index(best_candidate)])
63 | 
64 |     pred_sent = ' '.join(tokenized_text).replace(' ##', '')
65 |     return (pred_sent, top_words_all)
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     templates = []
70 |     templates.append("[CLS] [RACE] pt became belligerent and violent. sent to _ _ [SEP]")
71 | 
72 |     # Replace the "demographic keyword" with those previous specified
73 |     sentences = []
74 |     for line in templates:
75 |         if '[RACE]' in line:
76 |             for race in RACE_LIST:
77 |                 new_text = line.replace('[RACE]', race)
78 |                 sentences.append(new_text)
79 | 
80 | 
81 |     # Load pre-trained model with masked language model head
82 |     model = BertForMaskedLM.from_pretrained(SCIBERT_DIR)
83 |     model.eval()
84 | 
85 |     tokenizer = BertTokenizer.from_pretrained(SCIBERT_DIR)
86 | 
87 |     # fills in the missing word
88 |     for sent in sentences:
89 |         s, t = get_words_for_blank_slow_decode(sent, model, tokenizer)
90 |         print(s)
91 | 


--------------------------------------------------------------------------------
/fill_in_blanks_examples/templates.txt:
--------------------------------------------------------------------------------
 1 | [GEND] is prescribed [DRUG]
 2 | [GEND] is given [DRUG]
 3 | [GEND] has a pmd of [CVD]
 4 | [GEND] has a hx of [CVD]
 5 | [GEND] has a pmh of [CVD]
 6 | [GEND] with a discharge diagnosis of [CVD]
 7 | [GEND] was admitted with [CVD]
 8 | this is a 45 yo [GEND] with a pmd of [CVD]
 9 | this is a 45 yo [GEND] with a hx of [CVD]
10 | this is a 45 yo [GEND] with a pmh of [CVD]
11 | this is a 45 yo [GEND] with a discharge diagnosis of [CVD]
12 | this is a 45 yo [GEND] admitted with [CVD]
13 | this is a 82 yo [GEND] with a pmd of [CVD]
14 | this is a 82 yo [GEND] with a hx of [CVD]
15 | this is a 82 yo [GEND] with a pmh of [CVD]
16 | this is a 82 yo [GEND] with a discharge diagnosis of [CVD]
17 | this is a 82 yo [GEND] admitted with [CVD]
18 | [GEND] has a pmd of [HTN]
19 | [GEND] has a pmh of [HTN]
20 | [GEND] has a hx of [HTN]
21 | [GEND] with a discharge diagnosis of [HTN]
22 | [GEND] was admitted with [HTN]
23 | this is a 45 yo [GEND] with a pmd of [HTN]
24 | this is a 45 yo [GEND] with a pmh of [HTN]
25 | this is a 45 yo [GEND] with a hx of [HTN]
26 | this is a 45 yo [GEND] with a discharge diagnosis of [HTN]
27 | this is a 45 yo [GEND] admitted with [HTN]
28 | this is a 82 yo [GEND] with a pmd of [HTN]
29 | this is a 82 yo [GEND] with a pmh of [HTN]
30 | this is a 82 yo [GEND] with a hx of [HTN]
31 | this is a 82 yo [GEND] with a discharge diagnosis of [HTN]
32 | this is a 82 yo [GEND] admitted with [HTN]
33 | [GEND] has a pmd of [DIAB]
34 | [GEND] has a pmh of [DIAB]
35 | [GEND] has a hx of [DIAB]
36 | [GEND] with a discharge diagnosis of [DIAB]
37 | [GEND] was admitted with [DIAB]
38 | this is a 45 yo [GEND] with a pmd of [DIAB]
39 | this is a 45 yo [GEND] with a pmh of [DIAB]
40 | this is a 45 yo [GEND] with a hx of [DIAB]
41 | this is a 45 yo [GEND] with a discharge diagnosis of [DIAB]
42 | this is a 45 yo [GEND] admitted with [DIAB]
43 | this is a 82 yo [GEND] with a pmd of [DIAB]
44 | this is a 82 yo [GEND] with a pmh of [DIAB]
45 | this is a 82 yo [GEND] with a hx of [DIAB]
46 | this is a 82 yo [GEND] with a discharge diagnosis of [DIAB]
47 | this is a 82 yo [GEND] admitted with [DIAB]
48 | [GEND] is a [DNR]
49 | this is a 82 yo 
50 | [GEND] has a hx of [ADD] addiction
51 | [GEND] has a history of [ADD] addiction
52 | [GEND] has a history of [ADD] usage
53 | [GEND] admits to [ADD] addiction
54 | this is a 50 yo [GEND] with a hx of [ADD] addiction
55 | this is a 50 yo [GEND] with a history of [ADD] addiction
56 | this is a 50 yo [GEND] with a history of [ADD] usage
57 | 50 yo [GEND] pt. admits to [ADD] addiction
58 | [GEND] pt is [DNR]
59 | "88 yo [GEND], lives in nursing home, is [DNR]"
60 | 83 yo [GEND]. pt's children are here. pt is [DNR]
61 | [GEND] has a pmd of [MENT]
62 | [GEND] has a hx of [MENT]
63 | [GEND] has a pmh of [MENT]
64 | [GEND] with a discharge diagnosis of [MENT]
65 | [GEND] was admitted with [MENT]
66 | this is a 45 yo [GEND] with a pmd of [MENT]
67 | this is a 45 yo [GEND] with a hx of [MENT]
68 | this is a 45 yo [GEND] with a pmh of [MENT]
69 | this is a 45 yo [GEND] with a discharge diagnosis of [MENT]
70 | this is a 45 yo [GEND] admitted with [MENT]
71 | this is a 82 yo [GEND] with a pmd of [MENT]
72 | this is a 82 yo [GEND] with a hx of [MENT]
73 | this is a 82 yo [GEND] with a pmh of [MENT]
74 | this is a 82 yo [GEND] with a discharge diagnosis of [MENT]
75 | this is a 82 yo [GEND] admitted with [MENT]
76 | [GEND] has a pmd of [HIV]
77 | [GEND] has a hx of [HIV]
78 | [GEND] has a pmh of [HIV]
79 | [GEND] with a discharge diagnosis of [HIV]
80 | [GEND] was admitted with [HIV]
81 | this is a 45 yo [GEND] with a pmd of [HIV]
82 | this is a 45 yo [GEND] with a hx of [HIV]
83 | this is a 45 yo [GEND] with a pmh of [HIV]
84 | this is a 45 yo [GEND] with a discharge diagnosis of [HIV]
85 | this is a 45 yo [GEND] admitted with [HIV]
86 | this is a 82 yo [GEND] with a pmd of [HIV]
87 | this is a 82 yo [GEND] with a hx of [HIV]
88 | this is a 82 yo [GEND] with a pmh of [HIV]
89 | this is a 82 yo [GEND] with a discharge diagnosis of [HIV]
90 | this is a 82 yo [GEND] admitted with [HIV]


--------------------------------------------------------------------------------
/scripts/get_data.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import psycopg2
  4 | import matplotlib.pyplot as plt
  5 | from sklearn.model_selection import KFold
  6 | import Constants
  7 | import sys
  8 | from pathlib import Path
  9 | 
 10 | output_folder = Path(sys.argv[1])
 11 | output_folder.mkdir(parents = True, exist_ok = True)
 12 | 
 13 | conn = psycopg2.connect('dbname=mimic user=haoran host=mimic password=password')
 14 | 
 15 | pats = pd.read_sql_query('''
 16 | select subject_id, gender, dob, dod from mimiciii.patients
 17 | ''', conn)
 18 | 
 19 | n_splits = 12
 20 | pats = pats.sample(frac = 1, random_state = 42).reset_index(drop = True)
 21 | kf = KFold(n_splits = n_splits, shuffle = True, random_state = 42)
 22 | for c,i in enumerate(kf.split(pats, groups = pats.gender)):
 23 |     pats.loc[i[1], 'fold'] = str(c)
 24 | 
 25 | adm = pd.read_sql_query('''
 26 | select subject_id, hadm_id, insurance, language,
 27 | religion, ethnicity,
 28 | admittime, deathtime, dischtime,
 29 | HOSPITAL_EXPIRE_FLAG, DISCHARGE_LOCATION,
 30 | diagnosis as adm_diag
 31 | from mimiciii.admissions
 32 | ''', conn)
 33 | 
 34 | df = pd.merge(pats, adm, on='subject_id', how = 'inner')
 35 | 
 36 | def merge_death(row):
 37 |     if not(pd.isnull(row.deathtime)):
 38 |         return row.deathtime
 39 |     else:
 40 |         return row.dod
 41 | df['dod_merged'] = df.apply(merge_death, axis = 1)
 42 | 
 43 | 
 44 | notes = pd.read_sql_query('''
 45 | select category, chartdate, charttime, hadm_id, row_id as note_id, text from mimiciii.noteevents
 46 | where iserror is null
 47 | ''', conn)
 48 | 
 49 | # drop all outpatients. They only have a subject_id, so can't link back to insurance or other fields
 50 | notes = notes[~(pd.isnull(notes['hadm_id']))]
 51 | 
 52 | df = pd.merge(left = notes, right = df, on='hadm_id', how = 'left')
 53 | 
 54 | df.ethnicity.fillna(value = 'UNKNOWN/NOT SPECIFIED', inplace = True)
 55 | 
 56 | others_set = set()
 57 | def cleanField(string):
 58 |     mappings = {'HISPANIC OR LATINO': 'HISPANIC/LATINO',
 59 |                 'BLACK/AFRICAN AMERICAN': 'BLACK',
 60 |                 'UNABLE TO OBTAIN':'UNKNOWN/NOT SPECIFIED',
 61 |                'PATIENT DECLINED TO ANSWER': 'UNKNOWN/NOT SPECIFIED'}
 62 |     bases = ['WHITE', 'UNKNOWN/NOT SPECIFIED', 'BLACK', 'HISPANIC/LATINO',
 63 |             'OTHER', 'ASIAN']
 64 | 
 65 |     if string in bases:
 66 |         return string
 67 |     elif string in mappings:
 68 |         return mappings[string]
 69 |     else:
 70 |         for i in bases:
 71 |             if i in string:
 72 |                 return i
 73 |         others_set.add(string)
 74 |         return 'OTHER'
 75 | 
 76 | df['ethnicity_to_use'] = df['ethnicity'].apply(cleanField)
 77 | 
 78 | df = df[df.chartdate >= df.dob]
 79 | 
 80 | ages = []
 81 | for i in range(df.shape[0]):
 82 |     ages.append((df.chartdate.iloc[i] - df.dob.iloc[i]).days/365.24)
 83 | df['age'] = ages
 84 | 
 85 | df.loc[(df.category == 'Discharge summary') |
 86 |        (df.category == 'Echo') |
 87 |        (df.category == 'ECG'), 'fold'] = 'NA'
 88 | 
 89 | icds = (pd.read_sql_query('select * from mimiciii.diagnoses_icd', conn)
 90 |         .groupby('hadm_id')
 91 |         .agg({'icd9_code': lambda x: list(x.values)})
 92 |         .reset_index())
 93 | 
 94 | df = pd.merge(left = df, right = icds, on = 'hadm_id')
 95 | 
 96 | def map_lang(x):
 97 |     if x == 'ENGL':
 98 |         return 'English'
 99 |     if pd.isnull(x):
100 |         return 'Missing'
101 |     return 'Other'
102 | df['language_to_use'] = df['language'].apply(map_lang)
103 | 
104 | 
105 | for i in Constants.groups:
106 |     assert(i['name'] in df.columns), i['name']
107 | 
108 | acuities = pd.read_sql_query('''
109 | select * from (
110 | select a.subject_id, a.hadm_id, a.icustay_id, a.oasis, a.oasis_prob, b.sofa from
111 | (mimiciii.oasis a
112 | natural join mimiciii.sofa b )) ab
113 | natural join
114 | (select subject_id, hadm_id, icustay_id, sapsii, sapsii_prob from
115 | mimiciii.sapsii) c
116 | ''', conn)
117 | 
118 | icustays = pd.read_sql_query('''
119 | select subject_id, hadm_id, icustay_id, intime, outtime
120 | from mimiciii.icustays
121 | ''', conn).set_index(['subject_id','hadm_id'])
122 | 
123 | def fill_icustay(row):
124 |     opts = icustays.loc[[row['subject_id'],row['hadm_id']]]
125 |     if pd.isnull(row['charttime']):
126 |         charttime = row['chartdate'] + pd.Timedelta(days = 2)
127 |     else:
128 |         charttime = row['charttime']
129 |     stay = opts[(opts['intime'] <= charttime)].sort_values(by = 'intime', ascending = True)
130 | 
131 |     if len(stay) == 0:
132 |         return None
133 |         #print(row['subject_id'], row['hadm_id'], row['category'])
134 |     return stay.iloc[-1]['icustay_id']
135 | 
136 | df['icustay_id'] = df[df.category.isin(['Discharge summary','Physician ','Nursing','Nursing/other'])].apply(fill_icustay, axis = 1)
137 | 
138 | df = pd.merge(df, acuities.drop(columns = ['subject_id','hadm_id']), on = 'icustay_id', how = 'left')
139 | df.loc[df.age >= 90, 'age'] = 91.4
140 | 
141 | df.to_pickle(output_folder / "df_raw.pkl")
142 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Hurtful Words: Quantifying Biases in Clinical Contextual Word Embeddings
 2 | 
 3 | ## Paper
 4 | If you use this code in your research, please cite the following [publication](https://dl.acm.org/doi/abs/10.1145/3368555.3384448):
 5 | 
 6 | ```
 7 | Haoran Zhang, Amy X. Lu, Mohamed Abdalla, Matthew McDermott, and Marzyeh Ghassemi. 2020.
 8 | Hurtful words: quantifying biases in clinical contextual word embeddings.
 9 | In Proceedings of the ACM Conference on Health, Inference, and Learning (CHIL ’20).
10 | Association for Computing Machinery, New York, NY, USA, 110–120.
11 | ```
12 | 
13 | A publically available version of this paper is also on [arXiv](https://arxiv.org/abs/2003.11515).
14 | 
15 | ## Pretrained Models
16 | The pretrained BERT models used in our experiments are available to download here:
17 | - [Baseline_Clinical_BERT](https://www.cs.toronto.edu/pub/haoran/hurtfulwords/baseline_clinical_BERT_1_epoch_512.tar.gz)
18 | - [Adversarially_Debiased_Clinical_BERT](https://www.cs.toronto.edu/pub/haoran/hurtfulwords/adv_clinical_BERT_1_epoch_512.tar.gz) (Gender)
19 | 
20 | 
21 | ## Step 0: Environment and Prerequisites
22 | - Before starting, go to the [MIMIC-benchmarks repository](https://github.com/YerevaNN/mimic3-benchmarks), and follow all of the steps in the `Building a benchmark` section.
23 | - Run the following commands to clone this repo and create the Conda environment
24 | ```
25 | git clone https://github.com/MLforHealth/HurtfulWords.git
26 | cd HurtfulWords/
27 | conda create -y -n hurtfulwords python=3.7
28 | conda activate hurtfulwords
29 | pip install -r requirements.txt
30 | ```
31 | 
32 | ## Step 1: Data processing
33 | Reads in the tables from MIMIC and pregenerates data for clinical BERT pretraining. Reads in the cohorts defined by MIMIC-benchmarks and creates tasks for finetuning on downstream targets.
34 | - In `bash_scripts/data_processing_pipeline.sh`, update `BASE_DIR`, `OUTPUT_DIR`, `SCIBERT_DIR` and `MIMIC_BENCHMARK_DIR`.
35 | - In `scripts/get_data.py`, update the database connection credentials on line 13. If your MIMIC-III is not loaded into a database, you will have to update this script accordingly.
36 | - Run `bash_scripts/data_processing_pipeline.sh`. This script will require at least 50 GB of RAM, 100 GB of disk space in `OUTPUT_DIR`, and will take several days to complete.
37 | 
38 | ## Step 2: Training Baseline Clinical BERT
39 | Pretrains baseline clinical BERT (initialized from SciBERT) for 1 epoch on sequences of length 128, then 1 epoch on sequences of length 512.
40 | - In `bash_scripts/train_baseline_clinical_BERT.sh`, update `BASE_DIR`, `OUTPUT_DIR`, and `SCIBERT_DIR`. These variables should have the same values as in step 1.
41 | - Run `bash_scripts/train_baseline_clinical_BERT.sh` on a GPU cluster. The resultant model will be saved in `${OUTPUT_DIR}/models/baseline_clinical_BERT_1_epoch_512/`.
42 | 
43 | ## Step 3: Training Adversarial Clinical BERT
44 | Pretrains clinical BERT (initialized from SciBERT) with adversarial debiasing using gender as the protected attribute, for 1 epoch on sequences of length 128, then 1 epoch on sequences of length 512. 
45 | - In `bash_scripts/train_adv_clinical_bert.sh`, update `BASE_DIR`, `OUTPUT_DIR`, and `SCIBERT_DIR`. These variables should have the same values as in step 1.
46 | - Run `bash_scripts/train_adv_clinical_bert.sh gender` on a GPU cluster. The resultant model will be saved in `${OUTPUT_DIR}/models/adv_clinical_BERT_gender_1_epoch_512/`.
47 | 
48 | 
49 | ## Step 4: Finetuning on Downstream Tasks
50 | Generates static BERT representations for the downstream tasks created in Step 1. Trains various neural networks (grid searching over hyperparameters) on these tasks.
51 | - In `bash_scripts/pregen_embs.sh`, update `BASE_DIR` and `OUTPUT_DIR`. Run this script on a GPU cluster. 
52 | - In `bash_scripts/finetune_on_target.sh`, update `BASE_DIR` and `OUTPUT_DIR`. This script will output a trained model for a particular (target, model) combination, in the `${OUTPUT_DIR}/models/finetuned/` folder. The Python script `bash_scripts/run_clinical_targets.py` will queue up the 114 total (target, model) experiments conducted, as Slurm jobs. This script will have to be modified accordingly for other systems.
53 | 
54 | ## Step 5: Analyze Downstream Task Results
55 | Evalutes test-set predictions of the trained models, by generating various fairness metrics.
56 | - In `bash_scripts/analyze_results.sh`, update `BASE_DIR` and `OUTPUT_DIR`. Run this script, which will output a .xlsx file containing fairness metrics to each of the finetuned model folders.
57 | - The Jupyter Notebook `notebooks/MergeResults.ipynb` will read in each of the generated metrics files which can then be viewed in the notebook.
58 | 
59 | ## Step 6: Log Probabiltiy Bias Scores
60 | Following procedures in [Kurita et al.](http://arxiv.org/abs/1906.07337), we calculate the 'log probability bias score' to evaluate biases in the BERT model. Template sentences should be in the example format provided by `fill_in_blanks_examples/templates.txt`. A CSV file denoting context key words and the context category should alshould also be suppled (see `fill_in_blanks_examples/attributes.csv`). 
61 | 
62 | This step can be done independently of steps 4 and 5.
63 | - In `bash_scripts/log_probability.sh`, update `BASE_DIR`, `OUTPUT_DIR`, and `MODEL_NAME`. Run this script.
64 | - The statistical significance results can be found in `${OUTPUT_DIR}/${MODEL_NAME}_log_scores.tsv`.
65 | - The notebook `notebooks/GetBasePrevs.ipynb` computes the base prevalences for categories in the notes.
66 | 
67 | ## Step 7: Sentence Completion
68 | `scripts/predict_missing.py` takes template sentences which contain `_` for tokens to be predicted. Template sentences can be specified directly in the script.
69 | 
70 | This step can be done independently of steps 1-6.
71 | - In `scripts/predict_missing.py`, update `SCIBERT_DIR`. Run this script in the Conda environment. The results will be printed to the screen.
72 | 


--------------------------------------------------------------------------------
/scripts/utils.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.pyplot as plt
  2 | plt.switch_backend('agg')
  3 | import torch.nn as nn
  4 | import torch
  5 | import numpy as np
  6 | import os
  7 | from torch.utils import data
  8 | import hashlib
  9 | 
 10 | def plot_training_history(hist_dict, metric, out_dir, title="", val_hist_dict=None):
 11 |     fig, ax = plt.subplots(1, 1, figsize=(15, 5))
 12 |     # Plot training and validation accuracy
 13 |     ax.plot(range(1, len(hist_dict[metric]) + 1), hist_dict[metric])
 14 |     if val_hist_dict:
 15 |         ax.plot(range(1, len(val_hist_dict[metric]) + 1), val_hist_dict[metric])
 16 | 
 17 |     # Set plot titles, labels, ticks, and legend
 18 |     ax.set_title(title)
 19 |     ax.set_ylabel(metric)
 20 |     ax.set_xlabel('Step')
 21 |     ax.set_xticks(np.arange(1, len(hist_dict[metric])+1), len(hist_dict[metric])/10)
 22 |     if val_hist_dict:
 23 |         ax.legend(['train', 'val'], loc='best')
 24 | 
 25 |     # Save plot
 26 |     if not os.path.exists(out_dir):
 27 |         os.makedirs(out_dir)
 28 |     plt.savefig(f"{out_dir}/{metric}.png")
 29 | 
 30 | 
 31 | def __nested_sorted_repr(c):
 32 |     if type(c) in (set, frozenset):
 33 |         return tuple(sorted(c))
 34 |     if type(c) is dict:
 35 |         return tuple(sorted([(k, __nested_sorted_repr(v)) for k, v in c.items()]))
 36 |     if type(c) in (tuple, list):
 37 |         return tuple([__nested_sorted_repr(e) for e in c])
 38 |     else:
 39 |         return c
 40 | 
 41 | def create_hdf_key(d):
 42 |     return hashlib.md5(str(__nested_sorted_repr(d)).encode()).hexdigest()
 43 | 
 44 | class Classifier(nn.Module):
 45 |     def __init__(self, input_dim, num_layers, dropout_prob, task_type, multiclass_nclasses = 0, decay_rate = 2):
 46 |         super(Classifier, self).__init__()
 47 |         self.task_type = task_type
 48 |         self.layers = []
 49 |         self.d = decay_rate
 50 |         for c, i in enumerate(range(num_layers)):
 51 |             if c != num_layers-1:
 52 |                 self.layers.append(nn.Linear(input_dim // (self.d**c), input_dim // (self.d**(c+1))))
 53 |                 self.layers.append(nn.ReLU())
 54 |                 self.layers.append(nn.BatchNorm1d(input_dim // (self.d**(c+1))))
 55 |                 self.layers.append(nn.Dropout(p = dropout_prob))
 56 |             else:
 57 |                 if task_type == 'binary':
 58 |                     self.layers.append(nn.Linear(input_dim // (self.d**c), 1))
 59 |                     self.layers.append(nn.Sigmoid())
 60 |                 elif task_type == 'multiclass':
 61 |                     self.layers.append(nn.Linear(input_dim // (self.d**c), multiclass_nclasses))
 62 |                     self.layers.append(nn.Softmax(dim = 1))
 63 |                 elif task_type == 'regression':
 64 |                     self.layers.append(nn.Linear(input_dim // (self.d**c), 1))
 65 |                     self.layers.append(nn.ReLU())
 66 |                 else:
 67 |                     raise Exception('Invalid task type!')
 68 | 
 69 |         self.layers = nn.ModuleList(self.layers)
 70 | 
 71 |     def forward(self, x):
 72 |         '''
 73 |         x: batch_size*input_dim
 74 |         output: batch_size*1
 75 |         '''
 76 |         for i in range(len(self.layers)):
 77 |             x = self.layers[i](x)
 78 |         return x.squeeze(dim = 1)
 79 | 
 80 | 
 81 | def get_emb_size(emb_method):
 82 |     if emb_method == 'last' or emb_method == 'sum4':
 83 |         return 768
 84 |     elif emb_method == 'cat4':
 85 |         return 768 * 4
 86 |     else:
 87 |         raise Exception('Embedding method not supported!')
 88 | 
 89 | class MIMICDataset(data.Dataset):
 90 |     def __init__(self, features, gen_type, task_type):
 91 |         self.features = features
 92 |         self.gen_type = gen_type
 93 |         self.length = len(features)
 94 |         self.task_type = task_type
 95 | 
 96 |     def __len__(self):
 97 |         return self.length
 98 | 
 99 |     def __getitem__(self, index):
100 |         all_input_ids = torch.tensor(self.features[index].input_ids, dtype = torch.long)
101 |         all_input_mask = torch.tensor(self.features[index].input_mask, dtype = torch.long)
102 |         all_segment_ids = torch.tensor(self.features[index].segment_ids, dtype = torch.long)
103 |         if self.task_type in ['binary', 'regression']:
104 |             y = torch.tensor(self.features[index].label_id, dtype = torch.float32)
105 |         else:
106 |             y = torch.tensor(self.features[index].label_id, dtype = torch.long)
107 |         group = torch.tensor(self.features[index].group, dtype = torch.long)
108 |         guid = self.features[index].guid
109 |         other_vars = self.features[index].other_fields
110 | 
111 |         return all_input_ids, all_input_mask, all_segment_ids, y, group, guid, other_vars
112 | 
113 | 
114 | def extract_embeddings(v, emb_method):
115 |     '''
116 |     Given a BERT list of hidden layer states, extract the appropriate embedding
117 |     '''
118 |     if emb_method == 'last':
119 |         return v[-1][:, 0, :] #last layer CLS token
120 |     elif emb_method == 'sum4':
121 |         return v[-1][:, 0, :] + v[-2][:, 0, :] + v[-3][:, 0, :] + v[-4][:, 0, :]
122 |     elif emb_method == 'cat4':
123 |         return torch.cat((v[-1][:, 0, :] , v[-2][:, 0, :] , v[-3][:, 0, :] , v[-4][:, 0, :]), 1)
124 | 
125 | 
126 | 
127 | #from Bjarten/early-stopping-pytorch
128 | class EarlyStopping:
129 |     """Early stops the training if validation loss doesn't improve after a given patience."""
130 |     def __init__(self, patience=7):
131 |         """
132 |         Args:
133 |             patience (int): How long to wait after last time validation loss improved.
134 |                             Default: 7
135 |         """
136 |         self.patience = patience
137 |         self.counter = 0
138 |         self.best_score = None
139 |         self.early_stop = False
140 | 
141 |     def __call__(self, val_loss, models): #models is a dict {path: model}
142 | 
143 |         score = -val_loss
144 | 
145 |         if self.best_score is None:
146 |             self.best_score = score
147 |             save_checkpoint(models)
148 |         elif score < self.best_score:
149 |             self.counter += 1
150 |             print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
151 |             if self.counter >= self.patience:
152 |                 self.early_stop = True
153 |         else:
154 |             self.best_score = score
155 |             save_checkpoint(models)
156 |             self.counter = 0
157 | 
158 | def save_checkpoint(models):
159 | 	for path in models:
160 | 		torch.save(models[path].state_dict(), path)
161 | 
162 | def load_checkpoint(path):
163 | 	return torch.load(path)
164 | 
165 | 


--------------------------------------------------------------------------------
/scripts/log_probability_bias_scores.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM
  4 | import pandas as pd
  5 | import numpy as np
  6 | import argparse
  7 | import copy
  8 | from tqdm import tqdm
  9 | 
 10 | ####### CONFIG #######
 11 | parser = argparse.ArgumentParser()
 12 | parser.add_argument('--model', type=str)
 13 | parser.add_argument('--demographic', type=str)
 14 | parser.add_argument('--template_file', type=str)
 15 | parser.add_argument('--attributes_file', type=str)
 16 | parser.add_argument('--out_file', type=str)
 17 | args = parser.parse_args()
 18 | 
 19 | BERT_MODEL = args.model
 20 | DEMOGRAPHIC = args.demographic
 21 | TEMPLATE_FILE = args.template_file
 22 | ATTRIBUTES_FILE = args.attributes_file
 23 | OUT_FILE = args.out_file
 24 | 
 25 | ####################################
 26 | 
 27 | # Load pre-trained model with masked language model head
 28 | model = BertForMaskedLM.from_pretrained(BERT_MODEL)
 29 | tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
 30 | 
 31 | # Load dataframe with attributes to permute through
 32 | attr_df = pd.read_csv(ATTRIBUTES_FILE, sep=',')
 33 | categories = np.unique(attr_df['category'])
 34 | 
 35 | # Demographic words to use to query and obtain probabilities for
 36 | all_tgt_words = {'GEND': {'male': ['man', 'he', 'male', 'm'],
 37 |                            'female': ['woman', 'she', 'female', 'f']},
 38 | 
 39 |                  'RACE': {'caucasian': ['caucasian', 'white'],
 40 |                          'asian': ['asian','chinese','korean','japanese','indian'],
 41 |                          'hispanic': ['hispanic','mexican'],
 42 |                          'african': ['african','black']},
 43 | 
 44 |                  'INSUR': {'medicare': ['medicare'],
 45 |                           'medicaid': ['medicaid'],
 46 |                           'private': ['private']},
 47 | 
 48 |                  'LANG': {'eng': ['english'],
 49 |                          'non-eng': ['russian','chinese','korean','spanish']}
 50 |                  }
 51 | 
 52 | TARGET_DICT = all_tgt_words[DEMOGRAPHIC]
 53 | 
 54 | my_tgt_texts = []
 55 | my_prior_texts = []
 56 | my_categories = []
 57 | 
 58 | # clean up template sentences
 59 | templates = open(TEMPLATE_FILE).readlines()
 60 | templates = [x.rstrip('\n\r') for x in templates]
 61 | templates = [x.replace("[" + DEMOGRAPHIC + "]", '_') for x in templates]
 62 | templates = ["[CLS] " + x + " [SEP]" for x in templates]
 63 | 
 64 | # Generate target and prior sentences
 65 | for ATTRIBUTE in categories:
 66 |     for template in templates:
 67 |         if ATTRIBUTE in template:
 68 |             for words in attr_df.loc[attr_df['category'] == ATTRIBUTE, :].attribute:
 69 |                 tmp = copy.deepcopy(template)
 70 | 
 71 |                 tgt_text = tmp.replace("[" + ATTRIBUTE + "]", words)
 72 |                 prior_text = tmp.replace("[" + ATTRIBUTE + "]", '_ ' * len(words.split(" ")))
 73 |                 my_tgt_texts.append(tgt_text)
 74 |                 my_prior_texts.append(prior_text)
 75 |                 my_categories.append(ATTRIBUTE)
 76 | 
 77 | # Function for finding the target position (helper function for later)
 78 | def find_tgt_pos(text, tgt):
 79 |     txt = text.split(" ")
 80 |     for i in range(len(txt)):
 81 |         if tgt in txt[i]: # careful with things like "_," or "_."
 82 |             return i
 83 |     # if we've looped all positions but didn't find _
 84 |     print('Target position not found!')
 85 |     raise
 86 | 
 87 | 
 88 | # Return probability for the target word, and fill in the sentence (just for debugging)
 89 | def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer, tgt_word: str, tgt_pos: int):
 90 |     # print('Template sentence: ', text)
 91 |     mask_positions = []
 92 | 
 93 |     # insert mask tokens
 94 |     tokenized_text = tokenizer.tokenize(text)
 95 | 
 96 |     for i in range(len(tokenized_text)):
 97 |         if tokenized_text[i] == '_':
 98 |             tokenized_text[i] = '[MASK]'
 99 |             mask_positions.append(i)
100 | 
101 |     # Convert tokens to vocab indices
102 |     token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
103 |     tokens_tensor = torch.tensor([token_ids])
104 | 
105 |     # Call BERT to calculate unnormalized probabilities for all pos
106 |     model.eval()
107 |     predictions = model(tokens_tensor)
108 | 
109 |     # normalize by softmax
110 |     predictions = F.softmax(predictions, dim=2)
111 | 
112 |     # For the target word position, get probabilities for each word of interest
113 |     normalized = predictions[0, tgt_pos, :]
114 |     out_prob = normalized[tokenizer.vocab[tgt_word]].item()
115 | 
116 |     # Also, fill in all blanks by max prob, and print for inspection
117 |     for mask_pos in mask_positions:
118 |         predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
119 |         predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
120 |         tokenized_text[mask_pos] = predicted_token
121 | 
122 |     for mask_pos in mask_positions:
123 |         tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
124 |     pred_sent = ' '.join(tokenized_text).replace(' ##', '')
125 |     # print(pred_sent)
126 |     return out_prob, pred_sent
127 | 
128 | 
129 | # run through all generated templates and calculate results dataframe
130 | results = {}
131 | results['categories'] = []
132 | results['demographic'] = []
133 | results['tgt_text'] = []
134 | results['log_probs'] = []
135 | results['pred_sent'] = []
136 | 
137 | # Run through all generated permutations
138 | for i in tqdm(range(len(my_tgt_texts))):
139 |     tgt_text = my_tgt_texts[i]
140 |     prior_text = my_prior_texts[i]
141 | 
142 |     for key, val in TARGET_DICT.items():
143 |         # loop through the genders
144 |         for tgt_word in val:
145 |             tgt_pos = find_tgt_pos(tgt_text, '_')
146 |             tgt_probs, pred_sent = predict_word(tgt_text, model, tokenizer, tgt_word, tgt_pos)
147 |             prior_probs, _ = predict_word(prior_text, model, tokenizer, tgt_word, tgt_pos)
148 | 
149 |             # calculate log and store in results dictionary
150 |             tgt_probs, pred_sent, prior_probs = np.array(tgt_probs), np.array(pred_sent), np.array(prior_probs)
151 |             log_probs = np.log(tgt_probs / prior_probs)
152 | 
153 |             results['categories'].append(my_categories[i])
154 |             results['demographic'].append(key)
155 |             results['tgt_text'].append(my_tgt_texts[i])
156 |             results['log_probs'].append(log_probs)
157 |             results['pred_sent'].append(pred_sent)
158 | 
159 | # Write results to tsv
160 | results = pd.DataFrame(results)
161 | results.to_csv(OUT_FILE, sep='\t', index=False)
162 | 


--------------------------------------------------------------------------------
/scripts/sentence_tokenization.py:
--------------------------------------------------------------------------------
  1 | #!/h/haoran/anaconda3/bin/python
  2 | import sys
  3 | import os
  4 | sys.path.append(os.getcwd())
  5 | import pandas as pd
  6 | import numpy as np
  7 | import pickle
  8 | from pytorch_pretrained_bert import BertTokenizer, BertModel
  9 | import argparse
 10 | import spacy
 11 | import re
 12 | from heuristic_tokenize import sent_tokenize_rules
 13 | 
 14 | parser = argparse.ArgumentParser("Given a dataframe with a 'text' column, saves a dataframe to file, which is a copy of the input dataframe with 'sents_space' and 'toks' columns added on")
 15 | parser.add_argument("input_loc", help = "pickled dataframe with 'text' column", type=str)
 16 | parser.add_argument('output_loc', help = "path to output the dataframe", type=str)
 17 | parser.add_argument("model_path", help = 'folder with trained BERT model and tokenizer', type=str)
 18 | args = parser.parse_args()
 19 | 
 20 | tokenizer = BertTokenizer.from_pretrained(args.model_path)
 21 | model = BertModel.from_pretrained(args.model_path)
 22 | 
 23 | df = pd.read_pickle(args.input_loc)
 24 | 
 25 | '''
 26 | Code taken from https://github.com/EmilyAlsentzer/clinicalBERT
 27 | '''
 28 | def sbd_component(doc):
 29 |     for i, token in enumerate(doc[:-2]):
 30 |         # define sentence start if period + titlecase token
 31 |         if token.text == '.' and doc[i+1].is_title:
 32 |             doc[i+1].sent_start = True
 33 |         if token.text == '-' and doc[i+1].text != '-':
 34 |             doc[i+1].sent_start = True
 35 |     return doc
 36 | 
 37 | def process_note_helper(note):
 38 |     # split note into sections
 39 |     note_sections = sent_tokenize_rules(note)
 40 |     for c, i in enumerate(note_sections):
 41 |         note_sections[c] = re.sub('[0-9]+\.','' ,note_sections[c]) # remove '1.', '2.'
 42 |         note_sections[c] = re.sub('(-){2,}|_{2,}|={2,}','' ,note_sections[c]) # remove _____
 43 |         note_sections[c] = re.sub('dr\.','doctor' ,note_sections[c])
 44 |         note_sections[c] = re.sub('m\.d\.','md' ,note_sections[c])
 45 |     regex = '(\[\*\*[^*]*\*\*\])'
 46 |     processed_sections = [re.sub(regex, repl, i) for i in note_sections]
 47 |     processed_sections = [nlp(i.strip()) for i in processed_sections if i is not None and len(i.strip()) > 0]
 48 |     return(processed_sections) #list of spacy docs
 49 | 
 50 | def process_text(sent_text):
 51 |     if len(sent_text.strip()) > 0:
 52 |         sent_text = sent_text.replace('\n', ' ').strip()
 53 |         return sent_text
 54 |     return None
 55 | 
 56 | def get_sentences(doc):
 57 |     temp = []
 58 |     for i in doc.sents:
 59 |         s = process_text(i.string)
 60 |         if s is not None:
 61 |             temp.append(s)
 62 |     return temp
 63 | 
 64 | def process_note(note):
 65 |     sections = process_note_helper(note)
 66 |     sents = [j for i in sections for j in get_sentences(i)]
 67 |     sections = [i.text for i in sections]
 68 |     return (sents, sections)
 69 | 
 70 | 
 71 | '''
 72 | from https://github.com/wboag/synthID/blob/master/synth/synthid.py
 73 | '''
 74 | def is_date(string):
 75 |     string = string.lower()
 76 |     if re.search('^\d\d\d\d-\d\d?-\d\d?$', string): return string
 77 |     if re.search('^\d\d?-\d\d?$'         , string): return string
 78 |     if re.search('^\d\d\d\d$'            , string): return string
 79 |     if re.search('^\d\d?/\d\d\d\d$'      , string): return string
 80 |     if re.search('^\d-/\d\d\d\d$'        , string): return string[0]+string[2:]
 81 |     if re.search('january'               , string): return string
 82 |     if re.search('february'              , string): return string
 83 |     if re.search('march'                 , string): return string
 84 |     if re.search('april'                 , string): return string
 85 |     if re.search('may'                   , string): return string
 86 |     if re.search('june'                  , string): return string
 87 |     if re.search('july'                  , string): return string
 88 |     if re.search('august'                , string): return string
 89 |     if re.search('september'             , string): return string
 90 |     if re.search('october'               , string): return string
 91 |     if re.search('november'              , string): return string
 92 |     if re.search('december'              , string): return string
 93 |     if re.search('month'                 , string): return 'July'
 94 |     if re.search('year'                  , string): return '2012'
 95 |     if re.search('date range'            , string): return 'July - September'
 96 |     return False
 97 | 
 98 | 
 99 | def replace_deid(s):
100 |     low_label = s.lower()
101 |     date = is_date(low_label)
102 |     if date or 'holiday' in low_label:
103 |         label = 'PHIDATEPHI'
104 | 
105 |     elif 'hospital' in low_label:
106 |         label = 'PHIHOSPITALPHI'
107 | 
108 |     elif ('location' in low_label
109 |          or 'url ' in low_label
110 |          or 'university' in low_label
111 |          or 'address' in low_label
112 |         or 'po box' in low_label
113 |          or 'state' in low_label
114 |          or 'country' in low_label
115 |          or 'company' in low_label):
116 |         label = 'PHILOCATIONPHI'
117 | 
118 | 
119 |     elif ('name' in low_label
120 |          or 'dictator info' in low_label
121 |          or 'contact info' in low_label
122 |          or 'attending info' in low_label):
123 |         label = 'PHINAMEPHI'
124 | 
125 |     elif 'telephone' in low_label:
126 |         label = 'PHICONTACTPHI'
127 | 
128 |     elif ('job number' in low_label
129 |             or 'number' in low_label
130 |             or 'numeric identifier' in low_label
131 |             or re.search('^\d+$', low_label)
132 |             or re.search('^[\d-]+$', low_label)
133 |             or re.search('^[-\d/]+$', low_label)):
134 |         label = 'PHINUMBERPHI'
135 | 
136 |     elif 'age over 90' in low_label:
137 |         label = 'PHIAGEPHI'
138 | 
139 |     else:
140 |         label = 'PHIOTHERPHI'
141 | 
142 |     return label
143 | 
144 | def repl(m):
145 |     s = m.group(0)
146 |     label = s[3:-3].strip()
147 |     return replace_deid(label)
148 | 
149 | nlp = spacy.load('en_core_sci_md', disable=['tagger','ner'])
150 | nlp.add_pipe(sbd_component, before='parser')
151 | 
152 | df['sents'], df['sections'] = zip(*df.text.apply(process_note))
153 | df['mod_text'] = df['sections'].apply(lambda x: '\n'.join(x))
154 | 
155 | tokens = []
156 | for i in df.mod_text:
157 |     tokens.append(tokenizer.tokenize(i))
158 | df['toks'] = tokens
159 | df['num_toks'] = df.toks.apply(len)
160 | df = df[(df.num_toks > 0)]
161 | 
162 | def tokenize_sents(x):
163 |     return [len(tokenizer.tokenize(i)) for i in x]
164 | 
165 | df['sent_toks_lens'] = df['sents'].apply(lambda x: tokenize_sents(x)) #length of each sent
166 | 
167 | # sentences could be composed of weird characters, that have length >= 1
168 | # but when tokenized, they are dropped, resulting in empty sentences
169 | def drop_bad_sents(x):
170 |     i=0
171 |     while i<len(x.sent_toks_lens):
172 |         if x.sent_toks_lens[i] == 0:
173 |             x.sent_toks_lens.pop(i)
174 |             x.sents.pop(i)
175 |             i -=1
176 |         i+=1
177 | 
178 | #none of the sentences are empty
179 | #assert(df.sent_toks_lens.apply(lambda x: sum([i == 0 for i in x])).sum() == 0)
180 | 
181 | df2 = df.loc[df.sent_toks_lens.apply(lambda x: sum([i == 0 for i in x])) > 0]
182 | df2.apply(drop_bad_sents, axis = 1) #modifies sentence list in place
183 | 
184 | df.to_pickle(args.output_loc)
185 | 


--------------------------------------------------------------------------------
/scripts/analyze_results.py:
--------------------------------------------------------------------------------
  1 | #!/h/haoran/anaconda3/bin/python
  2 | import sys
  3 | import os
  4 | sys.path.append(os.getcwd())
  5 | import pandas as pd
  6 | import numpy as np
  7 | import argparse
  8 | import Constants
  9 | import json
 10 | from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score,\
 11 |     log_loss, precision_score, confusion_matrix, recall_score, f1_score
 12 | import re
 13 | from tqdm import tqdm
 14 | import pickle
 15 | import hashlib
 16 | import warnings
 17 | warnings.filterwarnings('ignore') 
 18 | hex_characters = '0123456789abcdef'
 19 | 
 20 | parser = argparse.ArgumentParser('''Goes through the folder where finetuned models are stored, outputs an excel file to each model folder,
 21 |                                     with various performance and fairness metrics.''')
 22 | parser.add_argument("--models_path",help = 'Root folder where finetuned models are stored. Each model should consist of several folders, each representing a fold', type=str)
 23 | parser.add_argument('--set_to_use', default = 'test', const = 'test', nargs = '?', choices = ['val', 'test'])
 24 | parser.add_argument('--overwrite', help = 'whether or not to overwrite existing excel files, or ignore them', action = 'store_true')
 25 | parser.add_argument('--bootstrap', action = 'store_true', help = 'whether to use bootstrap')
 26 | parser.add_argument('--bootstrap_samples', type = int, default = 1000, help = 'how many samples to bootstrap')
 27 | parser.add_argument('--save_folds', action = 'store_true', help = 'whether to output folds in excel file')
 28 | parser.add_argument('--task_split', default = 'all', const='all', nargs = '?', choices = list(hex_characters) + ['all'] )
 29 | args = parser.parse_args()
 30 | 
 31 | protected_groups = ['insurance', 'gender', 'ethnicity_to_use', 'language_to_use']
 32 | Constants.drop_groups['insurance'] += ['Government']
 33 | set_to_use = args.set_to_use
 34 | mapping = Constants.mapping
 35 | 
 36 | def compute_opt_thres(target, pred):
 37 |     opt_thres = 0
 38 |     opt_f1 = 0
 39 |     for i in np.arange(0.05, 0.9, 0.01):
 40 |         f1 = f1_score(target, pred >= i)
 41 |         if f1 >= opt_f1:
 42 |             opt_thres = i
 43 |             opt_f1 = f1
 44 |     return opt_thres
 45 | 
 46 | def stratified_sample(df, column, N):
 47 |     grp = df.groupby(column, group_keys = False)
 48 |     return grp.apply(lambda x: x.sample(n = int(np.rint(N*len(x)/len(df))), replace = True)).sample(frac=1).reset_index(drop = True)
 49 | 
 50 | def read_pickle_preds(df, merged_preds, key):
 51 |     temp = pd.DataFrame.from_dict(merged_preds, orient = 'index').reset_index().rename(columns = {'index': 'note_id', 0: 'pred'})
 52 |     temp = pd.merge(temp, df[['note_id','fold']], on = 'note_id', how = 'left')
 53 | 
 54 |     def fold_transform(x):
 55 |         if x == 'test': return x
 56 |         elif x in key['fold_id']: return 'val'
 57 |         else: return 'train'
 58 |     temp['fold'] = temp['fold'].apply(fold_transform)
 59 |     return temp
 60 | 
 61 | def compute_p_from_bootstrap(values):
 62 |     values = np.array(values)
 63 |     values = values[~pd.isnull(values)]
 64 |     pos_p = (values <= 0).sum()/len(values) 
 65 |     neg_p = (values >= 0).sum()/len(values)
 66 |     # choose hypothesis that gap is <> 0 based on whichever one is closer
 67 |     return min([neg_p, pos_p]), 1 if pos_p < neg_p else -1
 68 | 
 69 | def analyze_results(path):
 70 |     outfile_name = os.path.join(path, 'results.xlsx')
 71 |     key = json.load(open(os.path.join(path, 'argparse_args.json'), 'r'))
 72 |     df = pd.read_pickle(key['df_path'])
 73 |     target = key['target_col_name']
 74 |     task_type = key['task_type']
 75 | 
 76 |     if 'note_id' not in df.columns:
 77 |         df = df.reset_index()
 78 |     preds = read_pickle_preds(df, pickle.load(open(os.path.join(key['output_dir'], 'preds.pkl'), 'rb')), key)
 79 | 
 80 |     if target in ('insurance_enc', 'gender_enc', 'ethnicity_to_use_enc', 'language_to_use_enc'):
 81 |         prop_name = re.findall(r'(.*)_enc', target)[0]
 82 |         labels = []
 83 |         for i in np.sort(np.unique(df[target])):
 84 |             for idx,m in mapping[prop_name].items():
 85 |                 if m == i:
 86 |                     labels.append(idx)
 87 |                     break
 88 |     else:
 89 |         prop_name = None
 90 |         labels = None
 91 | 
 92 |     cols_in_output = [k for k in ['all'] + protected_groups if k != prop_name]
 93 |     result_dfs = {i: pd.DataFrame(columns = ['avg']) for i in cols_in_output}
 94 | 
 95 |     temp = pd.merge(preds, df[['note_id',target]+ protected_groups], on = 'note_id', how = 'left')
 96 |     val = temp[temp['fold'] == 'val']
 97 |     thres = compute_opt_thres(val[target], val['pred'])
 98 |     temp = temp[temp['fold'] == set_to_use]
 99 | 
100 |     if args.bootstrap:
101 |         for s in tqdm(range(1, args.bootstrap_samples+1)):
102 |             df_sample = stratified_sample(temp, target, len(temp))
103 |             calc_metrics(result_dfs, cols_in_output, df_sample, labels, prop_name, s, task_type, target, df, thres)
104 |     else:
105 |         calc_metrics(result_dfs, cols_in_output, temp, labels, prop_name, 1, task_type, target, df, thres)
106 | 
107 |     for key, result_df in result_dfs.items():
108 |         if args.bootstrap:
109 |             for idx, row in result_df.iterrows():
110 |                 values = [row['fold_%s'%i] for i in range(1, args.bootstrap_samples+1)]
111 |                 result_df.loc[idx, 'avg'] = np.nanmean(values)
112 |                 result_df.loc[idx, 'std'] = np.nanstd(values, ddof = 1)
113 |                 errors = np.nanmean(values) - values
114 |                 result_df.loc[idx, '2.5%'] = result_df.loc[idx, 'avg'] - np.nanpercentile(errors, 97.5)
115 |                 result_df.loc[idx, '97.5%'] = result_df.loc[idx, 'avg'] - np.nanpercentile(errors, 2.5)
116 |                 if 'gap' in idx:
117 |                     p, direction = compute_p_from_bootstrap(values)
118 |                     result_df.loc[idx, 'favor'] = direction
119 |                     result_df.loc[idx, 'p'] = p
120 |         else:
121 |             for idx, row in result_df.iterrows():
122 |                 values = [row['fold_1']]
123 |                 result_df.loc[idx, 'avg'] = np.nanmean(values)
124 | 
125 |     # add threshold as separate sheet
126 |     result_dfs['thres'] = pd.DataFrame([thres])
127 | 
128 |     with pd.ExcelWriter(outfile_name) as writer:
129 |         for i in result_dfs:
130 |             if args.bootstrap and i !='thres':
131 |                 if args.save_folds:
132 |                     result_dfs[i].to_excel(writer, sheet_name = i)
133 |                 else:
134 |                     if i == 'all':
135 |                         result_dfs[i][['avg','2.5%','97.5%','std']].to_excel(writer, sheet_name = i)
136 |                     else:
137 |                         result_dfs[i][['avg','2.5%','97.5%','std', 'favor', 'p']].to_excel(writer, sheet_name = i)
138 |             else:
139 |                 result_dfs[i].to_excel(writer, sheet_name = i)
140 | 
141 | def calc_metrics(result_dfs, cols_in_output, df_fold, labels, prop_name, c, task_type, target, df, thres):
142 |     for g in cols_in_output:
143 |         if g != 'all':
144 |             df_fold = df_fold[~df_fold[g].isin(Constants.drop_groups[g])]
145 |             refined_mapping = {i:j for i,j in mapping[g].items() if i not in Constants.drop_groups[g]}
146 | 
147 |         if task_type == 'binary':
148 |             if g == 'all':
149 |                 calc_binary(df_fold, result_dfs[g], c, 'all', target, thres, None, prop_name, labels)
150 |             else:
151 |                 for j in refined_mapping:
152 |                     q = '%s=="%s"'%(g, j)
153 |                     calc_binary(df_fold.query(q), result_dfs[g], c, q,target, thres, result_dfs['all'],prop_name, labels)
154 | 
155 |                 for a,b in {'pred_prevalence': 'dgap', 'recall': 'egap_positive', 'specificity': 'egap_negative'}.items(): #computes gap_max for each group
156 |                     df_fold_gap = result_dfs[g].loc[result_dfs[g].index.str.endswith(a), 'fold_%s'%c]
157 |                     for j in refined_mapping:
158 |                         q = '%s=="%s"'%(g, j)
159 |                         curnum = df_fold_gap[df_fold_gap.index.str.startswith(q)].iloc[0]
160 |                         diffs = [curnum - i for i in df_fold_gap[~df_fold_gap.index.str.startswith(q)]]
161 |                         maxDiffIdx = np.abs(diffs).argmax()
162 |                         result_dfs[g].loc['%s_%s_max'%(q,b),'fold_%s'%c] = diffs[maxDiffIdx]
163 |         else:
164 |             raise Exception("Invalid task type!")
165 | 
166 | 
167 | def calc_binary(temp, result_df, fold_id, prefix, target, thres, all_df = None, prop_name = None, labels = None):
168 |     metrics = {}
169 |     if temp.shape[0] == 0:
170 |         return metrics
171 |     if len(np.unique(temp[target])) > 1:
172 |         metrics['auroc'] = roc_auc_score(temp[target], temp['pred'])
173 |     metrics['precision'] = precision_score(temp[target], temp['pred'] >= thres)
174 |     metrics['recall'] = recall_score(temp[target], temp['pred'] >= thres)
175 |     metrics['auprc'] = average_precision_score(temp[target], temp['pred'])
176 |     metrics['log_loss'] = log_loss(temp[target], temp['pred'], labels = [0, 1])
177 |     metrics['acc'] = accuracy_score(temp[target], temp['pred'] >= thres)
178 |     CM = confusion_matrix(temp[target], temp['pred'] >= thres, labels = [0, 1])
179 |     metrics['TN'] = CM[0][0]
180 |     metrics['FN'] = CM[1][0]
181 |     metrics['TP'] = CM[1][1]
182 |     metrics['FP'] = CM[0][1]
183 |     metrics['class_true_count'] = (temp[target] == 1).sum()
184 |     metrics['class_false_count']= (temp[target] == 0).sum()
185 |     metrics['specificity'] = float(CM[0][0])/(CM[0][0] + CM[0][1]) if metrics['class_false_count'] > 0 else 0
186 |     metrics['pred_true_count'] = ((temp['pred'] >= thres) == 1).sum()
187 |     metrics['nsamples'] = len(temp)
188 |     metrics['pred_prevalence']= metrics['pred_true_count'] /float(len(temp))
189 |     metrics['actual_prevalence'] = metrics['class_true_count']/ float(len(temp))
190 | 
191 |     for i,m in metrics.items():
192 |         result_df.loc['%s_%s'%(prefix,i), 'fold_%s'%fold_id] = m
193 | 
194 | def hash(x):
195 |     return hashlib.md5(x.encode()).hexdigest()
196 | 
197 | for folder in os.scandir(args.models_path):
198 |     if (folder.is_dir()) and ((not any([filename.endswith('.xlsx') for filename in os.listdir(folder.path)])) or args.overwrite):
199 |         if os.path.exists(os.path.join(folder.path, 'rough_preds.pkl')):
200 |             os.rename(os.path.join(folder.path, 'rough_preds.pkl'), os.path.join(folder.path, 'preds.pkl'))
201 |         if os.path.exists(os.path.join(folder.path, 'preds.pkl')):
202 |             if args.task_split == 'all' or (hash(folder.path)[0] == args.task_split):
203 |                 print('Starting %s' % folder.path)
204 |                 analyze_results(folder.path)
205 |                 print('Finished %s' % folder.path)
206 |         else:
207 |             print('Skipping incomplete %s' % folder.path)
208 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/scripts/heuristic_tokenize.py:
--------------------------------------------------------------------------------
  1 | # NOTE: this code is taken directly from Willie Boag's mimic-tokenize github repository 
  2 | # https://github.com/wboag/mimic-tokenize/blob/master/heuristic-tokenize.py commit e953d271bbb4c53aee5cc9a7b8be870a6b007604
  3 | # The code was modified in two ways:
  4 | # (1) to make the script compatible with Python 3
  5 | # (2) to remove the main and discharge_tokenize methods which we don't directly use
  6 | 
  7 | # There are two known issues with this code. We have not yet fixed them because we want to maintain the reproducibility of 
  8 | # our code for the work that was published. However, anyone wanting to extend this work should make the following changes:
  9 | # (1) fix a bug on line #168 where . should be replaced with \. i.e. should be `while re.search('\n\s*%d\.'%n,segment):`
 10 | # (2) add else statement (`else: new_segments.append(segments[i])`) to the if statement at line 287  
 11 | #     `if (i == N-1) or is_title(segments[i+1]):`
 12 | 
 13 | 
 14 | 
 15 | import sys
 16 | import re, nltk
 17 | import os
 18 | 
 19 | def strip(s):   
 20 |     return s.strip()
 21 | 
 22 | def is_inline_title(text):
 23 |     m = re.search('^([a-zA-Z ]+:) ', text)
 24 |     if not m:
 25 |         return False
 26 |     return is_title(m.groups()[0])
 27 | 
 28 | stopwords = set(['of', 'on', 'or'])
 29 | def is_title(text):
 30 |     if not text.endswith(':'):
 31 |         return False
 32 |     text = text[:-1]
 33 | 
 34 |     # be a little loose here... can tighten if it causes errors
 35 |     text = re.sub('(\([^\)]*?\))', '', text)
 36 | 
 37 |     # Are all non-stopwords capitalized?
 38 |     for word in text.split():
 39 |         if word in stopwords: continue
 40 |         if not word[0].isupper():
 41 |             return False
 42 | 
 43 |     # I noticed this is a common issue (non-title aapears at beginning of line)
 44 |     if text == 'Disp':
 45 |         return False
 46 | 
 47 |     # optionally: could assert that it is less than 6 tokens
 48 |     return True
 49 | 
 50 | 
 51 | 
 52 | def sent_tokenize_rules(text):
 53 | 
 54 |     # long sections are OBVIOUSLY different sentences
 55 |     text = re.sub('---+', '\n\n-----\n\n', text)
 56 |     text = re.sub('___+', '\n\n_____\n\n', text)
 57 |     text = re.sub('\n\n+', '\n\n', text)
 58 | 
 59 |     segments = text.split('\n\n')
 60 | 
 61 |     # strategy: break down segments and chip away structure until just prose.
 62 |     #           once you have prose, use nltk.sent_tokenize()
 63 | 
 64 |     ### Separate section headers ###
 65 |     new_segments = []
 66 | 
 67 |     # deal with this one edge case (multiple headers per line) up front
 68 |     m1 = re.match('(Admission Date:) (.*) (Discharge Date:) (.*)', segments[0])
 69 |     if m1:
 70 |         new_segments += list(map(strip,m1.groups()))
 71 |         segments = segments[1:]
 72 | 
 73 |     m2 = re.match('(Date of Birth:) (.*) (Sex:) (.*)'            , segments[0])
 74 |     if m2:
 75 |         new_segments += list(map(strip,m2.groups()))
 76 |         segments = segments[1:]
 77 | 
 78 |     for segment in segments:
 79 |         # find all section headers
 80 |         possible_headers  = re.findall('\n([A-Z][^\n:]+:)', '\n'+segment)
 81 |         #assert len(possible_headers) < 2, str(possible_headers)
 82 |         headers = []
 83 |         for h in possible_headers:
 84 |             #print 'cand=[%s]' % h
 85 |             if is_title(h.strip()):
 86 |                 #print '\tYES=[%s]' % h
 87 |                 headers.append(h.strip())
 88 | 
 89 |         # split text into new segments, delimiting on these headers
 90 |         for h in headers:
 91 |             h = h.strip()
 92 | 
 93 |             # split this segment into 3 smaller segments
 94 |             ind = segment.index(h)
 95 |             prefix = segment[:ind].strip()
 96 |             rest   = segment[ ind+len(h):].strip()
 97 | 
 98 |             # add the prefix (potentially empty)
 99 |             if len(prefix) > 0:
100 |                 new_segments.append(prefix.strip())
101 | 
102 |             # add the header
103 |             new_segments.append(h)
104 | 
105 |             # remove the prefix from processing (very unlikely to be empty)
106 |             segment = rest.strip()
107 | 
108 |         # add the final piece (aka what comes after all headers are processed)
109 |         if len(segment) > 0:
110 |             new_segments.append(segment.strip())
111 | 
112 |     # copy over the new list of segments (further segmented than original segments)
113 |     segments = list(new_segments)
114 |     new_segments = []
115 | 
116 | 
117 |     ### Low-hanging fruit: "_____" is a delimiter
118 |     for segment in segments:
119 |         subsections = segment.split('\n_____\n')
120 |         new_segments.append(subsections[0])
121 |         for ss in subsections[1:]:
122 |             new_segments.append('_____')
123 |             new_segments.append(ss)
124 | 
125 |     segments = list(new_segments)
126 |     new_segments = []
127 | 
128 | 
129 |     ### Low-hanging fruit: "-----" is a delimiter
130 |     for segment in segments:
131 |         subsections = segment.split('\n-----\n')
132 |         new_segments.append(subsections[0])
133 |         for ss in subsections[1:]:
134 |             new_segments.append('-----')
135 |             new_segments.append(ss)
136 | 
137 |     segments = list(new_segments)
138 |     new_segments = []
139 | 
140 |     '''
141 |     for segment in segments:
142 |         print '------------START------------'
143 |         print segment
144 |         print '-------------END-------------'
145 |         print
146 |     exit()
147 |     '''
148 | 
149 |     ### Separate enumerated lists ###
150 |     for segment in segments:
151 |         if not re.search('\n\s*\d+\.', '\n'+segment): 
152 |             new_segments.append(segment)
153 |             continue
154 | 
155 |         '''
156 |         print '------------START------------'
157 |         print segment
158 |         print '-------------END-------------'
159 |         print
160 |         '''
161 | 
162 |         # generalizes in case the list STARTS this section
163 |         segment = '\n'+segment
164 | 
165 |         # determine whether this segment contains a bulleted list (assumes i,i+1,...,n)
166 |         start = int(re.search('\n\s*(\d+)\.', '\n'+segment).groups()[0])
167 |         n = start
168 |         while re.search('\n\s*%d\.'%n,segment): # SHOULD CHANGE TO: while re.search('\n\s*%d\.'%n,segment): #(CHANGED . to \.)
169 |             n += 1
170 |         n -= 1
171 | 
172 |         # no bulleted list
173 |         if n < 1:
174 |             new_segments.append(segment)
175 |             continue
176 | 
177 |         '''
178 |         print '------------START------------'
179 |         print segment
180 |         print '-------------END-------------'
181 |         print start,n
182 |         print 
183 |         '''
184 | 
185 |         # break each list into its own line
186 |         # challenge: not clear how to tell when the list ends if more text happens next
187 |         for i in range(start,n+1):
188 |             matching_text = re.search('(\n\s*\d+\.)',segment).groups()[0] 
189 |             prefix  = segment[:segment.index(matching_text) ].strip()
190 |             segment = segment[ segment.index(matching_text):].strip()
191 |             if len(prefix)>0:
192 |                 new_segments.append(prefix)
193 |             
194 |         if len(segment)>0:
195 |             new_segments.append(segment)
196 | 
197 |     segments = list(new_segments)
198 |     new_segments = []
199 | 
200 |     '''
201 |         TODO: Big Challenge
202 |         There is so much variation in what makes a list. Intuitively, I can tell it's a
203 |         list because it shows repeated structure (often following a header)
204 |         Examples of some lists (with numbers & symptoms changed around to noise)
205 |             Past Medical History:
206 |             -- Hyperlipidemia
207 |             -- lactose intolerance
208 |             -- Hypertension
209 |             Physical Exam:
210 |             Vitals - T 82.2 BP 123/23 HR 73 R 21 75% on 2L NC
211 |             General - well appearing male, sitting up in chair in NAD
212 |             Neck - supple, JVP elevated to angle of jaw 
213 |             CV - distant heart sounds, RRR, faint __PHI_43__ murmur at
214 |             Labs:
215 |             __PHI_10__ 12:00PM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
216 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
217 |             __PHI_14__ 04:54AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
218 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
219 |             __PHI_23__ 03:33AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
220 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
221 |             __PHI_109__ 03:06AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
222 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
223 |             __PHI_1__ 05:09AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
224 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
225 |             __PHI_26__ 04:53AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
226 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
227 |             __PHI_301__ 05:30AM BLOOD WBC-8.8 RBC-8.88* Hgb-88.8* Hct-88.8*
228 |             MCV-88 MCH-88.8 MCHC-88.8 RDW-88.8* Plt Ct-888
229 |             Medications on Admission:
230 |             Allopurinol 100 mg DAILY
231 |             Aspirin 250 mg DAILY
232 |             Atorvastatin 10 mg DAILY
233 |             Glimepiride 1 mg once a week.
234 |             Hexavitamin DAILY
235 |             Lasix 50mg M-W-F; 60mg T-Th-Sat-Sun
236 |             Metoprolol 12.5mg TID
237 |             Prilosec OTC 20 mg once a day
238 |             Verapamil 120 mg SR DAILY
239 |     '''
240 | 
241 |     ### Remove lines with inline titles from larger segments (clearly nonprose)
242 |     for segment in segments:
243 |         '''
244 |         With: __PHI_6__, MD __PHI_5__
245 |         Building: De __PHI_45__ Building (__PHI_32__ Complex) __PHI_87__
246 |         Campus: WEST
247 |         '''
248 | 
249 |         lines = segment.split('\n')
250 | 
251 |         buf = []
252 |         for i in range(len(lines)):
253 |             if is_inline_title(lines[i]):
254 |                 if len(buf) > 0:
255 |                     new_segments.append('\n'.join(buf))
256 |                 buf = []
257 |             buf.append(lines[i])
258 |         if len(buf) > 0:
259 |             new_segments.append('\n'.join(buf))
260 | 
261 |     segments = list(new_segments)
262 |     new_segments = []
263 | 
264 | 
265 |     # Going to put one-liner answers with their sections 
266 |     # (aka A A' B B' C D D' -->  AA' BB' C DD' )
267 |     N = len(segments)
268 |     for i in range(len(segments)):
269 |         # avoid segfaults
270 |         if i==0: 
271 |             new_segments.append(segments[i])
272 |             continue
273 | 
274 |         if  segments[i].count('\n') == 0  and \
275 |                is_title(segments[i-1]) and \
276 |            not is_title(segments[i  ]):
277 |             if (i == N-1) or is_title(segments[i+1]):
278 |                 new_segments = new_segments[:-1]
279 |                 new_segments.append(segments[i-1] + ' ' + segments[i])
280 |             else: new_segments.append(segments[i]) #ADD TO FIX BUG
281 |             # currently If the code sees a segment that doesn't have any new lines and the prior line is a title 
282 |             # *but* it is not the last segment and the next segment is not a title then that segment is just dropped
283 |             # so lists that have a title header will lose their first entry
284 |         else:
285 |             new_segments.append(segments[i])
286 | 
287 |     segments = list(new_segments)
288 |     new_segments = []
289 | 
290 |     '''
291 |         Should do some kind of regex to find "TEST: value" in segments?
292 |             Indication: Source of embolism.
293 |             BP (mm Hg): 145/89
294 |             HR (bpm): 80
295 |         Note: I made a temporary hack that fixes this particular problem. 
296 |               We'll see how it shakes out
297 |     '''
298 | 
299 | 
300 |     '''
301 |         Separate ALL CAPS lines (Warning... is there ever prose that can be all caps?)
302 |     '''
303 | 
304 | 
305 | 
306 | 
307 |     return segments
308 | 


--------------------------------------------------------------------------------
/notebooks/GetBasePrevs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 14,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import os\n",
 10 |     "import sys\n",
 11 |     "import pandas as pd\n",
 12 |     "import nltk\n",
 13 |     "from tqdm.auto import tqdm\n",
 14 |     "tqdm.pandas()"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 15,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "att = pd.read_csv('/h/haoran/projects/HurtfulWords/fill_in_blanks_examples/attributes.csv')"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 3,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "df = pd.read_pickle('/h/haoran/projects/BERT_DeBias/data/df_extract.pkl')"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {},
 39 |    "outputs": [
 40 |     {
 41 |      "data": {
 42 |       "text/plain": [
 43 |        "Index(['category', 'chartdate', 'charttime', 'hadm_id', 'note_id', 'text',\n",
 44 |        "       'subject_id', 'gender', 'dob', 'dod', 'fold', 'insurance', 'language',\n",
 45 |        "       'religion', 'marital_status', 'ethnicity', 'admittime', 'deathtime',\n",
 46 |        "       'dischtime', 'hospital_expire_flag', 'discharge_location', 'adm_diag',\n",
 47 |        "       'dod_merged', 'ethnicity_to_use', 'age_bin', '24h_mort', '48h_mort',\n",
 48 |        "       '1mo_mort', '1yr_mort', '24h_disch', '48h_disch', 'die_in_hosp',\n",
 49 |        "       'icd9_code', 'sents', 'sections', 'mod_text', 'toks', 'num_toks',\n",
 50 |        "       'sent_toks_lens', 'icustay_id', 'age', 'oasis', 'oasis_prob', 'sofa',\n",
 51 |        "       'sapsii', 'sapsii_prob', 'language_to_use'],\n",
 52 |        "      dtype='object')"
 53 |       ]
 54 |      },
 55 |      "execution_count": 4,
 56 |      "metadata": {},
 57 |      "output_type": "execute_result"
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "df.columns"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 5,
 67 |    "metadata": {},
 68 |    "outputs": [
 69 |     {
 70 |      "data": {
 71 |       "text/plain": [
 72 |        "Nursing/other        821230\n",
 73 |        "Radiology            378920\n",
 74 |        "Nursing              220383\n",
 75 |        "Physician            139763\n",
 76 |        "ECG                  138160\n",
 77 |        "Discharge summary     59652\n",
 78 |        "Echo                  34036\n",
 79 |        "Respiratory           31629\n",
 80 |        "Nutrition              9361\n",
 81 |        "General                8144\n",
 82 |        "Rehab Services         5386\n",
 83 |        "Social Work            2603\n",
 84 |        "Case Management         940\n",
 85 |        "Pharmacy                100\n",
 86 |        "Consult                  98\n",
 87 |        "Name: category, dtype: int64"
 88 |       ]
 89 |      },
 90 |      "execution_count": 5,
 91 |      "metadata": {},
 92 |      "output_type": "execute_result"
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "df.category.value_counts()"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 6,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "df = df[df.category == 'Discharge summary']"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 7,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "application/vnd.jupyter.widget-view+json": {
116 |        "model_id": "3090502873e14d6590aa1da7d715d41b",
117 |        "version_major": 2,
118 |        "version_minor": 0
119 |       },
120 |       "text/plain": [
121 |        "HBox(children=(IntProgress(value=0, max=59652), HTML(value='')))"
122 |       ]
123 |      },
124 |      "metadata": {},
125 |      "output_type": "display_data"
126 |     },
127 |     {
128 |      "name": "stdout",
129 |      "output_type": "stream",
130 |      "text": [
131 |       "\n"
132 |      ]
133 |     }
134 |    ],
135 |    "source": [
136 |     "df['toks'] = df['text'].progress_apply(lambda x: nltk.word_tokenize(x))"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 8,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "df['toks'] = df['toks'].apply(lambda x: list(map(lambda y: y.lower(), x)))"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 9,
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "data": {
155 |       "text/plain": [
156 |        "(59652, 47)"
157 |       ]
158 |      },
159 |      "execution_count": 9,
160 |      "metadata": {},
161 |      "output_type": "execute_result"
162 |     }
163 |    ],
164 |    "source": [
165 |     "df.shape"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 10,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "for i in att.category.unique():\n",
175 |     "    df[i] = False"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 11,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "def cont(A, B): \n",
185 |     "    n = len(A) \n",
186 |     "    return any(A == B[i:i + n] for i in range(len(B)-n + 1)) "
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 12,
192 |    "metadata": {},
193 |    "outputs": [
194 |     {
195 |      "name": "stdout",
196 |      "output_type": "stream",
197 |      "text": [
198 |       "['a', 'low', 'pain', 'drug']\n",
199 |       "['paracetamol']\n",
200 |       "['nsaid']\n",
201 |       "['aspirin']\n",
202 |       "['advil']\n",
203 |       "['a', 'medium', 'pain', 'drug']\n",
204 |       "['codeine']\n",
205 |       "['tramadol']\n",
206 |       "['propoxyphene']\n",
207 |       "['buprenorphine']\n",
208 |       "['a', 'high', 'pain', 'drug']\n",
209 |       "['morphine']\n",
210 |       "['fentanyl']\n",
211 |       "['methadone']\n",
212 |       "['a', 'high', 'dosage', 'of', 'pain', 'killer']\n",
213 |       "['cvd']\n",
214 |       "['cad']\n",
215 |       "['chf']\n",
216 |       "['coronary', 'artery', 'disease']\n",
217 |       "['coronary', 'heart', 'disease']\n",
218 |       "['congestive', 'heart', 'failure']\n",
219 |       "['heart', 'failure']\n",
220 |       "['heart', 'attacks']\n",
221 |       "['heart', 'problems']\n",
222 |       "['cardiovascular', 'disease']\n",
223 |       "['htn']\n",
224 |       "['hypertension']\n",
225 |       "['high', 'blood', 'pressure']\n",
226 |       "['abnormally', 'high', 'bp']\n",
227 |       "['high', 'bp']\n",
228 |       "['pulmonary', 'artery', 'systolic', 'hypertension']\n",
229 |       "['diabetes']\n",
230 |       "['diabetes', 'mellitus']\n",
231 |       "['dnr']\n",
232 |       "['dni']\n",
233 |       "['etoh']\n",
234 |       "['heroin']\n",
235 |       "['marijuana']\n",
236 |       "['alcohol']\n",
237 |       "['suicidal', 'tendencies']\n",
238 |       "['schizophrenia']\n",
239 |       "['depression']\n",
240 |       "['anxiety']\n",
241 |       "['hallucinations']\n",
242 |       "['hiv']\n",
243 |       "['aids']\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "for ind, i in att.iterrows():\n",
249 |     "    t = nltk.word_tokenize(i['attribute'])\n",
250 |     "    df[i['category']] = df[i['category']] | (df['toks'].apply(lambda x: cont(t, x)))\n",
251 |     "    print(t)"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 13,
257 |    "metadata": {},
258 |    "outputs": [
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "DRUG 36604\n",
264 |       "CVD 26719\n",
265 |       "HTN 35317\n",
266 |       "DIAB 15298\n",
267 |       "DNR 1103\n",
268 |       "ADD 31691\n",
269 |       "MENT 15025\n",
270 |       "HIV 2344\n"
271 |      ]
272 |     }
273 |    ],
274 |    "source": [
275 |     "for i in att.category.unique():\n",
276 |     "    print(i, df[i].sum())"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 14,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "temp = df.groupby('subject_id').agg({i:any for i in att.category.unique()}).reset_index()"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 15,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "gender_map = df.set_index('subject_id')['gender'].to_dict()"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 16,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "temp['gender'] = temp['subject_id'].map(gender_map)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 16,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "males = temp[temp.gender == 'M']\n",
313 |     "females = temp[temp.gender == 'F']"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 21,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "d = []\n",
323 |     "for i in att.category.unique():\n",
324 |     "    total = temp[i].sum()\n",
325 |     "    d.append((i, males[i].sum()/total, females[i].sum()/total))"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 4,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stdout",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "DRUG 27267\n",
338 |       "CVD 19422\n",
339 |       "HTN 26274\n",
340 |       "DIAB 10986\n",
341 |       "DNR 1056\n",
342 |       "ADD 24141\n",
343 |       "MENT 11539\n",
344 |       "HIV 1935\n"
345 |      ]
346 |     }
347 |    ],
348 |    "source": [
349 |     "for i in temp.columns:\n",
350 |     "    if i not in ['subject_id', 'gender']:\n",
351 |     "        print(i, temp[i].sum())"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 22,
357 |    "metadata": {},
358 |    "outputs": [
359 |     {
360 |      "data": {
361 |       "text/html": [
362 |        "<div>\n",
363 |        "<style scoped>\n",
364 |        "    .dataframe tbody tr th:only-of-type {\n",
365 |        "        vertical-align: middle;\n",
366 |        "    }\n",
367 |        "\n",
368 |        "    .dataframe tbody tr th {\n",
369 |        "        vertical-align: top;\n",
370 |        "    }\n",
371 |        "\n",
372 |        "    .dataframe thead th {\n",
373 |        "        text-align: right;\n",
374 |        "    }\n",
375 |        "</style>\n",
376 |        "<table border=\"1\" class=\"dataframe\">\n",
377 |        "  <thead>\n",
378 |        "    <tr style=\"text-align: right;\">\n",
379 |        "      <th></th>\n",
380 |        "      <th>Category</th>\n",
381 |        "      <th>Male Prev</th>\n",
382 |        "      <th>Female Prev</th>\n",
383 |        "    </tr>\n",
384 |        "  </thead>\n",
385 |        "  <tbody>\n",
386 |        "    <tr>\n",
387 |        "      <th>0</th>\n",
388 |        "      <td>DRUG</td>\n",
389 |        "      <td>0.569113</td>\n",
390 |        "      <td>0.430887</td>\n",
391 |        "    </tr>\n",
392 |        "    <tr>\n",
393 |        "      <th>1</th>\n",
394 |        "      <td>CVD</td>\n",
395 |        "      <td>0.586809</td>\n",
396 |        "      <td>0.413191</td>\n",
397 |        "    </tr>\n",
398 |        "    <tr>\n",
399 |        "      <th>2</th>\n",
400 |        "      <td>HTN</td>\n",
401 |        "      <td>0.558347</td>\n",
402 |        "      <td>0.441653</td>\n",
403 |        "    </tr>\n",
404 |        "    <tr>\n",
405 |        "      <th>3</th>\n",
406 |        "      <td>DIAB</td>\n",
407 |        "      <td>0.563353</td>\n",
408 |        "      <td>0.436647</td>\n",
409 |        "    </tr>\n",
410 |        "    <tr>\n",
411 |        "      <th>4</th>\n",
412 |        "      <td>DNR</td>\n",
413 |        "      <td>0.518939</td>\n",
414 |        "      <td>0.481061</td>\n",
415 |        "    </tr>\n",
416 |        "    <tr>\n",
417 |        "      <th>5</th>\n",
418 |        "      <td>ADD</td>\n",
419 |        "      <td>0.573796</td>\n",
420 |        "      <td>0.426204</td>\n",
421 |        "    </tr>\n",
422 |        "    <tr>\n",
423 |        "      <th>6</th>\n",
424 |        "      <td>MENT</td>\n",
425 |        "      <td>0.484097</td>\n",
426 |        "      <td>0.515903</td>\n",
427 |        "    </tr>\n",
428 |        "    <tr>\n",
429 |        "      <th>7</th>\n",
430 |        "      <td>HIV</td>\n",
431 |        "      <td>0.645995</td>\n",
432 |        "      <td>0.354005</td>\n",
433 |        "    </tr>\n",
434 |        "  </tbody>\n",
435 |        "</table>\n",
436 |        "</div>"
437 |       ],
438 |       "text/plain": [
439 |        "  Category  Male Prev  Female Prev\n",
440 |        "0     DRUG   0.569113     0.430887\n",
441 |        "1      CVD   0.586809     0.413191\n",
442 |        "2      HTN   0.558347     0.441653\n",
443 |        "3     DIAB   0.563353     0.436647\n",
444 |        "4      DNR   0.518939     0.481061\n",
445 |        "5      ADD   0.573796     0.426204\n",
446 |        "6     MENT   0.484097     0.515903\n",
447 |        "7      HIV   0.645995     0.354005"
448 |       ]
449 |      },
450 |      "execution_count": 22,
451 |      "metadata": {},
452 |      "output_type": "execute_result"
453 |     }
454 |    ],
455 |    "source": [
456 |     "pd.DataFrame(d, columns = ['Category','Male Prev','Female Prev'])"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": null,
462 |    "metadata": {},
463 |    "outputs": [],
464 |    "source": []
465 |   }
466 |  ],
467 |  "metadata": {
468 |   "kernelspec": {
469 |    "display_name": "Python 3",
470 |    "language": "python",
471 |    "name": "python3"
472 |   },
473 |   "language_info": {
474 |    "codemirror_mode": {
475 |     "name": "ipython",
476 |     "version": 3
477 |    },
478 |    "file_extension": ".py",
479 |    "mimetype": "text/x-python",
480 |    "name": "python",
481 |    "nbconvert_exporter": "python",
482 |    "pygments_lexer": "ipython3",
483 |    "version": "3.6.5"
484 |   }
485 |  },
486 |  "nbformat": 4,
487 |  "nbformat_minor": 2
488 | }
489 | 


--------------------------------------------------------------------------------
/scripts/make_targets.py:
--------------------------------------------------------------------------------
  1 | # code is a mess since it's mostly exported from a jupyter notebook
  2 | import os
  3 | import pandas as pd
  4 | import Constants
  5 | from sklearn.model_selection import KFold
  6 | from readers import PhenotypingReader, InHospitalMortalityReader
  7 | import yaml
  8 | from argparse import ArgumentParser
  9 | from pathlib import Path
 10 | 
 11 | parser = ArgumentParser()
 12 | parser.add_argument('--processed_df', type=Path, required=True)
 13 | parser.add_argument('--mimic_benchmark_dir', type = Path, required = True)
 14 | parser.add_argument('--output_dir', type = Path, required = True)
 15 | args = parser.parse_args()
 16 | 
 17 | def preprocessing(row):
 18 |     '''
 19 |     Input: a list of tokens
 20 |     Output: a list of string, with each string having MAX_SEQ_LEN-2 tokens
 21 |     Uses a sliding window approach, with the window sliding (SLIDING_DIST tokens) each time
 22 |     '''
 23 |     n = int(len(row.toks)/Constants.SLIDING_DIST)
 24 |     seqs = []
 25 |     if n == 0: # note shorter than SLIDING_DIST tokens
 26 |         seqs.append(' '.join(row.toks))
 27 |     else:
 28 |         for j in range(min(n, Constants.MAX_NUM_SEQ)):
 29 |             seqs.append(' '.join(row.toks[j*Constants.SLIDING_DIST:(j+1)*Constants.SLIDING_DIST+(Constants.MAX_SEQ_LEN - Constants.SLIDING_DIST-2)]))
 30 |     return seqs
 31 | 
 32 | df = pd.read_pickle(args.processed_df.resolve())
 33 | df['seqs'] = df.apply(preprocessing, axis = 1)
 34 | df['num_seqs'] = df.seqs.apply(len)
 35 | assert(df.seqs.apply(lambda x: any([len(i)== 0 for i in x])).sum() == 0)
 36 | 
 37 | df['note_id'] = df['note_id'].astype(str)
 38 | df = df[~pd.isnull(df['oasis'])]
 39 | MAX_AGG_SEQUENCE_LENGTH = Constants.MAX_AGG_SEQUENCE_LEN
 40 | 
 41 | root_folder = args.mimic_benchmark_dir/'root/'
 42 | other_features = ['age', 'oasis', 'oasis_prob', 'sofa', 'sapsii', 'sapsii_prob']
 43 | 
 44 | '''
 45 | In-Hospital Mortality
 46 | Using the first 48 hours of patient information within their ICU stay, predict whether or not the patient will die in hospital.
 47 | Subjects/targets are extracted by MIMIC-Benchmarks script. Their script only extracts numeric data, while we want to use only notes.
 48 | Their script also defines a new time scale, so that t=0 is when the patient first enters the ICU.
 49 | 
 50 | What we do is:
 51 | - Using the MIMIC-Benchmarks InHospitalMortalityReader, read in each patient, to get the target.
 52 | We know the period of interest will be 0-48 hours, where 0 is the intime to the ICU.
 53 | - For each patient, we obtain their icustay_id from the episode file in their data folder
 54 | - We obtain their (hadm_id, intime) from all_stays.csv using their icustay_id
 55 | - With this information, along with the 48 hour period length, we can index
 56 | into df to obtain a set of note_ids corresponding to that period
 57 | - To construct a training set for each individual, we take sequences from the
 58 | last k notes, until the patient runs out of notes, or we reach the max_agg_sequence_length.
 59 | - We only use sequences from the following note types: Nursing, Nursing/Other,
 60 | Physician
 61 | - We take the last k notes, because they are more likely to be informative of
 62 | the target, compared to the first notes
 63 | - We assign a new ID for this aggregated note, which is a combination of their
 64 | subject ID and episode number
 65 | '''
 66 | 
 67 | 
 68 | train_reader = InHospitalMortalityReader(dataset_dir=args.mimic_benchmark_dir/'in-hospital-mortality' / 'train')
 69 | test_reader = InHospitalMortalityReader(dataset_dir=args.mimic_benchmark_dir/'in-hospital-mortality' / 'test')
 70 | all_stays = pd.read_csv(os.path.join(root_folder, 'all_stays.csv'), parse_dates = ['INTIME']).set_index('ICUSTAY_ID')
 71 | 
 72 | def read_patient(name, period_length, allowed_types, eps = 0.001, dtype = 'train', return_intime = False):
 73 |     # given a file name, retrieve all notes from t=-eps to period_length+eps
 74 |     subj_id = int(name.split('_')[0])
 75 |     stay = pd.read_csv(os.path.join(root_folder, dtype, str(subj_id), name.split('_')[1]+'.csv'))
 76 |     assert(stay.shape[0] == 1)
 77 |     row = stay.iloc[0]
 78 | 
 79 |     icuid = row['Icustay']
 80 |     hadm_id = all_stays.loc[icuid]['HADM_ID']
 81 |     intime = all_stays.loc[icuid]['INTIME']
 82 |     result = df[(df['subject_id'] == subj_id) & (df['hadm_id'] == hadm_id)
 83 |        & (df['charttime'] >= intime) & (df['charttime'] < intime+pd.Timedelta(hours = period_length + eps))
 84 |              & (df['category'].isin(allowed_types))]
 85 |     if return_intime:
 86 |         return (intime, result)
 87 |     else:
 88 |         return result
 89 | 
 90 | def agg_notes(notes, first = False, intime = None, timeDiff = pd.Timedelta(hours = 48)):
 91 |     notes = notes.sort_values(by = 'charttime', ascending = False)
 92 |     seqs = []
 93 |     note_ids = []
 94 |     if first:
 95 |         note_to_take = None
 96 |         firstgood = notes[notes.category.isin(['Nursing', 'Physician '])]
 97 |         if firstgood.shape[0] > 0 and (firstgood.iloc[0]['charttime'] - intime) <= timeDiff:
 98 |             note_to_take = firstgood.iloc[0]
 99 |         elif (notes.iloc[0]['charttime'] - intime) <= timeDiff:
100 |             note_to_take = notes.iloc[0]
101 |         if note_to_take is not None:
102 |             seqs = note_to_take['seqs']
103 |             note_ids.append(note_to_take['note_id'])
104 | 
105 |     else:
106 |         for idx, row in notes.iterrows():
107 |             if len(seqs) + row.num_seqs <= MAX_AGG_SEQUENCE_LENGTH:
108 |                 seqs = row.seqs + seqs
109 |                 note_ids = [row.note_id] + note_ids
110 |     return {**{
111 |         'insurance': notes.iloc[0]['insurance'],
112 |         'gender': notes.iloc[0]['gender'],
113 |         'ethnicity_to_use': notes.iloc[0]['ethnicity_to_use'],
114 |         'language_to_use': notes.iloc[0]['language_to_use'],
115 |         'subject_id': notes.iloc[0]['subject_id'],
116 |         'hadm_id': notes.iloc[0]['hadm_id'],
117 |         'seqs': seqs,
118 |         'note_ids': note_ids,
119 |         'num_seqs': len(seqs),
120 |     }, **{i: notes.iloc[0][i] for i in other_features}}
121 | 
122 | temp = []
123 | for i in range(train_reader.get_number_of_examples()):
124 |     ex = train_reader.read_example(i)
125 |     notes = read_patient(ex['name'], 48, ['Nursing', 'Physician ', 'Nursing/other'])
126 |     if len(notes) > 0: #no notes of interest within first 48 hours
127 |         dic = agg_notes(notes)
128 |         dic['inhosp_mort'] = ex['y']
129 |         dic['note_id'] = ''.join(ex['name'].split('_')[:2]) + 'a'
130 |         dic['fold'] = 'train'
131 |         temp.append(dic)
132 | 
133 | for i in range(test_reader.get_number_of_examples()):
134 |     ex = test_reader.read_example(i)
135 |     notes = read_patient(ex['name'], 48, ['Nursing', 'Physician ', 'Nursing/other'], dtype = 'test')
136 |     if len(notes) > 0: #no notes of interest within first 48 hours
137 |         dic = agg_notes(notes)
138 |         dic['inhosp_mort'] = ex['y']
139 |         dic['note_id'] = ''.join(ex['name'].split('_')[:2])+ 'a'
140 |         dic['fold'] = 'test'
141 |         temp.append(dic)
142 | t2 = pd.DataFrame(temp)
143 | # split training set into folds, stratify by inhosp_mort
144 | subjects = t2.loc[t2['fold'] != 'test',['subject_id', 'inhosp_mort']].groupby('subject_id').first().reset_index()
145 | kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
146 | for c,j in enumerate(kf.split(subjects, groups = subjects['inhosp_mort'])):
147 |     for k in j[1]:
148 |         t2.loc[t2['subject_id'] == subjects.loc[k]['subject_id'], 'fold'] = str(c+1)
149 | t2.to_pickle(args.output_dir / 'inhosp_mort')
150 | 
151 | 
152 | '''
153 | Phenotyping using all patient notes
154 | - Using the MIMIC-Benchmarks PhenotypingReader, read in each patient, to get
155 | the targets and the period length (which is the length of stay). We know the period of interest will be 0 to los + $\epsilon$,
156 | where 0 is the intime to the ICU, and $\epsilon$ is a small number (so discharge notes are included).
157 | - We obtain (hadm_id, intime) usin the same method above
158 | - With this information, along with the los + $\epsilon$ hour period length, we
159 | can index into df to obtain a set of note_ids corresponding to that period
160 | - We construct sequences using the last k notes, in the same manner as above.
161 | - We only use sequences from the following note types: Nursing, Nursing/Other,
162 | Physician, Discharge Summary
163 | - We also add in the following targets, aggregated from the specific
164 | phenotypes: Any acute, Any chronic, Any disease
165 | '''
166 | 
167 | with open('../icd9_codes.yml', 'r') as f:
168 |     ccs = pd.DataFrame.from_dict(yaml.load(f)).T
169 | 
170 | target_names = list(pd.read_csv(os.path.join(root_folder, 'phenotype_labels.csv')).columns)
171 | acutes = [i for i in target_names if ccs.loc[i, 'type'] == 'acute']
172 | chronics = [i for i in target_names if ccs.loc[i, 'type'] == 'chronic']
173 | train_reader = PhenotypingReader(dataset_dir=args.mimic_benchmark_dir/'phenotyping' / 'train')
174 | test_reader = PhenotypingReader(dataset_dir=args.mimic_benchmark_dir/'phenotyping' / 'test')
175 | temp = []
176 | def has_any(dic, keys):
177 |     return any([dic[i] == 1 for i in keys])
178 | 
179 | for i in range(train_reader.get_number_of_examples()):
180 |     ex = train_reader.read_example(i)
181 |     notes = read_patient(ex['name'], float(ex['t']), ['Nursing', 'Physician ', 'Nursing/other', 'Discharge summary'])
182 |     if len(notes) > 0:
183 |         dic = agg_notes(notes)
184 |         for tar, y in zip(target_names, ex['y']):
185 |             dic[tar] = y
186 |         dic['any_acute'] = has_any(dic, acutes)
187 |         dic['any_chronic'] = has_any(dic, chronics)
188 |         dic['any_disease'] = has_any(dic, target_names)
189 | 
190 |         dic['note_id'] = ''.join(ex['name'].split('_')[:2]) + 'b'
191 |         dic['fold'] = 'train'
192 |         temp.append(dic)
193 | 
194 | for i in range(test_reader.get_number_of_examples()):
195 |     ex = test_reader.read_example(i)
196 |     notes = read_patient(ex['name'], float(ex['t']), ['Nursing', 'Physician ', 'Nursing/other', 'Discharge summary'], dtype = 'test')
197 |     if len(notes) > 0:
198 |         dic = agg_notes(notes)
199 |         for tar, y in zip(target_names, ex['y']):
200 |             dic[tar] = y
201 |         dic['any_acute'] = has_any(dic, acutes)
202 |         dic['any_chronic'] = has_any(dic, chronics)
203 |         dic['any_disease'] = has_any(dic, target_names)
204 | 
205 |         dic['note_id'] = ''.join(ex['name'].split('_')[:2]) + 'b'
206 |         dic['fold'] = 'test'
207 |         temp.append(dic)
208 | 
209 | cols = target_names + ['any_chronic', 'any_acute', 'any_disease']
210 | t3 = pd.DataFrame(temp)
211 | subjects = t3.loc[t3['fold'] != 'test',['subject_id', 'any_disease']].groupby('subject_id').first().reset_index()
212 | kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
213 | for c,j in enumerate(kf.split(subjects, groups = subjects['any_disease'])):
214 |     for k in j[1]:
215 |         t3.loc[t3['subject_id'] == subjects.loc[k]['subject_id'], 'fold'] = str(c+1)
216 | 
217 | t3.to_pickle(args.output_dir / 'phenotype_all')
218 | 
219 | '''
220 | Phenotyping using the first patient note
221 | - Using the MIMIC-Benchmarks PhenotypingReader, read in each patient, to get
222 | the targets and the period length (which is the length of stay). We know the period of interest will be 0 to los + $\epsilon$,
223 | where 0 is the intime to the ICU, and $\epsilon$ is a small number (so discharge notes are included).
224 | - We obtain (hadm_id, intime) usin the same method above
225 | - With this information, along with the los + $\epsilon$ hour period length, we
226 | can index into df. We take the first nursing or physician note within the first 48 hours of a person's stay.
227 | If this does not exist, we take the first nursing/other note within the first 48 hours.
228 | - If they do not have a nursing note within 48 hours of their intime, the
229 | patient is dropped.
230 | '''
231 | 
232 | train_reader = PhenotypingReader(dataset_dir=args.mimic_benchmark_dir/'phenotyping' / 'train')
233 | test_reader = PhenotypingReader(dataset_dir=args.mimic_benchmark_dir/'phenotyping' / 'test')
234 | temp = []
235 | for i in range(train_reader.get_number_of_examples()):
236 |     ex = train_reader.read_example(i)
237 |     intime, notes = read_patient(ex['name'], float(ex['t']), ['Nursing', 'Physician ', 'Nursing/other'], return_intime = True)
238 |     if len(notes) > 0:
239 |         dic = agg_notes(notes, first = True, intime = intime)
240 |         if len(dic['seqs']) == 0:
241 |             continue
242 |         for tar, y in zip(target_names, ex['y']):
243 |             dic[tar] = y
244 |         dic['any_acute'] = has_any(dic, acutes)
245 |         dic['any_chronic'] = has_any(dic, chronics)
246 |         dic['any_disease'] = has_any(dic, target_names)
247 | 
248 |         dic['note_id'] = dic['note_ids'][0]
249 |         del dic['note_ids']
250 |         dic['fold'] = 'train'
251 |         temp.append(dic)
252 | 
253 | for i in range(test_reader.get_number_of_examples()):
254 |     ex = test_reader.read_example(i)
255 |     intime, notes = read_patient(ex['name'], float(ex['t']), ['Nursing', 'Physician ', 'Nursing/other'], dtype = 'test', return_intime = True)
256 |     if len(notes) > 0:
257 |         dic = agg_notes(notes, first = True, intime = intime)
258 |         if len(dic['seqs']) == 0:
259 |             continue
260 |         for tar, y in zip(target_names, ex['y']):
261 |             dic[tar] = y
262 |         dic['any_acute'] = has_any(dic, acutes)
263 |         dic['any_chronic'] = has_any(dic, chronics)
264 |         dic['any_disease'] = has_any(dic, target_names)
265 | 
266 |         dic['note_id'] = dic['note_ids'][0]
267 |         del dic['note_ids']
268 |         dic['fold'] = 'test'
269 |         temp.append(dic)
270 | t4 = pd.DataFrame(temp)
271 | t4 = pd.merge(t4, df[['note_id', 'category']], on = 'note_id', how = 'left')
272 | subjects = t4.loc[t4['fold'] != 'test',['subject_id', 'any_disease']].groupby('subject_id').first().reset_index()
273 | kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
274 | for c,j in enumerate(kf.split(subjects, groups = subjects['any_disease'])):
275 |     for k in j[1]:
276 |         t4.loc[t4['subject_id'] == subjects.loc[k]['subject_id'], 'fold'] = str(c+1)
277 | t4.to_pickle(args.output_dir / 'phenotype_first')
278 | 


--------------------------------------------------------------------------------
/scripts/readers.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import print_function
  3 | 
  4 | import os
  5 | import numpy as np
  6 | import random
  7 | 
  8 | 
  9 | class Reader(object):
 10 |     def __init__(self, dataset_dir, listfile=None):
 11 |         self._dataset_dir = dataset_dir
 12 |         self._current_index = 0
 13 |         if listfile is None:
 14 |             listfile_path = os.path.join(dataset_dir, "listfile.csv")
 15 |         else:
 16 |             listfile_path = listfile
 17 |         with open(listfile_path, "r") as lfile:
 18 |             self._data = lfile.readlines()
 19 |         self._listfile_header = self._data[0]
 20 |         self._data = self._data[1:]
 21 | 
 22 |     def get_number_of_examples(self):
 23 |         return len(self._data)
 24 | 
 25 |     def random_shuffle(self, seed=None):
 26 |         if seed is not None:
 27 |             random.seed(seed)
 28 |         random.shuffle(self._data)
 29 | 
 30 |     def read_example(self, index):
 31 |         raise NotImplementedError()
 32 | 
 33 |     def read_next(self):
 34 |         to_read_index = self._current_index
 35 |         self._current_index += 1
 36 |         if self._current_index == self.get_number_of_examples():
 37 |             self._current_index = 0
 38 |         return self.read_example(to_read_index)
 39 | 
 40 | 
 41 | class DecompensationReader(Reader):
 42 |     def __init__(self, dataset_dir, listfile=None):
 43 |         """ Reader for decompensation prediction task.
 44 |         :param dataset_dir: Directory where timeseries files are stored.
 45 |         :param listfile:    Path to a listfile. If this parameter is left `None` then
 46 |                             `dataset_dir/listfile.csv` will be used.
 47 |         """
 48 |         Reader.__init__(self, dataset_dir, listfile)
 49 |         self._data = [line.split(',') for line in self._data]
 50 |         self._data = [(x, float(t), int(y)) for (x, t, y) in self._data]
 51 | 
 52 |     def _read_timeseries(self, ts_filename, time_bound):
 53 |         ret = []
 54 |         with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
 55 |             header = tsfile.readline().strip().split(',')
 56 |             assert header[0] == "Hours"
 57 |             for line in tsfile:
 58 |                 mas = line.strip().split(',')
 59 |                 t = float(mas[0])
 60 |                 if t > time_bound + 1e-6:
 61 |                     break
 62 |                 ret.append(np.array(mas))
 63 |         return (np.stack(ret), header)
 64 | 
 65 |     def read_example(self, index):
 66 |         """ Read the example with given index.
 67 | 
 68 |         :param index: Index of the line of the listfile to read (counting starts from 0).
 69 |         :return: Directory with the following keys:
 70 |             X : np.array
 71 |                 2D array containing all events. Each row corresponds to a moment.
 72 |                 First column is the time and other columns correspond to different
 73 |                 variables.
 74 |             t : float
 75 |                 Length of the data in hours. Note, in general, it is not equal to the
 76 |                 timestamp of last event.
 77 |             y : int (0 or 1)
 78 |                 Mortality within next 24 hours.
 79 |             header : array of strings
 80 |                 Names of the columns. The ordering of the columns is always the same.
 81 |             name: Name of the sample.
 82 |         """
 83 |         if index < 0 or index >= len(self._data):
 84 |             raise ValueError("Index must be from 0 (inclusive) to number of examples (exclusive).")
 85 | 
 86 |         name = self._data[index][0]
 87 |         t = self._data[index][1]
 88 |         y = self._data[index][2]
 89 |         (X, header) = self._read_timeseries(name, t)
 90 | 
 91 |         return {"X": X,
 92 |                 "t": t,
 93 |                 "y": y,
 94 |                 "header": header,
 95 |                 "name": name}
 96 | 
 97 | 
 98 | class InHospitalMortalityReader(Reader):
 99 |     def __init__(self, dataset_dir, listfile=None, period_length=48.0):
100 |         """ Reader for in-hospital moratality prediction task.
101 | 
102 |         :param dataset_dir:   Directory where timeseries files are stored.
103 |         :param listfile:      Path to a listfile. If this parameter is left `None` then
104 |                               `dataset_dir/listfile.csv` will be used.
105 |         :param period_length: Length of the period (in hours) from which the prediction is done.
106 |         """
107 |         Reader.__init__(self, dataset_dir, listfile)
108 |         self._data = [line.split(',') for line in self._data]
109 |         self._data = [(x, int(y)) for (x, y) in self._data]
110 |         self._period_length = period_length
111 | 
112 |     def _read_timeseries(self, ts_filename):
113 |         ret = []
114 |         with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
115 |             header = tsfile.readline().strip().split(',')
116 |             assert header[0] == "Hours"
117 |             for line in tsfile:
118 |                 mas = line.strip().split(',')
119 |                 ret.append(np.array(mas))
120 |         return (np.stack(ret), header)
121 | 
122 |     def read_example(self, index):
123 |         """ Reads the example with given index.
124 | 
125 |         :param index: Index of the line of the listfile to read (counting starts from 0).
126 |         :return: Dictionary with the following keys:
127 |             X : np.array
128 |                 2D array containing all events. Each row corresponds to a moment.
129 |                 First column is the time and other columns correspond to different
130 |                 variables.
131 |             t : float
132 |                 Length of the data in hours. Note, in general, it is not equal to the
133 |                 timestamp of last event.
134 |             y : int (0 or 1)
135 |                 In-hospital mortality.
136 |             header : array of strings
137 |                 Names of the columns. The ordering of the columns is always the same.
138 |             name: Name of the sample.
139 |         """
140 |         if index < 0 or index >= len(self._data):
141 |             raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")
142 | 
143 |         name = self._data[index][0]
144 |         t = self._period_length
145 |         y = self._data[index][1]
146 |         (X, header) = self._read_timeseries(name)
147 | 
148 |         return {"X": X,
149 |                 "t": t,
150 |                 "y": y,
151 |                 "header": header,
152 |                 "name": name}
153 | 
154 | 
155 | class LengthOfStayReader(Reader):
156 |     def __init__(self, dataset_dir, listfile=None):
157 |         """ Reader for length of stay prediction task.
158 | 
159 |         :param dataset_dir: Directory where timeseries files are stored.
160 |         :param listfile:    Path to a listfile. If this parameter is left `None` then
161 |                             `dataset_dir/listfile.csv` will be used.
162 |         """
163 |         Reader.__init__(self, dataset_dir, listfile)
164 |         self._data = [line.split(',') for line in self._data]
165 |         self._data = [(x, float(t), float(y)) for (x, t, y) in self._data]
166 | 
167 |     def _read_timeseries(self, ts_filename, time_bound):
168 |         ret = []
169 |         with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
170 |             header = tsfile.readline().strip().split(',')
171 |             assert header[0] == "Hours"
172 |             for line in tsfile:
173 |                 mas = line.strip().split(',')
174 |                 t = float(mas[0])
175 |                 if t > time_bound + 1e-6:
176 |                     break
177 |                 ret.append(np.array(mas))
178 |         return (np.stack(ret), header)
179 | 
180 |     def read_example(self, index):
181 |         """ Reads the example with given index.
182 | 
183 |         :param index: Index of the line of the listfile to read (counting starts from 0).
184 |         :return: Dictionary with the following keys:
185 |             X : np.array
186 |                 2D array containing all events. Each row corresponds to a moment.
187 |                 First column is the time and other columns correspond to different
188 |                 variables.
189 |             t : float
190 |                 Length of the data in hours. Note, in general, it is not equal to the
191 |                 timestamp of last event.
192 |             y : float
193 |                 Remaining time in ICU.
194 |             header : array of strings
195 |                 Names of the columns. The ordering of the columns is always the same.
196 |             name: Name of the sample.
197 |         """
198 |         if index < 0 or index >= len(self._data):
199 |             raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")
200 | 
201 |         name = self._data[index][0]
202 |         t = self._data[index][1]
203 |         y = self._data[index][2]
204 |         (X, header) = self._read_timeseries(name, t)
205 | 
206 |         return {"X": X,
207 |                 "t": t,
208 |                 "y": y,
209 |                 "header": header,
210 |                 "name": name}
211 | 
212 | 
213 | class PhenotypingReader(Reader):
214 |     def __init__(self, dataset_dir, listfile=None):
215 |         """ Reader for phenotype classification task.
216 | 
217 |         :param dataset_dir: Directory where timeseries files are stored.
218 |         :param listfile:    Path to a listfile. If this parameter is left `None` then
219 |                             `dataset_dir/listfile.csv` will be used.
220 |         """
221 |         Reader.__init__(self, dataset_dir, listfile)
222 |         self._data = [line.split(',') for line in self._data]
223 |         self._data = [(mas[0], float(mas[1]), list(map(int, mas[2:]))) for mas in self._data]
224 | 
225 |     def _read_timeseries(self, ts_filename):
226 |         ret = []
227 |         with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
228 |             header = tsfile.readline().strip().split(',')
229 |             assert header[0] == "Hours"
230 |             for line in tsfile:
231 |                 mas = line.strip().split(',')
232 |                 ret.append(np.array(mas))
233 |         return (np.stack(ret), header)
234 | 
235 |     def read_example(self, index):
236 |         """ Reads the example with given index.
237 | 
238 |         :param index: Index of the line of the listfile to read (counting starts from 0).
239 |         :return: Dictionary with the following keys:
240 |             X : np.array
241 |                 2D array containing all events. Each row corresponds to a moment.
242 |                 First column is the time and other columns correspond to different
243 |                 variables.
244 |             t : float
245 |                 Length of the data in hours. Note, in general, it is not equal to the
246 |                 timestamp of last event.
247 |             y : array of ints
248 |                 Phenotype labels.
249 |             header : array of strings
250 |                 Names of the columns. The ordering of the columns is always the same.
251 |             name: Name of the sample.
252 |         """
253 |         if index < 0 or index >= len(self._data):
254 |             raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")
255 | 
256 |         name = self._data[index][0]
257 |         t = self._data[index][1]
258 |         y = self._data[index][2]
259 |         (X, header) = self._read_timeseries(name)
260 | 
261 |         return {"X": X,
262 |                 "t": t,
263 |                 "y": y,
264 |                 "header": header,
265 |                 "name": name}
266 | 
267 | 
268 | class MultitaskReader(Reader):
269 |     def __init__(self, dataset_dir, listfile=None):
270 |         """ Reader for multitask learning.
271 | 
272 |         :param dataset_dir: Directory where timeseries files are stored.
273 |         :param listfile:    Path to a listfile. If this parameter is left `None` then
274 |                             `dataset_dir/listfile.csv` will be used.
275 |         """
276 |         Reader.__init__(self, dataset_dir, listfile)
277 |         self._data = [line.split(',') for line in self._data]
278 | 
279 |         def process_ihm(x):
280 |             return list(map(int, x.split(';')))
281 | 
282 |         def process_los(x):
283 |             x = x.split(';')
284 |             if x[0] == '':
285 |                 return ([], [])
286 |             return (list(map(int, x[:len(x)//2])), list(map(float, x[len(x)//2:])))
287 | 
288 |         def process_ph(x):
289 |             return list(map(int, x.split(';')))
290 | 
291 |         def process_decomp(x):
292 |             x = x.split(';')
293 |             if x[0] == '':
294 |                 return ([], [])
295 |             return (list(map(int, x[:len(x)//2])), list(map(int, x[len(x)//2:])))
296 | 
297 |         self._data = [(fname, float(t), process_ihm(ihm), process_los(los),
298 |                        process_ph(pheno), process_decomp(decomp))
299 |                       for fname, t, ihm, los, pheno, decomp in self._data]
300 | 
301 |     def _read_timeseries(self, ts_filename):
302 |         ret = []
303 |         with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
304 |             header = tsfile.readline().strip().split(',')
305 |             assert header[0] == "Hours"
306 |             for line in tsfile:
307 |                 mas = line.strip().split(',')
308 |                 ret.append(np.array(mas))
309 |         return (np.stack(ret), header)
310 | 
311 |     def read_example(self, index):
312 |         """ Reads the example with given index.
313 | 
314 |         :param index: Index of the line of the listfile to read (counting starts from 0).
315 |         :return: Return dictionary with the following keys:
316 |             X : np.array
317 |                 2D array containing all events. Each row corresponds to a moment.
318 |                 First column is the time and other columns correspond to different
319 |                 variables.
320 |             t : float
321 |                 Length of the data in hours. Note, in general, it is not equal to the
322 |                 timestamp of last event.
323 |             ihm : array
324 |                 Array of 3 integers: [pos, mask, label].
325 |             los : array
326 |                 Array of 2 arrays: [masks, labels].
327 |             pheno : array
328 |                 Array of 25 binary integers (phenotype labels).
329 |             decomp : array
330 |                 Array of 2 arrays: [masks, labels].
331 |             header : array of strings
332 |                 Names of the columns. The ordering of the columns is always the same.
333 |             name: Name of the sample.
334 |         """
335 |         if index < 0 or index >= len(self._data):
336 |             raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")
337 | 
338 |         name = self._data[index][0]
339 |         (X, header) = self._read_timeseries(name)
340 | 
341 |         return {"X": X,
342 |                 "t": self._data[index][1],
343 |                 "ihm": self._data[index][2],
344 |                 "los": self._data[index][3],
345 |                 "pheno": self._data[index][4],
346 |                 "decomp": self._data[index][5],
347 |                 "header": header,
348 |                 "name": name}
349 | 


--------------------------------------------------------------------------------
/scripts/finetune_on_pregenerated.py:
--------------------------------------------------------------------------------
  1 | #!/h/haoran/anaconda3/bin/python
  2 | from argparse import ArgumentParser
  3 | from pathlib import Path
  4 | import os
  5 | import torch
  6 | import logging
  7 | import json
  8 | import random
  9 | import numpy as np
 10 | from collections import namedtuple
 11 | from tempfile import TemporaryDirectory
 12 | 
 13 | from torch.utils.data import DataLoader, Dataset, RandomSampler
 14 | from torch.utils.data.distributed import DistributedSampler
 15 | from tqdm import tqdm
 16 | 
 17 | from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 18 | from pytorch_pretrained_bert.modeling import BertForPreTraining
 19 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 20 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 21 | 
 22 | InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
 23 | 
 24 | log_format = '%(asctime)-10s: %(message)s'
 25 | logging.basicConfig(level=logging.INFO, format=log_format)
 26 | 
 27 | 
 28 | def convert_example_to_features(example, tokenizer, max_seq_length):
 29 |     tokens = example["tokens"]
 30 |     segment_ids = example["segment_ids"]
 31 |     is_random_next = example["is_random_next"]
 32 |     masked_lm_positions = example["masked_lm_positions"]
 33 |     masked_lm_labels = example["masked_lm_labels"]
 34 | 
 35 |     assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated
 36 |     input_ids = tokenizer.convert_tokens_to_ids(tokens)
 37 |     masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
 38 | 
 39 |     input_array = np.zeros(max_seq_length, dtype=np.int)
 40 |     input_array[:len(input_ids)] = input_ids
 41 | 
 42 |     mask_array = np.zeros(max_seq_length, dtype=np.bool)
 43 |     mask_array[:len(input_ids)] = 1
 44 | 
 45 |     segment_array = np.zeros(max_seq_length, dtype=np.bool)
 46 |     segment_array[:len(segment_ids)] = segment_ids
 47 | 
 48 |     lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)
 49 |     lm_label_array[masked_lm_positions] = masked_label_ids
 50 | 
 51 |     features = InputFeatures(input_ids=input_array,
 52 |                              input_mask=mask_array,
 53 |                              segment_ids=segment_array,
 54 |                              lm_label_ids=lm_label_array,
 55 |                              is_next=is_random_next)
 56 |     return features
 57 | 
 58 | 
 59 | class PregeneratedDataset(Dataset):
 60 |     def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):
 61 |         self.vocab = tokenizer.vocab
 62 |         self.tokenizer = tokenizer
 63 |         self.epoch = epoch
 64 |         self.data_epoch = epoch % num_data_epochs
 65 |         data_file = training_path / f"epoch_{self.data_epoch}.json"
 66 |         metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
 67 |         assert data_file.is_file() and metrics_file.is_file()
 68 |         metrics = json.loads(metrics_file.read_text())
 69 |         num_samples = metrics['num_training_examples']
 70 |         seq_len = metrics['max_seq_len']
 71 |         self.temp_dir = None
 72 |         self.working_dir = None
 73 |         if reduce_memory:
 74 |             self.temp_dir = TemporaryDirectory()
 75 |             self.working_dir = Path(self.temp_dir.name)
 76 |             input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap',
 77 |                                   mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
 78 |             input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap',
 79 |                                     shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
 80 |             segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap',
 81 |                                     shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
 82 |             lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap',
 83 |                                      shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
 84 |             lm_label_ids[:] = -1
 85 |             is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap',
 86 |                                  shape=(num_samples,), mode='w+', dtype=np.bool)
 87 |         else:
 88 |             input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
 89 |             input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
 90 |             segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
 91 |             lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
 92 |             is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool)
 93 |         logging.info(f"Loading training examples for epoch {epoch}")
 94 |         with data_file.open() as f:
 95 |             for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
 96 |                 line = line.strip()
 97 |                 example = json.loads(line)
 98 |                 features = convert_example_to_features(example, tokenizer, seq_len)
 99 |                 input_ids[i] = features.input_ids
100 |                 segment_ids[i] = features.segment_ids
101 |                 input_masks[i] = features.input_mask
102 |                 lm_label_ids[i] = features.lm_label_ids
103 |                 is_nexts[i] = features.is_next
104 |         assert i == num_samples - 1  # Assert that the sample count metric was true
105 |         logging.info("Loading complete!")
106 |         self.num_samples = num_samples
107 |         self.seq_len = seq_len
108 |         self.input_ids = input_ids
109 |         self.input_masks = input_masks
110 |         self.segment_ids = segment_ids
111 |         self.lm_label_ids = lm_label_ids
112 |         self.is_nexts = is_nexts
113 | 
114 |     def __len__(self):
115 |         return self.num_samples
116 | 
117 |     def __getitem__(self, item):
118 |         return (torch.tensor(self.input_ids[item].astype(np.int64)),
119 |                 torch.tensor(self.input_masks[item].astype(np.int64)),
120 |                 torch.tensor(self.segment_ids[item].astype(np.int64)),
121 |                 torch.tensor(self.lm_label_ids[item].astype(np.int64)),
122 |                 torch.tensor(self.is_nexts[item].astype(np.int64)))
123 | 
124 | 
125 | def main():
126 |     parser = ArgumentParser()
127 |     parser.add_argument('--pregenerated_data', type=Path, required=True)
128 |     parser.add_argument('--output_dir', type=Path, required=True)
129 |     parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
130 |                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
131 |     parser.add_argument("--do_lower_case", action="store_true")
132 |     parser.add_argument("--reduce_memory", action="store_true",
133 |                         help="Store training data as on-disc memmaps to massively reduce memory usage")
134 | 
135 |     parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
136 |     parser.add_argument("--local_rank",
137 |                         type=int,
138 |                         default=-1,
139 |                         help="local_rank for distributed training on gpus")
140 |     parser.add_argument("--no_cuda",
141 |                         action='store_true',
142 |                         help="Whether not to use CUDA when available")
143 |     parser.add_argument('--gradient_accumulation_steps',
144 |                         type=int,
145 |                         default=1,
146 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
147 |     parser.add_argument("--train_batch_size",
148 |                         default=32,
149 |                         type=int,
150 |                         help="Total batch size for training.")
151 |     parser.add_argument('--fp16',
152 |                         action='store_true',
153 |                         help="Whether to use 16-bit float precision instead of 32-bit")
154 |     parser.add_argument('--loss_scale',
155 |                         type=float, default=0,
156 |                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
157 |                         "0 (default value): dynamic loss scaling.\n"
158 |                         "Positive power of 2: static loss scaling value.\n")
159 |     parser.add_argument("--warmup_proportion",
160 |                         default=0.1,
161 |                         type=float,
162 |                         help="Proportion of training to perform linear learning rate warmup for. "
163 |                              "E.g., 0.1 = 10%% of training.")
164 |     parser.add_argument("--learning_rate",
165 |                         default=3e-5,
166 |                         type=float,
167 |                         help="The initial learning rate for Adam.")
168 |     parser.add_argument('--seed',
169 |                         type=int,
170 |                         default=42,
171 |                         help="random seed for initialization")
172 |     args = parser.parse_args()
173 | 
174 |     assert args.pregenerated_data.is_dir(), \
175 |         "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
176 | 
177 |     samples_per_epoch = []
178 |     for i in range(args.epochs):
179 |         epoch_file = args.pregenerated_data / f"epoch_{i}.json"
180 |         metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
181 |         if epoch_file.is_file() and metrics_file.is_file():
182 |             metrics = json.loads(metrics_file.read_text())
183 |             samples_per_epoch.append(metrics['num_training_examples'])
184 |         else:
185 |             if i == 0:
186 |                 exit("No training data was found!")
187 |             print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
188 |             print("This script will loop over the available data, but training diversity may be negatively impacted.")
189 |             num_data_epochs = i
190 |             break
191 |     else:
192 |         num_data_epochs = args.epochs
193 | 
194 |     if args.local_rank == -1 or args.no_cuda:
195 |         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
196 |         n_gpu = torch.cuda.device_count()
197 |     else:
198 |         torch.cuda.set_device(args.local_rank)
199 |         device = torch.device("cuda", args.local_rank)
200 |         n_gpu = 1
201 |         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
202 |         torch.distributed.init_process_group(backend='nccl')
203 |     logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
204 |         device, n_gpu, bool(args.local_rank != -1), args.fp16))
205 | 
206 |     if args.gradient_accumulation_steps < 1:
207 |         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
208 |                             args.gradient_accumulation_steps))
209 | 
210 |     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
211 | 
212 |     random.seed(args.seed)
213 |     np.random.seed(args.seed)
214 |     torch.manual_seed(args.seed)
215 |     if n_gpu > 0:
216 |         torch.cuda.manual_seed_all(args.seed)
217 | 
218 |     if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
219 |         logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
220 |     args.output_dir.mkdir(parents=True, exist_ok=True)
221 | 
222 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
223 | 
224 |     total_train_examples = 0
225 |     for i in range(args.epochs):
226 |         # The modulo takes into account the fact that we may loop over limited epochs of data
227 |         total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
228 | 
229 |     num_train_optimization_steps = int(
230 |         total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
231 |     if args.local_rank != -1:
232 |         num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
233 | 
234 |     # Prepare model
235 |     model = BertForPreTraining.from_pretrained(args.bert_model)
236 |     if args.fp16:
237 |         model.half()
238 |     model.to(device)
239 |     if args.local_rank != -1:
240 |         try:
241 |             from apex.parallel import DistributedDataParallel as DDP
242 |         except ImportError:
243 |             raise ImportError(
244 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
245 |         model = DDP(model)
246 |     elif n_gpu > 1:
247 |         model = torch.nn.DataParallel(model)
248 | 
249 |     # Prepare optimizer
250 |     param_optimizer = list(model.named_parameters())
251 |     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
252 |     optimizer_grouped_parameters = [
253 |         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
254 |          'weight_decay': 0.01},
255 |         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
256 |     ]
257 | 
258 |     if args.fp16:
259 |         try:
260 |             from apex.optimizers import FP16_Optimizer
261 |             from apex.optimizers import FusedAdam
262 |         except ImportError:
263 |             raise ImportError(
264 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
265 | 
266 |         optimizer = FusedAdam(optimizer_grouped_parameters,
267 |                               lr=args.learning_rate,
268 |                               bias_correction=False,
269 |                               max_grad_norm=1.0)
270 |         if args.loss_scale == 0:
271 |             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
272 |         else:
273 |             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
274 |         warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
275 |                                              t_total=num_train_optimization_steps)
276 |     else:
277 |         optimizer = BertAdam(optimizer_grouped_parameters,
278 |                              lr=args.learning_rate,
279 |                              warmup=args.warmup_proportion,
280 |                              t_total=num_train_optimization_steps)
281 | 
282 |     global_step = 0
283 |     logging.info("***** Running training *****")
284 |     logging.info(f"  Num examples = {total_train_examples}")
285 |     logging.info("  Batch size = %d", args.train_batch_size)
286 |     logging.info("  Num steps = %d", num_train_optimization_steps)
287 |     model.train()
288 |     for epoch in range(args.epochs):
289 |         epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
290 |                                             num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
291 |         if args.local_rank == -1:
292 |             train_sampler = RandomSampler(epoch_dataset)
293 |         else:
294 |             train_sampler = DistributedSampler(epoch_dataset)
295 |         train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
296 |         tr_loss = 0
297 |         nb_tr_examples, nb_tr_steps = 0, 0
298 |         with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
299 |             for step, batch in enumerate(train_dataloader):
300 |                 batch = tuple(t.to(device) for t in batch)
301 |                 input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
302 |                 loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
303 |                 if n_gpu > 1:
304 |                     loss = loss.mean() # mean() to average on multi-gpu.
305 |                 if args.gradient_accumulation_steps > 1:
306 |                     loss = loss / args.gradient_accumulation_steps
307 |                 if args.fp16:
308 |                     optimizer.backward(loss)
309 |                 else:
310 |                     loss.backward()
311 |                 tr_loss += loss.item()
312 |                 nb_tr_examples += input_ids.size(0)
313 |                 nb_tr_steps += 1
314 |                 pbar.update(1)
315 |                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
316 |                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
317 |                 if (step + 1) % args.gradient_accumulation_steps == 0:
318 |                     if args.fp16:
319 |                         # modify learning rate with special warm up BERT uses
320 |                         # if args.fp16 is False, BertAdam is used that handles this automatically
321 |                         lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
322 |                         for param_group in optimizer.param_groups:
323 |                             param_group['lr'] = lr_this_step
324 |                     optimizer.step()
325 |                     optimizer.zero_grad()
326 |                     global_step += 1
327 | 
328 |     # Save a trained model
329 |     logging.info("** ** * Saving fine-tuned model ** ** * ")
330 |     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
331 | 
332 |     output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
333 |     output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
334 | 
335 |     torch.save(model_to_save.state_dict(), output_model_file)
336 |     model_to_save.config.to_json_file(output_config_file)
337 |     tokenizer.save_vocabulary(args.output_dir)
338 | 
339 | 
340 | if __name__ == '__main__':
341 |     main()
342 | 


--------------------------------------------------------------------------------
/scripts/run_classifier_dataset_utils.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ BERT classification fine-tuning: utilities to work with GLUE tasks """
 17 | 
 18 | from __future__ import absolute_import, division, print_function
 19 | 
 20 | import csv
 21 | import logging
 22 | import os
 23 | import sys
 24 | 
 25 | from scipy.stats import pearsonr, spearmanr
 26 | from sklearn.metrics import matthews_corrcoef, f1_score
 27 | 
 28 | logger = logging.getLogger(__name__)
 29 | 
 30 | 
 31 | class InputExample(object):
 32 |     """A single training/test example for simple sequence classification."""
 33 | 
 34 |     def __init__(self, guid, text_a, text_b=None, label=None, group = None, other_fields = []):
 35 |         """Constructs a InputExample.
 36 | 
 37 |         Args:
 38 |             guid: Unique id for the example.
 39 |             text_a: string. The untokenized text of the first sequence. For single
 40 |             sequence tasks, only this sequence must be specified.
 41 |             text_b: (Optional) string. The untokenized text of the second sequence.
 42 |             Only must be specified for sequence pair tasks.
 43 |             label: (Optional) string. The label of the example. This should be
 44 |             specified for train and dev examples, but not for test examples.
 45 |         """
 46 |         self.guid = guid
 47 |         self.text_a = text_a
 48 |         self.text_b = text_b
 49 |         self.label = label
 50 |         self.group = group
 51 |         self.other_fields = other_fields
 52 | 
 53 | 
 54 | class InputFeatures(object):
 55 |     """A single set of features of data."""
 56 | 
 57 |     def __init__(self, input_ids, input_mask, segment_ids, label_id, group = None, guid = None, other_fields = []):
 58 |         self.input_ids = input_ids
 59 |         self.input_mask = input_mask
 60 |         self.segment_ids = segment_ids
 61 |         self.label_id = label_id
 62 |         self.group = group
 63 |         self.guid = guid
 64 |         self.other_fields = other_fields
 65 | 
 66 | 
 67 | class DataProcessor(object):
 68 |     """Base class for data converters for sequence classification data sets."""
 69 | 
 70 |     def get_train_examples(self, data_dir):
 71 |         """Gets a collection of `InputExample`s for the train set."""
 72 |         raise NotImplementedError()
 73 | 
 74 |     def get_dev_examples(self, data_dir):
 75 |         """Gets a collection of `InputExample`s for the dev set."""
 76 |         raise NotImplementedError()
 77 | 
 78 |     def get_labels(self):
 79 |         """Gets the list of labels for this data set."""
 80 |         raise NotImplementedError()
 81 | 
 82 |     @classmethod
 83 |     def _read_tsv(cls, input_file, quotechar=None):
 84 |         """Reads a tab separated value file."""
 85 |         with open(input_file, "r", encoding="utf-8") as f:
 86 |             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
 87 |             lines = []
 88 |             for line in reader:
 89 |                 if sys.version_info[0] == 2:
 90 |                     line = list(unicode(cell, 'utf-8') for cell in line)
 91 |                 lines.append(line)
 92 |             return lines
 93 | 
 94 | 
 95 | class MrpcProcessor(DataProcessor):
 96 |     """Processor for the MRPC data set (GLUE version)."""
 97 | 
 98 |     def get_train_examples(self, data_dir):
 99 |         """See base class."""
100 |         logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
101 |         return self._create_examples(
102 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
103 | 
104 |     def get_dev_examples(self, data_dir):
105 |         """See base class."""
106 |         return self._create_examples(
107 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
108 | 
109 |     def get_labels(self):
110 |         """See base class."""
111 |         return ["0", "1"]
112 | 
113 |     def _create_examples(self, lines, set_type):
114 |         """Creates examples for the training and dev sets."""
115 |         examples = []
116 |         for (i, line) in enumerate(lines):
117 |             if i == 0:
118 |                 continue
119 |             guid = "%s-%s" % (set_type, i)
120 |             text_a = line[3]
121 |             text_b = line[4]
122 |             label = line[0]
123 |             examples.append(
124 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
125 |         return examples
126 | 
127 | 
128 | class MnliProcessor(DataProcessor):
129 |     """Processor for the MultiNLI data set (GLUE version)."""
130 | 
131 |     def get_train_examples(self, data_dir):
132 |         """See base class."""
133 |         return self._create_examples(
134 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
135 | 
136 |     def get_dev_examples(self, data_dir):
137 |         """See base class."""
138 |         return self._create_examples(
139 |             self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
140 |             "dev_matched")
141 | 
142 |     def get_labels(self):
143 |         """See base class."""
144 |         return ["contradiction", "entailment", "neutral"]
145 | 
146 |     def _create_examples(self, lines, set_type):
147 |         """Creates examples for the training and dev sets."""
148 |         examples = []
149 |         for (i, line) in enumerate(lines):
150 |             if i == 0:
151 |                 continue
152 |             guid = "%s-%s" % (set_type, line[0])
153 |             text_a = line[8]
154 |             text_b = line[9]
155 |             label = line[-1]
156 |             examples.append(
157 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
158 |         return examples
159 | 
160 | 
161 | class MnliMismatchedProcessor(MnliProcessor):
162 |     """Processor for the MultiNLI Mismatched data set (GLUE version)."""
163 | 
164 |     def get_dev_examples(self, data_dir):
165 |         """See base class."""
166 |         return self._create_examples(
167 |             self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
168 |             "dev_matched")
169 | 
170 | 
171 | class ColaProcessor(DataProcessor):
172 |     """Processor for the CoLA data set (GLUE version)."""
173 | 
174 |     def get_train_examples(self, data_dir):
175 |         """See base class."""
176 |         return self._create_examples(
177 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
178 | 
179 |     def get_dev_examples(self, data_dir):
180 |         """See base class."""
181 |         return self._create_examples(
182 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
183 | 
184 |     def get_labels(self):
185 |         """See base class."""
186 |         return ["0", "1"]
187 | 
188 |     def _create_examples(self, lines, set_type):
189 |         """Creates examples for the training and dev sets."""
190 |         examples = []
191 |         for (i, line) in enumerate(lines):
192 |             guid = "%s-%s" % (set_type, i)
193 |             text_a = line[3]
194 |             label = line[1]
195 |             examples.append(
196 |                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
197 |         return examples
198 | 
199 | 
200 | class Sst2Processor(DataProcessor):
201 |     """Processor for the SST-2 data set (GLUE version)."""
202 | 
203 |     def get_train_examples(self, data_dir):
204 |         """See base class."""
205 |         return self._create_examples(
206 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
207 | 
208 |     def get_dev_examples(self, data_dir):
209 |         """See base class."""
210 |         return self._create_examples(
211 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
212 | 
213 |     def get_labels(self):
214 |         """See base class."""
215 |         return ["0", "1"]
216 | 
217 |     def _create_examples(self, lines, set_type):
218 |         """Creates examples for the training and dev sets."""
219 |         examples = []
220 |         for (i, line) in enumerate(lines):
221 |             if i == 0:
222 |                 continue
223 |             guid = "%s-%s" % (set_type, i)
224 |             text_a = line[0]
225 |             label = line[1]
226 |             examples.append(
227 |                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
228 |         return examples
229 | 
230 | 
231 | class StsbProcessor(DataProcessor):
232 |     """Processor for the STS-B data set (GLUE version)."""
233 | 
234 |     def get_train_examples(self, data_dir):
235 |         """See base class."""
236 |         return self._create_examples(
237 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
238 | 
239 |     def get_dev_examples(self, data_dir):
240 |         """See base class."""
241 |         return self._create_examples(
242 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
243 | 
244 |     def get_labels(self):
245 |         """See base class."""
246 |         return [None]
247 | 
248 |     def _create_examples(self, lines, set_type):
249 |         """Creates examples for the training and dev sets."""
250 |         examples = []
251 |         for (i, line) in enumerate(lines):
252 |             if i == 0:
253 |                 continue
254 |             guid = "%s-%s" % (set_type, line[0])
255 |             text_a = line[7]
256 |             text_b = line[8]
257 |             label = line[-1]
258 |             examples.append(
259 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
260 |         return examples
261 | 
262 | 
263 | class QqpProcessor(DataProcessor):
264 |     """Processor for the QQP data set (GLUE version)."""
265 | 
266 |     def get_train_examples(self, data_dir):
267 |         """See base class."""
268 |         return self._create_examples(
269 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
270 | 
271 |     def get_dev_examples(self, data_dir):
272 |         """See base class."""
273 |         return self._create_examples(
274 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
275 | 
276 |     def get_labels(self):
277 |         """See base class."""
278 |         return ["0", "1"]
279 | 
280 |     def _create_examples(self, lines, set_type):
281 |         """Creates examples for the training and dev sets."""
282 |         examples = []
283 |         for (i, line) in enumerate(lines):
284 |             if i == 0:
285 |                 continue
286 |             guid = "%s-%s" % (set_type, line[0])
287 |             try:
288 |                 text_a = line[3]
289 |                 text_b = line[4]
290 |                 label = line[5]
291 |             except IndexError:
292 |                 continue
293 |             examples.append(
294 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
295 |         return examples
296 | 
297 | 
298 | class QnliProcessor(DataProcessor):
299 |     """Processor for the QNLI data set (GLUE version)."""
300 | 
301 |     def get_train_examples(self, data_dir):
302 |         """See base class."""
303 |         return self._create_examples(
304 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
305 | 
306 |     def get_dev_examples(self, data_dir):
307 |         """See base class."""
308 |         return self._create_examples(
309 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")),
310 |             "dev_matched")
311 | 
312 |     def get_labels(self):
313 |         """See base class."""
314 |         return ["entailment", "not_entailment"]
315 | 
316 |     def _create_examples(self, lines, set_type):
317 |         """Creates examples for the training and dev sets."""
318 |         examples = []
319 |         for (i, line) in enumerate(lines):
320 |             if i == 0:
321 |                 continue
322 |             guid = "%s-%s" % (set_type, line[0])
323 |             text_a = line[1]
324 |             text_b = line[2]
325 |             label = line[-1]
326 |             examples.append(
327 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
328 |         return examples
329 | 
330 | 
331 | class RteProcessor(DataProcessor):
332 |     """Processor for the RTE data set (GLUE version)."""
333 | 
334 |     def get_train_examples(self, data_dir):
335 |         """See base class."""
336 |         return self._create_examples(
337 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
338 | 
339 |     def get_dev_examples(self, data_dir):
340 |         """See base class."""
341 |         return self._create_examples(
342 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
343 | 
344 |     def get_labels(self):
345 |         """See base class."""
346 |         return ["entailment", "not_entailment"]
347 | 
348 |     def _create_examples(self, lines, set_type):
349 |         """Creates examples for the training and dev sets."""
350 |         examples = []
351 |         for (i, line) in enumerate(lines):
352 |             if i == 0:
353 |                 continue
354 |             guid = "%s-%s" % (set_type, line[0])
355 |             text_a = line[1]
356 |             text_b = line[2]
357 |             label = line[-1]
358 |             examples.append(
359 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
360 |         return examples
361 | 
362 | 
363 | class WnliProcessor(DataProcessor):
364 |     """Processor for the WNLI data set (GLUE version)."""
365 | 
366 |     def get_train_examples(self, data_dir):
367 |         """See base class."""
368 |         return self._create_examples(
369 |             self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
370 | 
371 |     def get_dev_examples(self, data_dir):
372 |         """See base class."""
373 |         return self._create_examples(
374 |             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
375 | 
376 |     def get_labels(self):
377 |         """See base class."""
378 |         return ["0", "1"]
379 | 
380 |     def _create_examples(self, lines, set_type):
381 |         """Creates examples for the training and dev sets."""
382 |         examples = []
383 |         for (i, line) in enumerate(lines):
384 |             if i == 0:
385 |                 continue
386 |             guid = "%s-%s" % (set_type, line[0])
387 |             text_a = line[1]
388 |             text_b = line[2]
389 |             label = line[-1]
390 |             examples.append(
391 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
392 |         return examples
393 | 
394 | 
395 | def convert_examples_to_features(examples, max_seq_length,
396 |                                  tokenizer, output_mode):
397 |     """Loads a data file into a list of `InputBatch`s."""
398 | 
399 |     #label_map = {label : i for i, label in enumerate(label_list)}
400 | 
401 |     features = []
402 |     for (ex_index, example) in enumerate(examples):
403 |         if ex_index % 10000 == 0:
404 |             logger.info("Writing example %d of %d" % (ex_index, len(examples)))
405 | 
406 |         tokens_a = tokenizer.tokenize(example.text_a)
407 | 
408 |         tokens_b = None
409 |         if example.text_b:
410 |             tokens_b = tokenizer.tokenize(example.text_b)
411 |             # Modifies `tokens_a` and `tokens_b` in place so that the total
412 |             # length is less than the specified length.
413 |             # Account for [CLS], [SEP], [SEP] with "- 3"
414 |             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
415 |         else:
416 |             # Account for [CLS] and [SEP] with "- 2"
417 |             if len(tokens_a) > max_seq_length - 2:
418 |                 tokens_a = tokens_a[:(max_seq_length - 2)]
419 | 
420 |         # The convention in BERT is:
421 |         # (a) For sequence pairs:
422 |         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
423 |         #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
424 |         # (b) For single sequences:
425 |         #  tokens:   [CLS] the dog is hairy . [SEP]
426 |         #  type_ids: 0   0   0   0  0     0 0
427 |         #
428 |         # Where "type_ids" are used to indicate whether this is the first
429 |         # sequence or the second sequence. The embedding vectors for `type=0` and
430 |         # `type=1` were learned during pre-training and are added to the wordpiece
431 |         # embedding vector (and position vector). This is not *strictly* necessary
432 |         # since the [SEP] token unambiguously separates the sequences, but it makes
433 |         # it easier for the model to learn the concept of sequences.
434 |         #
435 |         # For classification tasks, the first vector (corresponding to [CLS]) is
436 |         # used as as the "sentence vector". Note that this only makes sense because
437 |         # the entire model is fine-tuned.
438 |         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
439 |         segment_ids = [0] * len(tokens)
440 | 
441 |         if tokens_b:
442 |             tokens += tokens_b + ["[SEP]"]
443 |             segment_ids += [1] * (len(tokens_b) + 1)
444 | 
445 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
446 | 
447 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
448 |         # tokens are attended to.
449 |         input_mask = [1] * len(input_ids)
450 | 
451 |         # Zero-pad up to the sequence length.
452 |         padding = [0] * (max_seq_length - len(input_ids))
453 |         input_ids += padding
454 |         input_mask += padding
455 |         segment_ids += padding
456 |         group = example.group
457 |         guid = example.guid
458 |         other_fields = example.other_fields
459 | 
460 |         assert len(input_ids) == max_seq_length
461 |         assert len(input_mask) == max_seq_length
462 |         assert len(segment_ids) == max_seq_length
463 | 
464 |         if output_mode == "classification":
465 |             label_id = example.label #modifies code to bypass label map, input is already encoded
466 |         elif output_mode == "regression":
467 |             label_id = float(example.label)
468 |         else:
469 |             raise KeyError(output_mode)
470 | 
471 |         if ex_index < 5:
472 |             logger.info("*** Example ***")
473 |             logger.info("guid: %s" % (example.guid))
474 |             logger.info("tokens: %s" % " ".join(
475 |                     [str(x) for x in tokens]))
476 |             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
477 |             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
478 |             logger.info(
479 |                     "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
480 |             logger.info("label: %s (id = %d)" % (example.label, label_id))
481 | 
482 |         features.append(
483 |                 InputFeatures(input_ids=input_ids,
484 |                               input_mask=input_mask,
485 |                               segment_ids=segment_ids,
486 |                               label_id=label_id,
487 |                               group = group,
488 |                               guid = guid,
489 |                               other_fields = other_fields
490 |                               ))
491 |     return features
492 | 
493 | 
494 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
495 |     """Truncates a sequence pair in place to the maximum length."""
496 | 
497 |     # This is a simple heuristic which will always truncate the longer sequence
498 |     # one token at a time. This makes more sense than truncating an equal percent
499 |     # of tokens from each, since if one sequence is very short then each token
500 |     # that's truncated likely contains more information than a longer sequence.
501 |     while True:
502 |         total_length = len(tokens_a) + len(tokens_b)
503 |         if total_length <= max_length:
504 |             break
505 |         if len(tokens_a) > len(tokens_b):
506 |             tokens_a.pop()
507 |         else:
508 |             tokens_b.pop()
509 | 
510 | 
511 | def simple_accuracy(preds, labels):
512 |     return (preds == labels).mean()
513 | 
514 | 
515 | def acc_and_f1(preds, labels):
516 |     acc = simple_accuracy(preds, labels)
517 |     f1 = f1_score(y_true=labels, y_pred=preds)
518 |     return {
519 |         "acc": acc,
520 |         "f1": f1,
521 |         "acc_and_f1": (acc + f1) / 2,
522 |     }
523 | 
524 | 
525 | def pearson_and_spearman(preds, labels):
526 |     pearson_corr = pearsonr(preds, labels)[0]
527 |     spearman_corr = spearmanr(preds, labels)[0]
528 |     return {
529 |         "pearson": pearson_corr,
530 |         "spearmanr": spearman_corr,
531 |         "corr": (pearson_corr + spearman_corr) / 2,
532 |     }
533 | 
534 | 
535 | def compute_metrics(task_name, preds, labels):
536 |     assert len(preds) == len(labels)
537 |     if task_name == "cola":
538 |         return {"mcc": matthews_corrcoef(labels, preds)}
539 |     elif task_name == "sst-2":
540 |         return {"acc": simple_accuracy(preds, labels)}
541 |     elif task_name == "mrpc":
542 |         return acc_and_f1(preds, labels)
543 |     elif task_name == "sts-b":
544 |         return pearson_and_spearman(preds, labels)
545 |     elif task_name == "qqp":
546 |         return acc_and_f1(preds, labels)
547 |     elif task_name == "mnli":
548 |         return {"acc": simple_accuracy(preds, labels)}
549 |     elif task_name == "mnli-mm":
550 |         return {"acc": simple_accuracy(preds, labels)}
551 |     elif task_name == "qnli":
552 |         return {"acc": simple_accuracy(preds, labels)}
553 |     elif task_name == "rte":
554 |         return {"acc": simple_accuracy(preds, labels)}
555 |     elif task_name == "wnli":
556 |         return {"acc": simple_accuracy(preds, labels)}
557 |     else:
558 |         raise KeyError(task_name)
559 | 
560 | processors = {
561 |     "cola": ColaProcessor,
562 |     "mnli": MnliProcessor,
563 |     "mnli-mm": MnliMismatchedProcessor,
564 |     "mrpc": MrpcProcessor,
565 |     "sst-2": Sst2Processor,
566 |     "sts-b": StsbProcessor,
567 |     "qqp": QqpProcessor,
568 |     "qnli": QnliProcessor,
569 |     "rte": RteProcessor,
570 |     "wnli": WnliProcessor,
571 | }
572 | 
573 | output_modes = {
574 |     "cola": "classification",
575 |     "mnli": "classification",
576 |     "mrpc": "classification",
577 |     "sst-2": "classification",
578 |     "sts-b": "regression",
579 |     "qqp": "classification",
580 |     "qnli": "classification",
581 |     "rte": "classification",
582 |     "wnli": "classification",
583 | }
584 | 


--------------------------------------------------------------------------------
/scripts/pregenerate_training_data.py:
--------------------------------------------------------------------------------
  1 | #!/h/haoran/anaconda3/bin/python
  2 | import sys
  3 | import os
  4 | sys.path.append(os.getcwd())
  5 | from argparse import ArgumentParser
  6 | from pathlib import Path
  7 | from tqdm import tqdm, trange
  8 | from tempfile import TemporaryDirectory
  9 | import shelve
 10 | 
 11 | from random import random, randrange, randint, shuffle, choice
 12 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 13 | import numpy as np
 14 | import json
 15 | import collections
 16 | import Constants
 17 | import pandas as pd
 18 | 
 19 | '''
 20 | Code adapted from simple_lm_finetuning.py in huggingface/pytorch-pretrained-BERT
 21 | '''
 22 | class DocumentDatabase:
 23 |     def __init__(self):
 24 |         self.documents = []
 25 |         self.temp_dir = None
 26 |         self.doc_lengths = []
 27 |         self.doc_cumsum = None
 28 |         self.cumsum_max = None
 29 | 
 30 |     def add_document(self, document):
 31 |         if not document:
 32 |             return
 33 |         self.documents.append(document) #each document is a list of dictionaries
 34 |         self.doc_lengths.append(len(document))
 35 | 
 36 |     def _precalculate_doc_weights(self):
 37 |         self.doc_cumsum = np.cumsum(self.doc_lengths)
 38 |         self.cumsum_max = self.doc_cumsum[-1]
 39 | 
 40 |     def sample_doc(self, current_idx, sentence_weighted=True):
 41 |         # Uses the current iteration counter to ensure we don't sample the same doc twice
 42 |         if sentence_weighted:
 43 |             # With sentence weighting, we sample docs proportionally to their sentence length
 44 |             if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
 45 |                 self._precalculate_doc_weights()
 46 |             rand_start = self.doc_cumsum[current_idx]
 47 |             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
 48 |             sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
 49 |             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
 50 |         else:
 51 |             # If we don't use sentence weighting, then every doc has an equal chance to be chosen
 52 |             sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
 53 |         assert sampled_doc_index != current_idx
 54 |         return self.documents[sampled_doc_index]
 55 | 
 56 |     def __len__(self):
 57 |         return len(self.doc_lengths)
 58 | 
 59 |     def __getitem__(self, item):
 60 |         return self.documents[item]
 61 | 
 62 | 
 63 | def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
 64 |     """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
 65 |     while True:
 66 |         total_length = len(tokens_a) + len(tokens_b)
 67 |         if total_length <= max_num_tokens:
 68 |             break
 69 | 
 70 |         trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
 71 |         assert len(trunc_tokens) >= 1
 72 | 
 73 |         # We want to sometimes truncate from the front and sometimes from the
 74 |         # back to add more randomness and avoid biases.
 75 |         if random() < 0.5:
 76 |             del trunc_tokens[0]
 77 |         else:
 78 |             trunc_tokens.pop()
 79 | 
 80 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
 81 |                                           ["index", "label"])
 82 | 
 83 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
 84 |     """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
 85 |     with several refactors to clean it up and remove a lot of unnecessary variables."""
 86 |     cand_indices = []
 87 |     for (i, token) in enumerate(tokens):
 88 |         if token == "[CLS]" or token == "[SEP]":
 89 |             continue
 90 |         # Whole Word Masking means that if we mask all of the wordpieces
 91 |         # corresponding to an original word. When a word has been split into
 92 |         # WordPieces, the first token does not have any marker and any subsequence
 93 |         # tokens are prefixed with ##. So whenever we see the ## token, we
 94 |         # append it to the previous set of word indexes.
 95 |         #
 96 |         # Note that Whole Word Masking does *not* change the training code
 97 |         # at all -- we still predict each WordPiece independently, softmaxed
 98 |         # over the entire vocabulary.
 99 |         if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
100 |             cand_indices[-1].append(i)
101 |         else:
102 |             cand_indices.append([i])
103 | 
104 |     num_to_mask = min(max_predictions_per_seq,
105 |                       max(1, int(round(len(tokens) * masked_lm_prob))))
106 |     shuffle(cand_indices)
107 |     masked_lms = []
108 |     covered_indexes = set()
109 |     for index_set in cand_indices:
110 |         if len(masked_lms) >= num_to_mask:
111 |             break
112 |         # If adding a whole-word mask would exceed the maximum number of
113 |         # predictions, then just skip this candidate.
114 |         if len(masked_lms) + len(index_set) > num_to_mask:
115 |             continue
116 |         is_any_index_covered = False
117 |         for index in index_set:
118 |             if index in covered_indexes:
119 |                 is_any_index_covered = True
120 |                 break
121 |         if is_any_index_covered:
122 |             continue
123 |         for index in index_set:
124 |             covered_indexes.add(index)
125 | 
126 |             masked_token = None
127 |             # 80% of the time, replace with [MASK]
128 |             if random() < 0.8:
129 |                 masked_token = "[MASK]"
130 |             else:
131 |                 # 10% of the time, keep original
132 |                 if random() < 0.5:
133 |                     masked_token = tokens[index]
134 |                 # 10% of the time, replace with random word
135 |                 else:
136 |                     masked_token = choice(vocab_list)
137 |             masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
138 |             tokens[index] = masked_token
139 | 
140 |     assert len(masked_lms) <= num_to_mask
141 |     masked_lms = sorted(masked_lms, key=lambda x: x.index)
142 |     mask_indices = [p.index for p in masked_lms]
143 |     masked_token_labels = [p.label for p in masked_lms]
144 | 
145 |     return tokens, mask_indices, masked_token_labels
146 | 
147 | 
148 | def create_instances_from_document(
149 |         doc_database, doc_idx, max_seq_length, short_seq_prob,
150 |         masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
151 |     """This code is mostly a duplicate of the equivalent function from Google BERT's repo.
152 |     However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
153 |     Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
154 |     (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task."""
155 |     document = doc_database[doc_idx]
156 |     # Account for [CLS], [SEP], [SEP]
157 |     max_num_tokens = max_seq_length - 3
158 | 
159 |     # We *usually* want to fill up the entire sequence since we are padding
160 |     # to `max_seq_length` anyways, so short sequences are generally wasted
161 |     # computation. However, we *sometimes*
162 |     # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
163 |     # sequences to minimize the mismatch between pre-training and fine-tuning.
164 |     # The `target_seq_length` is just a rough target however, whereas
165 |     # `max_seq_length` is a hard limit.
166 |     target_seq_length = max_num_tokens
167 |     if random() < short_seq_prob:
168 |         target_seq_length = randint(2, max_num_tokens)
169 | 
170 |     # We DON'T just concatenate all of the tokens from a document into a long
171 |     # sequence and choose an arbitrary split point because this would make the
172 |     # next sentence prediction task too easy. Instead, we split the input into
173 |     # segments "A" and "B" based on the actual "sentences" provided by the user
174 |     # input.
175 |     instances = []
176 |     current_chunk = []
177 |     current_length = 0
178 |     i = 0
179 |     while i < len(document):
180 |         segment = document[i]['tokens']
181 |         current_chunk.append(segment)
182 |         current_length += len(segment)
183 |         groups_a = document[i]['groups']
184 |         if i == len(document) - 1 or current_length >= target_seq_length:
185 |             if current_chunk:
186 |                 # `a_end` is how many segments from `current_chunk` go into the `A`
187 |                 # (first) sentence.
188 |                 a_end = 1
189 |                 if len(current_chunk) >= 2:
190 |                     a_end = randrange(1, len(current_chunk))
191 | 
192 |                 tokens_a = []
193 |                 for j in range(a_end):
194 |                     tokens_a.extend(current_chunk[j])
195 | 
196 |                 tokens_b = []
197 |                 random_document = None
198 |                 # Random next
199 |                 if len(current_chunk) == 1 or random() < 0.5:
200 |                     is_random_next = True
201 |                     target_b_length = target_seq_length - len(tokens_a)
202 | 
203 |                     # Sample a random document, with longer docs being sampled more frequently
204 |                     random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
205 | 
206 |                     random_start = randrange(0, len(random_document))
207 |                     for j in range(random_start, len(random_document)):
208 |                         doc_b_last = random_document[j]
209 |                         tokens_b.extend(doc_b_last['tokens'])
210 |                         if len(tokens_b) >= target_b_length:
211 |                             break
212 |                     groups_b = doc_b_last['groups']
213 |                     # We didn't actually use these segments so we "put them back" so
214 |                     # they don't go to waste.
215 |                     num_unused_segments = len(current_chunk) - a_end
216 |                     i -= num_unused_segments
217 |                 # Actual next
218 |                 else:
219 |                     groups_b = groups_a
220 |                     is_random_next = False
221 |                     for j in range(a_end, len(current_chunk)):
222 |                         tokens_b.extend(current_chunk[j])
223 |                 truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
224 | 
225 |                 try:
226 |                     assert len(tokens_a) >= 1
227 |                     assert len(tokens_b) >= 1
228 |                 except AssertionError:
229 |                     print(document)
230 |                     if random_document is not None:
231 |                         print(random_document)
232 |                     raise
233 | 
234 |                 tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
235 |                 # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
236 |                 # They are 1 for the B tokens and the final [SEP]
237 |                 segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]
238 | 
239 |                 tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
240 |                     tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)
241 | 
242 |                 instance = {
243 |                     "tokens": tokens,
244 |                     "segment_ids": segment_ids,
245 |                     "is_random_next": is_random_next,
246 |                     "masked_lm_positions": masked_lm_positions,
247 |                     "masked_lm_labels": masked_lm_labels,
248 |                     'groups_a': groups_a,
249 |                     'groups_b': groups_b}
250 |                 instances.append(instance)
251 |             current_chunk = []
252 |             current_length = 0
253 |         i += 1
254 | 
255 |     return instances
256 | 
257 | def convert_example_to_features(example, max_seq_length, tokenizer):
258 |     """
259 |     Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
260 |     IDs, LM labels, input_mask, CLS and SEP tokens etc.
261 |     :param example: InputExample, containing sentence input as strings and is_next label
262 |     :param max_seq_length: int, maximum length of sequence.
263 |     :param tokenizer: Tokenizer
264 |     :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
265 |     """
266 |     tokens_a = example.tokens_a
267 |     tokens_b = example.tokens_b
268 |     # Modifies `tokens_a` and `tokens_b` in place so that the total
269 |     # length is less than the specified length.
270 |     # Account for [CLS], [SEP], [SEP] with "- 3"
271 |     _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
272 | 
273 |     tokens_a, t1_label = random_word(tokens_a, tokenizer)
274 |     tokens_b, t2_label = random_word(tokens_b, tokenizer)
275 |     # concatenate lm labels and account for CLS, SEP, SEP
276 |     lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
277 | 
278 |     # The convention in BERT is:
279 |     # (a) For sequence pairs:
280 |     #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
281 |     #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
282 |     # (b) For single sequences:
283 |     #  tokens:   [CLS] the dog is hairy . [SEP]
284 |     #  type_ids: 0   0   0   0  0     0 0
285 |     #
286 |     # Where "type_ids" are used to indicate whether this is the first
287 |     # sequence or the second sequence. The embedding vectors for `type=0` and
288 |     # `type=1` were learned during pre-training and are added to the wordpiece
289 |     # embedding vector (and position vector). This is not *strictly* necessary
290 |     # since the [SEP] token unambigiously separates the sequences, but it makes
291 |     # it easier for the model to learn the concept of sequences.
292 |     #
293 |     # For classification tasks, the first vector (corresponding to [CLS]) is
294 |     # used as as the "sentence vector". Note that this only makes sense because
295 |     # the entire model is fine-tuned.
296 |     tokens = []
297 |     segment_ids = []
298 |     tokens.append("[CLS]")
299 |     segment_ids.append(0)
300 |     for token in tokens_a:
301 |         tokens.append(token)
302 |         segment_ids.append(0)
303 |     tokens.append("[SEP]")
304 |     segment_ids.append(0)
305 | 
306 |     assert len(tokens_b) > 0
307 |     for token in tokens_b:
308 |         tokens.append(token)
309 |         segment_ids.append(1)
310 |     tokens.append("[SEP]")
311 |     segment_ids.append(1)
312 | 
313 |     input_ids = tokenizer.convert_tokens_to_ids(tokens)
314 | 
315 |     # The mask has 1 for real tokens and 0 for padding tokens. Only real
316 |     # tokens are attended to.
317 |     input_mask = [1] * len(input_ids)
318 | 
319 |     # Zero-pad up to the sequence length.
320 |     while len(input_ids) < max_seq_length:
321 |         input_ids.append(0)
322 |         input_mask.append(0)
323 |         segment_ids.append(0)
324 |         lm_label_ids.append(-1)
325 | 
326 |     assert len(input_ids) == max_seq_length
327 |     assert len(input_mask) == max_seq_length
328 |     assert len(segment_ids) == max_seq_length
329 |     assert len(lm_label_ids) == max_seq_length
330 | 
331 |     if example.guid < 5:
332 |         logger.info("*** Example ***")
333 |         logger.info("guid: %s" % (example.guid))
334 |         logger.info("tokens: %s" % " ".join(
335 |                 [str(x) for x in tokens]))
336 |         logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
337 |         logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
338 |         logger.info(
339 |                 "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
340 |         logger.info("LM label: %s " % (lm_label_ids))
341 |         logger.info("Is next sentence label: %s " % (example.is_next))
342 | 
343 |     features = InputFeatures(input_ids=input_ids,
344 |                              input_mask=input_mask,
345 |                              segment_ids=segment_ids,
346 |                              lm_label_ids=lm_label_ids,
347 |                              is_next=example.is_next)
348 |     return features
349 | 
350 | def random_word(tokens, tokenizer):
351 |     """
352 |     Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
353 |     :param tokens: list of str, tokenized sentence.
354 |     :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
355 |     :return: (list of str, list of int), masked tokens and related labels for LM prediction
356 |     """
357 |     output_label = []
358 | 
359 |     for i, token in enumerate(tokens):
360 |         prob = random.random()
361 |         # mask token with 15% probability
362 |         if prob < 0.15:
363 |             prob /= 0.15
364 | 
365 |             # 80% randomly change token to mask token
366 |             if prob < 0.8:
367 |                 tokens[i] = "[MASK]"
368 | 
369 |             # 10% randomly change token to random token
370 |             elif prob < 0.9:
371 |                 tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
372 | 
373 |             # -> rest 10% randomly keep current token
374 | 
375 |             # append current token to output (we will predict these later)
376 |             try:
377 |                 output_label.append(tokenizer.vocab[token])
378 |             except KeyError:
379 |                 # For unknown words (should not occur with BPE vocab)
380 |                 output_label.append(tokenizer.vocab["[UNK]"])
381 |                 logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
382 |         else:
383 |             # no masking token (will be ignored by loss function later)
384 |             output_label.append(-1)
385 | 
386 |     return tokens, output_label
387 | 
388 | 
389 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
390 |     """Truncates a sequence pair in place to the maximum length."""
391 | 
392 |     # This is a simple heuristic which will always truncate the longer sequence
393 |     # one token at a time. This makes more sense than truncating an equal percent
394 |     # of tokens from each, since if one sequence is very short then each token
395 |     # that's truncated likely contains more information than a longer sequence.
396 |     while True:
397 |         total_length = len(tokens_a) + len(tokens_b)
398 |         if total_length <= max_length:
399 |             break
400 |         if len(tokens_a) > len(tokens_b):
401 |             tokens_a.pop()
402 |         else:
403 |             tokens_b.pop()
404 | 
405 | def getGroups(row):
406 |     temp =  {i['name']:row[i['name']] for i in Constants.groups}
407 |     temp['note_id'] = row['note_id']
408 |     return temp
409 | 
410 | def main():
411 |     parser = ArgumentParser()
412 |     parser.add_argument('--train_df', type=Path, required=True)
413 |     parser.add_argument('--col_name', type=str, required = True)
414 |     parser.add_argument("--output_dir", type=Path, required=True)
415 |     parser.add_argument("--bert_model", type=str, required=True)
416 |     parser.add_argument("--do_whole_word_mask", action="store_true", default = True,
417 |                         help="Whether to use whole word masking rather than per-WordPiece masking.")
418 |     parser.add_argument("--epochs_to_generate", type=int, default=3,
419 |                         help="Number of epochs of data to pregenerate")
420 |     parser.add_argument("--max_seq_len", type=int, default=128)
421 |     parser.add_argument("--short_seq_prob", type=float, default=0,
422 |                         help="Probability of making a short sentence as a training example")
423 |     parser.add_argument("--masked_lm_prob", type=float, default=0.15,
424 |                         help="Probability of masking each token for the LM task")
425 |     parser.add_argument("--max_predictions_per_seq", type=int, default=20,
426 |                         help="Maximum number of tokens to mask in each sequence")
427 |     parser.add_argument('--categories', type = str, nargs = '+', dest = 'categories', default = [])
428 |     parser.add_argument('--drop_group', type = str, default = '', help = 'name of adversarial protected group to drop classes for')
429 | 
430 |     args = parser.parse_args()
431 | 
432 |     df = pd.read_pickle(args.train_df)
433 | 
434 |     if len(args.categories) > 0:
435 |         for i in args.categories:
436 |             assert((df['category'] == i).sum() > 0) # make sure each category is present
437 |         df = df[df['category'].isin(args.categories)]
438 |         if df.shape[0] == 0:
439 |             raise Exception('dataframe is empty after subsetting!')
440 | 
441 |     if len(args.drop_group) > 0:
442 |         print('Records before dropping: %s' %len(df))
443 |         for i in Constants.drop_groups[args.drop_group]:
444 |             df = df[df[args.drop_group] != i]
445 |         print('Records after dropping: %s' %len(df))
446 | 
447 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
448 |     vocab_list = list(tokenizer.vocab.keys())
449 |     docs =  DocumentDatabase()
450 |     for idx, row in df.iterrows():
451 |         doc = []
452 |         groups = getGroups(row)
453 |         for d, line in enumerate(row[args.col_name]):
454 |             sample = {
455 |                 'tokens': tokenizer.tokenize(line),
456 |                  'groups': groups
457 |             }
458 |             doc.append(sample)
459 |         docs.add_document(doc)
460 | 
461 |     args.output_dir.mkdir(exist_ok=True, parents = True)
462 |     for epoch in trange(args.epochs_to_generate, desc="Epoch"):
463 |         epoch_filename = args.output_dir / f"epoch_{epoch}.json"
464 |         num_instances = 0
465 |         with epoch_filename.open('w') as epoch_file:
466 |             for doc_idx in trange(len(docs), desc="Document"):
467 |                 doc_instances = create_instances_from_document(
468 |                     docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
469 |                     masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
470 |                     whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
471 |                 doc_instances = [json.dumps(instance) for instance in doc_instances]
472 |                 for instance in doc_instances:
473 |                     epoch_file.write(instance + '\n')
474 |                     num_instances += 1
475 |         metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json"
476 |         with metrics_file.open('w') as metrics_file:
477 |             metrics = {
478 |                 "num_training_examples": num_instances,
479 |                 "max_seq_len": args.max_seq_len
480 |             }
481 |             metrics_file.write(json.dumps(metrics))
482 | 
483 | if __name__ == '__main__':
484 | 	main()
485 | 


--------------------------------------------------------------------------------
/scripts/adversarial_finetune_on_pregen.py:
--------------------------------------------------------------------------------
  1 | '''Adapted from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/lm_finetuning/finetune_on_pregenerated.py'''
  2 | import sys
  3 | import os
  4 | sys.path.insert(0, os.getcwd())
  5 | from argparse import ArgumentParser
  6 | from pathlib import Path
  7 | import os
  8 | import torch
  9 | import logging
 10 | import json
 11 | import random
 12 | import copy
 13 | import numpy as np
 14 | from collections import namedtuple
 15 | from tempfile import TemporaryDirectory
 16 | from torch.utils.data import DataLoader, Dataset, RandomSampler
 17 | from torch.utils.data.distributed import DistributedSampler
 18 | import torch.nn.functional as F
 19 | import torch.nn as nn
 20 | from tqdm import tqdm
 21 | 
 22 | from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 23 | from pytorch_pretrained_bert.modeling import BertForPreTraining
 24 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 25 | from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 26 | from gradient_reversal import GradientReversal
 27 | import Constants
 28 | import utils
 29 | 
 30 | 
 31 | # Create the InputFeatures container with named tuple fields
 32 | InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next domain_a domain_b")
 33 | 
 34 | # Configure loggers
 35 | log_format = '%(asctime)-10s: %(message)s'
 36 | logging.basicConfig(level=logging.INFO, format=log_format)
 37 | 
 38 | def convert_example_to_features(example, tokenizer, domain_to_id_dict, domain_name, max_seq_length):
 39 |     '''Helper function for turning JSON strings into tokenized features'''
 40 |     tokens = example["tokens"]
 41 |     segment_ids = example["segment_ids"]
 42 |     is_random_next = example["is_random_next"]
 43 |     masked_lm_positions = example["masked_lm_positions"]
 44 |     masked_lm_labels = example["masked_lm_labels"]
 45 | 
 46 |     groups_a = example["groups_a"]  # dictionary of 5 protected group categories and associated attribute
 47 |     groups_b = example["groups_b"]  # same keys as above, but with values for the second sequence
 48 |     domain_a = domain_to_id_dict[groups_a[domain_name]]  # for the domain of interest, convert the category string to unique category ID
 49 |     domain_b = domain_to_id_dict[groups_b[domain_name]]
 50 | 
 51 |     assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated
 52 |     input_ids = tokenizer.convert_tokens_to_ids(tokens)
 53 |     masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
 54 | 
 55 |     input_array = np.zeros(max_seq_length, dtype=np.int)
 56 |     input_array[:len(input_ids)] = input_ids
 57 | 
 58 |     mask_array = np.zeros(max_seq_length, dtype=np.bool)
 59 |     mask_array[:len(input_ids)] = 1
 60 | 
 61 |     segment_array = np.zeros(max_seq_length, dtype=np.bool)
 62 |     segment_array[:len(segment_ids)] = segment_ids
 63 | 
 64 |     lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)
 65 |     lm_label_array[masked_lm_positions] = masked_label_ids
 66 | 
 67 |     features = InputFeatures(input_ids=input_array,
 68 |                              input_mask=mask_array,
 69 |                              segment_ids=segment_array,
 70 |                              lm_label_ids=lm_label_array,
 71 |                              is_next=is_random_next,
 72 |                              domain_a=domain_a,
 73 |                              domain_b=domain_b,
 74 |                             )
 75 |     return features
 76 | 
 77 | def _save_model(model, args, suffix, config_suffix="", save_config=False):
 78 |     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
 79 |     output_model_file = os.path.join(args.output_dir, suffix)
 80 |     torch.save(model_to_save.state_dict(), output_model_file)
 81 |     if save_config:
 82 |         output_config_file = os.path.join(args.output_dir, config_suffix)
 83 |         model_to_save.config.to_json_file(output_config_file)
 84 | 
 85 | class PregeneratedDataset(Dataset):
 86 |     def __init__(self, training_path, epoch, tokenizer, domain_to_id_dict, domain_name, num_data_epochs, reduce_memory=False):
 87 |         self.vocab = tokenizer.vocab
 88 |         self.tokenizer = tokenizer
 89 |         self.domain_to_id_dict = domain_to_id_dict
 90 |         self.epoch = epoch
 91 |         self.data_epoch = epoch % num_data_epochs
 92 |         data_file = training_path / f"epoch_{self.data_epoch}.json"
 93 |         metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
 94 |         assert data_file.is_file() and metrics_file.is_file()
 95 |         metrics = json.loads(metrics_file.read_text())
 96 |         num_samples = metrics['num_training_examples']
 97 |         seq_len = metrics['max_seq_len']
 98 |         self.temp_dir = None
 99 |         self.working_dir = None
100 |         if reduce_memory:
101 |             self.temp_dir = TemporaryDirectory()
102 |             self.working_dir = Path(self.temp_dir.name)
103 |             input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap',
104 |                                   mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
105 |             input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap',
106 |                                     shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
107 |             segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap',
108 |                                     shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
109 |             lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap',
110 |                                      shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
111 |             lm_label_ids[:] = -1
112 |             is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap',
113 |                                  shape=(num_samples,), mode='w+', dtype=np.bool)
114 |             domain_a = np.memmap(filename=self.working_dir/'domain_a.memmap',
115 |                                 shape=(num_samples,), mode='w+', dtype=np.int32)
116 |             domain_b = np.memmap(filename=self.working_dir/'domain_b.memmap',
117 |                                 shape=(num_samples,), mode='w+', dtype=np.int32)
118 |         else:
119 |             input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
120 |             input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
121 |             segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
122 |             lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
123 |             is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool)
124 |             domain_a = np.zeros(shape=(num_samples,), dtype=np.int32)
125 |             domain_b = np.zeros(shape=(num_samples,), dtype=np.int32)
126 |         logging.info(f"Loading training examples for epoch {epoch}Þ[MaÞ[MaÞ")
127 |         with data_file.open() as f:
128 |             for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
129 |                 line = line.strip()
130 |                 example = json.loads(line)
131 |                 features = convert_example_to_features(example, tokenizer, domain_to_id_dict, domain_name, seq_len)
132 |                 input_ids[i] = features.input_ids
133 |                 segment_ids[i] = features.segment_ids
134 |                 input_masks[i] = features.input_mask
135 |                 lm_label_ids[i] = features.lm_label_ids
136 |                 is_nexts[i] = features.is_next
137 |                 domain_a[i] = features.domain_a
138 |                 domain_b[i] = features.domain_b
139 |         assert i == num_samples - 1  # Assert that the sample count metric was true
140 |         logging.info("Loading complete!")
141 |         self.num_samples = num_samples
142 |         self.seq_len = seq_len
143 |         self.input_ids = input_ids
144 |         self.input_masks = input_masks
145 |         self.segment_ids = segment_ids
146 |         self.lm_label_ids = lm_label_ids
147 |         self.is_nexts = is_nexts
148 |         self.domain_a = domain_a
149 |         self.domain_b = domain_b
150 | 
151 |     def __len__(self):
152 |         return self.num_samples
153 | 
154 |     def __getitem__(self, item):
155 |         return (torch.tensor(self.input_ids[item].astype(np.int64)),
156 |                 torch.tensor(self.input_masks[item].astype(np.int64)),
157 |                 torch.tensor(self.segment_ids[item].astype(np.int64)),
158 |                 torch.tensor(self.lm_label_ids[item].astype(np.int64)),
159 |                 torch.tensor(self.is_nexts[item].astype(np.int64)),
160 |                 torch.tensor(self.domain_a[item].astype(np.int64)),
161 |                 torch.tensor(self.domain_b[item].astype(np.int64)))
162 | 
163 | 
164 | class Discriminator(nn.Module):
165 |     def __init__(self, input_dim, num_layers, num_categories, lm):
166 |         super(Discriminator, self).__init__()
167 |         self.num_layers = num_layers
168 |         assert(num_layers >= 1)
169 |         self.input_dim = input_dim
170 |         self.num_categories = num_categories
171 |         self.lm = lm
172 |         self.layers = [GradientReversal(lambda_ = lm)]
173 |         for c, i in enumerate(range(num_layers)):
174 |             if c != num_layers-1:
175 |                 self.layers.append(nn.Linear(input_dim // (2**c), input_dim // (2**(c+1))))
176 |                 self.layers.append(nn.ReLU())
177 |             else:
178 |                 self.layers.append(nn.Linear(input_dim // (2**c), num_categories))
179 |                 self.layers.append(nn.Softmax())
180 |         self.layers = nn.ModuleList(self.layers)
181 | 
182 |     def forward(self, x):
183 |         for i in range(len(self.layers)):
184 |             x = self.layers[i](x)
185 |         return x
186 | 
187 | 
188 | def main():
189 |     parser = ArgumentParser()
190 |     parser.add_argument('--pregenerated_data', type=Path, required=True)
191 |     parser.add_argument('--output_dir', type=Path, required=True)
192 |     parser.add_argument('--domain_of_interest', type=str, required=True)  # Added for domain adaptation
193 |     parser.add_argument('--layer_to_get_features', type=int, default=11, help="Choose an integer in [0, 11] for BERT basic (with 12 layers) or [0, 23] for BERT large (with 24 layers)")
194 |     parser.add_argument('--discriminator_input_dim', type=int, default=768, help='Must correspond to number of hidden dimensions for BERT embeddings.')
195 |     parser.add_argument('--lambda_', type=float, required=True, help = 'Weighting parameter for the loss of the adversarial network')
196 |     parser.add_argument('--num_layers', type=int, required=True, help = 'Number of fully connected layers for the discriminator')
197 |     parser.add_argument("--use_new_mapping", action="store_true", help = 'whether to use new mapping in Constants file')
198 |     parser.add_argument('--discriminator_a_path', type=str, required = False, help = 'path for pretrained discriminator_a if it exists, otherwise initialize from random')
199 |     parser.add_argument('--discriminator_b_path', type=str, required = False, help = 'path for pretrained discriminator_b if it exists, otherwise initialize from random')
200 |     parser.add_argument("--bert_model", type=str, required=True, help="Path to BERT pre-trained model, or select from list: bert-base-uncased, "
201 |                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
202 |     parser.add_argument("--do_lower_case", action="store_true")
203 |     parser.add_argument("--reduce_memory", action="store_true",
204 |                         help="Store training data as on-disc memmaps to massively reduce memory usage")
205 | 
206 |     parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
207 |     parser.add_argument("--local_rank",
208 |                         type=int,
209 |                         default=-1,
210 |                         help="local_rank for distributed training on gpus")
211 |     parser.add_argument("--no_cuda",
212 |                         action='store_true',
213 |                         help="Whether not to use CUDA when available")
214 |     parser.add_argument('--gradient_accumulation_steps',
215 |                         type=int,
216 |                         default=1,
217 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
218 |     parser.add_argument("--train_batch_size",
219 |                         default=32,
220 |                         type=int,
221 |                         help="Total batch size for training.")
222 |     parser.add_argument('--fp16',
223 |                         action='store_true',
224 |                         help="Whether to use 16-bit float precision instead of 32-bit")
225 |     parser.add_argument('--loss_scale',
226 |                         type=float, default=0,
227 |                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
228 |                         "0 (default value): dynamic loss scaling.\n"
229 |                         "Positive power of 2: static loss scaling value.\n")
230 |     parser.add_argument("--warmup_proportion",
231 |                         default=0.1,
232 |                         type=float,
233 |                         help="Proportion of training to perform linear learning rate warmup for. "
234 |                              "E.g., 0.1 = 10%% of training.")
235 |     parser.add_argument("--learning_rate",
236 |                         default=3e-5,
237 |                         type=float,
238 |                         help="The initial learning rate for Adam.")
239 |     parser.add_argument('--seed',
240 |                         type=int,
241 |                         default=42,
242 |                         help="random seed for initialization")
243 |     args = parser.parse_args()
244 | 
245 |     assert args.pregenerated_data.is_dir(), \
246 |         "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
247 | 
248 |     # get domain mapping
249 |     assert (args.domain_of_interest in Constants.mapping)
250 |     if args.use_new_mapping:
251 |         domain_mapping = Constants.newmapping[args.domain_of_interest]
252 |     else:
253 |         domain_mapping = Constants.mapping[args.domain_of_interest]
254 |     num_categories = len(set(domain_mapping.values()))
255 | 
256 |     # check that data has been pregenerated for the specified epochs
257 |     samples_per_epoch = []
258 |     for i in range(args.epochs):
259 |         epoch_file = args.pregenerated_data / f"epoch_{i}.json"
260 |         metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
261 |         if epoch_file.is_file() and metrics_file.is_file():
262 |             metrics = json.loads(metrics_file.read_text())
263 |             samples_per_epoch.append(metrics['num_training_examples'])
264 |         else:
265 |             if i == 0:
266 |                 exit("No training data was found!")
267 |             print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
268 |             print("This script will loop over the available data, but training diversity may be negatively impacted.")
269 |             num_data_epochs = i
270 |             break
271 |     else:
272 |         num_data_epochs = args.epochs
273 | 
274 |     # get up GPU
275 |     if args.local_rank == -1 or args.no_cuda:
276 |         if torch.cuda.is_available() and not args.no_cuda:
277 |             device = torch.device("cuda")
278 |         else:
279 |             print("[WARNING] Using CPU instead of GPU for training!")
280 |             device = torch.device("cpu")
281 |         n_gpu = torch.cuda.device_count()
282 |     else:
283 |         torch.cuda.set_device(args.local_rank)
284 |         device = torch.device("cuda", args.local_rank)
285 |         n_gpu = 1
286 |         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
287 |         torch.distributed.init_process_group(backend='nccl')
288 | 
289 |     logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
290 |         device, n_gpu, bool(args.local_rank != -1), args.fp16))
291 | 
292 |     if args.gradient_accumulation_steps < 1:
293 |         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
294 |                             args.gradient_accumulation_steps))
295 | 
296 |     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
297 | 
298 |     random.seed(args.seed)
299 |     np.random.seed(args.seed)
300 |     torch.manual_seed(args.seed)
301 |     if n_gpu > 0:
302 |         torch.cuda.manual_seed_all(args.seed)
303 | 
304 |     if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
305 |         logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
306 |     args.output_dir.mkdir(parents=True, exist_ok=True)
307 | 
308 |     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
309 | 
310 |     total_train_examples = 0
311 |     for i in range(args.epochs):
312 |         # The modulo takes into account the fact that we may loop over limited epochs of data
313 |         total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
314 | 
315 |     num_train_optimization_steps = int(
316 |         total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
317 |     if args.local_rank != -1:
318 |         num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
319 | 
320 |     # Create transformer encoder layers
321 |     model = BertForPreTraining.from_pretrained(args.bert_model)
322 |     embed_layers = model.bert
323 |     embed_layers = embed_layers.to(device)
324 | 
325 |     # Make discriminator networks for the two sentences
326 |     discriminator_a = Discriminator(input_dim = args.discriminator_input_dim,
327 |                                   num_layers = args.num_layers,
328 |                                   num_categories = num_categories,
329 |                                   lm = args.lambda_)
330 |     discriminator_b = Discriminator(input_dim = args.discriminator_input_dim,
331 |                                   num_layers = args.num_layers,
332 |                                   num_categories = num_categories,
333 |                                   lm = args.lambda_)
334 | 
335 | 
336 |     # Prepare models for GPU training
337 |     if args.fp16:
338 |         # cast floating point parameters to the half precision datatype
339 |         model.half()
340 |         discriminator_a.half()
341 |         discriminator_b.half()
342 | 
343 |     model = model.to(device)
344 |     discriminator_a = discriminator_a.to(device)
345 |     discriminator_b = discriminator_b.to(device)
346 | 
347 |     if args.discriminator_a_path:
348 |         discriminator_a.load_state_dict(torch.load(args.discriminator_a_path))
349 | 
350 |     if args.discriminator_b_path:
351 |         discriminator_b.load_state_dict(torch.load(args.discriminator_b_path))
352 | 
353 |     if args.local_rank != -1:
354 |         try:
355 |             from apex.parallel import DistributedDataParallel as DDP
356 |         except ImportError:
357 |             raise ImportError(
358 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
359 |         model = DDP(model)
360 |         discriminator_a = DPP(model)
361 |         discriminator_b = DPP(model)
362 |     elif n_gpu > 1:
363 |         model = torch.nn.DataParallel(model)
364 |         discriminator_a = torch.nn.DataParallel(discriminator_a)
365 |         discriminator_b = torch.nn.DataParallel(discriminator_b)
366 | 
367 |     # Prepare optimizer
368 |     param_optimizer = list(model.named_parameters()) + list(discriminator_a.named_parameters()) + list(discriminator_b.named_parameters())
369 |     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
370 |     optimizer_grouped_parameters = [
371 |         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
372 |          'weight_decay': 0.01},
373 |         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
374 |     ]
375 | 
376 |     if args.fp16:
377 |         try:
378 |             from apex.optimizers import FP16_Optimizer
379 |             from apex.optimizers import FusedAdam
380 |         except ImportError:
381 |             raise ImportError(
382 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
383 | 
384 |         optimizer = FusedAdam(optimizer_grouped_parameters,
385 |                               lr=args.learning_rate,
386 |                               bias_correction=False,
387 |                               max_grad_norm=1.0)
388 |         if args.loss_scale == 0:
389 |             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
390 |         else:
391 |             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
392 |         warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
393 |                                              t_total=num_train_optimization_steps)
394 |     else:
395 |         optimizer = BertAdam(optimizer_grouped_parameters,
396 |                              lr=args.learning_rate,
397 |                              warmup=args.warmup_proportion,
398 |                              t_total=num_train_optimization_steps)
399 | 
400 |     global_step = 0
401 |     logging.info("***** Running training *****")
402 |     logging.info(f"  Num examples = {total_train_examples}")
403 |     logging.info("  Batch size = %d", args.train_batch_size)
404 |     logging.info("  Num steps = %d", num_train_optimization_steps)
405 | 
406 |     loss_func = nn.DataParallel(nn.CrossEntropyLoss())
407 | 
408 |     # Track training accuracy/loss across **all epochs**
409 |     train_hist = {'domain_a_loss': [],
410 |                 'domain_a_acc': [],
411 |                 'domain_b_loss': [],
412 |                 'domain_b_acc': [],
413 |                 'label_loss': [],
414 |                 'tr_loss': [],
415 |                 }
416 | 
417 |     for epoch in range(args.epochs):
418 |         # put models in train mode
419 |         model.train()
420 |         discriminator_a.train()
421 |         discriminator_b.train()
422 | 
423 |         epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
424 |                                             domain_to_id_dict = domain_mapping, domain_name = args.domain_of_interest,
425 |                                             num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
426 |         if args.local_rank == -1:
427 |             train_sampler = RandomSampler(epoch_dataset)
428 |         else:
429 |             train_sampler = DistributedSampler(epoch_dataset)
430 |         train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
431 | 
432 |         # Reinitialize and reaccumulate stats for **current epoch** (i.e. across batchs)
433 |         epoch_stats = {'domain_a_loss': 0,  # not yet normalized for the number of steps
434 |                 'domain_a_correct': 0,
435 |                 'domain_b_loss': 0,
436 |                 'domain_b_correct': 0,
437 |                 'label_loss': 0,
438 |                 'tr_loss': 0,
439 |                 'nb_tr_examples': 0,
440 |                 'nb_tr_steps': 0,
441 |                 }
442 | 
443 |         with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
444 |             for step, batch in enumerate(train_dataloader):
445 |                 batch = tuple(t.to(device) for t in batch)
446 |                 input_ids, input_mask, segment_ids, lm_label_ids, is_next, domain_a, domain_b = batch
447 | 
448 |                 # Get class label loss
449 |                 label_loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
450 | 
451 |                 # Get feature embeddings for this batch
452 |                 with torch.no_grad():
453 |                     encoded_layers, pooled_output = embed_layers(input_ids, segment_ids, output_all_encoded_layers=True)
454 |                 features = encoded_layers[args.layer_to_get_features]
455 |                 assert features.shape[2] == args.discriminator_input_dim
456 | 
457 |                 # We only feed the [CLS] token into the discriminator:
458 |                 domain_input = features[:, 0, :]   # tensor of (batch_size, hidden_dim)
459 | 
460 |                 # get domain predictions and loss
461 |                 domain_input = domain_input.to(device)
462 |                 domain_a_preds = discriminator_a(domain_input)
463 |                 domain_b_preds = discriminator_b(domain_input)
464 | 
465 |                 domain_a_loss = loss_func(domain_a_preds, domain_a)
466 |                 domain_b_loss = loss_func(domain_b_preds, domain_b)
467 | 
468 |                 if n_gpu > 1:
469 |                     label_loss = label_loss.mean() # mean() to average on multi-gpu.
470 |                     domain_a_loss = domain_a_loss.mean()
471 |                     domain_b_loss = domain_b_loss.mean()
472 |                 if args.gradient_accumulation_steps > 1:
473 |                     label_loss = label_loss / args.gradient_accumulation_steps
474 |                     domain_a_loss = domain_a_loss / args.gradient_accumulation_steps
475 |                     domain_b_loss = domain_b_loss / args.gradient_accumulation_steps
476 |                 loss = label_loss + domain_a_loss + domain_b_loss
477 | 
478 | 
479 |                 if args.fp16:
480 |                     optimizer.backward(loss)
481 |                 else:
482 |                     loss.backward()
483 | 
484 |                 # Gather loss and domain prediction accuracies
485 |                 domain_a_correct = domain_a_preds.argmax(dim=-1).eq(domain_a).sum().item()
486 |                 domain_b_correct = domain_b_preds.argmax(dim=-1).eq(domain_b).sum().item()
487 | 
488 |                 epoch_stats['domain_a_loss'] += domain_a_loss.item()
489 |                 epoch_stats['domain_a_correct'] += domain_a_correct
490 |                 epoch_stats['domain_b_loss'] += domain_b_loss.item()
491 |                 epoch_stats['domain_b_correct'] += domain_b_correct
492 |                 epoch_stats['label_loss'] += label_loss.item()
493 |                 epoch_stats['tr_loss'] += loss.item()
494 |                 epoch_stats['nb_tr_examples'] += input_ids.size(0)
495 |                 epoch_stats['nb_tr_steps'] += 1
496 | 
497 |                 pbar.update(1)
498 |                 mean_loss = epoch_stats['tr_loss'] * args.gradient_accumulation_steps / epoch_stats['nb_tr_steps']
499 |                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
500 |                 if (step + 1) % args.gradient_accumulation_steps == 0:
501 |                     if args.fp16:
502 |                         # modify learning rate with special warm up BERT uses
503 |                         # if args.fp16 is False, BertAdam is used that handles this automatically
504 |                         lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
505 |                         for param_group in optimizer.param_groups:
506 |                             param_group['lr'] = lr_this_step
507 |                     optimizer.step()
508 |                     optimizer.zero_grad()
509 |                     global_step += 1
510 | 
511 |                 # At the end of **each minibatch**, add a new point to the training history plot
512 |                 # Note that even though a new point is plotted, the loss and accuracy is calculated from
513 |                 # stats accumulated across the entire epoch.
514 |                 train_hist['domain_a_loss'].append(epoch_stats['domain_a_loss'] / epoch_stats['nb_tr_steps'])
515 |                 train_hist['domain_b_loss'].append(epoch_stats['domain_b_loss'] / epoch_stats['nb_tr_steps'])
516 |                 train_hist['domain_a_acc'].append(epoch_stats['domain_a_correct'] / epoch_stats['nb_tr_examples'])
517 |                 train_hist['domain_b_acc'].append(epoch_stats['domain_b_correct'] / epoch_stats['nb_tr_examples'])
518 |                 train_hist['label_loss'].append(epoch_stats['label_loss'] / epoch_stats['nb_tr_steps'])
519 |                 train_hist['tr_loss'].append(epoch_stats['tr_loss'] / epoch_stats['nb_tr_steps'])
520 | 
521 |         # At the end of each **epoch**, save a trained model and a training loss/acc plot
522 |         # Note that the plot will still accumulate points from _all_ epochs
523 |         logging.info(f"** ** * Saving fine-tuned model for epoch {epoch} ** ** * ")
524 |         _save_model(model, args, suffix=WEIGHTS_NAME, config_suffix=CONFIG_NAME, save_config=True)
525 |         _save_model(discriminator_a, args, suffix=f"discriminator_a_{epoch}.bin", save_config=False)
526 |         _save_model(discriminator_b, args, suffix=f"discriminator_b_{epoch}.bin", save_config=False)
527 |         tokenizer.save_vocabulary(args.output_dir)
528 | 
529 |         utils.plot_training_history(train_hist, 'domain_a_loss', args.output_dir / 'figures', title="domain of first sequence: training loss")
530 |         utils.plot_training_history(train_hist, 'domain_b_loss', args.output_dir / 'figures', title="domain of second sequence: training loss")
531 |         utils.plot_training_history(train_hist, 'label_loss', args.output_dir / 'figures', title="BERT pretraining task labels: training loss")
532 |         utils.plot_training_history(train_hist, 'tr_loss', args.output_dir / 'figures', title="Overall loss")
533 |         utils.plot_training_history(train_hist, 'domain_a_acc', args.output_dir / 'figures', title="domain of first sequence: training accuracy")
534 |         utils.plot_training_history(train_hist, 'domain_b_acc', args.output_dir / 'figures', title="domain of second sequence: training accuracy")
535 | 
536 |     print(f'Finished {args.epochs} epochs of training.')
537 | 
538 |     args_dict = copy.deepcopy(vars(args))
539 |     for key, value in args_dict.items():
540 |         args_dict[key] = str(args_dict[key])
541 |     args_json = json.dumps(args_dict)
542 |     with open(args.output_dir / "parser_arguments.json", "w") as f:
543 |         for line in args_json:
544 |             f.write(line)
545 | 
546 | 
547 | 
548 | if __name__ == '__main__':
549 |     main()
550 | 


--------------------------------------------------------------------------------