├── runs ├── __init__.py ├── core │ ├── __init__.py │ ├── run_ft_reg.sh │ ├── run_deprobe.sh │ ├── run_ft_1hot.sh │ ├── run_ft_1hot_selectivity.py │ ├── run_ft_reg.py │ ├── run_ft_1hot.py │ └── run_deprobe.py ├── encode │ ├── __init__.py │ ├── run_encode.sh │ ├── reduce_tag_labels.sh │ ├── reduce_ner_labels.sh │ ├── run_layer_encode.sh │ ├── reduce_dep_labels.sh │ ├── run_encode.py │ └── run_layer_encode.py ├── evaluate │ ├── __init__.py │ ├── run_layer_wise_deprobe.sh │ ├── run_specific_eval.sh │ ├── run_layer_wise_lm.sh │ ├── run_eval.sh │ ├── run_eval_per_dim.sh │ ├── run_layer_wise_lm.py │ ├── run_specific_eval.py │ ├── run_layer_wise_deprobe.py │ ├── run_eval_per_dim.py │ └── run_eval.py ├── clear_set_ts.py └── ts_run.py ├── amnesic_probing ├── __init__.py ├── tasks │ ├── __init__.py │ ├── reduce_labels.py │ ├── layer_wise_lm.py │ ├── task_specific_eval.py │ ├── lm_per_dim.py │ ├── layer_wise_deprobe.py │ ├── data_preparation.py │ ├── remove_property.py │ ├── utils.py │ └── lm.py ├── debias │ ├── __init__.py │ ├── classifier.py │ └── debias.py ├── debiased_finetuning │ ├── utils.py │ ├── debiased_finetuning_lm.py │ └── rebiased_finetuning_lm.py └── encoders │ ├── control.py │ ├── encode.py │ ├── encode_with_forward_pass │ ├── __init__.py │ ├── encode.py │ └── bert_encoding.py │ ├── bert_encoding.py │ └── __init__.py ├── requirements.txt ├── LICENSE └── README.md /runs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runs/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runs/encode/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /amnesic_probing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runs/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /amnesic_probing/debias/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | docopt 4 | tensorboardX 5 | wandb -------------------------------------------------------------------------------- /runs/core/run_ft_reg.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | train_path=$1 6 | out_dir=$2 7 | debias_p=$3 8 | 9 | python amnesic_probing/debiased_finetuning/debiased_finetuning_lm.py \ 10 | --train_path $train_path \ 11 | --out_dir $out_dir \ 12 | --n_epochs 20 \ 13 | --debias $debias_p \ 14 | --device cuda:0 \ 15 | --wandb 16 | 
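For reference, the runner in `runs/core/run_ft_reg.py` fills the three positional arguments above (`train_path`, `out_dir`, `debias_p`), so a single manual invocation might look like the sketch below; the concrete paths are placeholders mirroring what that runner generates, not verified outputs.

```sh
# Hypothetical manual call of run_ft_reg.sh for the UD POS-tag ("tag") property, masked setting.
# $1 = encoded training data, $2 = output/model dir, $3 = amnesic projection matrix P.
sh runs/core/run_ft_reg.sh \
    data/ud_output_masked/train \
    models/lm/tag/masked/layer:last/ \
    models/lm/tag/masked/layer:last/P.npy
```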
-------------------------------------------------------------------------------- /runs/evaluate/run_layer_wise_deprobe.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | layers=$1 6 | proj_vecs=$2 7 | labels=$3 8 | text=$4 9 | task=$5 10 | 11 | python amnesic_probing/tasks/layer_wise_deprobe.py \ 12 | --layers $layers \ 13 | --proj_vecs $proj_vecs \ 14 | --labels $labels \ 15 | --text $text \ 16 | --task $task \ 17 | --wandb 18 | -------------------------------------------------------------------------------- /runs/evaluate/run_specific_eval.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | vecs=$1 6 | labels=$2 7 | text=$3 8 | deprobe_dir=$4 9 | device=$5 10 | 11 | python amnesic_probing/tasks/task_specific_eval.py \ 12 | --vecs $vecs \ 13 | --labels $labels \ 14 | --text $text \ 15 | --deprobe_dir $deprobe_dir \ 16 | --device $device \ 17 | --wandb 18 | -------------------------------------------------------------------------------- /runs/evaluate/run_layer_wise_lm.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | proj_vecs=$1 6 | labels=$2 7 | text=$3 8 | task=$4 9 | device=$5 10 | 11 | python amnesic_probing/tasks/layer_wise_lm.py \ 12 | --proj_vecs $proj_vecs \ 13 | --labels $labels \ 14 | --text $text \ 15 | --task $task \ 16 | --n 100000 \ 17 | --device $device \ 18 | --wandb 19 | -------------------------------------------------------------------------------- /runs/core/run_deprobe.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | vecs=$1 6 | labels=$2 7 | out_dir=$3 8 | task=$4 9 | balance=$5 10 | 11 | python amnesic_probing/tasks/remove_property.py \ 12 | --vecs $vecs \ 13 | --labels $labels \ 14 | --out_dir $out_dir \ 15 | --n_cls 100 \ 16 | --task $task \ 17 | --input_dim 768 \ 18 | --balance_data $balance \ 19 | --wandb 20 | -------------------------------------------------------------------------------- /runs/core/run_ft_1hot.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | train_path=$1 6 | debias_p=$2 7 | rebias=$3 8 | out_dir=$4 9 | 10 | python amnesic_probing/debiased_finetuning/rebiased_finetuning_lm.py \ 11 | --train_path $train_path \ 12 | --debias $debias_p \ 13 | --rebias $rebias \ 14 | --n_epochs 20 \ 15 | --out_dir $out_dir \ 16 | --device cuda:0 \ 17 | --wandb 18 | -------------------------------------------------------------------------------- /runs/evaluate/run_eval.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | vecs=$1 6 | labels=$2 7 | text=$3 8 | deprobe_dir=$4 9 | task_type=$5 10 | device=$6 11 | 12 | python amnesic_probing/tasks/lm.py \ 13 | --vecs $vecs \ 14 | --labels $labels \ 15 | --text $text \ 16 | --task $task_type \ 17 | --deprobe_dir $deprobe_dir \ 18 | --display_examples 100 \ 19 | --device $device \ 20 
| --wandb 21 | -------------------------------------------------------------------------------- /runs/evaluate/run_eval_per_dim.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | vecs=$1 6 | labels=$2 7 | text=$3 8 | deprobe_dir=$4 9 | task_type=$5 10 | device=$6 11 | 12 | python amnesic_probing/tasks/lm_per_dim.py \ 13 | --vecs $vecs \ 14 | --labels $labels \ 15 | --text $text \ 16 | --task $task_type \ 17 | --deprobe_dir $deprobe_dir \ 18 | --display_examples 100 \ 19 | --device $device \ 20 | --wandb 21 | -------------------------------------------------------------------------------- /runs/encode/run_encode.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | split=$1 6 | input_file=$2 7 | output_dir=$3 8 | task_format=$4 9 | encode_format=$5 10 | device=$6 11 | 12 | 13 | python amnesic_probing/encoders/encode.py \ 14 | --input_file $input_file \ 15 | --output_dir $output_dir/$split/ \ 16 | --encoder bert-base-uncased \ 17 | --format $task_format \ 18 | --encode_format $encode_format \ 19 | --all_layers \ 20 | --device $device 21 | -------------------------------------------------------------------------------- /runs/encode/reduce_tag_labels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAGS=( VERB NOUN ADP DET NUM . PRT CONJ ADV PRON ADJ X ) 4 | TAGS_NAME=( verb noun adp det num punct prt conj adv pron adj other ) 5 | 6 | for encode_format in normal masked 7 | do 8 | for ((i=0;i<${#TAGS[@]};++i)) 9 | do 10 | for split in train dev 11 | do 12 | python amnesic_probing/tasks/reduce_labels.py \ 13 | --label data/ud_output_$encode_format/$split/tag.pickle \ 14 | --keep_label "${TAGS[i]}" \ 15 | --out_labels data/ud_output_$encode_format/$split/tag_"${TAGS_NAME[i]}".pickle 16 | done 17 | done 18 | done -------------------------------------------------------------------------------- /runs/encode/reduce_ner_labels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TAGS=( ORG CARDINAL GPE DATE PERSON ) 4 | TAGS_NAME=( ORG CARDINAL GPE DATE PERSON ) 5 | 6 | for encode_format in normal masked 7 | do 8 | for ((i=0;i<${#TAGS[@]};++i)) 9 | do 10 | for split in train dev 11 | do 12 | echo "data/ontonotes_output_$encode_format/$split/ner_"${TAGS_NAME[i]}"" 13 | python amnesic_probing/tasks/reduce_labels.py \ 14 | --label data/ontonotes_output_$encode_format/$split/ner.pickle \ 15 | --keep_label "${TAGS[i]}" \ 16 | --out_labels data/ontonotes_output_$encode_format/$split/ner_"${TAGS_NAME[i]}".pickle 17 | done 18 | done 19 | done 20 | -------------------------------------------------------------------------------- /runs/encode/run_layer_encode.sh: -------------------------------------------------------------------------------- 1 | AMNESIC_PATH="PATH-TO-AMNESIC-PROBING" 2 | cd $AMNESIC_PATH 3 | export PYTHONPATH=$AMNESIC_PATH 4 | 5 | input_file=$1 6 | projections_dir=$2 7 | output_dir=$3 8 | task_format=$4 9 | encode_format=$5 10 | control=$6 11 | device=$7 12 | 13 | 14 | python amnesic_probing/encoders/encode_with_forward_pass/encode.py \ 15 | --input_file $input_file \ 16 | --projections_dir $projections_dir \ 17 | --output_dir $output_dir/ \ 18 | --encoder bert-base-uncased \ 19 | --format $task_format \ 
20 | --encode_format $encode_format \ 21 | --control $control \ 22 | --device $device \ 23 | --num_layers 12 24 | -------------------------------------------------------------------------------- /runs/encode/reduce_dep_labels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #TAGS=( VERB NOUN ADP DET NUM . PRT CONJ ADV PRON ADJ X ) 4 | #TAGS_NAME=( verb noun adp det num punct prt conj adv pron adj other ) 5 | TAGS_NAME=(adpmod det compmod num adpobj p poss adp amod nsubj dep dobj cc conj advmod ROOT ccomp aux xcomp neg) 6 | TAGS=(adpmod det compmod num adpobj p poss adp amod nsubj dep dobj cc conj advmod ROOT ccomp aux xcomp neg) 7 | 8 | for encode_format in normal masked 9 | do 10 | for ((i=0;i<${#TAGS[@]};++i)) 11 | do 12 | for split in train dev 13 | do 14 | python amnesic_probing/tasks/reduce_labels.py \ 15 | --label data/ud_output_$encode_format/$split/dep.pickle \ 16 | --keep_label "${TAGS[i]}" \ 17 | --out_labels data/ud_output_$encode_format/$split/dep_"${TAGS_NAME[i]}".pickle 18 | done 19 | done 20 | done 21 | -------------------------------------------------------------------------------- /runs/clear_set_ts.py: -------------------------------------------------------------------------------- 1 | import spur 2 | from tqdm import tqdm 3 | 4 | 5 | env = { 6 | 'USE_SIMPLE_THREADED_LEVEL3': '1', 7 | 'OMP_NUM_THREADS': '1', 8 | } 9 | ts = 'PATH-TO-TASK-SPOOLER/ts' 10 | 11 | # ┌──────────────────────┐ 12 | # │ connect to all nodes │ 13 | # └──────────────────────┘ 14 | nodes = [ 15 | 'nlp01', 16 | #'nlp02', 17 | 'nlp03', 18 | 'nlp04', 19 | 'nlp05', 20 | 'nlp06', 21 | 'nlp07', 22 | 'nlp08', 23 | 'nlp09', 24 | 'nlp10', 25 | 'nlp11', 26 | 'nlp12', 27 | 'nlp13', 28 | 'nlp14', 29 | 'nlp15', 30 | ] 31 | 32 | # assumes automatic connection w/o password 33 | connections = [spur.SshShell(hostname=node, username="USERNAME") for node in nodes] 34 | 35 | dargs = {} 36 | 37 | for connection in tqdm(connections): 38 | 39 | connection.run(f"{ts} -C".split(), update_env=env) 40 | connection.run(f"{ts} -S 4".split(), update_env=env) 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yanai Elazar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/reduce_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | remove_property.py [--labels=LABELS] [--out_labels=OUT_LABELS] [--keep_label=KEEP_LABEL] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --labels=LABELS labels file. using the train path (and automatically also using the dev, 8 | by replacing train by dev) 9 | --out_labels=OUT_LABELS output file for the new labels 10 | --keep_label=KEEP_LABEL name of the label to keep 11 | 12 | """ 13 | 14 | import pickle 15 | 16 | from docopt import docopt 17 | from tqdm import tqdm 18 | 19 | 20 | def read_data(in_f): 21 | with open(in_f, 'rb') as f: 22 | labels = pickle.load(f) 23 | return labels 24 | 25 | 26 | def convert_labels(in_f, label2keep): 27 | labels = read_data(in_f) 28 | 29 | keep_labels = label2keep.split(',') 30 | 31 | new_labels = [] 32 | for sen_labels in tqdm(labels): 33 | sen_new_labels = [] 34 | for l in sen_labels: 35 | if l in keep_labels: 36 | sen_new_labels.append(l) 37 | else: 38 | sen_new_labels.append('other') 39 | new_labels.append(sen_new_labels) 40 | return new_labels 41 | 42 | 43 | def to_file(labels, out_f): 44 | with open(out_f, 'wb') as f: 45 | pickle.dump(labels, f) 46 | 47 | 48 | if __name__ == '__main__': 49 | arguments = docopt(__doc__) 50 | 51 | labels_file = arguments['--labels'] 52 | 53 | new_labels = convert_labels(labels_file, arguments['--keep_label']) 54 | 55 | to_file(new_labels, arguments['--out_labels']) 56 | -------------------------------------------------------------------------------- /runs/core/run_ft_1hot_selectivity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_ft_1hot.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | 23 | # ┌──────────────────────┐ 24 | # │ generate experiments │ 25 | # └──────────────────────┘ 26 | 27 | runs_dic = { 28 | 'ud': { 29 | 'train_dir': 'data/ud_output_{}/train/', 30 | 'labels': ['dep', 'tag'] 31 | }, 32 | } 33 | 34 | if __name__ == '__main__': 35 | arguments = docopt(__doc__) 36 | 37 | if arguments['--dry_run']: 38 | dry_run = True 39 | else: 40 | dry_run = False 41 | 42 | cartesian_product = [] 43 | for data_type, vals in runs_dic.items(): 44 | for label in vals['labels']: 45 | for masking in ['masked']: 46 | for iter in range(20): 47 | train_dir = vals['train_dir'].format(masking) 48 | cartesian_product.append([train_dir, 49 | f'models/lm/{label}/{masking}/layer:last/P_{iter}.npy', 50 | f'{label}', 51 | f'models/lm/{label}/{masking}/layer:last/', 52 | ]) 53 | 54 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/core/run_ft_1hot.sh', 55 | on_gpu=True, dry_run=dry_run) 56 | -------------------------------------------------------------------------------- /runs/ts_run.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Based on https://gist.github.com/felixkreuk/8d70c8c1507fcaac6197d84a8a787fa0 4 | """ 5 | 6 | import spur 7 | 8 | 9 | env = { 10 | 'USE_SIMPLE_THREADED_LEVEL3': '1', 11 | 'OMP_NUM_THREADS': 
'1', 12 | } 13 | ts = 'PATH-TO-TASK-SPOOLER/ts' 14 | 15 | 16 | def parallelize(nodes_list, all_runs_args, run_script, on_gpu=False, dry_run=False): 17 | """ 18 | Running on a list of given servers, a bunch of experiments. 19 | Assumes that can connect automatically to the servers 20 | :param nodes_list: 21 | :param all_runs_args: 22 | :param run_script: 23 | :param on_gpu: 24 | :param dry_run: allows to simply print the intended experiments, and not actually run them 25 | :return: 26 | """ 27 | # assumes automatic connection w/o password 28 | connections = [spur.SshShell(hostname=node, username="USERNAME") for node in nodes_list] 29 | 30 | # ┌──────────────┐ 31 | # │ execute tasks│ 32 | # └──────────────┘ 33 | 34 | for sub_exp_idx, combination in enumerate(all_runs_args): 35 | args_str = f"{ts} sh {run_script}" 36 | 37 | for item in combination: 38 | args_str += f" {item}" 39 | 40 | if on_gpu: 41 | gpu_id = sub_exp_idx % 4 42 | args_str += f" cuda:0" 43 | 44 | node_id = sub_exp_idx // 4 % len(nodes_list) 45 | env['CUDA_VISIBLE_DEVICES'] = f"{gpu_id}" 46 | print(args_str.split(" "), node_id, gpu_id) 47 | else: 48 | node_id = sub_exp_idx % len(nodes_list) 49 | print(args_str.split(" "), node_id) 50 | 51 | if not dry_run: 52 | connections[node_id].run(args_str.split(" "), update_env=env) 53 | 54 | print(f"==> running {len(all_runs_args)} experiments") 55 | -------------------------------------------------------------------------------- /amnesic_probing/debiased_finetuning/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from amnesic_probing.debias.classifier import PytorchClassifier 4 | import pickle 5 | 6 | 7 | def define_network(W: np.ndarray, b: np.ndarray, projection_mat: np.ndarray = None, device: str = 'cpu'): 8 | embedding_net = torch.nn.Linear(in_features=W.shape[1], out_features=W.shape[0]) 9 | embedding_net.weight.data = torch.tensor(W) 10 | embedding_net.bias.data = torch.tensor(b) 11 | 12 | if projection_mat is not None: 13 | projection_net = torch.nn.Linear(in_features=projection_mat.shape[1], 14 | out_features=projection_mat.shape[0], 15 | bias=False) 16 | projection_net.weight.data = torch.tensor(projection_mat, dtype=torch.float) 17 | for p in projection_net.parameters(): 18 | p.requires_grad = False 19 | word_prediction_net = torch.nn.Sequential(projection_net, embedding_net) 20 | 21 | else: 22 | word_prediction_net = torch.nn.Sequential(embedding_net) 23 | 24 | net = PytorchClassifier(word_prediction_net, device=device) 25 | return net 26 | 27 | 28 | def load_data(path): 29 | vecs = np.load(f"{path}/last_vec.npy", allow_pickle=True) 30 | vecs = np.array([x[1:-1] for x in vecs]) 31 | 32 | with open(f"{path}/tokens.pickle", 'rb') as f: 33 | labels = pickle.load(f) 34 | 35 | return vecs, labels 36 | 37 | 38 | def load_labels(labels_file): 39 | with open(labels_file, 'rb') as f: 40 | rebias_labels = pickle.load(f) 41 | 42 | return rebias_labels 43 | 44 | 45 | def flatten_list(input_list): 46 | return [x for x_list in input_list for x in x_list] 47 | 48 | 49 | def flatten_label_list(input_list, labels_list): 50 | flat_list = flatten_list(input_list) 51 | return np.array([labels_list.index(y) for y in flat_list]).flatten() 52 | 53 | 54 | def flatten_tokens(all_vectors, all_labels, lm_tokenizer): 55 | x = np.array(flatten_list(all_vectors)) 56 | y = np.array( 57 | [label for sentence_y in all_labels for label in 58 | lm_tokenizer.convert_tokens_to_ids(sentence_y)]).flatten() 59 | return x, y 60 | 
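To make the helpers above concrete: `define_network` wraps the LM's output word embeddings `W` and bias `b` in a linear head, optionally preceded by a frozen linear layer holding the amnesic projection, so the resulting classifier scores vocabulary items from the (projected) hidden vector as `W(Px) + b`. A minimal, shape-only sketch (the arrays are placeholders; in the real pipeline `W`/`b` come from `get_lm_vals` and `P` from the amnesic-probing step, and the repo root is assumed to be on `PYTHONPATH` as the run scripts export it):

```python
# Shape-only sketch of building the (optionally debiased) LM head; all arrays are placeholders.
import numpy as np
from amnesic_probing.debiased_finetuning.utils import define_network

vocab_size, hidden_dim = 30522, 768  # bert-base-uncased vocabulary and hidden size
W = np.random.randn(vocab_size, hidden_dim).astype(np.float32)  # stand-in for the LM output embeddings
b = np.zeros(vocab_size, dtype=np.float32)                      # stand-in for the LM output bias
P = np.eye(hidden_dim, dtype=np.float32)                        # identity projection = no debiasing

net = define_network(W, b, projection_mat=P, device='cpu')      # scores tokens as W(Px) + b
```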
-------------------------------------------------------------------------------- /runs/encode/run_encode.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_encode.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | 15 | # ┌──────────────────────┐ 16 | # │ connect to all nodes │ 17 | # └──────────────────────┘ 18 | nodes = [ 19 | 'nlp01', 20 | 'nlp02', 21 | ] 22 | 23 | # ┌──────────────────────┐ 24 | # │ running combination │ 25 | # └──────────────────────┘ 26 | 27 | runs_dic = { 28 | 'ontonotes': { 29 | 'splits': ['train', 'dev', 'test'], 30 | 'input_file': 'data/ontonotes/{}', 31 | 'out_dir': 'data/ontonotes_output_{}/', 32 | 'task_format': 'ontonotes', 33 | 'encode_format': ['normal', 'masked'], 34 | }, 35 | 'ud': { 36 | 'splits': ['train', 'dev', 'test'], 37 | 'input_file': 'data/ud/en-universal-{}.conll', 38 | 'out_dir': 'data/ud_output_{}', 39 | 'task_format': 'conll', 40 | 'encode_format': ['normal', 'masked'], 41 | }, 42 | } 43 | 44 | if __name__ == '__main__': 45 | arguments = docopt(__doc__) 46 | 47 | if arguments['--dry_run']: 48 | dry_run = True 49 | else: 50 | dry_run = False 51 | 52 | cartesian_product = [] 53 | for data_type, vals in runs_dic.items(): 54 | task_format = vals['task_format'] 55 | for split in vals['splits']: 56 | input_file = vals['input_file'] 57 | input_file = input_file.format(split) 58 | 59 | for encoding_format in vals['encode_format']: 60 | out_dir = vals['out_dir'].format(encoding_format) 61 | 62 | cartesian_product.append([split, 63 | input_file, 64 | out_dir, 65 | task_format, 66 | encoding_format, 67 | ]) 68 | 69 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/encode/run_encode.sh', 70 | on_gpu=True, dry_run=dry_run) 71 | -------------------------------------------------------------------------------- /runs/evaluate/run_layer_wise_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_eval_per_dim.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ generate experiments │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'base_dir': 'data/ontonotes_output_projection_{}/{}/train/', 29 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end', 30 | 'ner_control', 'np_start_control', 'phrase_start_control', 'phrase_end_control'], 31 | 'task': 'task', 32 | }, 33 | 'ud': { 34 | 'base_dir': 'data/ud_output_projection_{}/{}/train/', 35 | 'labels': ['tag', 36 | 'tag_control'], 37 | 'task': 'task', 38 | }, 39 | } 40 | 41 | if __name__ == '__main__': 42 | arguments = docopt(__doc__) 43 | 44 | if arguments['--dry_run']: 45 | dry_run = True 46 | else: 47 | dry_run = False 48 | 49 | cartesian_product = [] 50 | for data_type, vals in runs_dic.items(): 51 | task_type = vals['task'] 52 | for masking in ['normal', 'masked']: 53 | for task in vals['labels']: 54 | 55 | # using the task labels in the regular case, or random labels 
from the same data 56 | # when the task is generated on the fly (e.g. word_len) 57 | data_label = task 58 | if task_type != 'task': 59 | data_label = 'np_start' 60 | 61 | base_dir = vals['base_dir'].format(masking, task) 62 | labels = f'{base_dir}/{data_label}.pickle' 63 | text = f'{base_dir}/tokens.pickle' 64 | cartesian_product.append([base_dir, labels, text, task_type]) 65 | 66 | parallelize(nodes, cartesian_product, 67 | 'amnesic_probing/runs/evaluate/run_layer_wise_lm.sh', 68 | on_gpu=True, dry_run=dry_run) 69 | -------------------------------------------------------------------------------- /runs/core/run_ft_reg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_ft_reg.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ generate experiments │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'train_dir': 'data/ontonotes_output_{}/train', 29 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'] 30 | }, 31 | 'ud': { 32 | 'train_dir': 'data/ud_output_{}/train', 33 | 'labels': ['dep', 'pos', 'tag', 'pos_next_word'] 34 | }, 35 | 'binary_tag': { 36 | 'train_dir': 'data/ud_output_{}/train', 37 | 'labels': ['tag_verb', 'tag_noun', 'tag_adp', 'tag_det', 'tag_num', 'tag_punct', 38 | 'tag_prt', 'tag_conj', 'tag_adv', 'tag_pron', 'tag_adj', 'tag_other'], 39 | }, 40 | 'binary_dep': { 41 | 'train_dir': 'data/ud_output_{}/train', 42 | 'labels': ['dep_adpmod', 'dep_det', 'dep_compmod', 'dep_num', 'dep_adpobj', 43 | 'dep_p', 'dep_poss', 'dep_adp', 'dep_amod', 'dep_nsubj', 44 | 'dep_dep', 'dep_dobj', 'dep_cc', 'dep_conj', 'dep_advmod', 45 | 'dep_ROOT', 'dep_ccomp', 'dep_aux', 'dep_xcomp', 'dep_neg'], 46 | }, 47 | } 48 | 49 | if __name__ == '__main__': 50 | arguments = docopt(__doc__) 51 | 52 | if arguments['--dry_run']: 53 | dry_run = True 54 | else: 55 | dry_run = False 56 | 57 | cartesian_product = [] 58 | for data_type, vals in runs_dic.items(): 59 | for label in vals['labels']: 60 | for masking in ['normal', 'masked']: 61 | train_dir = vals['train_dir'].format(masking) 62 | cartesian_product.append([train_dir, 63 | f'models/lm/{label}/{masking}/layer:last/', 64 | f'models/lm/{label}/{masking}/layer:last/P.npy']) 65 | 66 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/core/run_ft_reg.sh', 67 | on_gpu=True, dry_run=dry_run) 68 | -------------------------------------------------------------------------------- /runs/evaluate/run_specific_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_eval_per_dim.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | 15 | # ┌──────────────────────┐ 16 | # │ connect to all nodes │ 17 | # └──────────────────────┘ 18 | nodes = [ 19 | 'nlp01', 20 | 'nlp02', 21 | ] 22 | 23 | # ┌──────────────────────┐ 24 | # │ generate experiments │ 25 | # └──────────────────────┘ 26 | 27 | runs_dic = { 28 | 'ontonotes': { 29 | 
'base_dir': 'data/ontonotes_output_{}/train', 30 | 'vecs': 'last_vec.npy', 31 | 'text': 'tokens.pickle', 32 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'], 33 | }, 34 | 'ud': { 35 | 'base_dir': 'data/ud_output_{}/train', 36 | 'vecs': 'last_vec.npy', 37 | 'text': 'tokens.pickle', 38 | 'labels': ['dep', 'pos', 'tag', 'pos_next_word'], 39 | }, 40 | 'binary_tag': { 41 | 'base_dir': 'data/ud_output_{}/train', 42 | 'vecs': 'last_vec.npy', 43 | 'text': 'tokens.pickle', 44 | 'labels': ['tag_verb', 'tag_noun', 'tag_adp', 'tag_det', 'tag_num', 'tag_punct', 45 | 'tag_prt', 'tag_conj', 'tag_adv', 'tag_pron', 'tag_adj', 'tag_other'], 46 | }, 47 | } 48 | 49 | if __name__ == '__main__': 50 | arguments = docopt(__doc__) 51 | 52 | if arguments['--dry_run']: 53 | dry_run = True 54 | else: 55 | dry_run = False 56 | 57 | cartesian_product = [] 58 | for data_type, vals in runs_dic.items(): 59 | base_dir = vals['base_dir'] 60 | vecs = base_dir + '/' + vals['vecs'] 61 | text = base_dir + '/' + vals['text'] 62 | for masking in ['normal', 'masked']: 63 | base_dir = vals['base_dir'].format(masking) 64 | vecs = base_dir + '/' + vals['vecs'] 65 | text = base_dir + '/' + vals['text'] 66 | for label in vals['labels']: 67 | data_label = label 68 | cartesian_product.append([vecs, f'{base_dir}/{data_label}.pickle', text, 69 | f'models/lm/{label}/{masking}/layer:last/']) 70 | 71 | parallelize(nodes, cartesian_product, 72 | 'amnesic_probing/runs/evaluate/run_specific_eval.sh', 73 | on_gpu=True, dry_run=dry_run) 74 | -------------------------------------------------------------------------------- /runs/evaluate/run_layer_wise_deprobe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_eval_per_dim.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ generate experiments │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'base_dir': 'data/ontonotes_output_projection_{}/{}/train/', 29 | 'layers_dir': 'data/ontonotes_output_{}/train/', 30 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'], 31 | 'task_type': 'task' 32 | }, 33 | 'ud': { 34 | 'base_dir': 'data/ud_output_projection_{}/{}/train/', 35 | 'layers_dir': 'data/ud_output_{}/train/', 36 | 'labels': ['tag'], 37 | 'task_type': 'task' 38 | }, 39 | } 40 | 41 | if __name__ == '__main__': 42 | arguments = docopt(__doc__) 43 | 44 | if arguments['--dry_run']: 45 | dry_run = True 46 | else: 47 | dry_run = False 48 | 49 | cartesian_product = [] 50 | for data_type, vals in runs_dic.items(): 51 | for masking in ['normal', 'masked']: 52 | layer_dir = vals['layers_dir'].format(masking) 53 | for task in vals['labels']: 54 | base_dir = vals['base_dir'].format(masking, task) 55 | text_file = f'{base_dir}/tokens.pickle' 56 | task_type = vals['task_type'] 57 | 58 | # using the task labels in the regular case, or random labels from the same data 59 | # when the task is generated on the fly (e.g. 
word_len) 60 | data_label = task 61 | if task_type != 'task': 62 | data_label = 'np_start' 63 | cartesian_product.append([layer_dir, 64 | base_dir, 65 | f'{base_dir}/{data_label}.pickle', 66 | text_file, 67 | task_type]) 68 | 69 | parallelize(nodes, cartesian_product, 70 | 'amnesic_probing/runs/evaluate/run_layer_wise_deprobe.sh', 71 | on_gpu=False, dry_run=dry_run) 72 | -------------------------------------------------------------------------------- /runs/core/run_ft_1hot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_ft_1hot.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | 23 | # ┌──────────────────────┐ 24 | # │ generate experiments │ 25 | # └──────────────────────┘ 26 | 27 | runs_dic = { 28 | 'ontonotes': { 29 | 'train_dir': 'data/ontonotes_output_{}/train/', 30 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'] 31 | }, 32 | 'ud': { 33 | 'train_dir': 'data/ud_output_{}/train/', 34 | 'labels': ['dep', 'pos', 'tag', 'pos_next_word'] 35 | }, 36 | 'binary_tag': { 37 | 'train_dir': 'data/ud_output_{}/train', 38 | 'labels': ['tag_verb', 'tag_noun', 'tag_adp', 'tag_det', 'tag_num', 'tag_punct', 39 | 'tag_prt', 'tag_conj', 'tag_adv', 'tag_pron', 'tag_adj', 'tag_other'], 40 | }, 41 | 'binary_dep': { 42 | 'train_dir': 'data/ud_output_{}/train', 43 | 'labels': ['dep_adpmod', 'dep_det', 'dep_compmod', 'dep_num', 'dep_adpobj', 44 | 'dep_p', 'dep_poss', 'dep_adp', 'dep_amod', 'dep_nsubj', 45 | 'dep_dep', 'dep_dobj', 'dep_cc', 'dep_conj', 'dep_advmod', 46 | 'dep_ROOT', 'dep_ccomp', 'dep_aux', 'dep_xcomp', 'dep_neg'], 47 | }, 48 | } 49 | 50 | if __name__ == '__main__': 51 | arguments = docopt(__doc__) 52 | 53 | if arguments['--dry_run']: 54 | dry_run = True 55 | else: 56 | dry_run = False 57 | 58 | cartesian_product = [] 59 | for data_type, vals in runs_dic.items(): 60 | for label in vals['labels']: 61 | for masking in ['normal', 'masked']: 62 | train_dir = vals['train_dir'].format(masking) 63 | cartesian_product.append([train_dir, 64 | f'models/lm/{label}/{masking}/layer:last/P.npy', 65 | f'{label}', 66 | f'models/lm/{label}/{masking}/layer:last/', 67 | ]) 68 | 69 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/core/run_ft_1hot.sh', 70 | on_gpu=True, dry_run=dry_run) 71 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/control.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | encode.py [--input_sentences=INPUT_SENTENCES] [--input_labels=INPUT_LABELS] [--output_file=OUTPUT_FILE] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --input_sentences=INPUT_SENTENCES input file containing sentences. pickle file 8 | --input_labels=INPUT_LABELS input file containing labels. 
pickle file 9 | --output_file=OUTPUT_FILE output file where to write the output file 10 | 11 | """ 12 | 13 | import pickle 14 | 15 | import numpy as np 16 | from docopt import docopt 17 | 18 | np.random.seed(0) 19 | 20 | 21 | def get_num_labels(in_f): 22 | with open(in_f, 'rb') as f: 23 | labels = pickle.load(f) 24 | 25 | all_labels = [x for l in labels for x in l] 26 | return len(set(all_labels)) 27 | 28 | 29 | def assign_labels(in_f, n_labels): 30 | with open(in_f, 'rb') as f: 31 | lines = pickle.load(f) 32 | 33 | words = [x for l in lines for x in l] 34 | words = list(set(words)) 35 | np.random.shuffle(words) 36 | 37 | temp = dict(enumerate(words)) 38 | temp = {v: k for k, v in temp.items()} 39 | random_words_label_dic = {} 40 | for k, v in temp.items(): 41 | random_words_label_dic[k] = v % n_labels 42 | return random_words_label_dic 43 | 44 | 45 | def label_sentences(in_f, words_labels_dic, n_labels): 46 | with open(in_f, 'rb') as f: 47 | lines = pickle.load(f) 48 | 49 | labels = [] 50 | for line in lines: 51 | sentence_labels = [] 52 | for w in line: 53 | if w not in words_labels_dic: 54 | i = np.random.randint(n_labels) 55 | words_labels_dic[w] = i 56 | sentence_labels.append(words_labels_dic[w]) 57 | labels.append(sentence_labels) 58 | 59 | return labels, words_labels_dic 60 | 61 | 62 | if __name__ == '__main__': 63 | arguments = docopt(__doc__) 64 | 65 | n = get_num_labels(arguments['--input_labels']) 66 | words_labels = assign_labels(arguments['--input_sentences'], n) 67 | 68 | train_labels, words_labels = label_sentences(arguments['--input_sentences'], words_labels, n) 69 | dev_labels, words_labels = label_sentences(arguments['--input_sentences'].replace('train', 'dev'), words_labels, n) 70 | 71 | out_file = arguments['--output_file'] 72 | # os.makedirs(out_dir, exist_ok=True) 73 | with open(out_file, 'wb') as f: 74 | pickle.dump(train_labels, f) 75 | 76 | out_file_dev = out_file.replace('train', 'dev') 77 | # os.makedirs(out_file_dev, exist_ok=True) 78 | with open(out_file_dev, 'wb') as f: 79 | pickle.dump(dev_labels, f) 80 | 81 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/encode.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | encode.py [--input_file=INPUT_FILE] [--output_dir=OUTPUT_DIR] [--encoder=ENCODER] 4 | [--format=FORMAT] [--encode_format=ENCODE_FORMAT] [--device=DEVICE] [--all_layers] 5 | 6 | Options: 7 | -h --help show this help message and exit 8 | --input_file=INPUT_FILE input file. conll format 9 | --output_dir=OUTPUT_DIR output directory where to write the output files 10 | --encoder=ENCODER encoder. types: bert-base-uncased, qa, ... 11 | --format=FORMAT data format: conll, ontonotes, semtagging, fce 12 | --encode_format=ENCODE_FORMAT encoding: normal, masked 13 | --device=DEVICE cpu, cuda:0, cuda:1, ... 
(default: cpu) 14 | --all_layers encode all layers 15 | """ 16 | 17 | from docopt import docopt 18 | 19 | from amnesic_probing.encoders import get_pretrained_models, encode_text, to_file, read_conll_format, read_onto_notes_format, \ 20 | read_sem_tagging_format, read_coarse_sem_tagging_format, read_fce_format, read_coord_format 21 | 22 | if __name__ == '__main__': 23 | arguments = docopt(__doc__) 24 | only_last_layer = not arguments["--all_layers"] 25 | print("only last layer:", only_last_layer) 26 | 27 | encoder, tokenizer = get_pretrained_models(arguments['--encoder']) 28 | encoder = encoder.to(arguments['--device']) 29 | 30 | data_format = arguments['--format'] 31 | if data_format == 'conll': 32 | data = read_conll_format(arguments['--input_file']) 33 | elif data_format == 'ontonotes': 34 | data = read_onto_notes_format(arguments['--input_file']) 35 | elif data_format == 'semtagging': 36 | data = read_sem_tagging_format(arguments['--input_file']) 37 | elif data_format == 'coarse_semtagging': 38 | data = read_coarse_sem_tagging_format(arguments['--input_file']) 39 | elif data_format == 'fce': 40 | data = read_fce_format(arguments['--input_file']) 41 | elif data_format == 'coord': 42 | data = read_coord_format(arguments['--input_file']) 43 | else: 44 | raise Exception('Unsupported file format exception') 45 | 46 | if arguments['--encode_format'] == 'normal': 47 | final_data = encode_text(data, encoder, tokenizer, masked=False, only_last_layer=only_last_layer) 48 | elif arguments['--encode_format'] == 'masked': 49 | final_data = encode_text(data, encoder, tokenizer, masked=True, only_last_layer=only_last_layer) 50 | else: 51 | raise Exception('Unsupported encoding type') 52 | 53 | to_file(final_data, arguments['--output_dir'], only_last_layer=only_last_layer) 54 | -------------------------------------------------------------------------------- /runs/evaluate/run_eval_per_dim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_eval_per_dim.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ generate experiments │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'base_dir': 'data/ontonotes_output_{}/train', 29 | 'vecs': 'last_vec.npy', 30 | 'text': 'tokens.pickle', 31 | 'labels': ['ner', 'phrase_start', 'phrase_end'], 32 | 'task_type': 'task' 33 | }, 34 | 'ud': { 35 | 'base_dir': 'data/ud_output_{}/train', 36 | 'vecs': 'last_vec.npy', 37 | 'text': 'tokens.pickle', 38 | 'labels': ['dep', 'pos', 'tag'], 39 | 'task_type': 'task' 40 | }, 41 | 'ontonotes_control': { 42 | 'base_dir': 'data/ontonotes_output_{}/train', 43 | 'vecs': 'last_vec.npy', 44 | 'text': 'tokens.pickle', 45 | 'labels': ['ner_control', 'phrase_start_control', 'phrase_end_control'], 46 | 'task_type': 'task' 47 | }, 48 | 'ud_control': { 49 | 'base_dir': 'data/ud_output_{}/train', 50 | 'vecs': 'last_vec.npy', 51 | 'text': 'tokens.pickle', 52 | 'labels': ['dep_control', 'pos_control', 'tag_control'], 53 | 'task_type': 'task' 54 | }, 55 | } 56 | 57 | if __name__ == '__main__': 58 | arguments = docopt(__doc__) 59 | 60 | if arguments['--dry_run']: 61 | dry_run = 
True 62 | else: 63 | dry_run = False 64 | 65 | cartesian_product = [] 66 | for data_type, vals in runs_dic.items(): 67 | base_dir = vals['base_dir'] 68 | vecs = base_dir + '/' + vals['vecs'] 69 | text = base_dir + '/' + vals['text'] 70 | task_type = vals['task_type'] 71 | for masking in ['normal', 'masked']: 72 | base_dir = vals['base_dir'].format(masking) 73 | vecs = base_dir + '/' + vals['vecs'] 74 | text = base_dir + '/' + vals['text'] 75 | task_type = vals['task_type'] 76 | for label in vals['labels']: 77 | data_label = label 78 | if task_type != 'task': 79 | data_label = 'np_start' 80 | cartesian_product.append([vecs, f'{base_dir}/{data_label}.pickle', text, 81 | f'models/lm/{label}/{masking}/layer:last/', 82 | task_type]) 83 | 84 | parallelize(nodes, cartesian_product, 85 | 'amnesic_probing/runs/evaluate/run_eval_per_dim.sh', 86 | on_gpu=True, dry_run=dry_run) 87 | -------------------------------------------------------------------------------- /runs/encode/run_layer_encode.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_layer_encode.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ running combination │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'splits': ['train', 'dev', 'test'], 29 | 'input_file': 'data/ontonotes/{}', 30 | 'out_dir': 'data/ontonotes_output_projection_{}/{}/{}/', 31 | 'task_format': 'ontonotes', 32 | 'encode_format': ['normal', 'masked'], 33 | 'tasks': ['ner', 'np_start', 'phrase_start', 'phrase_end', 'word_len', 'vowel'], 34 | }, 35 | 'ud': { 36 | 'splits': ['train', 'dev', 'test'], 37 | 'input_file': 'data/ud/en-universal-{}.conll', 38 | 'out_dir': 'data/ud_output_projection_{}/{}/{}/', 39 | 'task_format': 'conll', 40 | 'encode_format': ['normal', 'masked'], 41 | 'tasks': ['tag'] 42 | }, 43 | 'regression': { 44 | 'splits': ['dev', 'test', 'train'], 45 | 'input_file': 'data/ontonotes/{}', 46 | 'out_dir': 'data/ontonotes_output_projection_{}/{}/{}/', 47 | 'task_format': 'ontonotes', 48 | 'encode_format': ['normal', 'masked'], 49 | 'tasks': ['word_ind'], 50 | }, 51 | } 52 | 53 | if __name__ == '__main__': 54 | arguments = docopt(__doc__) 55 | 56 | if arguments['--dry_run']: 57 | dry_run = True 58 | else: 59 | dry_run = False 60 | 61 | cartesian_product = [] 62 | for data_type, vals in runs_dic.items(): 63 | task_format = vals['task_format'] 64 | for split in vals['splits']: 65 | input_file = vals['input_file'] 66 | input_file = input_file.format(split) 67 | for encoding_format in vals['encode_format']: 68 | for task in vals['tasks']: 69 | for control in ['false', 'true']: 70 | if control == 'true': 71 | out_dir = vals['out_dir'].format(encoding_format, task + '_control', split) 72 | else: 73 | out_dir = vals['out_dir'].format(encoding_format, task, split) 74 | 75 | cartesian_product.append([input_file, 76 | f'models/lm/{task}/{encoding_format}/', 77 | out_dir, 78 | task_format, 79 | encoding_format, 80 | control 81 | ]) 82 | 83 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/encode/run_layer_encode.sh', 84 | on_gpu=True, dry_run=dry_run) 85 | 
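For a single combination, the seven positional arguments of `runs/encode/run_layer_encode.sh` are `input_file`, `projections_dir`, `output_dir`, `task_format`, `encode_format`, `control` and `device`. A manual call mirroring one of the UD `tag` combinations generated by the runner above might look like this (the paths are illustrative placeholders):

```sh
# Hypothetical single run of the layer-wise re-encoding for the UD "tag" property, masked setting.
sh runs/encode/run_layer_encode.sh \
    data/ud/en-universal-train.conll \
    models/lm/tag/masked/ \
    data/ud_output_projection_masked/tag/train/ \
    conll \
    masked \
    false \
    cuda:0
```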
-------------------------------------------------------------------------------- /amnesic_probing/debiased_finetuning/debiased_finetuning_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | debiased_finetuning_lm.py [--train_path=TRAIN_PATH] [--encoder=ENCODER] [--input_dim=INPUT_DIM] [--out_dir=OUT_DIR] 4 | [--n_epochs=N_EPOCHS] [--debias=DEBIAS] [--device=DEVICE] [--wandb] 5 | 6 | Options: 7 | -h --help show this help message and exit 8 | --train_path=TRAIN_PATH input directory 9 | --encoder=ENCODER encoder. types: bert-base-uncased, qa, ... [default: bert-base-uncased] 10 | --input_dim=INPUT_DIM input dimension [default: 768] 11 | --out_dir=OUT_DIR logs and outputs directory 12 | --n_epochs=N_EPOCHS number of epochs to run the classifier [default: 20] 13 | --debias=DEBIAS the debias projection matrix to use, or none for no debiasing [default: none] 14 | --device=DEVICE cpu, cuda:0, cuda:1, ... [default: cpu] 15 | --wandb log using wandb 16 | 17 | """ 18 | 19 | import numpy as np 20 | import wandb 21 | from docopt import docopt 22 | 23 | from amnesic_probing.debiased_finetuning.utils import define_network, load_data, flatten_tokens 24 | from amnesic_probing.tasks.utils import get_lm_vals 25 | 26 | 27 | def log_wandb(arguments): 28 | out_dir = arguments['--out_dir'] 29 | if out_dir[-1] == '/': 30 | out_dir = out_dir[:-1] 31 | task_type_full = out_dir.split('models/lm/')[1] 32 | task_type = task_type_full.split('/')[0] 33 | masking = task_type_full.split('/')[1] 34 | dataset_full = arguments['--train_path'].split('data/')[1].split('/')[0] 35 | dataset = dataset_full.split('_output', 1)[0] 36 | debias = arguments['--debias'] 37 | if debias == 'none': 38 | task_type = task_type + '_baseline' 39 | debias = task_type 40 | 41 | config = dict( 42 | property=task_type, 43 | encoder='bert-base-uncased', 44 | dataset=dataset, 45 | masking=masking, 46 | debias_property=debias 47 | ) 48 | 49 | wandb.init( 50 | name=task_type + '_ft_reg', 51 | project="amnesic_probing", 52 | tags=["lm", "emb_ft", task_type], 53 | config=config, 54 | ) 55 | 56 | 57 | if __name__ == '__main__': 58 | arguments = docopt(__doc__) 59 | 60 | use_wandb = arguments['--wandb'] 61 | if use_wandb: 62 | log_wandb(arguments) 63 | 64 | _, tokenizer, word_embeddings, bias = get_lm_vals(arguments['--encoder']) 65 | 66 | train_dir = arguments['--train_path'] 67 | dev_dir = train_dir.replace('train', 'dev') 68 | out_dir = arguments['--out_dir'] 69 | input_dim = int(arguments['--input_dim']) 70 | 71 | if arguments['--debias'] == 'none': 72 | debias = np.eye(input_dim) 73 | else: 74 | debias = np.load(arguments['--debias']) 75 | 76 | train_vecs, train_labels = load_data(train_dir) 77 | dev_vecs, dev_labels = load_data(dev_dir) 78 | 79 | x_train, y_train = flatten_tokens(train_vecs, train_labels, tokenizer) 80 | assert len(x_train) == len(y_train), f"{len(x_train)}, {len(y_train)}" 81 | 82 | x_test, y_test = flatten_tokens(dev_vecs, dev_labels, tokenizer) 83 | assert len(x_test) == len(y_test), f"{len(x_test)}, {len(y_test)}" 84 | 85 | net = define_network(word_embeddings, bias, debias, arguments['--device']) 86 | 87 | if arguments['--debias'] == 'none': 88 | print("Debiasing option is turned off.") 89 | dev_acc = net.eval(x_test, y_test) 90 | print("Dev accuracy without fine-tuning is: ", dev_acc) 91 | 92 | net.train(x_train, y_train, x_test, y_test, 93 | epochs=int(arguments['--n_epochs']), 94 | save_path=f"{out_dir}/debias_ft.pt", 95 | use_wandb=use_wandb) 96 | 
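Before passing a matrix to `--debias`, it can be worth sanity-checking the saved file. The snippet below assumes `P.npy` holds the 768x768 amnesic (INLP) projection produced earlier in the pipeline, so the idempotence and rank checks are expectations under that assumption rather than guarantees; the path is a placeholder.

```python
# Hypothetical sanity check of a saved amnesic projection (path and expectations are assumptions).
import numpy as np

P = np.load("models/lm/tag/masked/layer:last/P.npy")
print(P.shape)                           # expected: (768, 768), matching --input_dim
print(np.allclose(P @ P, P, atol=1e-4))  # a projection matrix should be (numerically) idempotent
print(768 - np.linalg.matrix_rank(P))    # rough number of directions removed by the amnesic operation
```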
-------------------------------------------------------------------------------- /runs/evaluate/run_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_eval.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | 23 | # ┌──────────────────────┐ 24 | # │ generate experiments │ 25 | # └──────────────────────┘ 26 | 27 | runs_dic = { 28 | 'ontonotes': { 29 | 'base_dir': 'data/ontonotes_output_{}/train', 30 | 'vecs': 'last_vec.npy', 31 | 'text': 'tokens.pickle', 32 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'], 33 | 'task_type': 'task' 34 | }, 35 | 'ud': { 36 | 'base_dir': 'data/ud_output_{}/train', 37 | 'vecs': 'last_vec.npy', 38 | 'text': 'tokens.pickle', 39 | 'labels': ['dep', 'pos', 'tag', 'pos_next_word'], 40 | 'task_type': 'task' 41 | }, 42 | 'binary_tag': { 43 | 'base_dir': 'data/ud_output_{}/train', 44 | 'vecs': 'last_vec.npy', 45 | 'text': 'tokens.pickle', 46 | 'labels': ['tag_verb', 'tag_noun', 'tag_adp', 'tag_det', 'tag_num', 'tag_punct', 47 | 'tag_prt', 'tag_conj', 'tag_adv', 'tag_pron', 'tag_adj', 'tag_other'], 48 | 'task_type': 'task' 49 | }, 50 | 'binary_dep': { 51 | 'base_dir': 'data/ud_output_{}/train', 52 | 'vecs': 'last_vec.npy', 53 | 'text': 'tokens.pickle', 54 | 'labels': ['dep_adpmod', 'dep_det', 'dep_compmod', 'dep_num', 'dep_adpobj', 55 | 'dep_p', 'dep_poss', 'dep_adp', 'dep_amod', 'dep_nsubj', 56 | 'dep_dep', 'dep_dobj', 'dep_cc', 'dep_conj', 'dep_advmod', 57 | 'dep_ROOT', 'dep_ccomp', 'dep_aux', 'dep_xcomp', 'dep_neg'], 58 | 'task_type': 'task' 59 | }, 60 | 'binary_ner': { 61 | 'base_dir': 'data/ontonotes_output_{}/train', 62 | 'vecs': 'last_vec.npy', 63 | 'text': 'tokens.pickle', 64 | 'labels': ['ner_*', 'ner_ORG', 'ner_CARDINAL', 'ner_GPE', 'ner_DATE', 'ner_PERSON'], 65 | 'task_type': 'task' 66 | }, 67 | 'ontonotes_control': { 68 | 'base_dir': 'data/ontonotes_output_{}/train', 69 | 'vecs': 'last_vec.npy', 70 | 'text': 'tokens.pickle', 71 | 'labels': ['ner_control', 'phrase_start_control', 'phrase_end_control'], 72 | 'task_type': 'task' 73 | }, 74 | 'ud_control': { 75 | 'base_dir': 'data/ud_output_{}/train', 76 | 'vecs': 'last_vec.npy', 77 | 'text': 'tokens.pickle', 78 | 'labels': ['dep_control', 'pos_control', 'tag_control'], 79 | 'task_type': 'task' 80 | }, 81 | } 82 | 83 | if __name__ == '__main__': 84 | arguments = docopt(__doc__) 85 | 86 | if arguments['--dry_run']: 87 | dry_run = True 88 | else: 89 | dry_run = False 90 | 91 | cartesian_product = [] 92 | for data_type, vals in runs_dic.items(): 93 | base_dir = vals['base_dir'] 94 | vecs = base_dir + '/' + vals['vecs'] 95 | text = base_dir + '/' + vals['text'] 96 | task_type = vals['task_type'] 97 | for masking in ['normal', 'masked']: 98 | base_dir = vals['base_dir'].format(masking) 99 | vecs = base_dir + '/' + vals['vecs'] 100 | text = base_dir + '/' + vals['text'] 101 | task_type = vals['task_type'] 102 | for label in vals['labels']: 103 | data_label = label 104 | if task_type != 'task': 105 | data_label = 'np_start' 106 | cartesian_product.append([vecs, f'{base_dir}/{data_label}.pickle', text, 107 | f'models/lm/{label}/{masking}/layer:last/', 108 | task_type]) 109 | 110 | parallelize(nodes, 
cartesian_product, 'amnesic_probing/runs/evaluate/run_eval.sh', 111 | on_gpu=True, dry_run=dry_run) 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amnesic Probing 2 | This repository contains the codebase for the paper: 3 | 4 | "Amnesic Probing: Behavioral Explanation with Amnesic Counterfactuals" 5 | 6 | _Note that a previous version of this paper that appeared on arXiv in 2020 was named: "**When Bert Forgets How To POS: Amnesic Probing of Linguistic Properties and MLM Predictions**", which we changed to the current title to better reflect our contributions._ 7 | 8 | ## General Notes and Considerations 9 | This work contains many moving parts, which are built one on top of the other, so the code 10 | also contains many different parts. 11 | To save run-time, we cache as much as we can in local files. 12 | Furthermore, we use queue-based software 13 | ([task spooler](https://vicerveza.homeunix.net/~viric/soft/ts/)) 14 | to parallelize the (many) experiments. 15 | Note that only the runner scripts inside the `runs` directory require it; 16 | otherwise one can simply launch individual runs without this software. 17 | 18 | The running scripts below are listed in the logical order in which the experiments should be run. 19 | If you care about a specific run, please follow the script to the relevant file. 20 | 21 | For any question or query regarding the code or the paper, please reach out at `yanaiela@gmail.com` 22 | 23 | 24 | 25 | ## Prerequisites 26 | We use Python 3.7 and Linux machines for all our experiments. 27 | 28 | Create a virtual environment: 29 | ```sh 30 | conda create -n amnesic_probing python=3.7 anaconda 31 | ``` 32 | 33 | ## Walk-through Experiments 34 | 35 | ### Encode Datasets 36 | This step encodes the texts into vectors with BERT. 37 | It also saves the tokenized words and labels for the relevant tasks. 38 | 39 | `python runs/encode/run_encode.py` 40 | 41 | 42 | ### Running the _Amnesic Probing_ 43 | This process runs the amnesic operation only, i.e. it runs the INLP process on the 44 | relevant data and saves all the projection matrices in a folder. 45 | 46 | ```python runs/core/run_deprobe.py``` 47 | 48 | ### Evaluate 49 | The following script runs the basic evaluation, where we compute the LM performance and DKL 50 | for the "best" amnesic projection. 51 | ```python runs/evaluate/run_eval.py``` 52 | 53 | To compute the LM scores for all of the projections (Figure 2 in the paper), run: 54 | ```python runs/evaluate/run_eval_per_dim.py``` 55 | 56 | Then, to run the fine-grained evaluation (the performance per label), run: 57 | ```python runs/evaluate/run_specific_eval.py``` 58 | 59 | ### Per-Layer Runs 60 | Finally, running the final part of the paper (Section 7) involves multiple steps. 61 | Note that this part takes the encoding from some layer i, applies the (pre-computed in 62 | a former step) projection matrix (that performs the amnesic operation) on that layer, and then 63 | re-runs the encoding from this step forwards. 64 | 65 | This is a rather long process (the training encoding can take around 8-10 hours on a GPU), and 66 | very heavy in disk usage. We save everything to disk in order to make the later steps faster.
67 | Each encoded vector file (on the train) is about 4G, therefore the encoding of an entire dataset 68 | with all layer-to-layer encoding is about 400G 69 | 70 | First, start by encoding: 71 | ```python runs/encode/run_layer_encode.py``` 72 | 73 | Once this step is done (again, it can take a while ~10 hours for each training encoding), run 74 | the evaluation: 75 | ```python runs/evaluate/run_layer_wise_lm.py``` 76 | 77 | ```python runs/evaluate/run_layer_wise_deprobe.py``` 78 | 79 | 80 | ## Citation 81 | If you find this work relevant to yours, please cite us: 82 | ``` 83 | @article{amnesic-probing, 84 | author = {Elazar, Yanai and Ravfogel, Shauli and Jacovi, Alon and Goldberg, Yoav}, 85 | title = "{Amnesic Probing: Behavioral Explanation with Amnesic Counterfactuals}", 86 | journal = {Transactions of the Association for Computational Linguistics}, 87 | volume = {9}, 88 | pages = {160-175}, 89 | year = {2021}, 90 | month = {03}, 91 | issn = {2307-387X}, 92 | doi = {10.1162/tacl_a_00359}, 93 | url = {https://doi.org/10.1162/tacl\_a\_00359}, 94 | eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00359/1894330/tacl\_a\_00359.pdf}, 95 | } 96 | 97 | ``` 98 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/encode_with_forward_pass/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | from tqdm import tqdm 7 | from transformers import BertForMaskedLM 8 | 9 | from amnesic_probing.encoders.bert_encoding import tokenize_and_preserve_labels 10 | from amnesic_probing.encoders.encode_with_forward_pass.bert_encoding import lm_encoding_with_projection, bert_based_encoding, \ 11 | lm_masked_encoding_with_projection 12 | 13 | 14 | def encode_text(data, encoder, tokenizer, masked=False, layer2projs=None, output_dir=None, batch_size = 512): 15 | 16 | batch_counter = 0 17 | encoded_vectors = defaultdict(list) 18 | encoded_labels = defaultdict(list) 19 | 20 | for i, datum in enumerate(tqdm(data)): 21 | 22 | #if i > 70: break 23 | 24 | tokens = datum['text'] 25 | 26 | if type(encoder) == BertForMaskedLM: 27 | if masked: 28 | layer2next_layers = lm_masked_encoding_with_projection(' '.join(tokens), encoder, tokenizer, 29 | layer2projs=layer2projs) 30 | 31 | else: 32 | layer2next_layers = lm_encoding_with_projection(' '.join(tokens), encoder, tokenizer, 33 | layer2projs=layer2projs) 34 | 35 | for key in layer2next_layers.keys(): 36 | encoded_vectors[key].append(layer2next_layers[key]) 37 | 38 | else: 39 | layer2next_layers = bert_based_encoding(' '.join(tokens), encoder, tokenizer) 40 | for key in layer2next_layers.keys(): 41 | encoded_vectors[key].append(layer2next_layers[key]) 42 | 43 | # going over all labels that were collected from the dataset 44 | for label_name, labels in datum['labels'].items(): 45 | tok_sen, tok_label = tokenize_and_preserve_labels(tokens, labels, tokenizer) 46 | encoded_labels[label_name].append(tok_label) 47 | 48 | encoded_labels['tokens'].append(tok_sen) 49 | 50 | if i % batch_size == 0 and i > 0: 51 | 52 | encoding = {'vectors': encoded_vectors, 'labels': encoded_labels} 53 | to_file(encoding, output_dir, batch_counter) 54 | batch_counter += 1 55 | encoded_vectors = defaultdict(list) 56 | encoded_labels = defaultdict(list) 57 | 58 | # reminder of last batch 59 | if i % batch_size != 0: 60 | encoding = {'vectors': encoded_vectors, 'labels': encoded_labels} 61 | 
to_file(encoding, output_dir, batch_counter) 62 | 63 | 64 | unite_batched_files(output_dir) 65 | 66 | def unite_batched_files(output_dir): 67 | 68 | filenames = defaultdict(list) 69 | 70 | for filename in os.listdir(output_dir): 71 | if "batch=" not in filename: continue 72 | prefix = ".".join(filename.split(".")[:-1]) 73 | 74 | batch = filename.split(".")[-1].split("=")[-1] 75 | filenames[prefix].append((int(batch), filename)) 76 | 77 | for prefix, fnames in filenames.items(): 78 | 79 | fnames = sorted(fnames, key = lambda batch_and_fname: batch_and_fname[0]) 80 | all_data = [] 81 | for batch, fname in fnames: 82 | 83 | data = np.load(output_dir + '/' + fname, allow_pickle = True) 84 | all_data.append(data) 85 | 86 | all_data = np.concatenate(all_data, axis = 0) 87 | with open(output_dir + "/" + prefix, "wb") as f: 88 | pickle.dump(all_data, f) 89 | 90 | for batch, fname in fnames: 91 | os.remove(output_dir + "/" + fname) 92 | 93 | def to_file(encoded_data, output_dir, batch_counter): 94 | if not os.path.isdir(output_dir): 95 | print('creating dir ', output_dir) 96 | os.makedirs(output_dir) 97 | 98 | for layer, vals in encoded_data['vectors'].items(): 99 | 100 | for j in range(len(vals[0]["next_layers"])): # foreach next layer (over all sentences) 101 | X = np.array([x["next_layers"][j] for x in vals]) 102 | layer_number = layer + j + 1 103 | path = output_dir + "from:{}.to:{}.npy.batch={}".format(layer, layer_number, batch_counter) 104 | 105 | with open(path, "wb") as f: 106 | np.save(f, X) 107 | 108 | for name, vals in encoded_data['labels'].items(): 109 | with open(output_dir + '/{}.pickle.batch={}'.format(name, batch_counter), 'wb') as f: 110 | pickle.dump(vals, f) 111 | -------------------------------------------------------------------------------- /runs/core/run_deprobe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | run_deprobe.py [--dry_run] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --dry_run if used, not running the experiments, and just printing them 8 | 9 | """ 10 | 11 | from docopt import docopt 12 | from runs.ts_run import parallelize 13 | 14 | # ┌──────────────────────┐ 15 | # │ connect to all nodes │ 16 | # └──────────────────────┘ 17 | nodes = [ 18 | 'nlp01', 19 | 'nlp02', 20 | ] 21 | 22 | # ┌──────────────────────┐ 23 | # │ running combination │ 24 | # └──────────────────────┘ 25 | 26 | runs_dic = { 27 | 'ontonotes': { 28 | 'base_dir': 'data/ontonotes_output_{}/train', 29 | 'vecs': 'last_vec.npy', 30 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'], 31 | 'task': ['task'], 32 | }, 33 | 'ontonotes_layers': { 34 | 'base_dir': 'data/ontonotes_output_{}/train', 35 | 'vecs': 'vec_layer:{}.npy', 36 | 'labels': ['ner', 'np_start', 'phrase_start', 'phrase_end'], 37 | 'layers': list(range(13)), 38 | 'task': ['task'], 39 | }, 40 | 'ud': { 41 | 'base_dir': 'data/ud_output_{}/train', 42 | 'vecs': 'last_vec.npy', 43 | 'labels': ['dep', 'pos', 'tag', 'pos_next_word'], 44 | 'task': ['task'], 45 | }, 46 | 'ud_layers': { 47 | 'base_dir': 'data/ud_output_{}/train', 48 | 'vecs': 'vec_layer:{}.npy', 49 | 'labels': ['tag'], 50 | 'layers': list(range(13)), 51 | 'task': ['task'], 52 | }, 53 | 'binary_tag': { 54 | 'base_dir': 'data/ud_output_{}/train', 55 | 'vecs': 'last_vec.npy', 56 | 'labels': ['tag_verb', 'tag_noun', 'tag_adp', 'tag_det', 'tag_num', 'tag_punct', 57 | 'tag_prt', 'tag_conj', 'tag_adv', 'tag_pron', 'tag_adj', 'tag_other'], 58 | 'task': ['task'], 59 | 'balance': 'true' 60 | }, 61 | 
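    # the next two entries probe binary (one-vs-rest) versions of individual dependency and NER labels; like the binary tag probes above, they train on label-balanced data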
'binary_dep': { 62 | 'base_dir': 'data/ud_output_{}/train', 63 | 'vecs': 'last_vec.npy', 64 | # used labels that appear at least 10K times 65 | 'labels': ['dep_adpmod', 'dep_det', 'dep_compmod', 'dep_num', 'dep_adpobj', 66 | 'dep_p', 'dep_poss', 'dep_adp', 'dep_amod', 'dep_nsubj', 67 | 'dep_dep', 'dep_dobj', 'dep_cc', 'dep_conj', 'dep_advmod', 68 | 'dep_ROOT', 'dep_ccomp', 'dep_aux', 'dep_xcomp', 'dep_neg'], 69 | 'task': ['task'], 70 | 'balance': 'true', 71 | }, 72 | 'binary_ner': { 73 | 'base_dir': 'data/ontonotes_output_{}/train', 74 | 'vecs': 'last_vec.npy', 75 | # used labels that appear at least 10K times 76 | 'labels': ['ner_*', 'ner_ORG', 'ner_CARDINAL', 'ner_GPE', 'ner_DATE', 'ner_PERSON'], 77 | 'task': ['task'], 78 | 'balance': 'true', 79 | }, 80 | 'ontonotes_control': { 81 | 'base_dir': 'data/ontonotes_output_{}/train', 82 | 'vecs': 'last_vec.npy', 83 | 'labels': ['ner_control', 'phrase_start_control', 'phrase_end_control'], 84 | 'task': ['task'], 85 | }, 86 | 'ud_control': { 87 | 'base_dir': 'data/ud_output_{}/train', 88 | 'vecs': 'last_vec.npy', 89 | 'labels': ['dep_control', 'pos_control', 'tag_control'], 90 | 'task': ['task'], 91 | }, 92 | } 93 | 94 | if __name__ == '__main__': 95 | arguments = docopt(__doc__) 96 | 97 | if arguments['--dry_run']: 98 | dry_run = True 99 | else: 100 | dry_run = False 101 | 102 | cartesian_product = [] 103 | for data_type, vals in runs_dic.items(): 104 | balanced = vals.get('balance', 'false') 105 | for masking in ['normal', 'masked']: 106 | base_dir = vals['base_dir'].format(masking) 107 | vecs = base_dir + '/' + vals['vecs'] 108 | for task in vals['task']: 109 | for label in vals['labels']: 110 | output_dir = 'models/lm/{0}/{1}/layer:{2}/' 111 | if task == 'task': 112 | task_name = label 113 | else: 114 | task_name = task 115 | # running over all the model layers 116 | if 'layers' in vals: 117 | for layer in vals['layers']: 118 | cartesian_product.append([vecs.format(layer), 119 | f'{base_dir}/{label}.pickle', 120 | output_dir.format(task_name, masking, layer), 121 | task, 122 | balanced], 123 | ) 124 | else: 125 | cartesian_product.append([vecs, 126 | f'{base_dir}/{label}.pickle', 127 | output_dir.format(task_name, masking, 'last'), 128 | task, 129 | balanced]) 130 | 131 | parallelize(nodes, cartesian_product, 'amnesic_probing/runs/core/run_deprobe.sh', 132 | on_gpu=False, dry_run=dry_run) 133 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/encode_with_forward_pass/encode.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | encode.py [--input_file=INPUT_FILE] [--projections_dir=PROJECTIONS_DIR] [--output_dir=OUTPUT_DIR] [--encoder=ENCODER] 4 | [--format=FORMAT] [--encode_format=ENCODE_FORMAT] [--device=DEVICE] [--num_layers=NUM_LAYERS] 5 | [--control=CONTROL] 6 | [--batch_size=BATCH_SIZE] 7 | 8 | Options: 9 | -h --help show this help message and exit 10 | --input_file=INPUT_FILE input file. conll format 11 | --projections_dir=PROJECTIONS_DIR directory where projection matrices are stored. 12 | --output_dir=OUTPUT_DIR output directory where to write the output files 13 | --encoder=ENCODER encoder. types: bert-base-uncased, qa, ... [default: bert-base-uncased] 14 | --format=FORMAT data format: conll, ontonotes, semtagging, fce 15 | --encode_format=ENCODE_FORMAT encoding: normal, masked 16 | --device=DEVICE cpu, cuda:0, cuda:1, ... 
[default: cpu] 17 | --num_layers=NUM_LAYERS how many layers does the model have [default: 12] 18 | --control=CONTROL instead of using the learned projection, creating random ones, with the same amount of 19 | directions. values: true|false [default: false] 20 | --batch_size=BATCH_SIZE batch_size [default: 1024] 21 | 22 | """ 23 | 24 | import numpy as np 25 | import torch 26 | import json 27 | from docopt import docopt 28 | 29 | from amnesic_probing.encoders import get_pretrained_models, read_conll_format, read_onto_notes_format, \ 30 | read_sem_tagging_format, read_coarse_sem_tagging_format, read_fce_format, read_coord_format 31 | from amnesic_probing.encoders.encode_with_forward_pass import encode_text, to_file 32 | from amnesic_probing.debias.debias import debias_by_specific_directions 33 | 34 | 35 | def create_rand_dir_projection(dim, n_coord): 36 | # creating random directions (vectors) within the range of -0.5 : 0.5 37 | rand_directions = [np.random.rand(1, dim) - 0.5 for _ in range(n_coord)] 38 | 39 | # finding the null-space of random directions 40 | rand_direction_projection = debias_by_specific_directions(rand_directions, dim) 41 | return rand_direction_projection 42 | 43 | 44 | def get_random_projection(dir_path, vecs_dim=768): 45 | with open(dir_path + '/meta.json', 'r') as f: 46 | meta = json.load(f) 47 | n_coord = int(meta['removed_directions']) 48 | projection = create_rand_dir_projection(vecs_dim, n_coord) 49 | return projection 50 | 51 | 52 | def load_projection_matrices(dir_path, num_layers, control: bool, device="cpu"): 53 | layer2proj = {} 54 | if dir_path is None: 55 | print("*** WARNING: PROJECTION DIR NOT SUPPLIED, USES RANDOM PROJECTION MATRICES ***") 56 | for layer in range(0, num_layers + 1): 57 | layer2proj[layer] = torch.eye(768).to(device).float() #torch.tensor(np.random.rand(768, 768)).to(device).float() 58 | 59 | else: 60 | for layer in range(0, num_layers + 1): 61 | if control: 62 | print('generating random projections') 63 | projection = get_random_projection(dir_path + f'/layer:{layer}/', vecs_dim=768) 64 | else: 65 | projection = np.load(dir_path + f'/layer:{layer}/P.npy') 66 | layer2proj[layer] = torch.tensor(projection).to(device).float() 67 | 68 | return layer2proj 69 | 70 | 71 | if __name__ == '__main__': 72 | arguments = docopt(__doc__) 73 | 74 | #arguments = {"--encoder": "bert-base-uncased", "--device": "cuda", "--num_layers": 12, "--format": "conll", "--encode_format": "normal", "--input_file": "data/ud/en-universal-dev.conll", "--projections_dir": None, "--output_dir": "data/ud_output_projection_normal/train/", "--batch_size": 32} 75 | 76 | encoder, tokenizer = get_pretrained_models(arguments['--encoder']) 77 | encoder = encoder.to(arguments['--device']) 78 | print(arguments['--num_layers']) 79 | 80 | control = bool(arguments['--control'] == 'true') 81 | print('control projections: ', control) 82 | 83 | layer2projs = load_projection_matrices(arguments["--projections_dir"], int(arguments['--num_layers']), control, 84 | arguments['--device']) 85 | 86 | data_format = arguments['--format'] 87 | if data_format == 'conll': 88 | data = read_conll_format(arguments['--input_file']) 89 | elif data_format == 'ontonotes': 90 | data = read_onto_notes_format(arguments['--input_file']) 91 | elif data_format == 'semtagging': 92 | data = read_sem_tagging_format(arguments['--input_file']) 93 | elif data_format == 'coarse_semtagging': 94 | data = read_coarse_sem_tagging_format(arguments['--input_file']) 95 | elif data_format == 'fce': 96 | data = 
read_fce_format(arguments['--input_file']) 97 | elif data_format == 'coord': 98 | data = read_coord_format(arguments['--input_file']) 99 | else: 100 | raise Exception('Unsupported file format exception') 101 | 102 | masked_encoding = arguments['--encode_format'] == 'masked' 103 | final_data = encode_text(data, encoder, tokenizer, masked=masked_encoding, layer2projs=layer2projs, output_dir = arguments['--output_dir'], batch_size = int(arguments["--batch_size"])) 104 | 105 | #to_file(final_data, ) 106 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/layer_wise_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | layer_wise_lm.py [--proj_vecs=PROJ_VECS] [--labels=LABELS] [--text=TEXT] [--task=TASK] 4 | [--n=N] [--device=DEVICE] 5 | [--wandb] 6 | 7 | Options: 8 | -h --help show this help message and exit 9 | --proj_vecs=proj_vecs input dir of all the projected layers 10 | --labels=LABELS labels file. using the train path (and automatically also using the dev, 11 | by replacing train by dev) 12 | --text=TEXT text file 13 | --task=TASK task type. between word_ind, sen_len, task [default: task] 14 | --n=N number of training examples [default: 100000] 15 | --device=DEVICE cpu, cuda:0, cuda:1, ... [default: cpu] 16 | --wandb log using wandb 17 | 18 | """ 19 | 20 | import wandb 21 | from docopt import docopt 22 | from sklearn.utils import shuffle 23 | from tqdm import tqdm 24 | 25 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 26 | from amnesic_probing.tasks.utils import read_files, get_lm_predictions_gpu, learn_cls, learn_pls_cls, get_lm_vals, \ 27 | classification_tasks 28 | 29 | 30 | def layer_eval(vecs_train, vecs_dev, labels_train, labels_dev, y_ids, out_embed, bias, task, device): 31 | x_train = vecs_train 32 | x_dev = vecs_dev 33 | 34 | # probing acc on the last layer after projection from layer i (< last layer) 35 | if task in classification_tasks: 36 | task_acc = learn_cls(x_train, labels_train, x_dev, labels_dev) 37 | else: 38 | task_acc = learn_pls_cls(x_train, labels_train, x_dev, labels_dev) 39 | 40 | # lm prediction acc after projection from layer i (< last layer) 41 | base_acc = get_lm_predictions_gpu(out_embed, bias, vecs_dev, y_ids, device=device) 42 | 43 | return task_acc, base_acc 44 | 45 | 46 | def eval_layers(in_vecs_dir, in_labels_f, in_texts_f, task, device, n): 47 | _, tokenizer, out_embed, bias = get_lm_vals('bert-base-uncased') 48 | 49 | tasks_results = [] 50 | 51 | for from_layer in tqdm(range(13)): 52 | vecs_train_f = f'{in_vecs_dir}/from:{from_layer}.to:{13}.npy' 53 | labels_train_f = in_labels_f 54 | 55 | vecs_train, labels_train, sentences_train = read_files(vecs_train_f, labels_train_f, text_f=in_texts_f, 56 | ignore_special_tokens=True) 57 | vecs_dev, labels_dev, sentences_dev = read_files(vecs_train_f.replace('train', 'dev'), 58 | labels_train_f.replace('train', 'dev'), 59 | text_f=in_texts_f.replace('train', 'dev'), 60 | ignore_special_tokens=True) 61 | 62 | (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train, 63 | labels_train, 64 | sentences_train, 65 | vecs_dev, labels_dev, 66 | sentences_dev) 67 | 68 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=min(len(y_train), n)) 69 | 70 | y_ids = tokenizer.convert_tokens_to_ids(words_dev) 71 | task_acc, lm_acc = layer_eval(x_train, x_dev, y_train, y_dev, y_ids, out_embed, bias, task, device) 72 | 
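        # task_acc: probing accuracy for the property at the last layer after projecting it out at layer `from_layer`; lm_acc: word-prediction accuracy of the LM head under the same intervention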
print(from_layer, task_acc, lm_acc) 73 | tasks_results.append([from_layer, task_acc, lm_acc]) 74 | return tasks_results 75 | 76 | 77 | def log_wandb(arguments): 78 | labels = arguments['--labels'].split('.')[0] 79 | task_type = labels.split('/')[-1] 80 | 81 | task = arguments['--task'] 82 | if task != 'task': 83 | task_type = task 84 | classification_type = 'regression' 85 | else: 86 | classification_type = 'classification' 87 | 88 | data_orig = labels.split('data/')[1].split('/')[0] 89 | print(labels) 90 | print(data_orig) 91 | dataset = data_orig.split('_output', 1)[0] 92 | masking = data_orig.rsplit('_', 1)[1] 93 | 94 | config = dict( 95 | property=task_type, 96 | encoder='bert-base-uncased', 97 | dataset=dataset, 98 | masking=masking, 99 | ) 100 | 101 | wandb.init( 102 | name=f'{task_type}_layer_wise_lm', 103 | project="amnesic_probing", 104 | tags=["layer_wise", "lm", task_type, classification_type], 105 | config=config, 106 | ) 107 | 108 | 109 | if __name__ == '__main__': 110 | arguments = docopt(__doc__) 111 | 112 | labels = arguments['--labels'] 113 | n = int(arguments['--n']) 114 | proj_vecs = arguments['--proj_vecs'] 115 | texts = arguments['--text'] 116 | task = arguments['--task'] 117 | 118 | use_wandb = arguments['--wandb'] 119 | if use_wandb: 120 | log_wandb(arguments) 121 | 122 | results = eval_layers(proj_vecs, labels, texts, task, arguments['--device'], int(arguments['--n'])) 123 | 124 | if use_wandb: 125 | wandb.log({'per_layer_deprobe': wandb.Table(data=results, 126 | columns=['from', 'task_acc', 'lm_acc'])}) 127 | -------------------------------------------------------------------------------- /amnesic_probing/debiased_finetuning/rebiased_finetuning_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | rebiased_finetuning_lm.py [--train_path=TRAIN_PATH] [--encoder=ENCODER] [--debias=DEBIAS] [--rebias=REBIAS] 4 | [--n_epochs=N_EPOCHS] [--out_dir=OUT_DIR] [--device=DEVICE] [--wandb] 5 | 6 | Options: 7 | -h --help show this help message and exit 8 | --train_path=TRAIN_PATH input directory 9 | --encoder=ENCODER encoder. types: bert-base-uncased, qa, ... [default: bert-base-uncased] 10 | --debias=DEBIAS the debias projection matrix to use, or none for no debiasing 11 | --rebias=REBIAS the labels to add back to the representation 12 | --n_epochs=N_EPOCHS number of epochs to run the classifier [default: 20] 13 | --out_dir=OUT_DIR logs and outputs directory 14 | --device=DEVICE device to use: cpu, cuda:0, cuda:1, ... 
[defaults: cpu] 15 | --wandb log using wandb 16 | 17 | """ 18 | 19 | import numpy as np 20 | import torch 21 | from docopt import docopt 22 | import wandb 23 | 24 | from amnesic_probing.debias.classifier import PytorchClassifier 25 | from amnesic_probing.debiased_finetuning.utils import load_data, load_labels, flatten_tokens, flatten_label_list 26 | from amnesic_probing.tasks.utils import get_lm_vals 27 | 28 | 29 | class RebiasClassifier(torch.nn.Module): 30 | def __init__(self, debias, num_bias_labels, classifier): 31 | super(RebiasClassifier, self).__init__() 32 | 33 | debias_net = torch.nn.Linear(in_features=debias.shape[1], out_features=debias.shape[0], bias=False) 34 | debias_net.weight.data = torch.tensor(debias, dtype=torch.float) 35 | for p in debias_net.parameters(): 36 | p.requires_grad = False 37 | net = torch.nn.Linear(in_features=classifier.shape[1], out_features=classifier.shape[0]) 38 | net.weight.data = torch.tensor(classifier) 39 | net.bias.data = torch.tensor(bias) 40 | 41 | rebias_embedding_size = 32 42 | encode_rebias = torch.nn.Embedding(num_bias_labels, rebias_embedding_size) 43 | rebias_net = torch.nn.Linear(in_features=classifier.shape[1]+rebias_embedding_size, 44 | out_features=classifier.shape[1]) 45 | 46 | self.debias_net = debias_net 47 | self.encode_rebias = encode_rebias 48 | self.rebias_net = rebias_net 49 | self.classifier_net = net 50 | 51 | def forward(self, input: torch.Tensor): 52 | rebias_labels = input[:, -1].long() 53 | input = input[:, :-1].float() 54 | 55 | debiased_input = self.debias_net(input) 56 | rebias = self.encode_rebias(rebias_labels) 57 | rebiased_input = torch.cat([debiased_input, rebias], dim=1) 58 | rebiased_input = self.rebias_net(rebiased_input) 59 | 60 | return self.classifier_net(rebiased_input) 61 | 62 | 63 | def log_wandb(arguments): 64 | out_dir = arguments['--out_dir'] 65 | if out_dir[-1] == '/': 66 | out_dir = out_dir[:-1] 67 | task_type_full = out_dir.split('models/lm/')[1] 68 | task_type = task_type_full.split('/')[0] 69 | masking = task_type_full.split('/')[1] 70 | dataset_full = arguments['--train_path'].split('data/')[1].split('/')[0] 71 | dataset = dataset_full.split('_output', 1)[0] 72 | debias = arguments['--debias'] 73 | if debias == 'none': 74 | task_type = task_type + '_baseline' 75 | if len(debias.rsplit('/', 1)) > 1: 76 | proj = debias.rsplit('/', 1).split('.')[0] 77 | if '_' in proj: 78 | num = proj.split('_')[1] 79 | task_type = task_type + f'_iter:{num}' 80 | debias = task_type 81 | 82 | config = dict( 83 | property=task_type, 84 | encoder='bert-base-uncased', 85 | dataset=dataset, 86 | masking=masking, 87 | debias_property=debias 88 | ) 89 | 90 | wandb.init( 91 | name=task_type + '_ft_1hot', 92 | project="amnesic_probing", 93 | tags=["lm", "1hot_emb_ft", task_type], 94 | config=config, 95 | ) 96 | 97 | 98 | if __name__ == '__main__': 99 | arguments = docopt(__doc__) 100 | 101 | use_wandb = arguments['--wandb'] 102 | if use_wandb: 103 | log_wandb(arguments) 104 | 105 | _, tokenizer, word_embeddings, bias = get_lm_vals(arguments['--encoder']) 106 | 107 | train_dir = arguments['--train_path'] 108 | dev_dir = train_dir.replace('train', 'dev') 109 | 110 | if arguments['--debias'] == 'none': 111 | debias = np.eye(768) 112 | else: 113 | debias = np.load(arguments['--debias']) 114 | 115 | rebias_labels_name = arguments['--rebias'] 116 | 117 | train_vecs, train_words = load_data(train_dir) 118 | train_labels = load_labels(f'{train_dir}/{rebias_labels_name}.pickle') 119 | 120 | dev_vecs, dev_words = 
load_data(train_dir.replace('train', 'dev')) 121 | dev_labels = load_labels(f'{dev_dir}/{rebias_labels_name}.pickle') 122 | 123 | all_labels = list(set([y for sen_y in train_labels for y in sen_y])) 124 | 125 | x_train, y_words_train = flatten_tokens(train_vecs, train_words, tokenizer) 126 | y_labels_train = flatten_label_list(train_labels, all_labels) 127 | assert len(x_train) == len(y_words_train), f"{len(x_train)}, {len(y_words_train)}" 128 | assert len(x_train) == len(y_labels_train), f"{len(x_train)}, {len(y_labels_train)}" 129 | 130 | x_dev, y_words_dev = flatten_tokens(dev_vecs, dev_words, tokenizer) 131 | y_labels_dev = flatten_label_list(dev_labels, all_labels) 132 | assert len(x_dev) == len(y_words_dev), f"{len(x_dev)}, {len(y_words_dev)}" 133 | assert len(x_dev) == len(y_labels_dev), f"{len(x_dev)}, {len(y_labels_dev)}" 134 | 135 | # adding bias label to input 136 | x_train = np.concatenate([x_train, y_labels_train.reshape(-1, 1)], axis=-1) 137 | x_dev = np.concatenate([x_dev, y_labels_dev.reshape(-1, 1)], axis=-1) 138 | 139 | out_dir = arguments['--out_dir'] 140 | net = RebiasClassifier(debias=debias, num_bias_labels=len(all_labels), classifier=word_embeddings) 141 | net = PytorchClassifier(net, device=arguments['--device']) 142 | net.train(x_train, y_words_train, x_dev, y_words_dev, 143 | epochs=int(arguments['--n_epochs']), 144 | save_path=f"{out_dir}/finetuned_with_rebias.pt", 145 | use_wandb=use_wandb) 146 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/task_specific_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | task_specific_eval.py [--vecs=VECS] [--labels=LABELS] [--text=TEXT] 4 | [--task=TASK] 5 | [--deprobe_dir=DEPROBE_DIR] 6 | [--device=DEVICE] 7 | [--wandb] 8 | 9 | Options: 10 | -h --help show this help message and exit 11 | --vecs=VECS input vectors file 12 | --labels=LABELS labels file 13 | --text=TEXT text file 14 | --task=TASK task type. between word_ind, sen_len, task [default: task] 15 | --deprobe_dir=DEPROBE_DIR directory where the amnesic_probing files are located. 16 | --device=DEVICE cpu, cuda:0, cuda:1, ... 
[default: cpu] 17 | --wandb log using wandb 18 | 19 | """ 20 | 21 | import os 22 | from collections import Counter, defaultdict 23 | 24 | import numpy as np 25 | import pandas as pd 26 | import wandb 27 | from docopt import docopt 28 | 29 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 30 | from amnesic_probing.tasks.lm import load_deprobing_params 31 | from amnesic_probing.tasks.utils import data_projection, read_files, get_lm_vals, get_lm_predictions_gpu, \ 32 | rand_direction_control 33 | 34 | _, tokenizer, out_embed, bias = get_lm_vals('bert-base-uncased') 35 | 36 | 37 | def eval_lm_per_task(tokenizer, out_embed, bias, x, words, y, projection, n_coords, label2ind, device='cpu'): 38 | y_ids = np.array(tokenizer.convert_tokens_to_ids(words)) 39 | 40 | lm_results = defaultdict(dict) 41 | 42 | for label_name, label_ind in label2ind.items(): 43 | labels_indices = y == label_ind 44 | if sum(labels_indices) == 0: 45 | continue 46 | x_labels = x[labels_indices] 47 | y_id_labels = y_ids[labels_indices] 48 | 49 | base_acc = get_lm_predictions_gpu(out_embed, bias, x_labels, y_id_labels, device=device) 50 | 51 | x_p = data_projection(x_labels, projection) 52 | p_acc = get_lm_predictions_gpu(out_embed, bias, x_p, y_id_labels, device=device) 53 | 54 | x_rand_dir = rand_direction_control(x_labels, n_coords) 55 | rand_dir_acc = get_lm_predictions_gpu(out_embed, bias, x_rand_dir, y_id_labels, device=device) 56 | 57 | lm_results[label_name]['vanilla'] = base_acc 58 | lm_results[label_name]['p'] = p_acc 59 | lm_results[label_name]['rand_dir'] = rand_dir_acc 60 | 61 | return pd.DataFrame(lm_results).T 62 | 63 | 64 | def log_wandb(arguments): 65 | task_name = arguments['--deprobe_dir'].split('models/lm/')[1] 66 | task_type = task_name.split('/')[0] 67 | layer = arguments['--vecs'].split('/')[-1].split('.')[0] 68 | 69 | labels = arguments['--labels'].split('.')[0] 70 | data_orig = labels.split('data/')[1].split('/')[0] 71 | print(labels) 72 | print(data_orig) 73 | dataset = data_orig.split('_output', 1)[0] 74 | masking = data_orig.rsplit('_', 1)[1] 75 | 76 | config = dict( 77 | property=task_type, 78 | encoder='bert-base-uncased', 79 | dataset=dataset, 80 | masking=masking, 81 | layer=layer 82 | ) 83 | 84 | wandb.init( 85 | name=task_type + '_task_specific_eval', 86 | project="amnesic_probing", 87 | tags=["lm", "eval", "task_specific", task_type], 88 | config=config, 89 | ) 90 | 91 | 92 | if __name__ == '__main__': 93 | arguments = docopt(__doc__) 94 | 95 | deprobe_dir = arguments['--deprobe_dir'] 96 | if not os.path.isdir(deprobe_dir): 97 | assert 'Deprobing directory does not exists...' 
98 | 99 | use_wandb = arguments['--wandb'] 100 | if use_wandb: 101 | log_wandb(arguments) 102 | 103 | vecs_train, labels_train, sentences_train = read_files(arguments['--vecs'], 104 | arguments['--labels'], 105 | arguments['--text'], ignore_special_tokens=True) 106 | vecs_dev, labels_dev, sentences_dev = read_files(arguments['--vecs'].replace('train', 'dev'), 107 | arguments['--labels'].replace('train', 'dev'), 108 | arguments['--text'].replace('train', 'dev'), 109 | ignore_special_tokens=True) 110 | 111 | task = arguments['--task'] 112 | 113 | (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train, labels_train, 114 | sentences_train, 115 | vecs_dev, labels_dev, 116 | sentences_dev) 117 | 118 | pos2ind = {p: i for i, p in enumerate(sorted(set([item for sublist in labels_train for item in sublist])))} 119 | 120 | print('number of classes', len(pos2ind)) 121 | print('most common class', Counter(y_dev).most_common(1)[0][1] / float(len(y_dev))) 122 | 123 | if use_wandb: 124 | wandb.run.summary['n_classes'] = len(pos2ind) 125 | wandb.run.summary['majority'] = Counter(y_dev).most_common(1)[0][1] / float(len(y_dev)) 126 | 127 | proj_file = deprobe_dir + '/P.npy' 128 | meta = load_deprobing_params(deprobe_dir + '/meta.json') 129 | n_coords = int(meta['removed_directions']) 130 | 131 | if os.path.isfile(proj_file): 132 | P = np.load(proj_file) 133 | else: 134 | raise FileNotFoundError('projection file does not exists...') 135 | 136 | print('evaluating performance') 137 | # calculating the number of dimensions that were removed 138 | 139 | device = arguments['--device'] 140 | lm_results_df = eval_lm_per_task(tokenizer, out_embed, bias, x_dev, words_dev, y_dev, P, n_coords, pos2ind, 141 | device=device) 142 | 143 | if use_wandb: 144 | labels_names = lm_results_df.index.to_numpy() 145 | table_results = np.concatenate((labels_names.reshape(len(labels_names), 1), lm_results_df.to_numpy()), axis=1).tolist() 146 | wandb.log({"results": wandb.Table(data=table_results, 147 | columns=['label'] + lm_results_df.columns.tolist())}) 148 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/lm_per_dim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | lm.py [--vecs=VECS] [--labels=LABELS] [--text=TEXT] 4 | [--task=TASK] 5 | [--deprobe_dir=DEPROBE_DIR] 6 | [--display_examples=DISPLAY_EXAMPLES] 7 | [--device=DEVICE] 8 | [--n=N] 9 | [--wandb] 10 | 11 | Options: 12 | -h --help show this help message and exit 13 | --vecs=VECS input vectors file 14 | --labels=LABELS labels file 15 | --text=TEXT text file 16 | --task=TASK task type. between word_ind, sen_len, task [default: task] 17 | --deprobe_dir=DEPROBE_DIR directory where the amnesic_probing files are located. 18 | --display_examples=DISPLAY_EXAMPLES number of examples to display [default: 10] 19 | --device=DEVICE cpu, cuda:0, cuda:1, ... 
[default: cpu] 20 | --n=N number of training examples [default: 100000] 21 | --wandb log using wandb 22 | 23 | """ 24 | 25 | import glob 26 | import json 27 | import os 28 | from collections import Counter 29 | 30 | import numpy as np 31 | import pandas as pd 32 | import wandb 33 | from docopt import docopt 34 | from sklearn.utils import shuffle 35 | 36 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 37 | from amnesic_probing.tasks.lm import eval_lm_performance, eval_task_performance 38 | from amnesic_probing.tasks.utils import data_projection, read_files, get_lm_vals 39 | 40 | _, tokenizer, out_embed, bias = get_lm_vals('bert-base-uncased') 41 | 42 | 43 | def log_wandb(arguments): 44 | task_name = arguments['--deprobe_dir'].split('models/lm/')[1] 45 | task_type = task_name.split('/')[0] 46 | layer = arguments['--vecs'].split('/')[-1].split('.')[0] 47 | 48 | labels = arguments['--labels'].split('.')[0] 49 | data_orig = labels.split('data/')[1].split('/')[0] 50 | print(labels) 51 | print(data_orig) 52 | dataset = data_orig.split('_output', 1)[0] 53 | masking = data_orig.rsplit('_', 1)[1] 54 | 55 | config = dict( 56 | property=task_type, 57 | encoder='bert-base-uncased', 58 | dataset=dataset, 59 | masking=masking, 60 | layer=layer 61 | ) 62 | 63 | wandb.init( 64 | name=task_type + '_eval_iterations', 65 | project="amnesic_probing", 66 | tags=["lm", "eval", task_type], 67 | config=config, 68 | ) 69 | 70 | 71 | def load_deprobing_params(in_file): 72 | with open(in_file, 'r') as f: 73 | meta = json.load(f) 74 | return meta 75 | 76 | 77 | if __name__ == '__main__': 78 | arguments = docopt(__doc__) 79 | 80 | deprobe_dir = arguments['--deprobe_dir'] 81 | if not os.path.isdir(deprobe_dir): 82 | assert 'Deprobing directory does not exists...' 
83 | 84 | use_wandb = arguments['--wandb'] 85 | if use_wandb: 86 | log_wandb(arguments) 87 | 88 | vecs_train, labels_train, sentences_train = read_files(arguments['--vecs'], 89 | arguments['--labels'], 90 | arguments['--text'], ignore_special_tokens=True) 91 | vecs_dev, labels_dev, sentences_dev = read_files(arguments['--vecs'].replace('train', 'dev'), 92 | arguments['--labels'].replace('train', 'dev'), 93 | arguments['--text'].replace('train', 'dev'), 94 | ignore_special_tokens=True) 95 | 96 | task = arguments['--task'] 97 | 98 | (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train, labels_train, 99 | sentences_train, 100 | vecs_dev, labels_dev, 101 | sentences_dev) 102 | 103 | pos2ind = {p: i for i, p in enumerate(sorted(set([item for sublist in labels_train for item in sublist])))} 104 | 105 | print('number of classes', len(pos2ind)) 106 | print('most common class', Counter(y_dev).most_common(1)[0][1] / float(len(y_dev))) 107 | 108 | meta = load_deprobing_params(deprobe_dir + '/meta.json') 109 | n_coords = int(meta['removed_directions']) 110 | n_classes = len(pos2ind) 111 | 112 | if use_wandb: 113 | wandb.run.summary['n_classes'] = len(pos2ind) 114 | wandb.run.summary['majority'] = Counter(y_dev).most_common(1)[0][1] / float(len(y_dev)) 115 | wandb.run.summary['removed_directions'] = n_coords 116 | 117 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=min(len(y_train), int(arguments['--n']))) 118 | device = arguments['--device'] 119 | 120 | results_dic = {} 121 | for proj_file in glob.glob(deprobe_dir + '/P_*.npy'): 122 | P = np.load(proj_file) 123 | 124 | print('evaluating performance') 125 | proj_iter = int(proj_file.split('/P_')[1].split('.npy')[0]) 126 | 127 | removed_directions = int((proj_iter + 1) * n_classes) 128 | # in case of 2 classes, each inlp iteration we remove a single direction 129 | if n_classes == 2: 130 | removed_directions /= 2 131 | 132 | print(removed_directions) 133 | lm_results = eval_lm_performance(tokenizer, out_embed, bias, x_dev, words_dev, P, 134 | n_coords=int(removed_directions), device=device) 135 | 136 | print('removing property from inputs') 137 | 138 | x_train_no_label = data_projection(x_train, P) 139 | x_dev_no_label = data_projection(x_dev, P) 140 | 141 | task_results = eval_task_performance(x_train, y_train, x_dev, y_dev, x_train_no_label, x_dev_no_label, task) 142 | 143 | all_results = {**lm_results, **task_results} 144 | for k, v in all_results.items(): 145 | print(k, v) 146 | 147 | results_dic[proj_iter] = all_results 148 | 149 | results_df = pd.DataFrame(results_dic).T 150 | results_df.index = results_df.index.set_names('iter') 151 | results_df.columns = results_df.columns 152 | results_df = results_df.reset_index() 153 | 154 | if use_wandb: 155 | table_data = results_df.values.tolist() 156 | wandb.log({"results": wandb.Table(data=table_data, columns=results_df.columns.tolist())}) 157 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/encode_with_forward_pass/bert_encoding.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | 7 | 8 | def forward_from_specific_layer(model, layer_number: int, layer_representation: torch.Tensor): 9 | """ 10 | :param model: a BertForMaskedLM model 11 | :param layer_representation: a torch tensor, dims: [1, seq length, 768] 12 | Return: 13 | states, 
a numpy array. dims: [#LAYERS - layer_number, seq length, 768] 14 | last_state_after_batch_norm: np array, after batch norm. dims: [seq_length, 768] 15 | """ 16 | 17 | layers = model.bert.encoder.layer[layer_number:] 18 | layers.append(model.cls.predictions.transform) 19 | 20 | h = layer_representation 21 | states = [] 22 | 23 | for i, layer in enumerate(layers): 24 | 25 | h = layer(h)[0] if i != len(layers) -1 else layer(h) 26 | states.append(h) 27 | 28 | #states[-1] = states[-1].unsqueeze(0) 29 | 30 | for i, s in enumerate(states): 31 | states[i] = s.detach().cpu().numpy() 32 | 33 | states = np.array(states) 34 | 35 | for x in states: 36 | assert len(x.shape) == 3 37 | 38 | return states 39 | 40 | 41 | 42 | def lm_encoding_with_projection(text, lm_model, tokenizer, layer2projs=None): 43 | 44 | device = next(lm_model.parameters()).device 45 | result_dict = defaultdict(dict) 46 | # Encode text 47 | input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)]).to(device) 48 | 49 | with torch.no_grad(): 50 | last_hidden_states = lm_model(input_ids) 51 | 52 | # [1] - the second tuple with all hidden layers, 53 | # [-1] - the last layer 54 | # [0] bs = 0 55 | 56 | rep_state = [last_hidden_states[1][i][0] for i in range(len(last_hidden_states[1]))] 57 | 58 | for layer in range(0, len(rep_state)): 59 | P = layer2projs[layer] 60 | states = rep_state[layer]# if layer != 0 else input_embds 61 | 62 | # removing properties on all tokens except special tokens (cls, sep) 63 | states_projected = torch.cat([states[:1], (states[1:-1] @ P), states[-1:]], dim=0) 64 | # states_projected = states @ P 65 | 66 | next_layers = forward_from_specific_layer(lm_model, layer, states_projected.unsqueeze(0)).squeeze(1) 67 | result_dict[layer]["next_layers"] = next_layers 68 | result_dict[layer]["layer_projected"] = states_projected.detach().cpu().numpy() 69 | 70 | return result_dict 71 | 72 | 73 | def lm_masked_encoding_with_projection(text, lm_model, tokenizer: transformers.PreTrainedTokenizer, batch_size=8, 74 | layer2projs=None): 75 | """ 76 | CUDA_VISIBLE_DEVICES=2 python amnesic_probing/encoders/encode.py 77 | --input_file=ud/en-universal-dev.conll 78 | --output_dir=out/conll_masked_dev 79 | --encoder=bert-base-uncased 80 | --format=conll 81 | --encode_format=masked 82 | --device=cuda:0 83 | """ 84 | device = next(lm_model.parameters()).device 85 | result_dict = defaultdict(dict) 86 | # logger.warning(f"BERT device found to be {device}. " 87 | # f"We assume the entire model is only on this device (no multiple gpus)!") 88 | 89 | input_ids = tokenizer.encode(text) 90 | 91 | masked_inputs = [] 92 | 93 | for tok_ix in range(len(input_ids)): 94 | masked_input = list(input_ids) 95 | masked_input[tok_ix] = tokenizer.mask_token_id 96 | masked_inputs.append(torch.tensor(masked_input).unsqueeze(0).to(device)) # SHAPE: (1, len) 97 | 98 | # masked_inputs is (len, 1, len); masked_inputs[i][j] will be the representation of word j on the version where word i is maked. 99 | 100 | # starting from 1 [for i in range(1, ... 
] because we're not interested in masking CLS 101 | batches = [{'tensors': torch.cat(masked_inputs[i:i + batch_size], dim=0), 102 | 'masked_token_ixs': list(range(i, i + len(masked_inputs[i:i + batch_size])))} 103 | for i in range(0, len(input_ids), batch_size)] # SHAPE: (batch_size, len) 104 | 105 | # batches is (num_batches_per_sent, batch_size, len) 106 | 107 | all_rep_states = defaultdict(list) 108 | 109 | with torch.no_grad(): 110 | for batch in batches: 111 | tensors = batch['tensors'] 112 | masked_token_ixs = batch['masked_token_ixs'] 113 | 114 | last_hidden_states = lm_model(tensors) 115 | 116 | rep_state = list(last_hidden_states[1]) # (num_layers, batch, len, 768) 117 | 118 | # to numpy 119 | # rep_state = np.array([h.detach().cpu().numpy() for h in rep_state]) 120 | # rep_state = np.swapaxes(rep_state, 0, 1) # now shape is (batch_size, num_layers, len, 768) 121 | 122 | for layer in range(0, len(rep_state)): 123 | h = rep_state[layer] # (batch, len, 768) 124 | P = layer2projs[layer] 125 | # h_projected = h @ P 126 | # removing properties on all tokens except special tokens (cls, sep) 127 | h_projected = torch.cat([h[:1], (h[1:-1] @ P), h[-1:]], dim=0) 128 | 129 | next_layers = forward_from_specific_layer(lm_model, layer, 130 | h_projected) # (num_remaining_layers, batch, len, 768) 131 | relevant_next_layers = np.array( 132 | [[layer[i][mask_ind] for i, mask_ind in enumerate(masked_token_ixs)] for layer in next_layers]) 133 | # shape is (remaining_layers, batch_size, 768). we took only the masked token from each element. 134 | 135 | relevant_next_layers = np.swapaxes(relevant_next_layers, 0, 1) 136 | all_rep_states[layer].append(relevant_next_layers) 137 | 138 | for layer in all_rep_states.keys(): 139 | states_lst = all_rep_states[layer] 140 | layer_seq = np.concatenate(states_lst, 141 | axis=0) # concatenate over the batch dim to reconstruct the full sequence 142 | layer_seq = np.swapaxes(layer_seq, 0, 1) 143 | result_dict[layer]["next_layers"] = layer_seq 144 | 145 | return result_dict 146 | 147 | 148 | def bert_based_encoding(text, encoder, tokenizer): 149 | input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)]) 150 | with torch.no_grad(): 151 | last_hidden_states = encoder.encode(input_ids) 152 | 153 | # [1] - the second tuple with all hidden layers, 154 | # [-1] - the last layer 155 | # [0] bs = 0 156 | # [1: -1] ignoring the special characters that were added before and after the sentence (cls) 157 | # import pdb; 158 | # pdb.set_trace() 159 | rep_state = last_hidden_states # [1: -1] 160 | 161 | return rep_state.detach().numpy() 162 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/layer_wise_deprobe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | layer_wise_deprobe.py [--layers=LAYERS] [--proj_vecs=PROJ_VECS] [--labels=LABELS] [--text=TEXT] [--task=TASK] 4 | [--n=N] 5 | [--wandb] 6 | 7 | Options: 8 | -h --help show this help message and exit 9 | --layers=LAYERS input folder of the plain layer vectors 10 | --proj_vecs=proj_vecs input dir of all the projected layers 11 | --labels=LABELS labels file. using the train path (and automatically also using the dev, 12 | by replacing train by dev) 13 | --text=TEXT text file 14 | --task=TASK task type. 
between word_ind, sen_len, task [default: task] 15 | --n=N number of training examples [default: 100000] 16 | --wandb log using wandb 17 | 18 | """ 19 | 20 | import pickle 21 | 22 | import numpy as np 23 | import wandb 24 | from docopt import docopt 25 | from sklearn.utils import shuffle 26 | from tqdm import tqdm 27 | 28 | from amnesic_probing.tasks.data_preparation import create_labeled_data 29 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 30 | from amnesic_probing.tasks.utils import learn_cls, read_files, learn_pls_cls, classification_tasks 31 | 32 | 33 | def get_labels_file(in_dir): 34 | with open(in_dir, 'rb') as f: 35 | labels = pickle.load(f) 36 | return labels 37 | 38 | 39 | def get_layer_vecs(layer_file): 40 | vectors = np.load(layer_file, allow_pickle=True) 41 | vectors = np.array([x[1:-1] for x in vectors]) 42 | return vectors 43 | 44 | 45 | def learn_cls_for_layer(layer_name, labels_train, labels_dev, n): 46 | vecs_train = get_layer_vecs(layer_name) 47 | vecs_dev = get_layer_vecs(layer_name.replace('train', 'dev')) 48 | x_train, y_train, label2i = create_labeled_data(vecs_train, labels_train) 49 | x_dev, y_dev, _ = create_labeled_data(vecs_dev, labels_dev, pos2i=label2i) 50 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=n) 51 | 52 | score = learn_cls(x_train, y_train, x_dev, y_dev) 53 | return score 54 | 55 | 56 | def learn_cls_for_layer_new(layer_name, labels_name, text_name, task, n): 57 | vecs_train, labels_train, sentences_train = read_files(layer_name, 58 | labels_name, 59 | text_name, ignore_special_tokens=True) 60 | vecs_dev, labels_dev, sentences_dev = read_files(layer_name.replace('train', 'dev'), 61 | labels_name.replace('train', 'dev'), 62 | text_name.replace('train', 'dev'), 63 | ignore_special_tokens=True) 64 | 65 | (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train, labels_train, 66 | sentences_train, 67 | vecs_dev, labels_dev, 68 | sentences_dev) 69 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=n) 70 | if task in classification_tasks: 71 | score = learn_cls(x_train, y_train, x_dev, y_dev) 72 | else: 73 | score = learn_pls_cls(x_train, y_train, x_dev, y_dev) 74 | return score 75 | 76 | 77 | def compute_per_layer_task_performance(in_dir, labels_file, text_file, task, n): 78 | print('layer probing for tasks') 79 | 80 | per_layer_score = np.zeros(14) 81 | 82 | for layer_index in tqdm(range(0, 14)): 83 | layer_name = f'{in_dir}/vec_layer:{layer_index}.npy' 84 | if layer_index == 13: 85 | layer_name = f'{in_dir}/last_vec.npy' 86 | 87 | score = learn_cls_for_layer_new(layer_name, labels_file, text_file, task, n) 88 | per_layer_score[layer_index] = score 89 | 90 | print(layer_index, score) 91 | 92 | return per_layer_score 93 | 94 | 95 | def compute_following_layers_task_performance(in_dir, labels_file, text_file, task, n): 96 | print('layer amnesic_probing. 
removing frmo layer i and testing on layer j') 97 | 98 | layers_task_results = [] 99 | 100 | for from_layer in range(0, 13): 101 | for to_layer in range(from_layer + 1, 14): 102 | vecs_name = f'{in_dir}/from:{from_layer}.to:{to_layer}.npy' 103 | score = learn_cls_for_layer_new(vecs_name, labels_file, text_file, task, n) 104 | 105 | layers_task_results.append([from_layer, to_layer, score]) 106 | 107 | print(from_layer, to_layer, score) 108 | return layers_task_results 109 | 110 | 111 | def log_wandb(arguments): 112 | labels = arguments['--labels'].split('.')[0] 113 | task_type = labels.split('/')[-1] 114 | 115 | task = arguments['--task'] 116 | if task != 'task': 117 | task_type = task 118 | classification_type = 'regression' 119 | else: 120 | classification_type = 'classification' 121 | 122 | data_orig = labels.split('data/')[1].split('/')[0] 123 | print(labels) 124 | print(data_orig) 125 | dataset = data_orig.split('_output', 1)[0] 126 | masking = data_orig.rsplit('_', 1)[1] 127 | 128 | config = dict( 129 | property=task_type, 130 | encoder='bert-base-uncased', 131 | dataset=dataset, 132 | masking=masking, 133 | ) 134 | 135 | wandb.init( 136 | name=f'{task_type}_layer_wise_eval', 137 | project="amnesic_probing", 138 | tags=["layer_wise", task_type, classification_type], 139 | config=config, 140 | ) 141 | 142 | 143 | if __name__ == '__main__': 144 | arguments = docopt(__doc__) 145 | 146 | labels = arguments['--labels'] 147 | layers = arguments['--layers'] 148 | texts = arguments['--text'] 149 | task = arguments['--task'] 150 | n = int(arguments['--n']) 151 | proj_vecs = arguments['--proj_vecs'] 152 | 153 | use_wandb = arguments['--wandb'] 154 | if use_wandb: 155 | log_wandb(arguments) 156 | 157 | layer_probe_results = compute_per_layer_task_performance(layers, labels, texts, task, n) 158 | layers_task_results = compute_following_layers_task_performance(proj_vecs, labels, texts, task, n) 159 | 160 | if use_wandb: 161 | layer_probing = np.concatenate((np.array(list(range(14))).reshape(14, 1), 162 | layer_probe_results.reshape(14, 1)), 163 | axis=1) 164 | wandb.log({"layer_probe": wandb.Table(data=layer_probing.tolist(), 165 | columns=['layer', 'probing'])}) 166 | 167 | wandb.log({'per_layer_deprobe': wandb.Table(data=layers_task_results, 168 | columns=['from', 'to', 'score'])}) 169 | -------------------------------------------------------------------------------- /amnesic_probing/debias/classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import wandb 4 | 5 | 6 | # an abstract class for linear classifiers 7 | 8 | 9 | class Classifier: 10 | def train(self, X_train: np.ndarray, Y_train: np.ndarray, X_dev: np.ndarray, Y_dev: np.ndarray) -> float: 11 | """ 12 | 13 | :param X_train: 14 | :param Y_train: 15 | :param X_dev: 16 | :param Y_dev: 17 | :return: accuracy score on the dev set 18 | """ 19 | raise NotImplementedError 20 | 21 | def get_weights(self) -> np.ndarray: 22 | """ 23 | :return: final weights of the model, as np array 24 | """ 25 | 26 | raise NotImplementedError 27 | 28 | 29 | class SKlearnClassifier(Classifier): 30 | def __init__(self, m): 31 | self.model = m 32 | 33 | def train_network(self, X_train: np.ndarray, Y_train: np.ndarray, X_dev: np.ndarray, Y_dev: np.ndarray) -> float: 34 | """ 35 | :param X_train: 36 | :param Y_train: 37 | :param X_dev: 38 | :param Y_dev: 39 | :return: accuracy score on the dev set / Person's R in the case of regression 40 | """ 41 | 42 | self.model.fit(X_train, 
Y_train) 43 | score = self.model.score(X_dev, Y_dev) 44 | return score 45 | 46 | def get_weights(self) -> np.ndarray: 47 | """ 48 | :return: final weights of the model, as np array 49 | """ 50 | 51 | w = self.model.coef_ 52 | if len(w.shape) == 1: 53 | w = np.expand_dims(w, 0) 54 | 55 | return w 56 | 57 | 58 | class PytorchClassifier(Classifier): 59 | def __init__(self, m: torch.nn.Module, device: str): 60 | self.m = m.to(device) 61 | self.device = device 62 | 63 | def eval(self, X_dev: np.ndarray, Y_dev: np.ndarray) -> float: 64 | X_dev = torch.tensor(X_dev).float().to(self.device) 65 | Y_dev = torch.tensor(Y_dev).to(self.device) 66 | test_dataset = torch.utils.data.TensorDataset(X_dev, Y_dev) 67 | testloader = torch.utils.data.DataLoader(test_dataset, batch_size=4096, 68 | shuffle=False) 69 | acc = self._eval(testloader) 70 | # print("Eval accuracy: ", acc) 71 | return acc 72 | 73 | def _eval(self, testloader: torch.utils.data.DataLoader): 74 | correct = 0 75 | total = 0 76 | with torch.no_grad(): 77 | for data in testloader: 78 | vectors, labels = data 79 | outputs = self.m(vectors) 80 | _, predicted = torch.max(outputs.data, 1) 81 | total += labels.size(0) 82 | correct += (predicted == labels).sum().item() 83 | 84 | # print('Accuracy of the network on test: %d %%' % ( 85 | # 100 * correct / total)) 86 | return correct / total 87 | 88 | def get_probs(self, x: np.ndarray, y) -> np.ndarray: 89 | X = torch.tensor(x).float().to(self.device) 90 | Y = torch.tensor(y).to(self.device) 91 | test_dataset = torch.utils.data.TensorDataset(X, Y) 92 | testloader = torch.utils.data.DataLoader(test_dataset, batch_size=4096, 93 | shuffle=False) 94 | probs = self._get_probs(testloader) 95 | # print("Eval accuracy: ", acc) 96 | return probs 97 | 98 | def _get_probs(self, testloader: torch.utils.data.DataLoader): 99 | softmax_logits = [] 100 | with torch.no_grad(): 101 | for data in testloader: 102 | vectors, labels = data 103 | outputs = self.m(vectors) 104 | probs = torch.softmax(outputs.data, dim=1) 105 | softmax_logits.append(probs.cpu().numpy()) 106 | 107 | return np.array(softmax_logits) 108 | 109 | def train(self, X_train: np.ndarray, Y_train: np.ndarray, 110 | X_dev: np.ndarray, Y_dev: np.ndarray, epochs=1, save_path=None, 111 | use_wandb: bool = False) -> float: 112 | X_train = torch.tensor(X_train).to(self.device) 113 | Y_train = torch.tensor(Y_train).to(self.device) 114 | X_dev = torch.tensor(X_dev).to(self.device) 115 | Y_dev = torch.tensor(Y_dev).to(self.device) 116 | 117 | train_dataset = torch.utils.data.TensorDataset(X_train, Y_train) 118 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, 119 | shuffle=True) 120 | dev_dataset = torch.utils.data.TensorDataset(X_dev, Y_dev) 121 | dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=2048, 122 | shuffle=False) 123 | 124 | criterion = torch.nn.CrossEntropyLoss() 125 | optimizer = torch.optim.Adam(self.m.parameters(), lr=0.0001) 126 | 127 | acc = self._eval(dev_loader) 128 | best_acc = -1 129 | 130 | print("Dev accuracy before training: ", acc) 131 | if use_wandb: 132 | wandb.run.summary['dev_acc_no_ft'] = acc 133 | 134 | if save_path: 135 | torch.save(self.m, save_path) 136 | 137 | for epoch in range(epochs): # loop over the dataset multiple times 138 | running_loss = 0.0 139 | for i, data in enumerate(train_loader, 0): 140 | # get the inputs; data is a list of [inputs, labels] 141 | inputs, labels = data 142 | 143 | # zero the parameter gradients 144 | optimizer.zero_grad() 145 | 146 | # forward + backward + 
optimize 147 | outputs = self.m(inputs) 148 | loss = criterion(outputs, labels) 149 | loss.backward() 150 | optimizer.step() 151 | 152 | # print statistics 153 | running_loss += loss.item() 154 | if i % 2000 == 1999: # print every 2000 mini-batches 155 | print('[%d, %5d] loss: %.3f' % 156 | (epoch + 1, i + 1, running_loss / 2000)) 157 | running_loss = 0.0 158 | 159 | acc = self._eval(dev_loader) 160 | print("Dev acc during training:", acc) 161 | if use_wandb: 162 | wandb.log({'dev_acc': acc}) 163 | if acc > best_acc: 164 | best_acc = acc 165 | if use_wandb: 166 | wandb.run.summary['dev_best_acc'] = best_acc 167 | wandb.run.summary['dev_best_epoch'] = epoch 168 | 169 | if save_path: 170 | print("New best dev acc reached. Saving model to", save_path) 171 | torch.save(self.m, save_path) 172 | 173 | print('Finished Training') 174 | 175 | acc = self._eval(dev_loader) 176 | 177 | print("Dev accuracy after training: ", acc) 178 | 179 | return acc 180 | 181 | # def get_weights(self, layer) -> Tuple[np.ndarray, np.ndarray]: 182 | # if len(self.m) > 1: 183 | # return self.m[1].weight.detach().cpu().numpy(), self.m[1].bias.detach().cpu().numpy() 184 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/data_preparation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | percentiles_list = [50] 4 | 5 | 6 | def create_labeled_data(vecs, labels_seq, pos2i=None): 7 | x = [] 8 | y = [] 9 | 10 | if not pos2i: 11 | # using `sorted` function to make this process deterministic 12 | pos2i = {p: i for i, p in enumerate(sorted(set([item for sublist in labels_seq for item in sublist])))} 13 | 14 | for label, vec in zip(labels_seq, vecs): 15 | for l, v in zip(label, vec): 16 | x.append(v) 17 | y.append(pos2i[l]) 18 | 19 | return np.array(x), np.array(y), pos2i 20 | 21 | 22 | def create_subword_data(vecs, sentences_seq): 23 | x = [] 24 | y = [] 25 | 26 | for sentence, vec in zip(sentences_seq, vecs): 27 | for w, v in zip(sentence, vec): 28 | x.append(v) 29 | if w.startswith('##'): 30 | label = 1 31 | else: 32 | label = 0 33 | 34 | y.append(label) 35 | 36 | return np.array(x), np.array(y) 37 | 38 | 39 | def create_word_length_data(vecs, sentences_seq): 40 | x = [] 41 | y = [] 42 | 43 | for sentence, vec in zip(sentences_seq, vecs): 44 | for w, v in zip(sentence, vec): 45 | x.append(v) 46 | if len(w) < 4: 47 | label = 'short' 48 | else: 49 | label = 'long' 50 | 51 | y.append(label) 52 | 53 | return np.array(x), np.array(y) 54 | 55 | 56 | def create_character_data(vecs, sentences_seq): 57 | x = [] 58 | y = [] 59 | 60 | for sentence, vec in zip(sentences_seq, vecs): 61 | for w, v in zip(sentence, vec): 62 | x.append(v) 63 | if len(w) == 1 or w[1] in 'aeiou': 64 | label = 'vowel' 65 | else: 66 | label = 'consonant' 67 | 68 | y.append(label) 69 | 70 | return np.array(x), np.array(y) 71 | 72 | 73 | def create_word_index_data(vecs, normalize=True): 74 | x = [] 75 | y = [] 76 | 77 | for vec in vecs: 78 | word_index = 0 79 | for v in vec: 80 | x.append(v) 81 | if normalize: 82 | y.append(float(word_index) / vec.shape[0]) 83 | else: 84 | y.append(word_index) 85 | word_index += 1 86 | 87 | return np.array(x), np.array(y) 88 | 89 | 90 | def get_label_in_percentile(val, percentiles): 91 | for i in range(len(percentiles)): 92 | if val < percentiles[i]: 93 | return i 94 | return len(percentiles) 95 | 96 | 97 | def create_word_index_data_discrete(vecs, percentiles_sen_len=None): 98 | x = [] 99 | y = [] 100 | 101 | if 
percentiles_sen_len is None: 102 | sen_lens = [x for xx in [list(range(x.shape[0])) for x in vecs] for x in xx] 103 | percentiles_sen_len = np.percentile(sen_lens, percentiles_list) 104 | 105 | for vec in vecs: 106 | for word_index, v in enumerate(vec): 107 | x.append(v) 108 | y.append(get_label_in_percentile(word_index, percentiles_sen_len)) 109 | 110 | return np.array(x), np.array(y), percentiles_sen_len 111 | 112 | 113 | def create_sentence_len_data(vecs, normalize=True, max_sen_len=None): 114 | x = [] 115 | y = [] 116 | 117 | if not max_sen_len: 118 | max_sen_len = max([x.shape[0] for x in vecs]) 119 | 120 | for vec in vecs: 121 | for v in vec: 122 | x.append(v) 123 | if normalize: 124 | y.append(vec.shape[0] / float(max_sen_len)) 125 | else: 126 | y.append(vec.shape[0]) 127 | 128 | return np.array(x), np.array(y), max_sen_len 129 | 130 | 131 | def create_sentence_len_data_discrete(vecs, sen_len_percentiles=None): 132 | x = [] 133 | y = [] 134 | 135 | if sen_len_percentiles is None: 136 | sen_lens = [x for xx in [[x.shape[0]] * x.shape[0] for x in vecs] for x in xx] 137 | sen_len_percentiles = np.percentile(sen_lens, percentiles_list) 138 | 139 | for vec in vecs: 140 | sen_y = get_label_in_percentile(vec.shape[0], sen_len_percentiles) 141 | for v in vec: 142 | x.append(v) 143 | y.append(sen_y) 144 | 145 | return np.array(x), np.array(y), sen_len_percentiles 146 | 147 | 148 | def filter_sen_len(vecs, labels, sentences): 149 | sen_lens = [len(x) for x in sentences] 150 | percentiles = np.percentile(sen_lens, [10, 95]) 151 | f_vecs, f_labels, f_sentences = [], [], [] 152 | for vec, label, sentence in zip(vecs, labels, sentences): 153 | sen_len = len(sentence) 154 | if sen_len < percentiles[0] or sen_len > percentiles[-1]: 155 | continue 156 | f_vecs.append(vec) 157 | f_labels.append(label) 158 | f_sentences.append(sentence) 159 | return f_vecs, f_labels, f_sentences 160 | 161 | 162 | def get_appropriate_data(task_type, vecs_train, labels_train, sentences_train, vecs_dev, labels_dev, sentences_dev): 163 | 164 | if task_type == 'word_ind': 165 | # x_train, y_train, word_inds_percentiles = create_word_index_data_discrete(vecs_train) 166 | x_train, y_train = create_word_index_data(vecs_train) 167 | # x_dev, y_dev, _ = create_word_index_data_discrete(vecs_dev, word_inds_percentiles) 168 | x_dev, y_dev = create_word_index_data(vecs_dev) 169 | elif task_type == 'sen_len': 170 | # note that in there's an override of the previous train/dev sentences set. 
171 | # This will also returned a filtered version of the data, 172 | # therefore, it might not be comparable to the other results, as the data is different 173 | vecs_train_f, labels_train_f, sentences_train = filter_sen_len(vecs_train, labels_train, sentences_train) 174 | vecs_dev_f, labels_dev_f, sentences_dev = filter_sen_len(vecs_dev, labels_dev, sentences_dev) 175 | 176 | # x_train, y_train, sen_len_percentiles = create_sentence_len_data_discrete(vecs_train) 177 | x_train, y_train, max_sen_len_train = create_sentence_len_data(vecs_train_f) 178 | # x_dev, y_dev, _ = create_sentence_len_data_discrete(vecs_dev, sen_len_percentiles=sen_len_percentiles) 179 | x_dev, y_dev, _ = create_sentence_len_data(vecs_dev_f, max_sen_len=max_sen_len_train) 180 | elif task_type == 'task': 181 | x_train, y_train, label2ind = create_labeled_data(vecs_train, labels_train) 182 | x_dev, y_dev, _ = create_labeled_data(vecs_dev, labels_dev, label2ind) 183 | elif task_type == 'subword': 184 | x_train, y_train = create_subword_data(vecs_train, sentences_train) 185 | x_dev, y_dev = create_subword_data(vecs_dev, sentences_dev) 186 | elif task_type == 'word_len': 187 | x_train, y_train = create_word_length_data(vecs_train, sentences_train) 188 | x_dev, y_dev = create_word_length_data(vecs_dev, sentences_dev) 189 | elif task_type == 'vowel': 190 | x_train, y_train = create_character_data(vecs_train, sentences_train) 191 | x_dev, y_dev = create_character_data(vecs_dev, sentences_dev) 192 | else: 193 | print('task: {} is not supported'.format(task_type)) 194 | raise ValueError('task not supported') 195 | 196 | words_train = [w for sen in sentences_train for w in sen] 197 | words_dev = [w for sen in sentences_dev for w in sen] 198 | return (x_train, y_train, words_train), (x_dev, y_dev, words_dev) 199 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/remove_property.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | remove_property.py [--vecs=VECS] [--labels=LABELS] [--out_dir=OUT_DIR] 4 | [--n_cls=N_CLS] [--task=TASK] [--input_dim=INPUT_DIM] [--max_iter=MAX_ITER] 5 | [--balance_data=BALANCE_DATA] [--n=N] 6 | [--wandb] 7 | 8 | Options: 9 | -h --help show this help message and exit 10 | --vecs=VECS input vectors file. using the train path (and automatically also using the dev, 11 | by replacing train by dev) 12 | --labels=LABELS labels file. using the train path (and automatically also using the dev, 13 | by replacing train by dev) 14 | --out_dir=OUT_DIR logs and outputs directory 15 | --n_cls=N_CLS number of classifiers to use [default: 20] 16 | --task=TASK task type. between word_ind, sen_len, task [default: task] 17 | --input_dim=INPUT_DIM input dimension [default: 768] 18 | --max_iter=MAX_ITER maximum iteration for the linear model [default: 10000] 19 | --balance_data=BALANCE_DATA balance data based on the labels. 
[default: false] 20 | --n=N number of training examples [default: 100000] 21 | --wandb log using wandb 22 | 23 | """ 24 | 25 | import json 26 | import os 27 | from collections import Counter 28 | 29 | import numpy as np 30 | import pandas as pd 31 | import wandb 32 | from docopt import docopt 33 | from sklearn.utils import shuffle 34 | from torch.utils.tensorboard import SummaryWriter 35 | 36 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 37 | from amnesic_probing.tasks.utils import read_files, get_projection_matrix, get_regression_pls, classification_tasks 38 | 39 | np.random.seed(0) 40 | 41 | 42 | def balance_data(x, y): 43 | df = pd.DataFrame.from_dict({'x': x.tolist(), 'y': y}) 44 | g = df.groupby('y') 45 | g = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))) 46 | return np.array(g['x'].tolist()), np.array(g['y'].tolist()) 47 | 48 | 49 | def log_wandb(arguments): 50 | labels = arguments['--labels'].split('.')[0] 51 | task_type = labels.split('/')[-1] 52 | 53 | task = arguments['--task'] 54 | if task != 'task': 55 | task_type = task 56 | if task in ['word_ind', 'sen_len']: 57 | classification_type = 'regression' 58 | else: 59 | classification_type = 'classification' 60 | 61 | data_orig = labels.split('data/')[1].split('/')[0] 62 | print(labels) 63 | print(data_orig) 64 | dataset = data_orig.split('_output', 1)[0] 65 | masking = data_orig.rsplit('_', 1)[1] 66 | 67 | layer_str = arguments['--vecs'].split('/')[-1].rsplit('.', maxsplit=1)[0] 68 | if 'layer' in layer_str: 69 | layer = str(layer_str.split(':')[1]) 70 | else: 71 | layer = 'last' 72 | 73 | config = dict( 74 | property=task_type, 75 | encoder='bert-base-uncased', 76 | dataset=dataset, 77 | masking=masking, 78 | layer=layer 79 | ) 80 | 81 | wandb.init( 82 | name=task_type + f'_{layer}_inlp', 83 | project="amnesic_probing", 84 | tags=["inlp", task_type, classification_type], 85 | config=config, 86 | ) 87 | 88 | 89 | if __name__ == '__main__': 90 | arguments = docopt(__doc__) 91 | 92 | if arguments['--wandb']: 93 | log_wandb(arguments) 94 | 95 | out_dir = arguments['--out_dir'] 96 | 97 | os.makedirs(out_dir, exist_ok=True) 98 | if os.path.isfile(out_dir + '/P.npy'): 99 | print('matrix already exists... 
skipping training') 100 | exit(0) 101 | writer = SummaryWriter(out_dir) 102 | 103 | in_dim = int(arguments['--input_dim']) 104 | 105 | sentence_file = arguments['--vecs'].rsplit('/', 1)[0] + '/' + 'tokens.pickle' 106 | 107 | vecs_train, labels_train, sentences_train = read_files(arguments['--vecs'], arguments['--labels'], 108 | sentence_file, 109 | ignore_special_tokens=True) 110 | vecs_dev, labels_dev, sentences_dev = read_files(arguments['--vecs'].replace('train', 'dev'), 111 | arguments['--labels'].replace('train', 'dev'), 112 | sentence_file.replace('train', 'dev'), 113 | ignore_special_tokens=True) 114 | 115 | print('#sentences', len(vecs_train)) 116 | 117 | task = arguments['--task'] 118 | 119 | (x_train, y_train, _), (x_dev, y_dev, _) = get_appropriate_data(task, vecs_train, labels_train, sentences_train, 120 | vecs_dev, labels_dev, sentences_dev) 121 | 122 | if bool(arguments['--balance_data'] == 'true'): 123 | x_train, y_train = balance_data(x_train, y_train) 124 | x_dev, y_dev = balance_data(x_dev, y_dev) 125 | 126 | n_classes = len(set(y_train)) 127 | majority = Counter(y_dev).most_common(1)[0][1] / float(len(y_dev)) 128 | print('number of classes:', n_classes) 129 | print('most common class (dev):', majority) 130 | 131 | if arguments['--wandb']: 132 | wandb.run.summary['n_classes'] = n_classes 133 | wandb.run.summary['majority'] = majority 134 | 135 | num_clfs = int(arguments['--n_cls']) 136 | max_iter = int(arguments['--max_iter']) 137 | 138 | # setting n to be the minimum between the number of examples 139 | # in the training data and the provided amount 140 | n = min(int(arguments['--n']), len(y_train)) 141 | print('using {} training examples'.format(n)) 142 | 143 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=n) 144 | if task in classification_tasks: 145 | P, all_projections, best_projection = get_projection_matrix(num_clfs, 146 | x_train, y_train, x_dev, y_dev, 147 | majority_acc=majority, max_iter=max_iter, 148 | summary_writer=writer) 149 | else: 150 | P, all_projections, best_projection = get_regression_pls(num_clfs, x_train, 151 | y_train, x_dev, y_dev, dim=in_dim, 152 | majority_acc=majority, 153 | summary_writer=writer) 154 | 155 | for i, projection in enumerate(all_projections): 156 | np.save(out_dir + '/P_{}.npy'.format(i), projection) 157 | 158 | np.save(out_dir + '/P.npy', best_projection[0]) 159 | 160 | if task in classification_tasks: 161 | removed_directions = int((best_projection[1]) * n_classes) 162 | # in case of 2 classes, each inlp iteration we remove a single direction 163 | if n_classes == 2: 164 | removed_directions /= 2 165 | else: # in regression tasks, each iteration we remove a single dimension 166 | removed_directions = int((best_projection[1])) 167 | 168 | meta_dic = {'best_i': best_projection[1], 169 | 'n_classes': n_classes, 170 | 'majority': majority, 171 | 'removed_directions': removed_directions} 172 | 173 | if arguments['--wandb']: 174 | wandb.run.summary['best_i'] = best_projection[1] 175 | wandb.run.summary['removed_directions'] = removed_directions 176 | 177 | json.dump(meta_dic, open(out_dir + '/meta.json', 'w')) 178 | 179 | print('done iterations. 
exiting...') 180 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/bert_encoding.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | import numpy as np 4 | import torch 5 | import transformers 6 | 7 | 8 | def forward_from_specific_layer(model, layer_number: int, layer_representation: torch.Tensor): 9 | """ 10 | :param model: a BertForMaskedLM model 11 | :param layer_representation: a torch tensor, dims: [1, seq length, 768] 12 | Return: 13 | states, a numpy array. dims: [#LAYERS - layer_number, seq length, 768] 14 | last_state_after_batch_norm: np array, after batch norm. dims: [seq_length, 768] 15 | """ 16 | 17 | layers = model.bert.encoder.layer[layer_number:] 18 | h = layer_representation 19 | states = [] 20 | 21 | for layer in layers: 22 | h = layer(h)[0] 23 | states.append(h) 24 | 25 | last_state_after_batch_norm = model.cls.predictions.transform(states[-1]).detach().cpu().numpy()[0] 26 | 27 | for i, s in enumerate(states): 28 | states[i] = s.detach().cpu().numpy()[0] 29 | 30 | return np.array(states), last_state_after_batch_norm 31 | 32 | 33 | def lm_encoding(text, lm_model, tokenizer, only_last_layer=True): 34 | device = next(lm_model.parameters()).device 35 | # Encode text 36 | input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)]).to(device) 37 | with torch.no_grad(): 38 | last_hidden_states = lm_model(input_ids) 39 | 40 | # [1] - the second tuple with all hidden layers, 41 | # [-1] - the last layer 42 | # [0] bs = 0 43 | 44 | if only_last_layer: 45 | rep_state = last_hidden_states[1][-1][0] # [1: -1] 46 | else: 47 | rep_state = [last_hidden_states[1][i][0] for i in range(len(last_hidden_states[1]))] 48 | 49 | last_state_after_batch_norm = lm_model.cls.predictions.transform(rep_state if only_last_layer else rep_state[-1]) 50 | 51 | # detach 52 | 53 | if only_last_layer: 54 | rep_state = rep_state.detach().cpu().numpy() 55 | else: 56 | rep_state = [x.detach().cpu().numpy() for x in rep_state] 57 | 58 | last_state_after_batch_norm = last_state_after_batch_norm.detach().cpu().numpy() 59 | 60 | # rep_state: a list of tensors. shape: (len, 768) if only_last_layer; else (num_layers, len, 768) 61 | # last_state_after_batch_norm: a tensor, shape: (len, 768) 62 | 63 | return last_state_after_batch_norm, rep_state 64 | # w/o the special tokens, of the first (currently only) sentence 65 | # return last_hidden_states[0, 1:-1, :].detach().numpy() 66 | 67 | 68 | def lm_masked_encoding(text, lm_model, tokenizer: transformers.PreTrainedTokenizer, batch_size=8, only_last_layer=True): 69 | """ 70 | CUDA_VISIBLE_DEVICES=2 python amnesic_probing/encoders/encode.py 71 | --input_file=ud/en-universal-dev.conll 72 | --output_dir=out/conll_masked_dev 73 | --encoder=bert-base-uncased 74 | --format=conll 75 | --encode_format=masked 76 | --device=cuda:0 77 | """ 78 | device = next(lm_model.parameters()).device 79 | # logger.warning(f"BERT device found to be {device}. 
" 80 | # f"We assume the entire model is only on this device (no multiple gpus)!") 81 | 82 | input_ids = tokenizer.encode(text) 83 | 84 | masked_inputs = [] 85 | 86 | for tok_ix in range(len(input_ids)): 87 | masked_input = list(input_ids) 88 | masked_input[tok_ix] = tokenizer.mask_token_id 89 | masked_inputs.append(torch.tensor(masked_input).unsqueeze(0).to(device)) # SHAPE: (1, len) 90 | 91 | # masked_inputs is (len, 1, len); masked_inputs[i][j] will be the representation of word j on the version where word i is maked. 92 | 93 | # starting from 1 [for i in range(1, ... ] because we're not interested in masking CLS 94 | batches = [{'tensors': torch.cat(masked_inputs[i:i + batch_size], dim=0), 95 | 'masked_token_ixs': list(range(i, i + len(masked_inputs[i:i + batch_size])))} 96 | for i in range(0, len(input_ids), batch_size)] # SHAPE: (batch_size, len) 97 | 98 | # batches is (num_batches_per_sent, batch_size, len) 99 | 100 | all_last_states_after_batch_norm = [] 101 | all_rep_states = [] 102 | 103 | with torch.no_grad(): 104 | for batch in batches: 105 | tensors = batch['tensors'] 106 | masked_token_ixs = batch['masked_token_ixs'] 107 | last_hidden_states = lm_model(tensors) 108 | 109 | # [1] - the second tuple with all hidden layers, 110 | # [-1] - the last layer 111 | 112 | rep_state = last_hidden_states[1][-1] if only_last_layer else list(last_hidden_states[1]) 113 | last_state_after_batch_norm = lm_model.cls.predictions.transform( 114 | rep_state if only_last_layer else rep_state[-1]) # SHAPE: (batch_size, len, emb) 115 | 116 | # to numpy 117 | 118 | last_state_after_batch_norm = last_state_after_batch_norm.detach().cpu().numpy() 119 | rep_state = rep_state.detach().cpu().numpy() if only_last_layer else np.array( 120 | [h.detach().cpu().numpy() for h in rep_state]) 121 | if not only_last_layer: 122 | rep_state = np.swapaxes(rep_state, 0, 123 | 1) # now shape is (batch_size, num_layers, len, 768) 124 | last_state_after_batch_norm = list(last_state_after_batch_norm) 125 | 126 | assert len(last_state_after_batch_norm) == len( 127 | masked_token_ixs), f"{len(last_state_after_batch_norm)}:{len(masked_token_ixs)}" 128 | last_state_after_batch_norm = [emb[tok_ix] for emb, tok_ix in 129 | zip(last_state_after_batch_norm, 130 | masked_token_ixs)] # for each element in the batch (emb), choose only the index of the masked token (given by tok_ix) 131 | # last_state_after_batch_norm is (batch_size, 768) 132 | 133 | rep_state = [emb[tok_ix] if only_last_layer else emb[:, tok_ix] for emb, tok_ix in 134 | zip(rep_state, masked_token_ixs)] 135 | # shape: (batch_size, 768) if only_last_layer else (batch_size, num_layers, 768) 136 | 137 | last_state_after_batch_norm = [np.expand_dims(x, 0) for x in last_state_after_batch_norm] 138 | rep_state = [np.expand_dims(x, 0) if only_last_layer else x for x in rep_state] 139 | 140 | all_last_states_after_batch_norm.extend(last_state_after_batch_norm) 141 | all_rep_states.extend(rep_state) 142 | 143 | all_last_states_after_batch_norm = np.concatenate(all_last_states_after_batch_norm, axis=0) 144 | if only_last_layer: 145 | all_rep_states = np.concatenate(all_rep_states, axis=0) 146 | else: 147 | all_rep_states = np.array(all_rep_states) 148 | all_rep_states = np.swapaxes(all_rep_states, 0, 1) 149 | 150 | # all_rep_states shape: (len, 768) if only_last_layer else (num_layers, len, 768) 151 | # all_last_states_after_batch_norm shape: (len, 768) 152 | 153 | return all_last_states_after_batch_norm, all_rep_states 154 | 155 | 156 | def bert_based_encoding(text, encoder, 
tokenizer): 157 | input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)]) 158 | with torch.no_grad(): 159 | last_hidden_states = encoder.encode(input_ids) 160 | 161 | # [1] - the second tuple with all hidden layers, 162 | # [-1] - the last layer 163 | # [0] bs = 0 164 | # [1: -1] ignoring the special characters that were added before and after the sentence (cls) 165 | rep_state = last_hidden_states # [1: -1] 166 | 167 | return rep_state.detach().numpy() 168 | 169 | 170 | @lru_cache(maxsize=None) 171 | def word_tokenize(tokenizer, word): 172 | tokenized_word = tokenizer.tokenize(word) 173 | n_subwords = len(tokenized_word) 174 | return tokenized_word, n_subwords 175 | 176 | 177 | def tokenize_and_preserve_labels(sentence, text_labels, tokenizer): 178 | """ 179 | taken from: https://github.com/chambliss/Multilingual_NER/blob/master/python/utils/main_utils.py#L118 180 | Word piece tokenization makes it difficult to match word labels 181 | back up with individual word pieces. This function tokenizes each 182 | word one at a time so that it is easier to preserve the correct 183 | label for each subword. It is, of course, a bit slower in processing 184 | time, but it will help our model achieve higher accuracy. 185 | """ 186 | 187 | tokenized_sentence = [] 188 | labels = [] 189 | 190 | for word, label in zip(sentence, text_labels): 191 | # Tokenize the word and count # of subwords the word is broken into 192 | tokenized_word, n_subwords = word_tokenize(tokenizer, word) 193 | 194 | # Add the tokenized word to the final tokenized word list 195 | tokenized_sentence.extend(tokenized_word) 196 | 197 | # Add the same label to the new list of labels `n_subwords` times 198 | labels.extend([label] * n_subwords) 199 | 200 | return tokenized_sentence, labels 201 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections import defaultdict, Counter 3 | 4 | import numpy as np 5 | import torch 6 | from scipy.special import softmax 7 | from scipy.stats import entropy 8 | from sklearn.cross_decomposition import PLSRegression 9 | from sklearn.linear_model import SGDClassifier, Ridge 10 | from torch.nn.functional import kl_div 11 | from tqdm import tqdm 12 | from transformers import BertTokenizer, BertForMaskedLM 13 | 14 | from amnesic_probing.debias.debias import get_debiasing_projection, debias_by_specific_directions, get_pls_projection 15 | from amnesic_probing.debiased_finetuning.utils import define_network 16 | 17 | classification_tasks = ['task', 'word_len', 'subword', 'vowel'] 18 | 19 | 20 | def get_lm_logits(x, w, b): 21 | logits = np.dot(w, x.T) + np.array([b]).repeat(x.shape[0], axis=0).T 22 | return logits 23 | 24 | 25 | def get_lm_predictions(tokenizer, w, b, x): 26 | logits = get_lm_logits(x, w, b) 27 | y = logits.argmax(axis=0) 28 | return tokenizer.convert_ids_to_tokens(y) 29 | 30 | 31 | def get_top_k_lm_predictions(tokenizer, w, b, x, k=20): 32 | logits = get_lm_logits(x, w, b) 33 | top_y = logits.argsort(axis=0)[-k:][::-1] 34 | top_words = [] 35 | for top_k_per_word in top_y: 36 | top_k = tokenizer.convert_ids_to_tokens(top_k_per_word) 37 | top_words.append(top_k) 38 | return top_words 39 | 40 | 41 | def get_top_k_lm_predictions_gpu(tokenizer, w, b, x, y, projection: np.ndarray = None, k=100, device: str = 'cpu'): 42 | network = define_network(w, b, projection_mat=projection, device=device) 43 | 
distribution = network.get_probs(x, y)[0] 44 | top_y = torch.tensor(distribution).to(device).topk(k=k, dim=1, largest=True, sorted=True).indices.cpu().numpy() 45 | top_words = [] 46 | for top_k_per_word in top_y: 47 | top_k = tokenizer.convert_ids_to_tokens(top_k_per_word) 48 | top_words.append(top_k) 49 | return top_words 50 | 51 | 52 | def get_lm_predictions_gpu(w, b, x, y, projection: np.ndarray = None, device: str = 'cpu'): 53 | network = define_network(w, b, projection_mat=projection, device=device) 54 | accuracy = network.eval(x, y) 55 | return accuracy 56 | 57 | 58 | def get_lm_softmax_gpu(w, b, x, y, device: str): 59 | network = define_network(w, b, device=device) 60 | distribution = network.get_probs(x, y) 61 | return distribution 62 | 63 | 64 | def data_projection(x, projection_matrix): 65 | return x.dot(projection_matrix) 66 | 67 | 68 | def dropout_control(x, n_coord): 69 | all_indices = np.array(range(x.shape[1])) 70 | np.random.shuffle(all_indices) 71 | random_indices = all_indices[:n_coord] 72 | x_rand_dropout = x.copy() 73 | x_rand_dropout[:, random_indices] = 0 74 | return x_rand_dropout 75 | 76 | 77 | def rand_direction_control(x, n_coord): 78 | dim = x.shape[1] 79 | # creating random directions (vectors) within the range of -0.5 : 0.5 80 | rand_directions = [np.random.rand(1, dim) - 0.5 for _ in range(n_coord)] 81 | 82 | # finding the null-space of random directions 83 | rand_direction_p = debias_by_specific_directions(rand_directions, dim) 84 | 85 | # and projecting the original data into that space (to remove random directions) 86 | x_rand_direction = rand_direction_p.dot(x.T).T 87 | return x_rand_direction 88 | 89 | 90 | def learn_cls(x_train, y_train, x_dev, y_dev): 91 | clf = SGDClassifier(warm_start=True, loss='log', n_jobs=-1, max_iter=10000, random_state=0, early_stopping=True) 92 | 93 | clf.fit(x_train, y_train) 94 | acc = clf.score(x_dev, y_dev) 95 | return acc 96 | 97 | 98 | def learn_reg_cls(x_train, y_train, x_dev, y_dev): 99 | clf = Ridge(random_state=0) 100 | 101 | clf.fit(x_train, y_train) 102 | acc = clf.score(x_dev, y_dev) 103 | return acc 104 | 105 | 106 | def learn_pls_cls(x_train, y_train, x_dev, y_dev): 107 | clf = PLSRegression(n_components=100) 108 | 109 | clf.fit(x_train, y_train) 110 | acc = clf.score(x_dev, y_dev) 111 | return acc 112 | 113 | 114 | def read_files(vec_f, label_f, text_f=None, ignore_special_tokens=False): 115 | vecs = np.load(vec_f, allow_pickle=True) 116 | 117 | if ignore_special_tokens: 118 | vecs = np.array([x[1:-1] for x in vecs]) 119 | 120 | with open(label_f, 'rb') as f: 121 | labels = pickle.load(f) 122 | 123 | if text_f: 124 | with open(text_f, 'rb') as f: 125 | sentences = pickle.load(f) 126 | else: 127 | sentences = None 128 | 129 | return vecs, labels, sentences 130 | 131 | 132 | def get_projection_matrix(num_clfs, x_train, y_train, x_dev, y_dev, 133 | majority_acc, max_iter=500, summary_writer=None): 134 | clf = SGDClassifier 135 | params = {'warm_start': True, 'loss': 'log', 'n_jobs': -1, 'max_iter': max_iter, 'random_state': 0, 136 | 'early_stopping': True} 137 | dim = x_train.shape[1] 138 | 139 | P, _, _, all_projections, best_projection = get_debiasing_projection(clf, params, num_clfs, dim, 140 | is_autoregressive=True, 141 | min_accuracy=majority_acc, 142 | X_train=x_train, Y_train=y_train, 143 | X_dev=x_dev, Y_dev=y_dev, 144 | summary_writer=summary_writer) 145 | 146 | return P, all_projections, best_projection 147 | 148 | 149 | def get_regression_projection_matrix(num_clfs, x_train, y_train, x_dev, y_dev, dim, 
majority_acc, summary_writer=None): 150 | clf = Ridge 151 | params = {'random_state': 0} 152 | 153 | projection_matrix, _, _, all_projections, best_projection = get_debiasing_projection(clf, params, num_clfs, dim, 154 | is_autoregressive=True, 155 | min_accuracy=0, 156 | X_train=x_train, Y_train=y_train, 157 | X_dev=x_dev, Y_dev=y_dev, 158 | summary_writer=summary_writer) 159 | 160 | return projection_matrix, all_projections, best_projection 161 | 162 | 163 | def get_regression_pls(num_clfs, x_train, y_train, x_dev, y_dev, dim, majority_acc, summary_writer=None): 164 | projection_matrix, all_projections, best_projection = get_pls_projection(num_clfs, x_train, y_train, x_dev, y_dev, 165 | summary_writer=summary_writer) 166 | return projection_matrix, all_projections, best_projection 167 | 168 | 169 | def get_lm_vals(model_name): 170 | lm_model = BertForMaskedLM.from_pretrained(model_name, output_hidden_states=True) 171 | tokenizer = BertTokenizer.from_pretrained(model_name) 172 | out_embed = lm_model.cls.predictions.decoder.weight.detach().cpu().numpy() 173 | bias = lm_model.cls.predictions.decoder.bias.detach().cpu().numpy() 174 | return lm_model, tokenizer, out_embed, bias 175 | 176 | 177 | def predict_word(vec, projection, out_embed, bias, tokenizer): 178 | logits = np.dot(out_embed, vec) + bias 179 | am = tokenizer.convert_ids_to_tokens([logits.argmax()])[0] 180 | 181 | logits_P = np.dot(out_embed, projection.dot(vec.T).T) + bias 182 | amp = tokenizer.convert_ids_to_tokens([logits_P.argmax()])[0] 183 | 184 | return am, amp 185 | 186 | 187 | def dkl(w, b, x_orig, x_diff): 188 | logits = get_lm_logits(x_orig, w, b) 189 | logits_diff = get_lm_logits(x_diff, w, b) 190 | 191 | probs = softmax(logits, axis=1) 192 | probs_diff = softmax(logits_diff, axis=1) 193 | dkl = entropy(probs, probs_diff, axis=1) 194 | dkl_mean = dkl.mean() 195 | return dkl_mean 196 | 197 | 198 | def dkl_gpu(w, b, x_orig, x_diff, y, plain_probs: np.ndarray = None, device: str = 'cpu'): 199 | if plain_probs is None: 200 | probs = get_lm_softmax_gpu(w, b, x_orig, y, device=device) 201 | else: 202 | probs = plain_probs 203 | probs_diff = get_lm_softmax_gpu(w, b, x_diff, y, device=device) 204 | 205 | all_dkl = [] 206 | for batch_prob, batch_prob_diff in tqdm(zip(probs, probs_diff)): 207 | batch_dkl = kl_div(torch.tensor(batch_prob_diff).float().to(device).log(), 208 | torch.tensor(batch_prob).float().to(device), reduction='none')\ 209 | .sum(axis=1).cpu().numpy() 210 | all_dkl.extend(batch_dkl) 211 | 212 | dkl_mean = np.mean(all_dkl) 213 | return dkl_mean, probs 214 | 215 | 216 | def most_probable_label(words, labels): 217 | words_labels = defaultdict(list) 218 | 219 | for word, label in zip(words, labels): 220 | words_labels[word].append(label) 221 | 222 | most_probable_label_per_word = {} 223 | for word, label_list in words_labels.items(): 224 | most_probable_label_per_word[word] = Counter(label_list).most_common(1)[0][0] 225 | return most_probable_label_per_word 226 | 227 | 228 | def convert_words2labels(words, probable_labels, label2ind, most_common_label): 229 | 230 | labels_freqeuency = np.zeros(len(label2ind)) 231 | for word in words: 232 | labels_freqeuency[label2ind[probable_labels.get(word, most_common_label)]] += 1 233 | return labels_freqeuency 234 | 235 | 236 | def calc_entropy(x, y, y_labels, probable_labels, tokenizer, out_embed, bias, k, device): 237 | all_labels = list(set(y_labels)) 238 | ind2label = dict(enumerate(all_labels)) 239 | label2ind = {v: k for k, v in ind2label.items()} 240 | most_common_label 
= Counter(y_labels).most_common(1)[0][0] 241 | 242 | top_words = get_top_k_lm_predictions_gpu(tokenizer, out_embed, bias, x, y, None, k=k, 243 | device=device) 244 | all_dists = torch.tensor( 245 | [convert_words2labels(top_words[i], probable_labels, label2ind, most_common_label) for i in range(len(top_words))]).to(device) 246 | # this will be normlized to 2 247 | entropy_score = torch.distributions.Categorical(logits=all_dists).entropy().mean().cpu().numpy() 248 | return entropy_score 249 | -------------------------------------------------------------------------------- /amnesic_probing/tasks/lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | lm.py [--vecs=VECS] [--labels=LABELS] [--text=TEXT] 4 | [--task=TASK] 5 | [--deprobe_dir=DEPROBE_DIR] 6 | [--display_examples=DISPLAY_EXAMPLES] 7 | [--device=DEVICE] 8 | [--n=N] 9 | [--wandb] 10 | 11 | Options: 12 | -h --help show this help message and exit 13 | --vecs=VECS input vectors file 14 | --labels=LABELS labels file 15 | --text=TEXT text file 16 | --task=TASK task type. between word_ind, sen_len, task [default: task] 17 | --deprobe_dir=DEPROBE_DIR directory where the amnesic_probing files are located. 18 | --display_examples=DISPLAY_EXAMPLES number of examples to display [default: 10] 19 | --device=DEVICE cpu, cuda:0, cuda:1, ... [default: cpu] 20 | --n=N number of training examples [default: 100000] 21 | --wandb log using wandb 22 | 23 | """ 24 | 25 | import json 26 | import os 27 | from collections import Counter 28 | 29 | import numpy as np 30 | import pandas as pd 31 | import wandb 32 | from docopt import docopt 33 | from sklearn.utils import shuffle 34 | 35 | from amnesic_probing.tasks.data_preparation import get_appropriate_data 36 | from amnesic_probing.tasks.utils import data_projection, read_files, get_lm_vals, dropout_control, rand_direction_control, \ 37 | get_lm_predictions_gpu, dkl_gpu, learn_cls, learn_reg_cls, get_lm_predictions, calc_entropy, most_probable_label, \ 38 | classification_tasks 39 | 40 | _, tokenizer, out_embed, bias = get_lm_vals('bert-base-uncased') 41 | 42 | 43 | def unify_inputs(sentences, labels_seq, vecs, pos2ind): 44 | x = [] 45 | y = [] 46 | words = [] 47 | 48 | for sen, label, vec in zip(sentences, labels_seq, vecs): 49 | for w, l, v in zip(sen, label, vec): 50 | x.append(v) 51 | y.append(pos2ind[l]) 52 | words.append(w) 53 | 54 | return np.array(x), np.array(y), words 55 | 56 | 57 | def sentence_prediction_example(text_tokens, text_vecs, task_labels, projection_matrix): 58 | print('sentence: ', ' '.join(text_tokens)) 59 | print('token', 'lm predicted token', 'lm task-less predicted token', 'task label') 60 | outputs = [] 61 | predicted_tokens = get_lm_predictions(tokenizer, out_embed, bias, text_vecs) 62 | predicted_tokens_p = get_lm_predictions(tokenizer, out_embed, bias, 63 | data_projection(text_vecs, projection_matrix)) 64 | for true_word, y_hat, y_hat_p, y_task in zip(text_tokens, predicted_tokens, predicted_tokens_p, task_labels): 65 | print(true_word, y_hat, y_hat_p, y_task) 66 | outputs.append([true_word, y_hat, y_hat_p, y_task]) 67 | return outputs 68 | 69 | 70 | def eval_lm_performance(tokenizer, out_embed, bias, x, words, projection, n_coords, device='cpu'): 71 | y_ids = tokenizer.convert_tokens_to_ids(words) 72 | 73 | lm_results = {} 74 | base_acc = get_lm_predictions_gpu(out_embed, bias, x, y_ids, device=device) 75 | 76 | x_p = data_projection(x, projection) 77 | p_acc = get_lm_predictions_gpu(out_embed, bias, x_p, y_ids, 
device=device) 78 | 79 | x_dropout = dropout_control(x, n_coords) 80 | dropout_acc = get_lm_predictions_gpu(out_embed, bias, x_dropout, y_ids, device=device) 81 | 82 | x_rand_dir = rand_direction_control(x, n_coords) 83 | rand_dir_acc = get_lm_predictions_gpu(out_embed, bias, x_rand_dir, y_ids, device=device) 84 | 85 | lm_results['lm_acc_vanilla'] = base_acc 86 | lm_results['lm_acc_p'] = p_acc 87 | lm_results['lm_acc_dropout'] = dropout_acc 88 | lm_results['lm_acc_rand_dir'] = rand_dir_acc 89 | 90 | dkl_p, x_probs = dkl_gpu(out_embed, bias, x, x_p, y_ids, device=device) 91 | dkl_drop, _ = dkl_gpu(out_embed, bias, x, x_dropout, y_ids, x_probs, device=device) 92 | dkl_rand, _ = dkl_gpu(out_embed, bias, x, x_rand_dir, y_ids, x_probs, device=device) 93 | lm_results['dkl_p'] = dkl_p 94 | lm_results['dkl_dropout'] = dkl_drop 95 | lm_results['dkl_rand_dir'] = dkl_rand 96 | 97 | return lm_results 98 | 99 | 100 | def eval_topk_performance(tokenizer, out_embed, bias, x, words, projection, probable_labels, 101 | n_coords, y_train_labels, k=100, device='cpu'): 102 | y_ids = tokenizer.convert_tokens_to_ids(words) 103 | 104 | lm_labels_results = {} 105 | entropy_vanilla = calc_entropy(x, y_ids, y_train_labels, probable_labels, tokenizer, out_embed, bias, k, device) 106 | 107 | x_p = data_projection(x, projection) 108 | entropy_p = calc_entropy(x_p, y_ids, y_train_labels, probable_labels, tokenizer, out_embed, bias, k, device) 109 | 110 | x_dropout = dropout_control(x, n_coords) 111 | entropy_dropout = calc_entropy(x_dropout, y_ids, y_train_labels, probable_labels, tokenizer, out_embed, bias, k, device) 112 | 113 | x_rand_dir = rand_direction_control(x, n_coords) 114 | entropy_rand_dir = calc_entropy(x_rand_dir, y_ids, y_train_labels, probable_labels, tokenizer, out_embed, bias, k, device) 115 | 116 | lm_labels_results['top_k_entropy_vanilla'] = entropy_vanilla 117 | lm_labels_results['top_k_entropy_p'] = entropy_p 118 | lm_labels_results['top_k_entropy_dropout'] = entropy_dropout 119 | lm_labels_results['top_k_entropy_rand_dir'] = entropy_rand_dir 120 | return lm_labels_results 121 | 122 | 123 | def eval_task_performance(x_train, y_train, x_dev, y_dev, x_train_no_label, x_dev_no_label, task_type): 124 | task_results = {} 125 | if task_type in classification_tasks: 126 | acc = learn_cls(x_train, y_train, x_dev, y_dev) 127 | acc_inlp = learn_cls(x_train_no_label, y_train, x_dev_no_label, y_dev) 128 | else: 129 | acc = learn_reg_cls(x_train, y_train, x_dev, y_dev) 130 | acc_inlp = learn_reg_cls(x_train_no_label, y_train, x_dev_no_label, y_dev) 131 | 132 | task_results['task_acc_vanilla'] = acc 133 | task_results['task_acc_p'] = acc_inlp 134 | return task_results 135 | 136 | 137 | def log_wandb(arguments): 138 | task_name = arguments['--deprobe_dir'].split('models/lm/')[1] 139 | task_type = task_name.split('/')[0] 140 | layer = arguments['--vecs'].split('/')[-1].split('.')[0] 141 | 142 | labels = arguments['--labels'].split('.')[0] 143 | data_orig = labels.split('data/')[1].split('/')[0] 144 | print(labels) 145 | print(data_orig) 146 | dataset = data_orig.split('_output', 1)[0] 147 | masking = data_orig.rsplit('_', 1)[1] 148 | 149 | config = dict( 150 | property=task_type, 151 | encoder='bert-base-uncased', 152 | dataset=dataset, 153 | masking=masking, 154 | layer=layer 155 | ) 156 | 157 | wandb.init( 158 | name=task_type + '_eval', 159 | project="amnesic_probing", 160 | tags=["lm", "eval", task_type], 161 | config=config, 162 | ) 163 | 164 | 165 | def load_deprobing_params(in_file): 166 | with 
open(in_file, 'r') as f: 167 | meta = json.load(f) 168 | return meta 169 | 170 | 171 | if __name__ == '__main__': 172 | arguments = docopt(__doc__) 173 | 174 | deprobe_dir = arguments['--deprobe_dir'] 175 | if not os.path.isdir(deprobe_dir): 176 | raise FileNotFoundError('Deprobing directory does not exist...') 177 | 178 | use_wandb = arguments['--wandb'] 179 | if use_wandb: 180 | log_wandb(arguments) 181 | 182 | vecs_train, labels_train, sentences_train = read_files(arguments['--vecs'], 183 | arguments['--labels'], 184 | arguments['--text'], ignore_special_tokens=True) 185 | vecs_dev, labels_dev, sentences_dev = read_files(arguments['--vecs'].replace('train', 'dev'), 186 | arguments['--labels'].replace('train', 'dev'), 187 | arguments['--text'].replace('train', 'dev'), 188 | ignore_special_tokens=True) 189 | 190 | task = arguments['--task'] 191 | 192 | (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train, labels_train, 193 | sentences_train, 194 | vecs_dev, labels_dev, 195 | sentences_dev) 196 | 197 | pos2ind = {p: i for i, p in enumerate(sorted(set([item for sublist in labels_train for item in sublist])))} 198 | 199 | print('number of classes', len(pos2ind)) 200 | print('most common class', Counter(y_dev).most_common(1)[0][1] / float(len(y_dev))) 201 | 202 | meta = load_deprobing_params(deprobe_dir + '/meta.json') 203 | n_coords = int(meta['removed_directions']) 204 | 205 | if use_wandb: 206 | wandb.run.summary['n_classes'] = len(pos2ind) 207 | wandb.run.summary['majority'] = Counter(y_dev).most_common(1)[0][1] / float(len(y_dev)) 208 | wandb.run.summary['removed_directions'] = n_coords 209 | 210 | proj_file = deprobe_dir + '/P.npy' 211 | 212 | if os.path.isfile(proj_file): 213 | P = np.load(proj_file) 214 | else: 215 | raise FileNotFoundError('projection file does not exist...') 216 | 217 | print('evaluating performance') 218 | 219 | device = arguments['--device'] 220 | lm_results = eval_lm_performance(tokenizer, out_embed, bias, x_dev, words_dev, P, 221 | n_coords=n_coords, device=device) 222 | if task in classification_tasks: 223 | probable_labels = most_probable_label(words_train, y_train) 224 | prediction_variety_results = eval_topk_performance(tokenizer, out_embed, bias, x_dev, words_dev, P, probable_labels, 225 | n_coords, y_train, k=20, device=device) 226 | else: 227 | prediction_variety_results = {} 228 | 229 | print('removing property from inputs') 230 | x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=min(len(y_train), int(arguments['--n']))) 231 | x_train_no_label = data_projection(x_train, P) 232 | x_dev_no_label = data_projection(x_dev, P) 233 | 234 | task_results = eval_task_performance(x_train, y_train, x_dev, y_dev, x_train_no_label, x_dev_no_label, task) 235 | 236 | all_results = {**lm_results, **prediction_variety_results, **task_results} 237 | if use_wandb: 238 | for k, val in all_results.items(): 239 | wandb.run.summary[k] = val 240 | 241 | table_data = [] 242 | ind = 0 243 | for i in range(int(arguments['--display_examples'])): 244 | for w, orig_y, P_y, y_label in sentence_prediction_example(sentences_dev[i], vecs_dev[i], labels_dev[i], P): 245 | table_data.append([w, orig_y, P_y, y_label, ind]) 246 | ind += 1 247 | table_data.append(['-', '-', '-', '-', ind]) 248 | ind += 1 249 | 250 | wandb.log({"examples": wandb.Table(data=table_data, columns=["word", "lm_word", "-p_word", "label", "index"])}) 251 | df = pd.DataFrame(table_data, columns=["word", "lm_word", "-p_word", "label", "index"]) 252 | 
df.to_csv(deprobe_dir + '/examples.tsv', sep='\t', index=False) 253 | -------------------------------------------------------------------------------- /amnesic_probing/debias/debias.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Dict 3 | from typing import List 4 | 5 | import numpy as np 6 | import scipy 7 | import wandb 8 | from sklearn.cross_decomposition import PLSRegression 9 | from sklearn.linear_model import Ridge 10 | from tqdm import tqdm 11 | 12 | from amnesic_probing.debias import classifier 13 | 14 | 15 | def get_rowspace_projection(W: np.ndarray) -> np.ndarray: 16 | """ 17 | :param W: the matrix whose rowspace to project onto 18 | :return: the projection matrix over the rowspace 19 | """ 20 | 21 | if np.allclose(W, 0): 22 | w_basis = np.zeros_like(W.T) 23 | else: 24 | w_basis = scipy.linalg.orth(W.T) # orthogonal basis 25 | 26 | w_basis = w_basis * np.sign(w_basis[0][0]) # handle sign ambiguity 27 | P_W = w_basis.dot(w_basis.T) # orthogonal projection on W's rowspace 28 | 29 | return P_W 30 | 31 | 32 | def get_projection_to_intersection_of_nullspaces(rowspace_projection_matrices: List[np.ndarray], input_dim: int): 33 | """ 34 | Given a list of rowspace projection matrices P_R(w_1), ..., P_R(w_n), 35 | this function calculates the projection to the intersection of all nullspaces of the matrices w_1, ..., w_n. 36 | uses the intersection-projection formula of Ben-Israel 2013 http://benisrael.net/BEN-ISRAEL-NOV-30-13.pdf: 37 | N(w1)∩ N(w2) ∩ ... ∩ N(wn) = N(P_R(w1) + P_R(w2) + ... + P_R(wn)) 38 | :param rowspace_projection_matrices: List[np.array], a list of rowspace projections 39 | :param input_dim: input dim 40 | """ 41 | 42 | I = np.eye(input_dim) 43 | Q = np.sum(rowspace_projection_matrices, axis=0) 44 | P = I - get_rowspace_projection(Q) 45 | 46 | return P 47 | 48 | 49 | def debias_by_specific_directions(directions: List[np.ndarray], input_dim: int): 50 | """ 51 | the goal of this function is to perform INLP on a set of user-provided directions 52 | (instead of learning those directions). 53 | :param directions: list of vectors, as numpy arrays. 54 | :param input_dim: dimensionality of the vectors. 55 | """ 56 | 57 | rowspace_projections = [] 58 | 59 | for v in directions: 60 | P_v = get_rowspace_projection(v) 61 | rowspace_projections.append(P_v) 62 | 63 | P = get_projection_to_intersection_of_nullspaces(rowspace_projections, input_dim) 64 | 65 | return P 66 | 67 | 68 | def get_debiasing_projection(classifier_class, cls_params: Dict, num_classifiers: int, input_dim: int, 69 | is_autoregressive: bool, 70 | min_accuracy: float, X_train: np.ndarray, Y_train: np.ndarray, X_dev: np.ndarray, 71 | Y_dev: np.ndarray, best_iter_diff=0.01, summary_writer=None) \ 72 | -> (np.ndarray, list, list, list, tuple): 73 | """ 74 | :param classifier_class: the sklearn classifier class (SVM/Perceptron etc.) 
75 | :param cls_params: a dictionary, containing the params for the sklearn classifier 76 | :param num_classifiers: number of iterations (equivalent to number of dimensions to remove) 77 | :param input_dim: size of input vectors 78 | :param is_autoregressive: whether to train the ith classifier on the data projected to the nullspaces of w1,...,wi-1 79 | :param min_accuracy: at or below this threshold, the learned classifier is ignored 80 | :param X_train: ndarray, training vectors 81 | :param Y_train: ndarray, training labels (protected attributes) 82 | :param X_dev: ndarray, eval vectors 83 | :param Y_dev: ndarray, eval labels (protected attributes) 84 | :param best_iter_diff: float, diff from majority, used to decide on best iteration 85 | :return: P, the debiasing projection; rowspace_projections, the list of all rowspace projection; 86 | Ws, the list of all classifiers. 87 | """ 88 | 89 | X_train_cp = X_train.copy() 90 | X_dev_cp = X_dev.copy() 91 | rowspace_projections = [] 92 | Ws = [] 93 | all_projections = [] 94 | best_projection = None 95 | iters_under_threshold = 0 96 | prev_acc = -99. 97 | iters_no_change = 0 98 | 99 | pbar = tqdm(range(num_classifiers)) 100 | for i in pbar: 101 | clf = classifier.SKlearnClassifier(classifier_class(**cls_params)) 102 | acc = clf.train_network(X_train_cp, Y_train, X_dev_cp, Y_dev) 103 | pbar.set_description("iteration: {}, accuracy: {}".format(i, acc)) 104 | if summary_writer is not None: 105 | summary_writer.add_scalar('dev_acc', acc, i) 106 | wandb.log({'dev_acc': acc}, step=i) 107 | 108 | if iters_under_threshold >= 3: 109 | print('3 iterations under the minimum accuracy.. stopping the process') 110 | break 111 | 112 | if acc <= min_accuracy and best_projection is not None: 113 | iters_under_threshold += 1 114 | continue 115 | 116 | if prev_acc == acc: 117 | iters_no_change += 1 118 | else: 119 | iters_no_change = 0 120 | 121 | if iters_no_change >= 3: 122 | print('3 iterations with no accuracy change.. stopping the process') 123 | break 124 | prev_acc = acc 125 | 126 | W = clf.get_weights() 127 | Ws.append(W) 128 | P_rowspace_wi = get_rowspace_projection(W) # projection to W's rowspace 129 | rowspace_projections.append(P_rowspace_wi) 130 | 131 | if is_autoregressive: 132 | """ 133 | to ensure numerical stability, explicitly project to the intersection of the nullspaces found so far 134 | (instead of doing X = P_iX, which is problematic when w_i is not exactly orthogonal to w_i-1,...,w1, 135 | due to e.g. inexact argmin calculation). 136 | """ 137 | # use the intersection-projection formula of Ben-Israel 2013 http://benisrael.net/BEN-ISRAEL-NOV-30-13.pdf: 138 | # N(w1)∩ N(w2) ∩ ... ∩ N(wn) = N(P_R(w1) + P_R(w2) + ... + P_R(wn)) 139 | 140 | P = get_projection_to_intersection_of_nullspaces(rowspace_projections, input_dim) 141 | all_projections.append(P) 142 | 143 | # project 144 | X_train_cp = X_train.dot(P) 145 | X_dev_cp = X_dev.dot(P) 146 | 147 | # the first iteration that gets closest performance (or less) to majority 148 | if (acc - min_accuracy) <= best_iter_diff and best_projection is None: 149 | print('projection saved timestamp: {}'.format(i)) 150 | best_projection = (P, i + 1) 151 | 152 | """ 153 | calculate final projection matrix P=PnPn-1....P2P1 154 | since w_i.dot(w_i-1) = 0, P2P1 = I - P1 - P2 (proof in the paper); this is more stable. 155 | by induction, PnPn-1....P2P1 = I - (P1+..+PN). 
We will use instead Ben-Israel's formula to increase stability, 156 | i.e., we explicitly project to intersection of all nullspaces (this is not critical at this point; I-(P1+...+PN) 157 | is roughly as accurate as this) 158 | """ 159 | 160 | P = get_projection_to_intersection_of_nullspaces(rowspace_projections, input_dim) 161 | 162 | if best_projection is None: 163 | print('projection saved timestamp: {}'.format(num_classifiers)) 164 | print('using all of the iterations as the final projection') 165 | best_projection = (P, num_classifiers) 166 | 167 | return P, rowspace_projections, Ws, all_projections, best_projection 168 | 169 | 170 | def get_debiasing_projection_by_cls(classifier_class, cls_params: Dict, num_classifiers: int, input_dim: int, 171 | is_autoregressive: bool, 172 | min_accuracy: float, X_train: np.ndarray, Y_train: np.ndarray, X_dev: np.ndarray, 173 | Y_dev: np.ndarray, by_class=True, Y_train_main=None, 174 | Y_dev_main=None, dropout_rate=0, summary_writer=None) -> ( 175 | np.ndarray, list, list, list, tuple): 176 | """ 177 | :param classifier_class: the sklearn classifier class (SVM/Perceptron etc.) 178 | :param cls_params: a dictionary, containing the params for the sklearn classifier 179 | :param num_classifiers: number of iterations (equivalent to number of dimensions to remove) 180 | :param input_dim: size of input vectors 181 | :param is_autoregressive: whether to train the ith classiifer on the data projected to the nullsapces of w1,...,wi-1 182 | :param min_accuracy: above this threshold, ignore the learned classifier 183 | :param X_train: ndarray, training vectors 184 | :param Y_train: ndarray, training labels (protected attributes) 185 | :param X_dev: ndarray, eval vectors 186 | :param Y_dev: ndarray, eval labels (protected attributes) 187 | :param by_class: if true, at each iteration sample one main-task label, and extract the protected attribute only 188 | from vectors from this class 189 | :param T_train_main: ndarray, main-task train labels 190 | :param Y_dev_main: ndarray, main-task eval labels 191 | :param dropout_rate: float, default: 0 (note: not recommended to be used with autoregressive=True) 192 | :return: P, the debiasing projection; rowspace_projections, the list of all rowspace projection; 193 | Ws, the list of all calssifiers. 194 | """ 195 | if dropout_rate > 0 and is_autoregressive: 196 | warnings.warn( 197 | "Note: when using dropout with autoregressive training, the property w_i.dot(w_(i+1)) = 0 no longer holds.") 198 | 199 | X_train_cp = X_train.copy() 200 | X_dev_cp = X_dev.copy() 201 | rowspace_projections = [] 202 | Ws = [] 203 | all_projections = [] 204 | best_projection = None 205 | removed_directions = 0 206 | 207 | all_labels = list(set(Y_train)) 208 | 209 | pbar = tqdm(range(len(all_labels)), ncols=600) 210 | for i in pbar: 211 | pbar_inner = tqdm(range(60), ncols=600) 212 | for j in pbar_inner: 213 | clf = classifier.SKlearnClassifier(classifier_class(**cls_params)) 214 | dropout_scale = 1. 
/ (1 - dropout_rate + 1e-6) 215 | dropout_mask = (np.random.rand(*X_train.shape) < (1 - dropout_rate)).astype(float) * dropout_scale 216 | 217 | cur_cls_inds_train = np.where(Y_train == i) 218 | y_train = np.zeros(len(Y_train)) 219 | y_train[cur_cls_inds_train] = 1 220 | cur_cls_inds_dev = np.where(Y_dev == i) 221 | y_dev = np.zeros(len(Y_dev)) 222 | y_dev[cur_cls_inds_dev] = 1 223 | 224 | if len(cur_cls_inds_train[0]) == 0 or len(cur_cls_inds_dev[0]) == 0: 225 | continue 226 | 227 | acc = clf.train_network((X_train_cp * dropout_mask), y_train, 228 | X_dev_cp, y_dev) 229 | acc = np.around(acc, decimals=3) 230 | maj = ((len(y_dev) - len(cur_cls_inds_dev[0])) / float(len(y_dev))) 231 | maj = np.around(maj, decimals=3) 232 | pbar_inner.set_description("iteration: {}, accuracy: {}, cls: {}, majority: {}" 233 | .format(i, acc, i, maj)) 234 | if summary_writer is not None: 235 | summary_writer.add_scalar('dev_acc', acc, i) 236 | wandb.log({'dev_acc': acc}, step=i) 237 | 238 | # if acc < min_accuracy: continue 239 | if abs(maj - acc) <= 0.005: 240 | break 241 | 242 | removed_directions += 1 243 | 244 | W = clf.get_weights() 245 | Ws.append(W) 246 | P_rowspace_wi = get_rowspace_projection(W) # projection to W's rowspace 247 | rowspace_projections.append(P_rowspace_wi) 248 | 249 | if is_autoregressive: 250 | """ 251 | to ensure numerical stability, explicitly project to the intersection of the nullspaces found so far 252 | (instaed of doing X = P_iX, which is problematic when w_i is not exactly orthogonal 253 | to w_i-1,...,w1, due to e.g inexact argmin calculation). 254 | """ 255 | # use the intersection-projection formula of Ben-Israel 2013 256 | # http://benisrael.net/BEN-ISRAEL-NOV-30-13.pdf: 257 | # N(w1)∩ N(w2) ∩ ... ∩ N(wn) = N(P_R(w1) + P_R(w2) + ... + P_R(wn)) 258 | 259 | P = get_projection_to_intersection_of_nullspaces(rowspace_projections, input_dim) 260 | all_projections.append(P) 261 | # project 262 | 263 | X_train_cp = X_train.dot(P) 264 | X_dev_cp = X_dev.dot(P) 265 | 266 | if abs(acc - min_accuracy) <= 0.01 and best_projection is None: 267 | print('projection saved timestamp: {}'.format(i)) 268 | best_projection = (P, i) 269 | 270 | """ 271 | calculae final projection matrix P=PnPn-1....P2P1 272 | since w_i.dot(w_i-1) = 0, P2P1 = I - P1 - P2 (proof in the paper); this is more stable. 273 | by induction, PnPn-1....P2P1 = I - (P1+..+PN). 
We will use instead Ben-Israel's formula to increase stability, 274 | i.e., we explicitly project to intersection of all nullspaces (this is not critical at this point; I-(P1+...+PN) 275 | is roughly as accurate as this) 276 | """ 277 | P = get_projection_to_intersection_of_nullspaces(rowspace_projections, input_dim) 278 | 279 | print('number of dimensions removed: {}'.format(removed_directions)) 280 | 281 | if best_projection is None: 282 | print('projection saved timestamp: {}'.format(num_classifiers)) 283 | best_projection = (P, num_classifiers) 284 | 285 | return P, rowspace_projections, Ws, all_projections, best_projection 286 | 287 | 288 | def get_pls_projection(num_classifiers: int, 289 | X_train: np.ndarray, Y_train: np.ndarray, X_dev: np.ndarray, 290 | Y_dev: np.ndarray, summary_writer=None) \ 291 | -> (np.ndarray, list, tuple): 292 | """ 293 | :param num_classifiers: number of iterations (equivalent to number of dimensions to remove) 294 | :param input_dim: size of input vectors 295 | :param min_accuracy: above this threshold, ignore the learned classifier 296 | :param X_train: ndarray, training vectors 297 | :param Y_train: ndarray, training labels (protected attributes) 298 | :param X_dev: ndarray, eval vectors 299 | :param Y_dev: ndarray, eval labels (protected attributes) 300 | :param best_iter_diff: float, diff from majority, used to decide on best iteration 301 | :return: P, the debiasing projection; the list of all calssifiers. 302 | """ 303 | 304 | all_projections = [] 305 | best_projection = None 306 | iters_under_threshold = 0 307 | prev_acc = -99. 308 | iters_no_change = 0 309 | 310 | print('training pls') 311 | pls = PLSRegression(n_components=num_classifiers) 312 | pls.fit(X_train, Y_train) 313 | 314 | x_dim = X_train.shape[1] 315 | pbar = tqdm(range(num_classifiers)) 316 | for i in pbar: 317 | weights = pls.x_weights_[:, :i + 1] 318 | P = np.eye(x_dim, x_dim) - get_rowspace_projection(weights.T) 319 | all_projections.append(P) 320 | 321 | x_train_p = X_train.dot(P) 322 | x_dev_p = X_dev.dot(P) 323 | 324 | clf = Ridge(random_state=0) 325 | 326 | clf.fit(x_train_p, Y_train) 327 | acc = clf.score(x_dev_p, Y_dev) 328 | 329 | pbar.set_description("iteration: {}, accuracy: {}".format(i, acc)) 330 | if summary_writer is not None: 331 | summary_writer.add_scalar('dev_acc', acc, i) 332 | wandb.log({'dev_acc': acc}, step=i) 333 | 334 | # the first iteration that gets closest performance (or less) to majority 335 | # if (acc - min_accuracy) <= best_iter_diff and best_projection is None: 336 | # print('projection saved timestamp: {}'.format(i)) 337 | # best_projection = (P, i + 1) 338 | 339 | # if best_projection is None: 340 | print('projection saved timestamp: {}'.format(num_classifiers)) 341 | print('using all of the iterations as the final projection') 342 | best_projection = (all_projections[-1], num_classifiers) 343 | 344 | return P, all_projections, best_projection 345 | -------------------------------------------------------------------------------- /amnesic_probing/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import pickle 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | from tqdm import tqdm 8 | from transformers import BertTokenizer, BertForMaskedLM, BertConfig, RobertaTokenizer, RobertaForMaskedLM, RobertaConfig 9 | 10 | from amnesic_probing.encoders.bert_encoding import lm_encoding, bert_based_encoding, \ 11 | lm_masked_encoding, 
tokenize_and_preserve_labels 12 | from amnesic_probing.models.qa import BertForQuestionAnsweringDeprobe 13 | 14 | models = { 15 | 'bert-base-uncased': { 16 | 'type': 'bert', 17 | 'name': 'bert-base-uncased', 18 | 'tokenizer': BertTokenizer, 19 | 'model': BertForMaskedLM, 20 | 'config': None, 21 | 'lower': True, 22 | 'mlm': True, 23 | }, 24 | 'roberta-base': { 25 | 'type': 'bert', 26 | 'name': 'roberta-base', 27 | 'tokenizer': RobertaTokenizer, 28 | 'model': RobertaForMaskedLM, 29 | 'config': None, 30 | 'lower': True, 31 | 'mlm': True, 32 | }, 33 | 'qa': { 34 | 'type': 'bert', 35 | 'name': 'deepset/bert-base-cased-squad2', 36 | 'tokenizer': BertTokenizer, 37 | 'model': BertForQuestionAnsweringDeprobe, 38 | 'config': BertConfig, 39 | 'lower': False, 40 | 'mlm': False, 41 | }, 42 | } 43 | 44 | 45 | def get_pretrained_models(model_type): 46 | params = models[model_type] 47 | 48 | model_name = params['name'] 49 | 50 | if params['config'] is not None: 51 | config = params['config'].from_pretrained( 52 | model_name, 53 | ) 54 | else: 55 | config = None 56 | tokenizer = params['tokenizer'].from_pretrained( 57 | model_name, 58 | do_lower_case=params['lower'], 59 | ) 60 | if params['mlm']: 61 | model = params['model'].from_pretrained( 62 | model_name, 63 | output_hidden_states=True, 64 | from_tf=False, 65 | config=config, 66 | ) 67 | else: 68 | model = params['model'].from_pretrained( 69 | model_name, 70 | from_tf=False, 71 | config=config, 72 | ) 73 | 74 | return model.eval(), tokenizer 75 | 76 | 77 | def encode_text(data, encoder, tokenizer, masked=False, only_last_layer=True): 78 | encoded_vectors = defaultdict(list) 79 | encoded_labels = defaultdict(list) 80 | 81 | for i, datum in enumerate(tqdm(data)): 82 | tokens = datum['text'] 83 | 84 | if type(encoder) in [BertForMaskedLM, RobertaForMaskedLM]: 85 | if masked: 86 | last_vecs, rep_vecs = lm_masked_encoding(' '.join(tokens), encoder, tokenizer, 87 | only_last_layer=only_last_layer) 88 | 89 | else: 90 | last_vecs, rep_vecs = lm_encoding(' '.join(tokens), encoder, tokenizer, only_last_layer=only_last_layer) 91 | 92 | encoded_vectors['last_vec'].append(last_vecs) 93 | encoded_vectors['rep_vec'].append(rep_vecs) 94 | else: 95 | rep_vecs = bert_based_encoding(' '.join(tokens), encoder, tokenizer) 96 | encoded_vectors['rep_vec'].append(rep_vecs) 97 | 98 | # going over all labels that were collected from the dataset 99 | for label_name, labels in datum['labels'].items(): 100 | tok_sen, tok_label = tokenize_and_preserve_labels(tokens, labels, tokenizer) 101 | encoded_labels[label_name].append(tok_label) 102 | 103 | # tok_sen are the tokens of the current sentence. 
It doesn't change over the previous loop 104 | # therefore the last one equals all the rest (it assumes that there's at least one label) 105 | encoded_labels['tokens'].append(tok_sen) 106 | 107 | return {'vectors': encoded_vectors, 'labels': encoded_labels} 108 | 109 | 110 | def to_file(encoded_data, output_dir, only_last_layer): 111 | if not os.path.isdir(output_dir): 112 | print('creating dir ', output_dir) 113 | os.makedirs(output_dir) 114 | 115 | for name, vals in encoded_data['vectors'].items(): 116 | 117 | if name == "rep_vec" and not only_last_layer: 118 | for layer in range(len(vals[0])): 119 | X = np.array([x[layer] for x in vals]) 120 | np.save(output_dir + '/vec_layer:{}.npy'.format(layer), X) 121 | else: 122 | np.save(output_dir + '/{}.npy'.format(name), np.array(vals)) 123 | 124 | for name, vals in encoded_data['labels'].items(): 125 | with open(output_dir + '/{}.pickle'.format(name), 'wb') as f: 126 | pickle.dump(vals, f) 127 | 128 | 129 | def read_conll_format(input_file): 130 | data = [] 131 | with open(input_file, 'r') as f: 132 | sen = [] 133 | tag = [] 134 | pos = [] 135 | dep = [] 136 | orig_vals = [] 137 | for line in tqdm(f): 138 | if line.strip() == '': 139 | pos_next_word = pos[1:] + ['EOL'] 140 | tag_next_word = tag[1:] + ['EOL'] 141 | data.append({'text': sen, 142 | 'labels': { 143 | 'tag': tag, 144 | 'pos': pos, 145 | 'dep': dep, 146 | 'orig_vals': orig_vals, 147 | 148 | 'pos_next_word': pos_next_word, 149 | 'tag_next_word': tag_next_word 150 | } 151 | }) 152 | sen = [] 153 | tag = [] 154 | pos = [] 155 | dep = [] 156 | orig_vals = [] 157 | continue 158 | vals = line.split('\t') 159 | sen.append(vals[1]) 160 | tag.append(vals[3]) 161 | pos.append(vals[4]) 162 | dep.append(vals[7]) 163 | orig_vals.append(vals) 164 | 165 | return data 166 | 167 | 168 | def read_onto_notes_format(input_file): 169 | data = [] 170 | for cur_file in tqdm(glob.glob(input_file + '/data/english/**/*.*gold_conll', recursive=True)): 171 | 172 | with open(cur_file, 'r') as in_f: 173 | sen = [] 174 | ner = [] 175 | np_start = [] 176 | np_end = [] 177 | phrase_start = [] 178 | phrase_end = [] 179 | prev_ner = '' 180 | for line in in_f: 181 | if line.startswith('#'): 182 | continue 183 | if line.strip() == '': 184 | data.append({'text': sen, 185 | 'labels': { 186 | 'ner': ner, 187 | 'phrase_start': phrase_start, 188 | 'phrase_end': phrase_end, 189 | 'np_start': np_start, 190 | 'np_end': np_end, 191 | } 192 | }) 193 | sen = [] 194 | ner = [] 195 | np_start = [] 196 | np_end = [] 197 | phrase_start = [] 198 | phrase_end = [] 199 | continue 200 | vals = line.split() 201 | sen.append(vals[3]) 202 | 203 | cur_ner = vals[10] 204 | if cur_ner.startswith('('): 205 | cur_ner = cur_ner[1:] 206 | prev_ner = cur_ner 207 | if cur_ner.endswith(')'): 208 | cur_ner = prev_ner[:-1] 209 | prev_ner = '' 210 | if prev_ner != '': 211 | cur_ner = prev_ner 212 | if cur_ner != '*' and cur_ner.endswith('*'): 213 | cur_ner = cur_ner[:-1] 214 | ner.append(cur_ner) 215 | 216 | constituency = vals[5] 217 | 218 | if '(NP' in constituency: 219 | np_start.append('S') 220 | else: 221 | np_start.append('NS') 222 | 223 | if 'NP)' in constituency: 224 | np_end.append('E') 225 | else: 226 | np_end.append('NE') 227 | 228 | if constituency.startswith('('): 229 | phrase_start.append('S') 230 | else: 231 | phrase_start.append('NS') 232 | 233 | if constituency.endswith(')'): 234 | phrase_end.append('E') 235 | else: 236 | phrase_end.append('NE') 237 | 238 | return data 239 | 240 | 241 | def read_sem_tagging_format(input_file): 242 | """ 
243 | https://www.aclweb.org/anthology/W17-6901/ 244 | https://arxiv.org/abs/1609.07053 245 | """ 246 | data = [] 247 | with open(input_file, 'r') as f: 248 | sen = [] 249 | semtag = [] 250 | orig_vals = [] 251 | for line in tqdm(f): 252 | if line.strip() == '': 253 | data.append({'text': sen, 254 | 'labels': { 255 | 'semtag': semtag, 256 | 'orig_vals': orig_vals, 257 | } 258 | }) 259 | sen = [] 260 | semtag = [] 261 | orig_vals = [] 262 | continue 263 | vals = line.split('\t') 264 | sen.append(vals[1]) 265 | semtag.append(vals[0]) 266 | orig_vals.append(vals) 267 | 268 | return data 269 | 270 | 271 | def read_coarse_sem_tagging_format(input_file): 272 | """ 273 | https://www.aclweb.org/anthology/W17-6901/ 274 | https://arxiv.org/abs/1609.07053 275 | """ 276 | mapping = COARSE_SEMTAG_MAPPING 277 | 278 | data = [] 279 | with open(input_file, 'r') as f: 280 | sen = [] 281 | semtag = [] 282 | orig_vals = [] 283 | for line in tqdm(f): 284 | if line.strip() == '': 285 | data.append({'text': sen, 286 | 'labels': { 287 | 'semtag': semtag, 288 | 'orig_vals': orig_vals, 289 | } 290 | }) 291 | sen = [] 292 | semtag = [] 293 | orig_vals = [] 294 | continue 295 | vals = line.split('\t') 296 | sen.append(vals[1]) 297 | semtag.append(mapping[vals[0]]) 298 | orig_vals.append(vals) 299 | 300 | return data 301 | 302 | 303 | def read_fce_format(input_file): 304 | """ 305 | Compositional Sequence Labeling Models for Error Detection in Learner Writing 306 | Marek Rei and Helen Yannakoudakis 307 | In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL-2016) 308 | 309 | A New Dataset and Method for Automatically Grading ESOL Texts 310 | Helen Yannakoudakis, Ted Briscoe and Ben Medlock 311 | In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL-2011) 312 | 313 | """ 314 | data = [] 315 | with open(input_file, 'r') as f: 316 | sen = [] 317 | grammatical = [] 318 | orig_vals = [] 319 | for line in tqdm(f): 320 | if line.strip() == '': 321 | data.append({'text': sen, 322 | 'labels': { 323 | 'grammatical': grammatical, 324 | 'orig_vals': orig_vals, 325 | } 326 | }) 327 | # print(sen) 328 | sen = [] 329 | grammatical = [] 330 | orig_vals = [] 331 | continue 332 | vals = line.split('\t') 333 | grammatical.append(vals[1]) 334 | sen.append(vals[0]) 335 | orig_vals.append(vals) 336 | 337 | return data 338 | 339 | 340 | def read_coord_format(input_file): 341 | """ 342 | True if the token is part of any COORD sub-tree, false otherwise 343 | 344 | python amnesic_probing/encoders/encode.py --input_file data/CoordinationExtPTB/train.txt --output_dir=out/coord/train 345 | --encoder=bert-base-uncased --encode_format=normal --format=coord 346 | """ 347 | from nltk.tree import Tree 348 | 349 | def parse_coord(t: Tree, is_coord: bool): 350 | if len(t.leaves()) == 1: 351 | if t.pos()[0][1] == '-NONE-': 352 | return [] 353 | else: 354 | return [(t.leaves()[0], is_coord)] 355 | 356 | res = [] 357 | for subtree in t: 358 | res += parse_coord(subtree, is_coord or "COORD" in subtree.label()) 359 | 360 | return res 361 | 362 | data = [] 363 | with open(input_file, 'r') as f: 364 | for line in tqdm(f): 365 | parsed_sen = Tree.fromstring(line) 366 | 367 | parsed_coords = parse_coord(parsed_sen, False) 368 | 369 | data.append({'text': [v[0] for v in parsed_coords], 370 | 'labels': { 371 | 'coord': [v[1] for v in parsed_coords], 372 | 'orig_vals': line, 373 | } 374 | }) 375 | return data 376 | 377 | 378 | COARSE_SEMTAG_MAPPING = {} 379 | 380 | # v0.7 381 | 
381 | COARSE_SEMTAG_MAPPING['PRO'] = 'ANA'
382 | COARSE_SEMTAG_MAPPING['DEF'] = 'ANA'
383 | COARSE_SEMTAG_MAPPING['HAS'] = 'ANA'
384 | COARSE_SEMTAG_MAPPING['REF'] = 'ANA'
385 | COARSE_SEMTAG_MAPPING['EMP'] = 'ANA'
386 | COARSE_SEMTAG_MAPPING['GRE'] = 'ACT'
387 | COARSE_SEMTAG_MAPPING['ITJ'] = 'ACT'
388 | COARSE_SEMTAG_MAPPING['HES'] = 'ACT'
389 | COARSE_SEMTAG_MAPPING['QUE'] = 'ACT'
390 | COARSE_SEMTAG_MAPPING['QUC'] = 'ATT'
391 | COARSE_SEMTAG_MAPPING['QUV'] = 'ATT'
392 | COARSE_SEMTAG_MAPPING['COL'] = 'ATT'
393 | COARSE_SEMTAG_MAPPING['IST'] = 'ATT'
394 | COARSE_SEMTAG_MAPPING['SST'] = 'ATT'
395 | COARSE_SEMTAG_MAPPING['PRI'] = 'ATT'
396 | COARSE_SEMTAG_MAPPING['DEG'] = 'ATT'
397 | COARSE_SEMTAG_MAPPING['INT'] = 'ATT'
398 | COARSE_SEMTAG_MAPPING['REL'] = 'ATT'
399 | COARSE_SEMTAG_MAPPING['SCO'] = 'ATT'
400 | COARSE_SEMTAG_MAPPING['EQU'] = 'COM'
401 | COARSE_SEMTAG_MAPPING['MOR'] = 'COM'
402 | COARSE_SEMTAG_MAPPING['LES'] = 'COM'
403 | COARSE_SEMTAG_MAPPING['TOP'] = 'COM'
404 | COARSE_SEMTAG_MAPPING['BOT'] = 'COM'
405 | COARSE_SEMTAG_MAPPING['ORD'] = 'COM'
406 | COARSE_SEMTAG_MAPPING['CON'] = 'UNE'
407 | COARSE_SEMTAG_MAPPING['ROL'] = 'UNE'
408 | COARSE_SEMTAG_MAPPING['GRP'] = 'UNE'
409 | COARSE_SEMTAG_MAPPING['DXP'] = 'DXS'
410 | COARSE_SEMTAG_MAPPING['DXT'] = 'DXS'
411 | COARSE_SEMTAG_MAPPING['DXD'] = 'DXS'
412 | COARSE_SEMTAG_MAPPING['ALT'] = 'LOG'
413 | COARSE_SEMTAG_MAPPING['XCL'] = 'LOG'
414 | COARSE_SEMTAG_MAPPING['NIL'] = 'LOG'
415 | COARSE_SEMTAG_MAPPING['DIS'] = 'LOG'
416 | COARSE_SEMTAG_MAPPING['IMP'] = 'LOG'
417 | COARSE_SEMTAG_MAPPING['AND'] = 'LOG'
418 | COARSE_SEMTAG_MAPPING['NOT'] = 'MOD'
419 | COARSE_SEMTAG_MAPPING['NEC'] = 'MOD'
420 | COARSE_SEMTAG_MAPPING['POS'] = 'MOD'
421 | COARSE_SEMTAG_MAPPING['SUB'] = 'DSC'
422 | COARSE_SEMTAG_MAPPING['COO'] = 'DSC'
423 | COARSE_SEMTAG_MAPPING['APP'] = 'DSC'
424 | COARSE_SEMTAG_MAPPING['BUT'] = 'DSC'
425 | COARSE_SEMTAG_MAPPING['PER'] = 'NAM'
426 | COARSE_SEMTAG_MAPPING['GPE'] = 'NAM'
427 | COARSE_SEMTAG_MAPPING['GPO'] = 'NAM'
428 | COARSE_SEMTAG_MAPPING['GEO'] = 'NAM'
429 | COARSE_SEMTAG_MAPPING['ORG'] = 'NAM'
430 | COARSE_SEMTAG_MAPPING['ART'] = 'NAM'
431 | COARSE_SEMTAG_MAPPING['HAP'] = 'NAM'
432 | COARSE_SEMTAG_MAPPING['UOM'] = 'NAM'
433 | COARSE_SEMTAG_MAPPING['CTC'] = 'NAM'
434 | COARSE_SEMTAG_MAPPING['URL'] = 'NAM'
435 | COARSE_SEMTAG_MAPPING['LIT'] = 'NAM'
436 | COARSE_SEMTAG_MAPPING['NTH'] = 'NAM'
437 | COARSE_SEMTAG_MAPPING['EXS'] = 'EVE'
438 | COARSE_SEMTAG_MAPPING['ENS'] = 'EVE'
439 | COARSE_SEMTAG_MAPPING['EPS'] = 'EVE'
440 | COARSE_SEMTAG_MAPPING['EXG'] = 'EVE'
441 | COARSE_SEMTAG_MAPPING['EXT'] = 'EVE'
442 | COARSE_SEMTAG_MAPPING['NOW'] = 'TNS'
443 | COARSE_SEMTAG_MAPPING['PST'] = 'TNS'
444 | COARSE_SEMTAG_MAPPING['FUT'] = 'TNS'
445 | COARSE_SEMTAG_MAPPING['PRG'] = 'TNS'
446 | COARSE_SEMTAG_MAPPING['PFT'] = 'TNS'
447 | COARSE_SEMTAG_MAPPING['DAT'] = 'TIM'
448 | COARSE_SEMTAG_MAPPING['DOM'] = 'TIM'
449 | COARSE_SEMTAG_MAPPING['YOC'] = 'TIM'
450 | COARSE_SEMTAG_MAPPING['DOW'] = 'TIM'
451 | COARSE_SEMTAG_MAPPING['MOY'] = 'TIM'
452 | COARSE_SEMTAG_MAPPING['DEC'] = 'TIM'
453 | COARSE_SEMTAG_MAPPING['CLO'] = 'TIM'
454 | 
455 | # update
456 | COARSE_SEMTAG_MAPPING['PRO'] = 'ANA'
457 | COARSE_SEMTAG_MAPPING['DEF'] = 'ANA'
458 | COARSE_SEMTAG_MAPPING['HAS'] = 'ANA'
459 | COARSE_SEMTAG_MAPPING['REF'] = 'ANA'
460 | COARSE_SEMTAG_MAPPING['EMP'] = 'ANA'
461 | COARSE_SEMTAG_MAPPING['GRE'] = 'ACT'
462 | COARSE_SEMTAG_MAPPING['ITJ'] = 'ACT'
463 | COARSE_SEMTAG_MAPPING['HES'] = 'ACT'
464 | COARSE_SEMTAG_MAPPING['QUE'] = 'ACT'
465 | COARSE_SEMTAG_MAPPING['QUA'] = 'ATT'
466 | COARSE_SEMTAG_MAPPING['UOM'] = 'ATT'
467 | COARSE_SEMTAG_MAPPING['IST'] = 'ATT'
468 | COARSE_SEMTAG_MAPPING['REL'] = 'ATT'
469 | COARSE_SEMTAG_MAPPING['RLI'] = 'ATT'
470 | COARSE_SEMTAG_MAPPING['SST'] = 'ATT'
471 | COARSE_SEMTAG_MAPPING['PRI'] = 'ATT'
472 | COARSE_SEMTAG_MAPPING['INT'] = 'ATT'
473 | COARSE_SEMTAG_MAPPING['SCO'] = 'ATT'
474 | COARSE_SEMTAG_MAPPING['ALT'] = 'LOG'
475 | COARSE_SEMTAG_MAPPING['EXC'] = 'LOG'
476 | COARSE_SEMTAG_MAPPING['NIL'] = 'LOG'
477 | COARSE_SEMTAG_MAPPING['DIS'] = 'LOG'
478 | COARSE_SEMTAG_MAPPING['IMP'] = 'LOG'
479 | COARSE_SEMTAG_MAPPING['AND'] = 'LOG'
480 | COARSE_SEMTAG_MAPPING['BUT'] = 'LOG'
481 | COARSE_SEMTAG_MAPPING['EQA'] = 'COM'
482 | COARSE_SEMTAG_MAPPING['MOR'] = 'COM'
483 | COARSE_SEMTAG_MAPPING['LES'] = 'COM'
484 | COARSE_SEMTAG_MAPPING['TOP'] = 'COM'
485 | COARSE_SEMTAG_MAPPING['BOT'] = 'COM'
486 | COARSE_SEMTAG_MAPPING['ORD'] = 'COM'
487 | COARSE_SEMTAG_MAPPING['PRX'] = 'DEM'
488 | COARSE_SEMTAG_MAPPING['MED'] = 'DEM'
489 | COARSE_SEMTAG_MAPPING['DST'] = 'DEM'
490 | COARSE_SEMTAG_MAPPING['SUB'] = 'DIS'
491 | COARSE_SEMTAG_MAPPING['COO'] = 'DIS'
492 | COARSE_SEMTAG_MAPPING['APP'] = 'DIS'
493 | COARSE_SEMTAG_MAPPING['NOT'] = 'MOD'
494 | COARSE_SEMTAG_MAPPING['NEC'] = 'MOD'
495 | COARSE_SEMTAG_MAPPING['POS'] = 'MOD'
496 | COARSE_SEMTAG_MAPPING['CON'] = 'ENT'
497 | COARSE_SEMTAG_MAPPING['ROL'] = 'ENT'
498 | COARSE_SEMTAG_MAPPING['GPE'] = 'NAM'
499 | COARSE_SEMTAG_MAPPING['PER'] = 'NAM'
500 | COARSE_SEMTAG_MAPPING['LOC'] = 'NAM'
501 | COARSE_SEMTAG_MAPPING['ORG'] = 'NAM'
502 | COARSE_SEMTAG_MAPPING['ART'] = 'NAM'
503 | COARSE_SEMTAG_MAPPING['NAT'] = 'NAM'
504 | COARSE_SEMTAG_MAPPING['HAP'] = 'NAM'
505 | COARSE_SEMTAG_MAPPING['URL'] = 'NAM'
506 | COARSE_SEMTAG_MAPPING['EXS'] = 'EVE'
507 | COARSE_SEMTAG_MAPPING['ENS'] = 'EVE'
508 | COARSE_SEMTAG_MAPPING['EPS'] = 'EVE'
509 | COARSE_SEMTAG_MAPPING['EFS'] = 'EVE'
510 | COARSE_SEMTAG_MAPPING['EXG'] = 'EVE'
511 | COARSE_SEMTAG_MAPPING['ENG'] = 'EVE'
512 | COARSE_SEMTAG_MAPPING['EPG'] = 'EVE'
513 | COARSE_SEMTAG_MAPPING['EFG'] = 'EVE'
514 | COARSE_SEMTAG_MAPPING['EXT'] = 'EVE'
515 | COARSE_SEMTAG_MAPPING['ENT'] = 'EVE'
516 | COARSE_SEMTAG_MAPPING['EPT'] = 'EVE'
517 | COARSE_SEMTAG_MAPPING['EFT'] = 'EVE'
518 | COARSE_SEMTAG_MAPPING['ETG'] = 'EVE'
519 | COARSE_SEMTAG_MAPPING['ETV'] = 'EVE'
520 | COARSE_SEMTAG_MAPPING['EXV'] = 'EVE'
521 | COARSE_SEMTAG_MAPPING['NOW'] = 'TNS'
522 | COARSE_SEMTAG_MAPPING['PST'] = 'TNS'
523 | COARSE_SEMTAG_MAPPING['FUT'] = 'TNS'
524 | COARSE_SEMTAG_MAPPING['DOM'] = 'TIM'
525 | COARSE_SEMTAG_MAPPING['YOC'] = 'TIM'
526 | COARSE_SEMTAG_MAPPING['DOW'] = 'TIM'
527 | COARSE_SEMTAG_MAPPING['MOY'] = 'TIM'
528 | COARSE_SEMTAG_MAPPING['DEC'] = 'TIM'
529 | COARSE_SEMTAG_MAPPING['CLO'] = 'TIM'
530 | COARSE_SEMTAG_MAPPING['APX'] = 'APX'
531 | 
--------------------------------------------------------------------------------
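For reference, a minimal sketch of driving the readers above directly from Python rather than through the command shown in the read_coord_format docstring. The import path assumes this listing is amnesic_probing/encoders/encode.py; the CoNLL input path is a hypothetical placeholder, while the CoordinationExtPTB path is the one given in that docstring.

# Sketch only: the import path and the CoNLL file location are assumptions, not taken from the repository.
from amnesic_probing.encoders.encode import read_conll_format, read_coord_format

# CoNLL-style dependency file: one tab-separated token per line, blank line between sentences.
# Each returned item holds 'text' plus 'tag', 'pos', 'dep', 'pos_next_word', ... label lists.
sentences = read_conll_format('data/ud/train.conll')  # hypothetical path
print(sentences[0]['text'])
print(sentences[0]['labels']['pos'])

# One PTB-style parse tree per line; every token is labelled True/False
# for membership in a COORD sub-tree.
coord = read_coord_format('data/CoordinationExtPTB/train.txt')
print(list(zip(coord[0]['text'], coord[0]['labels']['coord'])))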