├── __init__.py ├── KnowMan ├── __init__.py ├── models │ ├── __init__.py │ ├── layers.py │ └── nn_models.py ├── save │ └── __init__.py ├── utils │ ├── __init__.py │ ├── knowman_parameters.py │ ├── vocab.py │ ├── knowman_utils.py │ └── logging_utils.py ├── data_prep │ ├── __init__.py │ └── get_knodle_dataset.py └── default_config.yaml ├── baselines ├── __init__.py ├── log_reg.py └── snorkel_training_knodle.py ├── experiments ├── __init__.py ├── imdb │ ├── __init__.py │ ├── imdb_transformer.yaml │ ├── imdb_tfidf.yaml │ ├── train_tfidf_imdb.py │ └── train_transformer_imdb.py ├── spam │ ├── __init__.py │ ├── spam_tfidf.yaml │ ├── spam_transformer.yaml │ ├── train_tfidf_spam.py │ └── train_transformer_spam.py └── spouse │ ├── __init__.py │ ├── spouse_transformer.yaml │ ├── spouse_tfidf.yaml │ ├── train_transformer_spouse.py │ └── train_tfidf_spouse.py ├── .gitignore └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /KnowMan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /KnowMan/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /KnowMan/save/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /KnowMan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/imdb/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/spam/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /KnowMan/data_prep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/spouse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /venv 2 | .idea/ 3 | data/ 4 | old_code_fragments/ 5 | -------------------------------------------------------------------------------- /KnowMan/default_config.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "knowman_exp" 2 | 3 | dataset: 4 | dataset_path: none 5 | 6 | model_params: 7 | activation: relu 8 | loss: gr 9 | 10 | training_setting: 11 | random_seed: 1 12 | use_tensorboard: true 13 | device: cuda 14 | debug: true 15 | evaluate_after_batches_between_logging: true 16 | test_only: false -------------------------------------------------------------------------------- /KnowMan/models/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class 
TransformerHeadFeature(nn.Module): 6 | def __init__(self, config, drop_out, out_size): 7 | super().__init__() 8 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 9 | self.dropout = nn.Dropout(drop_out) 10 | self.out_proj = nn.Linear(config.hidden_size, out_size) 11 | 12 | def forward(self, features, **kwargs): 13 | x = features[:, 0, :] # take token (equiv. to [CLS]) 14 | x = self.dropout(x) 15 | x = self.dense(x) 16 | x = torch.tanh(x) 17 | x = self.dropout(x) 18 | x = self.out_proj(x) 19 | return x 20 | -------------------------------------------------------------------------------- /KnowMan/utils/knowman_parameters.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | DEFAULT_Path = os.path.dirname(__file__) 5 | 6 | 7 | class KnowManParameters: 8 | def __init__(self, config_file=DEFAULT_Path + "/../default_config.yaml"): 9 | self.dataset = {} 10 | self.model_params = {} 11 | self.training_setting = {} 12 | self.experiment_name = None 13 | 14 | with open(config_file) as f: 15 | config_content = yaml.safe_load(f) 16 | 17 | self.__dict__.update(config_content) 18 | 19 | def update_parameters(self, config_file): 20 | with open(config_file) as f: 21 | config_content = yaml.safe_load(f) 22 | 23 | self.__dict__.update(config_content) 24 | 25 | def get_config(self): 26 | return self.__dict__ 27 | 28 | -------------------------------------------------------------------------------- /KnowMan/utils/vocab.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class KnowMANDataSet(torch.utils.data.Dataset): 5 | def __init__(self, encodings, labels=None, adv_labels=None, max_length=200): 6 | self.encodings = encodings 7 | self.labels = labels 8 | self.adv_labels = adv_labels 9 | self.max_length = max_length 10 | 11 | def __getitem__(self, idx): 12 | if type(self.adv_labels) != torch.Tensor: 13 | return {k: 
torch.tensor(v[idx][0:self.max_length]) for k, v in self.encodings.items()}, \ 14 | self.labels[idx] 15 | else: 16 | return {k: torch.tensor(v[idx][0:self.max_length]) for k, v in self.encodings.items()}, \ 17 | self.labels[idx], \ 18 | self.adv_labels[idx] 19 | 20 | def __len__(self): 21 | return len(self.labels) 22 | 23 | -------------------------------------------------------------------------------- /experiments/spam/spam_tfidf.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "spam" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/spam/processed 5 | 6 | model_params: 7 | activation: relu 8 | loss: gr 9 | feature_ext: distilbert-base-cased # ['roberta-base', 'distilbert-base-cased'] 10 | F_hidden_sizes: [1000, 500] 11 | F_layers: 1 12 | C_layers: 1 13 | D_layers: 1 14 | n_critic: 5 15 | lambd: 2.0 16 | F_bn: false 17 | C_bn: true 18 | D_bn: true 19 | dropout: 0.4 20 | max_length_transformer: 300 21 | feature_num: 3916 22 | all_domains: 10 23 | shared_hidden_size: 700 24 | num_labels: 2 25 | domains: [] 26 | domain_hidden_size: 0 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/spam_exp 33 | 34 | device: cuda:1 35 | debug: true 36 | 37 | max_epoch: 2 38 | batch_size: 32 39 | batches_between_logging: 50 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.0001 43 | D_learning_rate: 0.0001 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/spam/spam_exp 47 | test_only: false 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /experiments/spouse/spouse_transformer.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "spam_transformer" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/spouse/processed 5 | 6 | model_params: 7 | feature_ext: 
distilbert-base-cased # ['roberta-base', 'distilbert-base-cased'] 8 | F_hidden_sizes: [1000, 500] 9 | F_layers: 1 10 | C_layers: 1 11 | D_layers: 1 12 | n_critic: 5 13 | lambd: 2.0 14 | F_bn: false 15 | C_bn: true 16 | D_bn: true 17 | dropout: 0.4 18 | max_length_transformer: 250 19 | feature_num: 26503 20 | shared_hidden_size: 700 21 | num_labels: 2 22 | all_domains: 9 23 | domains: [] 24 | activation: relu 25 | domain_hidden_size: 0 26 | loss: gr 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/spouse/spouse_exp 33 | 34 | device: cuda:1 35 | debug: true 36 | 37 | max_epoch: 15 38 | batch_size: 16 39 | batches_between_logging: 695 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.0001 43 | D_learning_rate: 0.0001 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/spouse/spouse_exp 47 | test_only: false -------------------------------------------------------------------------------- /experiments/spam/spam_transformer.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "spam_transformer" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/spam/processed 5 | 6 | model_params: 7 | activation: relu 8 | loss: gr 9 | feature_ext: distilbert-base-cased # ['roberta-base', 'distilbert-base-cased'] 10 | F_hidden_sizes: [1000, 500] 11 | F_layers: 1 12 | C_layers: 1 13 | D_layers: 1 14 | n_critic: 5 15 | lambd: 2.0 16 | F_bn: false 17 | C_bn: true 18 | D_bn: true 19 | dropout: 0.4 20 | max_length_transformer: 300 21 | feature_num: 3916 22 | all_domains: 10 23 | shared_hidden_size: 700 24 | num_labels: 2 25 | domains: [] 26 | domain_hidden_size: 0 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/spam_exp 33 | 34 | device: cuda:1 35 | debug: true 36 | 37 | max_epoch: 5 38 | batch_size: 16 39 | 
batches_between_logging: 50 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.0001 43 | D_learning_rate: 0.0001 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/spam/spam_exp 47 | test_only: false 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /experiments/spouse/spouse_tfidf.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "spam" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/spouse/processed 5 | 6 | model_params: 7 | feature_ext: distilbert-base-cased # ['roberta-base', 'distilbert-base-cased'] 8 | F_hidden_sizes: [1000, 500] 9 | F_layers: 5 10 | C_layers: 10 11 | D_layers: 1 12 | n_critic: 1 13 | lambd: 5.0 14 | F_bn: false 15 | C_bn: true 16 | D_bn: true 17 | dropout: 0.37910169318546527 18 | max_length_transformer: 250 19 | feature_num: 26503 20 | shared_hidden_size: 988 21 | num_labels: 2 22 | all_domains: 9 23 | domains: [] 24 | activation: relu 25 | domain_hidden_size: 0 26 | loss: gr 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/spouse/spouse_exp 33 | 34 | device: cuda:1 35 | debug: true 36 | 37 | max_epoch: 1 38 | batch_size: 16 39 | batches_between_logging: 10 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.0005420120672334338 43 | D_learning_rate: 0.001 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/spouse/spouse_exp 47 | test_only: false -------------------------------------------------------------------------------- /experiments/imdb/imdb_transformer.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "imbd" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/imdb/processed/ 5 | 6 | model_params: 7 | feature_ext: distilbert-base-cased # ['roberta-base', 
'distilbert-base-cased'] 8 | F_hidden_sizes: [1000, 500] 9 | F_layers: 2 10 | C_layers: 1 11 | D_layers: 10 12 | n_critic: 10 13 | lambd: 2.0 14 | F_bn: false 15 | C_bn: true 16 | D_bn: true 17 | dropout: 0.2752300511765772 18 | max_length_transformer: 250 19 | feature_num: 94633 20 | shared_hidden_size: 700 21 | num_labels: 2 22 | all_domains: 6786 23 | domains: [] 24 | activation: relu 25 | domain_hidden_size: 0 26 | loss: gr 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/new_logging/imdb/man_imdb_transformers_l0 33 | 34 | device: cuda:1 35 | debug: true 36 | 37 | max_epoch: 5 38 | batch_size: 16 39 | batches_between_logging: 2500 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.00001 43 | D_learning_rate: 0.0001 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/imdb/man_imdb_transformers_l0 47 | test_only: false 48 | 49 | 50 | -------------------------------------------------------------------------------- /experiments/imdb/imdb_tfidf.yaml: -------------------------------------------------------------------------------- 1 | experiment_name: "imbd" 2 | 3 | dataset: 4 | dataset_path: /raid/user-data/nlp_share/KnodleData/imdb/processed/ 5 | 6 | model_params: 7 | feature_ext: distilbert-base-cased # ['roberta-base', 'distilbert-base-cased'] 8 | F_hidden_sizes: [1000, 500] 9 | F_layers: 2 10 | C_layers: 1 11 | D_layers: 10 12 | n_critic: 50 13 | lambd: 5.0 14 | F_bn: false 15 | C_bn: true 16 | D_bn: true 17 | dropout: 0.2752300511765772 18 | max_length_transformer: 250 19 | feature_num: 94633 20 | shared_hidden_size: 584 21 | num_labels: 2 22 | all_domains: 6786 23 | domains: [] 24 | activation: relu 25 | domain_hidden_size: 0 26 | loss: gr 27 | 28 | training_setting: 29 | random_seed: 1 30 | 31 | use_tensorboard: true 32 | tensorboard_dir: ./tensorboard_logging/new_logging/imdb/man_imdb_transformers_l0 33 | 34 | device: cuda:1 35 | debug: true 
36 | 37 | max_epoch: 15 38 | batch_size: 895 39 | batches_between_logging: 2500 40 | evaluate_after_batches_between_logging: true 41 | 42 | learning_rate: 0.001 43 | D_learning_rate: 0.000922510274469868 44 | transformer_weight_decay: 0.01 45 | 46 | model_save_file: ./save/imdb/man_imdb_transformers_l0 47 | test_only: false 48 | 49 | 50 | -------------------------------------------------------------------------------- /KnowMan/utils/knowman_utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from transformers import RobertaTokenizer, DistilBertTokenizer 3 | from KnowMan.models.nn_models import RobertaKnowMANFeatureExt, DistilbertKnowMANFeatureExt 4 | 5 | 6 | class FeatureExtraction(enum.Enum): 7 | ROBERTABASE = 1 8 | DISTILBERT = 2 9 | 10 | @staticmethod 11 | def name2type(text): 12 | return {'distilbert-base-cased':FeatureExtraction.DISTILBERT, 13 | 'roberta-base': FeatureExtraction.ROBERTABASE}[text] 14 | 15 | 16 | class TransformerUtil: 17 | name2type = {'roberta-base': FeatureExtraction.ROBERTABASE, 18 | 'distilbert-base-cased': FeatureExtraction.DISTILBERT} 19 | 20 | @staticmethod 21 | def get_tokenizer(transformer_enum): 22 | if transformer_enum == FeatureExtraction.ROBERTABASE: 23 | return RobertaTokenizer.from_pretrained('roberta-base') 24 | if transformer_enum == FeatureExtraction.DISTILBERT: 25 | return DistilBertTokenizer.from_pretrained('distilbert-base-cased') 26 | 27 | @staticmethod 28 | def get_pretrained_model(transformer_enum, dropout, out_size): 29 | if transformer_enum == FeatureExtraction.ROBERTABASE: 30 | return RobertaKnowMANFeatureExt.from_pretrained('roberta-base', dropout=dropout, 31 | out_size=out_size) 32 | 33 | if transformer_enum == FeatureExtraction.DISTILBERT: 34 | return DistilbertKnowMANFeatureExt.from_pretrained('distilbert-base-cased', dropout=dropout, 35 | out_size=out_size) 36 | 37 | 38 | def unpackKnowMAN_batch(batch_X, device): 39 | inputs, labels, adv_labels = batch_X 
40 | return {k: v.to(device) for k, v in inputs.items()}, labels.to(device), adv_labels.to(device) 41 | 42 | 43 | def freeze_net(net): 44 | if not net: 45 | return 46 | for name, p in net.named_parameters(): 47 | p.requires_grad = False 48 | 49 | 50 | def unfreeze_net(net): 51 | if not net: 52 | return 53 | for name, p in net.named_parameters(): 54 | if 'transformer' not in name: 55 | p.requires_grad = True 56 | 57 | -------------------------------------------------------------------------------- /baselines/log_reg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from KnowMan.data_prep.get_knodle_dataset import get_data, get_tfidf_features, \ 5 | z_t_matrices_to_majority_vote_probs, probabilities_to_majority_vote 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.metrics import classification_report 8 | from KnowMan.utils.knowman_parameters import KnowManParameters 9 | 10 | 11 | path = os.path.dirname(__file__) 12 | 13 | 14 | def get_dataset(data_path: str, if_dev_data: bool = True): 15 | 16 | # first, the data is read from the file 17 | train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \ 18 | mapping_rules_labels_t = get_data(data_path, if_dev_data=if_dev_data) 19 | if "tac" in data_path: 20 | val = "samples" 21 | else: 22 | val = "sample" 23 | 24 | # For the LogReg model we encode train samples with TF-IDF features. 
25 | if if_dev_data: 26 | train_tfidf, test_tfidf, dev_tfidf = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist(), 27 | dev_df[val].tolist()) 28 | 29 | else: 30 | train_tfidf, test_tfidf, _ = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist()) 31 | 32 | test_labels = np.asarray(list(test_df.iloc[:, 1])) 33 | train_probs = z_t_matrices_to_majority_vote_probs(train_rule_matches_z, mapping_rules_labels_t) 34 | train_labels = np.asarray(np.apply_along_axis(probabilities_to_majority_vote, axis=1, arr=train_probs, 35 | choose_random_label=True)) 36 | 37 | if if_dev_data: 38 | dev_labels = np.asarray(list(dev_df.iloc[:, 1])) 39 | 40 | return [train_tfidf, train_labels], [dev_tfidf, dev_labels], [test_tfidf, test_labels] 41 | else: 42 | return [train_tfidf, train_labels], [test_tfidf, test_labels] 43 | 44 | 45 | def main(): 46 | yaml_file = sys.argv[1] 47 | 48 | params = KnowManParameters() 49 | params.update_parameters(yaml_file) 50 | 51 | dataset_path = params.dataset["dataset_path"] 52 | train_dataset, test_dataset = get_dataset(dataset_path, if_dev_data=False) 53 | 54 | clf = LogisticRegression(random_state=0, max_iter=200).fit(train_dataset[0], train_dataset[1]) 55 | pred = clf.predict(test_dataset[0]) 56 | 57 | with open("../KnowMan/save/labels_KS_tfidf.csv", "w") as out: 58 | for i in range(len(test_dataset[1])): 59 | out.write(str(test_dataset[1][i]) + "," + str(pred[i]) + "\n") 60 | 61 | print(classification_report(test_dataset[1], pred)) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /baselines/snorkel_training_knodle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import numpy as np 5 | import scipy.sparse as sp 6 | from torch import LongTensor 7 | from torch.utils.data import TensorDataset 8 | from knodle.trainer.snorkel.snorkel import SnorkelTrainer 9 | 
from KnowMan.data_prep.get_knodle_dataset import get_data, get_tfidf_features 10 | from KnowMan.models.nn_models import SentimentClassifier 11 | from KnowMan.utils.knowman_parameters import KnowManParameters 12 | 13 | 14 | path = os.path.dirname(__file__) 15 | sys.path.append(path + '/../') 16 | 17 | 18 | def get_dataset(data_path: str, if_dev_data: bool = True): 19 | # first, the data is read from the file 20 | train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, mapping_rules_labels_t = \ 21 | get_data(data_path, if_dev_data=if_dev_data) 22 | 23 | val = "sample" 24 | 25 | # For the LogReg model we encode train samples with TF-IDF features. 26 | if if_dev_data: 27 | train_tfidf, test_tfidf, dev_tfidf = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist(), 28 | dev_df[val].tolist()) 29 | 30 | dev_set = np_array_to_tensor_dataset(dev_tfidf.toarray()) 31 | dev_labels = TensorDataset(LongTensor(dev_df["label"].tolist())) 32 | dev_dataset = [dev_set, dev_labels] 33 | train_set = np_array_to_tensor_dataset(train_tfidf.toarray()) 34 | test_set = np_array_to_tensor_dataset(test_tfidf.toarray()) 35 | test_labels = TensorDataset(LongTensor(test_df["label"].tolist())) 36 | train_dataset = [train_set] 37 | test_dataset = [test_set, test_labels] 38 | 39 | else: 40 | train_tfidf, test_tfidf, _ = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist()) 41 | 42 | train_set = np_array_to_tensor_dataset(train_tfidf.toarray()) 43 | test_set = np_array_to_tensor_dataset(test_tfidf.toarray()) 44 | test_labels = TensorDataset(LongTensor(test_df["label"].tolist())) 45 | train_dataset = [train_set] 46 | test_dataset = [test_set, test_labels] 47 | 48 | if if_dev_data: 49 | return train_dataset, dev_dataset, test_dataset, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \ 50 | mapping_rules_labels_t 51 | else: 52 | return train_dataset, test_dataset, train_rule_matches_z, test_rule_matches_z, mapping_rules_labels_t 53 
| 54 | 55 | def np_array_to_tensor_dataset(x: np.ndarray) -> TensorDataset: 56 | if isinstance(x, sp.csr_matrix): 57 | x = x.toarray() 58 | x = torch.from_numpy(x) 59 | x = TensorDataset(x.float()) 60 | return x 61 | 62 | 63 | def snorkel_train(params, if_dev_data: bool = True): 64 | data_path = params.dataset["dataset_path"] 65 | model = SentimentClassifier(params.model_params["C_layers"], 66 | params.model_params["feature_num"], 67 | params.model_params["shared_hidden_size"], 68 | params.model_params["num_labels"], 69 | params.model_params["dropout"], 70 | params.model_params["C_bn"]) 71 | 72 | if if_dev_data: 73 | train_dataset, dev_dataset, test_dataset, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \ 74 | mapping_rules_labels_t = get_dataset(data_path, if_dev_data) 75 | 76 | trainer = SnorkelTrainer( 77 | model=model, 78 | mapping_rules_labels_t=mapping_rules_labels_t, 79 | model_input_x=train_dataset[0], 80 | rule_matches_z=train_rule_matches_z, 81 | dev_model_input_x=dev_dataset[0], 82 | dev_gold_labels_y=dev_dataset[1] 83 | ) 84 | trainer.train() 85 | return trainer.test(test_dataset[0], test_dataset[1]) 86 | 87 | else: 88 | train_dataset, test_dataset, train_rule_matches_z, test_rule_matches_z, mapping_rules_labels_t = \ 89 | get_dataset(data_path, if_dev_data) 90 | 91 | trainer = SnorkelTrainer( 92 | model=model, 93 | mapping_rules_labels_t=mapping_rules_labels_t, 94 | model_input_x=train_dataset[0], 95 | rule_matches_z=train_rule_matches_z, 96 | ) 97 | 98 | trainer.train() 99 | return trainer.test(test_dataset[0], test_dataset[1]) 100 | 101 | 102 | def main(): 103 | yaml_file = sys.argv[1] 104 | 105 | params = KnowManParameters() 106 | params.update_parameters(yaml_file) 107 | 108 | res, gold, pred = snorkel_train(params) 109 | print(res) 110 | 111 | with open("../KnowMan/save/labels_sorkel_tfidf", "w")as out: 112 | for i in range(len(gold)): 113 | out.write(str(gold[i]) + "," + str(pred[i]) + "\n") 114 | 115 | return res 116 | 117 | 118 
| if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KnowMAN 2 | ### KnowMAN: Weakly Supervised Multinomial Adversarial Networks 3 | [![Python Version](https://img.shields.io/badge/python-3.7-yellow.svg)](https://www.python.org/downloads/release/python-360/) 4 | 5 | This repository contains code that is used in our paper:
6 | [KnowMAN: Weakly Supervised Multinomial Adversarial Networks](https://arxiv.org/abs/2109.07994) - to be published at EMNLP 2021. 🎉
7 | by Luisa März, Ehsaneddin Asgari, Fabienne Braune, Franziska Zimmermann and Benjamin Roth. 8 | 9 | 10 | For any questions please [get in touch](mailto:luisa.maerz@volkswagen.de) 11 | 12 | --- 13 | ## What is KnowMAN about? 🤓 14 | 15 | The absence of labeled data for training neural 16 | models is often addressed by leveraging 17 | knowledge about the specific task, resulting 18 | in heuristic but noisy labels. The knowledge 19 | is captured in labeling functions, which detect 20 | certain regularities or patterns in the training 21 | samples and annotate corresponding labels for 22 | training. This process of weakly supervised 23 | training may result in an over-reliance on the 24 | signals captured by the labeling functions and 25 | hinder models to exploit other signals or to 26 | generalize well. 27 | 28 | **KnowMAN** is an 29 | adversarial scheme that enables to control influence 30 | of signals associated with specific labeling 31 | functions. **KnowMAN** forces the network 32 | to learn representations that are invariant 33 | to those signals and to pick up other signals 34 | that are more generally associated with an 35 | output label. **KnowMAN** strongly improves 36 | results compared to direct weakly supervised 37 | learning with a pre-trained transformer language 38 | model and a feature-based baseline. 39 | 40 | --- 41 | 42 | ## Usage 🚀 43 | 44 | 45 | Experiments described in our paper can be found in the **experiments** folder. 46 | To run them execute the respective file.
47 | **Please make sure that you have downloaded the data files in advance (see datasets section) and adjusted the datafile path in the yaml files!** 48 | 49 | 50 | E.g. run the imdb tfidf training: 51 | ``` 52 | python ./experiments/imdb/train_tfidf_imdb.py 53 | ``` 54 | 55 | 56 | E.g. run the spam DistilBERT training: 57 | 58 | ``` 59 | python ./experiments/spam/train_transformer_spam.py 60 | ``` 61 | 62 | 63 | If you want to change hyperparameters just edit the **yaml** files in the experiments folder. 64 | 65 | --- 66 | 67 | Baselines can be found in the **baselines** folder. To run them please pass the yaml file for the experiment you want to try here. 68 | 69 | E.g. run the spouse snorkel training: 70 | ``` 71 | python ./baselines/snorkel_training_knodle.py ./experiments/spouse/spouse_tfidf.yaml 72 | ``` 73 | 74 | 75 | Please note that the baselines are only implemented for tf-idf encoding here. The results for DistilBERT baselines can be reproduced by using [Knodle](https://github.com/knodle/knodle). 76 | 77 | --- 78 | ## Datasets 📚 79 | 80 | Datasets used in our work: 81 | 82 | - Spam Dataset - a dataset, based on the YouTube comments dataset from [Alberto et al. (2015)](https://www.researchgate.net/publication/300414679_TubeSpam_Comment_Spam_Filtering_on_YouTube). Here, the task is to classify whether a text is relevant to the video or holds spam, such as advertisement. 83 | - Spouse Dataset - relation extraction dataset is based on the Signal Media One-Million News Articles Dataset from [Corney et al. (2016)](http://ceur-ws.org/Vol-1568/paper8.pdf). 84 | - IMDb Dataset - a dataset, that consists of short movie reviews. The task is to determine whether a review holds a positive or negative sentiment. 85 | 86 | All datasets are part of the [Knodle](https://github.com/knodle/knodle) framework and can be downloaded [here](https://knodle.cc/minio/knodle/). 
87 | 88 | 89 | 90 | 91 | --- 92 | ## Citation 📑 93 | 94 | When using our work please cite our ACL Anthology print: 95 | 96 | ``` 97 | @inproceedings{marz-etal-2021-knowman, 98 | title = "{K}now{MAN}: Weakly Supervised Multinomial Adversarial Networks", 99 | author = {M{\"a}rz, Luisa and 100 | Asgari, Ehsaneddin and 101 | Braune, Fabienne and 102 | Zimmermann, Franziska and 103 | Roth, Benjamin}, 104 | booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", 105 | month = nov, 106 | year = "2021", 107 | address = "Online and Punta Cana, Dominican Republic", 108 | publisher = "Association for Computational Linguistics", 109 | url = "https://aclanthology.org/2021.emnlp-main.751", 110 | pages = "9549--9557", 111 | abstract = "The absence of labeled data for training neural models is often addressed by leveraging knowledge about the specific task, resulting in heuristic but noisy labels. The knowledge is captured in labeling functions, which detect certain regularities or patterns in the training samples and annotate corresponding labels for training. This process of weakly supervised training may result in an over-reliance on the signals captured by the labeling functions and hinder models to exploit other signals or to generalize well. We propose KnowMAN, an adversarial scheme that enables to control influence of signals associated with specific labeling functions. KnowMAN forces the network to learn representations that are invariant to those signals and to pick up other signals that are more generally associated with an output label. KnowMAN strongly improves results compared to direct weakly supervised learning with a pre-trained transformer language model and a feature-based baseline.", 112 | } 113 | 114 | ``` 115 | 116 | ## Acknowledgments 💎 117 | 118 | This research was funded by the WWTF through the project “Knowledge-infused Deep Learning for Natural Language Processing” (WWTF Vienna Research Group VRG19-008). 
def per_step_classifier_tb_logging(log, writer,
                                   tmp_dev_classification_report_dict, tmp_test_classification_report_dict,
                                   total_steps, batches_between_logging, classification_report_dict,
                                   avg_train_losses_classifier, avg_train_losses_domain_blurrer,
                                   avg_train_losses_classifier_dom_blurrer, log_dev_test=True, use_accuracy=True):
    """Log per-step training metrics to the logger and the TensorBoard writer.

    Args:
        log: logger used for plain-text progress messages.
        writer: TensorBoard SummaryWriter (anything exposing ``add_scalar``).
        tmp_dev_classification_report_dict: sklearn classification-report dict
            for the dev set, or None when no dev data is available.
        tmp_test_classification_report_dict: sklearn classification-report dict
            for the test set, or None.
        total_steps: global step counter used as the TensorBoard x-axis.
        batches_between_logging: number of batches the averaged losses cover
            (used in log text only).
        classification_report_dict: classification-report dict computed on the
            training predictions.
        avg_train_losses_classifier: mean classifier loss over the last batches.
        avg_train_losses_domain_blurrer: mean domain-blurrer loss over the last batches.
        avg_train_losses_classifier_dom_blurrer: mean combined loss over the last batches.
        log_dev_test: when True, also log dev/test metrics.
        use_accuracy: when True log accuracy; otherwise log precision/recall/f1
            of class '1' (binary setup).

    Returns:
        The (mutated) writer, so the training loop can re-bind it.
    """
    log.info(f'Training loss classifier over the last {batches_between_logging} batches: {avg_train_losses_classifier}')
    log.info(
        f'Training loss domain blurrer over the last {batches_between_logging} '
        f'batches: {avg_train_losses_domain_blurrer}')
    log.info(
        f'Training loss classifier and domain blurrer together over the last {batches_between_logging} '
        f'batches: {avg_train_losses_classifier_dom_blurrer}')
    writer.add_scalar(
        'Training loss classifier accumulated over last batches', avg_train_losses_classifier, total_steps)
    writer.add_scalar(
        'Training loss domain blurrer accumulated over last batches', avg_train_losses_domain_blurrer, total_steps)
    writer.add_scalar(
        'Training loss classifier and domain blurrer together accumulated over last batches',
        avg_train_losses_classifier_dom_blurrer, total_steps)

    if use_accuracy:
        log.info('Classification Training accuracy: {}%'.format(classification_report_dict["accuracy"]))
        writer.add_scalar("Classification accuracy train", classification_report_dict["accuracy"], total_steps)
        if log_dev_test:
            if tmp_dev_classification_report_dict is not None:
                log.info(f'Average validation accuracy: {100.0 * tmp_dev_classification_report_dict["accuracy"]}%')
                writer.add_scalar("Classification accuracy dev", tmp_dev_classification_report_dict["accuracy"],
                                  total_steps)
            # Fix: guard the test report like the dev report instead of crashing
            # with a TypeError when no test metrics were computed yet.
            if tmp_test_classification_report_dict is not None:
                log.info(f'Average test accuracy: {100.0 * tmp_test_classification_report_dict["accuracy"]}%')

    else:
        log.info('Classification Training weighted avg f1 score: '
                 '{}%'.format(classification_report_dict['1']["f1-score"]))
        writer.add_scalar("Classification weighted-f1 train", classification_report_dict['1']["f1-score"], total_steps)
        writer.add_scalar("Classification prec (1) train", classification_report_dict['1']["precision"],
                          total_steps)
        writer.add_scalar("Classification recall (1) train", classification_report_dict['1']["recall"],
                          total_steps)
        if log_dev_test:
            # Fix: same None-guard as above for the test report.
            if tmp_test_classification_report_dict is not None:
                log.info(f'Average test weighted-f1: {100.0 * tmp_test_classification_report_dict["1"]["f1-score"]}%')
            if tmp_dev_classification_report_dict is not None:
                log.info(f'Average validation weighted-f1: '
                         f'{100.0 * tmp_dev_classification_report_dict["1"]["f1-score"]}%')
                writer.add_scalar("Classification weighted-f1 dev", tmp_dev_classification_report_dict['1']["f1-score"],
                                  total_steps)
                writer.add_scalar("Classification prec (1) dev",
                                  tmp_dev_classification_report_dict['1']["precision"], total_steps)
                writer.add_scalar("Classification recall (1) dev",
                                  tmp_dev_classification_report_dict['1']["recall"], total_steps)

    return writer


def per_epoch_classifier_tb_logging(writer, classification_report_dict, tmp_dev_classification_report_dict,
                                    tmp_test_classification_report_dict, avg_train_losses_classifier_per_epoch,
                                    avg_train_losses_domain_blurrer_per_epoch,
                                    avg_train_losses_classifier_dom_blurrer_per_epoch, epoch):
    """Write per-epoch train/dev/test metrics and averaged losses to TensorBoard.

    Unlike the per-step variant, this expects all three report dicts to be
    present (it is called at epoch boundaries where dev/test were evaluated).
    Returns the writer for re-binding by the caller.
    """
    writer.add_scalar("Per epoch Classification accuracy train", classification_report_dict["accuracy"], epoch)
    writer.add_scalar("Per epoch Classification weighted-f1 train", classification_report_dict['1']["f1-score"],
                      epoch)
    writer.add_scalar("Per epoch Classification prec (1) train", classification_report_dict['1']["precision"],
                      epoch)
    writer.add_scalar("Per epoch Classification recall (1) train", classification_report_dict['1']["recall"],
                      epoch)

    writer.add_scalar("Per epoch Classification accuracy dev", tmp_dev_classification_report_dict["accuracy"],
                      epoch)
    writer.add_scalar("Per epoch Classification accuracy test", tmp_test_classification_report_dict["accuracy"],
                      epoch)
    writer.add_scalar("Per epoch Classification weighted-f1 dev",
                      tmp_dev_classification_report_dict['1']["f1-score"],
                      epoch)
    writer.add_scalar("Per epoch Classification weighted-f1 test",
                      tmp_test_classification_report_dict['1']["f1-score"],
                      epoch)
    writer.add_scalar("Per epoch Classification prec (1) dev",
                      tmp_dev_classification_report_dict['1']["precision"], epoch)
    writer.add_scalar("Per epoch Classification prec (1) test",
                      tmp_test_classification_report_dict['1']["precision"], epoch)
    writer.add_scalar("Per epoch Classification recall (1) dev",
                      tmp_dev_classification_report_dict['1']["recall"], epoch)
    writer.add_scalar("Per epoch Classification recall (1) test",
                      tmp_test_classification_report_dict['1']["recall"], epoch)
    writer.add_scalar(
        'Per epoch Training loss classifier accumulated over last batches', avg_train_losses_classifier_per_epoch,
        epoch)
    writer.add_scalar(
        'Per epoch Training loss domain blurrer accumulated over last batches',
        avg_train_losses_domain_blurrer_per_epoch, epoch)
    writer.add_scalar(
        'Per epoch Training loss classifier and domain blurrer together accumulated over last batches',
        avg_train_losses_classifier_dom_blurrer_per_epoch, epoch)
    return writer
class MlpFeatureExtractor(nn.Module):
    """Feed-forward feature extractor: stacked Linear(+BN)+ReLU blocks.

    Builds one block per entry in ``hidden_sizes`` followed by a final
    projection to ``output_size``; each block is optionally preceded by
    dropout and followed by batch normalisation. A ReLU caps the output.
    """

    def __init__(self,
                 input_size,
                 hidden_sizes,
                 output_size,
                 dropout,
                 batch_norm=False):
        super(MlpFeatureExtractor, self).__init__()
        self.hidden_sizes = hidden_sizes
        self.net = nn.Sequential()
        dims = [input_size] + list(hidden_sizes)
        # Hidden blocks: (dropout) -> linear -> (batch norm) -> ReLU.
        for idx, (fan_in, fan_out) in enumerate(zip(dims, dims[1:])):
            if dropout > 0:
                self.net.add_module(f'f-dropout-{idx}', nn.Dropout(p=dropout))
            self.net.add_module(f'f-linear-{idx}', nn.Linear(fan_in, fan_out))
            if batch_norm:
                self.net.add_module(f'f-bn-{idx}', nn.BatchNorm1d(fan_out))
            self.net.add_module(f'f-relu-{idx}', nn.ReLU())

        # Final projection mirrors the structure of the hidden blocks.
        if dropout > 0:
            self.net.add_module('f-dropout-final', nn.Dropout(p=dropout))
        self.net.add_module('f-linear-final', nn.Linear(hidden_sizes[-1], output_size))
        if batch_norm:
            self.net.add_module('f-bn-final', nn.BatchNorm1d(output_size))
        self.net.add_module('f-relu-final', nn.ReLU())

    def forward(self, input):
        """Map a (batch, input_size) tensor to (batch, output_size) features."""
        return self.net(input)


class SentimentClassifier(nn.Module):
    """Label classifier head that emits log-probabilities (LogSoftmax)."""

    def __init__(self,
                 num_layers,
                 input_size,
                 hidden_size,
                 output_size,
                 dropout,
                 batch_norm=False):
        super(SentimentClassifier, self).__init__()
        assert num_layers >= 0, 'Invalid layer numbers'
        self.hidden_size = hidden_size
        self.net = nn.Sequential()
        for idx in range(num_layers):
            if dropout > 0:
                self.net.add_module(f'p-dropout-{idx}', nn.Dropout(p=dropout))
            # First layer maps from the input width, later ones stay square.
            fan_in = input_size if idx == 0 else hidden_size
            self.net.add_module(f'p-linear-{idx}', nn.Linear(fan_in, hidden_size))
            if batch_norm:
                self.net.add_module(f'p-bn-{idx}', nn.BatchNorm1d(hidden_size))
            self.net.add_module(f'p-relu-{idx}', nn.ReLU())

        self.net.add_module('p-linear-final', nn.Linear(hidden_size, output_size))
        self.net.add_module('p-logsoftmax', nn.LogSoftmax(dim=-1))

    def forward(self, input):
        """Return per-class log-probabilities for a (batch, input_size) tensor."""
        return self.net(input)


class DomainClassifier(nn.Module):
    """Adversarial domain discriminator over ``num_domains`` domains.

    For 'gr'/'bs' loss types the head ends in LogSoftmax; for 'l2' the raw
    scores are instead ReLU-clipped and L1-normalised inside ``forward``.
    """

    def __init__(self,
                 num_layers,
                 input_size,
                 hidden_size,
                 num_domains,
                 loss_type,
                 dropout,
                 batch_norm=False):
        super(DomainClassifier, self).__init__()
        assert num_layers >= 0, 'Invalid layer numbers'
        self.num_domains = num_domains
        self.loss_type = loss_type
        self.net = nn.Sequential()
        for idx in range(num_layers):
            if dropout > 0:
                self.net.add_module(f'q-dropout-{idx}', nn.Dropout(p=dropout))
            fan_in = input_size if idx == 0 else hidden_size
            self.net.add_module(f'q-linear-{idx}', nn.Linear(fan_in, hidden_size))
            if batch_norm:
                self.net.add_module(f'q-bn-{idx}', nn.BatchNorm1d(hidden_size))
            self.net.add_module(f'q-relu-{idx}', nn.ReLU())

        self.net.add_module('q-linear-final', nn.Linear(hidden_size, num_domains))
        if loss_type.lower() in ('gr', 'bs'):
            self.net.add_module('q-logsoftmax', nn.LogSoftmax(dim=-1))

    def forward(self, input):
        scores = self.net(input)
        if self.loss_type.lower() == 'l2':
            # Clip negatives, then normalise rows into a probability-like vector.
            scores = functional.relu(scores)
            scores = scores / torch.sum(scores, dim=1, keepdim=True)
        return scores
self.is_linear = linear 114 | if not linear: 115 | self.relu = nn.ReLU() 116 | 117 | def forward(self, x): 118 | projection = self.linear_layer(x) 119 | if self.is_linear: 120 | return projection 121 | 122 | features = self.relu(projection) 123 | return features 124 | 125 | 126 | class Classifier(nn.Module): 127 | def __init__(self, feature_dim, num_classes, linear=False): 128 | super(Classifier, self).__init__() 129 | self.feature_dim = feature_dim 130 | self.num_classes = num_classes 131 | self.linear_layer = nn.Linear(feature_dim, num_classes) 132 | self.is_linear = linear 133 | if not self.is_linear: 134 | self.relu = nn.ReLU() 135 | 136 | def forward(self, x): 137 | projection = self.linear_layer(x) 138 | if self.is_linear: 139 | return projection 140 | classification = self.relu(projection) 141 | return classification 142 | 143 | 144 | class RobertaKnowMANFeatureExt(RobertaForSequenceClassification): 145 | """ 146 | Roberta feature extractor 147 | """ 148 | 149 | def __init__(self, config, dropout=0.5, out_size=100): 150 | super().__init__(config) 151 | self.KnowMANFeat_layer = TransformerHeadFeature(config, dropout, out_size) 152 | 153 | def forward( 154 | self, 155 | input_ids=None, 156 | attention_mask=None, 157 | token_type_ids=None, 158 | position_ids=None, 159 | head_mask=None, 160 | inputs_embeds=None, 161 | output_attentions=None, 162 | output_hidden_states=None, 163 | return_dict=None, 164 | ): 165 | 166 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 167 | 168 | outputs = self.roberta( 169 | input_ids=input_ids, 170 | attention_mask=attention_mask, 171 | token_type_ids=token_type_ids, 172 | position_ids=position_ids, 173 | head_mask=head_mask, 174 | inputs_embeds=inputs_embeds, 175 | output_attentions=output_attentions, 176 | output_hidden_states=output_hidden_states, 177 | return_dict=return_dict, 178 | ) 179 | sequence_output = outputs[0] 180 | return self.KnowMANFeat_layer(sequence_output) 181 | 182 | 183 | 
class DistilbertKnowMANFeatureExt(DistilBertForSequenceClassification): 184 | """ 185 | Roberta feature extractor 186 | """ 187 | 188 | def __init__(self, config, drop_out=0.5, out_size=100): 189 | super().__init__(config) 190 | self.dropout = nn.Dropout(drop_out) 191 | self.pre_classifier = nn.Linear(config.dim, out_size) 192 | 193 | def forward( 194 | self, 195 | input_ids=None, 196 | attention_mask=None, 197 | head_mask=None, 198 | inputs_embeds=None, 199 | output_attentions=None, 200 | output_hidden_states=None, 201 | return_dict=None, 202 | ): 203 | 204 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 205 | 206 | outputs = self.distilbert( 207 | input_ids=input_ids, 208 | attention_mask=attention_mask, 209 | head_mask=head_mask, 210 | inputs_embeds=inputs_embeds, 211 | output_attentions=output_attentions, 212 | output_hidden_states=output_hidden_states, 213 | return_dict=return_dict, 214 | ) 215 | hidden_state = outputs[0] # (bs, seq_len, dim) 216 | pooled_output = hidden_state[:, 0] # (bs, dim) 217 | pooled_output = self.pre_classifier(pooled_output) # (bs, dim) 218 | pooled_output = nn.ReLU()(pooled_output) # (bs, dim) 219 | pooled_output = self.dropout(pooled_output) # (bs, dim) 220 | return pooled_output 221 | -------------------------------------------------------------------------------- /KnowMan/data_prep/get_knodle_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from joblib import load 5 | import scipy.sparse as sp 6 | from typing import Union, Tuple, List 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from torch import Tensor, LongTensor 9 | from torch.utils.data import TensorDataset 10 | from KnowMan.utils.vocab import KnowMANDataSet 11 | from KnowMan.utils.knowman_utils import FeatureExtraction, TransformerUtil 12 | 13 | path = os.path.dirname(__file__) 14 | sys.path.append(path+'/../') 15 | 
def get_data(target_path: str, if_dev_data: bool = True):
    """Load a pre-processed knodle dataset from ``target_path``.

    Loads the train/test (and optionally dev) dataframes, the instance-to-rule
    match matrices Z and the rule-to-label mapping matrix T, all serialized
    with joblib.

    :param target_path: directory containing the ``*.lib`` joblib dumps.
    :param if_dev_data: when True also load the dev split.
    :return: (train_df, dev_df_or_None, test_df, train_z, dev_z_or_None,
        test_z, mapping_rules_labels_t)
    """
    train_df = load(os.path.join(target_path, 'df_train.lib'))
    test_df = load(os.path.join(target_path, 'df_test.lib'))

    # The imdb dumps store Z as sparse matrices; densify them for downstream code.
    if "imdb" in target_path:
        train_rule_matches_z = load(os.path.join(target_path, 'train_rule_matches_z.lib')).toarray()
        test_rule_matches_z = load(os.path.join(target_path, 'test_rule_matches_z.lib')).toarray()
    else:
        train_rule_matches_z = load(os.path.join(target_path, 'train_rule_matches_z.lib'))
        test_rule_matches_z = load(os.path.join(target_path, 'test_rule_matches_z.lib'))

    mapping_rules_labels_t = load(os.path.join(target_path, 'mapping_rules_labels_t.lib'))

    if if_dev_data:
        dev_df = load(os.path.join(target_path, 'df_dev.lib'))
        if "imdb" in target_path:
            dev_rule_matches_z = load(os.path.join(target_path, 'dev_rule_matches_z.lib')).toarray()
        else:
            dev_rule_matches_z = load(os.path.join(target_path, 'dev_rule_matches_z.lib'))
        return train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \
            mapping_rules_labels_t

    return train_df, None, test_df, train_rule_matches_z, None, test_rule_matches_z, mapping_rules_labels_t


def get_tfidf_features(
        train_data: List, test_data: List = None, dev_data: List = None
) -> Union[Tuple[np.ndarray, np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, None]]:
    """Convert input data to matrices of TF-IDF features.

    The vectorizer is fitted on the training samples only; test/dev samples are
    transformed with the fitted vocabulary.

    :param train_data: list of training samples to fit and encode.
    :param test_data: optional list of test samples to encode.
    :param dev_data: optional list of development samples to encode.
    :return: (train_matrix, test_matrix_or_None, dev_matrix_or_None) as sparse
        TF-IDF matrices.
    """
    dev_transformed_data, test_transformed_data = None, None
    vectorizer = TfidfVectorizer()

    train_transformed_data = vectorizer.fit_transform(train_data)
    if test_data is not None:
        test_transformed_data = vectorizer.transform(test_data)
    if dev_data is not None:
        dev_transformed_data = vectorizer.transform(dev_data)
    return train_transformed_data, test_transformed_data, dev_transformed_data


def z_t_matrices_to_majority_vote_probs(
        rule_matches_z: np.ndarray, mapping_rules_labels_t: np.ndarray, other_class: int = None
) -> np.ndarray:
    """Compute per-instance majority-vote label probabilities from Z and T.

    Instances where no rule matched (an all-zero row in ``rule_matches_z``)
    get probability 1 for ``other_class`` when it is given, and an all-zero
    probability row otherwise.

    Args:
        rule_matches_z: Binary encoded array of which rules matched. Shape: instances x rules
        mapping_rules_labels_t: Mapping of rules to labels, binary encoded. Shape: rules x classes
        other_class: Class which is chosen, if no function is hitting.
    Returns: Array with majority vote probabilities. Shape: instances x classes
    """
    # Fix: ``warnings`` is used below but never imported at module level.
    import warnings

    if rule_matches_z.shape[1] != mapping_rules_labels_t.shape[0]:
        raise ValueError(f"Dimensions mismatch! Z matrix has shape {rule_matches_z.shape}, while "
                         f"T matrix has shape {mapping_rules_labels_t.shape}")

    if isinstance(rule_matches_z, sp.csr_matrix):
        rule_counts = rule_matches_z.dot(mapping_rules_labels_t)
        # Fix: ``csr.dot(dense)`` already yields a dense array; only densify
        # when the product is itself sparse (both inputs sparse).
        if sp.issparse(rule_counts):
            rule_counts = rule_counts.toarray()
    else:
        rule_counts = np.matmul(rule_matches_z, mapping_rules_labels_t)

    # Fix: ``if other_class:`` silently ignored a legitimate other_class of 0.
    if other_class is not None:
        if other_class < 0:
            raise RuntimeError("Label for negative samples should be greater than 0 for correct matrix multiplication")
        if other_class < mapping_rules_labels_t.shape[1] - 1:
            warnings.warn(f"Negative class {other_class} is already present in data")
        if rule_counts.shape[1] == other_class:
            # Append a fresh column for the other class.
            rule_counts = np.hstack((rule_counts, np.zeros([rule_counts.shape[0], 1])))
            rule_counts[~rule_counts.any(axis=1), other_class] = 1
        elif rule_counts.shape[1] >= other_class:
            rule_counts[~rule_counts.any(axis=1), other_class] = 1
        else:
            raise ValueError("Other class id is incorrect")
    rule_counts_probs = rule_counts / rule_counts.sum(axis=1).reshape(-1, 1)
    # Rows with zero total counts divide 0/0 -> NaN; map those to 0.
    rule_counts_probs[np.isnan(rule_counts_probs)] = 0
    return rule_counts_probs
def probabilities_to_majority_vote(
        probs: np.ndarray, choose_random_label: bool = False, other_class_id: int = None,
        multiple_instances: bool = False
) -> int:
    """Transforms a vector of probabilities to its majority vote. If there is one class with clear majority, return it.
    If there are more than one class with equal probabilities: either select one of the classes randomly, return a
    vector containing all of them or assign to the sample the other class id.

    Args:
        probs: Vector of probabilities for 1 sample. Shape: classes x 1
        choose_random_label: Choose a random label, if there's no clear majority.
        other_class_id: Class ID being used, if there's no clear majority
        multiple_instances: Return duplicated instances with labels, if there are several maxima.
    Returns: A class index (or an array of indices when ``multiple_instances``).
    """
    if choose_random_label and other_class_id is not None:
        raise ValueError("You can either choose a random class, or transform undefined cases to an other class.")
    if choose_random_label and multiple_instances:
        raise ValueError("You can either choose a random class, or create multiple instances with multiple classes.")

    row_max = np.max(probs)
    num_occurrences = (row_max == probs).sum()
    if num_occurrences == 1:
        return int(np.argmax(probs))
    elif choose_random_label:
        max_ids = np.where(probs == row_max)[0]
        return int(np.random.choice(max_ids))
    elif multiple_instances:
        return np.where(probs == row_max)[0]
    elif other_class_id is not None:
        return other_class_id
    else:
        raise ValueError("Specify a way how to resolve unclear majority votes.")


def probabilies_to_majority_class_label(
        probs: np.ndarray, choose_random_label: bool = False, other_class_id: int = None,
        multiple_instances: bool = False,
) -> int:
    """Alias of :func:`probabilities_to_majority_vote` (name kept, incl. its
    typo, for backward compatibility with existing callers).

    Fix: the original body was a byte-for-byte copy of
    ``probabilities_to_majority_vote``; it now delegates so the tie-breaking
    logic lives in exactly one place.
    """
    return probabilities_to_majority_vote(
        probs,
        choose_random_label=choose_random_label,
        other_class_id=other_class_id,
        multiple_instances=multiple_instances,
    )
196 | """ 197 | if choose_random_rule and multiple_instances: 198 | raise ValueError("You can either choose a random rule, or create multiple instances with multiple rules.") 199 | 200 | row_max = np.max(rules) 201 | num_occurrences = (row_max == rules).sum() 202 | if num_occurrences == 1: 203 | return int(np.argmax(rules)) 204 | elif choose_random_rule: 205 | max_ids = np.where(rules == row_max)[0] 206 | return int(np.random.choice(max_ids)) 207 | elif multiple_instances: 208 | return np.where(rules == row_max)[0] 209 | else: 210 | raise ValueError("Specify a way how to resolve multiple rule matches.") 211 | 212 | 213 | def get_dataset(data_path: str, use_tfidf: bool = True, if_dev_data: bool = True): 214 | # first, the data is read from the file 215 | train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \ 216 | mapping_rules_labels_t = get_data(data_path, if_dev_data=if_dev_data) 217 | if "tac" in data_path: 218 | val = "samples" 219 | else: 220 | val = "sample" 221 | 222 | if use_tfidf: 223 | # For the LogReg model we encode train samples with TF-IDF features. 
def _sample_column(data_path: str) -> str:
    """Return the dataframe column holding raw text: 'samples' for TAC dumps, 'sample' otherwise."""
    return "samples" if "tac" in data_path else "sample"


def get_dataset(data_path: str, use_tfidf: bool = True, if_dev_data: bool = True):
    """Build TensorDatasets (features, noisy labels, adversarial rule labels)
    from a knodle dump.

    Train labels come from the Z/T majority vote (random tie-break); the
    adversarial labels are the index of a matching rule per instance.

    :param data_path: directory with the joblib dataset dumps.
    :param use_tfidf: encode samples as dense TF-IDF tensors (LogReg setup).
    :param if_dev_data: when True also build and return a dev dataset.
    :return: (train, dev, test) datasets, or (train, test) without dev data.
    """
    # first, the data is read from the file
    train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \
        mapping_rules_labels_t = get_data(data_path, if_dev_data=if_dev_data)

    # Fix: the "tac" -> "samples" column selection was duplicated across both
    # dataset builders; it is now shared via _sample_column.
    val = _sample_column(data_path)

    if use_tfidf:
        # For the LogReg model we encode train samples with TF-IDF features.
        if if_dev_data:
            train_tfidf, test_tfidf, dev_tfidf = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist(),
                                                                    dev_df[val].tolist())
        else:
            train_tfidf, test_tfidf, _ = get_tfidf_features(train_df[val].tolist(), test_df[val].tolist())

        train_set = Tensor(train_tfidf.toarray())
        test_set = Tensor(test_tfidf.toarray())
    else:
        # NOTE(review): this branch keeps plain Python lists, which
        # TensorDataset below cannot wrap — presumably only the tfidf path is
        # used with this function. TODO confirm.
        train_set = train_df[val].tolist()
        test_set = test_df[val].tolist()

    # Gold labels are assumed to sit in the second dataframe column.
    test_labels = LongTensor(list(test_df.iloc[:, 1]))
    train_probs = z_t_matrices_to_majority_vote_probs(train_rule_matches_z, mapping_rules_labels_t)
    train_labels = LongTensor(np.apply_along_axis(probabilities_to_majority_vote, axis=1, arr=train_probs,
                                                  choose_random_label=True))
    train_adv_labels = LongTensor(np.apply_along_axis(z_matrix_to_rule_idx, axis=1,
                                                      arr=train_rule_matches_z, choose_random_rule=True))

    train_dataset = TensorDataset(train_set, train_labels, train_adv_labels)
    test_dataset = TensorDataset(test_set, test_labels)

    if if_dev_data:
        if use_tfidf:
            dev_set = Tensor(dev_tfidf.toarray())
        else:
            dev_set = dev_df[val].tolist()
        dev_labels = LongTensor(list(dev_df.iloc[:, 1]))
        dev_adv_labels = LongTensor(np.apply_along_axis(z_matrix_to_rule_idx, axis=1,
                                                        arr=dev_rule_matches_z, choose_random_rule=True))
        dev_dataset = TensorDataset(dev_set, dev_labels, dev_adv_labels)
        return train_dataset, dev_dataset, test_dataset
    else:
        return train_dataset, test_dataset


def get_transformer_dataset(data_path: str, feature_ext: FeatureExtraction = FeatureExtraction.DISTILBERT,
                            if_dev_data: bool = True, max_length_transformer=200):
    """Build KnowMANDataSets with transformer-tokenized inputs from a knodle dump.

    Mirrors :func:`get_dataset` but tokenizes raw text with the tokenizer for
    ``feature_ext`` instead of computing TF-IDF features.

    :param data_path: directory with the joblib dataset dumps.
    :param feature_ext: which transformer backbone's tokenizer to use.
    :param if_dev_data: when True also build and return a dev dataset.
    :param max_length_transformer: max sequence length passed to KnowMANDataSet.
    :return: (train, dev, test) datasets, or (train, test) without dev data.
    """
    # first, the data is read from the file
    train_df, dev_df, test_df, train_rule_matches_z, dev_rule_matches_z, test_rule_matches_z, \
        mapping_rules_labels_t = get_data(data_path, if_dev_data=if_dev_data)

    val = _sample_column(data_path)

    tokenizer = TransformerUtil.get_tokenizer(feature_ext)
    train_set = tokenizer(train_df[val].tolist(), truncation=True, padding='max_length')
    test_set = tokenizer(test_df[val].tolist(), truncation=True, padding='max_length')

    test_labels = LongTensor(list(test_df.iloc[:, 1]))
    train_probs = z_t_matrices_to_majority_vote_probs(train_rule_matches_z, mapping_rules_labels_t)
    train_labels = LongTensor(np.apply_along_axis(probabilities_to_majority_vote, axis=1, arr=train_probs,
                                                  choose_random_label=True))
    train_adv_labels = LongTensor(np.apply_along_axis(z_matrix_to_rule_idx, axis=1,
                                                      arr=train_rule_matches_z, choose_random_rule=True))

    train_dataset = KnowMANDataSet(train_set, train_labels, train_adv_labels, max_length=max_length_transformer)
    test_dataset = KnowMANDataSet(test_set, test_labels, max_length=max_length_transformer)

    if if_dev_data:
        dev_set = tokenizer(dev_df[val].tolist(), truncation=True, padding='max_length')

        dev_labels = LongTensor(list(dev_df.iloc[:, 1]))
        dev_adv_labels = LongTensor(np.apply_along_axis(z_matrix_to_rule_idx, axis=1,
                                                        arr=dev_rule_matches_z, choose_random_rule=True))

        dev_dataset = KnowMANDataSet(dev_set, dev_labels, dev_adv_labels, max_length=max_length_transformer)

        return train_dataset, dev_dataset, test_dataset
    else:
        return train_dataset, test_dataset
defaultdict 14 | from torch.nn import functional 15 | from torch.utils.data import DataLoader 16 | from torchnet.meter import ConfusionMeter 17 | from sklearn.metrics import classification_report 18 | 19 | from KnowMan.utils.knowman_utils import freeze_net, unfreeze_net 20 | from KnowMan.models.nn_models import MlpFeatureExtractor, SentimentClassifier, DomainClassifier 21 | from KnowMan.data_prep.get_knodle_dataset import get_dataset 22 | from KnowMan.utils.logging_utils import per_step_classifier_tb_logging 23 | from KnowMan.utils.knowman_parameters import KnowManParameters 24 | 25 | 26 | def train(train_set, test_set, params, log): 27 | """ 28 | train_set, dev_set, test_set: raw_datasets from corpus 29 | """ 30 | if params.training_setting["use_tensorboard"]: 31 | from torch.utils.tensorboard import SummaryWriter 32 | writer = SummaryWriter(log_dir=params.training_setting["tensorboard_dir"], 33 | comment='Using lambda {}'.format(params.model_params["lambd"])) 34 | print(f"tensorboard logging path is {params.training_setting['tensorboard_dir']}") 35 | 36 | train_loader = DataLoader(train_set, params.training_setting["batch_size"], shuffle=True) 37 | test_loader = DataLoader(test_set, params.training_setting["batch_size"], shuffle=True) 38 | 39 | F_s = None 40 | C, D = None, None 41 | 42 | F_s = MlpFeatureExtractor(params.model_params["feature_num"], params.model_params["F_hidden_sizes"], 43 | params.model_params["shared_hidden_size"], params.model_params["dropout"], 44 | params.model_params["F_bn"]) 45 | C = SentimentClassifier(params.model_params["C_layers"], params.model_params["shared_hidden_size"] + 46 | params.model_params["domain_hidden_size"], 47 | params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"], 48 | params.model_params["num_labels"], params.model_params["dropout"], 49 | params.model_params["C_bn"]) 50 | D = DomainClassifier(params.model_params["D_layers"], params.model_params["shared_hidden_size"], 51 | 
params.model_params["shared_hidden_size"], params.model_params["all_domains"], 52 | params.model_params["loss"], params.model_params["dropout"], params.model_params["D_bn"]) 53 | 54 | F_s, C, D = F_s.to(params.training_setting["device"]), \ 55 | C.to(params.training_setting["device"]), \ 56 | D.to(params.training_setting["device"]) 57 | 58 | optimizer = optim.Adam(itertools.chain(*map(list, [F_s.parameters() if F_s else [], C.parameters()])), 59 | lr=params.training_setting["learning_rate"]) 60 | optimizerD = optim.Adam(D.parameters(), lr=params.training_setting["D_learning_rate"]) 61 | 62 | # testing 63 | if params.training_setting["test_only"]: 64 | log.info(f'Loading model from {params.training_setting["model_save_file"]}...') 65 | F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netF_s.pth'))) 66 | C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netC.pth'))) 67 | D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netD.pth'))) 68 | 69 | log.info('Evaluating test sets:') 70 | test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True) 71 | log.info(f'Average test accuracy: {test_classification_report_dict["accuracy"]}') 72 | 73 | y_true = np.ndarray.tolist(y_true) 74 | y_pred = np.ndarray.tolist(y_pred) 75 | 76 | with open("../../KnowMan/save/labels_best_spam_tfidf.csv", "w")as out: 77 | for i in range(len(y_true)): 78 | out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n") 79 | 80 | print({'test': test_classification_report_dict["accuracy"]}) 81 | 82 | # training 83 | else: 84 | best_avg_acc, best_acc = defaultdict(float), 0.0 85 | batches_between_logging = params.training_setting["batches_between_logging"] 86 | evaluate_after_batches_between_logging = params.training_setting["evaluate_after_batches_between_logging"] 87 | num_training_items = len(train_loader) 88 | print(f"Number of training 
batches: {num_training_items}") 89 | total_steps = 0 90 | avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0 91 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 92 | classification_report_dict = evaluate(train_loader, F_s, C, params, log) 93 | 94 | for epoch in range(params.training_setting["max_epoch"]): 95 | per_epoch_discr_loss_collection = [] 96 | per_epoch_train_losses_classifier = [] 97 | per_epoch_train_losses_domain_blurrer = [] 98 | per_epoch_train_losses_classifier_dom_blurrer = [] 99 | for group in optimizer.param_groups: 100 | learning_rate = group["lr"] 101 | writer.add_scalar("learning_rate", learning_rate, epoch) 102 | for group in optimizerD.param_groups: 103 | learning_rate = group["lr"] 104 | writer.add_scalar("learning_rate_D", learning_rate, epoch) 105 | 106 | F_s.train() 107 | C.train() 108 | D.train() 109 | 110 | # D iteration 111 | d_correct, d_total = 0, 0 112 | 113 | freeze_net(F_s) 114 | freeze_net(C) 115 | unfreeze_net(D) 116 | 117 | for critic_loop_index in range(params.model_params["n_critic"]): 118 | discr_loss_collection = [] 119 | for i, (inputs, labels, adv_labels) in enumerate(train_loader): 120 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]), \ 121 | labels.to(params.training_setting["device"]), \ 122 | adv_labels.to(params.training_setting["device"]) 123 | 124 | shared_feat = F_s(inputs) 125 | outputs = D(shared_feat) 126 | 127 | _, pred = torch.max(outputs, 1) 128 | d_total += len(inputs) 129 | 130 | d_correct += (pred == adv_labels).sum().item() 131 | l_d = functional.nll_loss(outputs, adv_labels) 132 | discr_loss_collection.append(l_d.item()) 133 | per_epoch_discr_loss_collection.append(l_d.item()) 134 | l_d.backward() 135 | optimizerD.step() 136 | D.zero_grad() 137 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1: 138 | avg_loss_dom_discr = 
np.mean(np.array(discr_loss_collection)) 139 | discr_loss_collection = [] 140 | log.info("Discriminator training Epoch {}, " 141 | "critic_loop_index {}, Step {}...".format(epoch, critic_loop_index, i)) 142 | log.info(f'Training loss domain discriminator over the last {batches_between_logging} ' 143 | f'batches: {avg_loss_dom_discr}') 144 | writer.add_scalar('Training loss domain discriminator accumulated over last batches', 145 | avg_loss_dom_discr, total_steps) 146 | if d_total > 0: 147 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 148 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps) 149 | 150 | writer = per_step_classifier_tb_logging(log, writer, None, tmp_test_classification_report_dict, 151 | total_steps, batches_between_logging, 152 | classification_report_dict, 153 | avg_train_losses_classifier, 154 | avg_train_losses_domain_blurrer, 155 | avg_train_losses_classifier_dom_blurrer, 156 | log_dev_test=True, use_accuracy=True) 157 | total_steps += 1 158 | 159 | # F&C iteration 160 | unfreeze_net(F_s) 161 | unfreeze_net(C) 162 | freeze_net(D) 163 | train_losses_classifier = [] 164 | train_losses_domain_blurrer = [] 165 | train_losses_classifier_dom_blurrer = [] 166 | all_c_pred = [] 167 | all_labels = [] 168 | for i, (inputs, labels, adv_labels) in enumerate(train_loader): 169 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]), \ 170 | labels.to(params.training_setting["device"]), \ 171 | adv_labels.to(params.training_setting["device"]) 172 | shared_feat = F_s(inputs) 173 | c_outputs = C(shared_feat) 174 | d_outputs = D(shared_feat) 175 | 176 | _, c_pred = torch.max(c_outputs, 1) 177 | all_c_pred.append(c_pred.cpu().numpy()) 178 | all_labels.append(labels.cpu().numpy()) 179 | l_c = functional.nll_loss(c_outputs, labels) 180 | train_losses_classifier.append(l_c.item()) 181 | train_losses_domain_blurrer.append(-l_d.item()) 182 | 
per_epoch_train_losses_classifier.append(l_c.item()) 183 | per_epoch_train_losses_domain_blurrer.append(-l_d.item()) 184 | 185 | l_d = functional.nll_loss(d_outputs, adv_labels) 186 | l_d *= params.model_params["lambd"] 187 | 188 | l_shared = l_c - l_d 189 | train_losses_classifier_dom_blurrer.append(l_shared.item()) 190 | per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item()) 191 | l_shared.backward() 192 | 193 | optimizer.step() 194 | F_s.zero_grad() 195 | C.zero_grad() 196 | 197 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items-1: 198 | avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer)) 199 | train_losses_classifier_dom_blurrer = [] 200 | avg_train_losses_classifier = np.mean(np.array(train_losses_classifier)) 201 | train_losses_classifier = [] 202 | avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer)) 203 | train_losses_domain_blurrer = [] 204 | log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i)) 205 | all_c_pred_array = np.concatenate(all_c_pred, axis=0) 206 | all_labels_array = np.concatenate(all_labels, axis=0) 207 | classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array, 208 | output_dict=True) 209 | t = time.localtime() 210 | log.info("Time: {}".format(time.strftime("%H:%M:%S", t))) 211 | if d_total > 0: 212 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 213 | writer.add_scalar('Training loss domain discriminator accumulated over last batches', 214 | avg_loss_dom_discr, total_steps) 215 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps) 216 | 217 | if evaluate_after_batches_between_logging or i == num_training_items - 1: 218 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 219 | 220 | writer = per_step_classifier_tb_logging(log, writer, None, tmp_test_classification_report_dict, 221 
def evaluate(loader, F_s, C, params, log, return_labels=None):
    """Run the shared extractor + classifier over ``loader`` and return a
    sklearn classification report dict.

    loader: yields batches whose first two elements are (inputs, targets);
    F_s: shared feature extractor; C: sentiment classifier;
    params: KnowMan parameter object (device, num_labels);
    log: configured logger;
    return_labels: when truthy, additionally return the gold and predicted
        label arrays (in that order).
    """
    F_s.eval()
    C.eval()
    all_c_pred = []
    all_labels = []
    confusion = ConfusionMeter(params.model_params["num_labels"])
    # Inference only: no_grad avoids building autograd graphs for every batch,
    # which the original version did needlessly.
    with torch.no_grad():
        for elem in tqdm(loader):
            inputs, targets = elem[0], elem[1]
            inputs, targets = inputs.to(params.training_setting["device"]), \
                targets.to(params.training_setting["device"])
            features = F_s(inputs)
            outputs = C(features)
            _, pred = torch.max(outputs, 1)
            confusion.add(pred.data, targets.data)
            all_c_pred.append(pred.cpu().numpy())
            all_labels.append(targets.cpu().numpy())
    all_c_pred_array = np.concatenate(all_c_pred, axis=0)
    all_labels_array = np.concatenate(all_labels, axis=0)
    classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array,
                                                       output_dict=True)
    log.debug(confusion.conf)
    # Restore training mode so the adversarial loop can continue after eval.
    F_s.train()
    C.train()
    if return_labels:
        return classification_report_dict, all_labels_array, all_c_pred_array
    return classification_report_dict


def set_seeds(params):
    """Seed python, numpy and torch RNGs from the configured random_seed."""
    random.seed(params.training_setting["random_seed"])
    np.random.seed(params.training_setting["random_seed"])
    torch.manual_seed(params.training_setting["random_seed"])
    torch.cuda.manual_seed_all(params.training_setting["random_seed"])


def set_logging(params):
    """Configure stderr logging plus a ``log.txt`` file handler in the model
    save directory, and log the full run configuration once."""
    if not os.path.exists(params.training_setting["model_save_file"]):
        os.makedirs(params.training_setting["model_save_file"])
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG if params.training_setting["debug"] else logging.INFO)
    log = logging.getLogger(__name__)
    fh = logging.FileHandler(os.path.join(params.training_setting["model_save_file"], 'log.txt'))
    log.addHandler(fh)

    # output options
    log.info(params.get_config())

    return log


def main():
    """Entry point: load the spam tf-idf config, train, report test accuracy."""
    spam_params = KnowManParameters()
    spam_params.update_parameters("./spam_tfidf.yaml")

    set_seeds(spam_params)
    log = set_logging(spam_params)

    # set_logging already creates this directory; kept for robustness if the
    # logging setup changes.
    if not os.path.exists(spam_params.training_setting["model_save_file"]):
        os.makedirs(spam_params.training_setting["model_save_file"])

    train_dataset, test_dataset = get_dataset(spam_params.dataset["dataset_path"], if_dev_data=False)

    cv = train(train_dataset, test_dataset, spam_params, log)
    log.info(f'Training done...')
    test_acc = cv['test']
    log.info(f'Test Set \t{100.0*test_acc}%')
    return cv


if __name__ == '__main__':
    main()
def train(train_set, test_set, params, log, feature_ext_enum=FeatureExtraction.DISTILBERT):
    """Adversarial training for the transformer-based spam model.

    Alternates between (a) training the domain discriminator D against the
    frozen shared transformer feature extractor F_s, and (b) training F_s and
    the sentiment classifier C to classify well while fooling D
    (loss = l_c - lambda * l_d).

    train_set, test_set: tokenized datasets from the corpus;
    params: KnowMan parameter object; log: configured logger;
    feature_ext_enum: which pretrained transformer backbone to load.
    Returns a dict mapping 'test' to the test accuracy observed at the best
    training accuracy (empty defaultdict in test_only mode).
    """
    # NOTE(review): `writer` is only defined when use_tensorboard is true, but
    # is used unconditionally below — verify the flag is always on in configs.
    if params.training_setting["use_tensorboard"]:
        from torch.utils.tensorboard import SummaryWriter
        writer = SummaryWriter(log_dir=params.training_setting['tensorboard_dir'],
                               comment='Using lambda {}'.format(params.model_params["lambd"]))
        print(f"tensorboard logging path is {params.training_setting['tensorboard_dir']}")

    train_loader = DataLoader(train_set, params.training_setting["batch_size"], shuffle=True)
    test_loader = DataLoader(test_set, params.training_setting["batch_size"], shuffle=True)

    F_s = None
    C, D = None, None

    # Shared feature extractor: pretrained transformer projected to the
    # shared hidden size.
    F_s = TransformerUtil.get_pretrained_model(feature_ext_enum, dropout=params.model_params["dropout"],
                                               out_size=params.model_params["shared_hidden_size"])

    F_s.train()

    C = SentimentClassifier(params.model_params["C_layers"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["num_labels"],
                            params.model_params["dropout"], params.model_params["C_bn"])
    D = DomainClassifier(params.model_params["D_layers"], params.model_params["shared_hidden_size"],
                         params.model_params["shared_hidden_size"],
                         params.model_params["all_domains"], params.model_params["loss"],
                         params.model_params["dropout"], params.model_params["D_bn"])

    F_s, C, D = F_s.to(params.training_setting["device"]), \
        C.to(params.training_setting["device"]), \
        D.to(params.training_setting["device"])

    # Freeze the transformer backbone; only the projection head (and C) train.
    for param in F_s.base_model.parameters():
        param.requires_grad = False

    # transformer optimization: no weight decay on biases / LayerNorm weights
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in F_s.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': params.training_setting["transformer_weight_decay"]},
        {'params': [p for n, p in F_s.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in C.named_parameters()], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=params.training_setting["learning_rate"])
    optimizerD = optim.Adam(D.parameters(), lr=params.training_setting["D_learning_rate"])

    # testing: load saved nets, evaluate, dump gold/pred labels to CSV
    if params.training_setting["test_only"]:
        log.info(f'Loading model from {params.training_setting["model_save_file"]}...')
        F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netF_s.pth')))

        C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netC.pth')))
        D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netD.pth')))

        log.info('Evaluating test sets:')
        test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True)
        log.info(f'Average test accuracy: {test_classification_report_dict["accuracy"]}')

        y_true = np.ndarray.tolist(y_true)
        y_pred = np.ndarray.tolist(y_pred)

        with open("../../KnowMan/save/labels_l0_spam_bert.csv", "w")as out:
            for i in range(len(y_true)):
                out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n")

        print({'test': test_classification_report_dict["accuracy"]})

    # training
    else:
        best_avg_acc, best_acc = defaultdict(float), 0.0
        batches_between_logging = params.training_setting["batches_between_logging"]
        evaluate_after_batches_between_logging = params.training_setting["evaluate_after_batches_between_logging"]
        num_training_items = len(train_loader)
        print(f"Number of training batches: {num_training_items}")
        total_steps = 0
        avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0
        # Baseline reports before any training step.
        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)
        classification_report_dict = evaluate(train_loader, F_s, C, params, log)

        for epoch in range(params.training_setting["max_epoch"]):
            per_epoch_discr_loss_collection = []
            per_epoch_train_losses_classifier = []
            per_epoch_train_losses_domain_blurrer = []
            per_epoch_train_losses_classifier_dom_blurrer = []
            # Log current learning rates (last param group wins).
            for group in optimizer.param_groups:
                learning_rate = group["lr"]
            writer.add_scalar("learning_rate", learning_rate, epoch)
            for group in optimizerD.param_groups:
                learning_rate = group["lr"]
            writer.add_scalar("learning_rate_D", learning_rate, epoch)

            F_s.train()
            C.train()
            D.train()

            # D iteration: train discriminator with F_s and C frozen
            d_correct, d_total = 0, 0
            freeze_net(F_s)
            freeze_net(C)
            unfreeze_net(D)

            for critic_loop_index in range(params.model_params["n_critic"]):
                discr_loss_collection = []
                for i, batch_X in enumerate(train_loader):
                    inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, params.training_setting["device"])

                    # input type is dict if we use transformers
                    shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
                    outputs = D(shared_feat)

                    # D accuracy
                    _, pred = torch.max(outputs, 1)
                    # NOTE(review): `inputs` is a dict here, so len(inputs)
                    # counts dict keys, not batch items — the accuracy
                    # denominator likely differs from the tf-idf scripts;
                    # consider labels.size(0). Verify before relying on it.
                    d_total += len(inputs)
                    d_correct += (pred == adv_labels).sum().item()
                    l_d = functional.nll_loss(outputs, adv_labels)
                    discr_loss_collection.append(l_d.item())
                    per_epoch_discr_loss_collection.append(l_d.item())
                    l_d.backward()
                    optimizerD.step()
                    D.zero_grad()
                    if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1:
                        avg_loss_dom_discr = np.mean(np.array(discr_loss_collection))
                        discr_loss_collection = []
                        log.info("Discriminator training Epoch {}, critic_loop_index {}, Step {}...".format(
                            epoch, critic_loop_index, i))
                        log.info(f'Training loss domain discriminator over the last {batches_between_logging} '
                                 f'batches: {avg_loss_dom_discr}')
                        writer.add_scalar('Training loss domain discriminator accumulated over last batches',
                                          avg_loss_dom_discr, total_steps)
                        if d_total > 0:
                            log.info('Domain Training Accuracy: {}%'.format(d_correct / d_total))
                            writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                        writer = per_step_classifier_tb_logging(log, writer, None, tmp_test_classification_report_dict,
                                                                total_steps, batches_between_logging,
                                                                classification_report_dict,
                                                                avg_train_losses_classifier,
                                                                avg_train_losses_domain_blurrer,
                                                                avg_train_losses_classifier_dom_blurrer,
                                                                log_dev_test=True, use_accuracy=True)
                        total_steps += 1

            # F&C iteration: train extractor + classifier with D frozen
            unfreeze_net(F_s)
            unfreeze_net(C)
            freeze_net(D)
            train_losses_classifier = []
            train_losses_domain_blurrer = []
            train_losses_classifier_dom_blurrer = []
            all_c_pred = []
            all_labels = []

            for i, batch_X in enumerate(train_loader):
                inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, params.training_setting["device"])

                shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                c_outputs = C(shared_feat)
                d_outputs = D(shared_feat)

                _, c_pred = torch.max(c_outputs, 1)
                all_c_pred.append(c_pred.cpu().numpy())
                all_labels.append(labels.cpu().numpy())
                l_c = functional.nll_loss(c_outputs, labels)
                train_losses_classifier.append(l_c.item())
                # NOTE(review): at this point l_d still holds the PREVIOUS
                # batch's (lambda-scaled) loss — or, on the first iteration,
                # the discriminator loop's last loss. The blurrer metric lags
                # one step; confirm this is intended before changing it.
                train_losses_domain_blurrer.append(-l_d.item())
                per_epoch_train_losses_classifier.append(l_c.item())
                per_epoch_train_losses_domain_blurrer.append(-l_d.item())

                l_d = functional.nll_loss(d_outputs, adv_labels)
                l_d *= params.model_params["lambd"]

                # Minimize classification loss while maximizing D's loss.
                l_shared = l_c - l_d
                train_losses_classifier_dom_blurrer.append(l_shared.item())
                per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item())
                l_shared.backward()

                optimizer.step()
                F_s.zero_grad()
                C.zero_grad()

                if (i > 0 and i % batches_between_logging == 0) or i == num_training_items-1:
                    # Fold the windowed loss collections into averages and reset.
                    avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer))
                    train_losses_classifier_dom_blurrer = []
                    avg_train_losses_classifier = np.mean(np.array(train_losses_classifier))
                    train_losses_classifier = []
                    avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer))
                    train_losses_domain_blurrer = []
                    log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i))
                    all_c_pred_array = np.concatenate(all_c_pred, axis=0)
                    all_labels_array = np.concatenate(all_labels, axis=0)
                    classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array,
                                                                       output_dict=True)
                    t = time.localtime()
                    log.info("Time: {}".format(time.strftime("%H:%M:%S", t)))
                    if d_total > 0:
                        log.info('Domain Training Accuracy: {}%'.format(d_correct / d_total))
                        writer.add_scalar('Training loss domain discriminator accumulated over last batches',
                                          avg_loss_dom_discr, total_steps)
                        writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)

                    writer = per_step_classifier_tb_logging(log, writer, None, tmp_test_classification_report_dict,
                                                            total_steps, batches_between_logging,
                                                            classification_report_dict,
                                                            avg_train_losses_classifier,
                                                            avg_train_losses_domain_blurrer,
                                                            avg_train_losses_classifier_dom_blurrer,
                                                            log_dev_test=(evaluate_after_batches_between_logging
                                                                          or i == num_training_items - 1),
                                                            use_accuracy=True)

                    # Model selection: best TRAINING accuracy (no dev set here);
                    # the corresponding test accuracy is what gets reported.
                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        if classification_report_dict['accuracy'] > best_acc:
                            best_avg_acc['test'] = tmp_test_classification_report_dict['accuracy']
                            best_acc = classification_report_dict['accuracy']
                            with open(os.path.join(params.training_setting["model_save_file"], 'options.pkl'), 'wb') \
                                    as ouf:
                                pickle.dump(params.get_config(), ouf)
                            log.info("Saving new model")
                            torch.save(F_s.state_dict(), '{}/netF_s.pth'.format(
                                params.training_setting["model_save_file"]))
                            torch.save(C.state_dict(), '{}/netC.pth'.format(params.training_setting["model_save_file"]))
                            torch.save(D.state_dict(), '{}/netD.pth'.format(params.training_setting["model_save_file"]))

                    total_steps += 1

            # end of epoch
            log.info('Ending epoch {}'.format(epoch+1))

        # end of training
        log.info(f'Best average accuracy: {100.0*best_acc}%')
        return best_avg_acc


def evaluate(loader, F_s, C, params, log, return_labels=None):
    """Evaluate F_s + C on ``loader`` (transformer dict inputs) and return a
    sklearn classification report dict; optionally also the label arrays."""
    F_s.eval()
    C.eval()
    it = iter(loader)
    correct = 0
    total = 0
    all_c_pred = []
    all_labels = []
    confusion = ConfusionMeter(params.model_params["num_labels"])
    for elem in tqdm(it):
        inputs, targets = elem[0], elem[1]
        # inputs is a dict of tensors (input_ids, attention_mask, ...)
        inputs, targets = {k: v.to(params.training_setting["device"]) for k, v in inputs.items()}, \
            targets.to(params.training_setting["device"])
        features = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        outputs = C(features)
        _, pred = torch.max(outputs, 1)
        confusion.add(pred.data, targets.data)
        total += targets.size(0)
        correct += (pred == targets).sum().item()
        all_c_pred.append(pred.cpu().numpy())
        all_labels.append(targets.cpu().numpy())
    all_c_pred_array = np.concatenate(all_c_pred, axis=0)
    all_labels_array = np.concatenate(all_labels, axis=0)
    classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array,
                                                       output_dict=True)
    log.debug(confusion.conf)
    # Restore training mode for the caller's training loop.
    F_s.train()
    C.train()
    if return_labels:
        return classification_report_dict, all_labels_array, all_c_pred_array
    else:
        return classification_report_dict


def set_seeds(params):
    """Seed python, numpy and torch RNGs from the configured random_seed."""
    random.seed(params.training_setting["random_seed"])
    np.random.seed(params.training_setting["random_seed"])
    torch.manual_seed(params.training_setting["random_seed"])
    torch.cuda.manual_seed_all(params.training_setting["random_seed"])
def set_logging(params):
    """Set up logging to stderr plus a ``log.txt`` file in the model save
    directory, logging the run configuration once at startup."""
    save_dir = params.training_setting["model_save_file"]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    level = logging.DEBUG if params.training_setting["debug"] else logging.INFO
    logging.basicConfig(stream=sys.stderr, level=level)
    log = logging.getLogger(__name__)
    log.addHandler(logging.FileHandler(os.path.join(save_dir, 'log.txt')))

    # record the full configuration
    log.info(params.get_config())

    return log


def main():
    """Entry point: train the transformer spam model and report test accuracy."""
    spam_params = KnowManParameters()
    spam_params.update_parameters("./spam_transformer.yaml")

    set_seeds(spam_params)
    log = set_logging(spam_params)

    save_dir = spam_params.training_setting["model_save_file"]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    feature_ext_enum = FeatureExtraction.name2type(spam_params.model_params["feature_ext"])
    train_dataset, test_dataset = get_transformer_dataset(
        spam_params.dataset["dataset_path"],
        feature_ext=feature_ext_enum,
        if_dev_data=False,
        max_length_transformer=spam_params.model_params["max_length_transformer"])

    cv = train(train_dataset, test_dataset, spam_params, log, feature_ext_enum=feature_ext_enum)
    log.info(f'Training done...')
    test_acc = cv['test']
    log.info(f'Test Set \t{100.0*test_acc}%')
    return cv


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import torch 5 | import pickle 6 | import logging 7 | import random 8 | import itertools 9 | import numpy as np 10 | import torch.optim as optim 11 | 12 | from tqdm import tqdm 13 | from collections import defaultdict 14 | from torch.nn import functional 15 | from torch.utils.data import DataLoader 16 | from torchnet.meter import ConfusionMeter 17 | from sklearn.metrics import classification_report 18 | 19 | from KnowMan.utils.knowman_utils import freeze_net, unfreeze_net 20 | from KnowMan.models.nn_models import MlpFeatureExtractor, SentimentClassifier, DomainClassifier 21 | from KnowMan.data_prep.get_knodle_dataset import get_dataset 22 | from KnowMan.utils.logging_utils import per_step_classifier_tb_logging 23 | from KnowMan.utils.knowman_parameters import KnowManParameters 24 | 25 | 26 | def train(train_set, dev_set, test_set, params, log): 27 | """ 28 | train_set, dev_set, test_set: raw_datasets from corpus 29 | """ 30 | if params.training_setting["use_tensorboard"]: 31 | from torch.utils.tensorboard import SummaryWriter 32 | writer = SummaryWriter(log_dir=params.training_setting["tensorboard_dir"], 33 | comment='Using lambda {}'.format(params.model_params["lambd"])) 34 | print(f"tensorboard logging path is {params.training_setting['tensorboard_dir']}") 35 | 36 | train_loader = DataLoader(train_set, params.training_setting["batch_size"], shuffle=True) 37 | dev_loader = DataLoader(dev_set, params.training_setting["batch_size"], shuffle=True) 38 | test_loader = DataLoader(test_set, params.training_setting["batch_size"], shuffle=True) 39 | 40 | F_s = None 41 | C, D = None, None 42 | 43 | F_s = MlpFeatureExtractor(params.model_params["feature_num"], params.model_params["F_hidden_sizes"], 44 | params.model_params["shared_hidden_size"], params.model_params["dropout"], 45 | params.model_params["F_bn"]) 46 | C = 
SentimentClassifier(params.model_params["C_layers"], params.model_params["shared_hidden_size"] + 47 | params.model_params["domain_hidden_size"], 48 | params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"], 49 | params.model_params["num_labels"], params.model_params["dropout"], 50 | params.model_params["C_bn"]) 51 | D = DomainClassifier(params.model_params["D_layers"], params.model_params["shared_hidden_size"], 52 | params.model_params["shared_hidden_size"], params.model_params["all_domains"], 53 | params.model_params["loss"], params.model_params["dropout"], params.model_params["D_bn"]) 54 | 55 | F_s, C, D = F_s.to(params.training_setting["device"]), \ 56 | C.to(params.training_setting["device"]), \ 57 | D.to(params.training_setting["device"]) 58 | 59 | optimizer = optim.Adam(itertools.chain(*map(list, [F_s.parameters() if F_s else [], C.parameters()])), 60 | lr=params.training_setting["learning_rate"]) 61 | optimizerD = optim.Adam(D.parameters(), lr=params.training_setting["D_learning_rate"]) 62 | 63 | # testing 64 | if params.training_setting["test_only"]: 65 | log.info(f'Loading model from {params.training_setting["model_save_file"]}...') 66 | F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netF_s.pth'))) 67 | C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netC.pth'))) 68 | D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netD.pth'))) 69 | 70 | log.info('Evaluating validation sets:') 71 | dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 72 | log.info(f'Average validation accuracy: {dev_classification_report_dict["accuracy"]}') 73 | log.info('Evaluating test sets:') 74 | test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True) 75 | log.info(f'Average test accuracy: {test_classification_report_dict["accuracy"]}') 76 | 77 | y_true = 
np.ndarray.tolist(y_true) 78 | y_pred = np.ndarray.tolist(y_pred) 79 | 80 | with open("../../KnowMan/save/labels_l0_imdb_tfidf.csv", "w")as out: 81 | for i in range(len(y_true)): 82 | out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n") 83 | 84 | print({'valid': dev_classification_report_dict["accuracy"], 85 | 'test': test_classification_report_dict["accuracy"]}) 86 | 87 | # training 88 | else: 89 | best_avg_acc, best_acc = defaultdict(float), 0.0 90 | batches_between_logging = params.training_setting["batches_between_logging"] 91 | evaluate_after_batches_between_logging = params.training_setting["evaluate_after_batches_between_logging"] 92 | num_training_items = len(train_loader) 93 | print(f"Number of training batches: {num_training_items}") 94 | total_steps = 0 95 | avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0 96 | tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 97 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 98 | classification_report_dict = evaluate(train_loader, F_s, C, params, log) 99 | 100 | for epoch in range(params.training_setting["max_epoch"]): 101 | per_epoch_discr_loss_collection = [] 102 | per_epoch_train_losses_classifier = [] 103 | per_epoch_train_losses_domain_blurrer = [] 104 | per_epoch_train_losses_classifier_dom_blurrer = [] 105 | for group in optimizer.param_groups: 106 | learning_rate = group["lr"] 107 | writer.add_scalar("learning_rate", learning_rate, epoch) 108 | for group in optimizerD.param_groups: 109 | learning_rate = group["lr"] 110 | writer.add_scalar("learning_rate_D", learning_rate, epoch) 111 | 112 | F_s.train() 113 | C.train() 114 | D.train() 115 | 116 | # D iteration 117 | d_correct, d_total = 0, 0 118 | freeze_net(F_s) 119 | freeze_net(C) 120 | unfreeze_net(D) 121 | 122 | for critic_loop_index in range(params.model_params["n_critic"]): 123 | discr_loss_collection = [] 124 | for i, (inputs, 
labels, adv_labels) in enumerate(train_loader): 125 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]), \ 126 | labels.to(params.training_setting["device"]), \ 127 | adv_labels.to(params.training_setting["device"]) 128 | 129 | shared_feat = F_s(inputs) 130 | outputs = D(shared_feat) 131 | _, pred = torch.max(outputs, 1) 132 | d_total += len(inputs) 133 | d_correct += (pred == adv_labels).sum().item() 134 | l_d = functional.nll_loss(outputs, adv_labels) 135 | discr_loss_collection.append(l_d.item()) 136 | per_epoch_discr_loss_collection.append(l_d.item()) 137 | l_d.backward() 138 | optimizerD.step() 139 | D.zero_grad() 140 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1: 141 | avg_loss_dom_discr = np.mean(np.array(discr_loss_collection)) 142 | discr_loss_collection = [] 143 | log.info("Discriminator training Epoch {}, " 144 | "critic_loop_index {}, Step {}...".format(epoch, critic_loop_index, i)) 145 | log.info(f'Training loss domain discriminator over the last {batches_between_logging} ' 146 | f'batches: {avg_loss_dom_discr}') 147 | writer.add_scalar('Training loss domain discriminator accumulated over last batches', 148 | avg_loss_dom_discr, total_steps) 149 | if d_total > 0: 150 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 151 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, 152 | total_steps) 153 | 154 | writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict, 155 | tmp_test_classification_report_dict, total_steps, 156 | batches_between_logging, classification_report_dict, 157 | avg_train_losses_classifier, 158 | avg_train_losses_domain_blurrer, 159 | avg_train_losses_classifier_dom_blurrer, 160 | log_dev_test=True, use_accuracy=True) 161 | total_steps += 1 162 | 163 | # F&C iteration 164 | unfreeze_net(F_s) 165 | unfreeze_net(C) 166 | freeze_net(D) 167 | train_losses_classifier = [] 168 | train_losses_domain_blurrer = 
[] 169 | train_losses_classifier_dom_blurrer = [] 170 | all_c_pred = [] 171 | all_labels = [] 172 | for i, (inputs, labels, adv_labels) in enumerate(train_loader): 173 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]), \ 174 | labels.to(params.training_setting["device"]), \ 175 | adv_labels.to(params.training_setting["device"]) 176 | shared_feat = F_s(inputs) 177 | c_outputs = C(shared_feat) 178 | d_outputs = D(shared_feat) 179 | 180 | _, c_pred = torch.max(c_outputs, 1) 181 | all_c_pred.append(c_pred.cpu().numpy()) 182 | all_labels.append(labels.cpu().numpy()) 183 | l_c = functional.nll_loss(c_outputs, labels) 184 | train_losses_classifier.append(l_c.item()) 185 | train_losses_domain_blurrer.append(-l_d.item()) 186 | per_epoch_train_losses_classifier.append(l_c.item()) 187 | per_epoch_train_losses_domain_blurrer.append(-l_d.item()) 188 | 189 | l_d = functional.nll_loss(d_outputs, adv_labels) 190 | l_d *= params.model_params["lambd"] 191 | 192 | l_shared = l_c - l_d 193 | train_losses_classifier_dom_blurrer.append(l_shared.item()) 194 | per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item()) 195 | l_shared.backward() 196 | 197 | optimizer.step() 198 | F_s.zero_grad() 199 | C.zero_grad() 200 | 201 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items-1: 202 | avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer)) 203 | train_losses_classifier_dom_blurrer = [] 204 | avg_train_losses_classifier = np.mean(np.array(train_losses_classifier)) 205 | train_losses_classifier = [] 206 | avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer)) 207 | train_losses_domain_blurrer = [] 208 | log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i)) 209 | all_c_pred_array = np.concatenate(all_c_pred, axis=0) 210 | all_labels_array = np.concatenate(all_labels, axis=0) 211 | classification_report_dict = 
classification_report(y_true=all_labels_array, y_pred=all_c_pred_array, 212 | output_dict=True) 213 | t = time.localtime() 214 | log.info("Time: {}".format(time.strftime("%H:%M:%S", t))) 215 | if d_total > 0: 216 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 217 | writer.add_scalar('Training loss domain discriminator accumulated over last batches', 218 | avg_loss_dom_discr, total_steps) 219 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps) 220 | 221 | if evaluate_after_batches_between_logging or i == num_training_items - 1: 222 | tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 223 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 224 | 225 | writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict, 226 | tmp_test_classification_report_dict, total_steps, 227 | batches_between_logging, classification_report_dict, 228 | avg_train_losses_classifier, 229 | avg_train_losses_domain_blurrer, 230 | avg_train_losses_classifier_dom_blurrer, 231 | log_dev_test=(evaluate_after_batches_between_logging or 232 | i == num_training_items - 1), 233 | use_accuracy=True) 234 | 235 | if evaluate_after_batches_between_logging or i == num_training_items - 1: 236 | if tmp_dev_classification_report_dict['accuracy'] > best_acc: 237 | log.info(f'New best average validation accuracy:' 238 | f' {100.0 * tmp_dev_classification_report_dict["accuracy"]}%') 239 | best_avg_acc['valid'] = tmp_dev_classification_report_dict['accuracy'] 240 | best_avg_acc['test'] = tmp_test_classification_report_dict['accuracy'] 241 | best_acc = tmp_dev_classification_report_dict['accuracy'] 242 | with open(os.path.join(params.training_setting["model_save_file"], 'options.pkl'), 'wb') \ 243 | as ouf: 244 | pickle.dump(params.get_config(), ouf) 245 | log.info("Saving new model") 246 | torch.save(F_s.state_dict(), '{}/netF_s.pth'.format(params.training_setting 
def evaluate(loader, F_s, C, params, log, return_labels=None):
    """Run the classifier over `loader` and return a sklearn classification report.

    F_s: shared feature extractor (takes a plain input tensor here).
    C:   sentiment classifier applied to the extracted features.
    When `return_labels` is truthy, also return the gold and predicted label
    arrays. Both models are switched to eval mode for the pass and restored
    to train mode afterwards.
    """
    F_s.eval()
    C.eval()

    seen = 0          # number of examples scored (kept for parity; not reported)
    hits = 0          # number of correct predictions (kept for parity; not reported)
    pred_chunks = []
    gold_chunks = []
    confusion = ConfusionMeter(params.model_params["num_labels"])

    device = params.training_setting["device"]
    for elem in tqdm(iter(loader)):
        batch_inputs = elem[0].to(device)
        batch_targets = elem[1].to(device)

        logits = C(F_s(batch_inputs))
        _, batch_pred = torch.max(logits, 1)

        confusion.add(batch_pred.data, batch_targets.data)
        seen += batch_targets.size(0)
        hits += (batch_pred == batch_targets).sum().item()
        pred_chunks.append(batch_pred.cpu().numpy())
        gold_chunks.append(batch_targets.cpu().numpy())

    y_pred = np.concatenate(pred_chunks, axis=0)
    y_true = np.concatenate(gold_chunks, axis=0)
    report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)
    log.debug(confusion.conf)

    # restore training mode for the caller's next optimization phase
    F_s.train()
    C.train()

    if return_labels:
        return report, y_true, y_pred
    return report
def main():
    """Entry point: train the tf-idf KnowMAN model on the IMDb dataset.

    Loads parameters from ./imdb_tfidf.yaml, seeds all RNGs, sets up logging,
    runs training and logs the best validation/test accuracies.
    """
    imdb_params = KnowManParameters()
    imdb_params.update_parameters("./imdb_tfidf.yaml")

    set_seeds(imdb_params)
    log = set_logging(imdb_params)

    save_dir = imdb_params.training_setting["model_save_file"]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    train_dataset, dev_dataset, test_dataset = get_dataset(imdb_params.dataset["dataset_path"])

    cv = train(train_dataset, dev_dataset, test_dataset, imdb_params, log)
    log.info(f'Training done...')

    acc = cv['valid']
    log.info(f'Validation Set \t{100.0*acc}%')
    test_acc = cv['test']
    log.info(f'Test Set \t{100.0*test_acc}%')
    return cv
def train(train_set, dev_set, test_set, params, log, feature_ext_enum=FeatureExtraction.DISTILBERT):
    """Adversarial KnowMAN training loop (transformer features, IMDb).

    Alternates two phases per epoch:
      1. D phase: the domain discriminator D learns to predict the
         adversarial (labeling-function/domain) labels from shared features.
      2. F&C phase: the shared extractor F_s and sentiment classifier C
         minimize `l_c - lambd * l_d`, i.e. classify sentiment while
         blurring the domain signal.

    train_set, dev_set, test_set: raw_datasets from corpus
    Returns a dict with 'valid'/'test' accuracies: the best pair seen during
    training, or the final evaluation in test_only mode.

    NOTE(review): `writer` is only created when use_tensorboard is true but is
    used unconditionally below -- confirm configs always enable tensorboard.
    """
    if params.training_setting["use_tensorboard"]:
        from torch.utils.tensorboard import SummaryWriter
        writer = SummaryWriter(log_dir=params.training_setting['tensorboard_dir'],
                               comment='Using lambda {}'.format(params.model_params["lambd"]))
        print(f"tensorboard logging path is {params.training_setting['tensorboard_dir']}")

    train_loader = DataLoader(train_set, params.training_setting["batch_size"], shuffle=True)
    dev_loader = DataLoader(dev_set, params.training_setting["batch_size"], shuffle=True)
    test_loader = DataLoader(test_set, params.training_setting["batch_size"], shuffle=True)

    device = params.training_setting["device"]

    # Shared feature extractor: pretrained transformer with a projection head.
    F_s = TransformerUtil.get_pretrained_model(feature_ext_enum, dropout=params.model_params["dropout"],
                                               out_size=params.model_params["shared_hidden_size"])
    F_s.train()

    C = SentimentClassifier(params.model_params["C_layers"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["num_labels"],
                            params.model_params["dropout"], params.model_params["C_bn"])
    D = DomainClassifier(params.model_params["D_layers"], params.model_params["shared_hidden_size"],
                         params.model_params["shared_hidden_size"],
                         params.model_params["all_domains"], params.model_params["loss"],
                         params.model_params["dropout"], params.model_params["D_bn"])

    F_s, C, D = F_s.to(device), C.to(device), D.to(device)

    # Freeze the pretrained transformer body; only the projection head trains.
    for param in F_s.base_model.parameters():
        param.requires_grad = False

    # transformer optimization: no weight decay on bias/LayerNorm parameters
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in F_s.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': params.training_setting["transformer_weight_decay"]},
        {'params': [p for n, p in F_s.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in C.named_parameters()], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=params.training_setting["learning_rate"])
    optimizerD = optim.Adam(D.parameters(), lr=params.training_setting["D_learning_rate"])

    # testing
    if params.training_setting["test_only"]:
        # BUGFIX: this message was previously logged twice.
        log.info(f'Loading model from {params.training_setting["model_save_file"]}...')
        F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netF_s.pth')))
        C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netC.pth')))
        D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netD.pth')))

        log.info('Evaluating validation sets:')
        dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
        log.info(f'Average validation accuracy: {dev_classification_report_dict["accuracy"]}')
        log.info('Evaluating test sets:')
        test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True)
        log.info(f'Average test accuracy: {test_classification_report_dict["accuracy"]}')

        y_true = np.ndarray.tolist(y_true)
        y_pred = np.ndarray.tolist(y_pred)

        # dump gold/predicted label pairs for error analysis
        with open("../../KnowMan/save/labels_best_imdb_bert.csv", "w") as out:
            for i in range(len(y_true)):
                out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n")

        result = {'valid': dev_classification_report_dict["accuracy"],
                  'test': test_classification_report_dict["accuracy"]}
        print(result)
        # BUGFIX: previously returned None, which made main()'s cv['valid'] crash.
        return result

    # training
    else:
        best_avg_acc, best_acc = defaultdict(float), 0.0
        batches_between_logging = params.training_setting["batches_between_logging"]
        evaluate_after_batches_between_logging = params.training_setting["evaluate_after_batches_between_logging"]
        num_training_items = len(train_loader)
        print(f"Number of training batches: {num_training_items}")
        total_steps = 0
        avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0
        # initial reports so the first tensorboard logging call has data
        tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)
        classification_report_dict = evaluate(train_loader, F_s, C, params, log)

        for epoch in range(params.training_setting["max_epoch"]):
            per_epoch_discr_loss_collection = []
            per_epoch_train_losses_classifier = []
            per_epoch_train_losses_domain_blurrer = []
            per_epoch_train_losses_classifier_dom_blurrer = []
            for group in optimizer.param_groups:
                learning_rate = group["lr"]
                writer.add_scalar("learning_rate", learning_rate, epoch)
            for group in optimizerD.param_groups:
                learning_rate = group["lr"]
                writer.add_scalar("learning_rate_D", learning_rate, epoch)

            F_s.train()
            C.train()
            D.train()

            # D iteration: train the discriminator with F_s and C frozen
            d_correct, d_total = 0, 0
            freeze_net(F_s)
            freeze_net(C)
            unfreeze_net(D)

            for critic_loop_index in range(params.model_params["n_critic"]):
                discr_loss_collection = []
                for i, batch_X in enumerate(train_loader):
                    inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, device)

                    shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
                    outputs = D(shared_feat)

                    _, pred = torch.max(outputs, 1)
                    # BUGFIX: `len(inputs)` counted the keys of the input dict
                    # (always 2), not the batch size; use the label count.
                    d_total += adv_labels.size(0)
                    d_correct += (pred == adv_labels).sum().item()
                    l_d = functional.nll_loss(outputs, adv_labels)
                    discr_loss_collection.append(l_d.item())
                    per_epoch_discr_loss_collection.append(l_d.item())
                    l_d.backward()
                    optimizerD.step()
                    D.zero_grad()
                    if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1:
                        avg_loss_dom_discr = np.mean(np.array(discr_loss_collection))
                        discr_loss_collection = []
                        log.info("Discriminator training Epoch {}, "
                                 "critic_loop_index {}, Step {}...".format(epoch, critic_loop_index, i))
                        log.info(f'Training loss domain discriminator over the last {batches_between_logging} '
                                 f'batches: {avg_loss_dom_discr}')
                        writer.add_scalar('Training loss domain discriminator accumulated over last batches',
                                          avg_loss_dom_discr, total_steps)
                        if d_total > 0:
                            log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total))
                            writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                        # dev/test reports here are the ones from the previous
                        # classifier phase (or the initial evaluation)
                        writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict,
                                                                tmp_test_classification_report_dict, total_steps,
                                                                batches_between_logging, classification_report_dict,
                                                                avg_train_losses_classifier,
                                                                avg_train_losses_domain_blurrer,
                                                                avg_train_losses_classifier_dom_blurrer,
                                                                log_dev_test=True, use_accuracy=True)
                        total_steps += 1

            # F&C iteration: train extractor + classifier with D frozen
            unfreeze_net(F_s)
            unfreeze_net(C)
            freeze_net(D)
            train_losses_classifier = []
            train_losses_domain_blurrer = []
            train_losses_classifier_dom_blurrer = []
            all_c_pred = []
            all_labels = []

            for i, batch_X in enumerate(train_loader):
                inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, device)
                shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                c_outputs = C(shared_feat)
                d_outputs = D(shared_feat)

                _, c_pred = torch.max(c_outputs, 1)
                all_c_pred.append(c_pred.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

                l_c = functional.nll_loss(c_outputs, labels)
                # BUGFIX: compute the scaled domain loss for the *current* batch
                # before recording it; previously the stale l_d left over from
                # the discriminator phase was appended to the loss logs.
                l_d = functional.nll_loss(d_outputs, adv_labels)
                l_d = l_d * params.model_params["lambd"]

                train_losses_classifier.append(l_c.item())
                train_losses_domain_blurrer.append(-l_d.item())
                per_epoch_train_losses_classifier.append(l_c.item())
                per_epoch_train_losses_domain_blurrer.append(-l_d.item())

                # minimize class loss while maximizing D's loss (domain blurring)
                l_shared = l_c - l_d
                train_losses_classifier_dom_blurrer.append(l_shared.item())
                per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item())
                l_shared.backward()

                optimizer.step()
                F_s.zero_grad()
                C.zero_grad()

                if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1:
                    avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer))
                    train_losses_classifier_dom_blurrer = []
                    avg_train_losses_classifier = np.mean(np.array(train_losses_classifier))
                    train_losses_classifier = []
                    avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer))
                    train_losses_domain_blurrer = []
                    log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i))
                    all_c_pred_array = np.concatenate(all_c_pred, axis=0)
                    all_labels_array = np.concatenate(all_labels, axis=0)
                    classification_report_dict = classification_report(y_true=all_labels_array,
                                                                       y_pred=all_c_pred_array,
                                                                       output_dict=True)
                    t = time.localtime()
                    log.info("Time: {}".format(time.strftime("%H:%M:%S", t)))
                    if d_total > 0:
                        log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total))
                        writer.add_scalar('Training loss domain discriminator accumulated over last batches',
                                          avg_loss_dom_discr, total_steps)
                        writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
                        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)

                    writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict,
                                                            tmp_test_classification_report_dict, total_steps,
                                                            batches_between_logging, classification_report_dict,
                                                            avg_train_losses_classifier,
                                                            avg_train_losses_domain_blurrer,
                                                            avg_train_losses_classifier_dom_blurrer,
                                                            log_dev_test=(evaluate_after_batches_between_logging or
                                                                          i == num_training_items - 1),
                                                            use_accuracy=True)

                    # model selection on dev accuracy; save the full triple
                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        if tmp_dev_classification_report_dict['accuracy'] > best_acc:
                            log.info(f'New best average validation accuracy: '
                                     f'{100.0 * tmp_dev_classification_report_dict["accuracy"]}%')
                            best_avg_acc['valid'] = tmp_dev_classification_report_dict['accuracy']
                            best_avg_acc['test'] = tmp_test_classification_report_dict['accuracy']
                            best_acc = tmp_dev_classification_report_dict['accuracy']
                            with open(os.path.join(params.training_setting["model_save_file"], 'options.pkl'),
                                      'wb') as ouf:
                                pickle.dump(params.get_config(), ouf)
                            log.info("Saving new model")
                            torch.save(F_s.state_dict(), '{}/netF_s.pth'.format(
                                params.training_setting["model_save_file"]))
                            torch.save(C.state_dict(), '{}/netC.pth'.format(params.training_setting["model_save_file"]))
                            torch.save(D.state_dict(), '{}/netD.pth'.format(params.training_setting["model_save_file"]))

                    total_steps += 1

            # end of epoch
            log.info('Ending epoch {}'.format(epoch+1))

        # end of training
        log.info(f'Best average validation accuracy: {100.0*best_acc}%')
        return best_avg_acc
def main():
    """Entry point: train the transformer-based KnowMAN model on IMDb.

    Loads parameters from ./imdb_transformer.yaml, seeds all RNGs, sets up
    logging, builds the tokenized datasets and runs training, logging the
    best validation/test accuracies.
    """
    imdb_params = KnowManParameters()
    imdb_params.update_parameters("./imdb_transformer.yaml")

    set_seeds(imdb_params)
    log = set_logging(imdb_params)

    save_dir = imdb_params.training_setting["model_save_file"]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # resolve the configured feature extractor name to its enum member
    feature_ext_enum = FeatureExtraction.name2type(imdb_params.model_params["feature_ext"])

    max_len = imdb_params.model_params["max_length_transformer"]
    train_dataset, dev_dataset, test_dataset = get_transformer_dataset(
        imdb_params.dataset["dataset_path"],
        feature_ext=feature_ext_enum,
        max_length_transformer=max_len)

    cv = train(train_dataset, dev_dataset, test_dataset, imdb_params, log, feature_ext_enum=feature_ext_enum)
    log.info(f'Training done...')

    acc = cv['valid']
    log.info(f'Validation Set \t{100.0*acc}%')
    test_acc = cv['test']
    log.info(f'Test Set \t{100.0*test_acc}%')
    return cv
def train(train_set, dev_set, test_set, params, log, feature_ext_enum=FeatureExtraction.DISTILBERT):
    """Adversarial KnowMAN training loop (transformer features, Spouse).

    Alternates two phases per epoch:
      1. D phase: the domain discriminator D learns to predict the
         adversarial (labeling-function/domain) labels from shared features.
      2. F&C phase: the shared extractor F_s and classifier C minimize
         `l_c - lambd * l_d`, blurring the domain signal.

    train_set, dev_set, test_set: raw_datasets from corpus
    Model selection uses the f1-score of class "1" on the dev set (the Spouse
    task is evaluated on positive-class f1, not accuracy). Returns a dict with
    the best 'valid'/'test' f1-scores, or the final ones in test_only mode.

    NOTE(review): `writer` is only created when use_tensorboard is true but is
    used unconditionally below -- confirm configs always enable tensorboard.
    """
    if params.training_setting["use_tensorboard"]:
        from torch.utils.tensorboard import SummaryWriter
        writer = SummaryWriter(log_dir=params.training_setting['tensorboard_dir'],
                               comment='Using lambda {}'.format(params.model_params["lambd"]))
        print(f"tensorboard logging path is {params.training_setting['tensorboard_dir']}")

    train_loader = DataLoader(train_set, params.training_setting["batch_size"], shuffle=True)
    dev_loader = DataLoader(dev_set, params.training_setting["batch_size"], shuffle=True)
    test_loader = DataLoader(test_set, params.training_setting["batch_size"], shuffle=True)

    device = params.training_setting["device"]

    # Shared feature extractor: pretrained transformer with a projection head.
    F_s = TransformerUtil.get_pretrained_model(feature_ext_enum, dropout=params.model_params["dropout"],
                                               out_size=params.model_params["shared_hidden_size"])
    F_s.train()

    C = SentimentClassifier(params.model_params["C_layers"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["shared_hidden_size"] + params.model_params["domain_hidden_size"],
                            params.model_params["num_labels"],
                            params.model_params["dropout"], params.model_params["C_bn"])
    D = DomainClassifier(params.model_params["D_layers"], params.model_params["shared_hidden_size"],
                         params.model_params["shared_hidden_size"],
                         params.model_params["all_domains"], params.model_params["loss"],
                         params.model_params["dropout"], params.model_params["D_bn"])

    F_s, C, D = F_s.to(device), C.to(device), D.to(device)

    # Freeze the pretrained transformer body; only the projection head trains.
    for param in F_s.base_model.parameters():
        param.requires_grad = False

    # transformer optimization: no weight decay on bias/LayerNorm parameters
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in F_s.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': params.training_setting["transformer_weight_decay"]},
        {'params': [p for n, p in F_s.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in C.named_parameters()], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=params.training_setting["learning_rate"])
    optimizerD = optim.Adam(D.parameters(), lr=params.training_setting["D_learning_rate"])

    # testing
    if params.training_setting["test_only"]:
        log.info(f'Loading model from {params.training_setting["model_save_file"]}...')
        F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netF_s.pth')))
        C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netC.pth')))
        D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], 'netD.pth')))

        log.info('Evaluating validation sets:')
        dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
        log.info('Evaluating test sets:')
        test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True)
        log.info(f'Average test weighted-f1: {100.0 * test_classification_report_dict["1"]["f1-score"]}%')
        log.info(f'Average validation weighted-f1: {100.0 * dev_classification_report_dict["1"]["f1-score"]}%')

        y_true = np.ndarray.tolist(y_true)
        y_pred = np.ndarray.tolist(y_pred)

        # dump gold/predicted label pairs for error analysis
        with open("../../KnowMan/save/labels_best_spouse_bert.csv", "w") as out:
            for i in range(len(y_true)):
                out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n")
        print({'test': test_classification_report_dict["1"]})
        # BUGFIX: previously returned None, which made main()'s cv['valid'] crash.
        return {'valid': dev_classification_report_dict["1"]["f1-score"],
                'test': test_classification_report_dict["1"]["f1-score"]}

    # training
    else:
        best_avg_weighted_f1, best_weighted_f1 = defaultdict(float), 0.0
        batches_between_logging = params.training_setting["batches_between_logging"]
        evaluate_after_batches_between_logging = params.training_setting["evaluate_after_batches_between_logging"]
        num_training_items = len(train_loader)
        print(f"Number of training batches: {num_training_items}")
        total_steps = 0
        avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0
        # initial reports so the first tensorboard logging call has data
        tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)
        classification_report_dict = evaluate(train_loader, F_s, C, params, log)

        for epoch in range(params.training_setting["max_epoch"]):
            per_epoch_discr_loss_collection = []
            per_epoch_train_losses_classifier = []
            per_epoch_train_losses_domain_blurrer = []
            per_epoch_train_losses_classifier_dom_blurrer = []
            for group in optimizer.param_groups:
                learning_rate = group["lr"]
                writer.add_scalar("learning_rate", learning_rate, epoch)
            for group in optimizerD.param_groups:
                learning_rate = group["lr"]
                writer.add_scalar("learning_rate_D", learning_rate, epoch)

            F_s.train()
            C.train()
            D.train()

            # D iteration: train the discriminator with F_s and C frozen
            d_correct, d_total = 0, 0
            freeze_net(F_s)
            freeze_net(C)
            unfreeze_net(D)

            for critic_loop_index in range(params.model_params["n_critic"]):
                discr_loss_collection = []
                for i, batch_X in enumerate(train_loader):
                    inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, device)

                    # input type is dict if we use transformers
                    shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
                    outputs = D(shared_feat)

                    # D accuracy
                    _, pred = torch.max(outputs, 1)
                    # BUGFIX: `len(inputs)` counted the keys of the input dict
                    # (always 2), not the batch size; use the label count.
                    d_total += adv_labels.size(0)
                    d_correct += (pred == adv_labels).sum().item()
                    l_d = functional.nll_loss(outputs, adv_labels)
                    discr_loss_collection.append(l_d.item())
                    per_epoch_discr_loss_collection.append(l_d.item())
                    l_d.backward()
                    optimizerD.step()
                    D.zero_grad()
                    if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1:
                        avg_loss_dom_discr = np.mean(np.array(discr_loss_collection))
                        discr_loss_collection = []
                        log.info("Discriminator training Epoch {}, "
                                 "critic_loop_index {}, Step {}...".format(epoch, critic_loop_index, i))
                        log.info(f'Training loss domain discriminator over the last {batches_between_logging} '
                                 f'batches: {avg_loss_dom_discr}')
                        writer.add_scalar('Training loss domain discriminator accumulated over last batches',
                                          avg_loss_dom_discr, total_steps)
                        if d_total > 0:
                            log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total))
                            writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                        # dev/test reports here are the ones from the previous
                        # classifier phase (or the initial evaluation)
                        writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict,
                                                                tmp_test_classification_report_dict, total_steps,
                                                                batches_between_logging, classification_report_dict,
                                                                avg_train_losses_classifier,
                                                                avg_train_losses_domain_blurrer,
                                                                avg_train_losses_classifier_dom_blurrer,
                                                                log_dev_test=True, use_accuracy=False)
                        total_steps += 1

            # F&C iteration: train extractor + classifier with D frozen
            unfreeze_net(F_s)
            unfreeze_net(C)
            freeze_net(D)
            train_losses_classifier = []
            train_losses_domain_blurrer = []
            train_losses_classifier_dom_blurrer = []
            all_c_pred = []
            all_labels = []

            for i, batch_X in enumerate(train_loader):
                inputs, labels, adv_labels = unpackKnowMAN_batch(batch_X, device)

                shared_feat = F_s(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                c_outputs = C(shared_feat)
                d_outputs = D(shared_feat)

                _, c_pred = torch.max(c_outputs, 1)
                all_c_pred.append(c_pred.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

                l_c = functional.nll_loss(c_outputs, labels)
                # BUGFIX: compute the scaled domain loss for the *current* batch
                # before recording it; previously the stale l_d left over from
                # the discriminator phase was appended to the loss logs.
                l_d = functional.nll_loss(d_outputs, adv_labels)
                l_d = l_d * params.model_params["lambd"]

                train_losses_classifier.append(l_c.item())
                train_losses_domain_blurrer.append(-l_d.item())
                per_epoch_train_losses_classifier.append(l_c.item())
                per_epoch_train_losses_domain_blurrer.append(-l_d.item())

                # minimize class loss while maximizing D's loss (domain blurring)
                l_shared = l_c - l_d
                train_losses_classifier_dom_blurrer.append(l_shared.item())
                per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item())
                l_shared.backward()

                optimizer.step()
                F_s.zero_grad()
                C.zero_grad()

                if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1:
                    avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer))
                    train_losses_classifier_dom_blurrer = []
                    avg_train_losses_classifier = np.mean(np.array(train_losses_classifier))
                    train_losses_classifier = []
                    avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer))
                    train_losses_domain_blurrer = []
                    log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i))
                    all_c_pred_array = np.concatenate(all_c_pred, axis=0)
                    all_labels_array = np.concatenate(all_labels, axis=0)
                    classification_report_dict = classification_report(y_true=all_labels_array,
                                                                       y_pred=all_c_pred_array,
                                                                       output_dict=True)
                    t = time.localtime()
                    log.info("Time: {}".format(time.strftime("%H:%M:%S", t)))
                    if d_total > 0:
                        log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total))
                        writer.add_scalar(
                            'Training loss domain discriminator accumulated over last batches', avg_loss_dom_discr,
                            total_steps)
                        writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps)

                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log)
                        tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log)

                    writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict,
                                                            tmp_test_classification_report_dict, total_steps,
                                                            batches_between_logging, classification_report_dict,
                                                            avg_train_losses_classifier,
                                                            avg_train_losses_domain_blurrer,
                                                            avg_train_losses_classifier_dom_blurrer,
                                                            log_dev_test=(evaluate_after_batches_between_logging or
                                                                          i == num_training_items - 1),
                                                            use_accuracy=False)

                    # model selection on class-"1" dev f1; save the full triple
                    if evaluate_after_batches_between_logging or i == num_training_items - 1:
                        if tmp_dev_classification_report_dict["1"]["f1-score"] > best_weighted_f1:
                            # BUGFIX: message said "accuracy" although the
                            # tracked metric is the weighted f1 of class "1".
                            log.info(f'New best average validation weighted-f1: '
                                     f'{100.0 * tmp_dev_classification_report_dict["1"]["f1-score"]}%')
                            best_avg_weighted_f1['valid'] = tmp_dev_classification_report_dict['1']["f1-score"]
                            best_avg_weighted_f1['test'] = tmp_test_classification_report_dict['1']["f1-score"]
                            best_weighted_f1 = tmp_dev_classification_report_dict['1']["f1-score"]
                            with open(os.path.join(params.training_setting["model_save_file"], 'options.pkl'),
                                      'wb') as ouf:
                                pickle.dump(params.get_config(), ouf)
                            log.info("Saving new model")
                            torch.save(F_s.state_dict(), '{}/netF_s.pth'.format(
                                params.training_setting["model_save_file"]))
                            torch.save(C.state_dict(), '{}/netC.pth'.format(params.training_setting["model_save_file"]))
                            torch.save(D.state_dict(), '{}/netD.pth'.format(params.training_setting["model_save_file"]))

                    total_steps += 1

            # end of epoch
            log.info('Ending epoch {}'.format(epoch+1))

        # end of training
        log.info(f'Best average validation weighted f1: {100.0*best_weighted_f1}%')
        return best_avg_weighted_f1
targets).sum().item() 296 | all_c_pred.append(pred.cpu().numpy()) 297 | all_labels.append(targets.cpu().numpy()) 298 | all_c_pred_array = np.concatenate(all_c_pred, axis=0) 299 | all_labels_array = np.concatenate(all_labels, axis=0) 300 | classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array, 301 | output_dict=True) 302 | log.debug(confusion.conf) 303 | F_s.train() 304 | C.train() 305 | if return_labels: 306 | return classification_report_dict, all_labels_array, all_c_pred_array 307 | else: 308 | return classification_report_dict 309 | 310 | 311 | def set_seeds(params): 312 | random.seed(params.training_setting["random_seed"]) 313 | np.random.seed(params.training_setting["random_seed"]) 314 | torch.manual_seed(params.training_setting["random_seed"]) 315 | torch.cuda.manual_seed_all(params.training_setting["random_seed"]) 316 | 317 | 318 | def set_logging(params): 319 | # save models and logging 320 | if not os.path.exists(params.training_setting["model_save_file"]): 321 | os.makedirs(params.training_setting["model_save_file"]) 322 | logging.basicConfig(stream=sys.stderr, level=logging.DEBUG if params.training_setting["debug"] else logging.INFO) 323 | log = logging.getLogger(__name__) 324 | fh = logging.FileHandler(os.path.join(params.training_setting["model_save_file"], 'log.txt')) 325 | log.addHandler(fh) 326 | 327 | # output options 328 | log.info(params.get_config()) 329 | 330 | return log 331 | 332 | 333 | def main(): 334 | spouse_params = KnowManParameters() 335 | spouse_params.update_parameters("./spouse_transformer.yaml") 336 | 337 | set_seeds(spouse_params) 338 | log = set_logging(spouse_params) 339 | 340 | if not os.path.exists(spouse_params.training_setting["model_save_file"]): 341 | os.makedirs(spouse_params.training_setting["model_save_file"]) 342 | 343 | feature_ext_enum = FeatureExtraction.name2type(spouse_params.model_params["feature_ext"]) 344 | 345 | train_dataset, dev_dataset, test_dataset = 
get_transformer_dataset(spouse_params.dataset["dataset_path"], 346 | feature_ext=feature_ext_enum, 347 | max_length_transformer= 348 | spouse_params.model_params 349 | ["max_length_transformer"]) 350 | 351 | cv = train(train_dataset, dev_dataset, test_dataset, spouse_params, log, feature_ext_enum=feature_ext_enum) 352 | log.info(f'Training done...') 353 | acc = cv['valid'] 354 | log.info(f'Validation Set \t{100.0*acc}%') 355 | test_acc = cv['test'] 356 | log.info(f'Test Set \t{100.0*test_acc}%') 357 | return cv 358 | 359 | 360 | if __name__ == '__main__': 361 | main() 362 | -------------------------------------------------------------------------------- /experiments/spouse/train_tfidf_spouse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import torch 5 | import pickle 6 | import logging 7 | import random 8 | import itertools 9 | import numpy as np 10 | import torch.optim as optim 11 | 12 | from tqdm import tqdm 13 | from collections import defaultdict 14 | from torch.nn import functional 15 | from torch.utils.data import DataLoader 16 | from torchnet.meter import ConfusionMeter 17 | from sklearn.metrics import classification_report 18 | 19 | from KnowMan.utils.knowman_utils import freeze_net, unfreeze_net 20 | from KnowMan.models.nn_models import MlpFeatureExtractor, SentimentClassifier, DomainClassifier 21 | from KnowMan.data_prep.get_knodle_dataset import get_dataset 22 | from KnowMan.utils.logging_utils import per_step_classifier_tb_logging 23 | from KnowMan.utils.knowman_parameters import KnowManParameters 24 | 25 | 26 | def train(train_set, 27 | dev_set, 28 | test_set, 29 | params, 30 | log, 31 | batch_size=None, 32 | batches_between_logging=None, 33 | evaluate_after_batches_between_logging=None, 34 | dropout=None, 35 | n_crit=None, 36 | lambd=None, 37 | shared_hidden_size=None, 38 | D_learning_rate=None, 39 | learning_rate=None, 40 | C_layers=None, 41 | D_layers=None, 42 | 
F_layers=None): 43 | """ 44 | train_set, dev_set, test_set: raw_datasets from corpus 45 | n_critic, lambda, dropout, shared_hidden_size, F_layers, C_layers, D_layers: hyperparameters that can be set 46 | If hyperparameters are not set, values are taken from opt 47 | since we have all information in one dataset we don't need to loop over different datasets, domains etc... 48 | """ 49 | # initialize these parameters or take their value from params 50 | batch_size = batch_size or params.training_setting["batch_size"] 51 | batches_between_logging = batches_between_logging or params.training_setting["batches_between_logging"] 52 | evaluate_after_batches_between_logging = \ 53 | evaluate_after_batches_between_logging or params.training_setting["evaluate_after_batches_between_logging"] 54 | dropout = dropout or params.model_params["dropout"] 55 | lambd = lambd or params.model_params["lambd"] 56 | n_crit = n_crit or params.model_params["n_critic"] 57 | shared_hidden_size = shared_hidden_size or params.model_params["shared_hidden_size"] 58 | D_learning_rate = D_learning_rate or params.training_setting["D_learning_rate"] 59 | learning_rate = learning_rate or params.training_setting["learning_rate"] 60 | C_layers = C_layers or params.model_params["C_layers"] 61 | D_layers = D_layers or params.model_params["D_layers"] 62 | F_layers = F_layers or params.model_params["F_layers"] 63 | 64 | hyperopt_string = f"bs{batch_size}dr{dropout}lambd{lambd}ncrit{n_crit}shs{shared_hidden_size}" \ 65 | f"Dlr{D_learning_rate}lr{learning_rate}CL{C_layers}DL{D_layers}FL{F_layers}" 66 | 67 | if params.training_setting["use_tensorboard"]: 68 | tensorboard_dir_path = os.path.join(params.training_setting["tensorboard_dir"], hyperopt_string) 69 | from torch.utils.tensorboard import SummaryWriter 70 | writer = SummaryWriter(log_dir=tensorboard_dir_path, comment='Using lambda {}'.format( 71 | params.model_params["lambd"])) 72 | print(f"tensorboard logging path is {tensorboard_dir_path}") 73 | 74 | # 
dataset loaders, make usual PyTroch Loader 75 | train_loader = DataLoader(train_set, batch_size, shuffle=True) 76 | dev_loader = DataLoader(dev_set, batch_size, shuffle=True) 77 | test_loader = DataLoader(test_set, batch_size, shuffle=True) 78 | 79 | # models 80 | F_s = None 81 | C, D = None, None 82 | 83 | F_s = MlpFeatureExtractor(params.model_params["feature_num"], params.model_params["F_hidden_sizes"], 84 | shared_hidden_size, dropout, params.model_params["F_bn"]) 85 | C = SentimentClassifier(C_layers, shared_hidden_size + params.model_params["domain_hidden_size"], 86 | shared_hidden_size + params.model_params["domain_hidden_size"], 87 | params.model_params["num_labels"], dropout, params.model_params["C_bn"]) 88 | D = DomainClassifier(D_layers, shared_hidden_size, shared_hidden_size, 89 | params.model_params["all_domains"], params.model_params["loss"], 90 | dropout, params.model_params["D_bn"]) 91 | 92 | F_s, C, D = F_s.to(params.training_setting["device"]), \ 93 | C.to(params.training_setting["device"]), \ 94 | D.to(params.training_setting["device"]) 95 | 96 | optimizer = optim.Adam(itertools.chain(*map(list, [F_s.parameters() if F_s else [], C.parameters()])), 97 | lr=learning_rate) 98 | optimizerD = optim.Adam(D.parameters(), lr=D_learning_rate) 99 | 100 | # testing 101 | if params.training_setting["test_only"]: 102 | log.info(f'Loading model from {params.training_setting["model_save_file"]}...') 103 | F_s.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netF_s.pth'))) 104 | C.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netC.pth'))) 105 | D.load_state_dict(torch.load(os.path.join(params.training_setting["model_save_file"], f'netD.pth'))) 106 | 107 | log.info('Evaluating validation sets:') 108 | dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 109 | log.info(f'Average validation accuracy: {dev_classification_report_dict["accuracy"]}') 110 | 
log.info('Evaluating test sets:') 111 | test_classification_report_dict, y_true, y_pred = evaluate(test_loader, F_s, C, params, log, return_labels=True) 112 | log.info(f'Average test weighted-f1: {100.0 * test_classification_report_dict["1"]["f1-score"]}%') 113 | log.info(f'Average validation weighted-f1: {100.0 * dev_classification_report_dict["1"]["f1-score"]}%') 114 | 115 | y_true = np.ndarray.tolist(y_true) 116 | y_pred = np.ndarray.tolist(y_pred) 117 | 118 | with open("../../KnowMan/save/labels_best_spouse_tfidf.csv", "w")as out: 119 | for i in range(len(y_true)): 120 | out.write(str(y_true[i]) + "," + str(y_pred[i]) + "\n") 121 | 122 | print({'test': test_classification_report_dict["1"]}) 123 | 124 | # training 125 | else: 126 | best_avg_weighted_f1, best_weighted_f1 = defaultdict(float), 0.0 127 | num_training_items = len(train_loader) 128 | print(f"Number of training batches: {num_training_items}") 129 | total_steps = 0 130 | avg_train_losses_classifier, avg_train_losses_domain_blurrer, avg_train_losses_classifier_dom_blurrer = 0, 0, 0 131 | tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 132 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 133 | classification_report_dict = evaluate(train_loader, F_s, C, params, log) 134 | 135 | for epoch in range(params.training_setting["max_epoch"]): 136 | per_epoch_discr_loss_collection = [] 137 | per_epoch_train_losses_classifier = [] 138 | per_epoch_train_losses_domain_blurrer = [] 139 | per_epoch_train_losses_classifier_dom_blurrer = [] 140 | for group in optimizer.param_groups: 141 | learning_rate = group["lr"] 142 | writer.add_scalar("learning_rate", learning_rate, epoch) 143 | for group in optimizerD.param_groups: 144 | learning_rate = group["lr"] 145 | writer.add_scalar("learning_rate_D", learning_rate, epoch) 146 | 147 | F_s.train() 148 | C.train() 149 | D.train() 150 | 151 | # D iteration 152 | d_correct, d_total = 0, 0 153 | freeze_net(F_s) 154 | 
freeze_net(C) 155 | unfreeze_net(D) 156 | 157 | for critic_loop_index in range(n_crit): 158 | discr_loss_collection = [] 159 | for i, (inputs, labels, adv_labels) in enumerate(train_loader): 160 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]), \ 161 | labels.to(params.training_setting["device"]), \ 162 | adv_labels.to(params.training_setting["device"]) 163 | 164 | shared_feat = F_s(inputs) 165 | outputs = D(shared_feat) 166 | 167 | _, pred = torch.max(outputs, 1) 168 | d_total += len(inputs) 169 | 170 | d_correct += (pred == adv_labels).sum().item() 171 | l_d = functional.nll_loss(outputs, adv_labels) 172 | discr_loss_collection.append(l_d.item()) 173 | per_epoch_discr_loss_collection.append(l_d.item()) 174 | l_d.backward() 175 | optimizerD.step() 176 | D.zero_grad() 177 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items - 1: 178 | avg_loss_dom_discr = np.mean(np.array(discr_loss_collection)) 179 | discr_loss_collection = [] 180 | log.info("Discriminator training Epoch {}, " 181 | "critic_loop_index {}, Step {}...".format(epoch, critic_loop_index, i)) 182 | log.info(f'Training loss domain discriminator over the last {batches_between_logging} ' 183 | f'batches: {avg_loss_dom_discr}') 184 | writer.add_scalar('Training loss domain discriminator accumulated over last batches', 185 | avg_loss_dom_discr, total_steps) 186 | if d_total > 0: 187 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 188 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, 189 | total_steps) 190 | 191 | writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict, 192 | tmp_test_classification_report_dict, total_steps, 193 | batches_between_logging, classification_report_dict, 194 | avg_train_losses_classifier, 195 | avg_train_losses_domain_blurrer, 196 | avg_train_losses_classifier_dom_blurrer, 197 | log_dev_test=True, use_accuracy=False) 198 | total_steps += 1 199 | 200 
| # F&C iteration 201 | unfreeze_net(F_s) 202 | unfreeze_net(C) 203 | freeze_net(D) 204 | train_losses_classifier = [] 205 | train_losses_domain_blurrer = [] 206 | train_losses_classifier_dom_blurrer = [] 207 | all_c_pred = [] 208 | all_labels = [] 209 | 210 | for i, (inputs, labels, adv_labels) in enumerate(train_loader): 211 | inputs, labels, adv_labels = inputs.to(params.training_setting["device"]),\ 212 | labels.to(params.training_setting["device"]), \ 213 | adv_labels.to(params.training_setting["device"]) 214 | shared_feat = F_s(inputs) 215 | c_outputs = C(shared_feat) 216 | d_outputs = D(shared_feat) 217 | 218 | _, c_pred = torch.max(c_outputs, 1) 219 | all_c_pred.append(c_pred.cpu().numpy()) 220 | all_labels.append(labels.cpu().numpy()) 221 | l_c = functional.nll_loss(c_outputs, labels) 222 | train_losses_classifier.append(l_c.item()) 223 | train_losses_domain_blurrer.append(-l_d.item()) 224 | per_epoch_train_losses_classifier.append(l_c.item()) 225 | per_epoch_train_losses_domain_blurrer.append(-l_d.item()) 226 | 227 | l_d = functional.nll_loss(d_outputs, adv_labels) 228 | l_d *= lambd 229 | 230 | l_shared = l_c - l_d 231 | train_losses_classifier_dom_blurrer.append(l_shared.item()) 232 | per_epoch_train_losses_classifier_dom_blurrer.append(l_shared.item()) 233 | l_shared.backward() 234 | 235 | optimizer.step() 236 | F_s.zero_grad() 237 | C.zero_grad() 238 | 239 | if (i > 0 and i % batches_between_logging == 0) or i == num_training_items-1: 240 | avg_train_losses_classifier_dom_blurrer = np.mean(np.array(train_losses_classifier_dom_blurrer)) 241 | train_losses_classifier_dom_blurrer = [] 242 | avg_train_losses_classifier = np.mean(np.array(train_losses_classifier)) 243 | train_losses_classifier = [] 244 | avg_train_losses_domain_blurrer = np.mean(np.array(train_losses_domain_blurrer)) 245 | train_losses_domain_blurrer = [] 246 | log.info("Classifier training Epoch {}, Step: {}...".format(epoch, i)) 247 | all_c_pred_array = np.concatenate(all_c_pred, axis=0) 
248 | all_labels_array = np.concatenate(all_labels, axis=0) 249 | classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array, 250 | output_dict=True) 251 | t = time.localtime() 252 | log.info("Time: {}".format(time.strftime("%H:%M:%S", t))) 253 | if d_total > 0: 254 | log.info('Domain Training Accuracy: {}%'.format(100.0 * d_correct / d_total)) 255 | writer.add_scalar( 256 | 'Training loss domain discriminator accumulated over last batches', avg_loss_dom_discr, 257 | total_steps) 258 | writer.add_scalar("Domain Training Accuracy", d_correct / d_total, total_steps) 259 | 260 | if evaluate_after_batches_between_logging or i == num_training_items - 1: 261 | tmp_dev_classification_report_dict = evaluate(dev_loader, F_s, C, params, log) 262 | tmp_test_classification_report_dict = evaluate(test_loader, F_s, C, params, log) 263 | 264 | writer = per_step_classifier_tb_logging(log, writer, tmp_dev_classification_report_dict, 265 | tmp_test_classification_report_dict, total_steps, 266 | batches_between_logging, classification_report_dict, 267 | avg_train_losses_classifier, 268 | avg_train_losses_domain_blurrer, 269 | avg_train_losses_classifier_dom_blurrer, 270 | log_dev_test=(evaluate_after_batches_between_logging 271 | or i == num_training_items - 1), 272 | use_accuracy=False) 273 | 274 | if evaluate_after_batches_between_logging or i == num_training_items - 1: 275 | if tmp_dev_classification_report_dict['1']["f1-score"] > best_weighted_f1: 276 | log.info(f'New best average validation accuracy:' 277 | f' {100.0 * tmp_dev_classification_report_dict["1"]["f1-score"]}%') 278 | best_avg_weighted_f1['valid'] = tmp_dev_classification_report_dict['1']["f1-score"] 279 | best_avg_weighted_f1['test'] = tmp_test_classification_report_dict['1']["f1-score"] 280 | best_weighted_f1 = tmp_dev_classification_report_dict['1']["f1-score"] 281 | with open(os.path.join(params.training_setting["model_save_file"], 'options.pkl'), 'wb')\ 282 | as ouf: 283 
| pickle.dump(params.get_config(), ouf) 284 | log.info("Saving new model") 285 | torch.save(F_s.state_dict(), 286 | '{}/netF_s.pth'.format(params.training_setting["model_save_file"])) 287 | torch.save(C.state_dict(), '{}/netC.pth'.format(params.training_setting["model_save_file"])) 288 | torch.save(D.state_dict(), '{}/netD.pth'.format(params.training_setting["model_save_file"])) 289 | 290 | total_steps += 1 291 | 292 | # end of epoch 293 | log.info('Ending epoch {}'.format(epoch+1)) 294 | 295 | # end of training 296 | log.info(f'Best average validation weighted f1: {100.0*best_weighted_f1}%') 297 | return best_avg_weighted_f1 298 | 299 | 300 | def evaluate(loader, F_s, C, params, log, return_labels=None): 301 | F_s.eval() 302 | C.eval() 303 | it = iter(loader) 304 | correct = 0 305 | total = 0 306 | all_c_pred = [] 307 | all_labels = [] 308 | confusion = ConfusionMeter(params.model_params["num_labels"]) 309 | for elem in tqdm(it): 310 | inputs, targets = elem[0], elem[1] 311 | inputs, targets = inputs.to(params.training_setting["device"]), targets.to(params.training_setting["device"]) 312 | features = F_s(inputs) 313 | outputs = C(features) 314 | _, pred = torch.max(outputs, 1) 315 | confusion.add(pred.data, targets.data) 316 | total += targets.size(0) 317 | correct += (pred == targets).sum().item() 318 | all_c_pred.append(pred.cpu().numpy()) 319 | all_labels.append(targets.cpu().numpy()) 320 | all_c_pred_array = np.concatenate(all_c_pred, axis=0) 321 | all_labels_array = np.concatenate(all_labels, axis=0) 322 | classification_report_dict = classification_report(y_true=all_labels_array, y_pred=all_c_pred_array, 323 | output_dict=True) 324 | log.debug(confusion.conf) 325 | F_s.train() 326 | C.train() 327 | if return_labels: 328 | return classification_report_dict, all_labels_array, all_c_pred_array 329 | else: 330 | return classification_report_dict 331 | 332 | 333 | def set_seeds(params): 334 | random.seed(params.training_setting["random_seed"]) 335 | 
np.random.seed(params.training_setting["random_seed"]) 336 | torch.manual_seed(params.training_setting["random_seed"]) 337 | torch.cuda.manual_seed_all(params.training_setting["random_seed"]) 338 | 339 | 340 | def set_logging(params): 341 | # save models and logging 342 | if not os.path.exists(params.training_setting["model_save_file"]): 343 | os.makedirs(params.training_setting["model_save_file"]) 344 | logging.basicConfig(stream=sys.stderr, level=logging.DEBUG if params.training_setting["debug"] else logging.INFO) 345 | log = logging.getLogger(__name__) 346 | fh = logging.FileHandler(os.path.join(params.training_setting["model_save_file"], 'log.txt')) 347 | log.addHandler(fh) 348 | 349 | # output options 350 | log.info(params.get_config()) 351 | 352 | return log 353 | 354 | 355 | def main(): 356 | spouse_params = KnowManParameters() 357 | spouse_params.update_parameters("./spouse_tfidf.yaml") 358 | 359 | set_seeds(spouse_params) 360 | log = set_logging(spouse_params) 361 | 362 | if not os.path.exists(spouse_params.training_setting["model_save_file"]): 363 | os.makedirs(spouse_params.training_setting["model_save_file"]) 364 | 365 | train_dataset, dev_dataset, test_dataset = get_dataset(spouse_params.dataset["dataset_path"]) 366 | cv = train(train_dataset, dev_dataset, test_dataset, spouse_params, log) 367 | log.info(f'Training done...') 368 | acc = cv['valid'] 369 | log.info(f'Validation Set \t{100.0*acc}%') 370 | test_acc = cv['test'] 371 | log.info(f'Test Set \t{100.0*test_acc}%') 372 | return cv 373 | 374 | 375 | if __name__ == '__main__': 376 | main() 377 | --------------------------------------------------------------------------------