├── .gitignore ├── .travis.yml ├── LICENSE ├── PyTorch ├── C-MIDP │ ├── layers.py │ ├── test.py │ └── train.py ├── H-MIDP │ ├── layers.py │ ├── test.py │ └── train.py ├── R-MIDP │ ├── layers.py │ ├── test.py │ └── train.py ├── TARNN │ ├── layers.py │ ├── test.py │ └── train.py └── utils │ ├── checkmate.py │ ├── data_helpers.py │ └── param_parser.py ├── README.md ├── TF ├── C-MIDP │ ├── test_cmidp.py │ ├── text_cmidp.py │ └── train_cmidp.py ├── H-MIDP │ ├── test_hmidp.py │ ├── text_hmidp.py │ └── train_hmidp.py ├── R-MIDP │ ├── test_rmidp.py │ ├── text_rmidp.py │ └── train_rmidp.py ├── TARNN │ ├── test_tarnn.py │ ├── text_tarnn.py │ └── train_tarnn.py └── utils │ ├── checkmate.py │ ├── data_helpers.py │ └── param_parser.py ├── TMLA ├── DTR │ ├── test_dtr.py │ └── train_dtr.py ├── LR │ ├── test_lr.py │ └── train_lr.py ├── SVM │ ├── test_svm.py │ └── train_svm.py ├── XGBoost │ ├── test_xgb.py │ └── train_xgb.py └── utils │ └── data_process.py ├── Usage-PyTorch.md ├── Usage-TF.md ├── data ├── Test_BOW_sample.json ├── Test_sample.json ├── Train_BOW_sample.json ├── Train_sample.json ├── Validation_BOW_sample.json └── Validation_sample.json └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | ### Compiled source ### 2 | *.com 3 | *.class 4 | *.dll 5 | *.exe 6 | *.o 7 | *.so 8 | 9 | ### Packages ### 10 | # it's better to unpack these files and commit the raw source 11 | # git has its own built in compression methods 12 | *.7z 13 | *.dmg 14 | *.gz 15 | *.iso 16 | *.jar 17 | *.rar 18 | *.tar 19 | *.zip 20 | 21 | ### Logs and databases ### 22 | *.log 23 | *.sql 24 | *.sqlite 25 | 26 | ### Mac OS generated files ### 27 | .DS_Store 28 | .DS_Store? 29 | ._* 30 | .Spotlight-V100 31 | .Trashes 32 | ehthumbs.db 33 | Thumbs.db 34 | 35 | ### JetBrain config files ### 36 | .idea 37 | 38 | ### Python ### 39 | # Byte-compiled / optimized / DLL files 40 | *.npy 41 | __pycache__/ 42 | *.py[cod] 43 | *$py.class 44 | 45 | # Distribution / packaging 46 | .Python 47 | env/ 48 | build/ 49 | develop-eggs/ 50 | dist/ 51 | downloads/ 52 | eggs/ 53 | .eggs/ 54 | lib/ 55 | lib64/ 56 | parts/ 57 | sdist/ 58 | var/ 59 | *.egg-info/ 60 | .installed.cfg 61 | *.egg 62 | 63 | # PyInstaller 64 | # Usually these files are written by a python script from a template 65 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
66 | *.manifest 67 | *.spec 68 | 69 | # Installer logs 70 | pip-log.txt 71 | pip-delete-this-directory.txt 72 | 73 | # Unit test / coverage reports 74 | htmlcov/ 75 | .tox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *,cover 82 | 83 | # Translations 84 | *.mo 85 | *.pot 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | ### IPythonNotebook ### 94 | # Temporary data 95 | .ipynb_checkpoints/ 96 | 97 | ### Current Project ### 98 | # Data File 99 | *.txt 100 | *.tsv 101 | *.csv 102 | *.json 103 | *.jpg 104 | *.png 105 | *.pickle 106 | *.xls 107 | *.doc 108 | !/data 109 | !/data/Train_sample.json 110 | !/data/Validation_sample.json 111 | !/data/Test_sample.json 112 | !/data/Train_BOW_sample.json 113 | !/data/Validation_BOW_sample.json 114 | !/data/Test_BOW_sample.json 115 | 116 | # Model File 117 | *.model 118 | *.pb 119 | runs/ 120 | graph/ 121 | 122 | # Analysis File 123 | Data Analysis.md 124 | 125 | # Log File 126 | logs/ 127 | 128 | # Related Code 129 | temp.py 130 | data/preprocess.py 131 | TF/utils/pairwise_data_helpers.py 132 | TF/TACNN 133 | TF/PARNN 134 | PyTorch/Quesnet/ 135 | PyTorch/Others 136 | english_difficulty_prediction_dtr 137 | english_difficulty_prediction_tf 138 | english_difficulty_prediction_pytorch 139 | 140 | ### Else ### 141 | randolph/ 142 | Icon? 143 | *.graffle -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: 3.6 6 | 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install coveralls 10 | 11 | before_script: 12 | - export PYTHONPATH=$PWD 13 | 14 | script: 15 | - true # add other tests here 16 | - coveralls -------------------------------------------------------------------------------- /PyTorch/C-MIDP/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | """CMIDP layers.""" 5 | 6 | import torch 7 | import numpy as np 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | 13 | class ConvLayer(nn.Module): 14 | def __init__(self, input_units, num_filters, filter_size): 15 | super(ConvLayer, self).__init__() 16 | self.conv = nn.Conv1d(in_channels=input_units, out_channels=num_filters, 17 | kernel_size=filter_size, padding=filter_size - 1, stride=1) 18 | 19 | def forward(self, input_x, pooling_size): 20 | conv_out = F.relu(self.conv(input_x)) 21 | pooled_out = F.max_pool1d(conv_out, kernel_size=pooling_size) 22 | return pooled_out 23 | 24 | 25 | class CMIDP(nn.Module): 26 | """An implementation of CMIDP""" 27 | 28 | def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None): 29 | super(CMIDP, self).__init__() 30 | """ 31 | :param args: Arguments object. 32 | """ 33 | self.args = args 34 | self.vocab_size = vocab_size 35 | self.embedding_size = embedding_size 36 | self.pretrained_embedding = pretrained_embedding 37 | self._setup_layers() 38 | 39 | def _setup_embedding_layer(self): 40 | """ 41 | Creating Embedding layers. 
42 | """ 43 | if self.pretrained_embedding is None: 44 | embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size))) 45 | embedding_weight = Variable(embedding_weight, requires_grad=True) 46 | else: 47 | if self.args.embedding_type == 0: 48 | embedding_weight = torch.from_numpy(self.pretrained_embedding).float() 49 | if self.args.embedding_type == 1: 50 | embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True) 51 | self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight) 52 | 53 | def _setup_conv_layer(self): 54 | """ 55 | Creating Convolution Layer. 56 | """ 57 | 58 | self.conv1 = ConvLayer(input_units=self.embedding_size, num_filters=self.args.num_filters[0], 59 | filter_size=self.args.filter_sizes[0]) 60 | self.conv2 = ConvLayer(input_units=self.args.num_filters[0], num_filters=self.args.num_filters[1], 61 | filter_size=self.args.filter_sizes[1]) 62 | 63 | def _setup_fc_layer(self): 64 | """ 65 | Creating FC Layer. 66 | """ 67 | self.fc = nn.Linear(in_features=self.args.num_filters[1], out_features=self.args.fc_dim, bias=True) 68 | self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True) 69 | 70 | def _setup_layers(self): 71 | """ 72 | Creating layers of model. 73 | 1. Embedding Layer. 74 | 2. Convolution Layer. 75 | 3. FC Layer. 76 | """ 77 | self._setup_embedding_layer() 78 | self._setup_conv_layer() 79 | self._setup_fc_layer() 80 | 81 | def _sub_network(self, x_content, x_question, x_option): 82 | embedded_sentence_content = self.embedding(x_content) 83 | embedded_sentence_question = self.embedding(x_question) 84 | embedded_sentence_option = self.embedding(x_option) 85 | 86 | # Concat Vectors 87 | # [batch_size, sequence_length_all, embedding_size] 88 | sequence_length_total = sum(self.args.pad_seq_len) 89 | embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question, 90 | embedded_sentence_option), dim=1) 91 | # [batch_size, embedding_size, sequence_length_all] 92 | embedded_sentence_transpose = embedded_sentence_all.permute(0, 2, 1) 93 | 94 | # Convolution Layer 1 95 | conv1_out = self.conv1(embedded_sentence_transpose, pooling_size=self.args.pooling_size) 96 | 97 | # Convolution Layer 2 98 | new_pooling_size = (sequence_length_total + self.args.filter_sizes[0] - 1) // self.args.pooling_size 99 | # conv2_out: [batch_size, num_filters[1], 1] 100 | conv2_out = self.conv2(conv1_out, pooling_size=new_pooling_size) 101 | 102 | # conv_final_flat: [batch_size, num_filters[1]] 103 | conv_final_flat = conv2_out.view(-1, conv2_out.size(1)) 104 | 105 | # Fully Connected Layer 106 | fc_out = self.fc(conv_final_flat) 107 | 108 | # Final scores 109 | logits = self.out(fc_out).squeeze() 110 | scores = torch.sigmoid(logits) 111 | 112 | return logits, scores 113 | 114 | def forward(self, x_fb_content, x_fb_question, x_fb_option): 115 | """ 116 | Forward propagation pass. 117 | :param x_fb_content: Front & Behind Content tensors with features. 118 | :param x_fb_question: Front & Behind Question tensors with features. 119 | :param x_fb_option: Front & Behind Option tensors with features. 120 | :return logits: The predicted logistic values. 121 | :return scores: The predicted scores. 
122 | """ 123 | f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0]) 124 | b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1]) 125 | 126 | logits = (f_logits, b_logits) 127 | scores = (f_scores, b_scores) 128 | return logits, scores 129 | 130 | 131 | class Loss(nn.Module): 132 | def __init__(self): 133 | super(Loss, self).__init__() 134 | 135 | def forward(self, predict_y, input_y): 136 | # Loss 137 | value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1]) 138 | losses = torch.mean(torch.pow(value, 2)) 139 | return losses 140 | -------------------------------------------------------------------------------- /PyTorch/C-MIDP/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | 9 | sys.path.append('../') 10 | 11 | from layers import CMIDP, Loss 12 | from utils import checkmate as cm 13 | from utils import data_helpers as dh 14 | from utils import param_parser as parser 15 | from tqdm import trange 16 | from torch.utils.data import TensorDataset, DataLoader 17 | from sklearn.metrics import mean_squared_error, r2_score 18 | 19 | args = parser.parameter_parser() 20 | MODEL = dh.get_model_name() 21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime())) 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | 24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL)) 25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL)) 26 | 27 | 28 | def test(): 29 | logger.info("Loading Data...") 30 | logger.info("Data processing...") 31 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file) 32 | logger.info("Data padding...") 33 | test_dataset = dh.MyData(test_data, args.pad_seq_len, device) 34 | test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) 35 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 36 | 37 | criterion = Loss() 38 | net = CMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 39 | checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) 40 | checkpoint = torch.load(checkpoint_file) 41 | net.load_state_dict(checkpoint['model_state_dict']) 42 | net.eval() 43 | 44 | logger.info("Scoring...") 45 | true_labels, predicted_scores = [], [] 46 | batches = trange(len(test_loader), desc="Batches", leave=True) 47 | for batch_cnt, batch in zip(batches, test_loader): 48 | x_test_fb_content, x_test_fb_question, x_test_fb_option, \ 49 | x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch 50 | logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option) 51 | for i in y_test_fb[0].tolist(): 52 | true_labels.append(i) 53 | for j in scores[0].tolist(): 54 | predicted_scores.append(j) 55 | 56 | # Calculate the Metrics 57 | test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 58 | test_r2 = r2_score(true_labels, predicted_scores) 59 | test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores) 60 | logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}" 61 | .format(test_pcc, test_doa, test_rmse, test_r2)) 62 | logger.info('Test Finished.') 63 | 64 | logger.info('Creating the prediction file...') 65 | dh.create_prediction_file(save_dir=SAVE_DIR, 
identifiers=test_data['f_id'], predictions=predicted_scores) 66 | 67 | logger.info('All Finished.') 68 | 69 | 70 | if __name__ == "__main__": 71 | test() 72 | 73 | -------------------------------------------------------------------------------- /PyTorch/C-MIDP/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | 10 | sys.path.append('../') 11 | 12 | from layers import CMIDP, Loss 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from tqdm import tqdm, trange 17 | from torch.utils.tensorboard import SummaryWriter 18 | from torch.utils.data import TensorDataset, DataLoader 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | 22 | args = parser.parameter_parser() 23 | OPTION = dh.option() 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | 28 | def train(): 29 | """Training CMIDP model.""" 30 | dh.tab_printer(args, logger) 31 | 32 | # Load sentences, labels, and training parameters 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file) 36 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file) 37 | 38 | logger.info("Data padding...") 39 | train_dataset = dh.MyData(train_data, args.pad_seq_len, device) 40 | val_dataset = dh.MyData(val_data, args.pad_seq_len, device) 41 | 42 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) 43 | val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) 44 | 45 | # Load word2vec model 46 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 47 | 48 | # Init network 49 | logger.info("Init nn...") 50 | net = CMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 51 | 52 | print("Model's state_dict:") 53 | for param_tensor in net.state_dict(): 54 | print(param_tensor, "\t", net.state_dict()[param_tensor].size()) 55 | 56 | criterion = Loss() 57 | optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) 58 | 59 | if OPTION == 'T': 60 | timestamp = str(int(time.time())) 61 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 62 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 63 | logger.info("Writing to {0}\n".format(out_dir)) 64 | elif OPTION == 'R': 65 | timestamp = input("[Input] Please input the checkpoints model you want to restore: ") 66 | while not (timestamp.isdigit() and len(timestamp) == 10): 67 | timestamp = input("[Warning] The format of your input is illegal, please re-input: ") 68 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 69 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 70 | logger.info("Writing to {0}\n".format(out_dir)) 71 | checkpoint = torch.load(out_dir) 72 | net.load_state_dict(checkpoint['model_state_dict']) 73 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 74 | 75 | logger.info("Training...") 76 | writer = SummaryWriter('summary') 77 | 78 | 
def eval_model(val_loader, epoch): 79 | """ 80 | Evaluate on the validation set. 81 | """ 82 | net.eval() 83 | eval_loss = 0.0 84 | true_labels, predicted_scores = [], [] 85 | for batch in val_loader: 86 | x_val_fb_content, x_val_fb_question, x_val_fb_option, \ 87 | x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch 88 | 89 | logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option) 90 | avg_batch_loss = criterion(scores, y_val_fb) 91 | eval_loss = eval_loss + avg_batch_loss.item() 92 | for i in y_val_fb[0].tolist(): 93 | true_labels.append(i) 94 | for j in scores[0].tolist(): 95 | predicted_scores.append(j) 96 | 97 | # Calculate the Metrics 98 | eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 99 | eval_r2 = r2_score(true_labels, predicted_scores) 100 | eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores) 101 | eval_loss = eval_loss / len(val_loader) 102 | cur_value = eval_rmse 103 | logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}" 104 | .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2)) 105 | writer.add_scalar('validation loss', eval_loss, epoch) 106 | writer.add_scalar('validation PCC', eval_pcc, epoch) 107 | writer.add_scalar('validation DOA', eval_doa, epoch) 108 | writer.add_scalar('validation RMSE', eval_rmse, epoch) 109 | writer.add_scalar('validation R2', eval_r2, epoch) 110 | return cur_value 111 | 112 | for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): 113 | # Training step 114 | batches = trange(len(train_loader), desc="Batches", leave=True) 115 | for batch_cnt, batch in zip(batches, train_loader): 116 | net.train() 117 | x_train_fb_content, x_train_fb_question, x_train_fb_option, \ 118 | x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch 119 | 120 | optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 121 | logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option) 122 | avg_batch_loss = criterion(scores, y_train_fb) 123 | avg_batch_loss.backward() 124 | optimizer.step() # Parameter updating 125 | batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item())) 126 | logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item())) 127 | writer.add_scalar('training loss', avg_batch_loss, batch_cnt) 128 | # Evaluation step 129 | cur_value = eval_model(val_loader, epoch) 130 | saver.handle(cur_value, net, optimizer, epoch) 131 | writer.close() 132 | 133 | logger.info('Training Finished.') 134 | 135 | 136 | if __name__ == "__main__": 137 | train() 138 | 139 | -------------------------------------------------------------------------------- /PyTorch/H-MIDP/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | """HMIDP layers.""" 5 | 6 | import torch 7 | import numpy as np 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | 13 | class ConvLayer(nn.Module): 14 | def __init__(self, input_units, num_filters, filter_size): 15 | super(ConvLayer, self).__init__() 16 | self.conv = nn.Conv1d(in_channels=input_units, out_channels=num_filters, 17 | kernel_size=filter_size, padding=filter_size - 1, stride=1) 18 | 19 | def forward(self, input_x, pooling_size): 20 | conv_out = F.relu(self.conv(input_x)) 21 | pooled_out = F.max_pool1d(conv_out, kernel_size=pooling_size) 22 | return pooled_out 
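# NOTE (explanatory sketch of the length arithmetic used below): with padding = filter_size - 1
# and stride 1, the Conv1d in ConvLayer maps an input of length L to length L + filter_size - 1,
# and the following F.max_pool1d (kernel_size = pooling_size, default stride = kernel_size)
# reduces that to floor((L + filter_size - 1) / pooling_size).  This is the same arithmetic the
# model applies later in this file when it computes
#     new_pooling_size = (sequence_length_total + filter_sizes[0] - 1) // pooling_size
# so that the second convolution is pooled down to a single position per filter.
# Illustrative numbers only (assumed for the example, not taken from the repo's default arguments):
# L = 350, filter_size = 3, pooling_size = 5  ->  conv output length 352, pooled length 70.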
23 | 24 | 25 | class BiRNNLayer(nn.Module): 26 | def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob): 27 | super(BiRNNLayer, self).__init__() 28 | if rnn_type == 'LSTM': 29 | self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers, 30 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 31 | if rnn_type == 'GRU': 32 | self.bi_rnn = nn.GRU(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers, 33 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 34 | 35 | def forward(self, input_x): 36 | rnn_out, _ = self.bi_rnn(input_x) 37 | rnn_avg = torch.mean(rnn_out, dim=1) 38 | return rnn_out, rnn_avg 39 | 40 | 41 | class HMIDP(nn.Module): 42 | """An implementation of HMIDP""" 43 | 44 | def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None): 45 | super(HMIDP, self).__init__() 46 | """ 47 | :param args: Arguments object. 48 | """ 49 | self.args = args 50 | self.vocab_size = vocab_size 51 | self.embedding_size = embedding_size 52 | self.pretrained_embedding = pretrained_embedding 53 | self._setup_layers() 54 | 55 | def _setup_embedding_layer(self): 56 | """ 57 | Creating Embedding layers. 58 | """ 59 | if self.pretrained_embedding is None: 60 | embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size))) 61 | embedding_weight = Variable(embedding_weight, requires_grad=True) 62 | else: 63 | if self.args.embedding_type == 0: 64 | embedding_weight = torch.from_numpy(self.pretrained_embedding).float() 65 | if self.args.embedding_type == 1: 66 | embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True) 67 | self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight) 68 | 69 | def _setup_conv_layer(self): 70 | """ 71 | Creating Convolution Layer. 72 | """ 73 | 74 | self.conv1 = ConvLayer(input_units=self.embedding_size, num_filters=self.args.num_filters[0], 75 | filter_size=self.args.filter_sizes[0]) 76 | self.conv2 = ConvLayer(input_units=self.args.num_filters[0], num_filters=self.args.num_filters[1], 77 | filter_size=self.args.filter_sizes[1]) 78 | 79 | def _setup_bi_rnn_layer(self): 80 | """ 81 | Creating Bi-RNN Layer. 82 | """ 83 | self.bi_rnn = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type, 84 | rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim, 85 | dropout_keep_prob=self.args.dropout_rate) 86 | 87 | def _setup_fc_layer(self): 88 | """ 89 | Creating FC Layer. 90 | """ 91 | self.fc = nn.Linear(in_features=self.args.num_filters[1] + self.args.rnn_dim * 2, 92 | out_features=self.args.fc_dim, bias=True) 93 | self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True) 94 | 95 | def _setup_layers(self): 96 | """ 97 | Creating layers of model. 98 | 1. Embedding Layer. 99 | 2. Convolution Layer. 100 | 3. Bi-RNN Layer. 101 | 4. FC Layer. 
102 | """ 103 | self._setup_embedding_layer() 104 | self._setup_conv_layer() 105 | self._setup_bi_rnn_layer() 106 | self._setup_fc_layer() 107 | 108 | def _sub_network(self, x_content, x_question, x_option): 109 | embedded_sentence_content = self.embedding(x_content) 110 | embedded_sentence_question = self.embedding(x_question) 111 | embedded_sentence_option = self.embedding(x_option) 112 | 113 | # Concat Vectors 114 | # [batch_size, sequence_length_all, embedding_size] 115 | sequence_length_total = sum(self.args.pad_seq_len) 116 | embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question, 117 | embedded_sentence_option), dim=1) 118 | # [batch_size, embedding_size, sequence_length_all] 119 | embedded_sentence_transpose = embedded_sentence_all.permute(0, 2, 1) 120 | 121 | # Convolution Layer 1 122 | conv1_out = self.conv1(embedded_sentence_transpose, pooling_size=self.args.pooling_size) 123 | 124 | # Convolution Layer 2 125 | new_pooling_size = (sequence_length_total + self.args.filter_sizes[0] - 1) // self.args.pooling_size 126 | # conv2_out: [batch_size, num_filters[1], 1] 127 | conv2_out = self.conv2(conv1_out, pooling_size=new_pooling_size) 128 | 129 | # conv_final_flat: [batch_size, num_filters[1]] 130 | conv_final_flat = conv2_out.view(-1, conv2_out.size(1)) 131 | 132 | # Bi-RNN Layer 133 | # rnn_pooled: [batch_size, rnn_hidden_size * 2] 134 | rnn_out, rnn_pooled = self.bi_rnn(embedded_sentence_all) 135 | 136 | # Concat 137 | concat = torch.cat((conv_final_flat, rnn_pooled), dim=1) 138 | 139 | # Fully Connected Layer 140 | fc_out = self.fc(concat) 141 | 142 | # Final scores 143 | logits = self.out(fc_out).squeeze() 144 | scores = torch.sigmoid(logits) 145 | 146 | return logits, scores 147 | 148 | def forward(self, x_fb_content, x_fb_question, x_fb_option): 149 | """ 150 | Forward propagation pass. 151 | :param x_fb_content: Front & Behind Content tensors with features. 152 | :param x_fb_question: Front & Behind Question tensors with features. 153 | :param x_fb_option: Front & Behind Option tensors with features. 154 | :return logits: The predicted logistic values. 155 | :return scores: The predicted scores. 
156 | """ 157 | f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0]) 158 | b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1]) 159 | 160 | logits = (f_logits, b_logits) 161 | scores = (f_scores, b_scores) 162 | return logits, scores 163 | 164 | 165 | class Loss(nn.Module): 166 | def __init__(self): 167 | super(Loss, self).__init__() 168 | 169 | def forward(self, predict_y, input_y): 170 | # Loss 171 | value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1]) 172 | losses = torch.mean(torch.pow(value, 2)) 173 | return losses 174 | -------------------------------------------------------------------------------- /PyTorch/H-MIDP/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | 9 | sys.path.append('../') 10 | 11 | from layers import HMIDP, Loss 12 | from utils import checkmate as cm 13 | from utils import data_helpers as dh 14 | from utils import param_parser as parser 15 | from tqdm import trange 16 | from torch.utils.data import TensorDataset, DataLoader 17 | from sklearn.metrics import mean_squared_error, r2_score 18 | 19 | args = parser.parameter_parser() 20 | MODEL = dh.get_model_name() 21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime())) 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | 24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL)) 25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL)) 26 | 27 | 28 | def test(): 29 | logger.info("Loading Data...") 30 | logger.info("Data processing...") 31 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file) 32 | logger.info("Data padding...") 33 | test_dataset = dh.MyData(test_data, args.pad_seq_len, device) 34 | test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) 35 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 36 | 37 | criterion = Loss() 38 | net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 39 | checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) 40 | checkpoint = torch.load(checkpoint_file) 41 | net.load_state_dict(checkpoint['model_state_dict']) 42 | net.eval() 43 | 44 | logger.info("Scoring...") 45 | true_labels, predicted_scores = [], [] 46 | batches = trange(len(test_loader), desc="Batches", leave=True) 47 | for batch_cnt, batch in zip(batches, test_loader): 48 | x_test_fb_content, x_test_fb_question, x_test_fb_option, \ 49 | x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch 50 | logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option) 51 | for i in y_test_fb[0].tolist(): 52 | true_labels.append(i) 53 | for j in scores[0].tolist(): 54 | predicted_scores.append(j) 55 | 56 | # Calculate the Metrics 57 | test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 58 | test_r2 = r2_score(true_labels, predicted_scores) 59 | test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores) 60 | logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}" 61 | .format(test_pcc, test_doa, test_rmse, test_r2)) 62 | logger.info('Test Finished.') 63 | 64 | logger.info('Creating the prediction file...') 65 | dh.create_prediction_file(save_dir=SAVE_DIR, 
identifiers=test_data['f_id'], predictions=predicted_scores) 66 | 67 | logger.info('All Finished.') 68 | 69 | 70 | if __name__ == "__main__": 71 | test() 72 | 73 | -------------------------------------------------------------------------------- /PyTorch/H-MIDP/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | 10 | sys.path.append('../') 11 | 12 | from layers import HMIDP, Loss 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from tqdm import tqdm, trange 17 | from torch.utils.tensorboard import SummaryWriter 18 | from torch.utils.data import TensorDataset, DataLoader 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | 22 | args = parser.parameter_parser() 23 | OPTION = dh.option() 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | 28 | def train(): 29 | """Training HMIDP model.""" 30 | dh.tab_printer(args, logger) 31 | 32 | # Load sentences, labels, and training parameters 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file) 36 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file) 37 | 38 | logger.info("Data padding...") 39 | train_dataset = dh.MyData(train_data, args.pad_seq_len, device) 40 | val_dataset = dh.MyData(val_data, args.pad_seq_len, device) 41 | 42 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) 43 | val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) 44 | 45 | # Load word2vec model 46 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 47 | 48 | # Init network 49 | logger.info("Init nn...") 50 | net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 51 | 52 | print("Model's state_dict:") 53 | for param_tensor in net.state_dict(): 54 | print(param_tensor, "\t", net.state_dict()[param_tensor].size()) 55 | 56 | criterion = Loss() 57 | optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) 58 | 59 | if OPTION == 'T': 60 | timestamp = str(int(time.time())) 61 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 62 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 63 | logger.info("Writing to {0}\n".format(out_dir)) 64 | elif OPTION == 'R': 65 | timestamp = input("[Input] Please input the checkpoints model you want to restore: ") 66 | while not (timestamp.isdigit() and len(timestamp) == 10): 67 | timestamp = input("[Warning] The format of your input is illegal, please re-input: ") 68 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 69 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 70 | logger.info("Writing to {0}\n".format(out_dir)) 71 | checkpoint = torch.load(out_dir) 72 | net.load_state_dict(checkpoint['model_state_dict']) 73 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 74 | 75 | logger.info("Training...") 76 | writer = SummaryWriter('summary') 77 | 78 | 
def eval_model(val_loader, epoch): 79 | """ 80 | Evaluate on the validation set. 81 | """ 82 | net.eval() 83 | eval_loss = 0.0 84 | true_labels, predicted_scores = [], [] 85 | for batch in val_loader: 86 | x_val_fb_content, x_val_fb_question, x_val_fb_option, \ 87 | x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch 88 | 89 | logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option) 90 | avg_batch_loss = criterion(scores, y_val_fb) 91 | eval_loss = eval_loss + avg_batch_loss.item() 92 | for i in y_val_fb[0].tolist(): 93 | true_labels.append(i) 94 | for j in scores[0].tolist(): 95 | predicted_scores.append(j) 96 | 97 | # Calculate the Metrics 98 | eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 99 | eval_r2 = r2_score(true_labels, predicted_scores) 100 | eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores) 101 | eval_loss = eval_loss / len(val_loader) 102 | cur_value = eval_rmse 103 | logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}" 104 | .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2)) 105 | writer.add_scalar('validation loss', eval_loss, epoch) 106 | writer.add_scalar('validation PCC', eval_pcc, epoch) 107 | writer.add_scalar('validation DOA', eval_doa, epoch) 108 | writer.add_scalar('validation RMSE', eval_rmse, epoch) 109 | writer.add_scalar('validation R2', eval_r2, epoch) 110 | return cur_value 111 | 112 | for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): 113 | # Training step 114 | batches = trange(len(train_loader), desc="Batches", leave=True) 115 | for batch_cnt, batch in zip(batches, train_loader): 116 | net.train() 117 | x_train_fb_content, x_train_fb_question, x_train_fb_option, \ 118 | x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch 119 | 120 | optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 121 | logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option) 122 | avg_batch_loss = criterion(scores, y_train_fb) 123 | avg_batch_loss.backward() 124 | optimizer.step() # Parameter updating 125 | batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item())) 126 | logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item())) 127 | writer.add_scalar('training loss', avg_batch_loss, batch_cnt) 128 | # Evaluation step 129 | cur_value = eval_model(val_loader, epoch) 130 | saver.handle(cur_value, net, optimizer, epoch) 131 | writer.close() 132 | 133 | logger.info('Training Finished.') 134 | 135 | 136 | if __name__ == "__main__": 137 | train() 138 | 139 | -------------------------------------------------------------------------------- /PyTorch/R-MIDP/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | """RMIDP layers.""" 5 | 6 | import torch 7 | import numpy as np 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | 13 | class BiRNNLayer(nn.Module): 14 | def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob): 15 | super(BiRNNLayer, self).__init__() 16 | if rnn_type == 'LSTM': 17 | self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers, 18 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 19 | if rnn_type == 'GRU': 20 | self.bi_rnn = nn.GRU(input_size=input_units, 
hidden_size=rnn_hidden_size, num_layers=rnn_layers, 21 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 22 | 23 | def forward(self, input_x): 24 | rnn_out, _ = self.bi_rnn(input_x) 25 | rnn_avg = torch.mean(rnn_out, dim=1) 26 | return rnn_out, rnn_avg 27 | 28 | 29 | class RMIDP(nn.Module): 30 | """An implementation of RMIDP""" 31 | 32 | def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None): 33 | super(RMIDP, self).__init__() 34 | """ 35 | :param args: Arguments object. 36 | """ 37 | self.args = args 38 | self.vocab_size = vocab_size 39 | self.embedding_size = embedding_size 40 | self.pretrained_embedding = pretrained_embedding 41 | self._setup_layers() 42 | 43 | def _setup_embedding_layer(self): 44 | """ 45 | Creating Embedding layers. 46 | """ 47 | if self.pretrained_embedding is None: 48 | embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size))) 49 | embedding_weight = Variable(embedding_weight, requires_grad=True) 50 | else: 51 | if self.args.embedding_type == 0: 52 | embedding_weight = torch.from_numpy(self.pretrained_embedding).float() 53 | if self.args.embedding_type == 1: 54 | embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True) 55 | self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight) 56 | 57 | def _setup_bi_rnn_layer(self): 58 | """ 59 | Creating Bi-RNN Layer. 60 | """ 61 | self.bi_rnn = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type, 62 | rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim, 63 | dropout_keep_prob=self.args.dropout_rate) 64 | 65 | def _setup_fc_layer(self): 66 | """ 67 | Creating FC Layer. 68 | """ 69 | self.fc = nn.Linear(in_features=self.args.rnn_dim * 2, out_features=self.args.fc_dim, bias=True) 70 | self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True) 71 | 72 | def _setup_layers(self): 73 | """ 74 | Creating layers of model. 75 | 1. Embedding Layer. 76 | 2. Bi-RNN Layer. 77 | 3. FC Layer. 78 | """ 79 | self._setup_embedding_layer() 80 | self._setup_bi_rnn_layer() 81 | self._setup_fc_layer() 82 | 83 | def _sub_network(self, x_content, x_question, x_option): 84 | embedded_sentence_content = self.embedding(x_content) 85 | embedded_sentence_question = self.embedding(x_question) 86 | embedded_sentence_option = self.embedding(x_option) 87 | 88 | # Concat Vectors 89 | # [batch_size, sequence_length_all, embedding_size] 90 | embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question, 91 | embedded_sentence_option), dim=1) 92 | 93 | # Bi-RNN Layer 94 | rnn_out, rnn_pooled = self.bi_rnn(embedded_sentence_all) 95 | 96 | # Fully Connected Layer 97 | fc_out = self.fc(rnn_pooled) 98 | 99 | # Final scores 100 | logits = self.out(fc_out).squeeze() 101 | scores = torch.sigmoid(logits) 102 | 103 | return logits, scores 104 | 105 | def forward(self, x_fb_content, x_fb_question, x_fb_option): 106 | """ 107 | Forward propagation pass. 108 | :param x_fb_content: Front & Behind Content tensors with features. 109 | :param x_fb_question: Front & Behind Question tensors with features. 110 | :param x_fb_option: Front & Behind Option tensors with features. 111 | :return logits: The predicted logistic values. 112 | :return scores: The predicted scores. 
113 | """ 114 | f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0]) 115 | b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1]) 116 | 117 | logits = (f_logits, b_logits) 118 | scores = (f_scores, b_scores) 119 | return logits, scores 120 | 121 | 122 | class Loss(nn.Module): 123 | def __init__(self): 124 | super(Loss, self).__init__() 125 | 126 | def forward(self, predict_y, input_y): 127 | # Loss 128 | value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1]) 129 | losses = torch.mean(torch.pow(value, 2)) 130 | return losses 131 | -------------------------------------------------------------------------------- /PyTorch/R-MIDP/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | 9 | sys.path.append('../') 10 | 11 | from layers import RMIDP, Loss 12 | from utils import checkmate as cm 13 | from utils import data_helpers as dh 14 | from utils import param_parser as parser 15 | from tqdm import trange 16 | from torch.utils.data import TensorDataset, DataLoader 17 | from sklearn.metrics import mean_squared_error, r2_score 18 | 19 | args = parser.parameter_parser() 20 | MODEL = dh.get_model_name() 21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime())) 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | 24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL)) 25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL)) 26 | 27 | 28 | def test(): 29 | logger.info("Loading Data...") 30 | logger.info("Data processing...") 31 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file) 32 | logger.info("Data padding...") 33 | test_dataset = dh.MyData(test_data, args.pad_seq_len, device) 34 | test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) 35 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 36 | 37 | criterion = Loss() 38 | net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 39 | checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) 40 | checkpoint = torch.load(checkpoint_file) 41 | net.load_state_dict(checkpoint['model_state_dict']) 42 | net.eval() 43 | 44 | logger.info("Scoring...") 45 | true_labels, predicted_scores = [], [] 46 | batches = trange(len(test_loader), desc="Batches", leave=True) 47 | for batch_cnt, batch in zip(batches, test_loader): 48 | x_test_fb_content, x_test_fb_question, x_test_fb_option, \ 49 | x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch 50 | logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option) 51 | for i in y_test_fb[0].tolist(): 52 | true_labels.append(i) 53 | for j in scores[0].tolist(): 54 | predicted_scores.append(j) 55 | 56 | # Calculate the Metrics 57 | test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 58 | test_r2 = r2_score(true_labels, predicted_scores) 59 | test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores) 60 | logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}" 61 | .format(test_pcc, test_doa, test_rmse, test_r2)) 62 | logger.info('Test Finished.') 63 | 64 | logger.info('Creating the prediction file...') 65 | dh.create_prediction_file(save_dir=SAVE_DIR, 
identifiers=test_data['f_id'], predictions=predicted_scores) 66 | 67 | logger.info('All Finished.') 68 | 69 | 70 | if __name__ == "__main__": 71 | test() 72 | 73 | -------------------------------------------------------------------------------- /PyTorch/R-MIDP/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | 10 | sys.path.append('../') 11 | 12 | from layers import RMIDP, Loss 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from tqdm import tqdm, trange 17 | from torch.utils.tensorboard import SummaryWriter 18 | from torch.utils.data import TensorDataset, DataLoader 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | 22 | args = parser.parameter_parser() 23 | OPTION = dh.option() 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | 28 | def train(): 29 | """Training RMIDP model.""" 30 | dh.tab_printer(args, logger) 31 | 32 | # Load sentences, labels, and training parameters 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file) 36 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file) 37 | 38 | logger.info("Data padding...") 39 | train_dataset = dh.MyData(train_data, args.pad_seq_len, device) 40 | val_dataset = dh.MyData(val_data, args.pad_seq_len, device) 41 | 42 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) 43 | val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) 44 | 45 | # Load word2vec model 46 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 47 | 48 | # Init network 49 | logger.info("Init nn...") 50 | net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 51 | 52 | print("Model's state_dict:") 53 | for param_tensor in net.state_dict(): 54 | print(param_tensor, "\t", net.state_dict()[param_tensor].size()) 55 | 56 | criterion = Loss() 57 | optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) 58 | 59 | if OPTION == 'T': 60 | timestamp = str(int(time.time())) 61 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 62 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 63 | logger.info("Writing to {0}\n".format(out_dir)) 64 | elif OPTION == 'R': 65 | timestamp = input("[Input] Please input the checkpoints model you want to restore: ") 66 | while not (timestamp.isdigit() and len(timestamp) == 10): 67 | timestamp = input("[Warning] The format of your input is illegal, please re-input: ") 68 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 69 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 70 | logger.info("Writing to {0}\n".format(out_dir)) 71 | checkpoint = torch.load(out_dir) 72 | net.load_state_dict(checkpoint['model_state_dict']) 73 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 74 | 75 | logger.info("Training...") 76 | writer = SummaryWriter('summary') 77 | 78 | 
def eval_model(val_loader, epoch): 79 | """ 80 | Evaluate on the validation set. 81 | """ 82 | net.eval() 83 | eval_loss = 0.0 84 | true_labels, predicted_scores = [], [] 85 | for batch in val_loader: 86 | x_val_fb_content, x_val_fb_question, x_val_fb_option, \ 87 | x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch 88 | 89 | logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option) 90 | avg_batch_loss = criterion(scores, y_val_fb) 91 | eval_loss = eval_loss + avg_batch_loss.item() 92 | for i in y_val_fb[0].tolist(): 93 | true_labels.append(i) 94 | for j in scores[0].tolist(): 95 | predicted_scores.append(j) 96 | 97 | # Calculate the Metrics 98 | eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 99 | eval_r2 = r2_score(true_labels, predicted_scores) 100 | eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores) 101 | eval_loss = eval_loss / len(val_loader) 102 | cur_value = eval_rmse 103 | logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}" 104 | .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2)) 105 | writer.add_scalar('validation loss', eval_loss, epoch) 106 | writer.add_scalar('validation PCC', eval_pcc, epoch) 107 | writer.add_scalar('validation DOA', eval_doa, epoch) 108 | writer.add_scalar('validation RMSE', eval_rmse, epoch) 109 | writer.add_scalar('validation R2', eval_r2, epoch) 110 | return cur_value 111 | 112 | for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): 113 | # Training step 114 | batches = trange(len(train_loader), desc="Batches", leave=True) 115 | for batch_cnt, batch in zip(batches, train_loader): 116 | net.train() 117 | x_train_fb_content, x_train_fb_question, x_train_fb_option, \ 118 | x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch 119 | 120 | optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 121 | logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option) 122 | avg_batch_loss = criterion(scores, y_train_fb) 123 | avg_batch_loss.backward() 124 | optimizer.step() # Parameter updating 125 | batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item())) 126 | logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item())) 127 | writer.add_scalar('training loss', avg_batch_loss, batch_cnt) 128 | # Evaluation step 129 | cur_value = eval_model(val_loader, epoch) 130 | saver.handle(cur_value, net, optimizer, epoch) 131 | writer.close() 132 | 133 | logger.info('Training Finished.') 134 | 135 | 136 | if __name__ == "__main__": 137 | train() 138 | 139 | -------------------------------------------------------------------------------- /PyTorch/TARNN/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | """TARNN layers.""" 5 | 6 | import torch 7 | import numpy as np 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | 13 | class BiRNNLayer(nn.Module): 14 | def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob): 15 | super(BiRNNLayer, self).__init__() 16 | if rnn_type == 'LSTM': 17 | self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers, 18 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 19 | if rnn_type == 'GRU': 20 | self.bi_rnn = nn.GRU(input_size=input_units, 
hidden_size=rnn_hidden_size, num_layers=rnn_layers, 21 | batch_first=True, bidirectional=True, dropout=dropout_keep_prob) 22 | 23 | def forward(self, input_x): 24 | rnn_out, _ = self.bi_rnn(input_x) 25 | rnn_avg = torch.mean(rnn_out, dim=1) 26 | return rnn_out, rnn_avg 27 | 28 | 29 | class AttentionLayer(nn.Module): 30 | def __init__(self, num_units, att_unit_size, att_type): 31 | super(AttentionLayer, self).__init__() 32 | self.att_type = att_type 33 | 34 | def forward(self, input_x, input_y): 35 | if self.att_type == 'normal': 36 | attention_matrix = torch.matmul(input_y, input_x.transpose(1, 2)) 37 | attention_weight = torch.softmax(attention_matrix, dim=2) 38 | attention_visual = torch.mean(attention_matrix, dim=1) 39 | attention_out = torch.matmul(attention_weight, input_x) 40 | # TODO 41 | attention_out = torch.mean(attention_out, dim=1) 42 | if self.att_type == 'cosine': 43 | cos_matrix = [] 44 | seq_len = list(input_y.size())[-2] 45 | normalized_x = F.normalize(input_x, p=2, dim=2) 46 | for t in range(seq_len): 47 | new_input_y = torch.unsqueeze(input_y[:, t, :], dim=1) 48 | normalized_y = F.normalize(new_input_y, p=2, dim=2) 49 | # cos_similarity: [batch_size, seq_len_1] 50 | cos_similarity = torch.sum(torch.mul(normalized_y, normalized_x), dim=2) 51 | cos_matrix.append(cos_similarity) 52 | # attention_matrix: [batch_size, seq_len_2, seq_len_1] 53 | attention_matrix = torch.stack(cos_matrix, dim=1) 54 | attention_visual = torch.mean(attention_matrix, dim=1) 55 | attention_out = torch.mul(torch.unsqueeze(attention_visual, dim=-1), input_x) 56 | attention_out = torch.mean(attention_out, dim=1) 57 | if self.att_type == 'mlp': 58 | alpha_matrix = [] 59 | seq_len = list(input_y.size())[-2] 60 | for t in range(seq_len): 61 | u_t = torch.matmul(torch.unsqueeze(input_y[:, t, :], dim=1), input_x.transpose(1, 2)) 62 | # u_t: [batch_size, 1, seq_len_1] 63 | u_t = torch.tanh(u_t) 64 | alpha_matrix.append(u_t) 65 | attention_matrix = torch.cat(alpha_matrix, dim=1) 66 | attention_matrix = torch.squeeze(attention_matrix, dim=2) 67 | attention_weight = F.softmax(attention_matrix, dim=1) 68 | attention_visual = torch.mean(attention_weight, dim=1) 69 | attention_out = torch.mul(torch.unsqueeze(attention_visual, dim=-1), input_x) 70 | attention_out = torch.mean(attention_out, dim=1) 71 | return attention_visual, attention_out 72 | 73 | 74 | class HighwayLayer(nn.Module): 75 | def __init__(self, in_units, out_units): 76 | super(HighwayLayer, self).__init__() 77 | self.highway_linear = nn.Linear(in_features=in_units, out_features=out_units, bias=True) 78 | self.highway_gate = nn.Linear(in_features=in_units, out_features=out_units, bias=True) 79 | 80 | def forward(self, input_x): 81 | highway_g = torch.relu(self.highway_linear(input_x)) 82 | highway_t = torch.sigmoid(self.highway_gate(input_x)) 83 | highway_out = torch.mul(highway_g, highway_t) + torch.mul((1 - highway_t), input_x) 84 | return highway_out 85 | 86 | 87 | class TARNN(nn.Module): 88 | """An implementation of TARNN""" 89 | def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None): 90 | super(TARNN, self).__init__() 91 | """ 92 | :param args: Arguments object. 93 | """ 94 | self.args = args 95 | self.vocab_size = vocab_size 96 | self.embedding_size = embedding_size 97 | self.pretrained_embedding = pretrained_embedding 98 | self._setup_layers() 99 | 100 | def _setup_embedding_layer(self): 101 | """ 102 | Creating Embedding layers. 
103 | """ 104 | if self.pretrained_embedding is None: 105 | embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size))) 106 | embedding_weight = Variable(embedding_weight, requires_grad=True) 107 | else: 108 | if self.args.embedding_type == 0: 109 | embedding_weight = torch.from_numpy(self.pretrained_embedding).float() 110 | if self.args.embedding_type == 1: 111 | embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True) 112 | self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight) 113 | 114 | def _setup_bi_rnn_layer(self): 115 | """ 116 | Creating Bi-RNN Layer. 117 | """ 118 | self.bi_rnn_content = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type, 119 | rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim, 120 | dropout_keep_prob=self.args.dropout_rate) 121 | self.bi_rnn_question = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type, 122 | rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim, 123 | dropout_keep_prob=self.args.dropout_rate) 124 | self.bi_rnn_option = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type, 125 | rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim, 126 | dropout_keep_prob=self.args.dropout_rate) 127 | 128 | def _setup_attention(self): 129 | """ 130 | Creating Attention Layer. 131 | """ 132 | self.att_cq = AttentionLayer(num_units=self.args.attention_dim, 133 | att_unit_size=self.args.attention_dim, 134 | att_type=self.args.attention_type) 135 | self.att_oq = AttentionLayer(num_units=self.args.attention_dim, 136 | att_unit_size=self.args.attention_dim, 137 | att_type=self.args.attention_type) 138 | 139 | def _setup_highway_layer(self): 140 | """ 141 | Creating Highway Layer. 142 | """ 143 | self.highway = HighwayLayer(in_units=self.args.fc_dim, out_units=self.args.fc_dim) 144 | 145 | def _setup_fc_layer(self): 146 | """ 147 | Creating FC Layer. 148 | """ 149 | self.fc = nn.Linear(in_features=self.args.rnn_dim * 2 * 3, out_features=self.args.fc_dim, bias=True) 150 | self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True) 151 | 152 | def _setup_dropout(self): 153 | """ 154 | Adding Dropout. 155 | """ 156 | self.dropout = nn.Dropout(self.args.dropout_rate) 157 | 158 | def _setup_layers(self): 159 | """ 160 | Creating layers of model. 161 | 1. Embedding Layer. 162 | 2. Bi-RNN Layer. 163 | 3. Attention Layer. 164 | 4. Highway Layer. 165 | 5. FC Layer. 166 | 6. 
Dropout 167 | """ 168 | self._setup_embedding_layer() 169 | self._setup_bi_rnn_layer() 170 | self._setup_attention() 171 | self._setup_highway_layer() 172 | self._setup_fc_layer() 173 | self._setup_dropout() 174 | 175 | def _sub_network(self, x_content, x_question, x_option): 176 | embedded_sentence_content = self.embedding(x_content) 177 | embedded_sentence_question = self.embedding(x_question) 178 | embedded_sentence_option = self.embedding(x_option) 179 | 180 | # Average Vectors 181 | # [batch_size, embedding_size] 182 | embedded_content_average = torch.mean(embedded_sentence_content, dim=1) 183 | embedded_question_average = torch.mean(embedded_sentence_question, dim=1) 184 | embedded_option_average = torch.mean(embedded_sentence_option, dim=1) 185 | 186 | # Bi-RNN Layer 187 | rnn_out_content, rnn_avg_content = self.bi_rnn_content(embedded_sentence_content) 188 | rnn_out_question, rnn_avg_question = self.bi_rnn_question(embedded_sentence_question) 189 | rnn_out_option, rnn_avg_option = self.bi_rnn_option(embedded_sentence_option) 190 | 191 | # Attention Layer 192 | attention_cq_visual, attention_cq = self.att_cq(rnn_out_content, rnn_out_question) 193 | attention_oq_visual, attention_oq = self.att_oq(rnn_out_option, rnn_out_question) 194 | 195 | # Concat 196 | # shape of att_out: [batch_size, rnn_hidden_size * 2 * 3] 197 | att_out = torch.cat((attention_cq, rnn_avg_question, attention_oq), dim=1) 198 | 199 | # Fully Connected Layer 200 | fc_out = self.fc(att_out) 201 | 202 | # Highway Layer 203 | highway_out = self.highway(fc_out) 204 | 205 | # Dropout 206 | h_drop = self.dropout(highway_out) 207 | 208 | logits = self.out(h_drop).squeeze() 209 | scores = torch.sigmoid(logits) 210 | 211 | return logits, scores 212 | 213 | def forward(self, x_fb_content, x_fb_question, x_fb_option): 214 | """ 215 | Forward propagation pass. 216 | :param x_fb_content: Front & Behind Content tensors with features. 217 | :param x_fb_question: Front & Behind Question tensors with features. 218 | :param x_fb_option: Front & Behind Option tensors with features. 219 | :return logits: The predicted logistic values. 220 | :return scores: The predicted scores. 
221 | """ 222 | f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0]) 223 | b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1]) 224 | 225 | logits = (f_logits, b_logits) 226 | scores = (f_scores, b_scores) 227 | return logits, scores 228 | 229 | 230 | class Loss(nn.Module): 231 | def __init__(self): 232 | super(Loss, self).__init__() 233 | self.MSELoss = nn.MSELoss(reduce=True, size_average=True) 234 | 235 | def forward(self, predict_y, input_y): 236 | # Loss 237 | f_loss = self.MSELoss(predict_y[0], input_y[0]) 238 | b_loss = self.MSELoss(predict_y[1], input_y[1]) 239 | 240 | losses = f_loss + b_loss 241 | # value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1]) 242 | # losses = torch.mean(torch.pow(value, 2)) 243 | return losses 244 | -------------------------------------------------------------------------------- /PyTorch/TARNN/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | 9 | sys.path.append('../') 10 | 11 | from layers import TARNN, Loss 12 | from utils import checkmate as cm 13 | from utils import data_helpers as dh 14 | from utils import param_parser as parser 15 | from tqdm import trange 16 | from torch.utils.data import TensorDataset, DataLoader 17 | from sklearn.metrics import mean_squared_error, r2_score 18 | 19 | args = parser.parameter_parser() 20 | MODEL = dh.get_model_name() 21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime())) 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | 24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL)) 25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL)) 26 | 27 | 28 | def test(): 29 | logger.info("Loading Data...") 30 | logger.info("Data processing...") 31 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file) 32 | logger.info("Data padding...") 33 | test_dataset = dh.MyData(test_data, args.pad_seq_len, device) 34 | test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False) 35 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 36 | 37 | criterion = Loss() 38 | net = TARNN(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 39 | checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False) 40 | checkpoint = torch.load(checkpoint_file) 41 | net.load_state_dict(checkpoint['model_state_dict']) 42 | net.eval() 43 | 44 | logger.info("Scoring...") 45 | true_labels, predicted_scores = [], [] 46 | batches = trange(len(test_loader), desc="Batches", leave=True) 47 | for batch_cnt, batch in zip(batches, test_loader): 48 | x_test_fb_content, x_test_fb_question, x_test_fb_option, \ 49 | x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch 50 | logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option) 51 | for i in y_test_fb[0].tolist(): 52 | true_labels.append(i) 53 | for j in scores[0].tolist(): 54 | predicted_scores.append(j) 55 | 56 | # Calculate the Metrics 57 | test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 58 | test_r2 = r2_score(true_labels, predicted_scores) 59 | test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores) 60 | logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}" 
61 | .format(test_pcc, test_doa, test_rmse, test_r2)) 62 | logger.info('Test Finished.') 63 | 64 | logger.info('Creating the prediction file...') 65 | dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores) 66 | 67 | logger.info('All Finished.') 68 | 69 | 70 | if __name__ == "__main__": 71 | test() 72 | 73 | -------------------------------------------------------------------------------- /PyTorch/TARNN/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | 10 | sys.path.append('../') 11 | 12 | from layers import TARNN, Loss 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from tqdm import tqdm, trange 17 | from torch.utils.tensorboard import SummaryWriter 18 | from torch.utils.data import TensorDataset, DataLoader 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | 22 | args = parser.parameter_parser() 23 | OPTION = dh.option() 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | 28 | def train(): 29 | """Training TARNN model.""" 30 | dh.tab_printer(args, logger) 31 | 32 | # Load sentences, labels, and training parameters 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file) 36 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file) 37 | 38 | logger.info("Data padding...") 39 | train_dataset = dh.MyData(train_data, args.pad_seq_len, device) 40 | val_dataset = dh.MyData(val_data, args.pad_seq_len, device) 41 | 42 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) 43 | val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) 44 | 45 | # Load word2vec model 46 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 47 | 48 | # Init network 49 | logger.info("Init nn...") 50 | net = TARNN(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device) 51 | 52 | print("Model's state_dict:") 53 | for param_tensor in net.state_dict(): 54 | print(param_tensor, "\t", net.state_dict()[param_tensor].size()) 55 | 56 | criterion = Loss() 57 | optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda) 58 | 59 | if OPTION == 'T': 60 | timestamp = str(int(time.time())) 61 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 62 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 63 | logger.info("Writing to {0}\n".format(out_dir)) 64 | elif OPTION == 'R': 65 | timestamp = input("[Input] Please input the checkpoints model you want to restore: ") 66 | while not (timestamp.isdigit() and len(timestamp) == 10): 67 | timestamp = input("[Warning] The format of your input is illegal, please re-input: ") 68 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 69 | saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False) 70 | logger.info("Writing to {0}\n".format(out_dir)) 71 | checkpoint = torch.load(out_dir) 72 | 
net.load_state_dict(checkpoint['model_state_dict']) 73 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 74 | 75 | logger.info("Training...") 76 | writer = SummaryWriter('summary') 77 | 78 | def eval_model(val_loader, epoch): 79 | """ 80 | Evaluate on the validation set. 81 | """ 82 | net.eval() 83 | eval_loss = 0.0 84 | true_labels, predicted_scores = [], [] 85 | for batch in val_loader: 86 | x_val_fb_content, x_val_fb_question, x_val_fb_option, \ 87 | x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch 88 | 89 | logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option) 90 | avg_batch_loss = criterion(scores, y_val_fb) 91 | eval_loss = eval_loss + avg_batch_loss.item() 92 | for i in y_val_fb[0].tolist(): 93 | true_labels.append(i) 94 | for j in scores[0].tolist(): 95 | predicted_scores.append(j) 96 | 97 | # Calculate the Metrics 98 | eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 99 | eval_r2 = r2_score(true_labels, predicted_scores) 100 | eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores) 101 | eval_loss = eval_loss / len(val_loader) 102 | cur_value = eval_rmse 103 | logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}" 104 | .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2)) 105 | writer.add_scalar('validation loss', eval_loss, epoch) 106 | writer.add_scalar('validation PCC', eval_pcc, epoch) 107 | writer.add_scalar('validation DOA', eval_doa, epoch) 108 | writer.add_scalar('validation RMSE', eval_rmse, epoch) 109 | writer.add_scalar('validation R2', eval_r2, epoch) 110 | return cur_value 111 | 112 | for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True): 113 | # Training step 114 | batches = trange(len(train_loader), desc="Batches", leave=True) 115 | for batch_cnt, batch in zip(batches, train_loader): 116 | net.train() 117 | x_train_fb_content, x_train_fb_question, x_train_fb_option, \ 118 | x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch 119 | 120 | optimizer.zero_grad() # 如果不置零,Variable 的梯度在每次 backward 的时候都会累加 121 | logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option) 122 | avg_batch_loss = criterion(scores, y_train_fb) 123 | avg_batch_loss.backward() 124 | optimizer.step() # Parameter updating 125 | batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item())) 126 | logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item())) 127 | writer.add_scalar('training loss', avg_batch_loss, batch_cnt) 128 | # Evaluation step 129 | cur_value = eval_model(val_loader, epoch) 130 | saver.handle(cur_value, net, optimizer, epoch) 131 | writer.close() 132 | 133 | logger.info('Training Finished.') 134 | 135 | 136 | if __name__ == "__main__": 137 | train() -------------------------------------------------------------------------------- /PyTorch/utils/checkmate.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import glob 6 | import json 7 | import torch 8 | import numpy as np 9 | 10 | 11 | class BestCheckpointSaver(object): 12 | """Maintains a directory containing only the best n checkpoints 13 | Inside the directory is a best_checkpoints JSON file containing a dictionary 14 | mapping of the best checkpoint filepaths to the values by which the checkpoints 15 | are compared. 
Only the best n checkpoints are contained in the directory and JSON file. 16 | This is a light-weight wrapper class only intended to work in simple, 17 | non-distributed settings. 18 | """ 19 | def __init__(self, save_dir, num_to_keep=1, maximize=True): 20 | """Creates a `BestCheckpointSaver` 21 | `BestCheckpointSaver` acts as a wrapper class around a `tf.train.Saver` 22 | Args: 23 | save_dir: The directory in which the checkpoint files will be saved 24 | num_to_keep: The number of best checkpoint files to retain 25 | maximize: Define 'best' values to be the highest values. For example, 26 | set this to True if selecting for the checkpoints with the highest 27 | given accuracy. Or set to False to select for checkpoints with the 28 | lowest given error rate. 29 | """ 30 | self._num_to_keep = num_to_keep 31 | self._save_dir = save_dir 32 | self._maximize = maximize 33 | 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | self.best_checkpoints_file = os.path.join(save_dir, 'best_checkpoints') 37 | 38 | def handle(self, value, model, optimizer, global_epoch): 39 | """ 40 | Updates the set of best checkpoints based on the given result. 41 | Args: 42 | value: The value by which to rank the checkpoint. 43 | model: The model 44 | optimizer: The optimizer 45 | global_epoch: The global epoch 46 | """ 47 | state = {'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()} 48 | 49 | current_ckpt = 'epoch-{0}'.format(global_epoch) 50 | filename = os.path.abspath(os.path.join(self._save_dir, current_ckpt)) 51 | value = float(value) 52 | if not os.path.exists(self.best_checkpoints_file): 53 | self._save_best_checkpoints_file({current_ckpt: value}) 54 | torch.save(state, filename) 55 | return 56 | 57 | best_checkpoints = self._load_best_checkpoints_file() 58 | 59 | if len(best_checkpoints) < self._num_to_keep: 60 | best_checkpoints[current_ckpt] = value 61 | self._save_best_checkpoints_file(best_checkpoints) 62 | torch.save(state, filename) 63 | return 64 | 65 | if self._maximize: 66 | should_save = not all(current_best >= value for current_best in best_checkpoints.values()) 67 | else: 68 | should_save = not all(current_best <= value for current_best in best_checkpoints.values()) 69 | if should_save: 70 | best_checkpoint_list = self._sort(best_checkpoints) 71 | 72 | worst_checkpoint = os.path.join(self._save_dir, best_checkpoint_list.pop(-1)[0]) 73 | self._remove_outdated_checkpoint_files(worst_checkpoint) 74 | 75 | best_checkpoints = dict(best_checkpoint_list) 76 | best_checkpoints[current_ckpt] = value 77 | self._save_best_checkpoints_file(best_checkpoints) 78 | torch.save(state, filename) 79 | 80 | def _save_best_checkpoints_file(self, updated_best_checkpoints): 81 | with open(self.best_checkpoints_file, 'w') as f: 82 | json.dump(updated_best_checkpoints, f, indent=3) 83 | 84 | def _remove_outdated_checkpoint_files(self, worst_checkpoint): 85 | os.remove(worst_checkpoint) 86 | 87 | def _load_best_checkpoints_file(self): 88 | with open(self.best_checkpoints_file, 'r') as f: 89 | best_checkpoints = json.load(f) 90 | return best_checkpoints 91 | 92 | def _sort(self, best_checkpoints): 93 | best_checkpoints = [ 94 | (ckpt, best_checkpoints[ckpt]) 95 | for ckpt in sorted(best_checkpoints, 96 | key=best_checkpoints.get, 97 | reverse=self._maximize) 98 | ] 99 | return best_checkpoints 100 | 101 | 102 | def get_best_checkpoint(best_checkpoint_dir, select_maximum_value=True): 103 | """ 104 | Returns filepath to the best checkpoint 105 | Reads the best_checkpoints 
file in the best_checkpoint_dir directory. 106 | Returns the filepath in the best_checkpoints file associated with 107 | the highest value if select_maximum_value is True, or the filepath 108 | associated with the lowest value if select_maximum_value is False. 109 | Args: 110 | best_checkpoint_dir: Directory containing best_checkpoints JSON file 111 | select_maximum_value: If True, select the filepath associated 112 | with the highest value. Otherwise, select the filepath associated 113 | with the lowest value. 114 | Returns: 115 | The full path to the best checkpoint file 116 | """ 117 | best_checkpoints_file = os.path.join(best_checkpoint_dir, 'best_checkpoints') 118 | assert os.path.exists(best_checkpoints_file) 119 | with open(best_checkpoints_file, 'r') as f: 120 | best_checkpoints = json.load(f) 121 | best_checkpoints = [ 122 | ckpt for ckpt in sorted(best_checkpoints, 123 | key=best_checkpoints.get, 124 | reverse=select_maximum_value) 125 | ] 126 | return os.path.join(os.path.abspath(best_checkpoint_dir), best_checkpoints[0]) -------------------------------------------------------------------------------- /PyTorch/utils/param_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parameter_parser(): 5 | """ 6 | A method to parse up command line parameters. 7 | The default hyperparameters give good results without cross-validation. 8 | """ 9 | parser = argparse.ArgumentParser(description="Run Model.") 10 | 11 | # Data Parameters 12 | parser.add_argument("--train-file", nargs="?", default="../../data/Train_pairwise_sample.json", help="Training data.") 13 | parser.add_argument("--validation-file", nargs="?", default="../../data/Validation_pairwise_sample.json", help="Validation data.") 14 | parser.add_argument("--test-file", nargs="?", default="../../data/Test_pairwise_sample.json", help="Testing data.") 15 | parser.add_argument("--metadata-file", nargs="?", default="../../data/metadata.tsv", 16 | help="Metadata file for embedding visualization.") 17 | parser.add_argument("--word2vec-file", nargs="?", default="../../data/word2vec_300.txt", 18 | help="Word2vec file for embedding characters.") 19 | 20 | # Model Hyperparameters 21 | parser.add_argument("--pad-seq-len", type=list, default=[350, 15, 10], help="Padding Sequence length. (depends on the data)") 22 | parser.add_argument("--embedding-type", type=int, default=1, help="The embedding type.") 23 | parser.add_argument("--embedding-dim", type=int, default=300, help="Dimensionality of character embedding.") 24 | parser.add_argument("--attention-type", nargs="?", default="mlp", help="The attention type. ('normal', 'cosine', 'mlp')") 25 | parser.add_argument("--attention-dim", type=int, default=200, help="Dimensionality of Attention Neurons.") 26 | parser.add_argument("--filter-sizes", type=list, default=[3, 5, 7], help="Filter sizes.") 27 | parser.add_argument("--conv-padding-sizes", type=list, default=[1, 2, 3], help="Padding sizes for Conv Layer.") 28 | parser.add_argument("--dilation-sizes", type=list, default=[1, 2, 3], help="Dilation sizes for Conv Layer.") 29 | parser.add_argument("--num-filters", type=list, default=[256, 256, 256], help="Number of filters per filter size.") 30 | parser.add_argument("--pooling-size", type=int, default=3, help="Pooling sizes. (default: 3)") 31 | parser.add_argument("--rnn-dim", type=int, default=128, help="Dimensionality for RNN Neurons.") 32 | parser.add_argument("--rnn-type", nargs="?", default="GRU", help="Type of RNN Cell. 
('RNN', 'LSTM', 'GRU')")
33 |     parser.add_argument("--rnn-layers", type=int, default=1, help="Number of RNN Layers.")
34 |     parser.add_argument("--skip-size", type=int, default=3, help="Skip window of Skip-RNN Layers.")
35 |     parser.add_argument("--skip-dim", type=int, default=5, help="Dimensionality for Skip-RNN Layers.")
36 |     parser.add_argument("--fc-dim", type=int, default=512, help="Dimensionality for FC Neurons.")
37 |     parser.add_argument("--dropout-rate", type=float, default=0.5, help="Dropout keep probability.")
38 | 
39 |     # Training Parameters
40 |     parser.add_argument("--epochs", type=int, default=30, help="Number of training epochs.")
41 |     parser.add_argument("--batch-size", type=int, default=32, help="Batch Size.")
42 |     parser.add_argument("--learning-rate", type=float, default=0.001, help="Learning rate.")
43 |     parser.add_argument("--decay-rate", type=float, default=0.95, help="Rate of decay for learning rate.")
44 |     parser.add_argument("--decay-steps", type=int, default=500, help="How many steps before decay learning rate.")
45 |     parser.add_argument("--norm-ratio", type=float, default=1.25,
46 |                         help="The ratio of the sum of gradients norms of trainable variable.")
47 |     parser.add_argument("--l2-lambda", type=float, default=0.0, help="L2 regularization lambda.")
48 |     parser.add_argument("--num-checkpoints", type=int, default=3, help="Number of checkpoints to store.")
49 | 
50 |     return parser.parse_args()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Learning for Question Difficulty Prediction
2 | [![Python Version](https://img.shields.io/badge/language-python3.6-blue.svg)](https://www.python.org/downloads/) [![Build Status](https://travis-ci.org/RandolphVI/Question-Difficulty-Prediction.svg?branch=master)](https://travis-ci.org/RandolphVI/Question-Difficulty-Prediction) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/c45aac301b244316830b00b9b0985e3e)](https://www.codacy.com/app/chinawolfman/Question-Difficulty-Prediction?utm_source=github.com&utm_medium=referral&utm_content=RandolphVI/Question-Difficulty-Prediction&utm_campaign=Badge_Grade) [![License](https://img.shields.io/github/license/RandolphVI/Question-Difficulty-Prediction.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Issues](https://img.shields.io/github/issues/RandolphVI/Question-Difficulty-Prediction.svg)](https://github.com/RandolphVI/Question-Difficulty-Prediction/issues)
3 | 
4 | This repository contains my implementations for the question difficulty prediction task.
5 | 
6 | The main objective of the project is to predict the difficulty of each given question based on its context materials, which include several components (such as the document, question and options in English READING problems).
7 | 
8 | ## Requirements
9 | 
10 | - Python 3.6
11 | - PyTorch 1.6.0
12 | - Tensorflow 1.15.0
13 | - Tensorboard 1.15.0
14 | - XGBoost 1.2.0
15 | - Sklearn 0.19.1
16 | - Numpy 1.16.2
17 | - Gensim 3.8.3
18 | - Tqdm 4.49.0
19 | 
20 | 
21 | ## Introduction
22 | 
23 | In widely used standardized tests, such as **TOEFL** or **SAT**, examinees are often allowed to retake tests and choose higher scores for college admission. This rule brings an important requirement: test papers with consistent difficulties should be selected to guarantee fairness. Therefore, measurements on tests have attracted much attention.
24 | 
25 | Among the measurements, one of the most crucial demands is predicting the difficulty of each specific test question, i.e., the percentage of examinees who answer the question wrong. Unfortunately, the
26 | question difficulty is not directly observable before the test is conducted, and traditional methods often resort to expertise, such as manual labeling or artificial test organization. Obviously, these human-based solutions are limited in that they are subjective and labor-intensive, and the results could also be biased or misleading (we will illustrate this discovery experimentally).
27 | 
28 | Therefore, it is an urgent issue to automatically predict question difficulty without manual intervention. Fortunately, with abundant tests recorded by automatic test paper marking systems, test logs of examinees and text materials of questions have become more and more available as auxiliary information, which benefits a data-driven solution to this Question Difficulty Prediction (QDP) task, especially for the typical English READING problems. For example, an English READING problem contains a document material and several corresponding questions, and each question contains its corresponding options.
29 | 
30 | ## Project
31 | 
32 | The project structure is below:
33 | 
34 | ```text
35 | .
36 | ├── TMLA (Traditional Machine Learning Algorithms)
37 | │   ├── DTR / LR / SVM / XGBoost
38 | │   └── utils
39 | ├── TF (TensorFlow)
40 | │   ├── C-MIDP / R-MIDP / H-MIDP
41 | │   ├── TARNN
42 | │   └── utils
43 | ├── PyTorch
44 | │   ├── C-MIDP / R-MIDP / H-MIDP
45 | │   ├── TARNN
46 | │   │   ├── layers.py
47 | │   │   ├── test.py
48 | │   │   └── train.py
49 | │   └── utils
50 | │       ├── param_parser.py
51 | │       └── data_helpers.py
52 | ├── data
53 | │   ├── word2vec_300.txt [Need Download]
54 | │   ├── Train / Validation / Test_sample.json
55 | │   ├── Train / Validation / Test_BOW_sample.json
56 | │   └── Train / Validation / Test_pairwise_sample.json
57 | ├── LICENSE
58 | ├── README.md
59 | └── requirements.txt
60 | ```
61 | 
62 | ## Data
63 | 
64 | See the data format in the `/data` folder, which includes the data sample files.
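If each sample file stores one JSON record per line (as the sample record below suggests), a record can be inspected with a few lines of Python. This is only a minimal sketch: it assumes the JSON-lines layout of the shipped samples and the `data/Train_sample.json` path, and the real preprocessing lives in `data_helpers.py`:

```python
import json

# Minimal sketch: read the shipped training sample and print a few fields per record.
with open("data/Train_sample.json", "r", encoding="utf-8") as fin:
    for line in fin:
        record = json.loads(line)
        print(record["id"], record["diff"], len(record["content"]))
```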
For example, `train_sample.json` is like: 65 | 66 | ```json 67 | {"id": "6", "content": ["year", "ruined", "summer", "vacation-a", "two-week", "vacation", "wife", "family", "cabin", "lake", "northern", "ontario", "located", "boundary", "canada-by", "bringing", "modern", "convenience", "wa", "convenient", "good", "ipad", "admiring", "beauty", "nature", "checked", "e-mail", "paddling", "canoe", "twitter", "feed", "devouring", "great", "amusing", "stuck", "workday", "diet", "newspaper", "morning", "wa", "problem", "wa", "behaving", "office", "sticking", "unending", "news", "cycle", "body", "wa", "vacation", "head", "wasnt", "year", "made", "mind", "social", "medium", "experiment", "reverse", "withdrawal", "internet", "manage", "unplug", "knew", "wouldnt", "easy", "im", "good", "self-denial", "wa", "determined", "started", "physical", "restraint", "handing", "ipad", "wife", "helpfully", "announced", "wa", "read", "book", "club", "inclined", "relinquish", "tablet", "moment", "stroke", "luck", "cell", "phone", "signal", "canadian", "cabin", "wa", "spottier", "past", "making", "attempt", "cheating", "experience", "frustration", "wa", "trapped", "forced", "comply", "good", "intention", "largely", "cut", "e-mail", "twitter", "favorite", "newspaper", "website", "connect", "world", "radio-and", "radio", "listen", "choice", "planned", "read", "book", "experienced", "criminal", "plot", "street", "los", "angeles", "cutthroat", "battle", "cancer", "lab", "psyche", "london", "social", "butterfly", "magazine", "read", "im", "claiming", "cut", "internet", "completely", "day", "biked", "nearest", "town", "reward", "sat", "park", "bench", "front", "public", "library", "wi-fi", "back", "cabin", "suffered", "slow", "dial-up", "connection", "day", "check", "e-mail", "tale", "self-denial", "ha", "happy", "ending-for", "determination", "deep", "breathing", "strong", "support", "wife", "succeeded", "vacation", "struggle", "internet", "realizing", "finally", "wa", "ipad", "wa", "problem", "knew", "passed", "starbucks", "wife", "asked", "wanted", "stop", "wi-fi", "dont", "sound", "pleased", "return", "post-vacation", "situation", "test", "begin", "stay", "wagon", "im", "back", "work", "time", "compulsion", "whats", "overwhelming", "crucial", "livelihood", "intention", "giving", "membership", "cult", "immediacy", "hope", "resist", "temptation", "reflexively", "check", "e-mail", "minute", "lead", "long", "im", "checking", "twitter", "feed", "website", "vacation", "supposed", "reset", "brain", "productive", "hoping", "worked"], "question": ["doe", "underlined", "word", "restraint"], "pos_text": ["calm", "controlled", "behavior"], "neg_text": ["relaxing", "move", "strong", "determination", "unshakable", "faith"], "diff": 0.550373134328} 68 | ``` 69 | 70 | - **"id"**: just the id. 71 | - **"content"**: the word segment of the content. 72 | - **"question"**: The word segment of the question. 73 | - **"pos_text"**: The word segment of the correct option. 74 | - **"neg_text"**: The word segment of the wrong options. 75 | - **"diff"**: The difficulty of the question. 76 | 77 | ### Text Segment 78 | 79 | 1. You can use `nltk` package if you are going to deal with the English text data. 80 | 81 | 2. You can use `jieba` package if you are going to deal with the Chinese text data. 82 | 83 | ### Data Format 84 | 85 | This repository can be used in other similiar datasets in two ways: 86 | 87 | 1. 
Modify your datasets into the same format as [the sample](https://github.com/RandolphVI/Question-Difficulty-Prediction/tree/master/data).
88 | 2. Modify the data preprocessing code in `data_helpers.py`.
89 | 
90 | 
91 | Either way, the right choice depends on what your data and task are.
92 | 
93 | ### Pre-trained Word Vectors
94 | 
95 | **You can download the [Word2vec model file](https://drive.google.com/open?id=1QQhm6vKdZmEHaVYvuFbA5Yj6RoVlOhzh) (dim=300). Make sure it is unzipped and placed under the `/data` folder.**
96 | 
97 | You can pre-train your word vectors (based on your own corpus) in many ways:
98 | - Use the `gensim` package to pre-train word vectors.
99 | - Use the `glove` tools to pre-train word vectors.
100 | - You can even use a **fasttext** network to pre-train word vectors.
101 | 
102 | ## Usage
103 | 
104 | See [Usage-TF](https://github.com/RandolphVI/Question-Difficulty-Prediction/blob/master/Usage-TF.md) & [Usage-PyTorch](https://github.com/RandolphVI/Question-Difficulty-Prediction/blob/master/Usage-PyTorch.md).
105 | 
106 | ## Network Structure
107 | 
108 | Specifically, given the abundant historical test logs and the text materials of questions (including documents, questions and options), we first design an LSTM-based architecture to extract sentence representations for the text materials. Then, we utilize an attention strategy to qualify the difficulty contribution of 1) each word in the document to the question, and 2) each word in the option to the question.
109 | 
110 | Considering the incomparability of question difficulties in different tests, we propose a test-dependent pairwise strategy for training TARNN and generating the difficulty prediction value.
111 | 
112 | ![](https://farm8.staticflickr.com/7846/33643949658_9599454fdf_o.png)
113 | 
114 | The framework of TARNN:
115 | 
116 | 1. The **Input Layer** comprises the document representation (TD), question representation (TQ) and option representation (TO).
117 | 2. The **Bi-LSTM Layer** learns the deep comparable semantic representations for the text materials.
118 | 3. The **Attention Layer** extracts words of the document (or the option) with high scores as dominant information for a specific question, which is helpful for visualizing the model and improving the performance.
119 | 4. Finally, the **Prediction Layer** shows the predicted difficulty scores of the given READING problem.
120 | 
121 | ## Reference
122 | 
123 | **If you want to follow the paper or utilize the code, please cite the following in your work:**
124 | 
125 | - **Model C-MIDP/R-MIDP/H-MIDP**
126 | 
127 | ```bibtex
128 | @article{佟威2019数据驱动的数学试题难度预测,
129 |   author = {佟威 and
130 |             汪飞 and
131 |             刘淇 and
132 |             陈恩红},
133 |   title = {数据驱动的数学试题难度预测},
134 |   journal = {计算机研究与发展},
135 |   pages = {1007--1019},
136 |   year = {2019},
137 | }
138 | ```
139 | 
140 | - **Model TARNN** (modified from TACNN)
141 | 
142 | ```bibtex
143 | @inproceedings{huang2017question,
144 |   author = {Zhenya Huang and
145 |             Qi Liu and
146 |             Enhong Chen and
147 |             Hongke Zhao and
148 |             Mingyong Gao and
149 |             Si Wei and
150 |             Yu Su and
151 |             Guoping Hu},
152 |   title = {Question Difficulty Prediction for READING Problems in Standard Tests},
153 |   booktitle = {Thirty-First AAAI Conference on Artificial Intelligence},
154 |   year = {2017},
155 | }
156 | ```
157 | 
158 | ## About Me
159 | 
160 | 黄威 (Randolph)
161 | 
162 | SCU SE Bachelor; USTC CS Ph.D.
163 | 164 | Email: chinawolfman@hotmail.com 165 | 166 | My Blog: [randolph.pro](http://randolph.pro) 167 | 168 | LinkedIn: [randolph's linkedin](https://www.linkedin.com/in/randolph-%E9%BB%84%E5%A8%81/) -------------------------------------------------------------------------------- /TF/C-MIDP/test_cmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from sklearn.metrics import mean_squared_error, r2_score 17 | 18 | args = parser.parameter_parser() 19 | MODEL = dh.get_model_name() 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime())) 21 | 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/' 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/' 24 | SAVE_DIR = 'output/' + MODEL 25 | 26 | 27 | def test_cmidp(): 28 | """Test CMIDP model.""" 29 | # Print parameters used for the model 30 | dh.tab_printer(args, logger) 31 | 32 | # Load data 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False) 36 | 37 | logger.info("Data padding...") 38 | x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len) 39 | 40 | # Load cmidp model 41 | OPTION = dh.option(pattern=1) 42 | if OPTION == 'B': 43 | logger.info("Loading best model...") 44 | checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True) 45 | else: 46 | logger.info("Loading latest model...") 47 | checkpoint_file = tf.train.latest_checkpoint(CPT_DIR) 48 | logger.info(checkpoint_file) 49 | 50 | graph = tf.Graph() 51 | with graph.as_default(): 52 | session_conf = tf.ConfigProto( 53 | allow_soft_placement=args.allow_soft_placement, 54 | log_device_placement=args.log_device_placement) 55 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and restore variables 59 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 60 | saver.restore(sess, checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | input_x_content = graph.get_operation_by_name("input_x_content").outputs[0] 64 | input_x_question = graph.get_operation_by_name("input_x_question").outputs[0] 65 | input_x_option = graph.get_operation_by_name("input_x_option").outputs[0] 66 | input_y = graph.get_operation_by_name("input_y").outputs[0] 67 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 68 | is_training = graph.get_operation_by_name("is_training").outputs[0] 69 | 70 | # Tensors we want to evaluate 71 | scores = graph.get_operation_by_name("output/scores").outputs[0] 72 | loss = graph.get_operation_by_name("loss/loss").outputs[0] 73 | 74 | # Split the output nodes name by '|' if you have several output nodes 75 | output_node_names = "output/scores" 76 | 77 | # Save the .pb model file 78 | output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, 79 | output_node_names.split("|")) 80 | tf.train.write_graph(output_graph_def, "graph", "graph-cmidp-{0}.pb".format(MODEL), 
as_text=False) 81 | 82 | # Generate batches for one epoch 83 | batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)), 84 | args.batch_size, 1, shuffle=False) 85 | 86 | test_counter, test_loss = 0, 0.0 87 | 88 | # Collect the predictions here 89 | true_labels = [] 90 | predicted_scores = [] 91 | 92 | for batch_test in batches: 93 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test) 94 | feed_dict = { 95 | input_x_content: x_batch_content, 96 | input_x_question: x_batch_question, 97 | input_x_option: x_batch_option, 98 | input_y: y_batch, 99 | dropout_keep_prob: 1.0, 100 | is_training: False 101 | } 102 | batch_scores, cur_loss = sess.run([scores, loss], feed_dict) 103 | 104 | # Prepare for calculating metrics 105 | for i in y_batch: 106 | true_labels.append(i) 107 | for j in batch_scores: 108 | predicted_scores.append(j) 109 | 110 | test_loss = test_loss + cur_loss 111 | test_counter = test_counter + 1 112 | 113 | # Calculate PCC & DOA 114 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 115 | # Calculate RMSE 116 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 117 | r2 = r2_score(true_labels, predicted_scores) 118 | 119 | test_loss = float(test_loss / test_counter) 120 | 121 | logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 122 | .format(test_loss, pcc, doa, rmse, r2)) 123 | 124 | # Save the prediction result 125 | if not os.path.exists(SAVE_DIR): 126 | os.makedirs(SAVE_DIR) 127 | dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id, 128 | all_labels=true_labels, all_predict_scores=predicted_scores) 129 | 130 | logger.info("All Done.") 131 | 132 | 133 | if __name__ == '__main__': 134 | test_cmidp() 135 | -------------------------------------------------------------------------------- /TF/C-MIDP/text_cmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | 8 | class TextCMIDP(object): 9 | """A CMIDP for text classification.""" 10 | 11 | def __init__( 12 | self, sequence_length, vocab_size, embedding_type, embedding_size, filter_sizes, 13 | num_filters, pooling_size, fc_hidden_size, l2_reg_lambda=0.0, pretrained_embedding=None): 14 | 15 | # Placeholders for input, output, dropout_prob and training_tag 16 | self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content") 17 | self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question") 18 | self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option") 19 | self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y") 20 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 21 | self.is_training = tf.placeholder(tf.bool, name="is_training") 22 | 23 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 24 | 25 | def _fc_layer(input_x, name=""): 26 | """ 27 | Fully Connected Layer. 
28 | Args: 29 | input_x: 30 | name: Scope name 31 | Returns: 32 | [batch_size, fc_hidden_size] 33 | """ 34 | with tf.name_scope(name + "fc"): 35 | num_units = input_x.get_shape().as_list()[-1] 36 | W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size], 37 | stddev=0.1, dtype=tf.float32), name="W") 38 | b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b") 39 | fc = tf.nn.xw_plus_b(input_x, W, b) 40 | fc_out = tf.nn.relu(fc) 41 | return fc_out 42 | 43 | def _convolution(input_, pool_size, layer_cnt): 44 | index = layer_cnt - 1 45 | with tf.name_scope("conv{0}".format(layer_cnt)): 46 | # Padding Zero 47 | new_input = tf.pad(input_, np.array([[0, 0], [filter_sizes[index] - 1, filter_sizes[index] - 1], 48 | [0, 0], [0, 0]]), mode="CONSTANT") 49 | width_size = new_input.get_shape().as_list()[-2] 50 | 51 | # Convolution Layer 52 | filter_shape = [filter_sizes[index], width_size, 1, num_filters[index]] 53 | W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W") 54 | b = tf.Variable(tf.constant(value=0.1, shape=[num_filters[index]], dtype=tf.float32), name="b") 55 | conv = tf.nn.conv2d( 56 | new_input, 57 | W, 58 | strides=[1, 1, 1, 1], 59 | padding="VALID", 60 | name="conv") 61 | 62 | conv = tf.nn.bias_add(conv, b) 63 | 64 | # Apply nonlinearity 65 | conv_out = tf.nn.relu(conv, name="relu") 66 | 67 | with tf.name_scope("pool{0}".format(layer_cnt)): 68 | # Maxpooling over the outputs 69 | pooled = tf.nn.max_pool( 70 | conv_out, 71 | ksize=[1, pool_size, 1, 1], 72 | strides=[1, pool_size, 1, 1], 73 | padding="VALID", 74 | name="pool") 75 | return pooled 76 | 77 | # Embedding Layer 78 | with tf.device("/cpu:0"), tf.name_scope("embedding"): 79 | # Use random generated the word vector by default 80 | # Can also be obtained through our own word vectors trained by our corpus 81 | if pretrained_embedding is None: 82 | self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0, 83 | dtype=tf.float32), trainable=True, name="embedding") 84 | else: 85 | if embedding_type == 0: 86 | self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding") 87 | if embedding_type == 1: 88 | self.embedding = tf.Variable(pretrained_embedding, trainable=True, 89 | dtype=tf.float32, name="embedding") 90 | # [batch_size, sequence_length, embedding_size] 91 | self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content) 92 | self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question) 93 | self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option) 94 | 95 | sequence_length_total = sequence_length[0] + sequence_length[1] + sequence_length[2] 96 | # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size] 97 | self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question, 98 | self.embedded_sentence_option], axis=1) 99 | self.embedded_sentence_expanded = tf.expand_dims(self.embedded_sentence_all, axis=-1) 100 | 101 | # Convolution Layer 1 102 | # conv1_out: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], 1, num_filters[0]] 103 | self.conv1_out = _convolution(self.embedded_sentence_expanded, pool_size=pooling_size, layer_cnt=1) 104 | # conv1_out_trans: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], num_filters[0], 1] 105 | self.conv1_out_trans = 
tf.transpose(self.conv1_out, perm=[0, 1, 3, 2]) 106 | 107 | # Convolution Layer 2 108 | new_pooling_size = (sequence_length_total + filter_sizes[0] - 1) // pooling_size 109 | self.conv2_out = _convolution(self.conv1_out_trans, pool_size=new_pooling_size, layer_cnt=2) 110 | 111 | self.conv_final_flat = tf.reshape(self.conv2_out, shape=[-1, num_filters[1]]) 112 | 113 | # Fully Connected Layer 114 | self.fc_out = _fc_layer(self.conv_final_flat) 115 | 116 | # Add dropout 117 | with tf.name_scope("dropout"): 118 | self.fc_drop = tf.nn.dropout(self.fc_out, self.dropout_keep_prob) 119 | 120 | # Final scores 121 | with tf.name_scope("output"): 122 | W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1], 123 | stddev=0.1, dtype=tf.float32), name="W") 124 | b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b") 125 | self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits") 126 | self.scores = tf.sigmoid(self.logits, name="scores") 127 | 128 | # Calculate mean cross-entropy loss, L2 loss 129 | with tf.name_scope("loss"): 130 | losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses") 131 | l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()], 132 | name="l2_losses") * l2_reg_lambda 133 | self.loss = tf.add(losses, l2_losses, name="loss") 134 | -------------------------------------------------------------------------------- /TF/C-MIDP/train_cmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from text_cmidp import TextCMIDP 14 | from utils import checkmate as cm 15 | from utils import data_helpers as dh 16 | from utils import param_parser as parser 17 | from tensorboard.plugins import projector 18 | from sklearn.metrics import mean_squared_error, r2_score 19 | 20 | args = parser.parameter_parser() 21 | OPTION = dh.option(pattern=0) 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 23 | 24 | 25 | def train_cmidp(): 26 | """Training CMIDP model.""" 27 | # Print parameters used for the model 28 | dh.tab_printer(args, logger) 29 | 30 | # Load sentences, labels, and training parameters 31 | logger.info("Loading data...") 32 | logger.info("Data processing...") 33 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False) 34 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False) 35 | 36 | logger.info("Data padding...") 37 | x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len) 38 | x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len) 39 | 40 | # Build vocabulary 41 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 42 | 43 | # Build a graph and cmidp object 44 | with tf.Graph().as_default(): 45 | session_conf = tf.ConfigProto( 46 | allow_soft_placement=args.allow_soft_placement, 47 | log_device_placement=args.log_device_placement) 48 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 49 | sess = tf.Session(config=session_conf) 50 | with sess.as_default(): 51 | cmidp = TextCMIDP( 52 | sequence_length=args.pad_seq_len, 53 
| vocab_size=VOCAB_SIZE, 54 | embedding_type=args.embedding_type, 55 | embedding_size=EMBEDDING_SIZE, 56 | filter_sizes=args.filter_sizes, 57 | num_filters=args.num_filters, 58 | pooling_size=args.pooling_size, 59 | fc_hidden_size=args.fc_dim, 60 | l2_reg_lambda=args.l2_lambda, 61 | pretrained_embedding=pretrained_word2vec_matrix) 62 | 63 | # Define training procedure 64 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 65 | learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate, 66 | global_step=cmidp.global_step, decay_steps=args.decay_steps, 67 | decay_rate=args.decay_rate, staircase=True) 68 | optimizer = tf.train.AdamOptimizer(learning_rate) 69 | grads, vars = zip(*optimizer.compute_gradients(cmidp.loss)) 70 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio) 71 | train_op = optimizer.apply_gradients(zip(grads, vars), global_step=cmidp.global_step, name="train_op") 72 | 73 | # Keep track of gradient values and sparsity (optional) 74 | grad_summaries = [] 75 | for g, v in zip(grads, vars): 76 | if g is not None: 77 | grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g) 78 | sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 79 | grad_summaries.append(grad_hist_summary) 80 | grad_summaries.append(sparsity_summary) 81 | grad_summaries_merged = tf.summary.merge(grad_summaries) 82 | 83 | # Output directory for models and summaries 84 | out_dir = dh.get_out_dir(OPTION, logger) 85 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 86 | best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints")) 87 | 88 | # Summaries for loss 89 | loss_summary = tf.summary.scalar("loss", cmidp.loss) 90 | 91 | # Train summaries 92 | train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) 93 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 94 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 95 | 96 | # Validation summaries 97 | validation_summary_op = tf.summary.merge([loss_summary]) 98 | validation_summary_dir = os.path.join(out_dir, "summaries", "validation") 99 | validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph) 100 | 101 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints) 102 | best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False) 103 | 104 | if OPTION == 'R': 105 | # Load cmidp model 106 | logger.info("Loading model...") 107 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 108 | logger.info(checkpoint_file) 109 | 110 | # Load the saved meta graph and restore variables 111 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 112 | saver.restore(sess, checkpoint_file) 113 | if OPTION == 'T': 114 | if not os.path.exists(checkpoint_dir): 115 | os.makedirs(checkpoint_dir) 116 | sess.run(tf.global_variables_initializer()) 117 | sess.run(tf.local_variables_initializer()) 118 | 119 | # Embedding visualization config 120 | config = projector.ProjectorConfig() 121 | embedding_conf = config.embeddings.add() 122 | embedding_conf.tensor_name = "embedding" 123 | embedding_conf.metadata_path = args.metadata_file 124 | 125 | projector.visualize_embeddings(train_summary_writer, config) 126 | projector.visualize_embeddings(validation_summary_writer, config) 127 | 128 | # Save the embedding visualization 129 | saver.save(sess, os.path.join(out_dir, 
"embedding", "embedding.ckpt")) 130 | 131 | current_step = sess.run(cmidp.global_step) 132 | 133 | def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch): 134 | """A single training step""" 135 | feed_dict = { 136 | cmidp.input_x_content: x_batch_content, 137 | cmidp.input_x_question: x_batch_question, 138 | cmidp.input_x_option: x_batch_option, 139 | cmidp.input_y: y_batch, 140 | cmidp.dropout_keep_prob: args.dropout_rate, 141 | cmidp.is_training: True 142 | } 143 | _, step, summaries, loss = sess.run( 144 | [train_op, cmidp.global_step, train_summary_op, cmidp.loss], feed_dict) 145 | logger.info("step {0}: loss {1:g}".format(step, loss)) 146 | train_summary_writer.add_summary(summaries, step) 147 | 148 | def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None): 149 | """Evaluates model on a validation set""" 150 | batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)), 151 | args.batch_size, 1) 152 | 153 | eval_counter, eval_loss = 0, 0.0 154 | 155 | true_labels = [] 156 | predicted_scores = [] 157 | 158 | for batch_validation in batches_validation: 159 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation) 160 | feed_dict = { 161 | cmidp.input_x_content: x_batch_content, 162 | cmidp.input_x_question: x_batch_question, 163 | cmidp.input_x_option: x_batch_option, 164 | cmidp.input_y: y_batch, 165 | cmidp.dropout_keep_prob: 1.0, 166 | cmidp.is_training: False 167 | } 168 | step, summaries, scores, cur_loss = sess.run( 169 | [cmidp.global_step, validation_summary_op, cmidp.scores, cmidp.loss], feed_dict) 170 | 171 | # Prepare for calculating metrics 172 | for i in y_batch: 173 | true_labels.append(i) 174 | for j in scores: 175 | predicted_scores.append(j) 176 | 177 | eval_loss = eval_loss + cur_loss 178 | eval_counter = eval_counter + 1 179 | 180 | if writer: 181 | writer.add_summary(summaries, step) 182 | 183 | eval_loss = float(eval_loss / eval_counter) 184 | 185 | # Calculate PCC & DOA 186 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 187 | # Calculate RMSE 188 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 189 | r2 = r2_score(true_labels, predicted_scores) 190 | 191 | return eval_loss, pcc, doa, rmse, r2 192 | 193 | # Generate batches 194 | batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)), 195 | args.batch_size, args.epochs) 196 | 197 | num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1 198 | 199 | # Training loop. For each batch... 
200 | for batch_train in batches_train: 201 | x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train) 202 | train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train) 203 | current_step = tf.train.global_step(sess, cmidp.global_step) 204 | 205 | if current_step % args.evaluate_steps == 0: 206 | logger.info("\nEvaluation:") 207 | eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val, 208 | writer=validation_summary_writer) 209 | logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 210 | .format(eval_loss, pcc, doa, rmse, r2)) 211 | best_saver.handle(rmse, sess, current_step) 212 | if current_step % args.checkpoint_steps == 0: 213 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 214 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 215 | logger.info("Saved model checkpoint to {0}\n".format(path)) 216 | if current_step % num_batches_per_epoch == 0: 217 | current_epoch = current_step // num_batches_per_epoch 218 | logger.info("Epoch {0} has finished!".format(current_epoch)) 219 | 220 | logger.info("All Done.") 221 | 222 | 223 | if __name__ == '__main__': 224 | train_cmidp() -------------------------------------------------------------------------------- /TF/H-MIDP/test_hmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from sklearn.metrics import mean_squared_error, r2_score 17 | 18 | args = parser.parameter_parser() 19 | MODEL = dh.get_model_name() 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime())) 21 | 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/' 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/' 24 | SAVE_DIR = 'output/' + MODEL 25 | 26 | 27 | def test_hmidp(): 28 | """Test HMIDP model.""" 29 | # Print parameters used for the model 30 | dh.tab_printer(args, logger) 31 | 32 | # Load data 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False) 36 | 37 | logger.info("Data padding...") 38 | x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len) 39 | 40 | # Load hmidp model 41 | OPTION = dh.option(pattern=1) 42 | if OPTION == 'B': 43 | logger.info("Loading best model...") 44 | checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True) 45 | else: 46 | logger.info("Loading latest model...") 47 | checkpoint_file = tf.train.latest_checkpoint(CPT_DIR) 48 | logger.info(checkpoint_file) 49 | 50 | graph = tf.Graph() 51 | with graph.as_default(): 52 | session_conf = tf.ConfigProto( 53 | allow_soft_placement=args.allow_soft_placement, 54 | log_device_placement=args.log_device_placement) 55 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and restore variables 59 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 60 | saver.restore(sess, 
checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | input_x_content = graph.get_operation_by_name("input_x_content").outputs[0] 64 | input_x_question = graph.get_operation_by_name("input_x_question").outputs[0] 65 | input_x_option = graph.get_operation_by_name("input_x_option").outputs[0] 66 | input_y = graph.get_operation_by_name("input_y").outputs[0] 67 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 68 | is_training = graph.get_operation_by_name("is_training").outputs[0] 69 | 70 | # Tensors we want to evaluate 71 | scores = graph.get_operation_by_name("output/scores").outputs[0] 72 | loss = graph.get_operation_by_name("loss/loss").outputs[0] 73 | 74 | # Split the output nodes name by '|' if you have several output nodes 75 | output_node_names = "output/scores" 76 | 77 | # Save the .pb model file 78 | output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, 79 | output_node_names.split("|")) 80 | tf.train.write_graph(output_graph_def, "graph", "graph-hmidp-{0}.pb".format(MODEL), as_text=False) 81 | 82 | # Generate batches for one epoch 83 | batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)), 84 | args.batch_size, 1, shuffle=False) 85 | 86 | test_counter, test_loss = 0, 0.0 87 | 88 | # Collect the predictions here 89 | true_labels = [] 90 | predicted_scores = [] 91 | 92 | for batch_test in batches: 93 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test) 94 | feed_dict = { 95 | input_x_content: x_batch_content, 96 | input_x_question: x_batch_question, 97 | input_x_option: x_batch_option, 98 | input_y: y_batch, 99 | dropout_keep_prob: 1.0, 100 | is_training: False 101 | } 102 | batch_scores, cur_loss = sess.run([scores, loss], feed_dict) 103 | 104 | # Prepare for calculating metrics 105 | for i in y_batch: 106 | true_labels.append(i) 107 | for j in batch_scores: 108 | predicted_scores.append(j) 109 | 110 | test_loss = test_loss + cur_loss 111 | test_counter = test_counter + 1 112 | 113 | # Calculate PCC & DOA 114 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 115 | # Calculate RMSE 116 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 117 | r2 = r2_score(true_labels, predicted_scores) 118 | 119 | test_loss = float(test_loss / test_counter) 120 | 121 | logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 122 | .format(test_loss, pcc, doa, rmse, r2)) 123 | 124 | # Save the prediction result 125 | if not os.path.exists(SAVE_DIR): 126 | os.makedirs(SAVE_DIR) 127 | dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id, 128 | all_labels=true_labels, all_predict_scores=predicted_scores) 129 | 130 | logger.info("All Done.") 131 | 132 | 133 | if __name__ == '__main__': 134 | test_hmidp() 135 | -------------------------------------------------------------------------------- /TF/H-MIDP/text_hmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | 8 | class TextHMIDP(object): 9 | """A HMIDP for text classification.""" 10 | 11 | def __init__( 12 | self, sequence_length, vocab_size, embedding_type, embedding_size, filter_sizes, num_filters, 13 | pooling_size, rnn_hidden_size, rnn_type, rnn_layers, fc_hidden_size, l2_reg_lambda=0.0, 14 | pretrained_embedding=None): 15 | 16 | # Placeholders for input, 
output, dropout_prob and training_tag 17 | self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content") 18 | self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question") 19 | self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option") 20 | self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y") 21 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 22 | self.is_training = tf.placeholder(tf.bool, name="is_training") 23 | 24 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 25 | 26 | def _get_rnn_cell(rnn_hidden_size, rnn_type): 27 | if rnn_type == 'RNN': 28 | return tf.nn.rnn_cell.BasicRNNCell(rnn_hidden_size) 29 | if rnn_type == 'LSTM': 30 | return tf.nn.rnn_cell.BasicLSTMCell(rnn_hidden_size) 31 | if rnn_type == 'GRU': 32 | return tf.nn.rnn_cell.GRUCell(rnn_hidden_size) 33 | 34 | def _convolution(input_, pool_size, layer_cnt): 35 | index = layer_cnt - 1 36 | with tf.name_scope("conv{0}".format(layer_cnt)): 37 | # Padding Zero 38 | new_input = tf.pad(input_, np.array([[0, 0], [filter_sizes[index] - 1, filter_sizes[index] - 1], 39 | [0, 0], [0, 0]]), mode="CONSTANT") 40 | width_size = new_input.get_shape().as_list()[-2] 41 | 42 | # Convolution Layer 43 | filter_shape = [filter_sizes[index], width_size, 1, num_filters[index]] 44 | W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W") 45 | b = tf.Variable(tf.constant(value=0.1, shape=[num_filters[index]], dtype=tf.float32), name="b") 46 | conv = tf.nn.conv2d( 47 | new_input, 48 | W, 49 | strides=[1, 1, 1, 1], 50 | padding="VALID", 51 | name="conv") 52 | 53 | conv = tf.nn.bias_add(conv, b) 54 | 55 | # Apply nonlinearity 56 | conv_out = tf.nn.relu(conv, name="relu") 57 | 58 | with tf.name_scope("pool{0}".format(layer_cnt)): 59 | # Maxpooling over the outputs 60 | pooled = tf.nn.max_pool( 61 | conv_out, 62 | ksize=[1, pool_size, 1, 1], 63 | strides=[1, pool_size, 1, 1], 64 | padding="VALID", 65 | name="pool") 66 | return pooled 67 | 68 | def _bi_rnn_layer(input_x, name=""): 69 | # Bi-RNN Layer 70 | with tf.variable_scope(name + "Bi_rnn", reuse=tf.AUTO_REUSE): 71 | fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type) 72 | for _ in range(rnn_layers)]) 73 | bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type) 74 | for _ in range(rnn_layers)]) 75 | if self.dropout_keep_prob is not None: 76 | fw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(fw_rnn_cell, output_keep_prob=self.dropout_keep_prob) 77 | bw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(bw_rnn_cell, output_keep_prob=self.dropout_keep_prob) 78 | 79 | # Creates a dynamic bidirectional recurrent neural network 80 | # shape of `outputs`: tuple -> (outputs_fw, outputs_bw) 81 | # shape of `outputs_fw`: [batch_size, sequence_length, rnn_hidden_size] 82 | 83 | # shape of `state`: tuple -> (outputs_state_fw, output_state_bw) 84 | # shape of `outputs_state_fw`: tuple -> (c, h) c: memory cell; h: hidden state 85 | outputs, state = tf.nn.bidirectional_dynamic_rnn(fw_rnn_cell, bw_rnn_cell, input_x, dtype=tf.float32) 86 | 87 | # Concat output 88 | # [batch_size, sequence_length, rnn_hidden_size * 2] 89 | rnn_out = tf.concat(outputs, axis=2, name=name + "rnn_out") 90 | 91 | # [batch_size, rnn_hidden_size * 2] 92 | rnn_pooled = tf.reduce_max(rnn_out, axis=1, name=name + "rnn_pooled") 93 | 94 | return rnn_pooled 95 | 96 | def 
_fc_layer(input_x, name=""): 97 | """ 98 | Fully Connected Layer. 99 | Args: 100 | input_x: 101 | name: Scope name 102 | Returns: 103 | [batch_size, fc_hidden_size] 104 | """ 105 | with tf.name_scope(name + "fc"): 106 | num_units = input_x.get_shape().as_list()[-1] 107 | W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size], 108 | stddev=0.1, dtype=tf.float32), name="W") 109 | b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b") 110 | fc = tf.nn.xw_plus_b(input_x, W, b) 111 | fc_out = tf.nn.relu(fc) 112 | return fc_out 113 | 114 | # Embedding Layer 115 | with tf.device("/cpu:0"), tf.name_scope("embedding"): 116 | # Use random generated the word vector by default 117 | # Can also be obtained through our own word vectors trained by our corpus 118 | if pretrained_embedding is None: 119 | self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0, 120 | dtype=tf.float32), trainable=True, name="embedding") 121 | else: 122 | if embedding_type == 0: 123 | self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding") 124 | if embedding_type == 1: 125 | self.embedding = tf.Variable(pretrained_embedding, trainable=True, 126 | dtype=tf.float32, name="embedding") 127 | # [batch_size, sequence_length, embedding_size] 128 | self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content) 129 | self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question) 130 | self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option) 131 | 132 | sequence_length_total = sequence_length[0] + sequence_length[1] + sequence_length[2] 133 | # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size] 134 | self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question, 135 | self.embedded_sentence_option], axis=1) 136 | self.embedded_sentence_expanded = tf.expand_dims(self.embedded_sentence_all, axis=-1) 137 | 138 | # Convolution Layer 1 139 | # conv1_out: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], 1, num_filters[0]] 140 | self.conv1_out = _convolution(self.embedded_sentence_expanded, pool_size=pooling_size, layer_cnt=1) 141 | # conv1_out_trans: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], num_filters[0], 1] 142 | self.conv1_out_trans = tf.transpose(self.conv1_out, perm=[0, 1, 3, 2]) 143 | 144 | # Convolution Layer 2 145 | new_pooling_size = (sequence_length_total + filter_sizes[0] - 1) // pooling_size 146 | self.conv2_out = _convolution(self.conv1_out_trans, pool_size=new_pooling_size, layer_cnt=2) 147 | self.conv_final_flat = tf.reshape(self.conv2_out, shape=[-1, num_filters[1]]) 148 | 149 | # Bi-RNN Layer 150 | # bi_rnn_out: [batch_size, rnn_hidden_size * 2] 151 | self.bi_rnn_out = _bi_rnn_layer(self.embedded_sentence_all, name="total_") 152 | 153 | # Concat 154 | self.conv_rnn_concat = tf.concat([self.conv_final_flat, self.bi_rnn_out], axis=1) 155 | 156 | # Fully Connected Layer 1 157 | self.fc1_out = _fc_layer(self.conv_rnn_concat) 158 | 159 | # Fully Connected Layer 2 160 | self.fc2_out = _fc_layer(self.fc1_out) 161 | 162 | # Add dropout 163 | with tf.name_scope("dropout"): 164 | self.fc_drop = tf.nn.dropout(self.fc2_out, self.dropout_keep_prob) 165 | 166 | # Final scores 167 | with tf.name_scope("output"): 168 | W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1], 169 | stddev=0.1, 
dtype=tf.float32), name="W") 170 | b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b") 171 | self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits") 172 | self.scores = tf.sigmoid(self.logits, name="scores") 173 | 174 | # Calculate mean cross-entropy loss, L2 loss 175 | with tf.name_scope("loss"): 176 | losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses") 177 | l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()], 178 | name="l2_losses") * l2_reg_lambda 179 | self.loss = tf.add(losses, l2_losses, name="loss") 180 | 181 | -------------------------------------------------------------------------------- /TF/H-MIDP/train_hmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from text_hmidp import TextHMIDP 14 | from utils import checkmate as cm 15 | from utils import data_helpers as dh 16 | from utils import param_parser as parser 17 | from tensorboard.plugins import projector 18 | from sklearn.metrics import mean_squared_error, r2_score 19 | 20 | args = parser.parameter_parser() 21 | OPTION = dh.option(pattern=0) 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 23 | 24 | 25 | def train_hmidp(): 26 | """Training HMIDP model.""" 27 | # Print parameters used for the model 28 | dh.tab_printer(args, logger) 29 | 30 | # Load sentences, labels, and training parameters 31 | logger.info("Loading data...") 32 | logger.info("Data processing...") 33 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False) 34 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False) 35 | 36 | logger.info("Data padding...") 37 | x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len) 38 | x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len) 39 | 40 | # Build vocabulary 41 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 42 | 43 | # Build a graph and hmidp object 44 | with tf.Graph().as_default(): 45 | session_conf = tf.ConfigProto( 46 | allow_soft_placement=args.allow_soft_placement, 47 | log_device_placement=args.log_device_placement) 48 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 49 | sess = tf.Session(config=session_conf) 50 | with sess.as_default(): 51 | hmidp = TextHMIDP( 52 | sequence_length=args.pad_seq_len, 53 | vocab_size=VOCAB_SIZE, 54 | embedding_type=args.embedding_type, 55 | embedding_size=EMBEDDING_SIZE, 56 | filter_sizes=args.filter_sizes, 57 | num_filters=args.num_filters, 58 | pooling_size=args.pooling_size, 59 | rnn_hidden_size=args.rnn_dim, 60 | rnn_type=args.rnn_type, 61 | rnn_layers=args.rnn_layers, 62 | fc_hidden_size=args.fc_dim, 63 | l2_reg_lambda=args.l2_lambda, 64 | pretrained_embedding=pretrained_word2vec_matrix) 65 | 66 | # Define training procedure 67 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 68 | learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate, 69 | global_step=hmidp.global_step, decay_steps=args.decay_steps, 70 | 
decay_rate=args.decay_rate, staircase=True) 71 | optimizer = tf.train.AdamOptimizer(learning_rate) 72 | grads, vars = zip(*optimizer.compute_gradients(hmidp.loss)) 73 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio) 74 | train_op = optimizer.apply_gradients(zip(grads, vars), global_step=hmidp.global_step, name="train_op") 75 | 76 | # Keep track of gradient values and sparsity (optional) 77 | grad_summaries = [] 78 | for g, v in zip(grads, vars): 79 | if g is not None: 80 | grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g) 81 | sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 82 | grad_summaries.append(grad_hist_summary) 83 | grad_summaries.append(sparsity_summary) 84 | grad_summaries_merged = tf.summary.merge(grad_summaries) 85 | 86 | # Output directory for models and summaries 87 | out_dir = dh.get_out_dir(OPTION, logger) 88 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 89 | best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints")) 90 | 91 | # Summaries for loss 92 | loss_summary = tf.summary.scalar("loss", hmidp.loss) 93 | 94 | # Train summaries 95 | train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) 96 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 97 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 98 | 99 | # Validation summaries 100 | validation_summary_op = tf.summary.merge([loss_summary]) 101 | validation_summary_dir = os.path.join(out_dir, "summaries", "validation") 102 | validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph) 103 | 104 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints) 105 | best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False) 106 | 107 | if OPTION == 'R': 108 | # Load hmidp model 109 | logger.info("Loading model...") 110 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 111 | logger.info(checkpoint_file) 112 | 113 | # Load the saved meta graph and restore variables 114 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 115 | saver.restore(sess, checkpoint_file) 116 | if OPTION == 'T': 117 | if not os.path.exists(checkpoint_dir): 118 | os.makedirs(checkpoint_dir) 119 | sess.run(tf.global_variables_initializer()) 120 | sess.run(tf.local_variables_initializer()) 121 | 122 | # Embedding visualization config 123 | config = projector.ProjectorConfig() 124 | embedding_conf = config.embeddings.add() 125 | embedding_conf.tensor_name = "embedding" 126 | embedding_conf.metadata_path = args.metadata_file 127 | 128 | projector.visualize_embeddings(train_summary_writer, config) 129 | projector.visualize_embeddings(validation_summary_writer, config) 130 | 131 | # Save the embedding visualization 132 | saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt")) 133 | 134 | current_step = sess.run(hmidp.global_step) 135 | 136 | def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch): 137 | """A single training step""" 138 | feed_dict = { 139 | hmidp.input_x_content: x_batch_content, 140 | hmidp.input_x_question: x_batch_question, 141 | hmidp.input_x_option: x_batch_option, 142 | hmidp.input_y: y_batch, 143 | hmidp.dropout_keep_prob: args.dropout_rate, 144 | hmidp.is_training: True 145 | } 146 | _, step, summaries, loss = sess.run( 147 | [train_op, hmidp.global_step, train_summary_op, hmidp.loss], 
feed_dict) 148 | logger.info("step {0}: loss {1:g}".format(step, loss)) 149 | train_summary_writer.add_summary(summaries, step) 150 | 151 | def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None): 152 | """Evaluates model on a validation set""" 153 | batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)), 154 | args.batch_size, 1) 155 | 156 | eval_counter, eval_loss = 0, 0.0 157 | 158 | true_labels = [] 159 | predicted_scores = [] 160 | 161 | for batch_validation in batches_validation: 162 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation) 163 | feed_dict = { 164 | hmidp.input_x_content: x_batch_content, 165 | hmidp.input_x_question: x_batch_question, 166 | hmidp.input_x_option: x_batch_option, 167 | hmidp.input_y: y_batch, 168 | hmidp.dropout_keep_prob: 1.0, 169 | hmidp.is_training: False 170 | } 171 | step, summaries, scores, cur_loss = sess.run( 172 | [hmidp.global_step, validation_summary_op, hmidp.scores, hmidp.loss], feed_dict) 173 | 174 | # Prepare for calculating metrics 175 | for i in y_batch: 176 | true_labels.append(i) 177 | for j in scores: 178 | predicted_scores.append(j) 179 | 180 | eval_loss = eval_loss + cur_loss 181 | eval_counter = eval_counter + 1 182 | 183 | if writer: 184 | writer.add_summary(summaries, step) 185 | 186 | eval_loss = float(eval_loss / eval_counter) 187 | 188 | # Calculate PCC & DOA 189 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 190 | # Calculate RMSE 191 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 192 | r2 = r2_score(true_labels, predicted_scores) 193 | 194 | return eval_loss, pcc, doa, rmse, r2 195 | 196 | # Generate batches 197 | batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)), 198 | args.batch_size, args.epochs) 199 | 200 | num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1 201 | 202 | # Training loop. For each batch... 
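            # One pass over `batches_train` covers args.epochs epochs of the training data.
            # Every args.evaluate_steps steps the full validation set is scored (loss, PCC, DOA,
            # RMSE, R2) and best_saver keeps the checkpoints with the lowest validation RMSE;
            # every args.checkpoint_steps steps a regular checkpoint is written as well.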
203 | for batch_train in batches_train: 204 | x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train) 205 | train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train) 206 | current_step = tf.train.global_step(sess, hmidp.global_step) 207 | 208 | if current_step % args.evaluate_steps == 0: 209 | logger.info("\nEvaluation:") 210 | eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val, 211 | writer=validation_summary_writer) 212 | logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 213 | .format(eval_loss, pcc, doa, rmse, r2)) 214 | best_saver.handle(rmse, sess, current_step) 215 | if current_step % args.checkpoint_steps == 0: 216 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 217 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 218 | logger.info("Saved model checkpoint to {0}\n".format(path)) 219 | if current_step % num_batches_per_epoch == 0: 220 | current_epoch = current_step // num_batches_per_epoch 221 | logger.info("Epoch {0} has finished!".format(current_epoch)) 222 | 223 | logger.info("All Done.") 224 | 225 | 226 | if __name__ == '__main__': 227 | train_hmidp() -------------------------------------------------------------------------------- /TF/R-MIDP/test_rmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from sklearn.metrics import mean_squared_error, r2_score 17 | 18 | args = parser.parameter_parser() 19 | MODEL = dh.get_model_name() 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime())) 21 | 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/' 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/' 24 | SAVE_DIR = 'output/' + MODEL 25 | 26 | 27 | def test_rmidp(): 28 | """Test RMIDP model.""" 29 | # Print parameters used for the model 30 | dh.tab_printer(args, logger) 31 | 32 | # Load data 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False) 36 | 37 | logger.info("Data padding...") 38 | x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len) 39 | 40 | # Load rmidp model 41 | OPTION = dh.option(pattern=1) 42 | if OPTION == 'B': 43 | logger.info("Loading best model...") 44 | checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True) 45 | else: 46 | logger.info("Loading latest model...") 47 | checkpoint_file = tf.train.latest_checkpoint(CPT_DIR) 48 | logger.info(checkpoint_file) 49 | 50 | graph = tf.Graph() 51 | with graph.as_default(): 52 | session_conf = tf.ConfigProto( 53 | allow_soft_placement=args.allow_soft_placement, 54 | log_device_placement=args.log_device_placement) 55 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and restore variables 59 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 60 | saver.restore(sess, 
checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | input_x_content = graph.get_operation_by_name("input_x_content").outputs[0] 64 | input_x_question = graph.get_operation_by_name("input_x_question").outputs[0] 65 | input_x_option = graph.get_operation_by_name("input_x_option").outputs[0] 66 | input_y = graph.get_operation_by_name("input_y").outputs[0] 67 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 68 | is_training = graph.get_operation_by_name("is_training").outputs[0] 69 | 70 | # Tensors we want to evaluate 71 | scores = graph.get_operation_by_name("output/scores").outputs[0] 72 | loss = graph.get_operation_by_name("loss/loss").outputs[0] 73 | 74 | # Split the output nodes name by '|' if you have several output nodes 75 | output_node_names = "output/scores" 76 | 77 | # Save the .pb model file 78 | output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, 79 | output_node_names.split("|")) 80 | tf.train.write_graph(output_graph_def, "graph", "graph-rmidp-{0}.pb".format(MODEL), as_text=False) 81 | 82 | # Generate batches for one epoch 83 | batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)), 84 | args.batch_size, 1, shuffle=False) 85 | 86 | test_counter, test_loss = 0, 0.0 87 | 88 | # Collect the predictions here 89 | true_labels = [] 90 | predicted_scores = [] 91 | 92 | for batch_test in batches: 93 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test) 94 | feed_dict = { 95 | input_x_content: x_batch_content, 96 | input_x_question: x_batch_question, 97 | input_x_option: x_batch_option, 98 | input_y: y_batch, 99 | dropout_keep_prob: 1.0, 100 | is_training: False 101 | } 102 | batch_scores, cur_loss = sess.run([scores, loss], feed_dict) 103 | 104 | # Prepare for calculating metrics 105 | for i in y_batch: 106 | true_labels.append(i) 107 | for j in batch_scores: 108 | predicted_scores.append(j) 109 | 110 | test_loss = test_loss + cur_loss 111 | test_counter = test_counter + 1 112 | 113 | # Calculate PCC & DOA 114 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 115 | # Calculate RMSE 116 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 117 | r2 = r2_score(true_labels, predicted_scores) 118 | 119 | test_loss = float(test_loss / test_counter) 120 | 121 | logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 122 | .format(test_loss, pcc, doa, rmse, r2)) 123 | 124 | # Save the prediction result 125 | if not os.path.exists(SAVE_DIR): 126 | os.makedirs(SAVE_DIR) 127 | dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id, 128 | all_labels=true_labels, all_predict_scores=predicted_scores) 129 | 130 | logger.info("All Done.") 131 | 132 | 133 | if __name__ == '__main__': 134 | test_rmidp() 135 | -------------------------------------------------------------------------------- /TF/R-MIDP/text_rmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | 8 | class TextRMIDP(object): 9 | """A RMIDP for text classification.""" 10 | 11 | def __init__( 12 | self, sequence_length, vocab_size, embedding_type, embedding_size, rnn_hidden_size, rnn_type, 13 | rnn_layers, fc_hidden_size, l2_reg_lambda=0.0, pretrained_embedding=None): 14 | 15 | # Placeholders for input, output, dropout_prob and training_tag 16 | 
self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content") 17 | self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question") 18 | self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option") 19 | self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y") 20 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 21 | self.is_training = tf.placeholder(tf.bool, name="is_training") 22 | 23 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 24 | 25 | def _get_rnn_cell(rnn_hidden_size, rnn_type): 26 | if rnn_type == 'RNN': 27 | return tf.nn.rnn_cell.BasicRNNCell(rnn_hidden_size) 28 | if rnn_type == 'LSTM': 29 | return tf.nn.rnn_cell.BasicLSTMCell(rnn_hidden_size) 30 | if rnn_type == 'GRU': 31 | return tf.nn.rnn_cell.GRUCell(rnn_hidden_size) 32 | 33 | def _bi_rnn_layer(input_x, name=""): 34 | # Bi-RNN Layer 35 | with tf.variable_scope(name + "Bi_rnn", reuse=tf.AUTO_REUSE): 36 | fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type) 37 | for _ in range(rnn_layers)]) 38 | bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type) 39 | for _ in range(rnn_layers)]) 40 | if self.dropout_keep_prob is not None: 41 | fw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(fw_rnn_cell, output_keep_prob=self.dropout_keep_prob) 42 | bw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(bw_rnn_cell, output_keep_prob=self.dropout_keep_prob) 43 | 44 | # Creates a dynamic bidirectional recurrent neural network 45 | # shape of `outputs`: tuple -> (outputs_fw, outputs_bw) 46 | # shape of `outputs_fw`: [batch_size, sequence_length, rnn_hidden_size] 47 | 48 | # shape of `state`: tuple -> (outputs_state_fw, output_state_bw) 49 | # shape of `outputs_state_fw`: tuple -> (c, h) c: memory cell; h: hidden state 50 | outputs, state = tf.nn.bidirectional_dynamic_rnn(fw_rnn_cell, bw_rnn_cell, input_x, dtype=tf.float32) 51 | 52 | # Concat output 53 | # [batch_size, sequence_length, rnn_hidden_size * 2] 54 | rnn_out = tf.concat(outputs, axis=2, name=name + "rnn_out") 55 | 56 | # [batch_size, rnn_hidden_size * 2] 57 | rnn_pooled = tf.reduce_max(rnn_out, axis=1, name=name + "rnn_pooled") 58 | 59 | return rnn_pooled 60 | 61 | def _fc_layer(input_x, name=""): 62 | """ 63 | Fully Connected Layer. 
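            Computes relu(x * W + b), projecting the input down to fc_hidden_size units.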
64 | Args: 65 | input_x: 66 | name: Scope name 67 | Returns: 68 | [batch_size, fc_hidden_size] 69 | """ 70 | with tf.name_scope(name + "fc"): 71 | num_units = input_x.get_shape().as_list()[-1] 72 | W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size], 73 | stddev=0.1, dtype=tf.float32), name="W") 74 | b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b") 75 | fc = tf.nn.xw_plus_b(input_x, W, b) 76 | fc_out = tf.nn.relu(fc) 77 | return fc_out 78 | 79 | # Embedding Layer 80 | with tf.device("/cpu:0"), tf.name_scope("embedding"): 81 | # Use random generated the word vector by default 82 | # Can also be obtained through our own word vectors trained by our corpus 83 | if pretrained_embedding is None: 84 | self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0, 85 | dtype=tf.float32), trainable=True, name="embedding") 86 | else: 87 | if embedding_type == 0: 88 | self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding") 89 | if embedding_type == 1: 90 | self.embedding = tf.Variable(pretrained_embedding, trainable=True, 91 | dtype=tf.float32, name="embedding") 92 | # [batch_size, sequence_length, embedding_size] 93 | self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content) 94 | self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question) 95 | self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option) 96 | 97 | # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size] 98 | self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question, 99 | self.embedded_sentence_option], axis=1) 100 | 101 | # Bi-RNN Layer 102 | # bi_rnn_out: [batch_size, rnn_hidden_size * 2] 103 | self.bi_rnn_out = _bi_rnn_layer(self.embedded_sentence_all, name="total_") 104 | 105 | # Fully Connected Layer 106 | self.fc_out = _fc_layer(self.bi_rnn_out) 107 | 108 | # Add dropout 109 | with tf.name_scope("dropout"): 110 | self.fc_drop = tf.nn.dropout(self.fc_out, self.dropout_keep_prob) 111 | 112 | # Final scores 113 | with tf.name_scope("output"): 114 | W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1], 115 | stddev=0.1, dtype=tf.float32), name="W") 116 | b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b") 117 | self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits") 118 | self.scores = tf.sigmoid(self.logits, name="scores") 119 | 120 | # Calculate mean cross-entropy loss, L2 loss 121 | with tf.name_scope("loss"): 122 | losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses") 123 | l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()], 124 | name="l2_losses") * l2_reg_lambda 125 | self.loss = tf.add(losses, l2_losses, name="loss") -------------------------------------------------------------------------------- /TF/R-MIDP/train_rmidp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from text_rmidp import TextRMIDP 14 | from utils import checkmate as cm 15 | from utils import data_helpers as dh 16 | from utils import param_parser as parser 17 | from 
tensorboard.plugins import projector 18 | from sklearn.metrics import mean_squared_error, r2_score 19 | 20 | args = parser.parameter_parser() 21 | OPTION = dh.option(pattern=0) 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 23 | 24 | 25 | def train_rmidp(): 26 | """Training RMIDP model.""" 27 | # Print parameters used for the model 28 | dh.tab_printer(args, logger) 29 | 30 | # Load sentences, labels, and training parameters 31 | logger.info("Loading data...") 32 | logger.info("Data processing...") 33 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False) 34 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False) 35 | 36 | logger.info("Data padding...") 37 | x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len) 38 | x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len) 39 | 40 | # Build vocabulary 41 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 42 | 43 | # Build a graph and rmidp object 44 | with tf.Graph().as_default(): 45 | session_conf = tf.ConfigProto( 46 | allow_soft_placement=args.allow_soft_placement, 47 | log_device_placement=args.log_device_placement) 48 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 49 | sess = tf.Session(config=session_conf) 50 | with sess.as_default(): 51 | rmidp = TextRMIDP( 52 | sequence_length=args.pad_seq_len, 53 | vocab_size=VOCAB_SIZE, 54 | embedding_type=args.embedding_type, 55 | embedding_size=EMBEDDING_SIZE, 56 | rnn_hidden_size=args.rnn_dim, 57 | rnn_type=args.rnn_type, 58 | rnn_layers=args.rnn_layers, 59 | fc_hidden_size=args.fc_dim, 60 | l2_reg_lambda=args.l2_lambda, 61 | pretrained_embedding=pretrained_word2vec_matrix) 62 | 63 | # Define training procedure 64 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 65 | learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate, 66 | global_step=rmidp.global_step, decay_steps=args.decay_steps, 67 | decay_rate=args.decay_rate, staircase=True) 68 | optimizer = tf.train.AdamOptimizer(learning_rate) 69 | grads, vars = zip(*optimizer.compute_gradients(rmidp.loss)) 70 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio) 71 | train_op = optimizer.apply_gradients(zip(grads, vars), global_step=rmidp.global_step, name="train_op") 72 | 73 | # Keep track of gradient values and sparsity (optional) 74 | grad_summaries = [] 75 | for g, v in zip(grads, vars): 76 | if g is not None: 77 | grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g) 78 | sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 79 | grad_summaries.append(grad_hist_summary) 80 | grad_summaries.append(sparsity_summary) 81 | grad_summaries_merged = tf.summary.merge(grad_summaries) 82 | 83 | # Output directory for models and summaries 84 | out_dir = dh.get_out_dir(OPTION, logger) 85 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 86 | best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints")) 87 | 88 | # Summaries for loss 89 | loss_summary = tf.summary.scalar("loss", rmidp.loss) 90 | 91 | # Train summaries 92 | train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) 93 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 94 | 
train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 95 | 96 | # Validation summaries 97 | validation_summary_op = tf.summary.merge([loss_summary]) 98 | validation_summary_dir = os.path.join(out_dir, "summaries", "validation") 99 | validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph) 100 | 101 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints) 102 | best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False) 103 | 104 | if OPTION == 'R': 105 | # Load rmidp model 106 | logger.info("Loading model...") 107 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 108 | logger.info(checkpoint_file) 109 | 110 | # Load the saved meta graph and restore variables 111 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 112 | saver.restore(sess, checkpoint_file) 113 | if OPTION == 'T': 114 | if not os.path.exists(checkpoint_dir): 115 | os.makedirs(checkpoint_dir) 116 | sess.run(tf.global_variables_initializer()) 117 | sess.run(tf.local_variables_initializer()) 118 | 119 | # Embedding visualization config 120 | config = projector.ProjectorConfig() 121 | embedding_conf = config.embeddings.add() 122 | embedding_conf.tensor_name = "embedding" 123 | embedding_conf.metadata_path = args.metadata_file 124 | 125 | projector.visualize_embeddings(train_summary_writer, config) 126 | projector.visualize_embeddings(validation_summary_writer, config) 127 | 128 | # Save the embedding visualization 129 | saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt")) 130 | 131 | current_step = sess.run(rmidp.global_step) 132 | 133 | def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch): 134 | """A single training step""" 135 | feed_dict = { 136 | rmidp.input_x_content: x_batch_content, 137 | rmidp.input_x_question: x_batch_question, 138 | rmidp.input_x_option: x_batch_option, 139 | rmidp.input_y: y_batch, 140 | rmidp.dropout_keep_prob: args.dropout_rate, 141 | rmidp.is_training: True 142 | } 143 | _, step, summaries, loss = sess.run( 144 | [train_op, rmidp.global_step, train_summary_op, rmidp.loss], feed_dict) 145 | logger.info("step {0}: loss {1:g}".format(step, loss)) 146 | train_summary_writer.add_summary(summaries, step) 147 | 148 | def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None): 149 | """Evaluates model on a validation set""" 150 | batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)), 151 | args.batch_size, 1) 152 | 153 | eval_counter, eval_loss = 0, 0.0 154 | 155 | true_labels = [] 156 | predicted_scores = [] 157 | 158 | for batch_validation in batches_validation: 159 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation) 160 | feed_dict = { 161 | rmidp.input_x_content: x_batch_content, 162 | rmidp.input_x_question: x_batch_question, 163 | rmidp.input_x_option: x_batch_option, 164 | rmidp.input_y: y_batch, 165 | rmidp.dropout_keep_prob: 1.0, 166 | rmidp.is_training: False 167 | } 168 | step, summaries, scores, cur_loss = sess.run( 169 | [rmidp.global_step, validation_summary_op, rmidp.scores, rmidp.loss], feed_dict) 170 | 171 | # Prepare for calculating metrics 172 | for i in y_batch: 173 | true_labels.append(i) 174 | for j in scores: 175 | predicted_scores.append(j) 176 | 177 | eval_loss = eval_loss + cur_loss 178 | eval_counter = eval_counter + 1 179 | 180 | if writer: 181 | writer.add_summary(summaries, step) 182 | 183 | 
eval_loss = float(eval_loss / eval_counter) 184 | 185 | # Calculate PCC & DOA 186 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 187 | # Calculate RMSE 188 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 189 | r2 = r2_score(true_labels, predicted_scores) 190 | 191 | return eval_loss, pcc, doa, rmse, r2 192 | 193 | # Generate batches 194 | batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)), 195 | args.batch_size, args.epochs) 196 | 197 | num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1 198 | 199 | # Training loop. For each batch... 200 | for batch_train in batches_train: 201 | x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train) 202 | train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train) 203 | current_step = tf.train.global_step(sess, rmidp.global_step) 204 | 205 | if current_step % args.evaluate_steps == 0: 206 | logger.info("\nEvaluation:") 207 | eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val, 208 | writer=validation_summary_writer) 209 | logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 210 | .format(eval_loss, pcc, doa, rmse, r2)) 211 | best_saver.handle(rmse, sess, current_step) 212 | if current_step % args.checkpoint_steps == 0: 213 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 214 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 215 | logger.info("Saved model checkpoint to {0}\n".format(path)) 216 | if current_step % num_batches_per_epoch == 0: 217 | current_epoch = current_step // num_batches_per_epoch 218 | logger.info("Epoch {0} has finished!".format(current_epoch)) 219 | 220 | logger.info("All Done.") 221 | 222 | 223 | if __name__ == '__main__': 224 | train_rmidp() -------------------------------------------------------------------------------- /TF/TARNN/test_tarnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from utils import checkmate as cm 14 | from utils import data_helpers as dh 15 | from utils import param_parser as parser 16 | from sklearn.metrics import mean_squared_error, r2_score 17 | 18 | args = parser.parameter_parser() 19 | MODEL = dh.get_model_name() 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime())) 21 | 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/' 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/' 24 | SAVE_DIR = 'output/' + MODEL 25 | 26 | 27 | def test_tarnn(): 28 | """Test TARNN model.""" 29 | # Print parameters used for the model 30 | dh.tab_printer(args, logger) 31 | 32 | # Load data 33 | logger.info("Loading data...") 34 | logger.info("Data processing...") 35 | test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False) 36 | 37 | logger.info("Data padding...") 38 | x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len) 39 | 40 | # Load tarnn model 41 | OPTION = dh.option(pattern=1) 42 | if OPTION == 'B': 43 | logger.info("Loading best model...") 44 | checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True) 45 | else: 46 | 
logger.info("Loading latest model...") 47 | checkpoint_file = tf.train.latest_checkpoint(CPT_DIR) 48 | logger.info(checkpoint_file) 49 | 50 | graph = tf.Graph() 51 | with graph.as_default(): 52 | session_conf = tf.ConfigProto( 53 | allow_soft_placement=args.allow_soft_placement, 54 | log_device_placement=args.log_device_placement) 55 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 56 | sess = tf.Session(config=session_conf) 57 | with sess.as_default(): 58 | # Load the saved meta graph and restore variables 59 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 60 | saver.restore(sess, checkpoint_file) 61 | 62 | # Get the placeholders from the graph by name 63 | input_x_content = graph.get_operation_by_name("input_x_content").outputs[0] 64 | input_x_question = graph.get_operation_by_name("input_x_question").outputs[0] 65 | input_x_option = graph.get_operation_by_name("input_x_option").outputs[0] 66 | input_y = graph.get_operation_by_name("input_y").outputs[0] 67 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 68 | is_training = graph.get_operation_by_name("is_training").outputs[0] 69 | 70 | # Tensors we want to evaluate 71 | scores = graph.get_operation_by_name("output/scores").outputs[0] 72 | loss = graph.get_operation_by_name("loss/loss").outputs[0] 73 | 74 | # Split the output nodes name by '|' if you have several output nodes 75 | output_node_names = "output/scores" 76 | 77 | # Save the .pb model file 78 | output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, 79 | output_node_names.split("|")) 80 | tf.train.write_graph(output_graph_def, "graph", "graph-tarnn-{0}.pb".format(MODEL), as_text=False) 81 | 82 | # Generate batches for one epoch 83 | batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)), 84 | args.batch_size, 1, shuffle=False) 85 | 86 | test_counter, test_loss = 0, 0.0 87 | 88 | # Collect the predictions here 89 | true_labels = [] 90 | predicted_scores = [] 91 | 92 | for batch_test in batches: 93 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test) 94 | feed_dict = { 95 | input_x_content: x_batch_content, 96 | input_x_question: x_batch_question, 97 | input_x_option: x_batch_option, 98 | input_y: y_batch, 99 | dropout_keep_prob: 1.0, 100 | is_training: False 101 | } 102 | batch_scores, cur_loss = sess.run([scores, loss], feed_dict) 103 | 104 | # Prepare for calculating metrics 105 | for i in y_batch: 106 | true_labels.append(i) 107 | for j in batch_scores: 108 | predicted_scores.append(j) 109 | 110 | test_loss = test_loss + cur_loss 111 | test_counter = test_counter + 1 112 | 113 | # Calculate PCC & DOA 114 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 115 | # Calculate RMSE 116 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 117 | r2 = r2_score(true_labels, predicted_scores) 118 | 119 | test_loss = float(test_loss / test_counter) 120 | 121 | logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 122 | .format(test_loss, pcc, doa, rmse, r2)) 123 | 124 | # Save the prediction result 125 | if not os.path.exists(SAVE_DIR): 126 | os.makedirs(SAVE_DIR) 127 | dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id, 128 | all_labels=true_labels, all_predict_scores=predicted_scores) 129 | 130 | logger.info("All Done.") 131 | 132 | 133 | if __name__ == '__main__': 134 | test_tarnn() 135 | 
-------------------------------------------------------------------------------- /TF/TARNN/train_tarnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | 9 | sys.path.append('../') 10 | logging.getLogger('tensorflow').disabled = True 11 | 12 | import tensorflow as tf 13 | from text_tarnn import TextTARNN 14 | from utils import checkmate as cm 15 | from utils import data_helpers as dh 16 | from utils import param_parser as parser 17 | from tensorboard.plugins import projector 18 | from sklearn.metrics import mean_squared_error, r2_score 19 | 20 | args = parser.parameter_parser() 21 | OPTION = dh.option(pattern=0) 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime())) 23 | 24 | 25 | def train_tarnn(): 26 | """Training TARNN model.""" 27 | # Print parameters used for the model 28 | dh.tab_printer(args, logger) 29 | 30 | # Load sentences, labels, and training parameters 31 | logger.info("Loading data...") 32 | logger.info("Data processing...") 33 | train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False) 34 | val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False) 35 | 36 | logger.info("Data padding...") 37 | x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len) 38 | x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len) 39 | 40 | # Build vocabulary 41 | VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file) 42 | 43 | # Build a graph and tarnn object 44 | with tf.Graph().as_default(): 45 | session_conf = tf.ConfigProto( 46 | allow_soft_placement=args.allow_soft_placement, 47 | log_device_placement=args.log_device_placement) 48 | session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth 49 | sess = tf.Session(config=session_conf) 50 | with sess.as_default(): 51 | tarnn = TextTARNN( 52 | sequence_length=args.pad_seq_len, 53 | vocab_size=VOCAB_SIZE, 54 | embedding_type=args.embedding_type, 55 | embedding_size=EMBEDDING_SIZE, 56 | rnn_hidden_size=args.rnn_dim, 57 | rnn_type=args.rnn_type, 58 | rnn_layers=args.rnn_layers, 59 | attention_type=args.attention_type, 60 | fc_hidden_size=args.fc_dim, 61 | l2_reg_lambda=args.l2_lambda, 62 | pretrained_embedding=pretrained_word2vec_matrix) 63 | 64 | # Define training procedure 65 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 66 | learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate, 67 | global_step=tarnn.global_step, decay_steps=args.decay_steps, 68 | decay_rate=args.decay_rate, staircase=True) 69 | optimizer = tf.train.AdamOptimizer(learning_rate) 70 | grads, vars = zip(*optimizer.compute_gradients(tarnn.loss)) 71 | grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio) 72 | train_op = optimizer.apply_gradients(zip(grads, vars), global_step=tarnn.global_step, name="train_op") 73 | 74 | # Keep track of gradient values and sparsity (optional) 75 | grad_summaries = [] 76 | for g, v in zip(grads, vars): 77 | if g is not None: 78 | grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g) 79 | sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 80 | grad_summaries.append(grad_hist_summary) 
81 | grad_summaries.append(sparsity_summary) 82 | grad_summaries_merged = tf.summary.merge(grad_summaries) 83 | 84 | # Output directory for models and summaries 85 | out_dir = dh.get_out_dir(OPTION, logger) 86 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 87 | best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints")) 88 | 89 | # Summaries for loss 90 | loss_summary = tf.summary.scalar("loss", tarnn.loss) 91 | 92 | # Train summaries 93 | train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) 94 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 95 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 96 | 97 | # Validation summaries 98 | validation_summary_op = tf.summary.merge([loss_summary]) 99 | validation_summary_dir = os.path.join(out_dir, "summaries", "validation") 100 | validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph) 101 | 102 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints) 103 | best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False) 104 | 105 | if OPTION == 'R': 106 | # Load tarnn model 107 | logger.info("Loading model...") 108 | checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) 109 | logger.info(checkpoint_file) 110 | 111 | # Load the saved meta graph and restore variables 112 | saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file)) 113 | saver.restore(sess, checkpoint_file) 114 | if OPTION == 'T': 115 | if not os.path.exists(checkpoint_dir): 116 | os.makedirs(checkpoint_dir) 117 | sess.run(tf.global_variables_initializer()) 118 | sess.run(tf.local_variables_initializer()) 119 | 120 | # Embedding visualization config 121 | config = projector.ProjectorConfig() 122 | embedding_conf = config.embeddings.add() 123 | embedding_conf.tensor_name = "embedding" 124 | embedding_conf.metadata_path = args.metadata_file 125 | 126 | projector.visualize_embeddings(train_summary_writer, config) 127 | projector.visualize_embeddings(validation_summary_writer, config) 128 | 129 | # Save the embedding visualization 130 | saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt")) 131 | 132 | current_step = sess.run(tarnn.global_step) 133 | 134 | def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch): 135 | """A single training step""" 136 | feed_dict = { 137 | tarnn.input_x_content: x_batch_content, 138 | tarnn.input_x_question: x_batch_question, 139 | tarnn.input_x_option: x_batch_option, 140 | tarnn.input_y: y_batch, 141 | tarnn.dropout_keep_prob: args.dropout_rate, 142 | tarnn.is_training: True 143 | } 144 | _, step, summaries, loss = sess.run( 145 | [train_op, tarnn.global_step, train_summary_op, tarnn.loss], feed_dict) 146 | logger.info("step {0}: loss {1:g}".format(step, loss)) 147 | train_summary_writer.add_summary(summaries, step) 148 | 149 | def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None): 150 | """Evaluates model on a validation set""" 151 | batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)), 152 | args.batch_size, 1) 153 | 154 | eval_counter, eval_loss = 0, 0.0 155 | true_labels = [] 156 | predicted_scores = [] 157 | 158 | for batch_validation in batches_validation: 159 | x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation) 160 | feed_dict = { 161 | tarnn.input_x_content: x_batch_content, 162 | 
tarnn.input_x_question: x_batch_question, 163 | tarnn.input_x_option: x_batch_option, 164 | tarnn.input_y: y_batch, 165 | tarnn.dropout_keep_prob: 1.0, 166 | tarnn.is_training: False 167 | } 168 | step, summaries, scores, cur_loss = sess.run( 169 | [tarnn.global_step, validation_summary_op, tarnn.scores, tarnn.loss], feed_dict) 170 | 171 | # Prepare for calculating metrics 172 | for i in y_batch: 173 | true_labels.append(i) 174 | for j in scores: 175 | predicted_scores.append(j) 176 | 177 | eval_loss = eval_loss + cur_loss 178 | eval_counter = eval_counter + 1 179 | 180 | if writer: 181 | writer.add_summary(summaries, step) 182 | 183 | eval_loss = float(eval_loss / eval_counter) 184 | 185 | # Calculate PCC & DOA 186 | pcc, doa = dh.evaluation(true_labels, predicted_scores) 187 | # Calculate RMSE 188 | rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5 189 | r2 = r2_score(true_labels, predicted_scores) 190 | 191 | return eval_loss, pcc, doa, rmse, r2 192 | 193 | # Generate batches 194 | batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)), 195 | args.batch_size, args.epochs) 196 | 197 | num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1 198 | 199 | # Training loop. For each batch... 200 | for batch_train in batches_train: 201 | x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train) 202 | train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train) 203 | current_step = tf.train.global_step(sess, tarnn.global_step) 204 | 205 | if current_step % args.evaluate_steps == 0: 206 | logger.info("\nEvaluation:") 207 | eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val, 208 | writer=validation_summary_writer) 209 | logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}" 210 | .format(eval_loss, pcc, doa, rmse, r2)) 211 | best_saver.handle(rmse, sess, current_step) 212 | if current_step % args.checkpoint_steps == 0: 213 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 214 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 215 | logger.info("Saved model checkpoint to {0}\n".format(path)) 216 | if current_step % num_batches_per_epoch == 0: 217 | current_epoch = current_step // num_batches_per_epoch 218 | logger.info("Epoch {0} has finished!".format(current_epoch)) 219 | 220 | logger.info("All Done.") 221 | 222 | 223 | if __name__ == '__main__': 224 | train_tarnn() -------------------------------------------------------------------------------- /TF/utils/checkmate.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import glob 6 | import json 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | class BestCheckpointSaver(object): 12 | """Maintains a directory containing only the best n checkpoints 13 | Inside the directory is a best_checkpoints JSON file containing a dictionary 14 | mapping of the best checkpoint filepaths to the values by which the checkpoints 15 | are compared. Only the best n checkpoints are contained in the directory and JSON file. 16 | This is a light-weight wrapper class only intended to work in simple, 17 | non-distributed settings. It is not intended to work with the tf.Estimator 18 | framework. 
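    Typical use in this repo (mirroring the train_*.py scripts): construct it with
    `BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False)`, call
    `handle(rmse, sess, current_step)` after each validation pass, and later retrieve the
    retained best checkpoint with the module-level `get_best_checkpoint()` for testing.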
19 | """ 20 | def __init__(self, save_dir, num_to_keep=1, maximize=True, saver=None): 21 | """Creates a `BestCheckpointSaver` 22 | `BestCheckpointSaver` acts as a wrapper class around a `tf.train.Saver` 23 | Args: 24 | save_dir: The directory in which the checkpoint files will be saved 25 | num_to_keep: The number of best checkpoint files to retain 26 | maximize: Define 'best' values to be the highest values. For example, 27 | set this to True if selecting for the checkpoints with the highest 28 | given accuracy. Or set to False to select for checkpoints with the 29 | lowest given error rate. 30 | saver: A `tf.train.Saver` to use for saving checkpoints. A default 31 | `tf.train.Saver` will be created if none is provided. 32 | """ 33 | self._num_to_keep = num_to_keep 34 | self._save_dir = save_dir 35 | self._save_path = os.path.join(save_dir, 'model') 36 | self._maximize = maximize 37 | self._saver = saver if saver else tf.train.Saver( 38 | max_to_keep=None, 39 | save_relative_paths=True 40 | ) 41 | 42 | if not os.path.exists(save_dir): 43 | os.makedirs(save_dir) 44 | self.best_checkpoints_file = os.path.join(save_dir, 'best_checkpoints') 45 | 46 | def handle(self, value, sess, global_step): 47 | """Updates the set of best checkpoints based on the given result. 48 | Args: 49 | value: The value by which to rank the checkpoint. 50 | sess: A tf.Session to use to save the checkpoint 51 | global_step: The global step 52 | """ 53 | current_ckpt = 'model-{}'.format(global_step) 54 | value = float(value) 55 | if not os.path.exists(self.best_checkpoints_file): 56 | self._save_best_checkpoints_file({current_ckpt: value}) 57 | self._saver.save(sess, self._save_path, global_step) 58 | return 59 | 60 | best_checkpoints = self._load_best_checkpoints_file() 61 | 62 | if len(best_checkpoints) < self._num_to_keep: 63 | best_checkpoints[current_ckpt] = value 64 | self._save_best_checkpoints_file(best_checkpoints) 65 | self._saver.save(sess, self._save_path, global_step) 66 | return 67 | 68 | if self._maximize: 69 | should_save = not all(current_best >= value 70 | for current_best in best_checkpoints.values()) 71 | else: 72 | should_save = not all(current_best <= value 73 | for current_best in best_checkpoints.values()) 74 | if should_save: 75 | best_checkpoint_list = self._sort(best_checkpoints) 76 | 77 | worst_checkpoint = os.path.join(self._save_dir, 78 | best_checkpoint_list.pop(-1)[0]) 79 | self._remove_outdated_checkpoint_files(worst_checkpoint) 80 | self._update_internal_saver_state(best_checkpoint_list) 81 | 82 | best_checkpoints = dict(best_checkpoint_list) 83 | best_checkpoints[current_ckpt] = value 84 | self._save_best_checkpoints_file(best_checkpoints) 85 | 86 | self._saver.save(sess, self._save_path, global_step) 87 | 88 | def _save_best_checkpoints_file(self, updated_best_checkpoints): 89 | with open(self.best_checkpoints_file, 'w') as f: 90 | json.dump(updated_best_checkpoints, f, indent=3) 91 | 92 | def _remove_outdated_checkpoint_files(self, worst_checkpoint): 93 | os.remove(os.path.join(self._save_dir, 'checkpoint')) 94 | for ckpt_file in glob.glob(worst_checkpoint + '.*'): 95 | os.remove(ckpt_file) 96 | 97 | def _update_internal_saver_state(self, best_checkpoint_list): 98 | best_checkpoint_files = [ 99 | (ckpt[0], np.inf) # TODO: Try to use actual file timestamp 100 | for ckpt in best_checkpoint_list 101 | ] 102 | self._saver.set_last_checkpoints_with_time(best_checkpoint_files) 103 | 104 | def _load_best_checkpoints_file(self): 105 | with open(self.best_checkpoints_file, 'r') as f: 106 | 
best_checkpoints = json.load(f) 107 | return best_checkpoints 108 | 109 | def _sort(self, best_checkpoints): 110 | best_checkpoints = [ 111 | (ckpt, best_checkpoints[ckpt]) 112 | for ckpt in sorted(best_checkpoints, 113 | key=best_checkpoints.get, 114 | reverse=self._maximize) 115 | ] 116 | return best_checkpoints 117 | 118 | 119 | def get_best_checkpoint(best_checkpoint_dir, select_maximum_value=True): 120 | """ 121 | Returns filepath to the best checkpoint 122 | Reads the best_checkpoints file in the best_checkpoint_dir directory. 123 | Returns the filepath in the best_checkpoints file associated with 124 | the highest value if select_maximum_value is True, or the filepath 125 | associated with the lowest value if select_maximum_value is False. 126 | Args: 127 | best_checkpoint_dir: Directory containing best_checkpoints JSON file 128 | select_maximum_value: If True, select the filepath associated 129 | with the highest value. Otherwise, select the filepath associated 130 | with the lowest value. 131 | Returns: 132 | The full path to the best checkpoint file 133 | """ 134 | best_checkpoints_file = os.path.join(best_checkpoint_dir, 'best_checkpoints') 135 | assert os.path.exists(best_checkpoints_file) 136 | with open(best_checkpoints_file, 'r') as f: 137 | best_checkpoints = json.load(f) 138 | best_checkpoints = [ 139 | ckpt for ckpt in sorted(best_checkpoints, 140 | key=best_checkpoints.get, 141 | reverse=select_maximum_value) 142 | ] 143 | return os.path.join(os.path.abspath(best_checkpoint_dir), best_checkpoints[0]) 144 | -------------------------------------------------------------------------------- /TF/utils/param_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parameter_parser(): 5 | """ 6 | A method to parse command line parameters. 7 | The default hyperparameters give good results without cross-validation. 8 | """ 9 | parser = argparse.ArgumentParser(description="Run Model.") 10 | 11 | # Data Parameters 12 | parser.add_argument("--train-file", 13 | nargs="?", 14 | default="../../data/Train_sample.json", 15 | help="Training data.") 16 | 17 | parser.add_argument("--validation-file", 18 | nargs="?", 19 | default="../../data/Validation_sample.json", 20 | help="Validation data.") 21 | 22 | parser.add_argument("--test-file", 23 | nargs="?", 24 | default="../../data/Test_sample.json", 25 | help="Testing data.") 26 | 27 | parser.add_argument("--metadata-file", 28 | nargs="?", 29 | default="../../data/metadata.tsv", 30 | help="Metadata file for embedding visualization.") 31 | 32 | parser.add_argument("--word2vec-file", 33 | nargs="?", 34 | default="../../data/word2vec_300.txt", 35 | help="Word2vec file for embedding characters. (the dim needs to be the same as the embedding dim).") 36 | 37 | # Model Hyperparameters 38 | parser.add_argument("--pad-seq-len", 39 | type=list, 40 | default=[350, 15, 10], 41 | help="Padding Sequence length of data. (depends on the data)") 42 | 43 | parser.add_argument("--embedding-type", 44 | type=int, 45 | default=1, 46 | help="The embedding type. (default: 1)") 47 | 48 | parser.add_argument("--embedding-dim", 49 | type=int, 50 | default=300, 51 | help="Dimensionality of character embedding. (default: 300)") 52 | 53 | parser.add_argument("--attention-type", 54 | nargs="?", 55 | default="normal", 56 | help="The attention type. 
('normal', 'cosine', 'mlp', 'islet')") 57 | 58 | parser.add_argument("--attention-dim", 59 | type=int, 60 | default=200, 61 | help="Dimensionality of Attention Neurons. (default: 200)") 62 | 63 | parser.add_argument("--filter-sizes", 64 | type=list, 65 | default=[3, 3], 66 | help="Filter sizes. (default: [3, 3])") 67 | 68 | parser.add_argument("--num-filters", 69 | type=list, 70 | default=[200, 400], 71 | help="Number of filters per filter size. (default: [200, 400])") 72 | 73 | parser.add_argument("--pooling-size", 74 | type=int, 75 | default=3, 76 | help="Pooling size. (default: 3)") 77 | 78 | parser.add_argument("--rnn-dim", 79 | type=int, 80 | default=128, 81 | help="Dimensionality for RNN Neurons. (default: 128)") 82 | 83 | parser.add_argument("--rnn-type", 84 | nargs="?", 85 | default="GRU", 86 | help="Type of RNN Cell. ('RNN', 'LSTM', 'GRU')") 87 | 88 | parser.add_argument("--rnn-layers", 89 | type=int, 90 | default=1, 91 | help="Number of RNN Layers. (default: 1)") 92 | 93 | parser.add_argument("--fc-dim", 94 | type=int, 95 | default=512, 96 | help="Dimensionality for FC Neurons. (default: 512)") 97 | 98 | parser.add_argument("--dropout-rate", 99 | type=float, 100 | default=0.5, 101 | help="Dropout keep probability. (default: 0.5)") 102 | 103 | # Training Parameters 104 | parser.add_argument("--epochs", 105 | type=int, 106 | default=30, 107 | help="Number of training epochs. (default: 30)") 108 | 109 | parser.add_argument("--batch-size", 110 | type=int, 111 | default=32, 112 | help="Batch Size. (default: 32)") 113 | 114 | parser.add_argument("--learning-rate", 115 | type=float, 116 | default=0.001, 117 | help="Learning rate. (default: 0.001)") 118 | 119 | parser.add_argument("--decay-rate", 120 | type=float, 121 | default=0.95, 122 | help="Rate of decay for learning rate. (default: 0.95)") 123 | 124 | parser.add_argument("--decay-steps", 125 | type=int, 126 | default=500, 127 | help="How many steps before decaying the learning rate. (default: 500)") 128 | 129 | parser.add_argument("--evaluate-steps", 130 | type=int, 131 | default=10, 132 | help="Evaluate the model on the validation set after this many steps. (default: 10)") 133 | 134 | parser.add_argument("--norm-ratio", 135 | type=float, 136 | default=1.25, 137 | help="The ratio of the sum of gradient norms of trainable variables. (default: 1.25)") 138 | 139 | parser.add_argument("--l2-lambda", 140 | type=float, 141 | default=0.0, 142 | help="L2 regularization lambda. (default: 0.0)") 143 | 144 | parser.add_argument("--checkpoint-steps", 145 | type=int, 146 | default=10, 147 | help="Save the model after this many steps. (default: 10)") 148 | 149 | parser.add_argument("--num-checkpoints", 150 | type=int, 151 | default=10, 152 | help="Number of checkpoints to store. (default: 10)") 153 | 154 | # Misc Parameters 155 | parser.add_argument("--allow-soft-placement", 156 | type=bool, 157 | default=True, 158 | help="Allow soft device placement. (default: True)") 159 | 160 | parser.add_argument("--log-device-placement", 161 | type=bool, 162 | default=False, 163 | help="Log placement of ops on devices. (default: False)") 164 | 165 | parser.add_argument("--gpu-options-allow-growth", 166 | type=bool, 167 | default=True, 168 | help="Allow GPU memory growth. 
(default: True)") 169 | 170 | return parser.parse_args() -------------------------------------------------------------------------------- /TMLA/DTR/test_dtr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.externals import joblib 11 | from sklearn.metrics import mean_squared_error, r2_score 12 | 13 | logger = dp.logger_fn("dtr-log", "dtr/test-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TEST_DIR = '../../data/Test_BOW_sample.json' 17 | MODEL_DIR = 'dtr_model.m' 18 | 19 | 20 | def test(): 21 | logger.info("Loading data...") 22 | 23 | x_test, y_test = dp.load_data(TEST_DIR) 24 | 25 | logger.info("Loading model...") 26 | model = joblib.load(MODEL_DIR) 27 | 28 | logger.info("Predicting...") 29 | y_pred = model.predict(x_test) 30 | 31 | logger.info("Calculate Metrics...") 32 | pcc, doa = dp.evaluation(y_test, y_pred) 33 | rmse = mean_squared_error(y_test, y_pred) ** 0.5 34 | r2 = r2_score(y_test, y_pred) 35 | 36 | logger.info("DTR: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2)) 37 | 38 | logger.info("All Done.") 39 | 40 | 41 | if __name__ == '__main__': 42 | test() 43 | -------------------------------------------------------------------------------- /TMLA/DTR/train_dtr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.tree import DecisionTreeRegressor 11 | from sklearn.externals import joblib 12 | 13 | logger = dp.logger_fn("dtr-log", "dtr/train-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json' 17 | MODEL_DIR = 'dtr_model.m' 18 | 19 | 20 | def train(): 21 | # Load data 22 | logger.info("Loading data...") 23 | 24 | x_train, y_train = dp.load_data(TRAININGSET_DIR) 25 | 26 | logger.info("Finish building BOW.") 27 | 28 | model = DecisionTreeRegressor(criterion="mse", splitter="best") 29 | 30 | logger.info("Training model...") 31 | model.fit(x_train, y_train) 32 | 33 | logger.info("Finish training. 
Saving model...") 34 | joblib.dump(model, MODEL_DIR) 35 | 36 | 37 | if __name__ == '__main__': 38 | train() 39 | -------------------------------------------------------------------------------- /TMLA/LR/test_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.externals import joblib 11 | from sklearn.metrics import mean_squared_error, r2_score 12 | 13 | logger = dp.logger_fn("lr-log", "lr/test-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TEST_DIR = '../../data/Test_BOW_sample.json' 17 | MODEL_DIR = 'lr_model.m' 18 | 19 | 20 | def test(): 21 | logger.info("Loading data...") 22 | 23 | x_test, y_test = dp.load_data(TEST_DIR) 24 | 25 | logger.info("Loading model...") 26 | model = joblib.load(MODEL_DIR) 27 | 28 | logger.info("Predicting...") 29 | y_pred = model.predict(x_test) 30 | 31 | logger.info("Calculate Metrics...") 32 | pcc, doa = dp.evaluation(y_test, y_pred) 33 | rmse = mean_squared_error(y_test, y_pred) ** 0.5 34 | r2 = r2_score(y_test, y_pred) 35 | 36 | logger.info("LR: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2)) 37 | 38 | logger.info("All Done.") 39 | 40 | 41 | if __name__ == '__main__': 42 | test() 43 | -------------------------------------------------------------------------------- /TMLA/LR/train_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.externals import joblib 12 | 13 | logger = dp.logger_fn("lr-log", "lr/train-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json' 17 | MODEL_DIR = 'lr_model.m' 18 | 19 | 20 | def train(): 21 | # Load data 22 | logger.info("Loading data...") 23 | 24 | x_train, y_train = dp.load_data(TRAININGSET_DIR) 25 | 26 | logger.info("Finish building BOW.") 27 | 28 | model = LinearRegression() 29 | 30 | logger.info("Training model...") 31 | model.fit(x_train, y_train) 32 | 33 | logger.info("Finish training. 
Saving model...") 34 | joblib.dump(model, MODEL_DIR) 35 | 36 | 37 | if __name__ == '__main__': 38 | train() 39 | -------------------------------------------------------------------------------- /TMLA/SVM/test_svm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.externals import joblib 11 | from sklearn.metrics import mean_squared_error, r2_score 12 | 13 | logger = dp.logger_fn("svm-log", "svm/test-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TEST_DIR = '../../data/Test_BOW_sample.json' 17 | MODEL_DIR = 'svm_model.m' 18 | 19 | 20 | def test(): 21 | logger.info("Loading data...") 22 | 23 | x_test, y_test = dp.load_data(TEST_DIR) 24 | 25 | logger.info("Loading model...") 26 | model = joblib.load(MODEL_DIR) 27 | 28 | logger.info("Predicting...") 29 | y_pred = model.predict(x_test) 30 | 31 | logger.info("Calculate Metrics...") 32 | pcc, doa = dp.evaluation(y_test, y_pred) 33 | rmse = mean_squared_error(y_test, y_pred) ** 0.5 34 | r2 = r2_score(y_test, y_pred) 35 | 36 | logger.info("SVM: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2)) 37 | 38 | logger.info("All Done.") 39 | 40 | 41 | if __name__ == '__main__': 42 | test() 43 | -------------------------------------------------------------------------------- /TMLA/SVM/train_svm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | from utils import data_process as dp 10 | from sklearn.svm import SVR 11 | from sklearn.externals import joblib 12 | 13 | logger = dp.logger_fn("svm-log", "svm/train-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json' 17 | MODEL_DIR = 'svm_model.m' 18 | 19 | 20 | def train(): 21 | # Load data 22 | logger.info("Loading data...") 23 | 24 | x_train, y_train = dp.load_data(TRAININGSET_DIR) 25 | 26 | logger.info("Finish building BOW.") 27 | 28 | model = SVR() 29 | 30 | logger.info("Training model...") 31 | model.fit(x_train, y_train) 32 | 33 | logger.info("Finish training. 
Saving model...") 34 | joblib.dump(model, MODEL_DIR) 35 | 36 | 37 | if __name__ == '__main__': 38 | train() 39 | -------------------------------------------------------------------------------- /TMLA/XGBoost/test_xgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | import xgboost as xgb 10 | from utils import data_process as dp 11 | from sklearn.externals import joblib 12 | from sklearn.metrics import mean_squared_error, r2_score 13 | 14 | logger = dp.logger_fn("xgb-log", "xgb/test-{0}.log".format(time.asctime())) 15 | 16 | # Data Parameters 17 | TEST_DIR = '../../data/Test_BOW_sample.json' 18 | MODEL_DIR = 'xgb_model.m' 19 | 20 | 21 | def test(): 22 | logger.info("Loading data...") 23 | 24 | x_test, y_test = dp.load_data(TEST_DIR) 25 | d_test = xgb.DMatrix(x_test, label=y_test) 26 | 27 | logger.info("Loading model...") 28 | model = joblib.load(MODEL_DIR) 29 | 30 | logger.info("Predicting...") 31 | y_pred = model.predict(d_test) 32 | 33 | logger.info("Calculate Metrics...") 34 | pcc, doa = dp.evaluation(y_test, y_pred) 35 | rmse = mean_squared_error(y_test, y_pred) ** 0.5 36 | r2 = r2_score(y_test, y_pred) 37 | 38 | logger.info("XGBoost: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}".format(pcc, doa, rmse, r2)) 39 | 40 | logger.info("All Done.") 41 | 42 | 43 | if __name__ == '__main__': 44 | test() 45 | -------------------------------------------------------------------------------- /TMLA/XGBoost/train_xgb.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import sys 5 | import time 6 | 7 | sys.path.append('../') 8 | 9 | import xgboost as xgb 10 | from utils import data_process as dp 11 | from sklearn.externals import joblib 12 | 13 | logger = dp.logger_fn("xgb-log", "xgb/train-{0}.log".format(time.asctime())) 14 | 15 | # Data Parameters 16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json' 17 | VALIDATION_DIR = '../../data/Validation_BOW_sample.json' 18 | MODEL_DIR = 'xgb_model.m' 19 | 20 | 21 | def train(): 22 | # Load data 23 | logger.info("Loading data...") 24 | 25 | x_train, y_train = dp.load_data(TRAININGSET_DIR) 26 | x_val, y_val = dp.load_data(VALIDATION_DIR) 27 | 28 | d_train = xgb.DMatrix(x_train, label=y_train) 29 | d_val = xgb.DMatrix(x_val, label=y_val) 30 | watchlist = [(d_train, 'train'), (d_val, 'valid')] 31 | logger.info("Finish building BOW.") 32 | 33 | params_xgb = { 34 | 'objective': 'reg:linear', 35 | 'eta': 0.001, 36 | 'max_depth': 10, 37 | 'eval_metric': 'rmse' 38 | } 39 | # TODO 40 | model = xgb.train(params_xgb, d_train, 10000, evals=watchlist, early_stopping_rounds=20, verbose_eval=10) 41 | logger.info("Training model...") 42 | 43 | logger.info("Finish training. 
Saving model...") 44 | joblib.dump(model, MODEL_DIR) 45 | 46 | 47 | if __name__ == '__main__': 48 | train() 49 | -------------------------------------------------------------------------------- /TMLA/utils/data_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'Randolph' 3 | 4 | import os 5 | import json 6 | import math 7 | import logging 8 | import pickle 9 | import numpy as np 10 | from tqdm import tqdm 11 | from scipy import stats 12 | 13 | 14 | def logger_fn(name, input_file, level=logging.INFO): 15 | tf_logger = logging.getLogger(name) 16 | tf_logger.setLevel(level) 17 | log_dir = os.path.dirname(input_file) 18 | if not os.path.exists(log_dir): 19 | os.makedirs(log_dir) 20 | fh = logging.FileHandler(input_file, mode='w') 21 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 22 | fh.setFormatter(formatter) 23 | tf_logger.addHandler(fh) 24 | return tf_logger 25 | 26 | 27 | def create_word_dict(input_file, pickle_file): 28 | with open(input_file, 'r') as fin, open(pickle_file, 'wb') as handle: 29 | word_dict = dict() 30 | word_num = 0 31 | for eachline in fin: 32 | line = json.loads(eachline) 33 | words = line['content'] + line['question'] + line['pos_text'] 34 | for word in words: 35 | if word not in word_dict.keys(): 36 | word_dict[word] = word_num 37 | word_num = word_num + 1 38 | # Save Word Dict 39 | pickle.dump(word_dict, handle) 40 | 41 | 42 | def create_bow_feature(input_file, pickle_file, output_file): 43 | with open(input_file, 'r') as fin, open(pickle_file, 'rb') as handle, open(output_file, 'w') as fout: 44 | word_dict = pickle.load(handle) 45 | word_num = len(word_dict.keys()) 46 | print(word_num) 47 | 48 | for eachline in tqdm(fin): 49 | line = json.loads(eachline) 50 | words = line['content'] + line['question'] + line['pos_text'] 51 | feature = [0] * word_num 52 | for word in words: 53 | feature[word_dict[word]] += 1 54 | data_record = { 55 | 'id': line['id'], 56 | 'feature': feature, 57 | 'diff': line['diff'] 58 | } 59 | fout.write(json.dumps(data_record, ensure_ascii=False) + '\n') 60 | 61 | 62 | def load_data(data_file): 63 | x_data, y_data = [], [] 64 | with open(data_file, 'r') as f_train: 65 | for eachline in f_train: 66 | line = json.loads(eachline) 67 | x_data.append(list(map(float, line['feature']))) 68 | y_data.append(float(line['diff'])) 69 | 70 | x_data = np.array(x_data) 71 | y_data = np.array(y_data) 72 | 73 | return x_data, y_data 74 | 75 | 76 | def evaluation(test_y, pred_y): 77 | # compute pcc 78 | pcc, _ = stats.pearsonr(pred_y, test_y) 79 | if math.isnan(pcc): 80 | print('ERROR: PCC=nan', test_y, pred_y) 81 | # compute doa 82 | n = 0 83 | correct_num = 0 84 | for i in range(len(test_y) - 1): 85 | for j in range(i + 1, len(test_y)): 86 | if (test_y[i] > test_y[j]) and (pred_y[i] > pred_y[j]): 87 | correct_num += 1 88 | elif (test_y[i] == test_y[j]) and (pred_y[i] == pred_y[j]): 89 | continue 90 | elif (test_y[i] < test_y[j]) and (pred_y[i] < pred_y[j]): 91 | correct_num += 1 92 | n += 1 93 | if n == 0: 94 | print(test_y) 95 | return -1, -1 96 | doa = correct_num / n 97 | return pcc, doa 98 | 99 | 100 | if __name__ == '__main__': 101 | # create_word_dict('../../data/data.json', '../../data/word.pickle') 102 | # create_bow_feature('../../data/Train_sample.json', '../../data/word.pickle', '../../data/Train_BOW_sample.json') 103 | # create_bow_feature('../../data/Validation_sample.json', '../../data/word.pickle', 
'../../data/Validation_BOW_sample.json') 104 | # create_bow_feature('../../data/Test_sample.json', '../../data/word.pickle', '../../data/Test_BOW_sample.json') 105 | pass 106 | -------------------------------------------------------------------------------- /Usage-PyTorch.md: -------------------------------------------------------------------------------- 1 | # Usage-PyTorch 2 | 3 | ## Options 4 | 5 | ### Input and output options 6 | 7 | ``` 8 | --train-file STR Training file. Default is `data/Train_sample.json`. 9 | --validation-file STR Validation file. Default is `data/Validation_sample.json`. 10 | --test-file STR Testing file. Default is `data/Test_sample.json`. 11 | --word2vec-file STR Word2vec model file. Default is `data/word2vec_300.txt`. 12 | ``` 13 | 14 | ### Model option 15 | 16 | ``` 17 | --pad-seq-len LIST Padding Sequence length of data. Depends on data. 18 | --embedding-type INT The embedding type. Default is 1. 19 | --embedding-dim INT Dim of character embedding. Default is 300. 20 | --filter-sizes LIST Filter sizes. Default is [3,3]. 21 | --num-filters LIST Number of filters per filter size. Default is [200,400]. 22 | --pooling-size INT Pooling size. Default is 3. 23 | --lstm-dim INT Dim of LSTM neurons. Default is 256. 24 | --lstm-layers INT Number of LSTM layers. Default is 1. 25 | --attention-type STR The attention type. Default is 'normal'. 26 | --attention-dim INT Dim of Attention neurons. Default is 200. 27 | --fc-dim INT Dim of FC neurons. Default is 512. 28 | --dropout-rate FLOAT Dropout keep probability. Default is 0.5. 29 | ``` 30 | 31 | ### Training option 32 | 33 | ``` 34 | --epochs INT Number of epochs. Default is 30. 35 | --batch-size INT Batch size. Default is 32. 36 | --learning-rate FLOAT Adam learning rate. Default is 0.001. 37 | --decay-rate FLOAT Rate of decay for learning rate. Default is 0.95. 38 | --decay-steps INT How many steps before decaying lr. Default is 500. 39 | --evaluate-steps INT How many steps to evaluate val set. Default is 10. 40 | --l2-lambda FLOAT L2 regularization lambda. Default is 0.0. 41 | --checkpoint-steps INT How many steps to save model. Default is 10. 42 | --num-checkpoints INT Number of checkpoints to store. Default is 10. 43 | ``` 44 | 45 | ## Training 46 | 47 | The following commands train a model (using TARNN as an example). 48 | 49 | ```bash 50 | $ python3 train_tarnn.py 51 | ``` 52 | 53 | Train a model for 10 epochs with a batch size of 128. 54 | 55 | ```bash 56 | $ python3 train_tarnn.py --epochs 10 --batch-size 128 57 | ``` 58 | 59 | When the program starts, you will see: 60 | 61 | ![](https://live.staticflickr.com/65535/49767412868_ca51f1eb17_o.png) 62 | 63 | **You need to choose Training or Restore. (T for Training and R for Restore)** 64 | 65 | After training, you will get the `/logs` and `/runs` folders. 66 | 67 | - `/logs` folder saves the log info file. 68 | - `/runs` folder saves the checkpoints. 69 | 70 | It should be like this: 71 | 72 | ```text 73 | . 
74 | ├── logs 75 | ├── runs 76 | │   └── 1586759461 [a 10-digit number] 77 | │   ├── bestcheckpoints 78 | │   ├── checkpoints 79 | │   ├── embedding 80 | │   └── summaries 81 | ├── test_tarnn.py 82 | ├── text_tarnn.py 83 | └── train_tarnn.py 84 | ``` 85 | 86 | **The program names and identifies the model by its timestamp (a 10-digit number, like 1586759461).** 87 | 88 | ## Restore 89 | 90 | When training stops for some reason and you want to resume it, do the following: 91 | 92 | When the program starts, you will see: 93 | 94 | ![](https://live.staticflickr.com/65535/49767947506_cbcc0ecfd1_o.png) 95 | 96 | **You need to input R for Restore.** 97 | 98 | Then you will be asked to give the model name (a 10-digit number, like 1586759461): 99 | 100 | ![](https://live.staticflickr.com/65535/49767968391_247d21d0bb_o.png) 101 | 102 | The model will then continue training from the last checkpoint. 103 | 104 | ## Test 105 | 106 | The following command tests a model. 107 | 108 | ```bash 109 | $ python3 test_tarnn.py 110 | ``` 111 | 112 | Then you will be asked to give the model name (a 10-digit number, like 1586759461): 113 | 114 | ![](https://live.staticflickr.com/65535/49767454533_6af8053c5f_o.png) 115 | 116 | And you can choose to use the best model or the latest model **(B for Best, L for Latest)**: 117 | 118 | ![](https://live.staticflickr.com/65535/49768319867_0a9fc9cafd_o.png) 119 | 120 | Finally, you will get the `predictions.json` file under the `/output` folder; it should look like this: 121 | 122 | ```text 123 | . 124 | ├── graph 125 | ├── logs 126 | ├── output 127 | │   └── 1586759461 128 | │   └── predictions.json 129 | ├── runs 130 | │   └── 1586759461 131 | │   ├── bestcheckpoints 132 | │   ├── checkpoints 133 | │   ├── embedding 134 | │   └── summaries 135 | ├── test_tarnn.py 136 | ├── text_tarnn.py 137 | └── train_tarnn.py 138 | ``` 139 | 140 | -------------------------------------------------------------------------------- /Usage-TF.md: -------------------------------------------------------------------------------- 1 | # Usage-TF 2 | 3 | ## Options 4 | 5 | ### Input and output options 6 | 7 | ``` 8 | --train-file STR Training file. Default is `data/Train_sample.json`. 9 | --validation-file STR Validation file. Default is `data/Validation_sample.json`. 10 | --test-file STR Testing file. Default is `data/Test_sample.json`. 11 | --word2vec-file STR Word2vec model file. Default is `data/word2vec_300.txt`. 12 | ``` 13 | 14 | ### Model option 15 | 16 | ``` 17 | --pad-seq-len LIST Padding Sequence length of data. Depends on data. 18 | --embedding-type INT The embedding type. Default is 1. 19 | --embedding-dim INT Dim of character embedding. Default is 300. 20 | --filter-sizes LIST Filter sizes. Default is [3,3]. 21 | --num-filters LIST Number of filters per filter size. Default is [200,400]. 22 | --pooling-size INT Pooling size. Default is 3. 23 | --lstm-dim INT Dim of LSTM neurons. Default is 256. 24 | --lstm-layers INT Number of LSTM layers. Default is 1. 25 | --attention-type STR The attention type. Default is 'normal'. 26 | --attention-dim INT Dim of Attention neurons. Default is 200. 27 | --fc-dim INT Dim of FC neurons. Default is 512. 28 | --dropout-rate FLOAT Dropout keep probability. Default is 0.5. 29 | ``` 30 | 31 | ### Training option 32 | 33 | ``` 34 | --epochs INT Number of epochs. Default is 30. 35 | --batch-size INT Batch size. Default is 32. 36 | --learning-rate FLOAT Adam learning rate. Default is 0.001. 
37 | --decay-rate FLOAT Rate of decay for learning rate. Default is 0.95. 38 | --decay-steps INT How many steps before decaying lr. Default is 500. 39 | --evaluate-steps INT How many steps to evaluate val set. Default is 10. 40 | --l2-lambda FLOAT L2 regularization lambda. Default is 0.0. 41 | --checkpoint-steps INT How many steps to save model. Default is 10. 42 | --num-checkpoints INT Number of checkpoints to store. Default is 10. 43 | ``` 44 | 45 | ## Training 46 | 47 | The following commands train a model (using TARNN as an example). 48 | 49 | ```bash 50 | $ python3 train_tarnn.py 51 | ``` 52 | 53 | Train a model for 10 epochs with a batch size of 128. 54 | 55 | ```bash 56 | $ python3 train_tarnn.py --epochs 10 --batch-size 128 57 | ``` 58 | 59 | When the program starts, you will see: 60 | 61 | ![](https://live.staticflickr.com/65535/49767412868_ca51f1eb17_o.png) 62 | 63 | **You need to choose Training or Restore. (T for Training and R for Restore)** 64 | 65 | After training, you will get the `/logs` and `/runs` folders. 66 | 67 | - `/logs` folder saves the log info file. 68 | - `/runs` folder saves the checkpoints. 69 | 70 | It should be like this: 71 | 72 | ```text 73 | . 74 | ├── logs 75 | ├── runs 76 | │   └── 1586759461 [a 10-digit number] 77 | │   ├── bestcheckpoints 78 | │   ├── checkpoints 79 | │   ├── embedding 80 | │   └── summaries 81 | ├── test_tarnn.py 82 | ├── text_tarnn.py 83 | └── train_tarnn.py 84 | ``` 85 | 86 | **The program names and identifies the model by its timestamp (a 10-digit number, like 1586759461).** 87 | 88 | ## Restore 89 | 90 | When training stops for some reason and you want to resume it, do the following: 91 | 92 | When the program starts, you will see: 93 | 94 | ![](https://live.staticflickr.com/65535/49767947506_cbcc0ecfd1_o.png) 95 | 96 | **You need to input R for Restore.** 97 | 98 | Then you will be asked to give the model name (a 10-digit number, like 1586759461): 99 | 100 | ![](https://live.staticflickr.com/65535/49767968391_247d21d0bb_o.png) 101 | 102 | The model will then continue training from the last checkpoint. 103 | 104 | ## Test 105 | 106 | The following command tests a model. 107 | 108 | ```bash 109 | $ python3 test_tarnn.py 110 | ``` 111 | 112 | Then you will be asked to give the model name (a 10-digit number, like 1586759461): 113 | 114 | ![](https://live.staticflickr.com/65535/49767454533_6af8053c5f_o.png) 115 | 116 | And you can choose to use the best model or the latest model **(B for Best, L for Latest)**: 117 | 118 | ![](https://live.staticflickr.com/65535/49768319867_0a9fc9cafd_o.png) 119 | 120 | Finally, you will get the `predictions.json` file under the `/output` folder; it should look like this: 121 | 122 | ```text 123 | . 
124 | ├── graph 125 | ├── logs 126 | ├── output 127 | │   └── 1586759461 128 | │   └── predictions.json 129 | ├── runs 130 | │   └── 1586759461 131 | │   ├── bestcheckpoints 132 | │   ├── checkpoints 133 | │   ├── embedding 134 | │   └── summaries 135 | ├── test_tarnn.py 136 | ├── text_tarnn.py 137 | └── train_tarnn.py 138 | ``` 139 | 140 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.6.0 2 | tensorflow==1.15.0 3 | tensorflow_tensorboard==1.5.1 4 | tensorboard==1.15.0 5 | xgboost==1.2.0 6 | matplotlib==2.2.3 7 | tflearn==0.3.2 8 | gensim==3.8.3 9 | numpy==1.16.2 10 | Pillow==5.4.1 11 | python_gflags==3.1.2 12 | scikit_learn==0.19.1 13 | tqdm==4.49.0 14 | google-compute-engine==2.8.13 --------------------------------------------------------------------------------
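
Quick start (editor's sketch, not part of the original repository files): the commands below only summarize the workflow already described in `Usage-TF.md` and `requirements.txt` above, assuming Python 3.6 as pinned in `.travis.yml`. Note that the default `--word2vec-file` (`data/word2vec_300.txt`) appears not to be shipped with the sample data (`*.txt` is gitignored), so you would need to supply your own pre-trained word2vec file or point that flag elsewhere.

```bash
# Install the pinned dependencies (TensorFlow 1.15, PyTorch 1.6, XGBoost 1.2, ...)
pip install -r requirements.txt

# Train the TF implementation of TARNN on the bundled sample data
# (answer "T" for Training when prompted)
cd TF/TARNN
python3 train_tarnn.py --epochs 10 --batch-size 128

# Evaluate a finished run
# (enter the 10-digit run timestamp when prompted, e.g. 1586759461)
python3 test_tarnn.py
```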