├── .gitignore
├── .travis.yml
├── LICENSE
├── PyTorch
    ├── C-MIDP
    │   ├── layers.py
    │   ├── test.py
    │   └── train.py
    ├── H-MIDP
    │   ├── layers.py
    │   ├── test.py
    │   └── train.py
    ├── R-MIDP
    │   ├── layers.py
    │   ├── test.py
    │   └── train.py
    ├── TARNN
    │   ├── layers.py
    │   ├── test.py
    │   └── train.py
    └── utils
    │   ├── checkmate.py
    │   ├── data_helpers.py
    │   └── param_parser.py
├── README.md
├── TF
    ├── C-MIDP
    │   ├── test_cmidp.py
    │   ├── text_cmidp.py
    │   └── train_cmidp.py
    ├── H-MIDP
    │   ├── test_hmidp.py
    │   ├── text_hmidp.py
    │   └── train_hmidp.py
    ├── R-MIDP
    │   ├── test_rmidp.py
    │   ├── text_rmidp.py
    │   └── train_rmidp.py
    ├── TARNN
    │   ├── test_tarnn.py
    │   ├── text_tarnn.py
    │   └── train_tarnn.py
    └── utils
    │   ├── checkmate.py
    │   ├── data_helpers.py
    │   └── param_parser.py
├── TMLA
    ├── DTR
    │   ├── test_dtr.py
    │   └── train_dtr.py
    ├── LR
    │   ├── test_lr.py
    │   └── train_lr.py
    ├── SVM
    │   ├── test_svm.py
    │   └── train_svm.py
    ├── XGBoost
    │   ├── test_xgb.py
    │   └── train_xgb.py
    └── utils
    │   └── data_process.py
├── Usage-PyTorch.md
├── Usage-TF.md
├── data
    ├── Test_BOW_sample.json
    ├── Test_sample.json
    ├── Train_BOW_sample.json
    ├── Train_sample.json
    ├── Validation_BOW_sample.json
    └── Validation_sample.json
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | ### Compiled source ###
  2 | *.com
  3 | *.class
  4 | *.dll
  5 | *.exe
  6 | *.o
  7 | *.so
  8 | 
  9 | ### Packages ###
 10 | # it's better to unpack these files and commit the raw source
 11 | # git has its own built in compression methods
 12 | *.7z
 13 | *.dmg
 14 | *.gz
 15 | *.iso
 16 | *.jar
 17 | *.rar
 18 | *.tar
 19 | *.zip
 20 | 
 21 | ### Logs and databases ###
 22 | *.log
 23 | *.sql
 24 | *.sqlite
 25 | 
 26 | ### Mac OS generated files ###
 27 | .DS_Store
 28 | .DS_Store?
 29 | ._*
 30 | .Spotlight-V100
 31 | .Trashes
 32 | ehthumbs.db
 33 | Thumbs.db
 34 | 
 35 | ### JetBrain config files ###
 36 | .idea
 37 | 
 38 | ### Python ###
 39 | # Byte-compiled / optimized / DLL files
 40 | *.npy
 41 | __pycache__/
 42 | *.py[cod]
 43 | *$py.class
 44 | 
 45 | # Distribution / packaging
 46 | .Python
 47 | env/
 48 | build/
 49 | develop-eggs/
 50 | dist/
 51 | downloads/
 52 | eggs/
 53 | .eggs/
 54 | lib/
 55 | lib64/
 56 | parts/
 57 | sdist/
 58 | var/
 59 | *.egg-info/
 60 | .installed.cfg
 61 | *.egg
 62 | 
 63 | # PyInstaller
 64 | #  Usually these files are written by a python script from a template
 65 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 66 | *.manifest
 67 | *.spec
 68 | 
 69 | # Installer logs
 70 | pip-log.txt
 71 | pip-delete-this-directory.txt
 72 | 
 73 | # Unit test / coverage reports
 74 | htmlcov/
 75 | .tox/
 76 | .coverage
 77 | .coverage.*
 78 | .cache
 79 | nosetests.xml
 80 | coverage.xml
 81 | *,cover
 82 | 
 83 | # Translations
 84 | *.mo
 85 | *.pot
 86 | 
 87 | # Sphinx documentation
 88 | docs/_build/
 89 | 
 90 | # PyBuilder
 91 | target/
 92 | 
 93 | ### IPythonNotebook ###
 94 | # Temporary data
 95 | .ipynb_checkpoints/
 96 | 
 97 | ### Current Project ###
 98 | # Data File
 99 | *.txt
100 | *.tsv
101 | *.csv
102 | *.json
103 | *.jpg
104 | *.png
105 | *.pickle
106 | *.xls
107 | *.doc
108 | !/data
109 | !/data/Train_sample.json
110 | !/data/Validation_sample.json
111 | !/data/Test_sample.json
112 | !/data/Train_BOW_sample.json
113 | !/data/Validation_BOW_sample.json
114 | !/data/Test_BOW_sample.json
115 | 
116 | # Model File
117 | *.model
118 | *.pb
119 | runs/
120 | graph/
121 | 
122 | # Analysis File
123 | Data Analysis.md
124 | 
125 | # Log File
126 | logs/
127 | 
128 | # Related Code
129 | temp.py
130 | data/preprocess.py
131 | TF/utils/pairwise_data_helpers.py
132 | TF/TACNN
133 | TF/PARNN
134 | PyTorch/Quesnet/
135 | PyTorch/Others
136 | english_difficulty_prediction_dtr
137 | english_difficulty_prediction_tf
138 | english_difficulty_prediction_pytorch
139 | 
140 | ### Else ###
141 | randolph/
142 | Icon?
143 | *.graffle


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | matrix:
 4 |   include:
 5 |     - python: 3.6
 6 | 
 7 | install:
 8 |   - pip install -r requirements.txt
 9 |   - pip install coveralls
10 | 
11 | before_script:
12 |   - export PYTHONPATH=$PWD
13 | 
14 | script:
15 |   - true # add other tests here
16 |   - coveralls


--------------------------------------------------------------------------------
/PyTorch/C-MIDP/layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | """CMIDP layers."""
  5 | 
  6 | import torch
  7 | import numpy as np
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from torch.autograd import Variable
 11 | 
 12 | 
 13 | class ConvLayer(nn.Module):
 14 |     def __init__(self, input_units, num_filters, filter_size):
 15 |         super(ConvLayer, self).__init__()
 16 |         self.conv = nn.Conv1d(in_channels=input_units, out_channels=num_filters,
 17 |                               kernel_size=filter_size, padding=filter_size - 1, stride=1)
 18 | 
 19 |     def forward(self, input_x, pooling_size):
 20 |         conv_out = F.relu(self.conv(input_x))
 21 |         pooled_out = F.max_pool1d(conv_out, kernel_size=pooling_size)
 22 |         return pooled_out
 23 | 
 24 | 
 25 | class CMIDP(nn.Module):
 26 |     """An implementation of CMIDP"""
 27 | 
 28 |     def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None):
 29 |         super(CMIDP, self).__init__()
 30 |         """
 31 |         :param args: Arguments object.
 32 |         """
 33 |         self.args = args
 34 |         self.vocab_size = vocab_size
 35 |         self.embedding_size = embedding_size
 36 |         self.pretrained_embedding = pretrained_embedding
 37 |         self._setup_layers()
 38 | 
 39 |     def _setup_embedding_layer(self):
 40 |         """
 41 |         Creating Embedding layers.
 42 |         """
 43 |         if self.pretrained_embedding is None:
 44 |             embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size)))
 45 |             embedding_weight = Variable(embedding_weight, requires_grad=True)
 46 |         else:
 47 |             if self.args.embedding_type == 0:
 48 |                 embedding_weight = torch.from_numpy(self.pretrained_embedding).float()
 49 |             if self.args.embedding_type == 1:
 50 |                 embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True)
 51 |         self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight)
 52 | 
 53 |     def _setup_conv_layer(self):
 54 |         """
 55 |         Creating Convolution Layer.
 56 |         """
 57 | 
 58 |         self.conv1 = ConvLayer(input_units=self.embedding_size, num_filters=self.args.num_filters[0],
 59 |                                filter_size=self.args.filter_sizes[0])
 60 |         self.conv2 = ConvLayer(input_units=self.args.num_filters[0], num_filters=self.args.num_filters[1],
 61 |                                filter_size=self.args.filter_sizes[1])
 62 | 
 63 |     def _setup_fc_layer(self):
 64 |         """
 65 |          Creating FC Layer.
 66 |          """
 67 |         self.fc = nn.Linear(in_features=self.args.num_filters[1], out_features=self.args.fc_dim, bias=True)
 68 |         self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True)
 69 | 
 70 |     def _setup_layers(self):
 71 |         """
 72 |         Creating layers of model.
 73 |         1. Embedding Layer.
 74 |         2. Convolution Layer.
 75 |         3. FC Layer.
 76 |         """
 77 |         self._setup_embedding_layer()
 78 |         self._setup_conv_layer()
 79 |         self._setup_fc_layer()
 80 | 
 81 |     def _sub_network(self, x_content, x_question, x_option):
 82 |         embedded_sentence_content = self.embedding(x_content)
 83 |         embedded_sentence_question = self.embedding(x_question)
 84 |         embedded_sentence_option = self.embedding(x_option)
 85 | 
 86 |         # Concat Vectors
 87 |         # [batch_size, sequence_length_all, embedding_size]
 88 |         sequence_length_total = sum(self.args.pad_seq_len)
 89 |         embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question,
 90 |                                            embedded_sentence_option), dim=1)
 91 |         # [batch_size, embedding_size, sequence_length_all]
 92 |         embedded_sentence_transpose = embedded_sentence_all.permute(0, 2, 1)
 93 | 
 94 |         # Convolution Layer 1
 95 |         conv1_out = self.conv1(embedded_sentence_transpose, pooling_size=self.args.pooling_size)
 96 | 
 97 |         # Convolution Layer 2
 98 |         new_pooling_size = (sequence_length_total + self.args.filter_sizes[0] - 1) // self.args.pooling_size
 99 |         # conv2_out: [batch_size, num_filters[1], 1]
100 |         conv2_out = self.conv2(conv1_out, pooling_size=new_pooling_size)
101 | 
102 |         # conv_final_flat: [batch_size, num_filters[1]]
103 |         conv_final_flat = conv2_out.view(-1, conv2_out.size(1))
104 | 
105 |         # Fully Connected Layer
106 |         fc_out = self.fc(conv_final_flat)
107 | 
108 |         # Final scores
109 |         logits = self.out(fc_out).squeeze()
110 |         scores = torch.sigmoid(logits)
111 | 
112 |         return logits, scores
113 | 
114 |     def forward(self, x_fb_content, x_fb_question, x_fb_option):
115 |         """
116 |         Forward propagation pass.
117 |         :param x_fb_content: Front & Behind Content tensors with features. <list>
118 |         :param x_fb_question: Front & Behind Question tensors with features. <list>
119 |         :param x_fb_option: Front & Behind Option tensors  with features. <list>
120 |         :return logits: The predicted logistic values.
121 |         :return scores: The predicted scores.
122 |         """
123 |         f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0])
124 |         b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1])
125 | 
126 |         logits = (f_logits, b_logits)
127 |         scores = (f_scores, b_scores)
128 |         return logits, scores
129 | 
130 | 
131 | class Loss(nn.Module):
132 |     def __init__(self):
133 |         super(Loss, self).__init__()
134 | 
135 |     def forward(self, predict_y, input_y):
136 |         # Loss
137 |         value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1])
138 |         losses = torch.mean(torch.pow(value, 2))
139 |         return losses
140 | 


--------------------------------------------------------------------------------
/PyTorch/C-MIDP/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'randolph'
 3 | 
 4 | import os
 5 | import sys
 6 | import time
 7 | import torch
 8 | 
 9 | sys.path.append('../')
10 | 
11 | from layers import CMIDP, Loss
12 | from utils import checkmate as cm
13 | from utils import data_helpers as dh
14 | from utils import param_parser as parser
15 | from tqdm import trange
16 | from torch.utils.data import TensorDataset, DataLoader
17 | from sklearn.metrics import mean_squared_error, r2_score
18 | 
19 | args = parser.parameter_parser()
20 | MODEL = dh.get_model_name()
21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime()))
22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23 | 
24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL))
26 | 
27 | 
28 | def test():
29 |     logger.info("Loading Data...")
30 |     logger.info("Data processing...")
31 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file)
32 |     logger.info("Data padding...")
33 |     test_dataset = dh.MyData(test_data, args.pad_seq_len, device)
34 |     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
35 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
36 | 
37 |     criterion = Loss()
38 |     net = CMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
39 |     checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False)
40 |     checkpoint = torch.load(checkpoint_file)
41 |     net.load_state_dict(checkpoint['model_state_dict'])
42 |     net.eval()
43 | 
44 |     logger.info("Scoring...")
45 |     true_labels, predicted_scores = [], []
46 |     batches = trange(len(test_loader), desc="Batches", leave=True)
47 |     for batch_cnt, batch in zip(batches, test_loader):
48 |         x_test_fb_content, x_test_fb_question, x_test_fb_option, \
49 |         x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch
50 |         logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option)
51 |         for i in y_test_fb[0].tolist():
52 |             true_labels.append(i)
53 |         for j in scores[0].tolist():
54 |             predicted_scores.append(j)
55 | 
56 |     # Calculate the Metrics
57 |     test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
58 |     test_r2 = r2_score(true_labels, predicted_scores)
59 |     test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores)
60 |     logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}"
61 |                 .format(test_pcc, test_doa, test_rmse, test_r2))
62 |     logger.info('Test Finished.')
63 | 
64 |     logger.info('Creating the prediction file...')
65 |     dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores)
66 | 
67 |     logger.info('All Finished.')
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     test()
72 | 
73 | 


--------------------------------------------------------------------------------
/PyTorch/C-MIDP/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import torch
  8 | import torch.nn as nn
  9 | 
 10 | sys.path.append('../')
 11 | 
 12 | from layers import CMIDP, Loss
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from tqdm import tqdm, trange
 17 | from torch.utils.tensorboard import SummaryWriter
 18 | from torch.utils.data import TensorDataset, DataLoader
 19 | from sklearn.metrics import mean_squared_error, r2_score
 20 | 
 21 | 
 22 | args = parser.parameter_parser()
 23 | OPTION = dh.option()
 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 26 | 
 27 | 
 28 | def train():
 29 |     """Training CMIDP model."""
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load sentences, labels, and training parameters
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
 36 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)
 37 | 
 38 |     logger.info("Data padding...")
 39 |     train_dataset = dh.MyData(train_data, args.pad_seq_len, device)
 40 |     val_dataset = dh.MyData(val_data, args.pad_seq_len, device)
 41 | 
 42 |     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
 43 |     val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 44 | 
 45 |     # Load word2vec model
 46 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 47 | 
 48 |     # Init network
 49 |     logger.info("Init nn...")
 50 |     net = CMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
 51 | 
 52 |     print("Model's state_dict:")
 53 |     for param_tensor in net.state_dict():
 54 |         print(param_tensor, "\t", net.state_dict()[param_tensor].size())
 55 | 
 56 |     criterion = Loss()
 57 |     optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda)
 58 | 
 59 |     if OPTION == 'T':
 60 |         timestamp = str(int(time.time()))
 61 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 62 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 63 |         logger.info("Writing to {0}\n".format(out_dir))
 64 |     elif OPTION == 'R':
 65 |         timestamp = input("[Input] Please input the checkpoints model you want to restore: ")
 66 |         while not (timestamp.isdigit() and len(timestamp) == 10):
 67 |             timestamp = input("[Warning] The format of your input is illegal, please re-input: ")
 68 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 69 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 70 |         logger.info("Writing to {0}\n".format(out_dir))
 71 |         checkpoint = torch.load(out_dir)
 72 |         net.load_state_dict(checkpoint['model_state_dict'])
 73 |         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 74 | 
 75 |     logger.info("Training...")
 76 |     writer = SummaryWriter('summary')
 77 | 
 78 |     def eval_model(val_loader, epoch):
 79 |         """
 80 |         Evaluate on the validation set.
 81 |         """
 82 |         net.eval()
 83 |         eval_loss = 0.0
 84 |         true_labels, predicted_scores = [], []
 85 |         for batch in val_loader:
 86 |             x_val_fb_content, x_val_fb_question, x_val_fb_option, \
 87 |             x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch
 88 | 
 89 |             logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option)
 90 |             avg_batch_loss = criterion(scores, y_val_fb)
 91 |             eval_loss = eval_loss + avg_batch_loss.item()
 92 |             for i in y_val_fb[0].tolist():
 93 |                 true_labels.append(i)
 94 |             for j in scores[0].tolist():
 95 |                 predicted_scores.append(j)
 96 | 
 97 |         # Calculate the Metrics
 98 |         eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
 99 |         eval_r2 = r2_score(true_labels, predicted_scores)
100 |         eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores)
101 |         eval_loss = eval_loss / len(val_loader)
102 |         cur_value = eval_rmse
103 |         logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}"
104 |                     .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2))
105 |         writer.add_scalar('validation loss', eval_loss, epoch)
106 |         writer.add_scalar('validation PCC', eval_pcc, epoch)
107 |         writer.add_scalar('validation DOA', eval_doa, epoch)
108 |         writer.add_scalar('validation RMSE', eval_rmse, epoch)
109 |         writer.add_scalar('validation R2', eval_r2, epoch)
110 |         return cur_value
111 | 
112 |     for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
113 |         # Training step
114 |         batches = trange(len(train_loader), desc="Batches", leave=True)
115 |         for batch_cnt, batch in zip(batches, train_loader):
116 |             net.train()
117 |             x_train_fb_content, x_train_fb_question, x_train_fb_option, \
118 |             x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch
119 | 
120 |             optimizer.zero_grad()   # 如果不置零，Variable 的梯度在每次 backward 的时候都会累加
121 |             logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option)
122 |             avg_batch_loss = criterion(scores, y_train_fb)
123 |             avg_batch_loss.backward()
124 |             optimizer.step()    # Parameter updating
125 |             batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item()))
126 |             logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item()))
127 |             writer.add_scalar('training loss', avg_batch_loss, batch_cnt)
128 |         # Evaluation step
129 |         cur_value = eval_model(val_loader, epoch)
130 |         saver.handle(cur_value, net, optimizer, epoch)
131 |     writer.close()
132 | 
133 |     logger.info('Training Finished.')
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     train()
138 | 
139 | 


--------------------------------------------------------------------------------
/PyTorch/H-MIDP/layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | """HMIDP layers."""
  5 | 
  6 | import torch
  7 | import numpy as np
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from torch.autograd import Variable
 11 | 
 12 | 
 13 | class ConvLayer(nn.Module):
 14 |     def __init__(self, input_units, num_filters, filter_size):
 15 |         super(ConvLayer, self).__init__()
 16 |         self.conv = nn.Conv1d(in_channels=input_units, out_channels=num_filters,
 17 |                               kernel_size=filter_size, padding=filter_size - 1, stride=1)
 18 | 
 19 |     def forward(self, input_x, pooling_size):
 20 |         conv_out = F.relu(self.conv(input_x))
 21 |         pooled_out = F.max_pool1d(conv_out, kernel_size=pooling_size)
 22 |         return pooled_out
 23 | 
 24 | 
 25 | class BiRNNLayer(nn.Module):
 26 |     def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob):
 27 |         super(BiRNNLayer, self).__init__()
 28 |         if rnn_type == 'LSTM':
 29 |             self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 30 |                                   batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 31 |         if rnn_type == 'GRU':
 32 |             self.bi_rnn = nn.GRU(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 33 |                                  batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 34 | 
 35 |     def forward(self, input_x):
 36 |         rnn_out, _ = self.bi_rnn(input_x)
 37 |         rnn_avg = torch.mean(rnn_out, dim=1)
 38 |         return rnn_out, rnn_avg
 39 | 
 40 | 
 41 | class HMIDP(nn.Module):
 42 |     """An implementation of HMIDP"""
 43 | 
 44 |     def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None):
 45 |         super(HMIDP, self).__init__()
 46 |         """
 47 |         :param args: Arguments object.
 48 |         """
 49 |         self.args = args
 50 |         self.vocab_size = vocab_size
 51 |         self.embedding_size = embedding_size
 52 |         self.pretrained_embedding = pretrained_embedding
 53 |         self._setup_layers()
 54 | 
 55 |     def _setup_embedding_layer(self):
 56 |         """
 57 |         Creating Embedding layers.
 58 |         """
 59 |         if self.pretrained_embedding is None:
 60 |             embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size)))
 61 |             embedding_weight = Variable(embedding_weight, requires_grad=True)
 62 |         else:
 63 |             if self.args.embedding_type == 0:
 64 |                 embedding_weight = torch.from_numpy(self.pretrained_embedding).float()
 65 |             if self.args.embedding_type == 1:
 66 |                 embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True)
 67 |         self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight)
 68 | 
 69 |     def _setup_conv_layer(self):
 70 |         """
 71 |         Creating Convolution Layer.
 72 |         """
 73 | 
 74 |         self.conv1 = ConvLayer(input_units=self.embedding_size, num_filters=self.args.num_filters[0],
 75 |                                filter_size=self.args.filter_sizes[0])
 76 |         self.conv2 = ConvLayer(input_units=self.args.num_filters[0], num_filters=self.args.num_filters[1],
 77 |                                filter_size=self.args.filter_sizes[1])
 78 | 
 79 |     def _setup_bi_rnn_layer(self):
 80 |         """
 81 |         Creating Bi-RNN Layer.
 82 |         """
 83 |         self.bi_rnn = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type,
 84 |                                  rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim,
 85 |                                  dropout_keep_prob=self.args.dropout_rate)
 86 | 
 87 |     def _setup_fc_layer(self):
 88 |         """
 89 |          Creating FC Layer.
 90 |          """
 91 |         self.fc = nn.Linear(in_features=self.args.num_filters[1] + self.args.rnn_dim * 2,
 92 |                             out_features=self.args.fc_dim, bias=True)
 93 |         self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True)
 94 | 
 95 |     def _setup_layers(self):
 96 |         """
 97 |         Creating layers of model.
 98 |         1. Embedding Layer.
 99 |         2. Convolution Layer.
100 |         3. Bi-RNN Layer.
101 |         4. FC Layer.
102 |         """
103 |         self._setup_embedding_layer()
104 |         self._setup_conv_layer()
105 |         self._setup_bi_rnn_layer()
106 |         self._setup_fc_layer()
107 | 
108 |     def _sub_network(self, x_content, x_question, x_option):
109 |         embedded_sentence_content = self.embedding(x_content)
110 |         embedded_sentence_question = self.embedding(x_question)
111 |         embedded_sentence_option = self.embedding(x_option)
112 | 
113 |         # Concat Vectors
114 |         # [batch_size, sequence_length_all, embedding_size]
115 |         sequence_length_total = sum(self.args.pad_seq_len)
116 |         embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question,
117 |                                            embedded_sentence_option), dim=1)
118 |         # [batch_size, embedding_size, sequence_length_all]
119 |         embedded_sentence_transpose = embedded_sentence_all.permute(0, 2, 1)
120 | 
121 |         # Convolution Layer 1
122 |         conv1_out = self.conv1(embedded_sentence_transpose, pooling_size=self.args.pooling_size)
123 | 
124 |         # Convolution Layer 2
125 |         new_pooling_size = (sequence_length_total + self.args.filter_sizes[0] - 1) // self.args.pooling_size
126 |         # conv2_out: [batch_size, num_filters[1], 1]
127 |         conv2_out = self.conv2(conv1_out, pooling_size=new_pooling_size)
128 | 
129 |         # conv_final_flat: [batch_size, num_filters[1]]
130 |         conv_final_flat = conv2_out.view(-1, conv2_out.size(1))
131 | 
132 |         # Bi-RNN Layer
133 |         # rnn_pooled: [batch_size, rnn_hidden_size * 2]
134 |         rnn_out, rnn_pooled = self.bi_rnn(embedded_sentence_all)
135 | 
136 |         # Concat
137 |         concat = torch.cat((conv_final_flat, rnn_pooled), dim=1)
138 | 
139 |         # Fully Connected Layer
140 |         fc_out = self.fc(concat)
141 | 
142 |         # Final scores
143 |         logits = self.out(fc_out).squeeze()
144 |         scores = torch.sigmoid(logits)
145 | 
146 |         return logits, scores
147 | 
148 |     def forward(self, x_fb_content, x_fb_question, x_fb_option):
149 |         """
150 |         Forward propagation pass.
151 |         :param x_fb_content: Front & Behind Content tensors with features. <list>
152 |         :param x_fb_question: Front & Behind Question tensors with features. <list>
153 |         :param x_fb_option: Front & Behind Option tensors  with features. <list>
154 |         :return logits: The predicted logistic values.
155 |         :return scores: The predicted scores.
156 |         """
157 |         f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0])
158 |         b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1])
159 | 
160 |         logits = (f_logits, b_logits)
161 |         scores = (f_scores, b_scores)
162 |         return logits, scores
163 | 
164 | 
165 | class Loss(nn.Module):
166 |     def __init__(self):
167 |         super(Loss, self).__init__()
168 | 
169 |     def forward(self, predict_y, input_y):
170 |         # Loss
171 |         value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1])
172 |         losses = torch.mean(torch.pow(value, 2))
173 |         return losses
174 | 


--------------------------------------------------------------------------------
/PyTorch/H-MIDP/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'randolph'
 3 | 
 4 | import os
 5 | import sys
 6 | import time
 7 | import torch
 8 | 
 9 | sys.path.append('../')
10 | 
11 | from layers import HMIDP, Loss
12 | from utils import checkmate as cm
13 | from utils import data_helpers as dh
14 | from utils import param_parser as parser
15 | from tqdm import trange
16 | from torch.utils.data import TensorDataset, DataLoader
17 | from sklearn.metrics import mean_squared_error, r2_score
18 | 
19 | args = parser.parameter_parser()
20 | MODEL = dh.get_model_name()
21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime()))
22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23 | 
24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL))
26 | 
27 | 
28 | def test():
29 |     logger.info("Loading Data...")
30 |     logger.info("Data processing...")
31 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file)
32 |     logger.info("Data padding...")
33 |     test_dataset = dh.MyData(test_data, args.pad_seq_len, device)
34 |     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
35 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
36 | 
37 |     criterion = Loss()
38 |     net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
39 |     checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False)
40 |     checkpoint = torch.load(checkpoint_file)
41 |     net.load_state_dict(checkpoint['model_state_dict'])
42 |     net.eval()
43 | 
44 |     logger.info("Scoring...")
45 |     true_labels, predicted_scores = [], []
46 |     batches = trange(len(test_loader), desc="Batches", leave=True)
47 |     for batch_cnt, batch in zip(batches, test_loader):
48 |         x_test_fb_content, x_test_fb_question, x_test_fb_option, \
49 |         x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch
50 |         logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option)
51 |         for i in y_test_fb[0].tolist():
52 |             true_labels.append(i)
53 |         for j in scores[0].tolist():
54 |             predicted_scores.append(j)
55 | 
56 |     # Calculate the Metrics
57 |     test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
58 |     test_r2 = r2_score(true_labels, predicted_scores)
59 |     test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores)
60 |     logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}"
61 |                 .format(test_pcc, test_doa, test_rmse, test_r2))
62 |     logger.info('Test Finished.')
63 | 
64 |     logger.info('Creating the prediction file...')
65 |     dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores)
66 | 
67 |     logger.info('All Finished.')
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     test()
72 | 
73 | 


--------------------------------------------------------------------------------
/PyTorch/H-MIDP/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import torch
  8 | import torch.nn as nn
  9 | 
 10 | sys.path.append('../')
 11 | 
 12 | from layers import HMIDP, Loss
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from tqdm import tqdm, trange
 17 | from torch.utils.tensorboard import SummaryWriter
 18 | from torch.utils.data import TensorDataset, DataLoader
 19 | from sklearn.metrics import mean_squared_error, r2_score
 20 | 
 21 | 
 22 | args = parser.parameter_parser()
 23 | OPTION = dh.option()
 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 26 | 
 27 | 
 28 | def train():
 29 |     """Training HMIDP model."""
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load sentences, labels, and training parameters
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
 36 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)
 37 | 
 38 |     logger.info("Data padding...")
 39 |     train_dataset = dh.MyData(train_data, args.pad_seq_len, device)
 40 |     val_dataset = dh.MyData(val_data, args.pad_seq_len, device)
 41 | 
 42 |     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
 43 |     val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 44 | 
 45 |     # Load word2vec model
 46 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 47 | 
 48 |     # Init network
 49 |     logger.info("Init nn...")
 50 |     net = HMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
 51 | 
 52 |     print("Model's state_dict:")
 53 |     for param_tensor in net.state_dict():
 54 |         print(param_tensor, "\t", net.state_dict()[param_tensor].size())
 55 | 
 56 |     criterion = Loss()
 57 |     optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda)
 58 | 
 59 |     if OPTION == 'T':
 60 |         timestamp = str(int(time.time()))
 61 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 62 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 63 |         logger.info("Writing to {0}\n".format(out_dir))
 64 |     elif OPTION == 'R':
 65 |         timestamp = input("[Input] Please input the checkpoints model you want to restore: ")
 66 |         while not (timestamp.isdigit() and len(timestamp) == 10):
 67 |             timestamp = input("[Warning] The format of your input is illegal, please re-input: ")
 68 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 69 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 70 |         logger.info("Writing to {0}\n".format(out_dir))
 71 |         checkpoint = torch.load(out_dir)
 72 |         net.load_state_dict(checkpoint['model_state_dict'])
 73 |         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 74 | 
 75 |     logger.info("Training...")
 76 |     writer = SummaryWriter('summary')
 77 | 
 78 |     def eval_model(val_loader, epoch):
 79 |         """
 80 |         Evaluate on the validation set.
 81 |         """
 82 |         net.eval()
 83 |         eval_loss = 0.0
 84 |         true_labels, predicted_scores = [], []
 85 |         for batch in val_loader:
 86 |             x_val_fb_content, x_val_fb_question, x_val_fb_option, \
 87 |             x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch
 88 | 
 89 |             logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option)
 90 |             avg_batch_loss = criterion(scores, y_val_fb)
 91 |             eval_loss = eval_loss + avg_batch_loss.item()
 92 |             for i in y_val_fb[0].tolist():
 93 |                 true_labels.append(i)
 94 |             for j in scores[0].tolist():
 95 |                 predicted_scores.append(j)
 96 | 
 97 |         # Calculate the Metrics
 98 |         eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
 99 |         eval_r2 = r2_score(true_labels, predicted_scores)
100 |         eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores)
101 |         eval_loss = eval_loss / len(val_loader)
102 |         cur_value = eval_rmse
103 |         logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}"
104 |                     .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2))
105 |         writer.add_scalar('validation loss', eval_loss, epoch)
106 |         writer.add_scalar('validation PCC', eval_pcc, epoch)
107 |         writer.add_scalar('validation DOA', eval_doa, epoch)
108 |         writer.add_scalar('validation RMSE', eval_rmse, epoch)
109 |         writer.add_scalar('validation R2', eval_r2, epoch)
110 |         return cur_value
111 | 
112 |     for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
113 |         # Training step
114 |         batches = trange(len(train_loader), desc="Batches", leave=True)
115 |         for batch_cnt, batch in zip(batches, train_loader):
116 |             net.train()
117 |             x_train_fb_content, x_train_fb_question, x_train_fb_option, \
118 |             x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch
119 | 
120 |             optimizer.zero_grad()   # 如果不置零，Variable 的梯度在每次 backward 的时候都会累加
121 |             logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option)
122 |             avg_batch_loss = criterion(scores, y_train_fb)
123 |             avg_batch_loss.backward()
124 |             optimizer.step()    # Parameter updating
125 |             batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item()))
126 |             logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item()))
127 |             writer.add_scalar('training loss', avg_batch_loss, batch_cnt)
128 |         # Evaluation step
129 |         cur_value = eval_model(val_loader, epoch)
130 |         saver.handle(cur_value, net, optimizer, epoch)
131 |     writer.close()
132 | 
133 |     logger.info('Training Finished.')
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     train()
138 | 
139 | 


--------------------------------------------------------------------------------
/PyTorch/R-MIDP/layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | """RMIDP layers."""
  5 | 
  6 | import torch
  7 | import numpy as np
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from torch.autograd import Variable
 11 | 
 12 | 
 13 | class BiRNNLayer(nn.Module):
 14 |     def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob):
 15 |         super(BiRNNLayer, self).__init__()
 16 |         if rnn_type == 'LSTM':
 17 |             self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 18 |                                   batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 19 |         if rnn_type == 'GRU':
 20 |             self.bi_rnn = nn.GRU(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 21 |                                  batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 22 | 
 23 |     def forward(self, input_x):
 24 |         rnn_out, _ = self.bi_rnn(input_x)
 25 |         rnn_avg = torch.mean(rnn_out, dim=1)
 26 |         return rnn_out, rnn_avg
 27 | 
 28 | 
 29 | class RMIDP(nn.Module):
 30 |     """An implementation of RMIDP"""
 31 | 
 32 |     def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None):
 33 |         super(RMIDP, self).__init__()
 34 |         """
 35 |         :param args: Arguments object.
 36 |         """
 37 |         self.args = args
 38 |         self.vocab_size = vocab_size
 39 |         self.embedding_size = embedding_size
 40 |         self.pretrained_embedding = pretrained_embedding
 41 |         self._setup_layers()
 42 | 
 43 |     def _setup_embedding_layer(self):
 44 |         """
 45 |         Creating Embedding layers.
 46 |         """
 47 |         if self.pretrained_embedding is None:
 48 |             embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size)))
 49 |             embedding_weight = Variable(embedding_weight, requires_grad=True)
 50 |         else:
 51 |             if self.args.embedding_type == 0:
 52 |                 embedding_weight = torch.from_numpy(self.pretrained_embedding).float()
 53 |             if self.args.embedding_type == 1:
 54 |                 embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True)
 55 |         self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight)
 56 | 
 57 |     def _setup_bi_rnn_layer(self):
 58 |         """
 59 |         Creating Bi-RNN Layer.
 60 |         """
 61 |         self.bi_rnn = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type,
 62 |                                  rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim,
 63 |                                  dropout_keep_prob=self.args.dropout_rate)
 64 | 
 65 |     def _setup_fc_layer(self):
 66 |         """
 67 |          Creating FC Layer.
 68 |          """
 69 |         self.fc = nn.Linear(in_features=self.args.rnn_dim * 2, out_features=self.args.fc_dim, bias=True)
 70 |         self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True)
 71 | 
 72 |     def _setup_layers(self):
 73 |         """
 74 |         Creating layers of model.
 75 |         1. Embedding Layer.
 76 |         2. Bi-RNN Layer.
 77 |         3. FC Layer.
 78 |         """
 79 |         self._setup_embedding_layer()
 80 |         self._setup_bi_rnn_layer()
 81 |         self._setup_fc_layer()
 82 | 
 83 |     def _sub_network(self, x_content, x_question, x_option):
 84 |         embedded_sentence_content = self.embedding(x_content)
 85 |         embedded_sentence_question = self.embedding(x_question)
 86 |         embedded_sentence_option = self.embedding(x_option)
 87 | 
 88 |         # Concat Vectors
 89 |         # [batch_size, sequence_length_all, embedding_size]
 90 |         embedded_sentence_all = torch.cat((embedded_sentence_content, embedded_sentence_question,
 91 |                                            embedded_sentence_option), dim=1)
 92 | 
 93 |         # Bi-RNN Layer
 94 |         rnn_out, rnn_pooled = self.bi_rnn(embedded_sentence_all)
 95 | 
 96 |         # Fully Connected Layer
 97 |         fc_out = self.fc(rnn_pooled)
 98 | 
 99 |         # Final scores
100 |         logits = self.out(fc_out).squeeze()
101 |         scores = torch.sigmoid(logits)
102 | 
103 |         return logits, scores
104 | 
105 |     def forward(self, x_fb_content, x_fb_question, x_fb_option):
106 |         """
107 |         Forward propagation pass.
108 |         :param x_fb_content: Front & Behind Content tensors with features. <list>
109 |         :param x_fb_question: Front & Behind Question tensors with features. <list>
110 |         :param x_fb_option: Front & Behind Option tensors  with features. <list>
111 |         :return logits: The predicted logistic values.
112 |         :return scores: The predicted scores.
113 |         """
114 |         f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0])
115 |         b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1])
116 | 
117 |         logits = (f_logits, b_logits)
118 |         scores = (f_scores, b_scores)
119 |         return logits, scores
120 | 
121 | 
122 | class Loss(nn.Module):
123 |     def __init__(self):
124 |         super(Loss, self).__init__()
125 | 
126 |     def forward(self, predict_y, input_y):
127 |         # Loss
128 |         value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1])
129 |         losses = torch.mean(torch.pow(value, 2))
130 |         return losses
131 | 


--------------------------------------------------------------------------------
/PyTorch/R-MIDP/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'randolph'
 3 | 
 4 | import os
 5 | import sys
 6 | import time
 7 | import torch
 8 | 
 9 | sys.path.append('../')
10 | 
11 | from layers import RMIDP, Loss
12 | from utils import checkmate as cm
13 | from utils import data_helpers as dh
14 | from utils import param_parser as parser
15 | from tqdm import trange
16 | from torch.utils.data import TensorDataset, DataLoader
17 | from sklearn.metrics import mean_squared_error, r2_score
18 | 
19 | args = parser.parameter_parser()
20 | MODEL = dh.get_model_name()
21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime()))
22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23 | 
24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL))
26 | 
27 | 
28 | def test():
29 |     logger.info("Loading Data...")
30 |     logger.info("Data processing...")
31 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file)
32 |     logger.info("Data padding...")
33 |     test_dataset = dh.MyData(test_data, args.pad_seq_len, device)
34 |     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
35 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
36 | 
37 |     criterion = Loss()
38 |     net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
39 |     checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False)
40 |     checkpoint = torch.load(checkpoint_file)
41 |     net.load_state_dict(checkpoint['model_state_dict'])
42 |     net.eval()
43 | 
44 |     logger.info("Scoring...")
45 |     true_labels, predicted_scores = [], []
46 |     batches = trange(len(test_loader), desc="Batches", leave=True)
47 |     for batch_cnt, batch in zip(batches, test_loader):
48 |         x_test_fb_content, x_test_fb_question, x_test_fb_option, \
49 |         x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch
50 |         logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option)
51 |         for i in y_test_fb[0].tolist():
52 |             true_labels.append(i)
53 |         for j in scores[0].tolist():
54 |             predicted_scores.append(j)
55 | 
56 |     # Calculate the Metrics
57 |     test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
58 |     test_r2 = r2_score(true_labels, predicted_scores)
59 |     test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores)
60 |     logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}"
61 |                 .format(test_pcc, test_doa, test_rmse, test_r2))
62 |     logger.info('Test Finished.')
63 | 
64 |     logger.info('Creating the prediction file...')
65 |     dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores)
66 | 
67 |     logger.info('All Finished.')
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     test()
72 | 
73 | 


--------------------------------------------------------------------------------
/PyTorch/R-MIDP/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import torch
  8 | import torch.nn as nn
  9 | 
 10 | sys.path.append('../')
 11 | 
 12 | from layers import RMIDP, Loss
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from tqdm import tqdm, trange
 17 | from torch.utils.tensorboard import SummaryWriter
 18 | from torch.utils.data import TensorDataset, DataLoader
 19 | from sklearn.metrics import mean_squared_error, r2_score
 20 | 
 21 | 
 22 | args = parser.parameter_parser()
 23 | OPTION = dh.option()
 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 26 | 
 27 | 
 28 | def train():
 29 |     """Training RMIDP model."""
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load sentences, labels, and training parameters
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
 36 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)
 37 | 
 38 |     logger.info("Data padding...")
 39 |     train_dataset = dh.MyData(train_data, args.pad_seq_len, device)
 40 |     val_dataset = dh.MyData(val_data, args.pad_seq_len, device)
 41 | 
 42 |     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
 43 |     val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 44 | 
 45 |     # Load word2vec model
 46 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 47 | 
 48 |     # Init network
 49 |     logger.info("Init nn...")
 50 |     net = RMIDP(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
 51 | 
 52 |     print("Model's state_dict:")
 53 |     for param_tensor in net.state_dict():
 54 |         print(param_tensor, "\t", net.state_dict()[param_tensor].size())
 55 | 
 56 |     criterion = Loss()
 57 |     optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda)
 58 | 
 59 |     if OPTION == 'T':
 60 |         timestamp = str(int(time.time()))
 61 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 62 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 63 |         logger.info("Writing to {0}\n".format(out_dir))
 64 |     elif OPTION == 'R':
 65 |         timestamp = input("[Input] Please input the checkpoints model you want to restore: ")
 66 |         while not (timestamp.isdigit() and len(timestamp) == 10):
 67 |             timestamp = input("[Warning] The format of your input is illegal, please re-input: ")
 68 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 69 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 70 |         logger.info("Writing to {0}\n".format(out_dir))
 71 |         checkpoint = torch.load(out_dir)
 72 |         net.load_state_dict(checkpoint['model_state_dict'])
 73 |         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 74 | 
 75 |     logger.info("Training...")
 76 |     writer = SummaryWriter('summary')
 77 | 
 78 |     def eval_model(val_loader, epoch):
 79 |         """
 80 |         Evaluate on the validation set.
 81 |         """
 82 |         net.eval()
 83 |         eval_loss = 0.0
 84 |         true_labels, predicted_scores = [], []
 85 |         for batch in val_loader:
 86 |             x_val_fb_content, x_val_fb_question, x_val_fb_option, \
 87 |             x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch
 88 | 
 89 |             logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option)
 90 |             avg_batch_loss = criterion(scores, y_val_fb)
 91 |             eval_loss = eval_loss + avg_batch_loss.item()
 92 |             for i in y_val_fb[0].tolist():
 93 |                 true_labels.append(i)
 94 |             for j in scores[0].tolist():
 95 |                 predicted_scores.append(j)
 96 | 
 97 |         # Calculate the Metrics
 98 |         eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
 99 |         eval_r2 = r2_score(true_labels, predicted_scores)
100 |         eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores)
101 |         eval_loss = eval_loss / len(val_loader)
102 |         cur_value = eval_rmse
103 |         logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}"
104 |                     .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2))
105 |         writer.add_scalar('validation loss', eval_loss, epoch)
106 |         writer.add_scalar('validation PCC', eval_pcc, epoch)
107 |         writer.add_scalar('validation DOA', eval_doa, epoch)
108 |         writer.add_scalar('validation RMSE', eval_rmse, epoch)
109 |         writer.add_scalar('validation R2', eval_r2, epoch)
110 |         return cur_value
111 | 
112 |     for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
113 |         # Training step
114 |         batches = trange(len(train_loader), desc="Batches", leave=True)
115 |         for batch_cnt, batch in zip(batches, train_loader):
116 |             net.train()
117 |             x_train_fb_content, x_train_fb_question, x_train_fb_option, \
118 |             x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch
119 | 
120 |             optimizer.zero_grad()   # 如果不置零，Variable 的梯度在每次 backward 的时候都会累加
121 |             logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option)
122 |             avg_batch_loss = criterion(scores, y_train_fb)
123 |             avg_batch_loss.backward()
124 |             optimizer.step()    # Parameter updating
125 |             batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item()))
126 |             logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item()))
127 |             writer.add_scalar('training loss', avg_batch_loss, batch_cnt)
128 |         # Evaluation step
129 |         cur_value = eval_model(val_loader, epoch)
130 |         saver.handle(cur_value, net, optimizer, epoch)
131 |     writer.close()
132 | 
133 |     logger.info('Training Finished.')
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     train()
138 | 
139 | 


--------------------------------------------------------------------------------
/PyTorch/TARNN/layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | """TARNN layers."""
  5 | 
  6 | import torch
  7 | import numpy as np
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | from torch.autograd import Variable
 11 | 
 12 | 
 13 | class BiRNNLayer(nn.Module):
 14 |     def __init__(self, input_units, rnn_type, rnn_layers, rnn_hidden_size, dropout_keep_prob):
 15 |         super(BiRNNLayer, self).__init__()
 16 |         if rnn_type == 'LSTM':
 17 |             self.bi_rnn = nn.LSTM(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 18 |                                   batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 19 |         if rnn_type == 'GRU':
 20 |             self.bi_rnn = nn.GRU(input_size=input_units, hidden_size=rnn_hidden_size, num_layers=rnn_layers,
 21 |                                  batch_first=True, bidirectional=True, dropout=dropout_keep_prob)
 22 | 
 23 |     def forward(self, input_x):
 24 |         rnn_out, _ = self.bi_rnn(input_x)
 25 |         rnn_avg = torch.mean(rnn_out, dim=1)
 26 |         return rnn_out, rnn_avg
 27 | 
 28 | 
 29 | class AttentionLayer(nn.Module):
 30 |     def __init__(self, num_units, att_unit_size, att_type):
 31 |         super(AttentionLayer, self).__init__()
 32 |         self.att_type = att_type
 33 | 
 34 |     def forward(self, input_x, input_y):
 35 |         if self.att_type == 'normal':
 36 |             attention_matrix = torch.matmul(input_y, input_x.transpose(1, 2))
 37 |             attention_weight = torch.softmax(attention_matrix, dim=2)
 38 |             attention_visual = torch.mean(attention_matrix, dim=1)
 39 |             attention_out = torch.matmul(attention_weight, input_x)
 40 |             # TODO
 41 |             attention_out = torch.mean(attention_out, dim=1)
 42 |         if self.att_type == 'cosine':
 43 |             cos_matrix = []
 44 |             seq_len = list(input_y.size())[-2]
 45 |             normalized_x = F.normalize(input_x, p=2, dim=2)
 46 |             for t in range(seq_len):
 47 |                 new_input_y = torch.unsqueeze(input_y[:, t, :], dim=1)
 48 |                 normalized_y = F.normalize(new_input_y, p=2, dim=2)
 49 |                 # cos_similarity: [batch_size, seq_len_1]
 50 |                 cos_similarity = torch.sum(torch.mul(normalized_y, normalized_x), dim=2)
 51 |                 cos_matrix.append(cos_similarity)
 52 |             # attention_matrix: [batch_size, seq_len_2, seq_len_1]
 53 |             attention_matrix = torch.stack(cos_matrix, dim=1)
 54 |             attention_visual = torch.mean(attention_matrix, dim=1)
 55 |             attention_out = torch.mul(torch.unsqueeze(attention_visual, dim=-1), input_x)
 56 |             attention_out = torch.mean(attention_out, dim=1)
 57 |         if self.att_type == 'mlp':
 58 |             alpha_matrix = []
 59 |             seq_len = list(input_y.size())[-2]
 60 |             for t in range(seq_len):
 61 |                 u_t = torch.matmul(torch.unsqueeze(input_y[:, t, :], dim=1), input_x.transpose(1, 2))
 62 |                 # u_t: [batch_size, 1, seq_len_1]
 63 |                 u_t = torch.tanh(u_t)
 64 |                 alpha_matrix.append(u_t)
 65 |             attention_matrix = torch.cat(alpha_matrix, dim=1)
 66 |             attention_matrix = torch.squeeze(attention_matrix, dim=2)
 67 |             attention_weight = F.softmax(attention_matrix, dim=1)
 68 |             attention_visual = torch.mean(attention_weight, dim=1)
 69 |             attention_out = torch.mul(torch.unsqueeze(attention_visual, dim=-1), input_x)
 70 |             attention_out = torch.mean(attention_out, dim=1)
 71 |         return attention_visual, attention_out
 72 | 
 73 | 
 74 | class HighwayLayer(nn.Module):
 75 |     def __init__(self, in_units, out_units):
 76 |         super(HighwayLayer, self).__init__()
 77 |         self.highway_linear = nn.Linear(in_features=in_units, out_features=out_units, bias=True)
 78 |         self.highway_gate = nn.Linear(in_features=in_units, out_features=out_units, bias=True)
 79 | 
 80 |     def forward(self, input_x):
 81 |         highway_g = torch.relu(self.highway_linear(input_x))
 82 |         highway_t = torch.sigmoid(self.highway_gate(input_x))
 83 |         highway_out = torch.mul(highway_g, highway_t) + torch.mul((1 - highway_t), input_x)
 84 |         return highway_out
 85 | 
 86 | 
 87 | class TARNN(nn.Module):
 88 |     """An implementation of TARNN"""
 89 |     def __init__(self, args, vocab_size, embedding_size, pretrained_embedding=None):
 90 |         super(TARNN, self).__init__()
 91 |         """
 92 |         :param args: Arguments object.
 93 |         """
 94 |         self.args = args
 95 |         self.vocab_size = vocab_size
 96 |         self.embedding_size = embedding_size
 97 |         self.pretrained_embedding = pretrained_embedding
 98 |         self._setup_layers()
 99 | 
100 |     def _setup_embedding_layer(self):
101 |         """
102 |         Creating Embedding layers.
103 |         """
104 |         if self.pretrained_embedding is None:
105 |             embedding_weight = torch.FloatTensor(np.random.uniform(-1, 1, size=(self.vocab_size, self.embedding_size)))
106 |             embedding_weight = Variable(embedding_weight, requires_grad=True)
107 |         else:
108 |             if self.args.embedding_type == 0:
109 |                 embedding_weight = torch.from_numpy(self.pretrained_embedding).float()
110 |             if self.args.embedding_type == 1:
111 |                 embedding_weight = Variable(torch.from_numpy(self.pretrained_embedding).float(), requires_grad=True)
112 |         self.embedding = nn.Embedding(self.vocab_size, self.embedding_size, _weight=embedding_weight)
113 | 
114 |     def _setup_bi_rnn_layer(self):
115 |         """
116 |         Creating Bi-RNN Layer.
117 |         """
118 |         self.bi_rnn_content = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type,
119 |                                          rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim,
120 |                                          dropout_keep_prob=self.args.dropout_rate)
121 |         self.bi_rnn_question = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type,
122 |                                           rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim,
123 |                                           dropout_keep_prob=self.args.dropout_rate)
124 |         self.bi_rnn_option = BiRNNLayer(input_units=self.embedding_size, rnn_type=self.args.rnn_type,
125 |                                         rnn_layers=self.args.rnn_layers, rnn_hidden_size=self.args.rnn_dim,
126 |                                         dropout_keep_prob=self.args.dropout_rate)
127 | 
128 |     def _setup_attention(self):
129 |         """
130 |         Creating Attention Layer.
131 |         """
132 |         self.att_cq = AttentionLayer(num_units=self.args.attention_dim,
133 |                                      att_unit_size=self.args.attention_dim,
134 |                                      att_type=self.args.attention_type)
135 |         self.att_oq = AttentionLayer(num_units=self.args.attention_dim,
136 |                                      att_unit_size=self.args.attention_dim,
137 |                                      att_type=self.args.attention_type)
138 | 
139 |     def _setup_highway_layer(self):
140 |         """
141 |          Creating Highway Layer.
142 |          """
143 |         self.highway = HighwayLayer(in_units=self.args.fc_dim, out_units=self.args.fc_dim)
144 | 
145 |     def _setup_fc_layer(self):
146 |         """
147 |          Creating FC Layer.
148 |          """
149 |         self.fc = nn.Linear(in_features=self.args.rnn_dim * 2 * 3, out_features=self.args.fc_dim, bias=True)
150 |         self.out = nn.Linear(in_features=self.args.fc_dim, out_features=1, bias=True)
151 | 
152 |     def _setup_dropout(self):
153 |         """
154 |          Adding Dropout.
155 |          """
156 |         self.dropout = nn.Dropout(self.args.dropout_rate)
157 | 
158 |     def _setup_layers(self):
159 |         """
160 |         Creating layers of model.
161 |         1. Embedding Layer.
162 |         2. Bi-RNN Layer.
163 |         3. Attention Layer.
164 |         4. Highway Layer.
165 |         5. FC Layer.
166 |         6. Dropout
167 |         """
168 |         self._setup_embedding_layer()
169 |         self._setup_bi_rnn_layer()
170 |         self._setup_attention()
171 |         self._setup_highway_layer()
172 |         self._setup_fc_layer()
173 |         self._setup_dropout()
174 | 
175 |     def _sub_network(self, x_content, x_question, x_option):
176 |         embedded_sentence_content = self.embedding(x_content)
177 |         embedded_sentence_question = self.embedding(x_question)
178 |         embedded_sentence_option = self.embedding(x_option)
179 | 
180 |         # Average Vectors
181 |         # [batch_size, embedding_size]
182 |         embedded_content_average = torch.mean(embedded_sentence_content, dim=1)
183 |         embedded_question_average = torch.mean(embedded_sentence_question, dim=1)
184 |         embedded_option_average = torch.mean(embedded_sentence_option, dim=1)
185 | 
186 |         # Bi-RNN Layer
187 |         rnn_out_content, rnn_avg_content = self.bi_rnn_content(embedded_sentence_content)
188 |         rnn_out_question, rnn_avg_question = self.bi_rnn_question(embedded_sentence_question)
189 |         rnn_out_option, rnn_avg_option = self.bi_rnn_option(embedded_sentence_option)
190 | 
191 |         # Attention Layer
192 |         attention_cq_visual, attention_cq = self.att_cq(rnn_out_content, rnn_out_question)
193 |         attention_oq_visual, attention_oq = self.att_oq(rnn_out_option, rnn_out_question)
194 | 
195 |         # Concat
196 |         # shape of att_out: [batch_size, rnn_hidden_size * 2 * 3]
197 |         att_out = torch.cat((attention_cq, rnn_avg_question, attention_oq), dim=1)
198 | 
199 |         # Fully Connected Layer
200 |         fc_out = self.fc(att_out)
201 | 
202 |         # Highway Layer
203 |         highway_out = self.highway(fc_out)
204 | 
205 |         # Dropout
206 |         h_drop = self.dropout(highway_out)
207 | 
208 |         logits = self.out(h_drop).squeeze()
209 |         scores = torch.sigmoid(logits)
210 | 
211 |         return logits, scores
212 | 
213 |     def forward(self, x_fb_content, x_fb_question, x_fb_option):
214 |         """
215 |         Forward propagation pass.
216 |         :param x_fb_content: Front & Behind Content tensors with features. <list>
217 |         :param x_fb_question: Front & Behind Question tensors with features. <list>
218 |         :param x_fb_option: Front & Behind Option tensors  with features. <list>
219 |         :return logits: The predicted logistic values.
220 |         :return scores: The predicted scores.
221 |         """
222 |         f_logits, f_scores = self._sub_network(x_fb_content[0], x_fb_question[0], x_fb_option[0])
223 |         b_logits, b_scores = self._sub_network(x_fb_content[1], x_fb_question[1], x_fb_option[1])
224 | 
225 |         logits = (f_logits, b_logits)
226 |         scores = (f_scores, b_scores)
227 |         return logits, scores
228 | 
229 | 
230 | class Loss(nn.Module):
231 |     def __init__(self):
232 |         super(Loss, self).__init__()
233 |         self.MSELoss = nn.MSELoss(reduce=True, size_average=True)
234 | 
235 |     def forward(self, predict_y, input_y):
236 |         # Loss
237 |         f_loss = self.MSELoss(predict_y[0], input_y[0])
238 |         b_loss = self.MSELoss(predict_y[1], input_y[1])
239 | 
240 |         losses = f_loss + b_loss
241 |         # value = (predict_y[0] - predict_y[1]) - (input_y[0] - input_y[1])
242 |         # losses = torch.mean(torch.pow(value, 2))
243 |         return losses
244 | 


--------------------------------------------------------------------------------
/PyTorch/TARNN/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'randolph'
 3 | 
 4 | import os
 5 | import sys
 6 | import time
 7 | import torch
 8 | 
 9 | sys.path.append('../')
10 | 
11 | from layers import TARNN, Loss
12 | from utils import checkmate as cm
13 | from utils import data_helpers as dh
14 | from utils import param_parser as parser
15 | from tqdm import trange
16 | from torch.utils.data import TensorDataset, DataLoader
17 | from sklearn.metrics import mean_squared_error, r2_score
18 | 
19 | args = parser.parameter_parser()
20 | MODEL = dh.get_model_name()
21 | logger = dh.logger_fn("ptlog", "logs/Test-{0}.log".format(time.asctime()))
22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23 | 
24 | CPT_DIR = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
25 | SAVE_DIR = os.path.abspath(os.path.join(os.path.curdir, "outputs", MODEL))
26 | 
27 | 
28 | def test():
29 |     logger.info("Loading Data...")
30 |     logger.info("Data processing...")
31 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file)
32 |     logger.info("Data padding...")
33 |     test_dataset = dh.MyData(test_data, args.pad_seq_len, device)
34 |     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
35 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
36 | 
37 |     criterion = Loss()
38 |     net = TARNN(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
39 |     checkpoint_file = cm.get_best_checkpoint(CPT_DIR, select_maximum_value=False)
40 |     checkpoint = torch.load(checkpoint_file)
41 |     net.load_state_dict(checkpoint['model_state_dict'])
42 |     net.eval()
43 | 
44 |     logger.info("Scoring...")
45 |     true_labels, predicted_scores = [], []
46 |     batches = trange(len(test_loader), desc="Batches", leave=True)
47 |     for batch_cnt, batch in zip(batches, test_loader):
48 |         x_test_fb_content, x_test_fb_question, x_test_fb_option, \
49 |         x_test_fb_clens, x_test_fb_qlens, x_test_fb_olens, y_test_fb = batch
50 |         logits, scores = net(x_test_fb_content, x_test_fb_question, x_test_fb_option)
51 |         for i in y_test_fb[0].tolist():
52 |             true_labels.append(i)
53 |         for j in scores[0].tolist():
54 |             predicted_scores.append(j)
55 | 
56 |     # Calculate the Metrics
57 |     test_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
58 |     test_r2 = r2_score(true_labels, predicted_scores)
59 |     test_pcc, test_doa = dh.evaluation(true_labels, predicted_scores)
60 |     logger.info("All Test set: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}"
61 |                 .format(test_pcc, test_doa, test_rmse, test_r2))
62 |     logger.info('Test Finished.')
63 | 
64 |     logger.info('Creating the prediction file...')
65 |     dh.create_prediction_file(save_dir=SAVE_DIR, identifiers=test_data['f_id'], predictions=predicted_scores)
66 | 
67 |     logger.info('All Finished.')
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     test()
72 | 
73 | 


--------------------------------------------------------------------------------
/PyTorch/TARNN/train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import torch
  8 | import torch.nn as nn
  9 | 
 10 | sys.path.append('../')
 11 | 
 12 | from layers import TARNN, Loss
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from tqdm import tqdm, trange
 17 | from torch.utils.tensorboard import SummaryWriter
 18 | from torch.utils.data import TensorDataset, DataLoader
 19 | from sklearn.metrics import mean_squared_error, r2_score
 20 | 
 21 | 
 22 | args = parser.parameter_parser()
 23 | OPTION = dh.option()
 24 | logger = dh.logger_fn("ptlog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 26 | 
 27 | 
 28 | def train():
 29 |     """Training TARNN model."""
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load sentences, labels, and training parameters
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file)
 36 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file)
 37 | 
 38 |     logger.info("Data padding...")
 39 |     train_dataset = dh.MyData(train_data, args.pad_seq_len, device)
 40 |     val_dataset = dh.MyData(val_data, args.pad_seq_len, device)
 41 | 
 42 |     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
 43 |     val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 44 | 
 45 |     # Load word2vec model
 46 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 47 | 
 48 |     # Init network
 49 |     logger.info("Init nn...")
 50 |     net = TARNN(args, VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix).to(device)
 51 | 
 52 |     print("Model's state_dict:")
 53 |     for param_tensor in net.state_dict():
 54 |         print(param_tensor, "\t", net.state_dict()[param_tensor].size())
 55 | 
 56 |     criterion = Loss()
 57 |     optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=args.l2_lambda)
 58 | 
 59 |     if OPTION == 'T':
 60 |         timestamp = str(int(time.time()))
 61 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 62 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 63 |         logger.info("Writing to {0}\n".format(out_dir))
 64 |     elif OPTION == 'R':
 65 |         timestamp = input("[Input] Please input the checkpoints model you want to restore: ")
 66 |         while not (timestamp.isdigit() and len(timestamp) == 10):
 67 |             timestamp = input("[Warning] The format of your input is illegal, please re-input: ")
 68 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
 69 |         saver = cm.BestCheckpointSaver(save_dir=out_dir, num_to_keep=args.num_checkpoints, maximize=False)
 70 |         logger.info("Writing to {0}\n".format(out_dir))
 71 |         checkpoint = torch.load(out_dir)
 72 |         net.load_state_dict(checkpoint['model_state_dict'])
 73 |         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 74 | 
 75 |     logger.info("Training...")
 76 |     writer = SummaryWriter('summary')
 77 | 
 78 |     def eval_model(val_loader, epoch):
 79 |         """
 80 |         Evaluate on the validation set.
 81 |         """
 82 |         net.eval()
 83 |         eval_loss = 0.0
 84 |         true_labels, predicted_scores = [], []
 85 |         for batch in val_loader:
 86 |             x_val_fb_content, x_val_fb_question, x_val_fb_option, \
 87 |             x_val_fb_clens, x_val_fb_qlens, x_val_fb_olens, y_val_fb = batch
 88 | 
 89 |             logits, scores = net(x_val_fb_content, x_val_fb_question, x_val_fb_option)
 90 |             avg_batch_loss = criterion(scores, y_val_fb)
 91 |             eval_loss = eval_loss + avg_batch_loss.item()
 92 |             for i in y_val_fb[0].tolist():
 93 |                 true_labels.append(i)
 94 |             for j in scores[0].tolist():
 95 |                 predicted_scores.append(j)
 96 | 
 97 |         # Calculate the Metrics
 98 |         eval_rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
 99 |         eval_r2 = r2_score(true_labels, predicted_scores)
100 |         eval_pcc, eval_doa = dh.evaluation(true_labels, predicted_scores)
101 |         eval_loss = eval_loss / len(val_loader)
102 |         cur_value = eval_rmse
103 |         logger.info("All Validation set: Loss {0:g} | PCC {1:.4f} | DOA {2:.4f} | RMSE {3:.4f} | R2 {4:.4f}"
104 |                     .format(eval_loss, eval_pcc, eval_doa, eval_rmse, eval_r2))
105 |         writer.add_scalar('validation loss', eval_loss, epoch)
106 |         writer.add_scalar('validation PCC', eval_pcc, epoch)
107 |         writer.add_scalar('validation DOA', eval_doa, epoch)
108 |         writer.add_scalar('validation RMSE', eval_rmse, epoch)
109 |         writer.add_scalar('validation R2', eval_r2, epoch)
110 |         return cur_value
111 | 
112 |     for epoch in tqdm(range(args.epochs), desc="Epochs:", leave=True):
113 |         # Training step
114 |         batches = trange(len(train_loader), desc="Batches", leave=True)
115 |         for batch_cnt, batch in zip(batches, train_loader):
116 |             net.train()
117 |             x_train_fb_content, x_train_fb_question, x_train_fb_option, \
118 |             x_train_fb_clens, x_train_fb_qlens, x_train_fb_olens, y_train_fb = batch
119 | 
120 |             optimizer.zero_grad()   # 如果不置零，Variable 的梯度在每次 backward 的时候都会累加
121 |             logits, scores = net(x_train_fb_content, x_train_fb_question, x_train_fb_option)
122 |             avg_batch_loss = criterion(scores, y_train_fb)
123 |             avg_batch_loss.backward()
124 |             optimizer.step()    # Parameter updating
125 |             batches.set_description("Batches (Loss={:.4f})".format(avg_batch_loss.item()))
126 |             logger.info('[epoch {0}, batch {1}] loss: {2:.4f}'.format(epoch + 1, batch_cnt, avg_batch_loss.item()))
127 |             writer.add_scalar('training loss', avg_batch_loss, batch_cnt)
128 |         # Evaluation step
129 |         cur_value = eval_model(val_loader, epoch)
130 |         saver.handle(cur_value, net, optimizer, epoch)
131 |     writer.close()
132 | 
133 |     logger.info('Training Finished.')
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     train()


--------------------------------------------------------------------------------
/PyTorch/utils/checkmate.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import glob
  6 | import json
  7 | import torch
  8 | import numpy as np
  9 | 
 10 | 
 11 | class BestCheckpointSaver(object):
 12 |     """Maintains a directory containing only the best n checkpoints
 13 |     Inside the directory is a best_checkpoints JSON file containing a dictionary
 14 |     mapping of the best checkpoint filepaths to the values by which the checkpoints
 15 |     are compared.  Only the best n checkpoints are contained in the directory and JSON file.
 16 |     This is a light-weight wrapper class only intended to work in simple,
 17 |     non-distributed settings.
 18 |     """
 19 |     def __init__(self, save_dir, num_to_keep=1, maximize=True):
 20 |         """Creates a `BestCheckpointSaver`
 21 |         `BestCheckpointSaver` acts as a wrapper class around a `tf.train.Saver`
 22 |         Args:
 23 |             save_dir: The directory in which the checkpoint files will be saved
 24 |             num_to_keep: The number of best checkpoint files to retain
 25 |             maximize: Define 'best' values to be the highest values. For example,
 26 |               set this to True if selecting for the checkpoints with the highest
 27 |               given accuracy.  Or set to False to select for checkpoints with the
 28 |               lowest given error rate.
 29 |         """
 30 |         self._num_to_keep = num_to_keep
 31 |         self._save_dir = save_dir
 32 |         self._maximize = maximize
 33 | 
 34 |         if not os.path.exists(save_dir):
 35 |             os.makedirs(save_dir)
 36 |         self.best_checkpoints_file = os.path.join(save_dir, 'best_checkpoints')
 37 | 
 38 |     def handle(self, value, model, optimizer, global_epoch):
 39 |         """
 40 |         Updates the set of best checkpoints based on the given result.
 41 |         Args:
 42 |             value: The value by which to rank the checkpoint.
 43 |             model: The model
 44 |             optimizer: The optimizer
 45 |             global_epoch: The global epoch
 46 |         """
 47 |         state = {'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}
 48 | 
 49 |         current_ckpt = 'epoch-{0}'.format(global_epoch)
 50 |         filename = os.path.abspath(os.path.join(self._save_dir, current_ckpt))
 51 |         value = float(value)
 52 |         if not os.path.exists(self.best_checkpoints_file):
 53 |             self._save_best_checkpoints_file({current_ckpt: value})
 54 |             torch.save(state, filename)
 55 |             return
 56 | 
 57 |         best_checkpoints = self._load_best_checkpoints_file()
 58 | 
 59 |         if len(best_checkpoints) < self._num_to_keep:
 60 |             best_checkpoints[current_ckpt] = value
 61 |             self._save_best_checkpoints_file(best_checkpoints)
 62 |             torch.save(state, filename)
 63 |             return
 64 | 
 65 |         if self._maximize:
 66 |             should_save = not all(current_best >= value for current_best in best_checkpoints.values())
 67 |         else:
 68 |             should_save = not all(current_best <= value for current_best in best_checkpoints.values())
 69 |         if should_save:
 70 |             best_checkpoint_list = self._sort(best_checkpoints)
 71 | 
 72 |             worst_checkpoint = os.path.join(self._save_dir, best_checkpoint_list.pop(-1)[0])
 73 |             self._remove_outdated_checkpoint_files(worst_checkpoint)
 74 | 
 75 |             best_checkpoints = dict(best_checkpoint_list)
 76 |             best_checkpoints[current_ckpt] = value
 77 |             self._save_best_checkpoints_file(best_checkpoints)
 78 |             torch.save(state, filename)
 79 | 
 80 |     def _save_best_checkpoints_file(self, updated_best_checkpoints):
 81 |         with open(self.best_checkpoints_file, 'w') as f:
 82 |             json.dump(updated_best_checkpoints, f, indent=3)
 83 | 
 84 |     def _remove_outdated_checkpoint_files(self, worst_checkpoint):
 85 |         os.remove(worst_checkpoint)
 86 | 
 87 |     def _load_best_checkpoints_file(self):
 88 |         with open(self.best_checkpoints_file, 'r') as f:
 89 |             best_checkpoints = json.load(f)
 90 |         return best_checkpoints
 91 | 
 92 |     def _sort(self, best_checkpoints):
 93 |         best_checkpoints = [
 94 |             (ckpt, best_checkpoints[ckpt])
 95 |             for ckpt in sorted(best_checkpoints,
 96 |                                key=best_checkpoints.get,
 97 |                                reverse=self._maximize)
 98 |         ]
 99 |         return best_checkpoints
100 | 
101 | 
102 | def get_best_checkpoint(best_checkpoint_dir, select_maximum_value=True):
103 |     """
104 |     Returns filepath to the best checkpoint
105 |     Reads the best_checkpoints file in the best_checkpoint_dir directory.
106 |     Returns the filepath in the best_checkpoints file associated with
107 |     the highest value if select_maximum_value is True, or the filepath
108 |     associated with the lowest value if select_maximum_value is False.
109 |     Args:
110 |         best_checkpoint_dir: Directory containing best_checkpoints JSON file
111 |         select_maximum_value: If True, select the filepath associated
112 |           with the highest value.  Otherwise, select the filepath associated
113 |           with the lowest value.
114 |     Returns:
115 |         The full path to the best checkpoint file
116 |     """
117 |     best_checkpoints_file = os.path.join(best_checkpoint_dir, 'best_checkpoints')
118 |     assert os.path.exists(best_checkpoints_file)
119 |     with open(best_checkpoints_file, 'r') as f:
120 |         best_checkpoints = json.load(f)
121 |     best_checkpoints = [
122 |         ckpt for ckpt in sorted(best_checkpoints,
123 |                                 key=best_checkpoints.get,
124 |                                 reverse=select_maximum_value)
125 |     ]
126 |     return os.path.join(os.path.abspath(best_checkpoint_dir),  best_checkpoints[0])


--------------------------------------------------------------------------------
/PyTorch/utils/param_parser.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | 
 4 | def parameter_parser():
 5 |     """
 6 |     A method to parse up command line parameters.
 7 |     The default hyperparameters give good results without cross-validation.
 8 |     """
 9 |     parser = argparse.ArgumentParser(description="Run Model.")
10 | 
11 |     # Data Parameters
12 |     parser.add_argument("--train-file", nargs="?", default="../../data/Train_pairwise_sample.json", help="Training data.")
13 |     parser.add_argument("--validation-file", nargs="?", default="../../data/Validation_pairwise_sample.json", help="Validation data.")
14 |     parser.add_argument("--test-file", nargs="?", default="../../data/Test_pairwise_sample.json", help="Testing data.")
15 |     parser.add_argument("--metadata-file", nargs="?", default="../../data/metadata.tsv",
16 |                         help="Metadata file for embedding visualization.")
17 |     parser.add_argument("--word2vec-file", nargs="?", default="../../data/word2vec_300.txt",
18 |                         help="Word2vec file for embedding characters.")
19 | 
20 |     # Model Hyperparameters
21 |     parser.add_argument("--pad-seq-len", type=list, default=[350, 15, 10], help="Padding Sequence length. (depends on the data)")
22 |     parser.add_argument("--embedding-type", type=int, default=1, help="The embedding type.")
23 |     parser.add_argument("--embedding-dim", type=int, default=300, help="Dimensionality of character embedding.")
24 |     parser.add_argument("--attention-type", nargs="?", default="mlp", help="The attention type. ('normal', 'cosine', 'mlp')")
25 |     parser.add_argument("--attention-dim", type=int, default=200, help="Dimensionality of Attention Neurons.")
26 |     parser.add_argument("--filter-sizes", type=list, default=[3, 5, 7], help="Filter sizes.")
27 |     parser.add_argument("--conv-padding-sizes", type=list, default=[1, 2, 3], help="Padding sizes for Conv Layer.")
28 |     parser.add_argument("--dilation-sizes", type=list, default=[1, 2, 3], help="Dilation sizes for Conv Layer.")
29 |     parser.add_argument("--num-filters", type=list, default=[256, 256, 256], help="Number of filters per filter size.")
30 |     parser.add_argument("--pooling-size", type=int, default=3, help="Pooling sizes. (default: 3)")
31 |     parser.add_argument("--rnn-dim", type=int, default=128, help="Dimensionality for RNN Neurons.")
32 |     parser.add_argument("--rnn-type", nargs="?", default="GRU", help="Type of RNN Cell. ('RNN', 'LSTM', 'GRU')")
33 |     parser.add_argument("--rnn-layers", type=int, default=1, help="Number of RNN Layers.")
34 |     parser.add_argument("--skip-size", type=int, default=3, help="Skip window of Skip-RNN Layers.")
35 |     parser.add_argument("--skip-dim", type=int, default=5, help="Dimensionality for Skip-RNN Layers.")
36 |     parser.add_argument("--fc-dim", type=int, default=512, help="Dimensionality for FC Neurons.")
37 |     parser.add_argument("--dropout-rate", type=float, default=0.5, help="Dropout keep probability.")
38 | 
39 |     # Training Parameters
40 |     parser.add_argument("--epochs", type=int, default=30, help="Number of training epochs.")
41 |     parser.add_argument("--batch-size", type=int, default=32, help="Batch Size.")
42 |     parser.add_argument("--learning-rate", type=float, default=0.001, help="Learning rate.")
43 |     parser.add_argument("--decay-rate", type=float, default=0.95, help="Rate of decay for learning rate.")
44 |     parser.add_argument("--decay-steps", type=int, default=500, help="How many steps before decay learning rate.")
45 |     parser.add_argument("--norm-ratio", type=float, default=1.25,
46 |                         help="The ratio of the sum of gradients norms of trainable variable.")
47 |     parser.add_argument("--l2-lambda", type=float, default=0.0, help="L2 regularization lambda.")
48 |     parser.add_argument("--num-checkpoints", type=int, default=3, help="Number of checkpoints to store.")
49 | 
50 |     return parser.parse_args()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Deep Learning for Question Difficulty Prediction
  2 | [![Python Version](https://img.shields.io/badge/language-python3.6-blue.svg)](https://www.python.org/downloads/) [![Build Status](https://travis-ci.org/RandolphVI/Question-Difficulty-Prediction.svg?branch=master)](https://travis-ci.org/RandolphVI/Question-Difficulty-Prediction) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/c45aac301b244316830b00b9b0985e3e)](https://www.codacy.com/app/chinawolfman/Question-Difficulty-Prediction?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=RandolphVI/Question-Difficulty-Prediction&amp;utm_campaign=Badge_Grade) [![License](https://img.shields.io/github/license/RandolphVI/Question-Difficulty-Prediction.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Issues](https://img.shields.io/github/issues/RandolphVI/Question-Difficulty-Prediction.svg)](https://github.com/RandolphVI/Question-Difficulty-Prediction/issues)
  3 | 
  4 | This repository contains my implementations for question difficulty prediction task.
  5 | 
  6 | The main objective of the project is to predict the difficulty of each given question based on its context materials which include several components (such like document, question and option in English READING problems).
  7 | 
  8 | ## Requirements
  9 | 
 10 | - Python 3.6
 11 | - PyTorch 1.6.0
 12 | - Tensorflow 1.15.0
 13 | - Tensorboard 1.15.0
 14 | - XGBoost 1.2.0
 15 | - Sklearn 0.19.1
 16 | - Numpy 1.16.2
 17 | - Gensim 3.8.3
 18 | - Tqdm 4.49.0
 19 | 
 20 | 
 21 | ## Introduction
 22 | 
 23 | In the widely used standard test, such as **TOEFL** or **SAT**, examinees are often allowed to retake tests and choose higher scores for college admission. This rule brings an important requirement that we should select test papers with consistent difficulties to guarantee the fairness. Therefore, measurements on tests have attracted much attention.
 24 | 
 25 | Among the measurements, one of the most crucial demands is predicting the difficulty of each specific test question, i.e., the percentage of examinees who answer the question wrong. Unfortunately, the ques-
 26 | tion difficulty is not directly observable before the test is conducted, and traditional methods often resort to expertise, such as manual labeling or artificial tests organization. Obviously, these human-based solutions are limited in that they are subjective and labor intensive, and the results could also be biased or misleading (we will illustrate this discovery experimentally). 
 27 | 
 28 | Therefore, it is an urgent issue to automatically predict question difficulty without manual intervention. Fortunately, with abundant tests recorded by automatic test paper marking systems, test logs of examinees and text materials of questions, as the auxiliary information, become more and more available, which benefits a data-driven solution to this Question Difficulty Prediction (QDP) task, especially for the typical English READING problems. For example, a English READING problem contains a document material and  the several corresponding questions, and each question contains  the corresponding options.
 29 | 
 30 | ## Project
 31 | 
 32 | The project structure is below:
 33 | 
 34 | ```text
 35 | .
 36 | ├── TMLA(Traditional Machine Learning Algorithms)
 37 | │   ├── DTR / LR / SVM / XGBoost
 38 | │   └── utils
 39 | ├── TF(TensorFlow)
 40 | │   ├── C-MIDP / R-MIDP / H-MIDP
 41 | │   ├── TARNN
 42 | │   └── utils
 43 | ├── PyTorch
 44 | │   ├── C-MIDP / R-MIDP / H-MIDP
 45 | │   ├── TARNN
 46 | │   │   ├── test_tarnn.py
 47 | │   │   ├── text_tarnn.py
 48 | │   │   └── train_tarnn.py
 49 | │   └── utils
 50 | │       ├── param_parser.py
 51 | │       └── data_helpers.py
 52 | ├── data
 53 | │   ├── word2vec_300.txt [Need Download]
 54 | │   ├── Train / Validation /Test_sample.json
 55 | │   ├── Train / Validation / Test_BOW_sample.json
 56 | │   └── Train / Validation / Test_pairwise_sample.json
 57 | ├── LICENSE
 58 | ├── README.md
 59 | └── requirements.txt
 60 | ```
 61 | 
 62 | ## Data
 63 | 
 64 | See data format in `/data` folder which including the data sample files. For example, `train_sample.json` is like:
 65 | 
 66 | ```json
 67 | {"id": "6", "content": ["year", "ruined", "summer", "vacation-a", "two-week", "vacation", "wife", "family", "cabin", "lake", "northern", "ontario", "located", "boundary", "canada-by", "bringing", "modern", "convenience", "wa", "convenient", "good", "ipad", "admiring", "beauty", "nature", "checked", "e-mail", "paddling", "canoe", "twitter", "feed", "devouring", "great", "amusing", "stuck", "workday", "diet", "newspaper", "morning", "wa", "problem", "wa", "behaving", "office", "sticking", "unending", "news", "cycle", "body", "wa", "vacation", "head", "wasnt", "year", "made", "mind", "social", "medium", "experiment", "reverse", "withdrawal", "internet", "manage", "unplug", "knew", "wouldnt", "easy", "im", "good", "self-denial", "wa", "determined", "started", "physical", "restraint", "handing", "ipad", "wife", "helpfully", "announced", "wa", "read", "book", "club", "inclined", "relinquish", "tablet", "moment", "stroke", "luck", "cell", "phone", "signal", "canadian", "cabin", "wa", "spottier", "past", "making", "attempt", "cheating", "experience", "frustration", "wa", "trapped", "forced", "comply", "good", "intention", "largely", "cut", "e-mail", "twitter", "favorite", "newspaper", "website", "connect", "world", "radio-and", "radio", "listen", "choice", "planned", "read", "book", "experienced", "criminal", "plot", "street", "los", "angeles", "cutthroat", "battle", "cancer", "lab", "psyche", "london", "social", "butterfly", "magazine", "read", "im", "claiming", "cut", "internet", "completely", "day", "biked", "nearest", "town", "reward", "sat", "park", "bench", "front", "public", "library", "wi-fi", "back", "cabin", "suffered", "slow", "dial-up", "connection", "day", "check", "e-mail", "tale", "self-denial", "ha", "happy", "ending-for", "determination", "deep", "breathing", "strong", "support", "wife", "succeeded", "vacation", "struggle", "internet", "realizing", "finally", "wa", "ipad", "wa", "problem", "knew", "passed", "starbucks", "wife", "asked", "wanted", "stop", "wi-fi", "dont", "sound", "pleased", "return", "post-vacation", "situation", "test", "begin", "stay", "wagon", "im", "back", "work", "time", "compulsion", "whats", "overwhelming", "crucial", "livelihood", "intention", "giving", "membership", "cult", "immediacy", "hope", "resist", "temptation", "reflexively", "check", "e-mail", "minute", "lead", "long", "im", "checking", "twitter", "feed", "website", "vacation", "supposed", "reset", "brain", "productive", "hoping", "worked"], "question": ["doe", "underlined", "word", "restraint"], "pos_text": ["calm", "controlled", "behavior"], "neg_text": ["relaxing", "move", "strong", "determination", "unshakable", "faith"], "diff": 0.550373134328}
 68 | ```
 69 | 
 70 | - **"id"**: just the id.
 71 | - **"content"**: the word segment of the content.
 72 | - **"question"**: The word segment of the question.
 73 | - **"pos_text"**: The word segment of the correct option.
 74 | - **"neg_text"**: The word segment of the wrong options.
 75 | - **"diff"**: The difficulty of the question.
 76 | 
 77 | ### Text Segment
 78 | 
 79 | 1. You can use `nltk` package if you are going to deal with the English text data.
 80 | 
 81 | 2. You can use `jieba` package if you are going to deal with the Chinese text data.
 82 | 
 83 | ### Data Format
 84 | 
 85 | This repository can be used in other similiar datasets in two ways:
 86 | 
 87 | 1. Modify your datasets into the same format of [the sample](https://github.com/RandolphVI/Question-Difficulty-Prediction/tree/master/data).
 88 | 2. Modify the data preprocessing code in `data_helpers.py`.
 89 | 
 90 | 
 91 | Anyway, it should depend on what your data and task are.
 92 | 
 93 | ### Pre-trained Word Vectors
 94 | 
 95 | **You can download the [Word2vec model file](https://drive.google.com/open?id=1QQhm6vKdZmEHaVYvuFbA5Yj6RoVlOhzh) (dim=300). Make sure they are unzipped and under the `/data` folder.**
 96 | 
 97 | You can pre-training your word vectors (based on your corpus) in many ways:
 98 | - Use `gensim` package to pre-train data.
 99 | - Use `glove` tools to pre-train data.
100 | - Even can use a **fasttext** network to pre-train data.
101 | 
102 | ## Usage
103 | 
104 | See [Usage-TF](https://github.com/RandolphVI/Question-Difficulty-Prediction/blob/master/Usage-TF.md) & [Usage-PyTorch](https://github.com/RandolphVI/Question-Difficulty-Prediction/blob/master/Usage-PyTorch.md).
105 | 
106 | ## Network Structure
107 | 
108 | Specifically, given the abundant historical test logs and text materials of question (including document, questions and options), we first design a LSTM-based architecture to extract sentence representations for the text materials. Then, we utilize an attention strategy to qualify the difficulty contribution of 1) each word in document to questions, and 2) each word in option to questions.
109 | 
110 | Considering the incomparability of question difficulties in different tests, we propose a test-dependent pairwise strategy for training TARNN and generating the difficulty prediction value.
111 | 
112 | ![](https://farm8.staticflickr.com/7846/33643949658_9599454fdf_o.png)
113 | 
114 | The framework of TARNN:
115 | 
116 | 1. The **Input Layer** comprises document representation (TD), question representation (TQ) and option representation (TO). 
117 | 2. The **Bi-LSTM Layer** learns the deep comparable semantic representations for text materials. 
118 | 3. The **Attention Layer** extracts words of the document (or the option) with high scores as dominant information for a specific question, which is helpful for visualizing the model and improving the performance.
119 | 4. Finally the **Prediction Layer** shows predicted difficulty scores of the given READING problem.
120 | 
121 | ## Reference
122 | 
123 | **If you want to follow the paper or utilize the code, please note the following info in your work:** 
124 | 
125 | - **Model C-MIDP/R-MIDP/H-MIDP**
126 | 
127 | ```bibtex
128 | @article{佟威2019数据驱动的数学试题难度预测,
129 |   author    = {佟威 and
130 |                汪飞 and
131 |                刘淇 and
132 |                陈恩红},
133 |   title     = {数据驱动的数学试题难度预测},
134 |   journal   = {计算机研究与发展},
135 |   pages     = {1007--1019},
136 |   year      = {2019},
137 | }
138 | ```
139 | 
140 | - **Model TARNN** (modified by TACNN)
141 | 
142 | ```bibtex
143 | @inproceedings{huang2017question,
144 |   author    = {Zhenya Huang and
145 |                Qi Liu and
146 |                Enchong Chen and
147 |                Hongke Zhao and
148 |                Mingyong Gao and
149 |                Si Wei and
150 |                Yu Su and
151 |                Guoping Hu},
152 |   title     = {Question Difficulty Prediction for READING Problems in Standard Tests},
153 |   booktitle = {Thirty-First AAAI Conference on Artificial Intelligence},
154 |   year      = {2017},
155 | }
156 | ```
157 | 
158 | ## About Me
159 | 
160 | 黄威，Randolph
161 | 
162 | SCU SE Bachelor; USTC CS Ph.D.
163 | 
164 | Email: chinawolfman@hotmail.com
165 | 
166 | My Blog: [randolph.pro](http://randolph.pro)
167 | 
168 | LinkedIn: [randolph's linkedin](https://www.linkedin.com/in/randolph-%E9%BB%84%E5%A8%81/)


--------------------------------------------------------------------------------
/TF/C-MIDP/test_cmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from sklearn.metrics import mean_squared_error, r2_score
 17 | 
 18 | args = parser.parameter_parser()
 19 | MODEL = dh.get_model_name()
 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime()))
 21 | 
 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/'
 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/'
 24 | SAVE_DIR = 'output/' + MODEL
 25 | 
 26 | 
 27 | def test_cmidp():
 28 |     """Test CMIDP model."""
 29 |     # Print parameters used for the model
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load data
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False)
 36 | 
 37 |     logger.info("Data padding...")
 38 |     x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len)
 39 | 
 40 |     # Load cmidp model
 41 |     OPTION = dh.option(pattern=1)
 42 |     if OPTION == 'B':
 43 |         logger.info("Loading best model...")
 44 |         checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True)
 45 |     else:
 46 |         logger.info("Loading latest model...")
 47 |         checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
 48 |     logger.info(checkpoint_file)
 49 | 
 50 |     graph = tf.Graph()
 51 |     with graph.as_default():
 52 |         session_conf = tf.ConfigProto(
 53 |             allow_soft_placement=args.allow_soft_placement,
 54 |             log_device_placement=args.log_device_placement)
 55 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 56 |         sess = tf.Session(config=session_conf)
 57 |         with sess.as_default():
 58 |             # Load the saved meta graph and restore variables
 59 |             saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
 60 |             saver.restore(sess, checkpoint_file)
 61 | 
 62 |             # Get the placeholders from the graph by name
 63 |             input_x_content = graph.get_operation_by_name("input_x_content").outputs[0]
 64 |             input_x_question = graph.get_operation_by_name("input_x_question").outputs[0]
 65 |             input_x_option = graph.get_operation_by_name("input_x_option").outputs[0]
 66 |             input_y = graph.get_operation_by_name("input_y").outputs[0]
 67 |             dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
 68 |             is_training = graph.get_operation_by_name("is_training").outputs[0]
 69 | 
 70 |             # Tensors we want to evaluate
 71 |             scores = graph.get_operation_by_name("output/scores").outputs[0]
 72 |             loss = graph.get_operation_by_name("loss/loss").outputs[0]
 73 | 
 74 |             # Split the output nodes name by '|' if you have several output nodes
 75 |             output_node_names = "output/scores"
 76 | 
 77 |             # Save the .pb model file
 78 |             output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
 79 |                                                                             output_node_names.split("|"))
 80 |             tf.train.write_graph(output_graph_def, "graph", "graph-cmidp-{0}.pb".format(MODEL), as_text=False)
 81 | 
 82 |             # Generate batches for one epoch
 83 |             batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)),
 84 |                                     args.batch_size, 1, shuffle=False)
 85 | 
 86 |             test_counter, test_loss = 0, 0.0
 87 | 
 88 |             # Collect the predictions here
 89 |             true_labels = []
 90 |             predicted_scores = []
 91 | 
 92 |             for batch_test in batches:
 93 |                 x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test)
 94 |                 feed_dict = {
 95 |                     input_x_content: x_batch_content,
 96 |                     input_x_question: x_batch_question,
 97 |                     input_x_option: x_batch_option,
 98 |                     input_y: y_batch,
 99 |                     dropout_keep_prob: 1.0,
100 |                     is_training: False
101 |                 }
102 |                 batch_scores, cur_loss = sess.run([scores, loss], feed_dict)
103 | 
104 |                 # Prepare for calculating metrics
105 |                 for i in y_batch:
106 |                     true_labels.append(i)
107 |                 for j in batch_scores:
108 |                     predicted_scores.append(j)
109 | 
110 |                 test_loss = test_loss + cur_loss
111 |                 test_counter = test_counter + 1
112 | 
113 |             # Calculate PCC & DOA
114 |             pcc, doa = dh.evaluation(true_labels, predicted_scores)
115 |             # Calculate RMSE
116 |             rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
117 |             r2 = r2_score(true_labels, predicted_scores)
118 | 
119 |             test_loss = float(test_loss / test_counter)
120 | 
121 |             logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
122 |                         .format(test_loss, pcc, doa, rmse, r2))
123 | 
124 |             # Save the prediction result
125 |             if not os.path.exists(SAVE_DIR):
126 |                 os.makedirs(SAVE_DIR)
127 |             dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id,
128 |                                       all_labels=true_labels, all_predict_scores=predicted_scores)
129 | 
130 |     logger.info("All Done.")
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     test_cmidp()
135 | 


--------------------------------------------------------------------------------
/TF/C-MIDP/text_cmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | 
  7 | 
  8 | class TextCMIDP(object):
  9 |     """A CMIDP for text classification."""
 10 | 
 11 |     def __init__(
 12 |             self, sequence_length, vocab_size, embedding_type, embedding_size, filter_sizes,
 13 |             num_filters, pooling_size, fc_hidden_size, l2_reg_lambda=0.0, pretrained_embedding=None):
 14 | 
 15 |         # Placeholders for input, output, dropout_prob and training_tag
 16 |         self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content")
 17 |         self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question")
 18 |         self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option")
 19 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
 20 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 21 |         self.is_training = tf.placeholder(tf.bool, name="is_training")
 22 | 
 23 |         self.global_step = tf.Variable(0, trainable=False, name="Global_Step")
 24 | 
 25 |         def _fc_layer(input_x, name=""):
 26 |             """
 27 |             Fully Connected Layer.
 28 |             Args:
 29 |                 input_x:
 30 |                 name: Scope name
 31 |             Returns:
 32 |                 [batch_size, fc_hidden_size]
 33 |             """
 34 |             with tf.name_scope(name + "fc"):
 35 |                 num_units = input_x.get_shape().as_list()[-1]
 36 |                 W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size],
 37 |                                                     stddev=0.1, dtype=tf.float32), name="W")
 38 |                 b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b")
 39 |                 fc = tf.nn.xw_plus_b(input_x, W, b)
 40 |                 fc_out = tf.nn.relu(fc)
 41 |             return fc_out
 42 | 
 43 |         def _convolution(input_, pool_size, layer_cnt):
 44 |             index = layer_cnt - 1
 45 |             with tf.name_scope("conv{0}".format(layer_cnt)):
 46 |                 # Padding Zero
 47 |                 new_input = tf.pad(input_, np.array([[0, 0], [filter_sizes[index] - 1, filter_sizes[index] - 1],
 48 |                                                      [0, 0], [0, 0]]), mode="CONSTANT")
 49 |                 width_size = new_input.get_shape().as_list()[-2]
 50 | 
 51 |                 # Convolution Layer
 52 |                 filter_shape = [filter_sizes[index], width_size, 1, num_filters[index]]
 53 |                 W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W")
 54 |                 b = tf.Variable(tf.constant(value=0.1, shape=[num_filters[index]], dtype=tf.float32), name="b")
 55 |                 conv = tf.nn.conv2d(
 56 |                     new_input,
 57 |                     W,
 58 |                     strides=[1, 1, 1, 1],
 59 |                     padding="VALID",
 60 |                     name="conv")
 61 | 
 62 |                 conv = tf.nn.bias_add(conv, b)
 63 | 
 64 |                 # Apply nonlinearity
 65 |                 conv_out = tf.nn.relu(conv, name="relu")
 66 | 
 67 |             with tf.name_scope("pool{0}".format(layer_cnt)):
 68 |                 # Maxpooling over the outputs
 69 |                 pooled = tf.nn.max_pool(
 70 |                     conv_out,
 71 |                     ksize=[1, pool_size, 1, 1],
 72 |                     strides=[1, pool_size, 1, 1],
 73 |                     padding="VALID",
 74 |                     name="pool")
 75 |             return pooled
 76 | 
 77 |         # Embedding Layer
 78 |         with tf.device("/cpu:0"), tf.name_scope("embedding"):
 79 |             # Use random generated the word vector by default
 80 |             # Can also be obtained through our own word vectors trained by our corpus
 81 |             if pretrained_embedding is None:
 82 |                 self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0,
 83 |                                                                dtype=tf.float32), trainable=True, name="embedding")
 84 |             else:
 85 |                 if embedding_type == 0:
 86 |                     self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding")
 87 |                 if embedding_type == 1:
 88 |                     self.embedding = tf.Variable(pretrained_embedding, trainable=True,
 89 |                                                  dtype=tf.float32, name="embedding")
 90 |             # [batch_size, sequence_length, embedding_size]
 91 |             self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content)
 92 |             self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question)
 93 |             self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option)
 94 | 
 95 |         sequence_length_total = sequence_length[0] + sequence_length[1] + sequence_length[2]
 96 |         # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size]
 97 |         self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question,
 98 |                                                self.embedded_sentence_option], axis=1)
 99 |         self.embedded_sentence_expanded = tf.expand_dims(self.embedded_sentence_all, axis=-1)
100 | 
101 |         # Convolution Layer 1
102 |         # conv1_out: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], 1, num_filters[0]]
103 |         self.conv1_out = _convolution(self.embedded_sentence_expanded, pool_size=pooling_size, layer_cnt=1)
104 |         # conv1_out_trans: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], num_filters[0], 1]
105 |         self.conv1_out_trans = tf.transpose(self.conv1_out, perm=[0, 1, 3, 2])
106 | 
107 |         # Convolution Layer 2
108 |         new_pooling_size = (sequence_length_total + filter_sizes[0] - 1) // pooling_size
109 |         self.conv2_out = _convolution(self.conv1_out_trans, pool_size=new_pooling_size, layer_cnt=2)
110 | 
111 |         self.conv_final_flat = tf.reshape(self.conv2_out, shape=[-1, num_filters[1]])
112 | 
113 |         # Fully Connected Layer
114 |         self.fc_out = _fc_layer(self.conv_final_flat)
115 | 
116 |         # Add dropout
117 |         with tf.name_scope("dropout"):
118 |             self.fc_drop = tf.nn.dropout(self.fc_out, self.dropout_keep_prob)
119 | 
120 |         # Final scores
121 |         with tf.name_scope("output"):
122 |             W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1],
123 |                                                 stddev=0.1, dtype=tf.float32), name="W")
124 |             b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b")
125 |             self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits")
126 |             self.scores = tf.sigmoid(self.logits, name="scores")
127 | 
128 |         # Calculate mean cross-entropy loss, L2 loss
129 |         with tf.name_scope("loss"):
130 |             losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses")
131 |             l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()],
132 |                                  name="l2_losses") * l2_reg_lambda
133 |             self.loss = tf.add(losses, l2_losses, name="loss")
134 | 


--------------------------------------------------------------------------------
/TF/C-MIDP/train_cmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from text_cmidp import TextCMIDP
 14 | from utils import checkmate as cm
 15 | from utils import data_helpers as dh
 16 | from utils import param_parser as parser
 17 | from tensorboard.plugins import projector
 18 | from sklearn.metrics import mean_squared_error, r2_score
 19 | 
 20 | args = parser.parameter_parser()
 21 | OPTION = dh.option(pattern=0)
 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 23 | 
 24 | 
 25 | def train_cmidp():
 26 |     """Training CMIDP model."""
 27 |     # Print parameters used for the model
 28 |     dh.tab_printer(args, logger)
 29 | 
 30 |     # Load sentences, labels, and training parameters
 31 |     logger.info("Loading data...")
 32 |     logger.info("Data processing...")
 33 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False)
 34 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False)
 35 | 
 36 |     logger.info("Data padding...")
 37 |     x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len)
 38 |     x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len)
 39 | 
 40 |     # Build vocabulary
 41 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 42 | 
 43 |     # Build a graph and cmidp object
 44 |     with tf.Graph().as_default():
 45 |         session_conf = tf.ConfigProto(
 46 |             allow_soft_placement=args.allow_soft_placement,
 47 |             log_device_placement=args.log_device_placement)
 48 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 49 |         sess = tf.Session(config=session_conf)
 50 |         with sess.as_default():
 51 |             cmidp = TextCMIDP(
 52 |                 sequence_length=args.pad_seq_len,
 53 |                 vocab_size=VOCAB_SIZE,
 54 |                 embedding_type=args.embedding_type,
 55 |                 embedding_size=EMBEDDING_SIZE,
 56 |                 filter_sizes=args.filter_sizes,
 57 |                 num_filters=args.num_filters,
 58 |                 pooling_size=args.pooling_size,
 59 |                 fc_hidden_size=args.fc_dim,
 60 |                 l2_reg_lambda=args.l2_lambda,
 61 |                 pretrained_embedding=pretrained_word2vec_matrix)
 62 | 
 63 |             # Define training procedure
 64 |             with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
 65 |                 learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate,
 66 |                                                            global_step=cmidp.global_step, decay_steps=args.decay_steps,
 67 |                                                            decay_rate=args.decay_rate, staircase=True)
 68 |                 optimizer = tf.train.AdamOptimizer(learning_rate)
 69 |                 grads, vars = zip(*optimizer.compute_gradients(cmidp.loss))
 70 |                 grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio)
 71 |                 train_op = optimizer.apply_gradients(zip(grads, vars), global_step=cmidp.global_step, name="train_op")
 72 | 
 73 |             # Keep track of gradient values and sparsity (optional)
 74 |             grad_summaries = []
 75 |             for g, v in zip(grads, vars):
 76 |                 if g is not None:
 77 |                     grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
 78 |                     sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 79 |                     grad_summaries.append(grad_hist_summary)
 80 |                     grad_summaries.append(sparsity_summary)
 81 |             grad_summaries_merged = tf.summary.merge(grad_summaries)
 82 | 
 83 |             # Output directory for models and summaries
 84 |             out_dir = dh.get_out_dir(OPTION, logger)
 85 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
 86 |             best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))
 87 | 
 88 |             # Summaries for loss
 89 |             loss_summary = tf.summary.scalar("loss", cmidp.loss)
 90 | 
 91 |             # Train summaries
 92 |             train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
 93 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
 94 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
 95 | 
 96 |             # Validation summaries
 97 |             validation_summary_op = tf.summary.merge([loss_summary])
 98 |             validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
 99 |             validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)
100 | 
101 |             saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints)
102 |             best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False)
103 | 
104 |             if OPTION == 'R':
105 |                 # Load cmidp model
106 |                 logger.info("Loading model...")
107 |                 checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
108 |                 logger.info(checkpoint_file)
109 | 
110 |                 # Load the saved meta graph and restore variables
111 |                 saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
112 |                 saver.restore(sess, checkpoint_file)
113 |             if OPTION == 'T':
114 |                 if not os.path.exists(checkpoint_dir):
115 |                     os.makedirs(checkpoint_dir)
116 |                 sess.run(tf.global_variables_initializer())
117 |                 sess.run(tf.local_variables_initializer())
118 | 
119 |                 # Embedding visualization config
120 |                 config = projector.ProjectorConfig()
121 |                 embedding_conf = config.embeddings.add()
122 |                 embedding_conf.tensor_name = "embedding"
123 |                 embedding_conf.metadata_path = args.metadata_file
124 | 
125 |                 projector.visualize_embeddings(train_summary_writer, config)
126 |                 projector.visualize_embeddings(validation_summary_writer, config)
127 | 
128 |                 # Save the embedding visualization
129 |                 saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))
130 | 
131 |             current_step = sess.run(cmidp.global_step)
132 | 
133 |             def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch):
134 |                 """A single training step"""
135 |                 feed_dict = {
136 |                     cmidp.input_x_content: x_batch_content,
137 |                     cmidp.input_x_question: x_batch_question,
138 |                     cmidp.input_x_option: x_batch_option,
139 |                     cmidp.input_y: y_batch,
140 |                     cmidp.dropout_keep_prob: args.dropout_rate,
141 |                     cmidp.is_training: True
142 |                 }
143 |                 _, step, summaries, loss = sess.run(
144 |                     [train_op, cmidp.global_step, train_summary_op, cmidp.loss], feed_dict)
145 |                 logger.info("step {0}: loss {1:g}".format(step, loss))
146 |                 train_summary_writer.add_summary(summaries, step)
147 | 
148 |             def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None):
149 |                 """Evaluates model on a validation set"""
150 |                 batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)),
151 |                                                    args.batch_size, 1)
152 | 
153 |                 eval_counter, eval_loss = 0, 0.0
154 | 
155 |                 true_labels = []
156 |                 predicted_scores = []
157 | 
158 |                 for batch_validation in batches_validation:
159 |                     x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation)
160 |                     feed_dict = {
161 |                         cmidp.input_x_content: x_batch_content,
162 |                         cmidp.input_x_question: x_batch_question,
163 |                         cmidp.input_x_option: x_batch_option,
164 |                         cmidp.input_y: y_batch,
165 |                         cmidp.dropout_keep_prob: 1.0,
166 |                         cmidp.is_training: False
167 |                     }
168 |                     step, summaries, scores, cur_loss = sess.run(
169 |                         [cmidp.global_step, validation_summary_op, cmidp.scores, cmidp.loss], feed_dict)
170 | 
171 |                     # Prepare for calculating metrics
172 |                     for i in y_batch:
173 |                         true_labels.append(i)
174 |                     for j in scores:
175 |                         predicted_scores.append(j)
176 | 
177 |                     eval_loss = eval_loss + cur_loss
178 |                     eval_counter = eval_counter + 1
179 | 
180 |                     if writer:
181 |                         writer.add_summary(summaries, step)
182 | 
183 |                 eval_loss = float(eval_loss / eval_counter)
184 | 
185 |                 # Calculate PCC & DOA
186 |                 pcc, doa = dh.evaluation(true_labels, predicted_scores)
187 |                 # Calculate RMSE
188 |                 rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
189 |                 r2 = r2_score(true_labels, predicted_scores)
190 | 
191 |                 return eval_loss, pcc, doa, rmse, r2
192 | 
193 |             # Generate batches
194 |             batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)),
195 |                                           args.batch_size, args.epochs)
196 | 
197 |             num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1
198 | 
199 |             # Training loop. For each batch...
200 |             for batch_train in batches_train:
201 |                 x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train)
202 |                 train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train)
203 |                 current_step = tf.train.global_step(sess, cmidp.global_step)
204 | 
205 |                 if current_step % args.evaluate_steps == 0:
206 |                     logger.info("\nEvaluation:")
207 |                     eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val,
208 |                                                                     writer=validation_summary_writer)
209 |                     logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
210 |                                 .format(eval_loss, pcc, doa, rmse, r2))
211 |                     best_saver.handle(rmse, sess, current_step)
212 |                 if current_step % args.checkpoint_steps == 0:
213 |                     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
214 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
215 |                     logger.info("Saved model checkpoint to {0}\n".format(path))
216 |                 if current_step % num_batches_per_epoch == 0:
217 |                     current_epoch = current_step // num_batches_per_epoch
218 |                     logger.info("Epoch {0} has finished!".format(current_epoch))
219 | 
220 |     logger.info("All Done.")
221 | 
222 | 
223 | if __name__ == '__main__':
224 |     train_cmidp()


--------------------------------------------------------------------------------
/TF/H-MIDP/test_hmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from sklearn.metrics import mean_squared_error, r2_score
 17 | 
 18 | args = parser.parameter_parser()
 19 | MODEL = dh.get_model_name()
 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime()))
 21 | 
 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/'
 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/'
 24 | SAVE_DIR = 'output/' + MODEL
 25 | 
 26 | 
 27 | def test_hmidp():
 28 |     """Test HMIDP model."""
 29 |     # Print parameters used for the model
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load data
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False)
 36 | 
 37 |     logger.info("Data padding...")
 38 |     x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len)
 39 | 
 40 |     # Load hmidp model
 41 |     OPTION = dh.option(pattern=1)
 42 |     if OPTION == 'B':
 43 |         logger.info("Loading best model...")
 44 |         checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True)
 45 |     else:
 46 |         logger.info("Loading latest model...")
 47 |         checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
 48 |     logger.info(checkpoint_file)
 49 | 
 50 |     graph = tf.Graph()
 51 |     with graph.as_default():
 52 |         session_conf = tf.ConfigProto(
 53 |             allow_soft_placement=args.allow_soft_placement,
 54 |             log_device_placement=args.log_device_placement)
 55 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 56 |         sess = tf.Session(config=session_conf)
 57 |         with sess.as_default():
 58 |             # Load the saved meta graph and restore variables
 59 |             saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
 60 |             saver.restore(sess, checkpoint_file)
 61 | 
 62 |             # Get the placeholders from the graph by name
 63 |             input_x_content = graph.get_operation_by_name("input_x_content").outputs[0]
 64 |             input_x_question = graph.get_operation_by_name("input_x_question").outputs[0]
 65 |             input_x_option = graph.get_operation_by_name("input_x_option").outputs[0]
 66 |             input_y = graph.get_operation_by_name("input_y").outputs[0]
 67 |             dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
 68 |             is_training = graph.get_operation_by_name("is_training").outputs[0]
 69 | 
 70 |             # Tensors we want to evaluate
 71 |             scores = graph.get_operation_by_name("output/scores").outputs[0]
 72 |             loss = graph.get_operation_by_name("loss/loss").outputs[0]
 73 | 
 74 |             # Split the output nodes name by '|' if you have several output nodes
 75 |             output_node_names = "output/scores"
 76 | 
 77 |             # Save the .pb model file
 78 |             output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
 79 |                                                                             output_node_names.split("|"))
 80 |             tf.train.write_graph(output_graph_def, "graph", "graph-hmidp-{0}.pb".format(MODEL), as_text=False)
 81 | 
 82 |             # Generate batches for one epoch
 83 |             batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)),
 84 |                                     args.batch_size, 1, shuffle=False)
 85 | 
 86 |             test_counter, test_loss = 0, 0.0
 87 | 
 88 |             # Collect the predictions here
 89 |             true_labels = []
 90 |             predicted_scores = []
 91 | 
 92 |             for batch_test in batches:
 93 |                 x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test)
 94 |                 feed_dict = {
 95 |                     input_x_content: x_batch_content,
 96 |                     input_x_question: x_batch_question,
 97 |                     input_x_option: x_batch_option,
 98 |                     input_y: y_batch,
 99 |                     dropout_keep_prob: 1.0,
100 |                     is_training: False
101 |                 }
102 |                 batch_scores, cur_loss = sess.run([scores, loss], feed_dict)
103 | 
104 |                 # Prepare for calculating metrics
105 |                 for i in y_batch:
106 |                     true_labels.append(i)
107 |                 for j in batch_scores:
108 |                     predicted_scores.append(j)
109 | 
110 |                 test_loss = test_loss + cur_loss
111 |                 test_counter = test_counter + 1
112 | 
113 |             # Calculate PCC & DOA
114 |             pcc, doa = dh.evaluation(true_labels, predicted_scores)
115 |             # Calculate RMSE
116 |             rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
117 |             r2 = r2_score(true_labels, predicted_scores)
118 | 
119 |             test_loss = float(test_loss / test_counter)
120 | 
121 |             logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
122 |                         .format(test_loss, pcc, doa, rmse, r2))
123 | 
124 |             # Save the prediction result
125 |             if not os.path.exists(SAVE_DIR):
126 |                 os.makedirs(SAVE_DIR)
127 |             dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id,
128 |                                       all_labels=true_labels, all_predict_scores=predicted_scores)
129 | 
130 |     logger.info("All Done.")
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     test_hmidp()
135 | 


--------------------------------------------------------------------------------
/TF/H-MIDP/text_hmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | 
  7 | 
  8 | class TextHMIDP(object):
  9 |     """A HMIDP for text classification."""
 10 | 
 11 |     def __init__(
 12 |             self, sequence_length, vocab_size, embedding_type, embedding_size, filter_sizes, num_filters,
 13 |             pooling_size, rnn_hidden_size, rnn_type, rnn_layers, fc_hidden_size, l2_reg_lambda=0.0,
 14 |             pretrained_embedding=None):
 15 | 
 16 |         # Placeholders for input, output, dropout_prob and training_tag
 17 |         self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content")
 18 |         self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question")
 19 |         self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option")
 20 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
 21 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 22 |         self.is_training = tf.placeholder(tf.bool, name="is_training")
 23 | 
 24 |         self.global_step = tf.Variable(0, trainable=False, name="Global_Step")
 25 | 
 26 |         def _get_rnn_cell(rnn_hidden_size, rnn_type):
 27 |             if rnn_type == 'RNN':
 28 |                 return tf.nn.rnn_cell.BasicRNNCell(rnn_hidden_size)
 29 |             if rnn_type == 'LSTM':
 30 |                 return tf.nn.rnn_cell.BasicLSTMCell(rnn_hidden_size)
 31 |             if rnn_type == 'GRU':
 32 |                 return tf.nn.rnn_cell.GRUCell(rnn_hidden_size)
 33 | 
 34 |         def _convolution(input_, pool_size, layer_cnt):
 35 |             index = layer_cnt - 1
 36 |             with tf.name_scope("conv{0}".format(layer_cnt)):
 37 |                 # Padding Zero
 38 |                 new_input = tf.pad(input_, np.array([[0, 0], [filter_sizes[index] - 1, filter_sizes[index] - 1],
 39 |                                                      [0, 0], [0, 0]]), mode="CONSTANT")
 40 |                 width_size = new_input.get_shape().as_list()[-2]
 41 | 
 42 |                 # Convolution Layer
 43 |                 filter_shape = [filter_sizes[index], width_size, 1, num_filters[index]]
 44 |                 W = tf.Variable(tf.truncated_normal(shape=filter_shape, stddev=0.1, dtype=tf.float32), name="W")
 45 |                 b = tf.Variable(tf.constant(value=0.1, shape=[num_filters[index]], dtype=tf.float32), name="b")
 46 |                 conv = tf.nn.conv2d(
 47 |                     new_input,
 48 |                     W,
 49 |                     strides=[1, 1, 1, 1],
 50 |                     padding="VALID",
 51 |                     name="conv")
 52 | 
 53 |                 conv = tf.nn.bias_add(conv, b)
 54 | 
 55 |                 # Apply nonlinearity
 56 |                 conv_out = tf.nn.relu(conv, name="relu")
 57 | 
 58 |             with tf.name_scope("pool{0}".format(layer_cnt)):
 59 |                 # Maxpooling over the outputs
 60 |                 pooled = tf.nn.max_pool(
 61 |                     conv_out,
 62 |                     ksize=[1, pool_size, 1, 1],
 63 |                     strides=[1, pool_size, 1, 1],
 64 |                     padding="VALID",
 65 |                     name="pool")
 66 |             return pooled
 67 | 
 68 |         def _bi_rnn_layer(input_x, name=""):
 69 |             # Bi-RNN Layer
 70 |             with tf.variable_scope(name + "Bi_rnn", reuse=tf.AUTO_REUSE):
 71 |                 fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type)
 72 |                                                            for _ in range(rnn_layers)])
 73 |                 bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type)
 74 |                                                            for _ in range(rnn_layers)])
 75 |                 if self.dropout_keep_prob is not None:
 76 |                     fw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(fw_rnn_cell, output_keep_prob=self.dropout_keep_prob)
 77 |                     bw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(bw_rnn_cell, output_keep_prob=self.dropout_keep_prob)
 78 | 
 79 |                 # Creates a dynamic bidirectional recurrent neural network
 80 |                 # shape of `outputs`: tuple -> (outputs_fw, outputs_bw)
 81 |                 # shape of `outputs_fw`: [batch_size, sequence_length, rnn_hidden_size]
 82 | 
 83 |                 # shape of `state`: tuple -> (outputs_state_fw, output_state_bw)
 84 |                 # shape of `outputs_state_fw`: tuple -> (c, h) c: memory cell; h: hidden state
 85 |                 outputs, state = tf.nn.bidirectional_dynamic_rnn(fw_rnn_cell, bw_rnn_cell, input_x, dtype=tf.float32)
 86 | 
 87 |             # Concat output
 88 |             # [batch_size, sequence_length, rnn_hidden_size * 2]
 89 |             rnn_out = tf.concat(outputs, axis=2, name=name + "rnn_out")
 90 | 
 91 |             # [batch_size, rnn_hidden_size * 2]
 92 |             rnn_pooled = tf.reduce_max(rnn_out, axis=1, name=name + "rnn_pooled")
 93 | 
 94 |             return rnn_pooled
 95 | 
 96 |         def _fc_layer(input_x, name=""):
 97 |             """
 98 |             Fully Connected Layer.
 99 |             Args:
100 |                 input_x:
101 |                 name: Scope name
102 |             Returns:
103 |                 [batch_size, fc_hidden_size]
104 |             """
105 |             with tf.name_scope(name + "fc"):
106 |                 num_units = input_x.get_shape().as_list()[-1]
107 |                 W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size],
108 |                                                     stddev=0.1, dtype=tf.float32), name="W")
109 |                 b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b")
110 |                 fc = tf.nn.xw_plus_b(input_x, W, b)
111 |                 fc_out = tf.nn.relu(fc)
112 |             return fc_out
113 | 
114 |         # Embedding Layer
115 |         with tf.device("/cpu:0"), tf.name_scope("embedding"):
116 |             # Use random generated the word vector by default
117 |             # Can also be obtained through our own word vectors trained by our corpus
118 |             if pretrained_embedding is None:
119 |                 self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0,
120 |                                                                dtype=tf.float32), trainable=True, name="embedding")
121 |             else:
122 |                 if embedding_type == 0:
123 |                     self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding")
124 |                 if embedding_type == 1:
125 |                     self.embedding = tf.Variable(pretrained_embedding, trainable=True,
126 |                                                  dtype=tf.float32, name="embedding")
127 |             # [batch_size, sequence_length, embedding_size]
128 |             self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content)
129 |             self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question)
130 |             self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option)
131 | 
132 |         sequence_length_total = sequence_length[0] + sequence_length[1] + sequence_length[2]
133 |         # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size]
134 |         self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question,
135 |                                                self.embedded_sentence_option], axis=1)
136 |         self.embedded_sentence_expanded = tf.expand_dims(self.embedded_sentence_all, axis=-1)
137 | 
138 |         # Convolution Layer 1
139 |         # conv1_out: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], 1, num_filters[0]]
140 |         self.conv1_out = _convolution(self.embedded_sentence_expanded, pool_size=pooling_size, layer_cnt=1)
141 |         # conv1_out_trans: [batch_size, sequence_len + filter_sizes[0] -1 / pooling_size[0], num_filters[0], 1]
142 |         self.conv1_out_trans = tf.transpose(self.conv1_out, perm=[0, 1, 3, 2])
143 | 
144 |         # Convolution Layer 2
145 |         new_pooling_size = (sequence_length_total + filter_sizes[0] - 1) // pooling_size
146 |         self.conv2_out = _convolution(self.conv1_out_trans, pool_size=new_pooling_size, layer_cnt=2)
147 |         self.conv_final_flat = tf.reshape(self.conv2_out, shape=[-1, num_filters[1]])
148 | 
149 |         # Bi-RNN Layer
150 |         # bi_rnn_out: [batch_size, rnn_hidden_size * 2]
151 |         self.bi_rnn_out = _bi_rnn_layer(self.embedded_sentence_all, name="total_")
152 | 
153 |         # Concat
154 |         self.conv_rnn_concat = tf.concat([self.conv_final_flat, self.bi_rnn_out], axis=1)
155 | 
156 |         # Fully Connected Layer 1
157 |         self.fc1_out = _fc_layer(self.conv_rnn_concat)
158 | 
159 |         # Fully Connected Layer 2
160 |         self.fc2_out = _fc_layer(self.fc1_out)
161 | 
162 |         # Add dropout
163 |         with tf.name_scope("dropout"):
164 |             self.fc_drop = tf.nn.dropout(self.fc2_out, self.dropout_keep_prob)
165 | 
166 |         # Final scores
167 |         with tf.name_scope("output"):
168 |             W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1],
169 |                                                 stddev=0.1, dtype=tf.float32), name="W")
170 |             b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b")
171 |             self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits")
172 |             self.scores = tf.sigmoid(self.logits, name="scores")
173 | 
174 |         # Calculate mean cross-entropy loss, L2 loss
175 |         with tf.name_scope("loss"):
176 |             losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses")
177 |             l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()],
178 |                                  name="l2_losses") * l2_reg_lambda
179 |             self.loss = tf.add(losses, l2_losses, name="loss")
180 | 
181 | 


--------------------------------------------------------------------------------
/TF/H-MIDP/train_hmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from text_hmidp import TextHMIDP
 14 | from utils import checkmate as cm
 15 | from utils import data_helpers as dh
 16 | from utils import param_parser as parser
 17 | from tensorboard.plugins import projector
 18 | from sklearn.metrics import mean_squared_error, r2_score
 19 | 
 20 | args = parser.parameter_parser()
 21 | OPTION = dh.option(pattern=0)
 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 23 | 
 24 | 
 25 | def train_hmidp():
 26 |     """Training HMIDP model."""
 27 |     # Print parameters used for the model
 28 |     dh.tab_printer(args, logger)
 29 | 
 30 |     # Load sentences, labels, and training parameters
 31 |     logger.info("Loading data...")
 32 |     logger.info("Data processing...")
 33 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False)
 34 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False)
 35 | 
 36 |     logger.info("Data padding...")
 37 |     x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len)
 38 |     x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len)
 39 | 
 40 |     # Build vocabulary
 41 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 42 | 
 43 |     # Build a graph and hmidp object
 44 |     with tf.Graph().as_default():
 45 |         session_conf = tf.ConfigProto(
 46 |             allow_soft_placement=args.allow_soft_placement,
 47 |             log_device_placement=args.log_device_placement)
 48 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 49 |         sess = tf.Session(config=session_conf)
 50 |         with sess.as_default():
 51 |             hmidp = TextHMIDP(
 52 |                 sequence_length=args.pad_seq_len,
 53 |                 vocab_size=VOCAB_SIZE,
 54 |                 embedding_type=args.embedding_type,
 55 |                 embedding_size=EMBEDDING_SIZE,
 56 |                 filter_sizes=args.filter_sizes,
 57 |                 num_filters=args.num_filters,
 58 |                 pooling_size=args.pooling_size,
 59 |                 rnn_hidden_size=args.rnn_dim,
 60 |                 rnn_type=args.rnn_type,
 61 |                 rnn_layers=args.rnn_layers,
 62 |                 fc_hidden_size=args.fc_dim,
 63 |                 l2_reg_lambda=args.l2_lambda,
 64 |                 pretrained_embedding=pretrained_word2vec_matrix)
 65 | 
 66 |             # Define training procedure
 67 |             with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
 68 |                 learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate,
 69 |                                                            global_step=hmidp.global_step, decay_steps=args.decay_steps,
 70 |                                                            decay_rate=args.decay_rate, staircase=True)
 71 |                 optimizer = tf.train.AdamOptimizer(learning_rate)
 72 |                 grads, vars = zip(*optimizer.compute_gradients(hmidp.loss))
 73 |                 grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio)
 74 |                 train_op = optimizer.apply_gradients(zip(grads, vars), global_step=hmidp.global_step, name="train_op")
 75 | 
 76 |             # Keep track of gradient values and sparsity (optional)
 77 |             grad_summaries = []
 78 |             for g, v in zip(grads, vars):
 79 |                 if g is not None:
 80 |                     grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
 81 |                     sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 82 |                     grad_summaries.append(grad_hist_summary)
 83 |                     grad_summaries.append(sparsity_summary)
 84 |             grad_summaries_merged = tf.summary.merge(grad_summaries)
 85 | 
 86 |             # Output directory for models and summaries
 87 |             out_dir = dh.get_out_dir(OPTION, logger)
 88 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
 89 |             best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))
 90 | 
 91 |             # Summaries for loss
 92 |             loss_summary = tf.summary.scalar("loss", hmidp.loss)
 93 | 
 94 |             # Train summaries
 95 |             train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
 96 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
 97 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
 98 | 
 99 |             # Validation summaries
100 |             validation_summary_op = tf.summary.merge([loss_summary])
101 |             validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
102 |             validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)
103 | 
104 |             saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints)
105 |             best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False)
106 | 
107 |             if OPTION == 'R':
108 |                 # Load hmidp model
109 |                 logger.info("Loading model...")
110 |                 checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
111 |                 logger.info(checkpoint_file)
112 | 
113 |                 # Load the saved meta graph and restore variables
114 |                 saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
115 |                 saver.restore(sess, checkpoint_file)
116 |             if OPTION == 'T':
117 |                 if not os.path.exists(checkpoint_dir):
118 |                     os.makedirs(checkpoint_dir)
119 |                 sess.run(tf.global_variables_initializer())
120 |                 sess.run(tf.local_variables_initializer())
121 | 
122 |                 # Embedding visualization config
123 |                 config = projector.ProjectorConfig()
124 |                 embedding_conf = config.embeddings.add()
125 |                 embedding_conf.tensor_name = "embedding"
126 |                 embedding_conf.metadata_path = args.metadata_file
127 | 
128 |                 projector.visualize_embeddings(train_summary_writer, config)
129 |                 projector.visualize_embeddings(validation_summary_writer, config)
130 | 
131 |                 # Save the embedding visualization
132 |                 saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))
133 | 
134 |             current_step = sess.run(hmidp.global_step)
135 | 
136 |             def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch):
137 |                 """A single training step"""
138 |                 feed_dict = {
139 |                     hmidp.input_x_content: x_batch_content,
140 |                     hmidp.input_x_question: x_batch_question,
141 |                     hmidp.input_x_option: x_batch_option,
142 |                     hmidp.input_y: y_batch,
143 |                     hmidp.dropout_keep_prob: args.dropout_rate,
144 |                     hmidp.is_training: True
145 |                 }
146 |                 _, step, summaries, loss = sess.run(
147 |                     [train_op, hmidp.global_step, train_summary_op, hmidp.loss], feed_dict)
148 |                 logger.info("step {0}: loss {1:g}".format(step, loss))
149 |                 train_summary_writer.add_summary(summaries, step)
150 | 
151 |             def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None):
152 |                 """Evaluates model on a validation set"""
153 |                 batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)),
154 |                                                    args.batch_size, 1)
155 | 
156 |                 eval_counter, eval_loss = 0, 0.0
157 | 
158 |                 true_labels = []
159 |                 predicted_scores = []
160 | 
161 |                 for batch_validation in batches_validation:
162 |                     x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation)
163 |                     feed_dict = {
164 |                         hmidp.input_x_content: x_batch_content,
165 |                         hmidp.input_x_question: x_batch_question,
166 |                         hmidp.input_x_option: x_batch_option,
167 |                         hmidp.input_y: y_batch,
168 |                         hmidp.dropout_keep_prob: 1.0,
169 |                         hmidp.is_training: False
170 |                     }
171 |                     step, summaries, scores, cur_loss = sess.run(
172 |                         [hmidp.global_step, validation_summary_op, hmidp.scores, hmidp.loss], feed_dict)
173 | 
174 |                     # Prepare for calculating metrics
175 |                     for i in y_batch:
176 |                         true_labels.append(i)
177 |                     for j in scores:
178 |                         predicted_scores.append(j)
179 | 
180 |                     eval_loss = eval_loss + cur_loss
181 |                     eval_counter = eval_counter + 1
182 | 
183 |                     if writer:
184 |                         writer.add_summary(summaries, step)
185 | 
186 |                 eval_loss = float(eval_loss / eval_counter)
187 | 
188 |                 # Calculate PCC & DOA
189 |                 pcc, doa = dh.evaluation(true_labels, predicted_scores)
190 |                 # Calculate RMSE
191 |                 rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
192 |                 r2 = r2_score(true_labels, predicted_scores)
193 | 
194 |                 return eval_loss, pcc, doa, rmse, r2
195 | 
196 |             # Generate batches
197 |             batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)),
198 |                                           args.batch_size, args.epochs)
199 | 
200 |             num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1
201 | 
202 |             # Training loop. For each batch...
203 |             for batch_train in batches_train:
204 |                 x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train)
205 |                 train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train)
206 |                 current_step = tf.train.global_step(sess, hmidp.global_step)
207 | 
208 |                 if current_step % args.evaluate_steps == 0:
209 |                     logger.info("\nEvaluation:")
210 |                     eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val,
211 |                                                                     writer=validation_summary_writer)
212 |                     logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
213 |                                 .format(eval_loss, pcc, doa, rmse, r2))
214 |                     best_saver.handle(rmse, sess, current_step)
215 |                 if current_step % args.checkpoint_steps == 0:
216 |                     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
217 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
218 |                     logger.info("Saved model checkpoint to {0}\n".format(path))
219 |                 if current_step % num_batches_per_epoch == 0:
220 |                     current_epoch = current_step // num_batches_per_epoch
221 |                     logger.info("Epoch {0} has finished!".format(current_epoch))
222 | 
223 |     logger.info("All Done.")
224 | 
225 | 
226 | if __name__ == '__main__':
227 |     train_hmidp()


--------------------------------------------------------------------------------
/TF/R-MIDP/test_rmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from sklearn.metrics import mean_squared_error, r2_score
 17 | 
 18 | args = parser.parameter_parser()
 19 | MODEL = dh.get_model_name()
 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime()))
 21 | 
 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/'
 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/'
 24 | SAVE_DIR = 'output/' + MODEL
 25 | 
 26 | 
 27 | def test_rmidp():
 28 |     """Test RMIDP model."""
 29 |     # Print parameters used for the model
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load data
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False)
 36 | 
 37 |     logger.info("Data padding...")
 38 |     x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len)
 39 | 
 40 |     # Load rmidp model
 41 |     OPTION = dh.option(pattern=1)
 42 |     if OPTION == 'B':
 43 |         logger.info("Loading best model...")
 44 |         checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True)
 45 |     else:
 46 |         logger.info("Loading latest model...")
 47 |         checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
 48 |     logger.info(checkpoint_file)
 49 | 
 50 |     graph = tf.Graph()
 51 |     with graph.as_default():
 52 |         session_conf = tf.ConfigProto(
 53 |             allow_soft_placement=args.allow_soft_placement,
 54 |             log_device_placement=args.log_device_placement)
 55 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 56 |         sess = tf.Session(config=session_conf)
 57 |         with sess.as_default():
 58 |             # Load the saved meta graph and restore variables
 59 |             saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
 60 |             saver.restore(sess, checkpoint_file)
 61 | 
 62 |             # Get the placeholders from the graph by name
 63 |             input_x_content = graph.get_operation_by_name("input_x_content").outputs[0]
 64 |             input_x_question = graph.get_operation_by_name("input_x_question").outputs[0]
 65 |             input_x_option = graph.get_operation_by_name("input_x_option").outputs[0]
 66 |             input_y = graph.get_operation_by_name("input_y").outputs[0]
 67 |             dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
 68 |             is_training = graph.get_operation_by_name("is_training").outputs[0]
 69 | 
 70 |             # Tensors we want to evaluate
 71 |             scores = graph.get_operation_by_name("output/scores").outputs[0]
 72 |             loss = graph.get_operation_by_name("loss/loss").outputs[0]
 73 | 
 74 |             # Split the output nodes name by '|' if you have several output nodes
 75 |             output_node_names = "output/scores"
 76 | 
 77 |             # Save the .pb model file
 78 |             output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
 79 |                                                                             output_node_names.split("|"))
 80 |             tf.train.write_graph(output_graph_def, "graph", "graph-rmidp-{0}.pb".format(MODEL), as_text=False)
 81 | 
 82 |             # Generate batches for one epoch
 83 |             batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)),
 84 |                                     args.batch_size, 1, shuffle=False)
 85 | 
 86 |             test_counter, test_loss = 0, 0.0
 87 | 
 88 |             # Collect the predictions here
 89 |             true_labels = []
 90 |             predicted_scores = []
 91 | 
 92 |             for batch_test in batches:
 93 |                 x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test)
 94 |                 feed_dict = {
 95 |                     input_x_content: x_batch_content,
 96 |                     input_x_question: x_batch_question,
 97 |                     input_x_option: x_batch_option,
 98 |                     input_y: y_batch,
 99 |                     dropout_keep_prob: 1.0,
100 |                     is_training: False
101 |                 }
102 |                 batch_scores, cur_loss = sess.run([scores, loss], feed_dict)
103 | 
104 |                 # Prepare for calculating metrics
105 |                 for i in y_batch:
106 |                     true_labels.append(i)
107 |                 for j in batch_scores:
108 |                     predicted_scores.append(j)
109 | 
110 |                 test_loss = test_loss + cur_loss
111 |                 test_counter = test_counter + 1
112 | 
113 |             # Calculate PCC & DOA
114 |             pcc, doa = dh.evaluation(true_labels, predicted_scores)
115 |             # Calculate RMSE
116 |             rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
117 |             r2 = r2_score(true_labels, predicted_scores)
118 | 
119 |             test_loss = float(test_loss / test_counter)
120 | 
121 |             logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
122 |                         .format(test_loss, pcc, doa, rmse, r2))
123 | 
124 |             # Save the prediction result
125 |             if not os.path.exists(SAVE_DIR):
126 |                 os.makedirs(SAVE_DIR)
127 |             dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id,
128 |                                       all_labels=true_labels, all_predict_scores=predicted_scores)
129 | 
130 |     logger.info("All Done.")
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     test_rmidp()
135 | 


--------------------------------------------------------------------------------
/TF/R-MIDP/text_rmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | 
  7 | 
  8 | class TextRMIDP(object):
  9 |     """A RMIDP for text classification."""
 10 | 
 11 |     def __init__(
 12 |             self, sequence_length, vocab_size, embedding_type, embedding_size, rnn_hidden_size, rnn_type,
 13 |             rnn_layers, fc_hidden_size, l2_reg_lambda=0.0, pretrained_embedding=None):
 14 | 
 15 |         # Placeholders for input, output, dropout_prob and training_tag
 16 |         self.input_x_content = tf.placeholder(tf.int32, [None, sequence_length[0]], name="input_x_content")
 17 |         self.input_x_question = tf.placeholder(tf.int32, [None, sequence_length[1]], name="input_x_question")
 18 |         self.input_x_option = tf.placeholder(tf.int32, [None, sequence_length[2]], name="input_x_option")
 19 |         self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
 20 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 21 |         self.is_training = tf.placeholder(tf.bool, name="is_training")
 22 | 
 23 |         self.global_step = tf.Variable(0, trainable=False, name="Global_Step")
 24 | 
 25 |         def _get_rnn_cell(rnn_hidden_size, rnn_type):
 26 |             if rnn_type == 'RNN':
 27 |                 return tf.nn.rnn_cell.BasicRNNCell(rnn_hidden_size)
 28 |             if rnn_type == 'LSTM':
 29 |                 return tf.nn.rnn_cell.BasicLSTMCell(rnn_hidden_size)
 30 |             if rnn_type == 'GRU':
 31 |                 return tf.nn.rnn_cell.GRUCell(rnn_hidden_size)
 32 | 
 33 |         def _bi_rnn_layer(input_x, name=""):
 34 |             # Bi-RNN Layer
 35 |             with tf.variable_scope(name + "Bi_rnn", reuse=tf.AUTO_REUSE):
 36 |                 fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type)
 37 |                                                            for _ in range(rnn_layers)])
 38 |                 bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([_get_rnn_cell(rnn_hidden_size, rnn_type)
 39 |                                                            for _ in range(rnn_layers)])
 40 |                 if self.dropout_keep_prob is not None:
 41 |                     fw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(fw_rnn_cell, output_keep_prob=self.dropout_keep_prob)
 42 |                     bw_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(bw_rnn_cell, output_keep_prob=self.dropout_keep_prob)
 43 | 
 44 |                 # Creates a dynamic bidirectional recurrent neural network
 45 |                 # shape of `outputs`: tuple -> (outputs_fw, outputs_bw)
 46 |                 # shape of `outputs_fw`: [batch_size, sequence_length, rnn_hidden_size]
 47 | 
 48 |                 # shape of `state`: tuple -> (outputs_state_fw, output_state_bw)
 49 |                 # shape of `outputs_state_fw`: tuple -> (c, h) c: memory cell; h: hidden state
 50 |                 outputs, state = tf.nn.bidirectional_dynamic_rnn(fw_rnn_cell, bw_rnn_cell, input_x, dtype=tf.float32)
 51 | 
 52 |             # Concat output
 53 |             # [batch_size, sequence_length, rnn_hidden_size * 2]
 54 |             rnn_out = tf.concat(outputs, axis=2, name=name + "rnn_out")
 55 | 
 56 |             # [batch_size, rnn_hidden_size * 2]
 57 |             rnn_pooled = tf.reduce_max(rnn_out, axis=1, name=name + "rnn_pooled")
 58 | 
 59 |             return rnn_pooled
 60 | 
 61 |         def _fc_layer(input_x, name=""):
 62 |             """
 63 |             Fully Connected Layer.
 64 |             Args:
 65 |                 input_x:
 66 |                 name: Scope name
 67 |             Returns:
 68 |                 [batch_size, fc_hidden_size]
 69 |             """
 70 |             with tf.name_scope(name + "fc"):
 71 |                 num_units = input_x.get_shape().as_list()[-1]
 72 |                 W = tf.Variable(tf.truncated_normal(shape=[num_units, fc_hidden_size],
 73 |                                                     stddev=0.1, dtype=tf.float32), name="W")
 74 |                 b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b")
 75 |                 fc = tf.nn.xw_plus_b(input_x, W, b)
 76 |                 fc_out = tf.nn.relu(fc)
 77 |             return fc_out
 78 | 
 79 |         # Embedding Layer
 80 |         with tf.device("/cpu:0"), tf.name_scope("embedding"):
 81 |             # Use random generated the word vector by default
 82 |             # Can also be obtained through our own word vectors trained by our corpus
 83 |             if pretrained_embedding is None:
 84 |                 self.embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0,
 85 |                                                                dtype=tf.float32), trainable=True, name="embedding")
 86 |             else:
 87 |                 if embedding_type == 0:
 88 |                     self.embedding = tf.constant(pretrained_embedding, dtype=tf.float32, name="embedding")
 89 |                 if embedding_type == 1:
 90 |                     self.embedding = tf.Variable(pretrained_embedding, trainable=True,
 91 |                                                  dtype=tf.float32, name="embedding")
 92 |             # [batch_size, sequence_length, embedding_size]
 93 |             self.embedded_sentence_content = tf.nn.embedding_lookup(self.embedding, self.input_x_content)
 94 |             self.embedded_sentence_question = tf.nn.embedding_lookup(self.embedding, self.input_x_question)
 95 |             self.embedded_sentence_option = tf.nn.embedding_lookup(self.embedding, self.input_x_option)
 96 | 
 97 |         # Concat -> embedded_sentence_all: [batch_size, sequence_length_all, embedding_size]
 98 |         self.embedded_sentence_all = tf.concat([self.embedded_sentence_content, self.embedded_sentence_question,
 99 |                                                self.embedded_sentence_option], axis=1)
100 | 
101 |         # Bi-RNN Layer
102 |         # bi_rnn_out: [batch_size, rnn_hidden_size * 2]
103 |         self.bi_rnn_out = _bi_rnn_layer(self.embedded_sentence_all, name="total_")
104 | 
105 |         # Fully Connected Layer
106 |         self.fc_out = _fc_layer(self.bi_rnn_out)
107 | 
108 |         # Add dropout
109 |         with tf.name_scope("dropout"):
110 |             self.fc_drop = tf.nn.dropout(self.fc_out, self.dropout_keep_prob)
111 | 
112 |         # Final scores
113 |         with tf.name_scope("output"):
114 |             W = tf.Variable(tf.truncated_normal(shape=[fc_hidden_size, 1],
115 |                                                 stddev=0.1, dtype=tf.float32), name="W")
116 |             b = tf.Variable(tf.constant(value=0.1, shape=[1], dtype=tf.float32), name="b")
117 |             self.logits = tf.nn.xw_plus_b(self.fc_drop, W, b, name="logits")
118 |             self.scores = tf.sigmoid(self.logits, name="scores")
119 | 
120 |         # Calculate mean cross-entropy loss, L2 loss
121 |         with tf.name_scope("loss"):
122 |             losses = tf.reduce_mean(tf.square(self.input_y - self.scores), name="losses")
123 |             l2_losses = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()],
124 |                                  name="l2_losses") * l2_reg_lambda
125 |             self.loss = tf.add(losses, l2_losses, name="loss")


--------------------------------------------------------------------------------
/TF/R-MIDP/train_rmidp.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from text_rmidp import TextRMIDP
 14 | from utils import checkmate as cm
 15 | from utils import data_helpers as dh
 16 | from utils import param_parser as parser
 17 | from tensorboard.plugins import projector
 18 | from sklearn.metrics import mean_squared_error, r2_score
 19 | 
 20 | args = parser.parameter_parser()
 21 | OPTION = dh.option(pattern=0)
 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 23 | 
 24 | 
 25 | def train_rmidp():
 26 |     """Training RMIDP model."""
 27 |     # Print parameters used for the model
 28 |     dh.tab_printer(args, logger)
 29 | 
 30 |     # Load sentences, labels, and training parameters
 31 |     logger.info("Loading data...")
 32 |     logger.info("Data processing...")
 33 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False)
 34 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False)
 35 | 
 36 |     logger.info("Data padding...")
 37 |     x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len)
 38 |     x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len)
 39 | 
 40 |     # Build vocabulary
 41 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 42 | 
 43 |     # Build a graph and rmidp object
 44 |     with tf.Graph().as_default():
 45 |         session_conf = tf.ConfigProto(
 46 |             allow_soft_placement=args.allow_soft_placement,
 47 |             log_device_placement=args.log_device_placement)
 48 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 49 |         sess = tf.Session(config=session_conf)
 50 |         with sess.as_default():
 51 |             rmidp = TextRMIDP(
 52 |                 sequence_length=args.pad_seq_len,
 53 |                 vocab_size=VOCAB_SIZE,
 54 |                 embedding_type=args.embedding_type,
 55 |                 embedding_size=EMBEDDING_SIZE,
 56 |                 rnn_hidden_size=args.rnn_dim,
 57 |                 rnn_type=args.rnn_type,
 58 |                 rnn_layers=args.rnn_layers,
 59 |                 fc_hidden_size=args.fc_dim,
 60 |                 l2_reg_lambda=args.l2_lambda,
 61 |                 pretrained_embedding=pretrained_word2vec_matrix)
 62 | 
 63 |             # Define training procedure
 64 |             with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
 65 |                 learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate,
 66 |                                                            global_step=rmidp.global_step, decay_steps=args.decay_steps,
 67 |                                                            decay_rate=args.decay_rate, staircase=True)
 68 |                 optimizer = tf.train.AdamOptimizer(learning_rate)
 69 |                 grads, vars = zip(*optimizer.compute_gradients(rmidp.loss))
 70 |                 grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio)
 71 |                 train_op = optimizer.apply_gradients(zip(grads, vars), global_step=rmidp.global_step, name="train_op")
 72 | 
 73 |             # Keep track of gradient values and sparsity (optional)
 74 |             grad_summaries = []
 75 |             for g, v in zip(grads, vars):
 76 |                 if g is not None:
 77 |                     grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
 78 |                     sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 79 |                     grad_summaries.append(grad_hist_summary)
 80 |                     grad_summaries.append(sparsity_summary)
 81 |             grad_summaries_merged = tf.summary.merge(grad_summaries)
 82 | 
 83 |             # Output directory for models and summaries
 84 |             out_dir = dh.get_out_dir(OPTION, logger)
 85 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
 86 |             best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))
 87 | 
 88 |             # Summaries for loss
 89 |             loss_summary = tf.summary.scalar("loss", rmidp.loss)
 90 | 
 91 |             # Train summaries
 92 |             train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
 93 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
 94 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
 95 | 
 96 |             # Validation summaries
 97 |             validation_summary_op = tf.summary.merge([loss_summary])
 98 |             validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
 99 |             validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)
100 | 
101 |             saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints)
102 |             best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False)
103 | 
104 |             if OPTION == 'R':
105 |                 # Load rmidp model
106 |                 logger.info("Loading model...")
107 |                 checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
108 |                 logger.info(checkpoint_file)
109 | 
110 |                 # Load the saved meta graph and restore variables
111 |                 saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
112 |                 saver.restore(sess, checkpoint_file)
113 |             if OPTION == 'T':
114 |                 if not os.path.exists(checkpoint_dir):
115 |                     os.makedirs(checkpoint_dir)
116 |                 sess.run(tf.global_variables_initializer())
117 |                 sess.run(tf.local_variables_initializer())
118 | 
119 |                 # Embedding visualization config
120 |                 config = projector.ProjectorConfig()
121 |                 embedding_conf = config.embeddings.add()
122 |                 embedding_conf.tensor_name = "embedding"
123 |                 embedding_conf.metadata_path = args.metadata_file
124 | 
125 |                 projector.visualize_embeddings(train_summary_writer, config)
126 |                 projector.visualize_embeddings(validation_summary_writer, config)
127 | 
128 |                 # Save the embedding visualization
129 |                 saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))
130 | 
131 |             current_step = sess.run(rmidp.global_step)
132 | 
133 |             def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch):
134 |                 """A single training step"""
135 |                 feed_dict = {
136 |                     rmidp.input_x_content: x_batch_content,
137 |                     rmidp.input_x_question: x_batch_question,
138 |                     rmidp.input_x_option: x_batch_option,
139 |                     rmidp.input_y: y_batch,
140 |                     rmidp.dropout_keep_prob: args.dropout_rate,
141 |                     rmidp.is_training: True
142 |                 }
143 |                 _, step, summaries, loss = sess.run(
144 |                     [train_op, rmidp.global_step, train_summary_op, rmidp.loss], feed_dict)
145 |                 logger.info("step {0}: loss {1:g}".format(step, loss))
146 |                 train_summary_writer.add_summary(summaries, step)
147 | 
148 |             def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None):
149 |                 """Evaluates model on a validation set"""
150 |                 batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)),
151 |                                                    args.batch_size, 1)
152 | 
153 |                 eval_counter, eval_loss = 0, 0.0
154 | 
155 |                 true_labels = []
156 |                 predicted_scores = []
157 | 
158 |                 for batch_validation in batches_validation:
159 |                     x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation)
160 |                     feed_dict = {
161 |                         rmidp.input_x_content: x_batch_content,
162 |                         rmidp.input_x_question: x_batch_question,
163 |                         rmidp.input_x_option: x_batch_option,
164 |                         rmidp.input_y: y_batch,
165 |                         rmidp.dropout_keep_prob: 1.0,
166 |                         rmidp.is_training: False
167 |                     }
168 |                     step, summaries, scores, cur_loss = sess.run(
169 |                         [rmidp.global_step, validation_summary_op, rmidp.scores, rmidp.loss], feed_dict)
170 | 
171 |                     # Prepare for calculating metrics
172 |                     for i in y_batch:
173 |                         true_labels.append(i)
174 |                     for j in scores:
175 |                         predicted_scores.append(j)
176 | 
177 |                     eval_loss = eval_loss + cur_loss
178 |                     eval_counter = eval_counter + 1
179 | 
180 |                     if writer:
181 |                         writer.add_summary(summaries, step)
182 | 
183 |                 eval_loss = float(eval_loss / eval_counter)
184 | 
185 |                 # Calculate PCC & DOA
186 |                 pcc, doa = dh.evaluation(true_labels, predicted_scores)
187 |                 # Calculate RMSE
188 |                 rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
189 |                 r2 = r2_score(true_labels, predicted_scores)
190 | 
191 |                 return eval_loss, pcc, doa, rmse, r2
192 | 
193 |             # Generate batches
194 |             batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)),
195 |                                           args.batch_size, args.epochs)
196 | 
197 |             num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1
198 | 
199 |             # Training loop. For each batch...
200 |             for batch_train in batches_train:
201 |                 x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train)
202 |                 train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train)
203 |                 current_step = tf.train.global_step(sess, rmidp.global_step)
204 | 
205 |                 if current_step % args.evaluate_steps == 0:
206 |                     logger.info("\nEvaluation:")
207 |                     eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val,
208 |                                                                     writer=validation_summary_writer)
209 |                     logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
210 |                                 .format(eval_loss, pcc, doa, rmse, r2))
211 |                     best_saver.handle(rmse, sess, current_step)
212 |                 if current_step % args.checkpoint_steps == 0:
213 |                     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
214 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
215 |                     logger.info("Saved model checkpoint to {0}\n".format(path))
216 |                 if current_step % num_batches_per_epoch == 0:
217 |                     current_epoch = current_step // num_batches_per_epoch
218 |                     logger.info("Epoch {0} has finished!".format(current_epoch))
219 | 
220 |     logger.info("All Done.")
221 | 
222 | 
223 | if __name__ == '__main__':
224 |     train_rmidp()


--------------------------------------------------------------------------------
/TF/TARNN/test_tarnn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from utils import checkmate as cm
 14 | from utils import data_helpers as dh
 15 | from utils import param_parser as parser
 16 | from sklearn.metrics import mean_squared_error, r2_score
 17 | 
 18 | args = parser.parameter_parser()
 19 | MODEL = dh.get_model_name()
 20 | logger = dh.logger_fn("tflog", "logs/Test-{0}.log".format(time.asctime()))
 21 | 
 22 | CPT_DIR = 'runs/' + MODEL + '/checkpoints/'
 23 | BEST_CPT_DIR = 'runs/' + MODEL + '/bestcheckpoints/'
 24 | SAVE_DIR = 'output/' + MODEL
 25 | 
 26 | 
 27 | def test_tarnn():
 28 |     """Test TARNN model."""
 29 |     # Print parameters used for the model
 30 |     dh.tab_printer(args, logger)
 31 | 
 32 |     # Load data
 33 |     logger.info("Loading data...")
 34 |     logger.info("Data processing...")
 35 |     test_data = dh.load_data_and_labels(args.test_file, args.word2vec_file, data_aug_flag=False)
 36 | 
 37 |     logger.info("Data padding...")
 38 |     x_test_content, x_test_question, x_test_option, y_test = dh.pad_data(test_data, args.pad_seq_len)
 39 | 
 40 |     # Load tarnn model
 41 |     OPTION = dh.option(pattern=1)
 42 |     if OPTION == 'B':
 43 |         logger.info("Loading best model...")
 44 |         checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True)
 45 |     else:
 46 |         logger.info("Loading latest model...")
 47 |         checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
 48 |     logger.info(checkpoint_file)
 49 | 
 50 |     graph = tf.Graph()
 51 |     with graph.as_default():
 52 |         session_conf = tf.ConfigProto(
 53 |             allow_soft_placement=args.allow_soft_placement,
 54 |             log_device_placement=args.log_device_placement)
 55 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 56 |         sess = tf.Session(config=session_conf)
 57 |         with sess.as_default():
 58 |             # Load the saved meta graph and restore variables
 59 |             saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
 60 |             saver.restore(sess, checkpoint_file)
 61 | 
 62 |             # Get the placeholders from the graph by name
 63 |             input_x_content = graph.get_operation_by_name("input_x_content").outputs[0]
 64 |             input_x_question = graph.get_operation_by_name("input_x_question").outputs[0]
 65 |             input_x_option = graph.get_operation_by_name("input_x_option").outputs[0]
 66 |             input_y = graph.get_operation_by_name("input_y").outputs[0]
 67 |             dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
 68 |             is_training = graph.get_operation_by_name("is_training").outputs[0]
 69 | 
 70 |             # Tensors we want to evaluate
 71 |             scores = graph.get_operation_by_name("output/scores").outputs[0]
 72 |             loss = graph.get_operation_by_name("loss/loss").outputs[0]
 73 | 
 74 |             # Split the output nodes name by '|' if you have several output nodes
 75 |             output_node_names = "output/scores"
 76 | 
 77 |             # Save the .pb model file
 78 |             output_graph_def = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def,
 79 |                                                                             output_node_names.split("|"))
 80 |             tf.train.write_graph(output_graph_def, "graph", "graph-tarnn-{0}.pb".format(MODEL), as_text=False)
 81 | 
 82 |             # Generate batches for one epoch
 83 |             batches = dh.batch_iter(list(zip(x_test_content, x_test_question, x_test_option, y_test)),
 84 |                                     args.batch_size, 1, shuffle=False)
 85 | 
 86 |             test_counter, test_loss = 0, 0.0
 87 | 
 88 |             # Collect the predictions here
 89 |             true_labels = []
 90 |             predicted_scores = []
 91 | 
 92 |             for batch_test in batches:
 93 |                 x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_test)
 94 |                 feed_dict = {
 95 |                     input_x_content: x_batch_content,
 96 |                     input_x_question: x_batch_question,
 97 |                     input_x_option: x_batch_option,
 98 |                     input_y: y_batch,
 99 |                     dropout_keep_prob: 1.0,
100 |                     is_training: False
101 |                 }
102 |                 batch_scores, cur_loss = sess.run([scores, loss], feed_dict)
103 | 
104 |                 # Prepare for calculating metrics
105 |                 for i in y_batch:
106 |                     true_labels.append(i)
107 |                 for j in batch_scores:
108 |                     predicted_scores.append(j)
109 | 
110 |                 test_loss = test_loss + cur_loss
111 |                 test_counter = test_counter + 1
112 | 
113 |             # Calculate PCC & DOA
114 |             pcc, doa = dh.evaluation(true_labels, predicted_scores)
115 |             # Calculate RMSE
116 |             rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
117 |             r2 = r2_score(true_labels, predicted_scores)
118 | 
119 |             test_loss = float(test_loss / test_counter)
120 | 
121 |             logger.info("All Test Dataset: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
122 |                         .format(test_loss, pcc, doa, rmse, r2))
123 | 
124 |             # Save the prediction result
125 |             if not os.path.exists(SAVE_DIR):
126 |                 os.makedirs(SAVE_DIR)
127 |             dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", all_id=test_data.id,
128 |                                       all_labels=true_labels, all_predict_scores=predicted_scores)
129 | 
130 |     logger.info("All Done.")
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     test_tarnn()
135 | 


--------------------------------------------------------------------------------
/TF/TARNN/train_tarnn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import logging
  8 | 
  9 | sys.path.append('../')
 10 | logging.getLogger('tensorflow').disabled = True
 11 | 
 12 | import tensorflow as tf
 13 | from text_tarnn import TextTARNN
 14 | from utils import checkmate as cm
 15 | from utils import data_helpers as dh
 16 | from utils import param_parser as parser
 17 | from tensorboard.plugins import projector
 18 | from sklearn.metrics import mean_squared_error, r2_score
 19 | 
 20 | args = parser.parameter_parser()
 21 | OPTION = dh.option(pattern=0)
 22 | logger = dh.logger_fn("tflog", "logs/{0}-{1}.log".format('Train' if OPTION == 'T' else 'Restore', time.asctime()))
 23 | 
 24 | 
 25 | def train_tarnn():
 26 |     """Training TARNN model."""
 27 |     # Print parameters used for the model
 28 |     dh.tab_printer(args, logger)
 29 | 
 30 |     # Load sentences, labels, and training parameters
 31 |     logger.info("Loading data...")
 32 |     logger.info("Data processing...")
 33 |     train_data = dh.load_data_and_labels(args.train_file, args.word2vec_file, data_aug_flag=False)
 34 |     val_data = dh.load_data_and_labels(args.validation_file, args.word2vec_file, data_aug_flag=False)
 35 | 
 36 |     logger.info("Data padding...")
 37 |     x_train_content, x_train_question, x_train_option, y_train = dh.pad_data(train_data, args.pad_seq_len)
 38 |     x_val_content, x_val_question, x_val_option, y_val = dh.pad_data(val_data, args.pad_seq_len)
 39 | 
 40 |     # Build vocabulary
 41 |     VOCAB_SIZE, EMBEDDING_SIZE, pretrained_word2vec_matrix = dh.load_word2vec_matrix(args.word2vec_file)
 42 | 
 43 |     # Build a graph and tarnn object
 44 |     with tf.Graph().as_default():
 45 |         session_conf = tf.ConfigProto(
 46 |             allow_soft_placement=args.allow_soft_placement,
 47 |             log_device_placement=args.log_device_placement)
 48 |         session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
 49 |         sess = tf.Session(config=session_conf)
 50 |         with sess.as_default():
 51 |             tarnn = TextTARNN(
 52 |                 sequence_length=args.pad_seq_len,
 53 |                 vocab_size=VOCAB_SIZE,
 54 |                 embedding_type=args.embedding_type,
 55 |                 embedding_size=EMBEDDING_SIZE,
 56 |                 rnn_hidden_size=args.rnn_dim,
 57 |                 rnn_type=args.rnn_type,
 58 |                 rnn_layers=args.rnn_layers,
 59 |                 attention_type=args.attention_type,
 60 |                 fc_hidden_size=args.fc_dim,
 61 |                 l2_reg_lambda=args.l2_lambda,
 62 |                 pretrained_embedding=pretrained_word2vec_matrix)
 63 | 
 64 |             # Define training procedure
 65 |             with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
 66 |                 learning_rate = tf.train.exponential_decay(learning_rate=args.learning_rate,
 67 |                                                            global_step=tarnn.global_step, decay_steps=args.decay_steps,
 68 |                                                            decay_rate=args.decay_rate, staircase=True)
 69 |                 optimizer = tf.train.AdamOptimizer(learning_rate)
 70 |                 grads, vars = zip(*optimizer.compute_gradients(tarnn.loss))
 71 |                 grads, _ = tf.clip_by_global_norm(grads, clip_norm=args.norm_ratio)
 72 |                 train_op = optimizer.apply_gradients(zip(grads, vars), global_step=tarnn.global_step, name="train_op")
 73 | 
 74 |             # Keep track of gradient values and sparsity (optional)
 75 |             grad_summaries = []
 76 |             for g, v in zip(grads, vars):
 77 |                 if g is not None:
 78 |                     grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
 79 |                     sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
 80 |                     grad_summaries.append(grad_hist_summary)
 81 |                     grad_summaries.append(sparsity_summary)
 82 |             grad_summaries_merged = tf.summary.merge(grad_summaries)
 83 | 
 84 |             # Output directory for models and summaries
 85 |             out_dir = dh.get_out_dir(OPTION, logger)
 86 |             checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
 87 |             best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))
 88 | 
 89 |             # Summaries for loss
 90 |             loss_summary = tf.summary.scalar("loss", tarnn.loss)
 91 | 
 92 |             # Train summaries
 93 |             train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
 94 |             train_summary_dir = os.path.join(out_dir, "summaries", "train")
 95 |             train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
 96 | 
 97 |             # Validation summaries
 98 |             validation_summary_op = tf.summary.merge([loss_summary])
 99 |             validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
100 |             validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)
101 | 
102 |             saver = tf.train.Saver(tf.global_variables(), max_to_keep=args.num_checkpoints)
103 |             best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=False)
104 | 
105 |             if OPTION == 'R':
106 |                 # Load tarnn model
107 |                 logger.info("Loading model...")
108 |                 checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
109 |                 logger.info(checkpoint_file)
110 | 
111 |                 # Load the saved meta graph and restore variables
112 |                 saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
113 |                 saver.restore(sess, checkpoint_file)
114 |             if OPTION == 'T':
115 |                 if not os.path.exists(checkpoint_dir):
116 |                     os.makedirs(checkpoint_dir)
117 |                 sess.run(tf.global_variables_initializer())
118 |                 sess.run(tf.local_variables_initializer())
119 | 
120 |                 # Embedding visualization config
121 |                 config = projector.ProjectorConfig()
122 |                 embedding_conf = config.embeddings.add()
123 |                 embedding_conf.tensor_name = "embedding"
124 |                 embedding_conf.metadata_path = args.metadata_file
125 | 
126 |                 projector.visualize_embeddings(train_summary_writer, config)
127 |                 projector.visualize_embeddings(validation_summary_writer, config)
128 | 
129 |                 # Save the embedding visualization
130 |                 saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))
131 | 
132 |             current_step = sess.run(tarnn.global_step)
133 | 
134 |             def train_step(x_batch_content, x_batch_question, x_batch_option, y_batch):
135 |                 """A single training step"""
136 |                 feed_dict = {
137 |                     tarnn.input_x_content: x_batch_content,
138 |                     tarnn.input_x_question: x_batch_question,
139 |                     tarnn.input_x_option: x_batch_option,
140 |                     tarnn.input_y: y_batch,
141 |                     tarnn.dropout_keep_prob: args.dropout_rate,
142 |                     tarnn.is_training: True
143 |                 }
144 |                 _, step, summaries, loss = sess.run(
145 |                     [train_op, tarnn.global_step, train_summary_op, tarnn.loss], feed_dict)
146 |                 logger.info("step {0}: loss {1:g}".format(step, loss))
147 |                 train_summary_writer.add_summary(summaries, step)
148 | 
149 |             def validation_step(x_val_content, x_val_question, x_val_option, y_val, writer=None):
150 |                 """Evaluates model on a validation set"""
151 |                 batches_validation = dh.batch_iter(list(zip(x_val_content, x_val_question, x_val_option, y_val)),
152 |                                                    args.batch_size, 1)
153 | 
154 |                 eval_counter, eval_loss = 0, 0.0
155 |                 true_labels = []
156 |                 predicted_scores = []
157 | 
158 |                 for batch_validation in batches_validation:
159 |                     x_batch_content, x_batch_question, x_batch_option, y_batch = zip(*batch_validation)
160 |                     feed_dict = {
161 |                         tarnn.input_x_content: x_batch_content,
162 |                         tarnn.input_x_question: x_batch_question,
163 |                         tarnn.input_x_option: x_batch_option,
164 |                         tarnn.input_y: y_batch,
165 |                         tarnn.dropout_keep_prob: 1.0,
166 |                         tarnn.is_training: False
167 |                     }
168 |                     step, summaries, scores, cur_loss = sess.run(
169 |                         [tarnn.global_step, validation_summary_op, tarnn.scores, tarnn.loss], feed_dict)
170 | 
171 |                     # Prepare for calculating metrics
172 |                     for i in y_batch:
173 |                         true_labels.append(i)
174 |                     for j in scores:
175 |                         predicted_scores.append(j)
176 | 
177 |                     eval_loss = eval_loss + cur_loss
178 |                     eval_counter = eval_counter + 1
179 | 
180 |                     if writer:
181 |                         writer.add_summary(summaries, step)
182 | 
183 |                 eval_loss = float(eval_loss / eval_counter)
184 | 
185 |                 # Calculate PCC & DOA
186 |                 pcc, doa = dh.evaluation(true_labels, predicted_scores)
187 |                 # Calculate RMSE
188 |                 rmse = mean_squared_error(true_labels, predicted_scores) ** 0.5
189 |                 r2 = r2_score(true_labels, predicted_scores)
190 | 
191 |                 return eval_loss, pcc, doa, rmse, r2
192 | 
193 |             # Generate batches
194 |             batches_train = dh.batch_iter(list(zip(x_train_content, x_train_question, x_train_option, y_train)),
195 |                                           args.batch_size, args.epochs)
196 | 
197 |             num_batches_per_epoch = int((len(y_train) - 1) / args.batch_size) + 1
198 | 
199 |             # Training loop. For each batch...
200 |             for batch_train in batches_train:
201 |                 x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train = zip(*batch_train)
202 |                 train_step(x_batch_train_content, x_batch_train_question, x_batch_train_option, y_batch_train)
203 |                 current_step = tf.train.global_step(sess, tarnn.global_step)
204 | 
205 |                 if current_step % args.evaluate_steps == 0:
206 |                     logger.info("\nEvaluation:")
207 |                     eval_loss, pcc, doa, rmse, r2 = validation_step(x_val_content, x_val_question, x_val_option, y_val,
208 |                                                                     writer=validation_summary_writer)
209 |                     logger.info("All Validation set: Loss {0:g} | PCC {1:g} | DOA {2:g} | RMSE {3:g} | R2 {4:g}"
210 |                                 .format(eval_loss, pcc, doa, rmse, r2))
211 |                     best_saver.handle(rmse, sess, current_step)
212 |                 if current_step % args.checkpoint_steps == 0:
213 |                     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
214 |                     path = saver.save(sess, checkpoint_prefix, global_step=current_step)
215 |                     logger.info("Saved model checkpoint to {0}\n".format(path))
216 |                 if current_step % num_batches_per_epoch == 0:
217 |                     current_epoch = current_step // num_batches_per_epoch
218 |                     logger.info("Epoch {0} has finished!".format(current_epoch))
219 | 
220 |     logger.info("All Done.")
221 | 
222 | 
223 | if __name__ == '__main__':
224 |     train_tarnn()


--------------------------------------------------------------------------------
/TF/utils/checkmate.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import glob
  6 | import json
  7 | import numpy as np
  8 | import tensorflow as tf
  9 | 
 10 | 
 11 | class BestCheckpointSaver(object):
 12 |     """Maintains a directory containing only the best n checkpoints
 13 |     Inside the directory is a best_checkpoints JSON file containing a dictionary
 14 |     mapping of the best checkpoint filepaths to the values by which the checkpoints
 15 |     are compared.  Only the best n checkpoints are contained in the directory and JSON file.
 16 |     This is a light-weight wrapper class only intended to work in simple,
 17 |     non-distributed settings.  It is not intended to work with the tf.Estimator
 18 |     framework.
 19 |     """
 20 |     def __init__(self, save_dir, num_to_keep=1, maximize=True, saver=None):
 21 |         """Creates a `BestCheckpointSaver`
 22 |         `BestCheckpointSaver` acts as a wrapper class around a `tf.train.Saver`
 23 |         Args:
 24 |             save_dir: The directory in which the checkpoint files will be saved
 25 |             num_to_keep: The number of best checkpoint files to retain
 26 |             maximize: Define 'best' values to be the highest values.  For example,
 27 |               set this to True if selecting for the checkpoints with the highest
 28 |               given accuracy.  Or set to False to select for checkpoints with the
 29 |               lowest given error rate.
 30 |             saver: A `tf.train.Saver` to use for saving checkpoints.  A default
 31 |               `tf.train.Saver` will be created if none is provided.
 32 |         """
 33 |         self._num_to_keep = num_to_keep
 34 |         self._save_dir = save_dir
 35 |         self._save_path = os.path.join(save_dir, 'model')
 36 |         self._maximize = maximize
 37 |         self._saver = saver if saver else tf.train.Saver(
 38 |             max_to_keep=None,
 39 |             save_relative_paths=True
 40 |         )
 41 | 
 42 |         if not os.path.exists(save_dir):
 43 |             os.makedirs(save_dir)
 44 |         self.best_checkpoints_file = os.path.join(save_dir, 'best_checkpoints')
 45 | 
 46 |     def handle(self, value, sess, global_step):
 47 |         """Updates the set of best checkpoints based on the given result.
 48 |         Args:
 49 |             value: The value by which to rank the checkpoint.
 50 |             sess: A tf.Session to use to save the checkpoint
 51 |             global_step: The global step
 52 |         """
 53 |         current_ckpt = 'model-{}'.format(global_step)
 54 |         value = float(value)
 55 |         if not os.path.exists(self.best_checkpoints_file):
 56 |             self._save_best_checkpoints_file({current_ckpt: value})
 57 |             self._saver.save(sess, self._save_path, global_step)
 58 |             return
 59 | 
 60 |         best_checkpoints = self._load_best_checkpoints_file()
 61 | 
 62 |         if len(best_checkpoints) < self._num_to_keep:
 63 |             best_checkpoints[current_ckpt] = value
 64 |             self._save_best_checkpoints_file(best_checkpoints)
 65 |             self._saver.save(sess, self._save_path, global_step)
 66 |             return
 67 | 
 68 |         if self._maximize:
 69 |             should_save = not all(current_best >= value
 70 |                                   for current_best in best_checkpoints.values())
 71 |         else:
 72 |             should_save = not all(current_best <= value
 73 |                                   for current_best in best_checkpoints.values())
 74 |         if should_save:
 75 |             best_checkpoint_list = self._sort(best_checkpoints)
 76 | 
 77 |             worst_checkpoint = os.path.join(self._save_dir,
 78 |                                             best_checkpoint_list.pop(-1)[0])
 79 |             self._remove_outdated_checkpoint_files(worst_checkpoint)
 80 |             self._update_internal_saver_state(best_checkpoint_list)
 81 | 
 82 |             best_checkpoints = dict(best_checkpoint_list)
 83 |             best_checkpoints[current_ckpt] = value
 84 |             self._save_best_checkpoints_file(best_checkpoints)
 85 | 
 86 |             self._saver.save(sess, self._save_path, global_step)
 87 | 
 88 |     def _save_best_checkpoints_file(self, updated_best_checkpoints):
 89 |         with open(self.best_checkpoints_file, 'w') as f:
 90 |             json.dump(updated_best_checkpoints, f, indent=3)
 91 | 
 92 |     def _remove_outdated_checkpoint_files(self, worst_checkpoint):
 93 |         os.remove(os.path.join(self._save_dir, 'checkpoint'))
 94 |         for ckpt_file in glob.glob(worst_checkpoint + '.*'):
 95 |             os.remove(ckpt_file)
 96 | 
 97 |     def _update_internal_saver_state(self, best_checkpoint_list):
 98 |         best_checkpoint_files = [
 99 |             (ckpt[0], np.inf)  # TODO: Try to use actual file timestamp
100 |             for ckpt in best_checkpoint_list
101 |         ]
102 |         self._saver.set_last_checkpoints_with_time(best_checkpoint_files)
103 | 
104 |     def _load_best_checkpoints_file(self):
105 |         with open(self.best_checkpoints_file, 'r') as f:
106 |             best_checkpoints = json.load(f)
107 |         return best_checkpoints
108 | 
109 |     def _sort(self, best_checkpoints):
110 |         best_checkpoints = [
111 |             (ckpt, best_checkpoints[ckpt])
112 |             for ckpt in sorted(best_checkpoints,
113 |                                key=best_checkpoints.get,
114 |                                reverse=self._maximize)
115 |         ]
116 |         return best_checkpoints
117 | 
118 | 
119 | def get_best_checkpoint(best_checkpoint_dir, select_maximum_value=True):
120 |     """
121 |     Returns filepath to the best checkpoint
122 |     Reads the best_checkpoints file in the best_checkpoint_dir directory.
123 |     Returns the filepath in the best_checkpoints file associated with
124 |     the highest value if select_maximum_value is True, or the filepath
125 |     associated with the lowest value if select_maximum_value is False.
126 |     Args:
127 |         best_checkpoint_dir: Directory containing best_checkpoints JSON file
128 |         select_maximum_value: If True, select the filepath associated
129 |           with the highest value.  Otherwise, select the filepath associated
130 |           with the lowest value.
131 |     Returns:
132 |         The full path to the best checkpoint file
133 |     """
134 |     best_checkpoints_file = os.path.join(best_checkpoint_dir, 'best_checkpoints')
135 |     assert os.path.exists(best_checkpoints_file)
136 |     with open(best_checkpoints_file, 'r') as f:
137 |         best_checkpoints = json.load(f)
138 |     best_checkpoints = [
139 |         ckpt for ckpt in sorted(best_checkpoints,
140 |                                 key=best_checkpoints.get,
141 |                                 reverse=select_maximum_value)
142 |     ]
143 |     return os.path.join(os.path.abspath(best_checkpoint_dir),  best_checkpoints[0])
144 | 


--------------------------------------------------------------------------------
/TF/utils/param_parser.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | 
  4 | def parameter_parser():
  5 |     """
  6 |     A method to parse up command line parameters.
  7 |     The default hyperparameters give good results without cross-validation.
  8 |     """
  9 |     parser = argparse.ArgumentParser(description="Run Model.")
 10 | 
 11 |     # Data Parameters
 12 |     parser.add_argument("--train-file",
 13 |                         nargs="?",
 14 |                         default="../../data/Train_sample.json",
 15 |                         help="Training data.")
 16 | 
 17 |     parser.add_argument("--validation-file",
 18 |                         nargs="?",
 19 |                         default="../../data/Validation_sample.json",
 20 |                         help="Validation data.")
 21 | 
 22 |     parser.add_argument("--test-file",
 23 |                         nargs="?",
 24 |                         default="../../data/Test_sample.json",
 25 |                         help="Testing data.")
 26 | 
 27 |     parser.add_argument("--metadata-file",
 28 |                         nargs="?",
 29 |                         default="../../data/metadata.tsv",
 30 |                         help="Metadata file for embedding visualization.")
 31 | 
 32 |     parser.add_argument("--word2vec-file",
 33 |                         nargs="?",
 34 |                         default="../../data/word2vec_300.txt",
 35 |                         help="Word2vec file for embedding characters. (the dim need to be the same as embedding dim).")
 36 | 
 37 |     # Model Hyperparameters
 38 |     parser.add_argument("--pad-seq-len",
 39 |                         type=list,
 40 |                         default=[350, 15, 10],
 41 |                         help="Padding Sequence length of data. (depends on the data)")
 42 | 
 43 |     parser.add_argument("--embedding-type",
 44 |                         type=int,
 45 |                         default=1,
 46 |                         help="The embedding type. (default: 1)")
 47 | 
 48 |     parser.add_argument("--embedding-dim",
 49 |                         type=int,
 50 |                         default=300,
 51 |                         help="Dimensionality of character embedding. (default: 300)")
 52 | 
 53 |     parser.add_argument("--attention-type",
 54 |                         nargs="?",
 55 |                         default="normal",
 56 |                         help="The attention type. ('normal', 'cosine', 'mlp', 'islet')")
 57 | 
 58 |     parser.add_argument("--attention-dim",
 59 |                         type=int,
 60 |                         default=200,
 61 |                         help="Dimensionality of Attention Neurons. (default: 200)")
 62 | 
 63 |     parser.add_argument("--filter-sizes",
 64 |                         type=list,
 65 |                         default=[3, 3],
 66 |                         help="Filter sizes. (default: 3)")
 67 | 
 68 |     parser.add_argument("--num-filters",
 69 |                         type=list,
 70 |                         default=[200, 400],
 71 |                         help="Number of filters per filter size. (default: 128)")
 72 | 
 73 |     parser.add_argument("--pooling-size",
 74 |                         type=int,
 75 |                         default=3,
 76 |                         help="Pooling sizes. (default: 3)")
 77 | 
 78 |     parser.add_argument("--rnn-dim",
 79 |                         type=int,
 80 |                         default=128,
 81 |                         help="Dimensionality for RNN Neurons. (default: 256)")
 82 | 
 83 |     parser.add_argument("--rnn-type",
 84 |                         nargs="?",
 85 |                         default="GRU",
 86 |                         help="Type of RNN Cell. ('RNN', 'LSTM', 'GRU')")
 87 | 
 88 |     parser.add_argument("--rnn-layers",
 89 |                         type=int,
 90 |                         default=1,
 91 |                         help="Number of RNN Layers. (default: 1)")
 92 | 
 93 |     parser.add_argument("--fc-dim",
 94 |                         type=int,
 95 |                         default=512,
 96 |                         help="Dimensionality for FC Neurons. (default: 512)")
 97 | 
 98 |     parser.add_argument("--dropout-rate",
 99 |                         type=float,
100 |                         default=0.5,
101 |                         help="Dropout keep probability. (default: 0.5)")
102 | 
103 |     # Training Parameters
104 |     parser.add_argument("--epochs",
105 |                         type=int,
106 |                         default=30,
107 |                         help="Number of training epochs. (default: 30)")
108 | 
109 |     parser.add_argument("--batch-size",
110 |                         type=int,
111 |                         default=32,
112 |                         help="Batch Size. (default: 32)")
113 | 
114 |     parser.add_argument("--learning-rate",
115 |                         type=float,
116 |                         default=0.001,
117 |                         help="Learning rate. (default: 0.001)")
118 | 
119 |     parser.add_argument("--decay-rate",
120 |                         type=float,
121 |                         default=0.95,
122 |                         help="Rate of decay for learning rate. (default: 0.95)")
123 | 
124 |     parser.add_argument("--decay-steps",
125 |                         type=int,
126 |                         default=500,
127 |                         help="How many steps before decay learning rate. (default: 500)")
128 | 
129 |     parser.add_argument("--evaluate-steps",
130 |                         type=int,
131 |                         default=10,
132 |                         help="Evaluate model on val set after how many steps. (default: 10)")
133 | 
134 |     parser.add_argument("--norm-ratio",
135 |                         type=float,
136 |                         default=1.25,
137 |                         help="The ratio of the sum of gradients norms of trainable variable. (default: 1.25)")
138 | 
139 |     parser.add_argument("--l2-lambda",
140 |                         type=float,
141 |                         default=0.0,
142 |                         help="L2 regularization lambda. (default: 0.0)")
143 | 
144 |     parser.add_argument("--checkpoint-steps",
145 |                         type=int,
146 |                         default=10,
147 |                         help="Save model after how many steps. (default: 10)")
148 | 
149 |     parser.add_argument("--num-checkpoints",
150 |                         type=int,
151 |                         default=10,
152 |                         help="Number of checkpoints to store. (default: 10)")
153 | 
154 |     # Misc Parameters
155 |     parser.add_argument("--allow-soft-placement",
156 |                         type=bool,
157 |                         default=True,
158 |                         help="Allow device soft device placement. (default: True)")
159 | 
160 |     parser.add_argument("--log-device-placement",
161 |                         type=bool,
162 |                         default=False,
163 |                         help="Log placement of ops on devices. (default: False)")
164 | 
165 |     parser.add_argument("--gpu-options-allow-growth",
166 |                         type=bool,
167 |                         default=True,
168 |                         help="Allow gpu options growth. (default: True)")
169 | 
170 |     return parser.parse_args()


--------------------------------------------------------------------------------
/TMLA/DTR/test_dtr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.externals import joblib
11 | from sklearn.metrics import mean_squared_error, r2_score
12 | 
13 | logger = dp.logger_fn("dtr-log", "dtr/test-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TEST_DIR = '../../data/Test_BOW_sample.json'
17 | MODEL_DIR = 'dtr_model.m'
18 | 
19 | 
20 | def test():
21 |     logger.info("Loading data...")
22 | 
23 |     x_test, y_test = dp.load_data(TEST_DIR)
24 | 
25 |     logger.info("Loading model...")
26 |     model = joblib.load(MODEL_DIR)
27 | 
28 |     logger.info("Predicting...")
29 |     y_pred = model.predict(x_test)
30 | 
31 |     logger.info("Calculate Metrics...")
32 |     pcc, doa = dp.evaluation(y_test, y_pred)
33 |     rmse = mean_squared_error(y_test, y_pred) ** 0.5
34 |     r2 = r2_score(y_test, y_pred)
35 | 
36 |     logger.info("DTR: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2))
37 | 
38 |     logger.info("All Done.")
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     test()
43 | 


--------------------------------------------------------------------------------
/TMLA/DTR/train_dtr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.tree import DecisionTreeRegressor
11 | from sklearn.externals import joblib
12 | 
13 | logger = dp.logger_fn("dtr-log", "dtr/train-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json'
17 | MODEL_DIR = 'dtr_model.m'
18 | 
19 | 
20 | def train():
21 |     # Load data
22 |     logger.info("Loading data...")
23 | 
24 |     x_train, y_train = dp.load_data(TRAININGSET_DIR)
25 | 
26 |     logger.info("Finish building BOW.")
27 | 
28 |     model = DecisionTreeRegressor(criterion="mse", splitter="best")
29 | 
30 |     logger.info("Training model...")
31 |     model.fit(x_train, y_train)
32 | 
33 |     logger.info("Finish training. Saving model...")
34 |     joblib.dump(model, MODEL_DIR)
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     train()
39 | 


--------------------------------------------------------------------------------
/TMLA/LR/test_lr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.externals import joblib
11 | from sklearn.metrics import mean_squared_error, r2_score
12 | 
13 | logger = dp.logger_fn("lr-log", "lr/test-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TEST_DIR = '../../data/Test_BOW_sample.json'
17 | MODEL_DIR = 'lr_model.m'
18 | 
19 | 
20 | def test():
21 |     logger.info("Loading data...")
22 | 
23 |     x_test, y_test = dp.load_data(TEST_DIR)
24 | 
25 |     logger.info("Loading model...")
26 |     model = joblib.load(MODEL_DIR)
27 | 
28 |     logger.info("Predicting...")
29 |     y_pred = model.predict(x_test)
30 | 
31 |     logger.info("Calculate Metrics...")
32 |     pcc, doa = dp.evaluation(y_test, y_pred)
33 |     rmse = mean_squared_error(y_test, y_pred) ** 0.5
34 |     r2 = r2_score(y_test, y_pred)
35 | 
36 |     logger.info("Logistic: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2))
37 | 
38 |     logger.info("All Done.")
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     test()
43 | 


--------------------------------------------------------------------------------
/TMLA/LR/train_lr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.linear_model import LinearRegression
11 | from sklearn.externals import joblib
12 | 
13 | logger = dp.logger_fn("lr-log", "lr/train-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json'
17 | MODEL_DIR = 'lr_model.m'
18 | 
19 | 
20 | def train():
21 |     # Load data
22 |     logger.info("Loading data...")
23 | 
24 |     x_train, y_train = dp.load_data(TRAININGSET_DIR)
25 | 
26 |     logger.info("Finish building BOW.")
27 | 
28 |     model = LinearRegression()
29 | 
30 |     logger.info("Training model...")
31 |     model.fit(x_train, y_train)
32 | 
33 |     logger.info("Finish training. Saving model...")
34 |     joblib.dump(model, MODEL_DIR)
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     train()
39 | 


--------------------------------------------------------------------------------
/TMLA/SVM/test_svm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.externals import joblib
11 | from sklearn.metrics import mean_squared_error, r2_score
12 | 
13 | logger = dp.logger_fn("svm-log", "svm/test-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TEST_DIR = '../../data/Test_BOW_sample.json'
17 | MODEL_DIR = 'svm_model.m'
18 | 
19 | 
20 | def test():
21 |     logger.info("Loading data...")
22 | 
23 |     x_test, y_test = dp.load_data(TEST_DIR)
24 | 
25 |     logger.info("Loading model...")
26 |     model = joblib.load(MODEL_DIR)
27 | 
28 |     logger.info("Predicting...")
29 |     y_pred = model.predict(x_test)
30 | 
31 |     logger.info("Calculate Metrics...")
32 |     pcc, doa = dp.evaluation(y_test, y_pred)
33 |     rmse = mean_squared_error(y_test, y_pred) ** 0.5
34 |     r2 = r2_score(y_test, y_pred)
35 | 
36 |     logger.info("SVM: PCC {0:g} | DOA {1:g} | RMSE {2:g} | R2 {3:g}".format(pcc, doa, rmse, r2))
37 | 
38 |     logger.info("All Done.")
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     test()
43 | 


--------------------------------------------------------------------------------
/TMLA/SVM/train_svm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | from utils import data_process as dp
10 | from sklearn.svm import SVR
11 | from sklearn.externals import joblib
12 | 
13 | logger = dp.logger_fn("svm-log", "svm/train-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json'
17 | MODEL_DIR = 'svm_model.m'
18 | 
19 | 
20 | def train():
21 |     # Load data
22 |     logger.info("Loading data...")
23 | 
24 |     x_train, y_train = dp.load_data(TRAININGSET_DIR)
25 | 
26 |     logger.info("Finish building BOW.")
27 | 
28 |     model = SVR()
29 | 
30 |     logger.info("Training model...")
31 |     model.fit(x_train, y_train)
32 | 
33 |     logger.info("Finish training. Saving model...")
34 |     joblib.dump(model, MODEL_DIR)
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     train()
39 | 


--------------------------------------------------------------------------------
/TMLA/XGBoost/test_xgb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | import xgboost as xgb
10 | from utils import data_process as dp
11 | from sklearn.externals import joblib
12 | from sklearn.metrics import mean_squared_error, r2_score
13 | 
14 | logger = dp.logger_fn("xgb-log", "xgb/test-{0}.log".format(time.asctime()))
15 | 
16 | # Data Parameters
17 | TEST_DIR = '../../data/Test_BOW_sample.json'
18 | MODEL_DIR = 'xgb_model.m'
19 | 
20 | 
21 | def test():
22 |     logger.info("Loading data...")
23 | 
24 |     x_test, y_test = dp.load_data(TEST_DIR)
25 |     d_test = xgb.DMatrix(x_test, label=y_test)
26 | 
27 |     logger.info("Loading model...")
28 |     model = joblib.load(MODEL_DIR)
29 | 
30 |     logger.info("Predicting...")
31 |     y_pred = model.predict(d_test)
32 | 
33 |     logger.info("Calculate Metrics...")
34 |     pcc, doa = dp.evaluation(y_test, y_pred)
35 |     rmse = mean_squared_error(y_test, y_pred) ** 0.5
36 |     r2 = r2_score(y_test, y_pred)
37 | 
38 |     logger.info("XGBoost: PCC {0:.4f} | DOA {1:.4f} | RMSE {2:.4f} | R2 {3:.4f}".format(pcc, doa, rmse, r2))
39 | 
40 |     logger.info("All Done.")
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     test()
45 | 


--------------------------------------------------------------------------------
/TMLA/XGBoost/train_xgb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | __author__ = 'Randolph'
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | sys.path.append('../')
 8 | 
 9 | import xgboost as xgb
10 | from utils import data_process as dp
11 | from sklearn.externals import joblib
12 | 
13 | logger = dp.logger_fn("xgb-log", "xgb/train-{0}.log".format(time.asctime()))
14 | 
15 | # Data Parameters
16 | TRAININGSET_DIR = '../../data/Train_BOW_sample.json'
17 | VALIDATION_DIR = '../../data/Validation_BOW_sample.json'
18 | MODEL_DIR = 'xgb_model.m'
19 | 
20 | 
21 | def train():
22 |     # Load data
23 |     logger.info("Loading data...")
24 | 
25 |     x_train, y_train = dp.load_data(TRAININGSET_DIR)
26 |     x_val, y_val = dp.load_data(VALIDATION_DIR)
27 | 
28 |     d_train = xgb.DMatrix(x_train, label=y_train)
29 |     d_val = xgb.DMatrix(x_val, label=y_val)
30 |     watchlist = [(d_train, 'train'), (d_val, 'valid')]
31 |     logger.info("Finish building BOW.")
32 | 
33 |     params_xgb = {
34 |         'objective': 'reg:linear',
35 |         'eta': 0.001,
36 |         'max_depth': 10,
37 |         'eval_metric': 'rmse'
38 |     }
39 |     # TODO
40 |     model = xgb.train(params_xgb, d_train, 10000, evals=watchlist, early_stopping_rounds=20, verbose_eval=10)
41 |     logger.info("Training model...")
42 | 
43 |     logger.info("Finish training. Saving model...")
44 |     joblib.dump(model, MODEL_DIR)
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     train()
49 | 


--------------------------------------------------------------------------------
/TMLA/utils/data_process.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | __author__ = 'Randolph'
  3 | 
  4 | import os
  5 | import json
  6 | import math
  7 | import logging
  8 | import pickle
  9 | import numpy as np
 10 | from tqdm import tqdm
 11 | from scipy import stats
 12 | 
 13 | 
 14 | def logger_fn(name, input_file, level=logging.INFO):
 15 |     tf_logger = logging.getLogger(name)
 16 |     tf_logger.setLevel(level)
 17 |     log_dir = os.path.dirname(input_file)
 18 |     if not os.path.exists(log_dir):
 19 |         os.makedirs(log_dir)
 20 |     fh = logging.FileHandler(input_file, mode='w')
 21 |     formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 22 |     fh.setFormatter(formatter)
 23 |     tf_logger.addHandler(fh)
 24 |     return tf_logger
 25 | 
 26 | 
 27 | def create_word_dict(input_file, pickle_file):
 28 |     with open(input_file, 'r') as fin, open(pickle_file, 'wb') as handle:
 29 |         word_dict = dict()
 30 |         word_num = 0
 31 |         for eachline in fin:
 32 |             line = json.loads(eachline)
 33 |             words = line['content'] + line['question'] + line['pos_text']
 34 |             for word in words:
 35 |                 if word not in word_dict.keys():
 36 |                     word_dict[word] = word_num
 37 |                     word_num = word_num + 1
 38 |         # Save Word Dict
 39 |         pickle.dump(word_dict, handle)
 40 | 
 41 | 
 42 | def create_bow_feature(input_file, pickle_file, output_file):
 43 |     with open(input_file, 'r') as fin, open(pickle_file, 'rb') as handle, open(output_file, 'w') as fout:
 44 |         word_dict = pickle.load(handle)
 45 |         word_num = len(word_dict.keys())
 46 |         print(word_num)
 47 | 
 48 |         for eachline in tqdm(fin):
 49 |             line = json.loads(eachline)
 50 |             words = line['content'] + line['question'] + line['pos_text']
 51 |             feature = [0] * word_num
 52 |             for word in words:
 53 |                 feature[word_dict[word]] += 1
 54 |             data_record = {
 55 |                 'id': line['id'],
 56 |                 'feature': feature,
 57 |                 'diff': line['diff']
 58 |             }
 59 |             fout.write(json.dumps(data_record, ensure_ascii=False) + '\n')
 60 | 
 61 | 
 62 | def load_data(data_file):
 63 |     x_data, y_data = [], []
 64 |     with open(data_file, 'r') as f_train:
 65 |         for eachline in f_train:
 66 |             line = json.loads(eachline)
 67 |             x_data.append(list(map(float, line['feature'])))
 68 |             y_data.append(float(line['diff']))
 69 | 
 70 |     x_data = np.array(x_data)
 71 |     y_data = np.array(y_data)
 72 | 
 73 |     return x_data, y_data
 74 | 
 75 | 
 76 | def evaluation(test_y, pred_y):
 77 |     # compute pcc
 78 |     pcc, _ = stats.pearsonr(pred_y, test_y)
 79 |     if math.isnan(pcc):
 80 |         print('ERROR: PCC=nan', test_y, pred_y)
 81 |     # compute doa
 82 |     n = 0
 83 |     correct_num = 0
 84 |     for i in range(len(test_y) - 1):
 85 |         for j in range(i + 1, len(test_y)):
 86 |             if (test_y[i] > test_y[j]) and (pred_y[i] > pred_y[j]):
 87 |                 correct_num += 1
 88 |             elif (test_y[i] == test_y[j]) and (pred_y[i] == pred_y[j]):
 89 |                 continue
 90 |             elif (test_y[i] < test_y[j]) and (pred_y[i] < pred_y[j]):
 91 |                 correct_num += 1
 92 |             n += 1
 93 |     if n == 0:
 94 |         print(test_y)
 95 |         return -1, -1
 96 |     doa = correct_num / n
 97 |     return pcc, doa
 98 | 
 99 | 
100 | if __name__ == '__main__':
101 |     # create_word_dict('../../data/data.json', '../../data/word.pickle')
102 |     # create_bow_feature('../../data/Train_sample.json', '../../data/word.pickle', '../../data/Train_BOW_sample.json')
103 |     # create_bow_feature('../../data/Validation_sample.json', '../../data/word.pickle', '../../data/Validation_BOW_sample.json')
104 |     # create_bow_feature('../../data/Test_sample.json', '../../data/word.pickle', '../../data/Test_BOW_sample.json')
105 |     pass
106 | 


--------------------------------------------------------------------------------
/Usage-PyTorch.md:
--------------------------------------------------------------------------------
  1 | # Usage-PyTorch
  2 | 
  3 | ## Options
  4 | 
  5 | ### Input and output options
  6 | 
  7 | ```
  8 |   --train-file              STR    Training file.      		Default is `data/Train_sample.json`.
  9 |   --validation-file         STR    Validation file.      	Default is `data/Validation_sample.json`.
 10 |   --test-file               STR    Testing file.       		Default is `data/Test_sample.json`.
 11 |   --word2vec-file           STR    Word2vec model file.		Default is `data/word2vec_300.txt`.
 12 | ```
 13 | 
 14 | ### Model option
 15 | 
 16 | ```
 17 |   --pad-seq-len             LIST    Padding Sequence length of data.        Depends on data.
 18 |   --embedding-type          INT     The embedding type.                     Default is 1.
 19 |   --embedding-dim           INT     Dim of character embedding.             Default is 300.
 20 |   --filter-sizes            LIST    Filter sizes.                           Default is [3,3].
 21 |   --num-filters             LIST    Number of filters per filter size.      Default is [200,400].  
 22 |   --pooling-size            INT     Pooling size.                           Default is 3.
 23 |   --lstm-dim                INT     Dim of LSTM neurons.                    Default is 256.
 24 |   --lstm-layers             INT     Number of LSTM layers.                  Defatul is 1.
 25 |   --attention-type          STR     The attention type.                     Default is 'normal'.
 26 |   --attention-dim           INT     Dim of Attention neurons.               Default is 200.
 27 |   --fc-dim                  INT     Dim of FC neurons.                      Default is 512.
 28 |   --dropout-rate            FLOAT   Dropout keep probability.               Default is 0.5.
 29 | ```
 30 | 
 31 | ### Training option
 32 | 
 33 | ```
 34 |   --epochs                  INT     Number of epochs.                       Default is 30.
 35 |   --batch-size              INT     Batch size.                             Default is 32.
 36 |   --learning-rate           FLOAT   Adam learning rate.                     Default is 0.001.
 37 |   --decay-rate              FLOAT   Rate of decay for learning rate.        Default is 0.95.
 38 |   --decay-steps             INT     How many steps before decy lr.          Default is 500.
 39 |   --evaluate-steps          INT     How many steps to evluate val set.      Default is 10.
 40 |   --l2-lambda               FLOAT   L2 regularization lambda.               Default is 0.0.
 41 |   --checkpoint-steps        INT     How many steps to save model.           Default is 10.
 42 |   --num-checkpoints         INT     Number of checkpoints to store.         Default is 10.
 43 | ```
 44 | 
 45 | ## Training
 46 | 
 47 | The following commands train a model. (Use TARNN for example)
 48 | 
 49 | ```bash
 50 | $ python3 train_tarnn.py
 51 | ```
 52 | 
 53 | Training a model for a 10 epochs and set batch size as 128.
 54 | 
 55 | ```bash
 56 | $ python3 train_tarnn.py --epochs 10 --batch-size 128
 57 | ```
 58 | 
 59 | In the beginning, you will see the program shows:
 60 | 
 61 | ![](https://live.staticflickr.com/65535/49767412868_ca51f1eb17_o.png)
 62 | 
 63 | **You need to choose Training or Restore. (T for Training and R for Restore)**
 64 | 
 65 | After training, you will get the `/log` and  `/run` folder.
 66 | 
 67 | - `/log` folder saves the log info file.
 68 | - `/run` folder saves the checkpoints.
 69 | 
 70 | It should be like this:
 71 | 
 72 | ```text
 73 | .
 74 | ├── logs
 75 | ├── runs
 76 | │   └── 1586759461 [a 10-digital format]
 77 | │       ├── bestcheckpoints
 78 | │       ├── checkpoints
 79 | │       ├── embedding
 80 | │       └── summaries
 81 | ├── test_tarnn.py
 82 | ├── text_tarnn.py
 83 | └── train_tarnn.py
 84 | ```
 85 | 
 86 | **The programs name and identify the model by using the asctime (It should be 10-digital number, like 1586759461).** 
 87 | 
 88 | ## Restore
 89 | 
 90 | When your model stops training for some reason and you want to restore training, you can:
 91 | 
 92 | In the beginning, you will see the program shows:
 93 | 
 94 | ![](https://live.staticflickr.com/65535/49767947506_cbcc0ecfd1_o.png)
 95 | 
 96 | **And you need to input R for restore.**
 97 | 
 98 | Then you will be asked to give the model name (a 10-digital format, like 1586759461):
 99 | 
100 | ![](https://live.staticflickr.com/65535/49767968391_247d21d0bb_o.png)
101 | 
102 | And the model will continue training from the last time.
103 | 
104 | ## Test
105 | 
106 | The following commands test a model.
107 | 
108 | ```bash
109 | $ python3 test_tarnn.py
110 | ```
111 | 
112 | Then you will be asked to give the model name (a 10-digital format, like 1586759461):
113 | 
114 | ![](https://live.staticflickr.com/65535/49767454533_6af8053c5f_o.png)
115 | 
116 | And you can choose to use the best model or the latest model **(B for Best, L for Latest)**:
117 | 
118 | ![](https://live.staticflickr.com/65535/49768319867_0a9fc9cafd_o.png)
119 | 
120 | Finally, you can get the `predictions.json` file under the `/outputs`  folder, it should be like:
121 | 
122 | ```text
123 | .
124 | ├── graph
125 | ├── logs
126 | ├── output
127 | │   └── 1586759461
128 | │       └── predictions.json
129 | ├── runs
130 | │   └── 1586759461
131 | │       ├── bestcheckpoints
132 | │       ├── checkpoints
133 | │       ├── embedding
134 | │       └── summaries
135 | ├── test_tarnn.py
136 | ├── text_tarnn.py
137 | └── train_tarnn.py
138 | ```
139 | 
140 | 


--------------------------------------------------------------------------------
/Usage-TF.md:
--------------------------------------------------------------------------------
  1 | # Usage-TF
  2 | 
  3 | ## Options
  4 | 
  5 | ### Input and output options
  6 | 
  7 | ```
  8 |   --train-file              STR    Training file.      		Default is `data/Train_sample.json`.
  9 |   --validation-file         STR    Validation file.      	Default is `data/Validation_sample.json`.
 10 |   --test-file               STR    Testing file.       		Default is `data/Test_sample.json`.
 11 |   --word2vec-file           STR    Word2vec model file.		Default is `data/word2vec_300.txt`.
 12 | ```
 13 | 
 14 | ### Model option
 15 | 
 16 | ```
 17 |   --pad-seq-len             LIST    Padding Sequence length of data.        Depends on data.
 18 |   --embedding-type          INT     The embedding type.                     Default is 1.
 19 |   --embedding-dim           INT     Dim of character embedding.             Default is 300.
 20 |   --filter-sizes            LIST    Filter sizes.                           Default is [3,3].
 21 |   --num-filters             LIST    Number of filters per filter size.      Default is [200,400].  
 22 |   --pooling-size            INT     Pooling size.                           Default is 3.
 23 |   --lstm-dim                INT     Dim of LSTM neurons.                    Default is 256.
 24 |   --lstm-layers             INT     Number of LSTM layers.                  Defatul is 1.
 25 |   --attention-type          STR     The attention type.                     Default is 'normal'.
 26 |   --attention-dim           INT     Dim of Attention neurons.               Default is 200.
 27 |   --fc-dim                  INT     Dim of FC neurons.                      Default is 512.
 28 |   --dropout-rate            FLOAT   Dropout keep probability.               Default is 0.5.
 29 | ```
 30 | 
 31 | ### Training option
 32 | 
 33 | ```
 34 |   --epochs                  INT     Number of epochs.                       Default is 30.
 35 |   --batch-size              INT     Batch size.                             Default is 32.
 36 |   --learning-rate           FLOAT   Adam learning rate.                     Default is 0.001.
 37 |   --decay-rate              FLOAT   Rate of decay for learning rate.        Default is 0.95.
 38 |   --decay-steps             INT     How many steps before decy lr.          Default is 500.
 39 |   --evaluate-steps          INT     How many steps to evluate val set.      Default is 10.
 40 |   --l2-lambda               FLOAT   L2 regularization lambda.               Default is 0.0.
 41 |   --checkpoint-steps        INT     How many steps to save model.           Default is 10.
 42 |   --num-checkpoints         INT     Number of checkpoints to store.         Default is 10.
 43 | ```
 44 | 
 45 | ## Training
 46 | 
 47 | The following commands train a model. (Use TARNN for example)
 48 | 
 49 | ```bash
 50 | $ python3 train_tarnn.py
 51 | ```
 52 | 
 53 | Training a model for a 10 epochs and set batch size as 128.
 54 | 
 55 | ```bash
 56 | $ python3 train_tarnn.py --epochs 10 --batch-size 128
 57 | ```
 58 | 
 59 | In the beginning, you will see the program shows:
 60 | 
 61 | ![](https://live.staticflickr.com/65535/49767412868_ca51f1eb17_o.png)
 62 | 
 63 | **You need to choose Training or Restore. (T for Training and R for Restore)**
 64 | 
 65 | After training, you will get the `/log` and  `/run` folder.
 66 | 
 67 | - `/log` folder saves the log info file.
 68 | - `/run` folder saves the checkpoints.
 69 | 
 70 | It should be like this:
 71 | 
 72 | ```text
 73 | .
 74 | ├── logs
 75 | ├── runs
 76 | │   └── 1586759461 [a 10-digital format]
 77 | │       ├── bestcheckpoints
 78 | │       ├── checkpoints
 79 | │       ├── embedding
 80 | │       └── summaries
 81 | ├── test_tarnn.py
 82 | ├── text_tarnn.py
 83 | └── train_tarnn.py
 84 | ```
 85 | 
 86 | **The programs name and identify the model by using the asctime (It should be 10-digital number, like 1586759461).** 
 87 | 
 88 | ## Restore
 89 | 
 90 | When your model stops training for some reason and you want to restore training, you can:
 91 | 
 92 | In the beginning, you will see the program shows:
 93 | 
 94 | ![](https://live.staticflickr.com/65535/49767947506_cbcc0ecfd1_o.png)
 95 | 
 96 | **And you need to input R for restore.**
 97 | 
 98 | Then you will be asked to give the model name (a 10-digital format, like 1586759461):
 99 | 
100 | ![](https://live.staticflickr.com/65535/49767968391_247d21d0bb_o.png)
101 | 
102 | And the model will continue training from the last time.
103 | 
104 | ## Test
105 | 
106 | The following commands test a model.
107 | 
108 | ```bash
109 | $ python3 test_tarnn.py
110 | ```
111 | 
112 | Then you will be asked to give the model name (a 10-digital format, like 1586759461):
113 | 
114 | ![](https://live.staticflickr.com/65535/49767454533_6af8053c5f_o.png)
115 | 
116 | And you can choose to use the best model or the latest model **(B for Best, L for Latest)**:
117 | 
118 | ![](https://live.staticflickr.com/65535/49768319867_0a9fc9cafd_o.png)
119 | 
120 | Finally, you can get the `predictions.json` file under the `/outputs`  folder, it should be like:
121 | 
122 | ```text
123 | .
124 | ├── graph
125 | ├── logs
126 | ├── output
127 | │   └── 1586759461
128 | │       └── predictions.json
129 | ├── runs
130 | │   └── 1586759461
131 | │       ├── bestcheckpoints
132 | │       ├── checkpoints
133 | │       ├── embedding
134 | │       └── summaries
135 | ├── test_tarnn.py
136 | ├── text_tarnn.py
137 | └── train_tarnn.py
138 | ```
139 | 
140 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch==1.6.0
 2 | tensorflow==1.15.0
 3 | tensorflow_tensorboard==1.5.1
 4 | tensorboard==1.15.0
 5 | xgboost==1.2.0
 6 | matplotlib==2.2.3
 7 | tflearn==0.3.2
 8 | gensim==3.8.3
 9 | numpy==1.16.2
10 | Pillow==5.4.1
11 | python_gflags==3.1.2
12 | scikit_learn==0.19.1
13 | tqdm==4.49.0
14 | google-compute-engine==2.8.13


--------------------------------------------------------------------------------