├── .gitignore ├── dataset ├── toxic_comment.py └── toxic_comment_fold.py ├── main.py ├── main_fold.py ├── net ├── c_lstm.py ├── cnn.py ├── lstm.py └── lstm_mean.py ├── readme.md ├── requirements.txt ├── test_coeff.py └── utils ├── embedding_utils.py └── misc.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | -------------------------------------------------------------------------------- /dataset/toxic_comment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-01 15:41:29 7 | Program: 8 | Description: 9 | """ 10 | from torch.utils.data import Dataset, DataLoader 11 | import os 12 | import json 13 | import numpy as np 14 | import pandas as pd 15 | from utils.embedding_utils import tokenize_sentences, read_embedding_list, clear_embedding_list, convert_tokens_to_ids 16 | 17 | 18 | class ToxicCommentDataSet(Dataset): 19 | """toxic comment data set""" 20 | 21 | def __init__(self, dir_data, sentence_length, phase): 22 | self.dir_data = dir_data 23 | self.sentence_length = sentence_length 24 | self.phase = phase 25 | self.dir_save = self.dir_data + '/train-test-len-{:d}'.format(self.sentence_length) 26 | self.dir_train = self.dir_data + '/train.csv' 27 | self.dir_test = self.dir_data + '/test.csv' 28 | self.dir_embedding = self.dir_data + '/crawl-300d-2M.vec' 29 | self.UNKNOWN_WORD = '_UNK_' 30 | self.NAN_WORD = '_NAN_' 31 | self.END_WORD = '_END_' 32 | self.CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] 33 | 34 | if self.phase == 'Train': 35 | self.x, self.y, self.embeddings = self.load_data_train() 36 | elif self.phase == 'Test': 37 | self.test_id, self.x, self.embeddings = self.load_data_test() 38 | 39 | def load_data_train(self): 40 | if os.path.exists(self.dir_save): 41 | print('Find pre-processed data\nLoading train.txt...') 42 | data_train = np.loadtxt(self.dir_save + '/train.txt') 43 | 44 | print('Loading label.txt...') 45 | label_train = np.loadtxt(self.dir_save + '/label.txt') 46 | 47 | print('Loading embedding_list.txt...') 48 | embedding_list = np.loadtxt(self.dir_save + '/embedding_list.txt') 49 | else: 50 | train_data = pd.read_csv(self.dir_train) 51 | test_data = pd.read_csv(self.dir_test) 52 | list_sentences_train = train_data["comment_text"].fillna(self.NAN_WORD).values 53 | list_sentences_test = test_data["comment_text"].fillna(self.NAN_WORD).values 54 | label_train = train_data[self.CLASSES].values # (159571, 6) 55 | 56 | print("Tokenizing sentences in train set...") 57 | tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {}) # 159571 58 | print("Tokenizing sentences in test set...") 59 | tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict) # 153164 60 | words_dict[self.UNKNOWN_WORD] = len(words_dict) # insert unknown_word to the last 61 | 62 | print("Loading embeddings...") 63 | embedding_list, embedding_word_dict = read_embedding_list(self.dir_embedding) 64 | embedding_size = len(embedding_list[0]) # 300 65 | 66 | print("Preparing data...") 67 | embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict) 68 | embedding_word_dict[self.UNKNOWN_WORD] = len(embedding_word_dict) 69 | embedding_list.append([0.] 
* embedding_size) 70 | embedding_word_dict[self.END_WORD] = len(embedding_word_dict) 71 | embedding_list.append([-1.] * embedding_size) 72 | 73 | id_to_word = dict((id, word) for word, id in words_dict.items()) 74 | data_train = convert_tokens_to_ids( 75 | tokenized_sentences_train, 76 | id_to_word, 77 | embedding_word_dict, 78 | self.sentence_length) 79 | data_test = convert_tokens_to_ids( 80 | tokenized_sentences_test, 81 | id_to_word, 82 | embedding_word_dict, 83 | self.sentence_length) 84 | 85 | os.mkdir(self.dir_save) 86 | np.savetxt(self.dir_save + '/train.txt', data_train) 87 | np.savetxt(self.dir_save + '/test.txt', data_test) 88 | np.savetxt(self.dir_save + '/label.txt', label_train) 89 | np.savetxt(self.dir_save + '/embedding_list.txt', embedding_list) 90 | with open(self.dir_save + '/id_to_word.json', 'w') as f: 91 | json.dump(id_to_word, f) 92 | print('Data saved in {:s}'.format(self.dir_save)) 93 | 94 | return data_train, label_train, embedding_list 95 | 96 | def load_data_test(self): 97 | print('Loading test.csv...') 98 | data_raw = pd.read_csv(self.dir_test) 99 | test_id = data_raw["id"].values 100 | test_id = test_id.reshape((len(test_id), 1)) 101 | 102 | print('Loading test.txt...') 103 | data_test = np.loadtxt(self.dir_save + '/test.txt') 104 | 105 | print('Loading embedding_list.txt...') 106 | embedding_list = np.loadtxt(self.dir_save + '/embedding_list.txt') 107 | 108 | return test_id, data_test, embedding_list 109 | 110 | def __len__(self): 111 | return len(self.x) 112 | 113 | def __getitem__(self, idx): 114 | """ get one sample""" 115 | sample = dict() 116 | sample['sentence'] = self.embeddings[self.x[idx].astype(int)].astype(np.float32) 117 | if self.phase == 'Train': 118 | sample['label'] = self.y[idx].astype(np.float32) 119 | 120 | return sample 121 | 122 | 123 | def main(): 124 | # train 125 | dir_data = '/media/csc105/Data/dataset-jiange/kaggle/toxic-comment-classification' 126 | sen_len = 500 127 | data_set = ToxicCommentDataSet(dir_data=dir_data, sentence_length=sen_len, phase='Train') 128 | loader = DataLoader(data_set, batch_size=16, shuffle=False, num_workers=4) 129 | n_batch = int(len(data_set.x)//loader.batch_size) 130 | for i_batch, sample_batch in enumerate(loader): 131 | print(i_batch, '/', n_batch, ':', sample_batch['sentence'].size(), sample_batch['label'].size()) 132 | 133 | # test 134 | # data_set = ToxicCommentDataSet(dir_data=dir_data, sentence_length=250, phase='Test') 135 | # loader = DataLoader(data_set, batch_size=16, shuffle=False, num_workers=4) 136 | # n_batch = int(len(data_set.x) // loader.batch_size) 137 | # for i_batch, sample_batch in enumerate(loader): 138 | # print(i_batch, '/', n_batch, ':', sample_batch['sentence'].size()) 139 | 140 | if __name__ == '__main__': 141 | main() 142 | -------------------------------------------------------------------------------- /dataset/toxic_comment_fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-16 12:34:33 7 | Program: 8 | Description: 9 | """ 10 | 11 | from torch.utils.data import Dataset, DataLoader 12 | import os 13 | import json 14 | import numpy as np 15 | import pandas as pd 16 | from utils.embedding_utils import tokenize_sentences, read_embedding_list, clear_embedding_list, convert_tokens_to_ids 17 | 18 | 19 | class ToxicComment(object): 20 | def __init__(self, dir_data, sentence_length, fold_count): 21 | 
self.dir_data = dir_data 22 | self.sentence_length = sentence_length 23 | self.fold_count = fold_count 24 | self.dir_save = self.dir_data + '/train-test-len-{:d}'.format(self.sentence_length) 25 | self.dir_train = self.dir_data + '/train.csv' 26 | self.dir_test = self.dir_data + '/test.csv' 27 | self.dir_embedding = self.dir_data + '/crawl-300d-2M.vec' 28 | self.UNKNOWN_WORD = '_UNK_' 29 | self.NAN_WORD = '_NAN_' 30 | self.END_WORD = '_END_' 31 | self.CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] 32 | 33 | self.x_train, self.y_train, self.x_test, self.test_id, self.embeddings = self.load_data() 34 | 35 | def load_data(self): 36 | if os.path.exists(self.dir_save): 37 | print('Find pre-processed data\nLoading train.txt...') 38 | data_train = np.loadtxt(self.dir_save + '/train.txt') 39 | 40 | print('Loading label.txt...') 41 | label_train = np.loadtxt(self.dir_save + '/label.txt') 42 | 43 | print('Loading test.txt...') 44 | data_test = np.loadtxt(self.dir_save + '/test.txt') 45 | 46 | print('Loading test id...') 47 | test_data = pd.read_csv(self.dir_test) 48 | test_id = test_data["id"].values 49 | test_id = test_id.reshape((len(test_id), 1)) 50 | 51 | print('Loading embedding_list.txt...') 52 | embedding_list = np.loadtxt(self.dir_save + '/embedding_list.txt') 53 | else: 54 | train_data = pd.read_csv(self.dir_train) 55 | test_data = pd.read_csv(self.dir_test) 56 | test_id = test_data["id"].values 57 | test_id = test_id.reshape((len(test_id), 1)) 58 | list_sentences_train = train_data["comment_text"].fillna(self.NAN_WORD).values 59 | list_sentences_test = test_data["comment_text"].fillna(self.NAN_WORD).values 60 | label_train = train_data[self.CLASSES].values # (159571, 6) 61 | 62 | print("Tokenizing sentences in train set...") 63 | tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, {}) # 159571 64 | print("Tokenizing sentences in test set...") 65 | tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict) # 153164 66 | words_dict[self.UNKNOWN_WORD] = len(words_dict) # insert unknown_word to the last 67 | 68 | print("Loading embeddings...") 69 | embedding_list, embedding_word_dict = read_embedding_list(self.dir_embedding) 70 | embedding_size = len(embedding_list[0]) # 300 71 | 72 | print("Preparing data...") 73 | embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict) 74 | embedding_word_dict[self.UNKNOWN_WORD] = len(embedding_word_dict) 75 | embedding_list.append([0.] * embedding_size) 76 | embedding_word_dict[self.END_WORD] = len(embedding_word_dict) 77 | embedding_list.append([-1.] 
* embedding_size) 78 | 79 | id_to_word = dict((id, word) for word, id in words_dict.items()) 80 | data_train = convert_tokens_to_ids( 81 | tokenized_sentences_train, 82 | id_to_word, 83 | embedding_word_dict, 84 | self.sentence_length) 85 | data_test = convert_tokens_to_ids( 86 | tokenized_sentences_test, 87 | id_to_word, 88 | embedding_word_dict, 89 | self.sentence_length) 90 | 91 | os.mkdir(self.dir_save) 92 | np.savetxt(self.dir_save + '/train.txt', data_train) 93 | np.savetxt(self.dir_save + '/test.txt', data_test) 94 | np.savetxt(self.dir_save + '/label.txt', label_train) 95 | np.savetxt(self.dir_save + '/embedding_list.txt', embedding_list) 96 | with open(self.dir_save + '/id_to_word.json', 'w') as f: 97 | json.dump(id_to_word, f) 98 | print('Data saved in {:s}'.format(self.dir_save)) 99 | 100 | return data_train, label_train, data_test, test_id, embedding_list 101 | 102 | def get_fold_by_id(self, idx): 103 | fold_size = len(self.x_train) // self.fold_count 104 | fold_start = fold_size * idx 105 | fold_end = fold_start + fold_size 106 | if idx == fold_size - 1: 107 | fold_end = len(self.x_train) 108 | 109 | x_t = np.concatenate([self.x_train[:fold_start], self.x_train[fold_end:]]) 110 | y_t = np.concatenate([self.y_train[:fold_start], self.y_train[fold_end:]]) 111 | x_v = self.x_train[fold_start:fold_end] 112 | y_v = self.y_train[fold_start:fold_end] 113 | 114 | return x_t, y_t, x_v, y_v 115 | 116 | 117 | class ToxicCommentDataSet(Dataset): 118 | """toxic comment data set""" 119 | 120 | def __init__(self, x, embeddings, y=None, phase='Train'): 121 | self.x = x 122 | self.embeddings = embeddings 123 | self.y = y 124 | self.phase = phase 125 | 126 | def __len__(self): 127 | return len(self.x) 128 | 129 | def __getitem__(self, idx): 130 | """ get one sample""" 131 | sample = dict() 132 | sample['sentence'] = self.embeddings[self.x[idx].astype(int)].astype(np.float32) 133 | if self.phase == 'Train' or self.phase == 'Valid': 134 | sample['label'] = self.y[idx].astype(np.float32) 135 | 136 | return sample 137 | 138 | 139 | def main(): 140 | # train 141 | dir_data = '/media/csc105/Data/dataset-jiange/kaggle/toxic-comment-classification' 142 | sen_len = 500 143 | # data_set = ToxicComment(dir_data=dir_data, sentence_length=sen_len) 144 | # loader = DataLoader(data_set, batch_size=16, shuffle=False, num_workers=4) 145 | # n_batch = int(len(data_set.x)//loader.batch_size) 146 | # for i_batch, sample_batch in enumerate(loader): 147 | # print(i_batch, '/', n_batch, ':', sample_batch['sentence'].size(), sample_batch['label'].size()) 148 | 149 | # test 150 | # data_set = ToxicCommentDataSet(dir_data=dir_data, sentence_length=250, phase='Test') 151 | # loader = DataLoader(data_set, batch_size=16, shuffle=False, num_workers=4) 152 | # n_batch = int(len(data_set.x) // loader.batch_size) 153 | # for i_batch, sample_batch in enumerate(loader): 154 | # print(i_batch, '/', n_batch, ':', sample_batch['sentence'].size()) 155 | 156 | if __name__ == '__main__': 157 | main() 158 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-01 15:36:56 7 | Program: 8 | Description: 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.utils.data import DataLoader 14 | import re 15 | import os 16 | import math 17 | import argparse 18 | 
import numpy as np 19 | import pandas as pd 20 | from time import time 21 | from tqdm import tqdm 22 | from tensorboardX import SummaryWriter 23 | from dataset.toxic_comment import ToxicCommentDataSet 24 | from utils.misc import to_var, adjust_learning_rate, pre_create_file_train, display_loss 25 | 26 | 27 | def get_parser(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--server', default=None, type=int, help='[6099]') 30 | parser.add_argument('--phase', default=None, help='[Train/Test]') 31 | parser.add_argument('--sen_len', default=None, type=int, help='sentence length') 32 | 33 | parser.add_argument('--net_name', default=None, help='[lstm]') 34 | parser.add_argument('--dir_date', default=None, help='Name it with date, such as 20180102') 35 | parser.add_argument('--batch_size', default=32, type=int, help='Batch size') 36 | parser.add_argument('--lr_base', default=1e-3, type=float, help='Base learning rate') 37 | parser.add_argument('--lr_decay_rate', default=0.1, type=float, help='Decay rate of lr') 38 | parser.add_argument('--epoch_lr_decay', default=1000, type=int, help='Every # epoch, lr decay lr_decay_rate') 39 | 40 | parser.add_argument('--layer_num', default=2, type=int, help='Lstm layer number') 41 | parser.add_argument('--hidden_size', default=64, type=int, help='Lstm hidden units') 42 | parser.add_argument('--gpu', default='0', help='GPU id list') 43 | parser.add_argument('--workers', default=4, type=int, help='Workers number') 44 | 45 | return parser.parse_args() 46 | 47 | 48 | def run_batch(sample, model, loss_func=None, optimizer=None, phase=None): 49 | """ 50 | Run a batch for phase = {train, valid, test} 51 | """ 52 | if phase == 'Train': 53 | model.train() 54 | else: 55 | model.eval() # test model,close dropout... 56 | 57 | x = to_var(sample['sentence']) 58 | label_pre = model(x) # [bs, 6] 59 | 60 | if phase == 'Train': 61 | label_gt = to_var(sample['label']) # [bs, 6] 62 | loss = loss_func(label_pre, label_gt) 63 | optimizer.zero_grad() # clear gradients for this training step 64 | loss.backward() # bp, compute gradients 65 | optimizer.step() # apply gradients 66 | return loss.data[0], label_pre.data 67 | else: 68 | return label_pre.data 69 | 70 | 71 | def main(args): 72 | """ 73 | 1. Train till the loss minimized for 4 epoch 74 | 2. 
Use the best model to test 75 | """ 76 | print('\n\n') 77 | print('START'.center(70, '=')) 78 | print('Net\t\t\t{:s}\nPhase\t\t\t{:s}\nSentence length\t\t{:d}'.format(args.net_name, args.phase, args.sen_len)) 79 | torch.set_default_tensor_type('torch.FloatTensor') 80 | 81 | if args.phase == 'Train': 82 | model = Net(in_features=300, hidden_size=args.hidden_size, layer_num=args.layer_num, phase='Train') 83 | if torch.cuda.is_available(): 84 | model = nn.DataParallel(model.cuda(), device_ids=args.gpu) 85 | 86 | print('LOADING DATA '.center(70, '=')) 87 | dir_model_date, dir_log_date = pre_create_file_train(dir_model, dir_log, args) 88 | writer = SummaryWriter(dir_log_date) 89 | loss_func = nn.BCEWithLogitsLoss() # loss(input, target) 90 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base) 91 | data_set = ToxicCommentDataSet(dir_data=dir_data, sentence_length=args.sen_len, phase='Train') 92 | loader = DataLoader(data_set, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) 93 | step_per_epoch = int(math.ceil(len(data_set) / loader.batch_size)) 94 | 95 | print('TRAIN'.center(70, '=')) 96 | loss_best = -1 97 | epoch_best = 0 98 | epoch_current = 0 99 | while True: 100 | adjust_learning_rate(optimizer, epoch_current, args.lr_base, args.lr_decay_rate, args.epoch_lr_decay) 101 | loss_list = [] 102 | for step, sample_batch in enumerate(loader): 103 | step_global = epoch_current * step_per_epoch + step 104 | tic = time() 105 | loss, _ = run_batch(sample=sample_batch, 106 | model=model, 107 | loss_func=loss_func, 108 | optimizer=optimizer, 109 | phase='Train') 110 | hour_per_epoch = step_per_epoch * ((time() - tic) / 3600) 111 | loss_list.append(loss) 112 | 113 | # display result and add to tensor board 114 | if (step + 1) % 10 == 0: 115 | display_loss(hour_per_epoch, epoch_current, args, step, step_per_epoch, optimizer, loss, loss_list, 116 | writer, step_global) 117 | loss_mean = np.mean(loss_list) 118 | epoch_current += 1 119 | if loss_mean < loss_best or loss_best == -1: 120 | loss_best = loss_mean 121 | epoch_best = epoch_current 122 | torch.save(model.state_dict(), dir_model_date + '/model-best.pkl') 123 | print('>>>save current best model in {:s}\n'.format(dir_model_date + '/model-best.pkl')) 124 | else: 125 | if epoch_current - epoch_best == 5: 126 | break 127 | 128 | print('TEST'.center(70, '=')) 129 | model = Net(in_features=300, hidden_size=args.hidden_size, layer_num=args.layer_num, phase='Test') 130 | if torch.cuda.is_available(): 131 | model = nn.DataParallel(model.cuda(), device_ids=args.gpu) 132 | dir_model_date = dir_model + '/' + args.net_name + '/' + args.dir_date 133 | model.load_state_dict(torch.load(dir_model_date + '/model-best.pkl')) 134 | print('>>>load best model in {:s}\n'.format(dir_model_date + '/model-best.pkl')) 135 | 136 | data_set = ToxicCommentDataSet(dir_data=dir_data, sentence_length=args.sen_len, phase='Test') 137 | loader = DataLoader(data_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) 138 | predicts = [] # 153164 139 | for step, sample_batch in enumerate(tqdm(loader)): 140 | predict = run_batch(sample=sample_batch, model=model, phase='Test') # bs x 6 141 | predicts.extend(predict.cpu().numpy()) 142 | ret = pd.DataFrame(data=predicts, columns=data_set.CLASSES) 143 | ret['id'] = data_set.test_id 144 | ret = ret[['id'] + data_set.CLASSES] 145 | ret.to_csv(dir_model_date+'/submit.csv', index=False) 146 | 147 | print('END'.center(70, '=')) 148 | 149 | 150 | if __name__ == '__main__': 151 | parser_args = 
get_parser() 152 | os.environ["CUDA_VISIBLE_DEVICES"] = parser_args.gpu # set visible gpu list, eg: '2,3,4' 153 | gpu_list = re.split('[, ]', parser_args.gpu) # store the gpu id into a list 154 | parser_args.gpu = range(len(list(filter(None, gpu_list)))) # gpu for PyTorch 155 | 156 | if parser_args.server == 6099: 157 | dir_data = '/media/csc105/Data/dataset-jiange/kaggle/toxic-comment-classification' 158 | dir_project = '/home/jiange/project/toxic_comment_classification' 159 | dir_model = dir_project + '/model' # directory to save model 160 | dir_log = dir_project + '/log' # directory to save log 161 | else: 162 | raise Exception('Must give the right server id!') 163 | 164 | if parser_args.net_name == 'lstm': 165 | from net.lstm import Net 166 | elif parser_args.net_name == 'lstm_mean': 167 | from net.lstm_mean import Net 168 | elif parser_args.net_name == 'c_lstm': 169 | from net.c_lstm import Net 170 | elif parser_args.net_name == 'cnn': 171 | from net.cnn import Net 172 | else: 173 | raise Exception('Must give a net name') 174 | 175 | main(parser_args) 176 | -------------------------------------------------------------------------------- /main_fold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-16 12:28:12 7 | Program: 8 | Description: 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.utils.data import DataLoader 14 | import re 15 | import os 16 | import argparse 17 | import numpy as np 18 | import pandas as pd 19 | from tqdm import tqdm 20 | from dataset.toxic_comment_fold import ToxicComment, ToxicCommentDataSet 21 | from utils.misc import to_var, adjust_learning_rate, pre_create_file_train 22 | 23 | 24 | def get_parser(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--server', default=None, type=int, help='[6099]') 27 | parser.add_argument('--phase', default=None, help='[Train/Test]') 28 | parser.add_argument('--sen_len', default=None, type=int, help='sentence length') 29 | 30 | parser.add_argument('--net_name', default=None, help='[lstm]') 31 | parser.add_argument('--dir_date', default=None, help='Name it with date, such as 20180102') 32 | parser.add_argument('--batch_size', default=32, type=int, help='Batch size') 33 | parser.add_argument('--lr_base', default=1e-3, type=float, help='Base learning rate') 34 | parser.add_argument('--lr_decay_rate', default=0.1, type=float, help='Decay rate of lr') 35 | parser.add_argument('--epoch_lr_decay', default=1000, type=int, help='Every # epoch, lr decay lr_decay_rate') 36 | 37 | parser.add_argument('--layer_num', default=2, type=int, help='Lstm layer number') 38 | parser.add_argument('--hidden_size', default=64, type=int, help='Lstm hidden units') 39 | parser.add_argument('--gpu', default='0', help='GPU id list') 40 | parser.add_argument('--workers', default=4, type=int, help='Workers number') 41 | parser.add_argument('--fold_count', default=10, type=int, help='Fold count') 42 | parser.add_argument('--coefficient', default=1.4, type=float, help='normalize coefficient') 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def run_batch(sample, model, loss_func=None, optimizer=None, phase=None): 48 | """ 49 | Run a batch for phase = {train, valid, test} 50 | """ 51 | if phase == 'Train': 52 | model.train() 53 | else: 54 | model.eval() # test model,close dropout... 
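        # eval() switches dropout (and batch norm, if any) into inference mode; the
        # Valid/Test branches below run only the forward pass (plus the loss for Valid)
        # and never update parameters.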
55 | 56 | x = to_var(sample['sentence']) 57 | label_pre = model(x) # [bs, 6] 58 | 59 | if phase == 'Train': 60 | label_gt = to_var(sample['label']) # [bs, 6] 61 | loss = loss_func(label_pre, label_gt) 62 | optimizer.zero_grad() # clear gradients for this training step 63 | loss.backward() # bp, compute gradients 64 | optimizer.step() # apply gradients 65 | return loss.data[0], label_pre.data 66 | 67 | elif phase == 'Valid': 68 | label_gt = to_var(sample['label']) # [bs, 6] 69 | loss = loss_func(label_pre, label_gt) 70 | return loss.data[0], label_pre.data 71 | 72 | else: 73 | return label_pre.data 74 | 75 | 76 | def get_best_model(args, loader_t, loader_v, fold_id, model_date, model, loss_func, optimizer): 77 | """ 78 | Save the model whose valid loss is minimized for 4 epoch 79 | """ 80 | loss_best = -1 81 | epoch_best = 0 82 | epoch_current = 0 83 | 84 | while True: 85 | adjust_learning_rate(optimizer, epoch_current, args.lr_base, args.lr_decay_rate, args.epoch_lr_decay) 86 | for step, sample_batch in enumerate(tqdm(loader_t)): 87 | _, _ = run_batch(sample=sample_batch, 88 | model=model, 89 | loss_func=loss_func, 90 | optimizer=optimizer, 91 | phase='Train') 92 | loss_total = [] 93 | for step, sample_batch in enumerate(tqdm(loader_v)): 94 | loss, _ = run_batch(sample=sample_batch, 95 | model=model, 96 | loss_func=loss_func, 97 | optimizer=optimizer, 98 | phase='Valid') 99 | loss_total.append(loss) 100 | loss_mean = np.mean(loss_total) 101 | print('epoch {:d}({:d}) loss {:.5f}({:.5f})\n'.format(epoch_current+1, epoch_best+1, loss_mean, loss_best)) 102 | epoch_current += 1 103 | if loss_mean < loss_best or loss_best == -1: 104 | loss_best = loss_mean 105 | epoch_best = epoch_current 106 | dir_model_save = model_date + '/fold{}-best.pkl'.format(fold_id+1) 107 | torch.save(model.state_dict(), dir_model_save) 108 | print('save current best model in {:s}\n'.format(dir_model_save)) 109 | else: 110 | if epoch_current - epoch_best == 5: 111 | break 112 | 113 | 114 | def main(args): 115 | """ 116 | 1. Train, and save best model for every fold 117 | 2. 
Use every best model to test, averaged 118 | """ 119 | print('\n\n') 120 | print('START'.center(70, '=')) 121 | print('Net\t\t\t{:s}\nPhase\t\t\t{:s}\nSentence length\t\t{:d}'.format(args.net_name, args.phase, args.sen_len)) 122 | torch.set_default_tensor_type('torch.FloatTensor') 123 | 124 | print('LOADING DATA '.center(70, '=')) 125 | data_set = ToxicComment(dir_data=dir_data, sentence_length=args.sen_len, fold_count=args.fold_count) 126 | 127 | if args.phase == 'Train': 128 | print('TRAIN'.center(70, '=')) 129 | model = Net(in_features=300, hidden_size=args.hidden_size, layer_num=args.layer_num, phase='Train') 130 | if torch.cuda.is_available(): 131 | model = nn.DataParallel(model.cuda(), device_ids=args.gpu) 132 | 133 | dir_model_date, dir_log_date = pre_create_file_train(dir_model, dir_log, args) 134 | loss_func = nn.BCEWithLogitsLoss() # loss(input, target) 135 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base) 136 | 137 | for fold_id in range(0, args.fold_count): 138 | print('>>>Fold {:d}\n'.format(fold_id+1)) 139 | x_t, y_t, x_v, y_v = data_set.get_fold_by_id(fold_id) 140 | data_fold_train = ToxicCommentDataSet(x_t, data_set.embeddings, y_t, phase='Train') 141 | data_fold_valid = ToxicCommentDataSet(x_v, data_set.embeddings, y_v, phase='Valid') 142 | loader_train = DataLoader(data_fold_train, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) 143 | loader_valid = DataLoader(data_fold_valid, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) 144 | get_best_model(args, loader_train, loader_valid, fold_id, dir_model_date, model, loss_func, optimizer) 145 | 146 | print('TEST'.center(70, '=')) 147 | dir_model_date = dir_model + '/' + args.net_name + '/' + args.dir_date 148 | model = Net(in_features=300, hidden_size=args.hidden_size, layer_num=args.layer_num, phase='Test') 149 | if torch.cuda.is_available(): 150 | model = nn.DataParallel(model.cuda(), device_ids=args.gpu) 151 | 152 | data_test = ToxicCommentDataSet(data_set.x_test, data_set.embeddings, phase='Test') 153 | loader_test = DataLoader(data_test, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) 154 | predicts_list = [] 155 | for fold_id in range(0, args.fold_count): 156 | print('\n>>>Fold {:d}'.format(fold_id + 1)) 157 | dir_restore = dir_model_date + '/fold{}-best.pkl'.format(fold_id+1) 158 | model.load_state_dict(torch.load(dir_restore)) 159 | predicts = [] # 153164 160 | for step, sample_batch in enumerate(tqdm(loader_test)): 161 | predict = run_batch(sample=sample_batch, model=model, phase='Test') # bs x 6 162 | predicts.extend(predict.cpu().numpy()) 163 | predicts = np.array(predicts).reshape([-1, 6]) 164 | np.savetxt(dir_model_date+'/predicts-fold{}'.format(fold_id), predicts) 165 | predicts_list.append(predicts) 166 | 167 | predicts_ret = np.ones(predicts_list[0].shape) 168 | for predicts_fold in predicts_list: 169 | predicts_ret *= predicts_fold 170 | 171 | predicts_ret **= (1. 
/ len(predicts_list)) 172 | predicts_ret **= args.coefficient 173 | 174 | ret = pd.DataFrame(data=predicts_ret, columns=data_set.CLASSES) 175 | ret['id'] = data_set.test_id 176 | ret = ret[['id'] + data_set.CLASSES] 177 | ret.to_csv(dir_model_date+'/submit.csv', index=False) 178 | 179 | print('END'.center(70, '=')) 180 | 181 | 182 | if __name__ == '__main__': 183 | parser_args = get_parser() 184 | os.environ["CUDA_VISIBLE_DEVICES"] = parser_args.gpu # set visible gpu list, eg: '2,3,4' 185 | gpu_list = re.split('[, ]', parser_args.gpu) # store the gpu id into a list 186 | parser_args.gpu = range(len(list(filter(None, gpu_list)))) # gpu for PyTorch 187 | 188 | if parser_args.server == 6099: 189 | dir_data = '/media/csc105/Data/dataset-jiange/kaggle/toxic-comment-classification' 190 | dir_project = '/home/jiange/project/toxic_comment_classification' 191 | dir_model = dir_project + '/model' # directory to save model 192 | dir_log = dir_project + '/log' # directory to save log 193 | else: 194 | raise Exception('Must give the right server id!') 195 | 196 | if parser_args.net_name == 'lstm': 197 | from net.lstm import Net 198 | elif parser_args.net_name == 'lstm_mean': 199 | from net.lstm_mean import Net 200 | elif parser_args.net_name == 'c_lstm': 201 | from net.c_lstm import Net 202 | elif parser_args.net_name == 'cnn': 203 | from net.cnn import Net 204 | else: 205 | raise Exception('Must give a net name') 206 | 207 | main(parser_args) 208 | -------------------------------------------------------------------------------- /net/c_lstm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-13 15:04:15 7 | Program: 8 | Description: 9 | """ 10 | import torch 11 | import torch.nn as nn 12 | from torch.nn.init import xavier_normal 13 | 14 | 15 | def conv(batch_norm, c_in, c_out, ks, sd=1, pad=0): 16 | if batch_norm: 17 | return nn.Sequential( 18 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=pad, bias=False), 19 | nn.BatchNorm2d(c_out), 20 | nn.ReLU(), 21 | ) 22 | else: 23 | return nn.Sequential( 24 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=pad, bias=True), 25 | nn.ReLU(), 26 | ) 27 | 28 | 29 | def fc(c_in, c_out, activation=None): 30 | if activation == 'relu': 31 | return nn.Sequential( 32 | nn.Linear(c_in, c_out), 33 | nn.ReLU(), 34 | ) 35 | elif activation == 'sigmoid': 36 | return nn.Sequential( 37 | nn.Linear(c_in, c_out), 38 | nn.Sigmoid(), 39 | ) 40 | else: 41 | return nn.Linear(c_in, c_out) 42 | 43 | 44 | class Net(nn.Module): 45 | def __init__(self, in_features, hidden_size, layer_num, phase='Train', batch_norm=False): 46 | super(Net, self).__init__() 47 | self.phase = phase 48 | self.batch_norm = batch_norm 49 | self.conv1 = conv(self.batch_norm, 1, 256, ks=[3, in_features], pad=0) 50 | self.dropout = nn.Dropout(0.5) 51 | self.lstm = nn.LSTM(input_size=256, 52 | hidden_size=hidden_size, 53 | num_layers=layer_num, 54 | batch_first=True, 55 | dropout=0.5, 56 | bidirectional=True) 57 | self.fc1 = fc(hidden_size*2, 32, activation='relu') 58 | self.fc2 = fc(32, 6) 59 | self.sigmoid = nn.Sigmoid() 60 | 61 | for m in self.modules(): 62 | if isinstance(m, nn.Conv2d): 63 | xavier_normal(m.weight.data) 64 | if m.bias is not None: 65 | m.bias.data.zero_() 66 | elif isinstance(m, nn.BatchNorm2d): 67 | m.weight.data.fill_(1) 68 | m.bias.data.zero_() 69 | 70 | def forward(self, x): 71 | N, T, D1 = 
tuple(x.size()) 72 | x = x.view(N, 1, T, D1) 73 | x = self.conv1(x) # N x 256 x T x 1 74 | x = torch.transpose(x, 2, 1) # N x T x 256 x 1 75 | x = x.contiguous() 76 | x = x.view(N, -1, 256) # N x T x 256 77 | x, _ = self.lstm(x) # N x T x D2 78 | x = x[:, -1, :] # N x 1 x D2 (last time step) 79 | x = x.view(N, -1) # N x D2 80 | x = self.fc1(x) 81 | x = self.fc2(x) 82 | 83 | if self.phase == 'Train': 84 | return x 85 | else: 86 | return self.sigmoid(x) 87 | 88 | def weight_parameters(self): 89 | return [param for name, param in self.named_parameters() if 'weight' in name] 90 | 91 | def bias_parameters(self): 92 | return [param for name, param in self.named_parameters() if 'bias' in name] 93 | 94 | 95 | def main(): 96 | net = Net(in_features=300, hidden_size=64, layer_num=2) 97 | print(net) 98 | 99 | from torch.autograd import Variable 100 | while True: 101 | input = Variable(torch.randn(32, 250, 300)) 102 | output = net(input) 103 | print(output.size()) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /net/cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-17 22:17:58 7 | Program: 8 | Description: 9 | """ 10 | import torch 11 | import torch.nn as nn 12 | from torch.nn.init import xavier_normal 13 | 14 | 15 | def conv(batch_norm, c_in, c_out, ks, sd=1, pad=0): 16 | if batch_norm: 17 | return nn.Sequential( 18 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=pad, bias=False), 19 | nn.BatchNorm2d(c_out), 20 | nn.ReLU(), 21 | ) 22 | else: 23 | return nn.Sequential( 24 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=pad, bias=True), 25 | nn.ReLU(), 26 | ) 27 | 28 | 29 | def fc(c_in, c_out, activation=None): 30 | if activation == 'relu': 31 | return nn.Sequential( 32 | nn.Linear(c_in, c_out), 33 | nn.ReLU(), 34 | ) 35 | elif activation == 'sigmoid': 36 | return nn.Sequential( 37 | nn.Linear(c_in, c_out), 38 | nn.Sigmoid(), 39 | ) 40 | else: 41 | return nn.Linear(c_in, c_out) 42 | 43 | 44 | class Net(nn.Module): 45 | def __init__(self, in_features, hidden_size=None, layer_num=None, phase='Train', batch_norm=False): 46 | super(Net, self).__init__() 47 | self.hidden_size = hidden_size 48 | self.layer_num = layer_num 49 | self.phase = phase 50 | self.batch_norm = batch_norm 51 | self.conv1 = conv(self.batch_norm, 1, 128, ks=[3, in_features], pad=0) 52 | self.conv2 = conv(self.batch_norm, 1, 128, ks=[4, in_features], pad=0) 53 | self.conv3 = conv(self.batch_norm, 1, 128, ks=[5, in_features], pad=0) 54 | self.dropout = nn.Dropout(0.5) 55 | self.fc1 = fc(384, 64, activation='relu') 56 | self.fc2 = fc(64, 6) 57 | self.sigmoid = nn.Sigmoid() 58 | 59 | for m in self.modules(): 60 | if isinstance(m, nn.Conv2d): 61 | xavier_normal(m.weight.data) 62 | if m.bias is not None: 63 | m.bias.data.zero_() 64 | elif isinstance(m, nn.BatchNorm2d): 65 | m.weight.data.fill_(1) 66 | m.bias.data.zero_() 67 | 68 | def forward(self, x): 69 | N, T, D1 = tuple(x.size()) 70 | x = x.view(N, 1, T, D1) 71 | x1 = self.conv1(x).view(N, 128, -1) # N x 128 x T 72 | x1, _ = torch.max(x1, 2) # N x 128 73 | x2 = self.conv2(x).view(N, 128, -1) # N x 128 x T x 1 74 | x2, _ = torch.max(x2, 2) # N x 128 75 | x3 = self.conv3(x).view(N, 128, -1) # N x 128 x T x 1 76 | x3, _ = torch.max(x3, 2) # N x 128 77 | x = torch.cat((x1, x2, x3), dim=1) # N 
x 384 78 | x = self.fc1(x) 79 | x = self.fc2(x) 80 | 81 | if self.phase == 'Train': 82 | return x 83 | else: 84 | return self.sigmoid(x) 85 | 86 | def weight_parameters(self): 87 | return [param for name, param in self.named_parameters() if 'weight' in name] 88 | 89 | def bias_parameters(self): 90 | return [param for name, param in self.named_parameters() if 'bias' in name] 91 | 92 | 93 | def main(): 94 | net = Net(in_features=300, phase='Train') 95 | print(net) 96 | from torch.autograd import Variable 97 | while True: 98 | input = Variable(torch.randn(32, 250, 300)) 99 | output = net(input) 100 | print(output.size()) 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /net/lstm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-01 15:42:29 7 | Program: 8 | Description: 9 | """ 10 | import torch.nn as nn 11 | from torch.nn.init import xavier_normal 12 | 13 | 14 | def conv(batch_norm, c_in, c_out, ks=3, sd=1): 15 | if batch_norm: 16 | return nn.Sequential( 17 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=(ks-1)//2, bias=False), 18 | nn.BatchNorm2d(c_out), 19 | nn.ReLU(), 20 | ) 21 | else: 22 | return nn.Sequential( 23 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=(ks-1)//2, bias=True), 24 | nn.ReLU(), 25 | ) 26 | 27 | 28 | def fc(c_in, c_out, activation=None): 29 | if activation == 'relu': 30 | return nn.Sequential( 31 | nn.Linear(c_in, c_out), 32 | nn.ReLU(), 33 | ) 34 | elif activation == 'sigmoid': 35 | return nn.Sequential( 36 | nn.Linear(c_in, c_out), 37 | nn.Sigmoid(), 38 | ) 39 | else: 40 | return nn.Linear(c_in, c_out) 41 | 42 | 43 | class Net(nn.Module): 44 | def __init__(self, in_features, hidden_size, layer_num, phase='Train'): 45 | super(Net, self).__init__() 46 | self.phase = phase 47 | self.dropout = nn.Dropout(0.5) 48 | self.lstm = nn.LSTM(input_size=in_features, 49 | hidden_size=hidden_size, 50 | num_layers=layer_num, 51 | batch_first=True, 52 | dropout=0.5, 53 | bidirectional=True) 54 | self.gru = nn.GRU(input_size=in_features, 55 | hidden_size=hidden_size, 56 | num_layers=layer_num, 57 | batch_first=True, 58 | dropout=0.5, 59 | bidirectional=True) 60 | self.fc1 = fc(hidden_size*2, 32, activation='relu') 61 | self.fc2 = fc(32, 6) 62 | self.sigmoid = nn.Sigmoid() 63 | 64 | for m in self.modules(): 65 | if isinstance(m, nn.Conv2d): 66 | xavier_normal(m.weight.data) 67 | if m.bias is not None: 68 | m.bias.data.zero_() 69 | elif isinstance(m, nn.BatchNorm2d): 70 | m.weight.data.fill_(1) 71 | m.bias.data.zero_() 72 | 73 | def forward(self, x): 74 | N, T, D1 = tuple(x.size()) 75 | # x = self.dropout(x) 76 | # x, _ = self.lstm(x) # N x T x D2 77 | x, _ = self.gru(x) # N x T x D2 78 | x = x[:, -1, :] # N x 1 x D2 (last time step) 79 | x = x.view(N, -1) # N x D2 80 | x = self.fc1(x) 81 | x = self.fc2(x) 82 | 83 | if self.phase == 'Train': 84 | return x 85 | else: 86 | return self.sigmoid(x) 87 | 88 | def weight_parameters(self): 89 | return [param for name, param in self.named_parameters() if 'weight' in name] 90 | 91 | def bias_parameters(self): 92 | return [param for name, param in self.named_parameters() if 'bias' in name] 93 | 94 | 95 | def main(): 96 | net = Net(in_features=300, hidden_size=64, layer_num=2) 97 | print(net) 98 | 99 | import torch 100 | from torch.autograd import 
Variable 101 | while True: 102 | input = Variable(torch.randn(32, 250, 300)) 103 | output = net(input) 104 | print(output.size()) 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /net/lstm_mean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-13 15:13:23 7 | Program: 8 | Description: 9 | """ 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | from torch.nn.init import xavier_normal 14 | 15 | 16 | def conv(batch_norm, c_in, c_out, ks=3, sd=1): 17 | if batch_norm: 18 | return nn.Sequential( 19 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=(ks-1)//2, bias=False), 20 | nn.BatchNorm2d(c_out), 21 | nn.ReLU(), 22 | ) 23 | else: 24 | return nn.Sequential( 25 | nn.Conv2d(c_in, c_out, kernel_size=ks, stride=sd, padding=(ks-1)//2, bias=True), 26 | nn.ReLU(), 27 | ) 28 | 29 | 30 | def fc(c_in, c_out, activation=None): 31 | if activation == 'relu': 32 | return nn.Sequential( 33 | nn.Linear(c_in, c_out), 34 | nn.ReLU(), 35 | ) 36 | elif activation == 'sigmoid': 37 | return nn.Sequential( 38 | nn.Linear(c_in, c_out), 39 | nn.Sigmoid(), 40 | ) 41 | else: 42 | return nn.Linear(c_in, c_out) 43 | 44 | 45 | class Net(nn.Module): 46 | def __init__(self, in_features, hidden_size, layer_num, phase='Train'): 47 | super(Net, self).__init__() 48 | self.phase = phase 49 | # self.dropout = nn.Dropout(0.5) 50 | self.lstm = nn.LSTM(input_size=in_features, 51 | hidden_size=hidden_size, 52 | num_layers=layer_num, 53 | batch_first=True, 54 | dropout=0.7, 55 | bidirectional=True) 56 | self.fc1 = fc(hidden_size*2, 32, activation='relu') 57 | self.fc2 = fc(32, 6) 58 | self.sigmoid = nn.Sigmoid() 59 | 60 | for m in self.modules(): 61 | if isinstance(m, nn.Conv2d): 62 | xavier_normal(m.weight.data) 63 | if m.bias is not None: 64 | m.bias.data.zero_() 65 | elif isinstance(m, nn.BatchNorm2d): 66 | m.weight.data.fill_(1) 67 | m.bias.data.zero_() 68 | 69 | def forward(self, x): 70 | # input: (N, T, D) 71 | N, T, D1 = tuple(x.size()) 72 | x, _ = self.lstm(x) # N x T x D2 73 | x = torch.sum(x, 1) / T # N x D2 74 | x = self.fc1(x) 75 | x = self.fc2(x) 76 | 77 | if self.phase == 'Train': 78 | return x 79 | else: 80 | return self.sigmoid(x) 81 | 82 | def weight_parameters(self): 83 | return [param for name, param in self.named_parameters() if 'weight' in name] 84 | 85 | def bias_parameters(self): 86 | return [param for name, param in self.named_parameters() if 'bias' in name] 87 | 88 | 89 | def main(): 90 | net = Net(in_features=300, hidden_size=64, layer_num=2) 91 | print(net) 92 | while True: 93 | input = Variable(torch.randn(32, 250, 300)) 94 | output = net(input) 95 | print(output.size()) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Toxic Comment Classification 2 | 3 | ## 主要功能 4 | 采用LSTM/C-LSTM/CNN等多种方法,对评论进行多标签分类 5 | 6 | ## 代码结构 7 | 8 | ### main 9 | 10 | - main.py 11 | 无交叉验证,取训练误差最小的参数,用于测试 12 | 13 | - main_fold.py 14 | k折交叉验证,测试时取k次的平均值 15 | 16 | ### dataset 17 | 18 | toxic comment 19 | 20 | ### net 21 | 22 | lstm/c-lstm/cnn... 
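
For reference, main_fold.py and test_coeff.py blend the per-fold test predictions with a geometric mean followed by a power term (the `--coefficient` flag, default 1.4). A minimal sketch of that step; the helper name `blend_folds` and its `fold_predictions` argument (a list of `[N, 6]` probability arrays) are illustrative names, not part of the code:

```
import numpy as np

def blend_folds(fold_predictions, coefficient=1.4):
    """Geometric mean of the per-fold probabilities, then a power correction."""
    blended = np.ones_like(fold_predictions[0])
    for fold in fold_predictions:
        blended *= fold                       # element-wise product over folds
    blended **= 1.0 / len(fold_predictions)   # geometric mean
    blended **= coefficient                   # same role as --coefficient
    return blended
```
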
23 | 24 | ## 运行 25 | 26 | 无交叉验证的运行指令如: 27 | ``` 28 | python main.py \ 29 | --server=6099 \ 30 | --phase=Train \ 31 | --sen_len=250 \ 32 | --net_name=lstm \ 33 | --dir_date=20180317 \ 34 | --lr_base=1e-3 \ 35 | --batch_size=256 \ 36 | --gpu=0 \ 37 | ``` 38 | 39 | 有交叉验证的指令如: 40 | ``` 41 | python main_fold.py \ 42 | --server=6099 \ 43 | --phase=Train \ 44 | --sen_len=250 \ 45 | --net_name=lstm \ 46 | --dir_date=20180318_fold_20 \ 47 | --batch_size=256 \ 48 | --lr_base=1e-3 \ 49 | --gpu=1 \ 50 | --fold_count=10 \ 51 | ``` -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | pandas 3 | tqdm 4 | -------------------------------------------------------------------------------- /test_coeff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-20 16:27:14 7 | Program: 8 | Description: 9 | """ 10 | import argparse 11 | import numpy as np 12 | import pandas as pd 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--coefficient', default=1.4, type=float, help='normalize coefficient') 17 | args = parser.parse_args() 18 | 19 | print('\n') 20 | print('START'.center(70, '=')) 21 | CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] 22 | print('Loading test id...') 23 | 24 | dir_data = '/media/csc105/Data/dataset-jiange/kaggle/toxic-comment-classification' 25 | dir_test = dir_data + '/test.csv' 26 | dir_save = './model/lstm/20180317' 27 | 28 | test_data = pd.read_csv(dir_test) 29 | test_id = test_data["id"].values 30 | test_id = test_id.reshape((len(test_id), 1)) 31 | 32 | predicts_list = list() 33 | for i in range(10): 34 | predicts = np.loadtxt(dir_save + '/predicts-fold{}'.format(i)) 35 | predicts_list.append(predicts) 36 | 37 | predicts_ret = np.ones(predicts_list[0].shape) 38 | for predicts_fold in predicts_list: 39 | predicts_ret *= predicts_fold 40 | 41 | predicts_ret **= (1. 
/ len(predicts_list)) 42 | predicts_ret **= args.coefficient 43 | 44 | ret = pd.DataFrame(data=predicts_ret, columns=CLASSES) 45 | ret['id'] = test_id 46 | ret = ret[['id'] + CLASSES] 47 | ret.to_csv(dir_save + '/submit-coeff-{:.1f}.csv'.format(args.coefficient), index=False) 48 | print('DONE'.center(70, '=')) 49 | -------------------------------------------------------------------------------- /utils/embedding_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-06 13:55:21 7 | Program: 8 | Description: 9 | """ 10 | import nltk 11 | import numpy as np 12 | from tqdm import tqdm 13 | 14 | 15 | def tokenize_sentences(sentences, words_dict): 16 | """ 17 | :param sentences: list 18 | :param words_dict: {} 19 | :return: 20 | tokenized_sentences: list 21 | words_dict: {'word': word_id} 22 | """ 23 | tokenized_sentences = [] 24 | for sentence in tqdm(sentences): 25 | if hasattr(sentence, "decode"): 26 | sentence = sentence.decode("utf-8") 27 | tokens = nltk.tokenize.word_tokenize(sentence) 28 | result = [] 29 | for word in tokens: 30 | word = word.lower() 31 | if word not in words_dict: 32 | words_dict[word] = len(words_dict) 33 | word_index = words_dict[word] 34 | result.append(word_index) 35 | tokenized_sentences.append(result) 36 | return tokenized_sentences, words_dict 37 | 38 | 39 | def read_embedding_list(file_path): 40 | """ 41 | return: 42 | embedding_list: 2M x 300 43 | embedding_word_dict: {'word': id} length 2M 44 | """ 45 | embedding_word_dict = {} 46 | embedding_list = [] 47 | with open(file_path) as f: 48 | for row in tqdm(f.read().split("\n")[1:-1]): 49 | data = row.split(" ") 50 | word = data[0] 51 | embedding = np.array([float(num) for num in data[1:-1]]) 52 | embedding_list.append(embedding) 53 | embedding_word_dict[word] = len(embedding_word_dict) 54 | 55 | embedding_list = np.array(embedding_list) 56 | return embedding_list, embedding_word_dict 57 | 58 | 59 | def clear_embedding_list(embedding_list, embedding_word_dict, words_dict): 60 | """ 61 | return: 62 | cleared_embedding_list: W x 300, W is the number --> words_dict & embedding_word_dict 63 | cleared_embedding_word_dict: {'word': id} length W 64 | """ 65 | cleared_embedding_list = [] 66 | cleared_embedding_word_dict = {} 67 | 68 | for word in words_dict: 69 | if word not in embedding_word_dict: 70 | continue 71 | word_id = embedding_word_dict[word] 72 | row = embedding_list[word_id] 73 | cleared_embedding_list.append(row) 74 | cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict) 75 | 76 | return cleared_embedding_list, cleared_embedding_word_dict 77 | 78 | 79 | def convert_tokens_to_ids(tokenized_sentences, words_list, embedding_word_dict, sentences_length): 80 | words_train = [] 81 | 82 | for sentence in tokenized_sentences: 83 | current_words = [] 84 | for word_index in sentence: # eg: [10, 13, 8, ...] 
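            # word_index comes from the tokenizer's words_dict; map it back to the word
            # string, then to its row in the cleared embedding table. Unknown words fall
            # back to UNKNOWN_WORD (second-to-last row); shorter sentences are padded
            # with END_WORD (last row) further below.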
85 | word = words_list[word_index] # eg: 'the' 86 | word_id = embedding_word_dict.get(word, len(embedding_word_dict) - 2) # id in embedding_word_dict 87 | current_words.append(word_id) 88 | 89 | if len(current_words) >= sentences_length: 90 | current_words = current_words[:sentences_length] 91 | else: 92 | current_words += [len(embedding_word_dict) - 1] * (sentences_length - len(current_words)) # add END_WORD 93 | words_train.append(current_words) 94 | return words_train 95 | -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Author: Linjian Zhang 5 | Email: linjian93@foxmail.com 6 | Create Time: 2018-03-01 16:47:20 7 | Program: 8 | Description: 9 | """ 10 | 11 | import torch 12 | import shutil 13 | import os 14 | import math 15 | import numpy as np 16 | from torch.autograd import Variable 17 | from torch.nn import init 18 | 19 | 20 | def pre_create_file_train(dir_model, dir_log, args): 21 | """ 22 | :param dir_model: ./model 23 | :param dir_log: ./log 24 | :param args: 25 | :return: 26 | mkdir ./model/lstm/20180101 if not exist, mkdir 27 | mkdir ./log/lstm/20180101 if exist, remove then mkdir 28 | """ 29 | if not os.path.exists(dir_model): 30 | os.mkdir(dir_model) 31 | if not os.path.exists(dir_log): 32 | os.mkdir(dir_log) 33 | 34 | dir_models = dir_model + '/' + args.net_name 35 | dir_logs = dir_log + '/' + args.net_name 36 | dir_model_date = dir_models + '/' + args.dir_date 37 | dir_log_date = dir_logs + '/' + args.dir_date 38 | if not os.path.exists(dir_models): 39 | os.mkdir(dir_models) 40 | if not os.path.exists(dir_logs): 41 | os.mkdir(dir_logs) 42 | if not os.path.exists(dir_model_date): 43 | os.mkdir(dir_model_date) 44 | if os.path.exists(dir_log_date): 45 | shutil.rmtree(dir_log_date) 46 | os.mkdir(dir_log_date) 47 | return dir_model_date, dir_log_date 48 | 49 | 50 | def pre_create_file_test(args): 51 | dir_test = './test' 52 | if not os.path.exists(dir_test): 53 | os.mkdir(dir_test) 54 | 55 | dir_net = dir_test + '/' + args.net_restore 56 | dir_time = dir_net + '/' + args.date_restore + '_' + args.model_restore 57 | if not os.path.exists(dir_net): 58 | os.mkdir(dir_net) 59 | if not os.path.exists(dir_time): 60 | os.mkdir(dir_time) 61 | return dir_time 62 | 63 | 64 | def to_var(x): 65 | if torch.cuda.is_available(): 66 | return Variable(x).cuda() 67 | else: 68 | return Variable(x) 69 | 70 | 71 | def init_xavier(m): 72 | if isinstance(m, torch.nn.Conv2d): 73 | init.xavier_normal(m.weight.data) 74 | init.constant(m.bias.data, 0.0) 75 | if isinstance(m, torch.nn.Linear): 76 | init.xavier_normal(m.weight.data) 77 | init.constant(m.bias.data, 0.0) 78 | if isinstance(m, torch.nn.BatchNorm2d): 79 | init.xavier_normal(m.weight.data) 80 | init.constant(m.bias.data, 0.0) 81 | 82 | 83 | def adjust_learning_rate(optimizer, epoch, lr_base, gamma=0.316, epoch_lr_decay=25): 84 | """ 85 | epoch lr 86 | 000-025 1e-4 87 | 025-050 3e-5 88 | 050-075 1e-5 89 | 075-100 3e-6 90 | 100-125 1e-6 91 | 125-150 3e-7 92 | """ 93 | 94 | exp = int(math.floor(epoch / epoch_lr_decay)) 95 | lr_decay = gamma ** exp 96 | for param_group in optimizer.param_groups: 97 | param_group['lr'] = lr_decay * lr_base 98 | 99 | 100 | def display_loss(hour_per_epoch, epoch, args, step, step_per_epoch, optimizer, loss, loss_list, writer, step_global): 101 | """ 102 | tensor board: loss, mean loss (reset every epoch) 103 | """ 104 | 
loss_mean = np.mean(loss_list) 105 | print('\n{:.3f}h/E {:03d} [{:03d}/{:03d}] [lr {:.6f}] {:.3f} ({:.3f})'.format( 106 | hour_per_epoch, epoch + 1, step + 1, step_per_epoch, optimizer.param_groups[0]['lr'], loss, 107 | loss_mean)) 108 | writer.add_scalars('./train', 109 | {'loss_t': loss, 'loss_mean': loss_mean}, 110 | step_global) 111 | 112 | 113 | def display_loss_tb_val(batch_v, loss_v, loss1_v, loss2_v, args, writer, step_global): 114 | print('\n{:d} batches: L {:.4f}={:.4f}+{:d}*{:.4f}'.format(batch_v, loss_v, loss1_v, args.beta, loss2_v)) 115 | writer.add_scalars('./train-val', {'loss_v': loss_v, 'loss1_v': loss1_v, 'loss2_v': loss2_v}, step_global) 116 | --------------------------------------------------------------------------------
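
For reference, the step decay implemented by adjust_learning_rate in utils/misc.py follows lr = lr_base * gamma ** floor(epoch / epoch_lr_decay). A minimal sketch that reproduces the schedule from the function's docstring; the helper name scheduled_lr is illustrative, the defaults below are the function's own defaults, and main.py / main_fold.py pass --lr_base, --lr_decay_rate and --epoch_lr_decay instead:

```
import math

def scheduled_lr(epoch, lr_base=1e-4, gamma=0.316, epoch_lr_decay=25):
    # same rule as utils.misc.adjust_learning_rate, but returning the value
    # instead of writing it into optimizer.param_groups
    return lr_base * gamma ** int(math.floor(epoch / epoch_lr_decay))

for epoch in (0, 25, 50, 75, 100, 125):
    print(epoch, scheduled_lr(epoch))  # 1e-04, ~3.2e-05, ~1e-05, ~3.2e-06, ~1e-06, ~3.2e-07
```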