├── Bert-classification-new
│   ├── classify.py
│   ├── get-res.py
│   ├── process.py
│   └── test.py
├── Bert.py
├── README.md
├── TextCNN_keras.py
├── process_imdb.py
├── test_bert.py
├── textCNN.py
└── textCNN_parallel.py

/Bert-classification-new/classify.py:
--------------------------------------------------------------------------------
1 | import process
2 | from torch.utils.data import TensorDataset,DataLoader
3 | import torch
4 | import torch.nn as nn
5 | from torch import optim
6 | import logging
7 | import numpy as np
8 | import pickle
9 | import os
10 | import torch.nn.functional as F
11 | # import matplotlib.pyplot as plt
12 | from transformers import BertForSequenceClassification, AdamW,BertTokenizer,BertModel
13 | from transformers import get_linear_schedule_with_warmup
14 | import argparse
15 | 
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument('--maxlen',type=int,default=512,help='sentence length in train files')
18 | parser.add_argument('--batch-size',type=int,default=16)
19 | parser.add_argument('--lr',type=float,default=3e-5)
20 | parser.add_argument('--epoch',type=int,default=4)
21 | parser.add_argument('--train-dir',default='train_ids.pkl',help="cache the encoded training data in pkl format so it can be reloaded next time")
22 | parser.add_argument('--path',default='bert_base_multilingual_cased',help="pretrained model path")
23 | parser.add_argument('--params-dir',default='mbert_base_bs16_beta.pkl',help="save trained model params to this file")
24 | parser.add_argument('--num-labels',type=int,default=2,help="number of labels for classification")
25 | 
26 | parser.add_argument('--is-select',default=False,help='run sentence selection instead of accuracy testing')
27 | parser.add_argument('--treshold',type=float,default=0.7,help="threshold on the positive-class probability for selecting sentences (binary classification)")
28 | parser.add_argument('--file',default=None,help='candidate file to select from')
29 | args = parser.parse_args()
30 | 
31 | MAXLEN=args.maxlen - 2
32 | batchsize = args.batch_size
33 | 
34 | path=args.path
35 | tokenizer = BertTokenizer.from_pretrained(path)
36 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
37 | 
38 | def convert_text_to_ids(tokenizer,sentence,limit_size=MAXLEN):
39 | 
40 |     t=tokenizer.tokenize(sentence)[:limit_size]
41 |     # print(len(sentence),len(t))
42 |     encoded_ids= tokenizer.encode(t)
43 |     if len(encoded_ids) < limit_size+2:  # pad shorter sequences with 0 up to limit_size+2 ([CLS]/[SEP] included)
44 |         tmp=[0]*(limit_size+2-len(encoded_ids))
45 |         encoded_ids.extend(tmp)
46 |     return encoded_ids
...
57 | def attention_masks(input_ids):
58 |     atten_masks=[]
59 |     for seq in input_ids:
60 |         seq_mask=[float(i>0) for i in seq]
61 |         atten_masks.append(seq_mask)
62 |     return atten_masks
63 | 
64 | def predict(logits):
65 |     res=torch.argmax(logits,dim=1) # predicted class index per row
66 |     probs=F.softmax(logits,dim=1) #binary classification prob
67 |     return res,probs
68 | 
69 | def train_model(net, epoch=args.epoch,lr=args.lr,train_pkl=args.train_dir):
70 |     # ------------------------------
71 |     if os.path.exists(train_pkl):
72 |         with open(train_pkl,'rb') as fr:
73 |             input_ids=pickle.load(fr)
74 |     else:
75 |         input_ids= [convert_text_to_ids(tokenizer,sen) for sen in process.train_samples]
76 |         with open(train_pkl,'wb') as fw:
77 |             pickle.dump(input_ids,fw)
78 |     # input_ids= [convert_text_to_ids(tokenizer,sen) for sen in process.train_samples]
79 |     input_labels = process.train_labels
80 |     atten_token_train=attention_masks(input_ids)
81 |     train_set = TensorDataset(torch.LongTensor(input_ids),torch.LongTensor(atten_token_train),torch.LongTensor(input_labels))
82 |     train_loader = DataLoader(dataset=train_set,
83 |                               batch_size=batchsize,
84 |                               shuffle=True,
85 |                               num_workers=4
86 |                               )
87 | 
88 |     for i, (train,mask, label) in enumerate(train_loader):
89 |         print(train.shape,mask.shape, label.shape) ##torch.Size([8, 512])
torch.Size([8,512]) torch.Size([8, 1])
90 |         break
91 |     # --------------------------------
92 |     avg_loss = []
93 |     net.train() # set the model to training mode
94 |     net.to(device)
95 |     optimizer = AdamW(net.parameters(), lr)
96 | 
97 |     accumulation_steps = 8
98 |     for e in range(args.epoch):
99 |         for batch_idx, (data, mask, target) in enumerate(train_loader):
100 |             # optimizer.zero_grad()
101 |             data, mask, target = data.to(device), mask.to(device), target.to(device)
102 |             output = net(input_ids=data, token_type_ids=None, attention_mask=mask, labels=target)
103 |             # when labels are passed, the output is (loss, logits)
104 |             loss,logits=output[0],output[1]
105 |             loss = loss / accumulation_steps # gradient accumulation
106 |             avg_loss.append(loss.item())
107 |             loss.backward()
108 | 
109 |             if ((batch_idx + 1) % accumulation_steps) == 0:
110 |                 optimizer.step()
111 |                 optimizer.zero_grad()
112 | 
113 |             if batch_idx % 10 == 0:
114 |                 logging.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss:{:.6f}'.format(
115 |                     e + 1, batch_idx, len(train_loader), 100. *
116 |                     batch_idx / len(train_loader), np.array(avg_loss).mean()
117 |                 ))
118 | 
119 |     print('Finished Training')
120 |     return net
121 | 
122 | 
123 | def test_model(net,file=None,is_select=False,output=None):
124 |     batchsize=64
125 |     #--------------------------------
126 |     if is_select: # select data from a candidate file
127 |         assert file is not None
128 |         test_samples,test_labels=process.file_list(file,-1)
129 |     else: # test accuracy
130 |         test_samples,test_labels=process.test_samples,process.test_labels
131 |     print(len(test_samples),len(test_labels))
132 |     input_ids2= [convert_text_to_ids(tokenizer,sen) for sen in test_samples]
133 |     input_labels2 = torch.unsqueeze(torch.tensor(test_labels),dim=1)
134 | 
135 |     atten_tokens_eval=attention_masks(input_ids2)
136 |     test_set = TensorDataset(torch.LongTensor(input_ids2),torch.LongTensor(atten_tokens_eval), torch.LongTensor(input_labels2))
137 |     test_loader = DataLoader(dataset=test_set,
138 |                              batch_size=batchsize,
139 |                              num_workers=4)
140 |     for i, (train,mask, label) in enumerate(test_loader):
141 |         print(train.shape,mask.shape, label.shape)
142 |         break
143 |     #--------------------------------
144 |     net.eval()
145 |     net=net.to(device)
146 |     correct=0
147 |     total=0
148 |     with torch.no_grad():
149 |         if is_select:
150 |             if not output:
151 |                 output=file+".res"
152 |             fw=open(output,'w')
153 |         count = 0
154 |         for batch_idx, (data,mask,label) in enumerate(test_loader):
155 |             # logging.info("test batch_id=" + str(batch_idx))
156 |             data, mask ,label =data.to(device), mask.to(device), label.to(device)
157 |             output = net(input_ids=data, token_type_ids=None, attention_mask=mask) # no labels at inference time
158 |             # print(output[0].size(),label.shape)
159 |             total += label.size(0) # running sample count
160 |             res , probs = predict(output[0])
161 |             probs_pos=probs[:,1]
162 |             if is_select:
163 |                 for i,value in enumerate(probs_pos):
164 |                     if value>args.treshold:
165 |                         fw.write(str(count+i)+"\t"+str(value.item())+"\t"+test_samples[count+i])
166 |                         fw.write('\n')
167 |                 count+=len(probs_pos)
168 |             if not is_select:
169 |                 correct += ( res == label.flatten()).sum().item()
170 |     if is_select:
171 |         fw.close()
172 |         print('Finished Writing')
173 |     else:
174 |         print(f'correct: {correct} total: {total} accuracy: {100.*correct/total:.3f}%')
175 | 
176 | 
177 | if __name__=="__main__":
178 |     logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
179 | 
180 |     pre_net=BertForSequenceClassification.from_pretrained(args.path, num_labels=args.num_labels)
181 |     model=train_model(pre_net)
182 |     torch.save(model.state_dict(), args.params_dir)
183 | 
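
A minimal usage sketch (not part of the repository) of how the weights saved above might be reloaded to score a single candidate sentence; the model path and params file below are the argparse defaults from classify.py, and the tokenizer call assumes a reasonably recent transformers version:

import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert_base_multilingual_cased')            # --path default
net = BertForSequenceClassification.from_pretrained('bert_base_multilingual_cased', num_labels=2)
net.load_state_dict(torch.load('mbert_base_bs16_beta.pkl', map_location='cpu'))      # --params-dir default
net.eval()

enc = tokenizer('an example candidate sentence', return_tensors='pt',
                truncation=True, padding='max_length', max_length=512)
with torch.no_grad():
    logits = net(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])[0]
prob_pos = F.softmax(logits, dim=1)[0, 1].item()   # probability of the positive (in-domain) class
keep = prob_pos > 0.7                              # same cut-off as the --treshold default
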
-------------------------------------------------------------------------------- /Bert-classification-new/get-res.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | bi_candi=sys.argv[1] 4 | num=sys.argv[2] 5 | bi_res=bi_candi+'.res' 6 | with open(bi_candi,encoding='utf-8')as fr1,open(num,encoding='utf-8') as fr2,open(bi_res,'w',encoding='utf-8')as fw: 7 | nums=fr2.readlines() 8 | nums=[ str(num).strip() for num in nums] 9 | lines=fr1.readlines() 10 | for i,line in enumerate(lines): 11 | if str(i) in nums: 12 | fw.write(line) -------------------------------------------------------------------------------- /Bert-classification-new/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | def file_list(dir_name,label): 5 | texts=[];labels=[] 6 | with open(dir_name) as fr: 7 | for line in fr.readlines(): 8 | if len(line)<=1: #konghang 9 | continue 10 | else: 11 | texts.append(str(line).strip()) 12 | labels.append(label) 13 | return texts,labels 14 | 15 | 16 | traindir='.' 17 | devdir='.' 18 | 19 | neg_texts,neg_labels=file_list(os.path.join(traindir,'outdomain'),0) 20 | pos_texts,pos_labels=file_list(os.path.join(traindir,'indomain'),1) 21 | train_texts,train_labels=[],[] 22 | train_texts.extend(pos_texts); train_texts.extend(neg_texts) 23 | train_labels.extend(pos_labels); train_labels.extend(neg_labels) 24 | 25 | # dev_texts0,dev_labels0=file_list(os.path.join(devdir,'test_neg.ca'),0) 26 | dev_texts1,dev_labels1=file_list(os.path.join(devdir,'test_pos.ca'),1) 27 | dev_texts,dev_labels=[],[] 28 | # dev_texts.extend(dev_texts0); 29 | dev_texts.extend(dev_texts1) 30 | # dev_labels.extend(dev_labels0); 31 | dev_labels.extend(dev_labels1) 32 | 33 | random.seed(1) 34 | idx=[i for i in range(len(train_texts))] 35 | random.shuffle(idx) 36 | 37 | x=[] 38 | y=[] 39 | 40 | for id in idx: 41 | x.append(train_texts[id]) 42 | y.append(train_labels[id]) 43 | train_samples = x 44 | train_labels = y 45 | # print(train_samples[-3:],train_labels[-3:]) 46 | 47 | ########## test acc or apply ######### 48 | """test""" 49 | test_samples = dev_texts 50 | test_labels = dev_labels 51 | 52 | # print(len(train_samples),len(train_labels)) -------------------------------------------------------------------------------- /Bert-classification-new/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import classify 3 | import os 4 | from transformers import BertForSequenceClassification, AdamW,BertTokenizer,BertModel 5 | import argparse 6 | # 测试 7 | if __name__=="__main__": 8 | args = classify.args 9 | model=BertForSequenceClassification.from_pretrained(args.path) 10 | model.load_state_dict(torch.load(args.params_dir)) 11 | if args.is_select: 12 | classify.test_model(model,file=args.file,is_select=args.is_select) 13 | else: 14 | classify.test_model(model) -------------------------------------------------------------------------------- /Bert.py: -------------------------------------------------------------------------------- 1 | import process_imdb 2 | from torch.utils.data import TensorDataset, DataLoader 3 | import torch 4 | import torch.nn as nn 5 | from torch import optim 6 | import logging 7 | import numpy as np 8 | 9 | # 使用BERT使其向量化 10 | 11 | MAXLEN = 512 - 2 12 | BATCHSIZE = 8 13 | 14 | from transformers import BertForSequenceClassification, AdamW, BertTokenizer, BertModel 15 | from transformers import 
get_linear_schedule_with_warmup 16 | 17 | path = '/data/yanghan/Bert_related/bert_base_uncased/' 18 | config_dir = path 19 | tokenizer = BertTokenizer.from_pretrained(path) 20 | 21 | 22 | def convert_text_to_ids(tokenizer, sentence, limit_size=MAXLEN): 23 | t = tokenizer.tokenize(sentence)[:limit_size] 24 | encoded_ids = tokenizer.encode(t) 25 | if len(encoded_ids) < limit_size + 2: 26 | tmp = [0] * (limit_size + 2 - len(encoded_ids)) 27 | encoded_ids.extend(tmp) 28 | return encoded_ids 29 | 30 | 31 | '''构建数据集和迭代器''' 32 | 33 | input_ids = [convert_text_to_ids(tokenizer, sen) for sen in process_imdb.train_samples] 34 | # input_labels = process_imdb.get_onehot_labels(process_imdb.train_labels) 35 | input_labels = torch.unsqueeze(torch.tensor(process_imdb.train_labels), dim=1) 36 | 37 | 38 | def get_att_masks(input_ids): 39 | atten_masks = [] 40 | for seq in input_ids: 41 | seq_mask = [float(i > 0) for i in seq] 42 | atten_masks.append(seq_mask) 43 | return atten_masks 44 | 45 | 46 | atten_token_train = get_att_masks(input_ids) 47 | 48 | '''构建数据集和数据迭代器,设定 batch_size 大小为''' 49 | 50 | train_set = TensorDataset(torch.LongTensor(input_ids), torch.LongTensor(atten_token_train), 51 | torch.LongTensor(input_labels)) 52 | train_loader = DataLoader(dataset=train_set, 53 | batch_size=BATCHSIZE, 54 | shuffle=True 55 | ) 56 | 57 | for i, (train, mask, label) in enumerate(train_loader): 58 | print(train.shape, mask.shape, label.shape) ##torch.Size([8,512]) torch.Size([8,512]) torch.Size([8, 1]) 59 | break 60 | 61 | input_ids2 = [convert_text_to_ids(tokenizer, sen) for sen in process_imdb.test_samples] 62 | input_labels2 = torch.unsqueeze(torch.tensor(process_imdb.test_labels), dim=1) 63 | atten_tokens_eval = get_att_masks(input_ids2) 64 | test_set = TensorDataset(torch.LongTensor(input_ids2), torch.LongTensor(atten_tokens_eval), 65 | torch.LongTensor(input_labels2)) 66 | test_loader = DataLoader(dataset=test_set, 67 | batch_size=BATCHSIZE, ) 68 | 69 | for i, (train, mask, label) in enumerate(test_loader): 70 | print(train.shape, mask.shape, label.shape) # 71 | break 72 | 73 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 74 | 75 | '''预测函数,用于预测结果''' 76 | 77 | def predict(logits): 78 | res = torch.argmax(logits, dim=1) # 按行取每行最大的列下标 79 | return res 80 | 81 | 82 | '''训练''' 83 | 84 | def train_model(net, epoch=4): 85 | avg_loss = [] 86 | net.train() # 将模型设置为训练模式 87 | net.to(device) 88 | 89 | optimizer = AdamW(net.parameters(), lr=5e-5) 90 | 91 | accumulation_steps = 8 92 | for e in range(epoch): 93 | for batch_idx, (data, mask, target) in enumerate(train_loader): 94 | # optimizer.zero_grad() 95 | data, mask, target = data.to(device), mask.to(device), target.to(device) 96 | output = net(data, token_type_ids=None, attention_mask=mask, labels=target) 97 | # logit是正负概率 98 | loss,logits=output[0],output[1] 99 | loss = loss / accumulation_steps # 梯度积累 100 | avg_loss.append(loss.item()) 101 | loss.backward() 102 | 103 | if ((batch_idx + 1) % accumulation_steps) == 0: 104 | # 每 8 次更新一下网络中的参数 105 | optimizer.step() 106 | optimizer.zero_grad() 107 | 108 | if batch_idx % 5 == 0: 109 | logging.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss:{:.6f}'.format( 110 | e + 1, batch_idx, len(train_loader), 100. 
*
111 |                     batch_idx / len(train_loader), np.array(avg_loss).mean()
112 |                 ))
113 | 
114 |     print('Finished Training')
115 |     return net
116 | 
117 | 
118 | def test_model(net):
119 |     net.eval()
120 |     net = net.to(device)
121 |     correct = 0
122 |     total = 0
123 |     with torch.no_grad():
124 |         for batch_idx, (data, mask, label) in enumerate(test_loader):
125 |             logging.info("test batch_id=" + str(batch_idx))
126 | 
127 |             data, mask, label = data.to(device), mask.to(device), label.to(device)
128 |             output = net(data, token_type_ids=None, attention_mask=mask) # no labels are passed at inference time
129 |             # output is a tuple; its first element holds the per-class logits for each sample in the batch
130 |             # print(output[0],label)
131 |             print(predict(output[0]), label.flatten())
132 |             total += label.size(0) # accumulate the sample count batch by batch
133 |             correct += (predict(output[0]) == label.flatten()).sum().item()
134 |     print(f"correctly classified: {correct}, total: {total}, accuracy: {100.*correct/total:.3f}%")
135 | 
136 | 
137 | if __name__ == "__main__":
138 |     logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
139 | 
140 |     pre_net = BertForSequenceClassification.from_pretrained(path)
141 |     params_dir = 'model/bert_base_model_beta.pkl'
142 | 
143 |     model = train_model(pre_net, epoch=4)
144 |     torch.save(model.state_dict(), params_dir)  # save the model parameters
145 | 
146 | 
147 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # text_classfication
2 | NLP text classification on the IMDB dataset
3 | 
4 | ## Dataset
5 | 50,000 movie reviews in total, half positive and half negative, split evenly into training and test sets.
6 | Download: http://ai.stanford.edu/~amaas/data/sentiment/
7 | 
8 | ## Models
9 | Mostly a record of my own NLP study: text classification written last year with fairly classic models, somewhat rough and without deliberate hyperparameter tuning:
10 | 
11 | ### TextCNN (PyTorch and Keras implementations)
12 | 
13 | Files:
14 | 
15 | - TextCNN_keras.py: CNN text classification written with Keras in TF2; accuracy is not very high. 10% of the training set is split off as a validation set.
16 | - TextCNN.py: the PyTorch version, using GloVe word vectors; comment out the training part when testing.
17 | - TextCNN_parallel.py: the previous file reworked with distributed parallel training, which improves training efficiency roughly 5x; see the blog post: https://www.cnblogs.com/yh-blog/p/12877922.html
18 | 
19 | ### BERT (PyTorch implementation)
20 | 
21 | Files:
22 | 
23 | - process_imdb.py: data processing; converts review texts and labels into lists
24 | - Bert.py: the main training script
25 | - test_bert.py: evaluates model accuracy
26 | 
27 | Built on Hugging Face's open-source transformers library: https://github.com/huggingface/transformers.
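
The encoding step in Bert.py boils down to roughly the following sketch (the model path is a placeholder; MAXLEN leaves room for the [CLS] and [SEP] tokens added by the tokenizer):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # placeholder path

MAXLEN = 512 - 2  # reserve two positions for [CLS] and [SEP]

def convert_text_to_ids(sentence):
    tokens = tokenizer.tokenize(sentence)[:MAXLEN]
    ids = tokenizer.encode(tokens)            # adds [CLS] and [SEP]
    ids += [0] * (MAXLEN + 2 - len(ids))      # pad with 0 up to a fixed length of 512
    return ids
```
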
28 | 
29 | Official fine-tuning parameter recommendations: https://github.com/google-research/bert
30 | 
31 | - batch sizes: 8, 16, 32, 64, 128
32 | - learning rates (Adam): 3e-4, 1e-4, 5e-5, 3e-5
33 | - number of epochs: 3, 4
34 | 
35 | ## Results
36 | 
37 | | model | acc |
38 | | :----: | :----: |
39 | | TextCNN (keras) | 87.27% |
40 | | Bert | 95.18% |
41 | 
42 | 
43 | 
44 | ## Bert-classification-new
45 | 
46 | Pretrained model: bert_base_multilingual_cased
47 | 
48 | A refactor of the mBERT classification code. You can train a classifier on your own data and, based on its predicted probabilities, select sentences above a threshold; note that this is binary classification. Tested to run normally with a candidate pool of about 400k sentences.
49 | 
50 | - process.py: data processing; converts texts and labels into lists. The training files and the accuracy-test files are specified in the code, so edit them yourself; outdomain holds the negative training samples and indomain the positive ones.
51 | 
52 | - classify.py: the main training script. With the default parameters, train with:
53 | 
54 | `CUDA_VISIBLE_DEVICES=1 python classify.py`
55 | 
56 | You can also set the parameters yourself.
57 | 
58 | - test.py: measures model accuracy or uses the model to filter sentences.
59 | 
60 | Measure accuracy:
61 | `CUDA_VISIBLE_DEVICES=1 python test.py`
62 | 
63 | Filter sentences:
64 | `CUDA_VISIBLE_DEVICES=1 python test.py --is-select True --file candidate`
65 | 
--------------------------------------------------------------------------------
/TextCNN_keras.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | import numpy as np
3 | import tensorflow as tf
4 | # import tensorflow_datasets as tfds
5 | from tensorflow import keras
6 | from tensorflow.keras import layers
7 | import matplotlib.pyplot as plt
8 | # dataset,info=tfds.load("imdb_reviews",with_info=True,as_supervised=True)
9 | 
10 | (train_data,train_label),(test_data,test_label)=keras.datasets.imdb.load_data(num_words=10000)
11 | maxl=400
12 | # print(len(train_data))
13 | # print(train_data[0])
14 | word_index=keras.datasets.imdb.get_word_index()
15 | word2id={k:(v+3) for k,v in word_index.items()}
16 | word2id['<PAD>'] = 0
17 | word2id['<START>'] = 1
18 | word2id['<UNK>'] = 2
19 | word2id['<UNUSED>'] = 3
20 | id2word={v:k for k,v in word_index.items()}
21 | # reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
22 | # # We decode the review; note that our indices were offset by 3
23 | # # because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
24 | # decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
25 | # print(decoded_review)
26 | def get_words(sent_ids):
27 |     return ' '.join([id2word.get(i-3,"?") for i in sent_ids]) # return ? for ids that are not found
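# Illustrative note: keras.datasets.imdb.load_data() reserves ids 0-3 (padding, start-of-sequence,
# unknown, unused), which is why word2id shifts every raw index by +3 and get_words() subtracts 3
# again when decoding; e.g. print(get_words(train_data[0])) should print a readable review with "?"
# standing in for out-of-vocabulary words.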
28 | 
29 | sent = get_words(train_data[0])
30 | # print(sent)
31 | # pad at the end of each sentence
32 | train_data=keras.preprocessing.sequence.pad_sequences(train_data,value=word2id['<PAD>'],padding='post',maxlen=maxl)
33 | test_data=keras.preprocessing.sequence.pad_sequences(test_data,value=word2id['<PAD>'],padding='post',maxlen=maxl)
34 | # print('len: ',len(train_data[0]),len(test_data[1]))
35 | 
36 | # build the model
37 | 
38 | vocab_size=10000
39 | # model=keras.Input(shape=())
40 | input=layers.Input(shape=(maxl, ))
41 | em=layers.Embedding(vocab_size+1,300,input_length=maxl)(input)
42 | cnn1=layers.Conv1D(256,kernel_size=3,padding='same',strides=1,activation='relu',activity_regularizer='l2')(em)
43 | # cnn1 = layers.MaxPooling1D(2,strides=2)(cnn1)
44 | # cnn1 = layers.MaxPooling1D(2)(cnn1)
45 | # drop1 = layers.Dropout(0.25)(cnn1)
46 | cnn2 = layers.Conv1D(filters=256, kernel_size=4, padding='same', strides=1, activation='relu',activity_regularizer='l2')(em)
47 | # cnn2 = layers.MaxPooling1D(2,strides=2)(cnn2)
48 | # cnn2 = layers.MaxPooling1D(2)(cnn2)
49 | # drop2 = layers.Dropout(0.25)(cnn2)
50 | cnn3 = layers.Conv1D(256, kernel_size=5, padding='same', strides=1, activation='relu',activity_regularizer='l2')(em)
51 | # cnn3 = layers.MaxPooling1D(2)(cnn3)
52 | # drop3 = layers.Dropout(0.25)(cnn3)
53 | # concat=layers.concatenate([drop1, drop2, drop3], axis=-1)
54 | concat=layers.concatenate([cnn1,cnn2 ,cnn3 ], axis=-1)
55 | maxpool=layers.GlobalMaxPooling1D()(concat)
56 | flat = layers.Flatten()(maxpool)
57 | dense=layers.Dropout(0.5)(flat)
58 | dense=layers.Dense(64,activation='relu')(dense)
59 | output=layers.Dense(2,activation='softmax')(dense)
60 | model=tf.keras.models.Model(input,output)
61 | 
62 | one_hot_labels = keras.utils.to_categorical(train_label, num_classes=2) # convert to one-hot encoding
63 | adamOpti=tf.keras.optimizers.Adam(0.001)
64 | model.compile(loss='binary_crossentropy', optimizer=adamOpti, metrics=['accuracy'])
65 | 
66 | #callbacks_list=[#tf.keras.callbacks.EarlyStopping(monitor='accuracy',patience=1),
67 | #tf.keras.callbacks.ModelCheckpoint(filepath="bestmodel.h5",monitor='val_loss',save_best_only=True),
68 | # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=0, mode='auto',
69 | #                                      epsilon=0.0001, cooldown=0, min_lr=0)
70 | # ]
71 | 
72 | his=model.fit(train_data, one_hot_labels,batch_size=32, epochs=4,validation_split=0.1)
73 | model.save("model.h5")
74 | 
75 | test_one_hot_labels=keras.utils.to_categorical(test_label, num_classes=2)
76 | # model=tf.keras.models.load_model('model.h5')
77 | plt.plot(his.history['accuracy'])
78 | plt.plot(his.history['loss'])
79 | plt.legend(['accuracy', 'loss'], loc='upper left')
80 | plt.show()
81 | test_loss, test_acc = model.evaluate(test_data, test_one_hot_labels)
82 | print('\n',test_loss,test_acc)
83 | 
--------------------------------------------------------------------------------
/process_imdb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | vocab=[] # vocabulary
4 | """
5 | Process the raw files into a list of review texts and a list of labels
6 | """
7 | # draw 25000 samples in total from the pos and neg examples
8 | imdb_dir = './aclImdb'
9 | train_dir=os.path.join(imdb_dir,'train')
10 | test_dir=os.path.join(imdb_dir,'test')
11 | 
12 | def file_list(f_dir):
13 |     labels=[];texts=[]
14 |     for label_type in ['neg','pos']:
15 |         dir_name=os.path.join(f_dir,label_type)
16 |         for fname in os.listdir(dir_name):
17 |             if fname[-4:] =='.txt':
18 |                 fo=open(os.path.join(dir_name,fname))
19 |                 texts.append(fo.read())
20 |                 fo.close()
21 |                 if label_type=='pos':
22 | 
labels.append(1) 23 | else: 24 | labels.append(0) 25 | return texts,labels 26 | 27 | train_texts,train_labels=file_list(train_dir) 28 | test_texts,test_labels=file_list(test_dir) 29 | print(train_labels[:3],test_labels[-3:]) 30 | # 由于之前我们处理数据的时候得到的数据集前12500个是neg样本后12500个是pos样本,因此我们需要将其随机打乱: 31 | random.seed(1) 32 | idx=[i for i in range(len(train_texts))] 33 | # print(idx[-3:]) 34 | random.shuffle(idx) 35 | # print(len(idx),len(texts),len(labels)) 36 | 37 | x=[] #打乱后的文本列表 38 | y=[] #打乱后对应的标签列表 39 | #x,y对应评论和标签的列表,已打乱 40 | for id in idx: 41 | x.append(train_texts[id]) 42 | y.append(train_labels[id]) 43 | # x=texts 44 | # y=labels 45 | print(x[-1:],y[-1:]) 46 | 47 | TRAINSET_SIZE = 25000 48 | TESTSET_SIZE = 25000 49 | 50 | train_samples = x[:TRAINSET_SIZE] 51 | train_labels = y[:TRAINSET_SIZE] 52 | 53 | test_samples = test_texts[:TESTSET_SIZE] #测试集不用打乱 54 | test_labels = test_labels[:TESTSET_SIZE] 55 | # print(eval_labels) 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /test_bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import Bert 3 | import os 4 | from transformers import BertForSequenceClassification, AdamW,BertTokenizer,BertModel 5 | # 测试 6 | if __name__=="__main__": 7 | params_dir='model/bert_base_model_test.pkl' 8 | 9 | path='/data/yanghan/Bert_related/bert_base_uncased/' 10 | model=BertForSequenceClassification.from_pretrained(path) 11 | model.load_state_dict(torch.load(params_dir)) 12 | Bert.test_model(model) 13 | -------------------------------------------------------------------------------- /textCNN.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.utils.data import Dataset, DataLoader 5 | import logging 6 | import torch.optim as optim 7 | import os 8 | import jieba 9 | import gensim 10 | from gensim.test.utils import datapath,get_tmpfile 11 | from gensim.scripts.glove2word2vec import glove2word2vec 12 | import argparse 13 | import re 14 | import time 15 | class textCNN(nn.Module): 16 | # 多通道textcnn 17 | def __init__(self, args,vectors=None): 18 | super(textCNN,self).__init__() 19 | self.args=args 20 | 21 | self.label_num=args.label_num #标签个数 22 | self.filter_num=args.filter_num #卷积核个数 23 | self.filter_sizes=[int(fsz) for fsz in args.filter_sizes] 24 | self.vocab_size=args.vocab_size 25 | self.embedding_dim=args.embedding_dim 26 | 27 | self.embedding=nn.Embedding(self.vocab_size,self.embedding_dim) 28 | # # requires_grad指定是否在训练过程中对词向量的权重进行微调 29 | # self.embedding.weight.requires_grad = True 30 | if args.static: # 如果使用预训练词向量,则提前加载,当不需要微调时设置freeze为True 31 | self.embedding = self.embedding.from_pretrained(vectors, freeze=not args.fine_tune) 32 | channel_in=1 33 | # nn.ModuleList相当于一个卷积的列表,相当于一个list 34 | # 卷积核宽度与embeding-dim相同,相当于一维卷积 35 | # nn.Conv1d()是一维卷积。in_channels:词向量的维度, out_channels:输出通道数 36 | # nn.MaxPool1d()是最大池化,此处对每一个向量取最大值,所有kernel_size为卷积操作之后的向量维度 37 | self.convs = nn.ModuleList([nn.Sequential( 38 | nn.Conv2d(channel_in, self.filter_num, (kernel, self.embedding_dim)), 39 | nn.ReLU(), 40 | # 经过卷积之后,得到一个维度为sentence_max_size - kernel + 1的一维向量 41 | nn.MaxPool2d((args.sentence_max_size - kernel + 1, 1)) 42 | ) 43 | for kernel in self.filter_sizes]) 44 | 45 | # self.convs = nn.ModuleList([ 46 | # nn.Conv2d(channel_in, self.filter_num, (fsz, self.embedding_dim)) 47 | # for fsz in self.filter_sizes]) 
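# Shape walk-through for one branch (sketch, assuming input of shape [batch, 1, sentence_max_size, embedding_dim]):
#   Conv2d(1, filter_num, (kernel, embedding_dim)) -> [batch, filter_num, sentence_max_size - kernel + 1, 1]
#   MaxPool2d((sentence_max_size - kernel + 1, 1)) -> [batch, filter_num, 1, 1]
# Concatenating the len(filter_sizes) branches and flattening therefore yields
# len(filter_sizes) * filter_num features, matching the in_features of self.fc below.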
48 | self.dropout = nn.Dropout(args.dropout) 49 | self.fc = nn.Linear(len(self.filter_sizes) * self.filter_num, self.label_num) 50 | 51 | def forward(self,x): 52 | # Conv2d的输入是个四维的tensor,每一位分别代表batch_size、channel、length、width 53 | in_size = x.size(0) # x.size(0),表示的是输入x的batch_size 54 | out = [conv(x) for conv in self.convs] 55 | out = t.cat(out, dim=1) 56 | out = out.view(in_size, -1) # 设经过max pooling之后,有output_num个数,将out变成(batch_size,output_num),-1表示自适应 57 | out = F.dropout(out) 58 | out = self.fc(out) # nn.Linear接收的参数类型是二维的tensor(batch_size,output_num),一批有多少数据,就有多少行 59 | return out 60 | 61 | 62 | class MyDataset(Dataset): 63 | """MyDataset的实现原理就是通过遍历file_list,得到每一个文件路径名,根据路径名,将其内容读到内存中, 64 | 通过generate_tensor()函数将文件内容转化为tensor,函数返回tensor与对应的label,其中index就是list的下标""" 65 | 66 | def __init__(self, file_list, label_list, sentence_max_size, embedding, word2id, stopwords=None): 67 | self.x = file_list 68 | self.y = label_list 69 | self.sentence_max_size = sentence_max_size 70 | self.embedding = embedding 71 | self.word2id = word2id 72 | self.stopwords = stopwords 73 | 74 | def __getitem__(self, index): 75 | # 读取文章内容 76 | words = [] 77 | with open(self.x[index], "r", encoding="utf8") as file: 78 | for line in file.readlines(): 79 | words.extend(segment(line.strip())) 80 | # 生成文章的词向量矩阵 81 | tensor = generate_tensor(words, self.sentence_max_size, self.embedding, self.word2id) 82 | return tensor, self.y[index] 83 | 84 | def __len__(self): 85 | return len(self.x) 86 | 87 | 88 | def get_file_list(source_dir): 89 | file_list = [] # 文件路径名列表 90 | # os.walk()遍历给定目录下的所有子目录,每个walk是三元组(root,dirs,files) 91 | # root 所指的是当前正在遍历的这个文件夹的本身的地址 92 | # dirs 是一个 list ,内容是该文件夹中所有的目录的名字(不包括子目录) 93 | # files 同样是 list , 内容是该文件夹中所有的文件(不包括子目录) 94 | # 遍历所有评论 95 | for root, dirs, files in os.walk(source_dir): 96 | file = [os.path.join(root, filename) for filename in files] 97 | # print(root,dir,file) #file和 98 | file_list.extend(file) 99 | # print('len of filelist:',len(file_list),file_list) 100 | return file_list 101 | 102 | 103 | def get_label_list(file_list): 104 | # print(len(file_list)) 105 | # 提取出标签名 106 | label_name_list = [file.split(r'/')[3] for file in file_list] 107 | # 标签名对应的数字 108 | label_list = [] 109 | for label_name in label_name_list: 110 | if label_name == "neg": 111 | label_list.append(0) 112 | elif label_name == "pos": 113 | label_list.append(1) 114 | return label_list 115 | 116 | def segment(content): 117 | # regex = re.compile(r'[^\u4e00-\u9fa5aA-Za-z0-9]') 118 | # text = regex.sub(' ', content) 119 | text = re.sub('[^\w ]', '', content) 120 | return [word for word in jieba.cut(text) if word.strip()] 121 | 122 | 123 | '''先将一篇评论进行分词,然后将每个词转换为对应的词向量。最终每篇评论,会变成[sentence_max_size,vec_dim]的矩阵''' 124 | def generate_tensor(sentence, sentence_max_size, embedding, word2id): 125 | """ 126 | 对一篇文章生成对应的词向量矩阵 127 | :param sentence:一篇文章的分词列表 128 | :param sentence_max_size:认为设定的一篇文章的最大分词数量 129 | :param embedding:词向量对象 130 | :param word2id:字典{word:id} 131 | :return:一篇文章的词向量矩阵 132 | """ 133 | tensor = t.zeros([sentence_max_size, embedding.embedding_dim]) 134 | for index in range(0, sentence_max_size): 135 | if index >= len(sentence): 136 | break 137 | else: 138 | word = sentence[index] 139 | if word in word2id: 140 | vector = embedding.weight[word2id[word]] 141 | tensor[index] = vector 142 | elif word.lower() in word2id: 143 | vector = embedding.weight[word2id[word.lower()]] 144 | tensor[index] = vector 145 | return tensor.unsqueeze(0) # tensor是二维的,必须扩充为三维,否则会报错 146 | 147 | def train_textcnn_model(net, train_loader, 
epoch, lr, args): 148 | print("begin training") 149 | if args.cuda: 150 | net.cuda() 151 | net.train() # 必备,将模型设置为训练模式 152 | optimizer = optim.Adam(net.parameters(), lr=lr) 153 | criterion = nn.CrossEntropyLoss() 154 | for i in range(epoch): # 多批次循环 155 | for batch_idx, (data, target) in enumerate(train_loader): 156 | # data=data.long(); print(data.dtype) 157 | if args.cuda: 158 | data,target=data.cuda(),target.cuda() 159 | optimizer.zero_grad() # 清除所有优化的梯度 160 | output = net(data) # 传入数据并前向传播获取输出 161 | loss = criterion(output, target) 162 | loss.backward() 163 | optimizer.step() 164 | 165 | # 打印状态信息 166 | logging.info("train epoch=" + str(i) + ",batch_id=" + str(batch_idx) + ",loss=" + str(loss.item() / 64)) 167 | print('Finished Training') 168 | 169 | def textcnn_model_test(net, test_loader,args): 170 | if args.cuda: 171 | net.cuda() 172 | net.eval() # 必备,将模型设置为训练模式 173 | correct = 0 174 | total = 0 175 | test_acc = 0.0 176 | with t.no_grad(): 177 | for i, (data, label) in enumerate(test_loader): 178 | logging.info("test batch_id=" + str(i)) 179 | # data=data.long() 180 | if args.cuda: 181 | data, label = data.cuda(), label.cuda() 182 | outputs = net(data) 183 | # torch.max()[0]表示最大值的值,troch.max()[1]表示回最大值的每个索引 184 | _, predicted = t.max(outputs.data, 1) # 每个output是一行n列的数据,取一行中最大的值 185 | total += label.size(0) 186 | correct += (predicted == label).sum().item() 187 | logging.info('Accuracy of the network on test set: %.3f %%' % (100 * correct / total)) 188 | # test_acc += accuracy_score(torch.argmax(outputs.data, dim=1), label) 189 | # logging.info("test_acc=" + str(test_acc)) 190 | 191 | def transfer(glove_dir,word2vec_dir): 192 | 193 | glove_input_file=datapath(glove_dir) 194 | word2vec_output_file=get_tmpfile(word2vec_dir) #创建临时文件 195 | (count, dimensions) = glove2word2vec(glove_input_file, word2vec_output_file) 196 | # print(count, '\n', dimensions) 197 | return count, dimensions 198 | 199 | def parse(): 200 | parser = argparse.ArgumentParser(description='TextCNN text classifier') 201 | parser.add_argument('-vocab-size', type=int, default=89527, help='评论词表大小') 202 | parser.add_argument('-lr', type=float, default=0.001, help='学习率') 203 | parser.add_argument('-batch-size', type=int, default=64) 204 | parser.add_argument('-epoch', type=int, default=10) 205 | parser.add_argument('-filter-num', type=int, default=100, help='卷积核的个数') 206 | parser.add_argument('-filter-sizes', type=str, default=[2,3,4], help='不同卷积核大小') 207 | parser.add_argument('-embedding-dim', type=int, default=300, help='词向量的维度') 208 | parser.add_argument('-dropout', type=float, default=0.5) 209 | parser.add_argument('-label-num', type=int, default=2, help='标签个数') 210 | parser.add_argument('-static', type=bool, default=True, help='是否使用预训练词向量') 211 | parser.add_argument('-fine-tune', type=bool, default=True, help='预训练词向量是否要微调') 212 | parser.add_argument('-sentence-max-size',type=int,default=300,help='评论的最大长度') 213 | parser.add_argument('-cuda', type=bool, default=True) 214 | # parser.add_argument('-log-interval', type=int, default=1, help='经过多少iteration记录一次训练状态') 215 | # parser.add_argument('-test-interval', type=int, default=100, help='经过多少iteration对验证集进行测试') 216 | # parser.add_argument('-early-stopping', type=int, default=1000, help='早停时迭代的次数') 217 | # parser.add_argument('-save-best', type=bool, default=True, help='当得到更好的准确度是否要保存') 218 | # parser.add_argument('-save-dir', type=str, default='model_dir', help='存储训练模型位置') 219 | args = parser.parse_args() 220 | return args 221 | 222 | if __name__ == "__main__": 223 | # 
t.cuda.synchronize() 224 | start_time=time.clock( ) 225 | logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO) 226 | train_dir = './aclImdb/train' # 训练集路径 227 | test_dir = "./aclImdb/test" # 测试集路径 228 | # net_dir = "./model/net.pkl" 229 | params_dir="./model/gpu_params_300.pkl" 230 | parse=parse() 231 | 232 | #最好写绝对路径 233 | glove_dir = '/data/yanghan/homework/glove.6B/glove.6B.'+str(parse.embedding_dim)+'d.txt' 234 | word2vec_dir = '/data/yanghan/homework/glove.6B/glove.6B.word2vec.'+str(parse.embedding_dim)+'d.txt' 235 | print('count, dimensions',transfer(glove_dir,word2vec_dir)) 236 | 237 | 238 | # 加载词向量模型 239 | logging.info("加载词向量模型") 240 | # 使用gensim载入word2vec词向量 241 | 242 | wvmodel = gensim.models.KeyedVectors.load_word2vec_format('./glove.6B/glove.6B.word2vec.'+str(parse.embedding_dim)+'d.txt', 243 | binary=False, encoding='utf-8') 244 | 245 | 246 | word2id = {} # word2id是一个字典,存储{word:id}的映射 247 | for i, word in enumerate(wvmodel.index2word): 248 | word2id[word] = i 249 | # 根据已经训练好的词向量模型,生成Embedding对象 250 | embedding = nn.Embedding.from_pretrained(t.FloatTensor(wvmodel.vectors)) 251 | # requires_grad指定是否在训练过程中对词向量的权重进行微调 252 | # embedding.weight.requires_grad = True 253 | 254 | 255 | # 获取训练数据 256 | logging.info("获取训练数据") 257 | train_filelist = get_file_list(train_dir) 258 | train_labellist = get_label_list(train_filelist) 259 | train_dataset = MyDataset(train_filelist, train_labellist, parse.sentence_max_size, embedding, word2id) 260 | train_dataloader = DataLoader(train_dataset, batch_size=parse.batch_size, shuffle=True) 261 | # (data, target) = next(iter(train_dataloader)) 262 | # print(data,target) 263 | # print(data.dtype) 264 | # data=data.long();print(data.dtype) 265 | 266 | # 获取测试数据 267 | logging.info("获取测试数据") 268 | test_set = get_file_list(test_dir) 269 | test_label = get_label_list(test_set) 270 | test_dataset = MyDataset(test_set, test_label, parse.sentence_max_size, embedding, word2id) 271 | test_dataloader = DataLoader(test_dataset, batch_size=parse.batch_size, shuffle=True) 272 | 273 | # 定义模型 274 | net = textCNN(parse,vectors=t.FloatTensor(wvmodel.vectors)) 275 | 276 | # 训练 277 | logging.info("开始训练模型") 278 | # t.cuda.synchronize() 279 | start_train = time.clock() 280 | train_textcnn_model(net, train_dataloader, parse.epoch, parse.lr, args=parse) 281 | t.save(net.state_dict(), params_dir) # 保存模型参数 282 | # t.cuda.synchronize() 283 | end_train = time.clock() 284 | logging.info('train time cost:%f s' % (end_train - start_train)) 285 | # 测试 286 | net = textCNN(parse, vectors=t.FloatTensor(wvmodel.vectors)) 287 | net.load_state_dict(t.load(params_dir)) 288 | textcnn_model_test(net,test_dataloader,args=parse) 289 | # t.cuda.synchronize() 290 | end_test = time.clock() 291 | logging.info('test time cost:%f s' % (end_test - end_train)) 292 | logging.info('overall time cost:%f s' % (end_test - start_time)) 293 | 294 | -------------------------------------------------------------------------------- /textCNN_parallel.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.utils.data import Dataset, DataLoader 5 | import logging 6 | import torch.optim as optim 7 | import os 8 | import jieba 9 | import gensim 10 | from gensim.test.utils import datapath,get_tmpfile 11 | from gensim.scripts.glove2word2vec import glove2word2vec 12 | import argparse 13 | import re 14 | import time 15 | import torch.distributed as dist 16 | 17 | 
class textCNN(nn.Module): 18 | # 多通道textcnn 19 | def __init__(self, args,vectors=None): 20 | super(textCNN,self).__init__() 21 | self.args=args 22 | 23 | self.label_num=args.label_num #标签个数 24 | self.filter_num=args.filter_num #卷积核个数 25 | self.filter_sizes=[int(fsz) for fsz in args.filter_sizes] 26 | self.vocab_size=args.vocab_size 27 | self.embedding_dim=args.embedding_dim 28 | 29 | self.embedding=nn.Embedding(self.vocab_size,self.embedding_dim) 30 | # # requires_grad指定是否在训练过程中对词向量的权重进行微调 31 | # self.embedding.weight.requires_grad = True 32 | if args.static: # 如果使用预训练词向量,则提前加载,当不需要微调时设置freeze为True 33 | self.embedding = self.embedding.from_pretrained(vectors, freeze=not args.fine_tune) 34 | channel_in=1 35 | # nn.ModuleList相当于一个卷积的列表,相当于一个list 36 | # 卷积核宽度与embeding-dim相同,相当于一维卷积 37 | # nn.Conv1d()是一维卷积。in_channels:词向量的维度, out_channels:输出通道数 38 | # nn.MaxPool1d()是最大池化,此处对每一个向量取最大值,所有kernel_size为卷积操作之后的向量维度 39 | self.convs = nn.ModuleList([nn.Sequential( 40 | nn.Conv2d(channel_in, self.filter_num, (kernel, self.embedding_dim)), 41 | nn.ReLU(), 42 | # 经过卷积之后,得到一个维度为sentence_max_size - kernel + 1的一维向量 43 | nn.MaxPool2d((args.sentence_max_size - kernel + 1, 1)) 44 | ) 45 | for kernel in self.filter_sizes]) 46 | 47 | # self.convs = nn.ModuleList([ 48 | # nn.Conv2d(channel_in, self.filter_num, (fsz, self.embedding_dim)) 49 | # for fsz in self.filter_sizes]) 50 | self.dropout = nn.Dropout(args.dropout) 51 | self.fc = nn.Linear(len(self.filter_sizes) * self.filter_num, self.label_num) 52 | 53 | def forward(self,x): 54 | # Conv2d的输入是个四维的tensor,每一位分别代表batch_size、channel、length、width 55 | in_size = x.size(0) # x.size(0),表示的是输入x的batch_size 56 | out = [conv(x) for conv in self.convs] 57 | out = t.cat(out, dim=1) 58 | out = out.view(in_size, -1) # 设经过max pooling之后,有output_num个数,将out变成(batch_size,output_num),-1表示自适应 59 | out = F.dropout(out) 60 | out = self.fc(out) # nn.Linear接收的参数类型是二维的tensor(batch_size,output_num),一批有多少数据,就有多少行 61 | return out 62 | 63 | 64 | class MyDataset(Dataset): 65 | """MyDataset的实现原理就是通过遍历file_list,得到每一个文件路径名,根据路径名,将其内容读到内存中, 66 | 通过generate_tensor()函数将文件内容转化为tensor,函数返回tensor与对应的label,其中index就是list的下标""" 67 | 68 | def __init__(self, file_list, label_list, sentence_max_size, embedding, word2id, stopwords=None): 69 | self.x = file_list 70 | self.y = label_list 71 | self.sentence_max_size = sentence_max_size 72 | self.embedding = embedding 73 | self.word2id = word2id 74 | self.stopwords = stopwords 75 | 76 | def __getitem__(self, index): 77 | # 读取文章内容 78 | words = [] 79 | with open(self.x[index], "r", encoding="utf8") as file: 80 | for line in file.readlines(): 81 | words.extend(segment(line.strip())) 82 | # 生成文章的词向量矩阵 83 | tensor = generate_tensor(words, self.sentence_max_size, self.embedding, self.word2id) 84 | return tensor, self.y[index] 85 | 86 | def __len__(self): 87 | return len(self.x) 88 | 89 | 90 | def get_file_list(source_dir): 91 | file_list = [] # 文件路径名列表 92 | # os.walk()遍历给定目录下的所有子目录,每个walk是三元组(root,dirs,files) 93 | # root 所指的是当前正在遍历的这个文件夹的本身的地址 94 | # dirs 是一个 list ,内容是该文件夹中所有的目录的名字(不包括子目录) 95 | # files 同样是 list , 内容是该文件夹中所有的文件(不包括子目录) 96 | # 遍历所有评论 97 | for root, dirs, files in os.walk(source_dir): 98 | file = [os.path.join(root, filename) for filename in files] 99 | # print(root,dir,file) #file和 100 | file_list.extend(file) 101 | # print('len of filelist:',len(file_list),file_list) 102 | return file_list 103 | 104 | 105 | def get_label_list(file_list): 106 | # print(len(file_list)) 107 | # 提取出标签名 108 | label_name_list = [file.split(r'/')[3] for file in file_list] 
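# Note: the hard-coded index 3 assumes paths of the form './aclImdb/train/neg/xxx.txt' as produced
# by get_file_list() with the train_dir/test_dir set below; a different directory depth would need a
# different index (or e.g. os.path.basename(os.path.dirname(file))).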
109 | # 标签名对应的数字 110 | label_list = [] 111 | for label_name in label_name_list: 112 | if label_name == "neg": 113 | label_list.append(0) 114 | elif label_name == "pos": 115 | label_list.append(1) 116 | return label_list 117 | 118 | def segment(content): 119 | # regex = re.compile(r'[^\u4e00-\u9fa5aA-Za-z0-9]') 120 | # text = regex.sub(' ', content) 121 | text = re.sub('[^\w ]', '', content) 122 | return [word for word in jieba.cut(text) if word.strip()] 123 | 124 | 125 | '''先将一篇评论进行分词,然后将每个词转换为对应的词向量。最终每篇评论,会变成[sentence_max_size,vec_dim]的矩阵''' 126 | def generate_tensor(sentence, sentence_max_size, embedding, word2id): 127 | """ 128 | 对一篇文章生成对应的词向量矩阵 129 | :param sentence:一篇文章的分词列表 130 | :param sentence_max_size:认为设定的一篇文章的最大分词数量 131 | :param embedding:词向量对象 132 | :param word2id:字典{word:id} 133 | :return:一篇文章的词向量矩阵 134 | """ 135 | tensor = t.zeros([sentence_max_size, embedding.embedding_dim]) 136 | for index in range(0, sentence_max_size): 137 | if index >= len(sentence): 138 | break 139 | else: 140 | word = sentence[index] 141 | if word in word2id: 142 | vector = embedding.weight[word2id[word]] 143 | tensor[index] = vector 144 | elif word.lower() in word2id: 145 | vector = embedding.weight[word2id[word.lower()]] 146 | tensor[index] = vector 147 | return tensor.unsqueeze(0) # tensor是二维的,必须扩充为三维,否则会报错 148 | 149 | def train_textcnn_model(net, train_loader, epoch, lr, args): 150 | print("begin training") 151 | # if args.cuda: 152 | # net.cuda() 153 | net.train() # 必备,将模型设置为训练模式 154 | optimizer = optim.Adam(net.parameters(), lr=lr) 155 | criterion = nn.CrossEntropyLoss() 156 | for i in range(epoch): # 多批次循环 157 | for batch_idx, (data, target) in enumerate(train_loader): 158 | # data=data.long(); print(data.dtype) 159 | if args.cuda: 160 | data,target=data.cuda(),target.cuda() 161 | optimizer.zero_grad() # 清除所有优化的梯度 162 | output = net(data) # 传入数据并前向传播获取输出 163 | loss = criterion(output, target) 164 | loss.backward() 165 | optimizer.step() 166 | 167 | # 打印状态信息 168 | logging.info("train epoch=" + str(i) + ",batch_id=" + str(batch_idx) + ",loss=" + str(loss.item() / 64)) 169 | print('Finished Training') 170 | 171 | def textcnn_model_test(net, test_loader,args): 172 | # if args.cuda: 173 | # net.cuda() 174 | net.eval() # 必备,将模型设置为训练模式 175 | correct = 0 176 | total = 0 177 | test_acc = 0.0 178 | with t.no_grad(): 179 | for i, (data, label) in enumerate(test_loader): 180 | logging.info("test batch_id=" + str(i)) 181 | # data=data.long() 182 | if args.cuda: 183 | data, label = data.cuda(), label.cuda() 184 | outputs = net(data) 185 | # torch.max()[0]表示最大值的值,troch.max()[1]表示回最大值的每个索引 186 | _, predicted = t.max(outputs.data, 1) # 每个output是一行n列的数据,取一行中最大的值 187 | total += label.size(0) 188 | correct += (predicted == label).sum().item() 189 | logging.info('Accuracy of the network on test set: %.3f %%' % (100 * correct / total)) 190 | # test_acc += accuracy_score(torch.argmax(outputs.data, dim=1), label) 191 | # logging.info("test_acc=" + str(test_acc)) 192 | 193 | def transfer(glove_dir,word2vec_dir): 194 | 195 | glove_input_file=datapath(glove_dir) 196 | word2vec_output_file=get_tmpfile(word2vec_dir) #创建临时文件 197 | (count, dimensions) = glove2word2vec(glove_input_file, word2vec_output_file) 198 | # print(count, '\n', dimensions) 199 | return count, dimensions 200 | 201 | def parse(): 202 | parser = argparse.ArgumentParser(description='TextCNN text classifier') 203 | parser.add_argument('--vocab-size', type=int, default=89527, help='评论词表大小') 204 | parser.add_argument('--lr', type=float, default=0.001, 
help='学习率') 205 | parser.add_argument('--batch-size', type=int, default=64) 206 | parser.add_argument('--epoch', type=int, default=10) 207 | parser.add_argument('--filter-num', type=int, default=100, help='卷积核的个数') 208 | parser.add_argument('--filter-sizes', type=str, default=[2,3,4], help='不同卷积核大小') 209 | parser.add_argument('--embedding-dim', type=int, default=300, help='词向量的维度') 210 | parser.add_argument('--dropout', type=float, default=0.5) 211 | parser.add_argument('--label-num', type=int, default=2, help='标签个数') 212 | parser.add_argument('--static', type=bool, default=True, help='是否使用预训练词向量') 213 | parser.add_argument('--fine-tune', type=bool, default=True, help='预训练词向量是否要微调') 214 | parser.add_argument('--sentence-max-size',type=int,default=300,help='评论的最大长度') 215 | parser.add_argument('--cuda', type=bool, default=True) 216 | parser.add_argument('--local_rank', default=-1, type=int, 217 | help='node rank for distributed training') 218 | parser.add_argument('--workers', default=2, type=int,help='load workers in distributed training') 219 | # parser.add_argument('-log-interval', type=int, default=1, help='经过多少iteration记录一次训练状态') 220 | # parser.add_argument('-test-interval', type=int, default=100, help='经过多少iteration对验证集进行测试') 221 | # parser.add_argument('-early-stopping', type=int, default=1000, help='早停时迭代的次数') 222 | # parser.add_argument('-save-best', type=bool, default=True, help='当得到更好的准确度是否要保存') 223 | # parser.add_argument('-save-dir', type=str, default='model_dir', help='存储训练模型位置') 224 | args = parser.parse_args() 225 | return args 226 | 227 | if __name__ == "__main__": 228 | # t.cuda.synchronize() 229 | start_time=time.clock( ) 230 | logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO) 231 | train_dir = './aclImdb/train' # 训练集路径 232 | test_dir = "./aclImdb/test" # 测试集路径 233 | # net_dir = "./model/net.pkl" 234 | params_dir="./model/distributed_2gpu_params_300_1000batch.pkl" 235 | args=parse() 236 | device_ids=[4,5] 237 | 238 | #最好写绝对路径 239 | glove_dir = '/data/yanghan/homework/glove.6B/glove.6B.'+str(args.embedding_dim)+'d.txt' 240 | word2vec_dir = '/data/yanghan/homework/glove.6B/glove.6B.word2vec.'+str(args.embedding_dim)+'d.txt' 241 | print('count, dimensions',transfer(glove_dir,word2vec_dir)) 242 | 243 | 244 | # 加载词向量模型 245 | logging.info("加载词向量模型") 246 | # 使用gensim载入word2vec词向量 247 | wvmodel = gensim.models.KeyedVectors.load_word2vec_format('./glove.6B/glove.6B.word2vec.'+str(args.embedding_dim)+'d.txt', 248 | binary=False, encoding='utf-8') 249 | 250 | 251 | word2id = {} # word2id是一个字典,存储{word:id}的映射 252 | for i, word in enumerate(wvmodel.index2word): 253 | word2id[word] = i 254 | # 根据已经训练好的词向量模型,生成Embedding对象 255 | embedding = nn.Embedding.from_pretrained(t.FloatTensor(wvmodel.vectors)) 256 | 257 | #初始化使用nccl后端 258 | dist.init_process_group(backend='nccl') 259 | # When using a single GPU per process and per 260 | # DistributedDataParallel, we need to divide the batch size 261 | # ourselves based on the total number of GPUs we have 262 | ngpus_per_node=len(device_ids) 263 | args.batch_size = int(args.batch_size / ngpus_per_node) 264 | 265 | # 获取训练数据 266 | logging.info("获取训练数据") 267 | train_filelist = get_file_list(train_dir) 268 | train_labellist = get_label_list(train_filelist) 269 | train_dataset = MyDataset(train_filelist, train_labellist, args.sentence_max_size, embedding, word2id) 270 | train_sampler = t.utils.data.distributed.DistributedSampler(train_dataset) 271 | # train_dataloader = DataLoader(train_dataset, 
batch_size=args.batch_size, shuffle=True)
272 |     train_dataloader = DataLoader(train_dataset,
273 |                                   pin_memory=True,
274 |                                   shuffle=(train_sampler is None),
275 |                                   batch_size=args.batch_size,
276 |                                   num_workers=args.workers,
277 |                                   sampler=train_sampler )
278 | 
279 | 
280 |     # load the test data
281 |     logging.info("loading test data")
282 |     test_set = get_file_list(test_dir)
283 |     test_label = get_label_list(test_set)
284 |     test_dataset = MyDataset(test_set, test_label, args.sentence_max_size, embedding, word2id)
285 |     # test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True)
286 |     test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size)
287 | 
288 | 
289 |     # define the model
290 |     net = textCNN(args,vectors=t.FloatTensor(wvmodel.vectors))
291 |     if args.cuda:
292 |         net.cuda()
293 |     # two ways to parallelize; DistributedDataParallel is used here
294 |     if len(device_ids)>1:
295 |         # net=t.nn.parallel.DataParallel(net)
296 |         net=t.nn.parallel.DistributedDataParallel(net,find_unused_parameters=True)
297 | 
298 | 
299 |     # training
300 |     logging.info("start training the model")
301 |     # t.cuda.synchronize()
302 |     start_train = time.clock()
303 |     train_textcnn_model(net, train_dataloader, args.epoch, args.lr, args=args)
304 |     t.save(net.state_dict(), params_dir)  # save the model parameters
305 |     # t.cuda.synchronize()
306 |     end_train = time.clock()
307 |     logging.info('train time cost:%f s' % (end_train - start_train))
308 |     # testing
309 |     # net = textCNN(args, vectors=t.FloatTensor(wvmodel.vectors))
310 |     # net.load_state_dict(t.load(params_dir))
311 |     textcnn_model_test(net,test_dataloader,args=args)
312 |     # t.cuda.synchronize()
313 |     end_test = time.clock()
314 |     logging.info('test time cost:%f s' % (end_test - end_train))
315 |     logging.info('overall time cost:%f s' % (end_test - start_time))
316 | 
317 | 
318 | 
319 | #CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
--------------------------------------------------------------------------------
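
One caveat when reusing the parameters saved by textCNN_parallel.py: t.save(net.state_dict(), params_dir) is called on the DistributedDataParallel wrapper, so every key in the saved state dict carries a 'module.' prefix. A minimal sketch (reusing the names defined in the script above) of loading those weights back into a plain single-process textCNN for testing:

import torch as t

state = t.load(params_dir, map_location='cpu')
# strip the 'module.' prefix added by DistributedDataParallel
state = {k[len('module.'):] if k.startswith('module.') else k: v for k, v in state.items()}

net = textCNN(args, vectors=t.FloatTensor(wvmodel.vectors))
net.load_state_dict(state)
net.eval()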