├── Config.py
├── README.md
├── datasets
│   ├── .DS_Store
│   └── THUCNews
│       ├── .DS_Store
│       ├── class.txt
│       ├── dev.txt
│       ├── test.txt
│       └── train.txt
├── main.py
├── module
│   ├── DataManager.py
│   ├── LossManager.py
│   ├── ModelMap.py
│   ├── Predictor.py
│   ├── Trainer.py
│   ├── loss
│   │   ├── .DS_Store
│   │   ├── dice_loss.py
│   │   ├── focal_loss.py
│   │   ├── infonce_loss.py
│   │   ├── kl_loss.py
│   │   └── label_smoothing.py
│   ├── models
│   │   ├── .DS_Store
│   │   ├── Albert.py
│   │   ├── Bert.py
│   │   ├── Distilbert.py
│   │   ├── Electra.py
│   │   ├── FastText.py
│   │   ├── Roberta.py
│   │   ├── TextCNN.py
│   │   ├── TextRCNN.py
│   │   ├── TextRNN.py
│   │   ├── Transformer.py
│   │   └── XLNet.py
│   ├── optimal
│   │   ├── .DS_Store
│   │   └── adversarial.py
│   └── tokenizer
│       ├── .DS_Store
│       ├── LMTextTokenizer.py
│       └── TextTokenizer.py
├── requirements.txt
├── run.sh
└── utils
    ├── IOOption.py
    └── progressbar.py

/Config.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import random
4 | 
5 | from module.models.Transformer import Transformer
6 | 
7 | 
8 | 
9 | 
10 | class Config(object):
11 | 
12 |     # 运行模式
13 |     mode = 'train'
14 | 
15 |     # GPU配置
16 |     cuda_visible_devices = '0'                      # 可见的GPU
17 |     device = 'cuda:0'                               # master GPU
18 |     port = str(random.randint(10000,60000))         # 多卡训练进程间通讯端口
19 |     init_method = 'tcp://localhost:' + port         # 多卡训练的通讯地址
20 |     world_size = 1                                  # 进程数,默认为1
21 | 
22 |     # 模型选型
23 |     # 基础模型:FastText/TextCNN/TextRNN/TextRCNN/Transformer
24 |     # 语言模型:Bert/Albert/Roberta/Distilbert/Electra/XLNet
25 |     model_name = 'Bert'
26 |     initial_pretrain_model = 'bert-base-chinese'        # 加载的预训练模型checkpoint
27 |     initial_pretrain_tokenizer = 'bert-base-chinese'    # 加载的预训练分词器checkpoint
28 |     lm_model_list = ['Bert','Albert','Roberta','Distilbert','Electra','XLNet']
29 | 
30 |     # 训练配置
31 |     num_epochs = 30                                 # 迭代次数
32 |     batch_size = 128                                # 每个批次的大小
33 |     learning_rate = 2e-5                            # 学习率
34 |     num_warmup_steps = 0.1                          # warm up步数占总步数的比例
35 |     sen_max_length = 32                             # 句子最长长度
36 |     padding = True                                  # 是否对输入进行padding
37 |     step_save = 1000                                # 多少步保存一次模型
38 |     loss_type = 'ce'
39 | 
40 |     # 对比学习
41 |     cl_option = True                                # 是否使用对比学习
42 |     cl_method = 'Rdrop'                             # Rdrop/InfoNCE
43 |     cl_loss_weight = 0.5                            # 对比学习loss比例
44 |     # 对抗训练
45 |     adv_option = 'None'                             # 是否引入对抗训练:None/FGM/PGD
46 |     adv_name = 'word_embeddings'
47 |     adv_epsilon = 1.0
48 |     # 混合精度训练
49 |     fp16 = False
50 |     fp16_opt_level = 'O1'                           # 训练可选'O1',测试可选'O3'
51 | 
52 |     # 模型及路径配置
53 |     path_root = os.getcwd()
54 |     path_model_save = os.path.join(path_root, 'checkpoints/')      # 模型保存路径
55 |     path_datasets = os.path.join(path_root, 'datasets/THUCNews')   # 数据集
56 |     path_log = os.path.join(path_root, 'logs')
57 |     path_output = os.path.join(path_datasets, 'outputs')
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text_Classifier_Pytorch
2 | 
3 | ## Info
4 | A text classification framework based on PyTorch.
5 | 
6 | Supports text classification on both Chinese and English datasets.
7 | 
8 | 
9 | ## Model
10 | - Non-pretrained models:
11 |     - FastText
12 |     - TextCNN
13 |     - TextRNN
14 |     - TextRCNN
15 |     - Transformer
16 | - Pretrained language models:
17 |     - Bert
18 |     - Albert
19 |     - Roberta
20 |     - Distilbert
21 |     - Electra
22 |     - XLNet
23 | 
24 | 
25 | ## Training Mode Support
26 | 
27 | - Supports training on both Chinese and English corpora
28 | - Supports Chinese and English text classification tasks
29 | - Multiple models to choose from
30 |     - The variable `model_name` in `Config.py` selects the model; change it to the name of the model you want to load (see the example snippet after this list).
31 |     - For pretrained models such as Bert, also update the variables `initial_pretrain_model` and `initial_pretrain_tokenizer` to the pretrained checkpoint you want to load.
32 | - Mixed-precision training
33 |     - Speeds up the training process and shortens training time.
34 |     - Set the variable `fp16` in `Config.py` to `True`.
35 | - Multi-GPU training
36 |     - Distributed training; supports single-GPU and multi-GPU training on a single machine.
37 |     - The variable `cuda_visible_devices` in `Config.py` sets the visible GPU ids; separate multiple ids with `,`.
38 | - Adversarial training
39 |     - Adds perturbations at the embedding layer so the model learns to resist them; can improve performance at the cost of extra training time.
40 |     - The variable `adv_option` in `Config.py` selects the adversarial training mode; FGM and PGD are currently supported.
41 | - Contrastive learning
42 |     - Strengthens the model's semantic feature extraction, borrowing ideas from R-Drop and SimCSE; KL loss and InfoNCE loss are currently supported.
43 |     - Set the variable `cl_option` in `Config.py` to `True` to enable contrastive learning; `cl_method` selects how the contrastive loss is computed.
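For example, switching from the default Bert setup to another pretrained model only requires editing a few fields in `Config.py`. The values below are illustrative (the checkpoint name is taken from the Experiments table); any compatible checkpoint from the Hugging Face hub should work:

```python
# Config.py -- illustrative values, adjust to your own setup
model_name = 'Roberta'                                        # any name from the Model list above
initial_pretrain_model = 'hfl/chinese-roberta-wwm-ext'        # pretrained model checkpoint
initial_pretrain_tokenizer = 'hfl/chinese-roberta-wwm-ext'    # pretrained tokenizer checkpoint

# model_name = 'TextCNN'    # non-pretrained models ignore the two checkpoint fields above
```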
44 | 
45 | 
46 | 
47 | ## Datasets
48 | * **THUCNews**
49 |     * Source: https://github.com/649453932/Chinese-Text-Classification-Pytorch
50 |     * News data drawn from THUCNews.
51 |     * The data covers 10 label classes: finance, realty, stocks, education, science, society, politics, sports, game, entertainment
52 | 
53 | * **Adding your own dataset**
54 |     * Follow the same processing convention as this project: split the data into three parts, train/dev/test, with the text and the label separated by a tab (`\t`).
55 |     * Create a new folder under the ./datasets directory and place the three data files (plus a `class.txt` listing the label names) in it; see the layout sketch at the end of this section.
56 | 
57 | * **Dataset example**
58 |     * Taking THUCNews as an example, each line stores the text and the label id separated by a tab (`\t`), in the following form:
59 | ```
60 | 午评沪指涨0.78%逼近2800 汽车家电农业领涨 2
61 | 卡佩罗:告诉你德国脚生猛的原因 不希望英德战踢点球 7
62 | ```
63 | 
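A minimal layout for a custom dataset might look as follows; the folder name `MyDataset` is a placeholder, and `path_datasets` in `Config.py` then needs to point at this folder. The label ids used in the data files correspond to the (0-based) line positions in `class.txt`:

```
datasets/
└── MyDataset/            # placeholder name; set path_datasets in Config.py accordingly
    ├── class.txt         # one label name per line; line order defines the label ids
    ├── train.txt         # one sample per line: text \t label_id
    ├── dev.txt           # validation split, same format
    └── test.txt          # test split, same format
```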
64 | 
65 | ## Experiments
66 | 
67 | Note: the pretrained models are loaded through the transformers library; to swap in other pretrained weights, see the [transformers model hub](https://huggingface.co/models).
68 | 
69 | | Model | MicroF1 | LearningRate | Pretrained weights |
70 | | :-----| :---- | :---- | :---- |
71 | | FastText | 0.8926 | 1e-3 | - |
72 | | TextCNN | 0.9009 | 1e-3 | - |
73 | | TextRNN | 0.9080 | 1e-3 | - |
74 | | TextRCNN | 0.9142 | 1e-3 | - |
75 | | Transformer (2 layer) | 0.8849 | 1e-3 | - |
76 | | Albert | 0.9124 | 2e-5 | [voidful/albert_chinese_tiny](https://huggingface.co/voidful/albert_chinese_tiny) |
77 | | Distilbert | 0.9209 | 2e-5 | [Geotrend/distilbert-base-zh-cased](https://huggingface.co/Geotrend/distilbert-base-zh-cased) |
78 | | Bert | 0.9401 | 2e-5 | [bert-base-chinese](https://huggingface.co/bert-base-chinese) |
79 | | Roberta | 0.9448 | 2e-5 | [hfl/chinese-roberta-wwm-ext](https://huggingface.co/hfl/chinese-roberta-wwm-ext) |
80 | | Electra | 0.9377 | 2e-5 | [hfl/chinese-electra-base-discriminator](https://huggingface.co/hfl/chinese-electra-base-discriminator) |
81 | | XLNet | 0.9051 | 2e-5 | no pretrained initialization |
82 | 
83 | 
84 | 
85 | 
86 | 
87 | ## Requirement
88 | Python 3.6.x is used; the other dependencies are as follows:
89 | ```
90 | numpy==1.19.2
91 | pandas==1.1.5
92 | scikit_learn==1.0.2
93 | torch==1.8.0
94 | tqdm==4.62.3
95 | transformers==4.15.0
96 | apex==0.1
97 | ```
98 | 
99 | Except for `apex`, which needs to be installed separately (see the official repo: https://github.com/NVIDIA/apex
100 | ), the other dependencies can be installed with:
101 | ```
102 | pip install -r requirements.txt
103 | ```
104 | 
105 | 
106 | ## Get Started
107 | ### 1. Training
108 | Once the training data is ready, run the following command in a terminal:
109 | ```
110 | python3 main.py
111 | ```
112 | ### 2. Evaluation
113 | Loads the trained model, runs it on the test set, and writes the predictions to ./datasets/${your_dataset}/outputs/pred_data.csv.
114 | 
115 | Set the variable `mode = 'test'` in `Config.py` and save the file.
116 | 
117 | Then run:
118 | ```
119 | python3 main.py
120 | ```
121 | 
122 | 
123 | ## Reference
124 | 
125 | [Github:transformers] https://github.com/huggingface/transformers
126 | 
127 | [Paper:Bert] https://arxiv.org/abs/1810.04805
128 | 
129 | [Paper:RDrop] https://arxiv.org/abs/2106.14448
130 | 
131 | [Paper:SimCSE] https://arxiv.org/abs/2104.08821
--------------------------------------------------------------------------------
/datasets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/datasets/.DS_Store
--------------------------------------------------------------------------------
/datasets/THUCNews/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/datasets/THUCNews/.DS_Store
--------------------------------------------------------------------------------
/datasets/THUCNews/class.txt:
--------------------------------------------------------------------------------
1 | finance
2 | realty
3 | stocks
4 | education
5 | science
6 | society
7 | politics
8 | sports
9 | game
10 | entertainment
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import os
4 | import time
5 | import numpy as np
6 | import torch
7 | import logging
8 | from Config import Config
9 | from module.DataManager import DataManager
10 | from module.Trainer import Trainer
11 | from module.Predictor import Predictor
12 | 
13 | 
14 | 
15 | if __name__ == '__main__':
16 | 
17 | 
18 |     config = Config()
19 |     os.environ["CUDA_VISIBLE_DEVICES"] = config.cuda_visible_devices
20 | 
21 |     # 设置随机种子,保证每次结果一样
22 |     np.random.seed(1)
23 |     torch.manual_seed(1)
24 |     torch.cuda.manual_seed_all(1)
25 |     torch.backends.cudnn.deterministic = True
26 |     start_time = time.time()
27 | 
28 |     # 数据处理
29 |     print('read data...')
30 |     dm = DataManager(config)
31 | 
32 |     # 模式
33 |     if config.mode == 'train':
34 |         # 获取数据
35 |         print('data process...')
36 |         train_loader = dm.get_dataset(data_type='train')
37 |         valid_loader = dm.get_dataset(data_type='dev')
38 |         test_loader = dm.get_dataset(data_type='test')
39 |         # 训练
40 |         trainer = Trainer(config, train_loader, valid_loader, test_loader)
41 |         trainer.train()
42 |     elif config.mode == 'test':
43 |         # 测试
44 |         test_loader = dm.get_dataset(data_type='test')
45 |         predictor = Predictor(config)
46 |         predictor.predict(test_loader)
47 |     else:
48 |         print("No task to run!")
49 |         print("You can set the variable 'mode' in Config.py to one of: 
['train', 'test', 'valid'] !") 50 | -------------------------------------------------------------------------------- /module/DataManager.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import random 5 | import math 6 | import numpy as np 7 | import pandas as pd 8 | import pickle as pkl 9 | import torch 10 | # from tqdm.auto import tqdm 11 | from datasets import Dataset, load_dataset, load_metric 12 | from torch.utils.data import DataLoader 13 | from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer 14 | from torch.utils.data import DataLoader, TensorDataset, RandomSampler 15 | from torch.utils.data.distributed import DistributedSampler 16 | from utils.IOOption import open_file, write_text, write_file 17 | 18 | from module.ModelMap import map_tokenizer 19 | from module.tokenizer.TextTokenizer import TextTokenizer 20 | from module.tokenizer.LMTextTokenizer import LMTextTokenizer 21 | 22 | 23 | 24 | 25 | 26 | class DataManager(object): 27 | 28 | def __init__(self, config): 29 | 30 | self.config = config 31 | self.init_gpu_config() # 初始化GPU配置 32 | self.load_label() # 读取标签 33 | self.load_tokenizer() # 读取tokenizer分词模型 34 | 35 | 36 | def init_gpu_config(self): 37 | """ 38 | 初始化GPU并行配置 39 | """ 40 | print('loading GPU config ...') 41 | if self.config.mode == 'train' and torch.cuda.device_count() > 1: 42 | torch.distributed.init_process_group(backend='nccl', 43 | init_method=self.config.init_method, 44 | rank=0, 45 | world_size=self.config.world_size) 46 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 47 | 48 | 49 | def load_label(self): 50 | """ 51 | 读取标签 52 | """ 53 | print('loading tag file ...') 54 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 55 | label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 56 | self.label2ids = {x:i for i,x in enumerate(label)} 57 | self.ids2label = {i:x for i,x in enumerate(label)} 58 | 59 | 60 | def load_tokenizer(self): 61 | """ 62 | 读取分词器 63 | """ 64 | print('loading tokenizer config ...') 65 | tokenizer = map_tokenizer(self.config.model_name) 66 | if not tokenizer: 67 | print('toknizer {} is null, please check your model name.'.format(self.config.model_name)) 68 | 69 | if self.config.model_name not in self.config.lm_model_list: 70 | path_token = os.path.join(self.config.path_datasets, 'vocab.pkl') 71 | self.tokenizer = tokenizer() 72 | # 若存在词表,则直接读取 73 | if os.path.exists(path_token): 74 | self.tokenizer.load(path_token) 75 | else: 76 | # 否则读取训练数据,并创建词表 77 | path_corpus = os.path.join(self.config.path_datasets, 'train.txt') 78 | corpus, _ = open_file(path_corpus, sep='\t') 79 | token2index, _ = self.tokenizer.create(corpus) 80 | # 标签映射表存到本地 81 | write_file(token2index, path_token + '.txt') 82 | pkl.dump(token2index, open(path_token, 'wb')) 83 | self.tokenizer.load(path_token) 84 | else: 85 | tokenizer = tokenizer.from_pretrained(self.config.initial_pretrain_tokenizer) 86 | self.tokenizer = LMTextTokenizer(tokenizer) 87 | print('Vocab size: {}'.format(len(self.tokenizer.token2index))) 88 | 89 | 90 | def get_dataset(self, data_type='train'): 91 | """ 92 | 获取数据集 93 | """ 94 | file = '{}.txt'.format(data_type) 95 | dataloader = self.data_process(file) 96 | return dataloader 97 | 98 | 99 | def data_process(self, file_name): 100 | """ 101 | 数据转换 102 | """ 103 | # 获取数据 104 | path = os.path.join(self.config.path_datasets, 
file_name) 105 | src, tgt = open_file(path, sep='\t') 106 | dataset = pd.DataFrame({'src':src, 'label':tgt}) 107 | # dataset.to_csv('./data/cache.csv', sep='\t', index=False) 108 | # dataframe to datasets 109 | raw_datasets = Dataset.from_pandas(dataset) 110 | # tokenizer. 111 | tokenized_datasets = raw_datasets.map(lambda x: self.tokenize_function(x), batched=True) # 对于样本中每条数据进行数据转换 112 | # data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) # 对数据进行padding 113 | tokenized_datasets = tokenized_datasets.remove_columns(["src"]) # 移除不需要的字段 114 | tokenized_datasets.set_format("torch", columns=["input_ids","attention_mask","label"]) 115 | # 转换成DataLoader类 116 | sampler = RandomSampler(tokenized_datasets) if not torch.cuda.device_count() > 1 else DistributedSampler(tokenized_datasets) 117 | dataloader = DataLoader(tokenized_datasets, sampler=sampler, batch_size=self.config.batch_size) #, collate_fn=data_collator 118 | 119 | return dataloader 120 | 121 | 122 | def tokenize_function(self, example): 123 | """ 124 | 数据转换 125 | """ 126 | # 分词 127 | token = {} 128 | # src = [self.tokenizer.convert_tokens_to_ids(x) for x in example["src"]] 129 | src_origin = [self.tokenizer.tokenizer(x) for x in example["src"]] 130 | src = [ x['input_ids'] for x in src_origin ] 131 | attention_mask = [ x['attention_mask'] for x in src_origin ] 132 | # paddding 133 | src = [self.padding(x) for x in src] 134 | attention_mask = [self.padding_attention(x) for x in attention_mask] 135 | label = [ int(x) for x in example["label"]] 136 | # 添加标签到样本中 137 | token = { 138 | 'input_ids':src, 139 | 'attention_mask':attention_mask, 140 | 'label':label 141 | } 142 | return token 143 | 144 | 145 | 146 | def padding(self, src): 147 | """ 148 | padding 149 | """ 150 | # 裁剪 151 | if len(src) > self.config.sen_max_length: 152 | src = src[:self.config.sen_max_length] 153 | # padding 154 | pad_size = self.config.sen_max_length-len(src) # 待padding的长度 155 | # 添加cls/pad/sep特殊字符 156 | # src = [self.tokenizer.cls_token_id] + src + [self.tokenizer.sep_token_id] + [self.tokenizer.pad_token_id]*pad_size 157 | src = src + [self.tokenizer.pad_token_id]*pad_size 158 | assert len(src) == self.config.sen_max_length, 'input no equal {}'.format(self.config.sen_max_length) 159 | return src 160 | 161 | 162 | def padding_attention(self, attention_mask): 163 | """ 164 | padding attention mask 165 | """ 166 | # 裁剪 167 | if len(attention_mask) > self.config.sen_max_length: 168 | attention_mask = attention_mask[:self.config.sen_max_length] 169 | # padding 170 | pad_size = self.config.sen_max_length-len(attention_mask) # 待padding的长度 171 | # 添加cls/pad/sep特殊字符 172 | attention_mask = attention_mask + [0]*pad_size 173 | assert len(attention_mask) == self.config.sen_max_length, 'input no equal {}'.format(self.config.sen_max_length) 174 | return attention_mask 175 | 176 | -------------------------------------------------------------------------------- /module/LossManager.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn import CrossEntropyLoss 3 | from module.loss.focal_loss import FocalLoss 4 | from module.loss.infonce_loss import InfoNCELoss 5 | from module.loss.kl_loss import KLLoss 6 | from module.loss.label_smoothing import LabelSmoothingCrossEntropy 7 | 8 | 9 | class LossManager(object): 10 | 11 | def __init__(self, loss_type, cl_option=False, loss_cl_type='InfoNCE'): 12 | self.loss_type = loss_type 13 | self.cl_option = cl_option 14 | self.loss_cl_type = loss_cl_type 15 | # 判断配置的loss类型 16 | 
if loss_type == 'focalloss': 17 | self.loss_func = FocalLoss() 18 | elif loss_type == 'LabelSmoothingCrossEntropy': 19 | self.loss_func = LabelSmoothingCrossEntropy() 20 | else: 21 | self.loss_func = CrossEntropyLoss() 22 | 23 | if cl_option: 24 | if loss_cl_type == 'Rdrop': 25 | self.loss_cl_func = KLLoss() 26 | else: 27 | self.loss_cl_func = InfoNCELoss() 28 | 29 | 30 | def compute(self, 31 | input_x, 32 | target, 33 | hidden_emb_x=None, 34 | hidden_emb_y=None, 35 | alpha=0.5): 36 | """ 37 | 计算loss 38 | Args: 39 | input: [N, C] 40 | target: [N, ] 41 | """ 42 | if hidden_emb_x is not None and hidden_emb_y is not None: 43 | loss_ce = (1-alpha) * self.loss_func(input_x, target) 44 | weight_etx = 1e+5 if self.loss_cl_type=='Rdrop' else 1 45 | loss_cl = alpha * weight_etx * self.loss_cl_func(hidden_emb_x, hidden_emb_y) 46 | loss = loss_ce + loss_cl 47 | return loss 48 | else: 49 | loss = self.loss_func(input_x, target) 50 | return loss 51 | 52 | 53 | 54 | # def compute(self, input, target): 55 | # """ 56 | # 计算loss 57 | # Args: 58 | # input: [N, C] 59 | # target: [N, ] 60 | # """ 61 | # loss = self.loss_func(input, target) 62 | # return loss 63 | 64 | 65 | # def compute(self, input1, input2, output_pooler1, output_pooler2, target, alpha=0.5): 66 | # """ 67 | # 计算loss 68 | # Args: 69 | # input: [N, C] 70 | # target: [N, ] 71 | # """ 72 | 73 | # loss_ce = alpha * self.loss_func(input1, target) 74 | # loss_nce = (1-alpha) * self.loss_func_nce(output_pooler1, output_pooler2) 75 | # # loss = alpha*loss_ce + (1-alpha)*loss_nce 76 | # loss = loss_ce + loss_nce 77 | # return loss, loss_ce, loss_nce -------------------------------------------------------------------------------- /module/ModelMap.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from module.models.FastText import FastText, Config as FastTextConfig 4 | from module.models.TextCNN import TextCNN, Config as TextCNNConfig 5 | from module.models.TextRNN import TextRNN, Config as TextRNNConfig 6 | from module.models.TextRCNN import TextRCNN, Config as TextRCNNConfig 7 | from module.models.Transformer import Transformer, Config as TransformerConfig 8 | from module.models.Bert import Bert 9 | from module.models.Albert import Albert 10 | from module.models.Roberta import Roberta 11 | from module.models.Distilbert import Distilbert 12 | from module.models.Electra import Electra 13 | from module.models.XLNet import XLNet 14 | from transformers import AutoTokenizer 15 | 16 | 17 | 18 | 19 | 20 | from transformers import AlbertTokenizer, BertTokenizer, DistilBertTokenizer, RobertaTokenizer, ElectraTokenizer, XLNetTokenizer 21 | from module.tokenizer.TextTokenizer import TextTokenizer 22 | 23 | 24 | def map_model(model_name): 25 | """ 26 | 模型映射函数 27 | """ 28 | dic = { 29 | 'FastText' : FastText, 30 | 'TextCNN' : TextCNN, 31 | 'TextRNN' : TextRNN, 32 | 'TextRCNN' : TextRCNN, 33 | 'Transformer' : Transformer, 34 | 'Bert' : Bert, 35 | 'Albert' : Albert, 36 | 'Roberta' : Roberta, 37 | 'Distilbert' : Distilbert, 38 | 'Electra' : Electra, 39 | 'XLNet' : XLNet 40 | } 41 | model = dic.get(model_name, None) 42 | return model 43 | 44 | 45 | def map_tokenizer(model_name): 46 | """ 47 | 分词器映射函数 48 | """ 49 | dic = { 50 | 'FastText' : TextTokenizer, 51 | 'TextCNN' : TextTokenizer, 52 | 'TextRNN' : TextTokenizer, 53 | 'TextRCNN' : TextTokenizer, 54 | 'Transformer' : TextTokenizer, 55 | 'Bert' : BertTokenizer, 56 | 'Albert' : AutoTokenizer, 57 | 'Roberta' : BertTokenizer, 58 | 'Distilbert' : DistilBertTokenizer, 59 | 
'Electra' : AutoTokenizer, 60 | 'XLNet' : AutoTokenizer 61 | } 62 | tokenizer = dic.get(model_name, None) 63 | return tokenizer 64 | 65 | 66 | def map_config(model_name): 67 | """ 68 | 模型配置映射 69 | """ 70 | dic = { 71 | 'FastText' : FastTextConfig, 72 | 'TextCNN' : TextCNNConfig, 73 | 'TextRNN' : TextRNNConfig, 74 | 'TextRCNN' : TextRCNNConfig, 75 | 'Transformer' : TransformerConfig 76 | } 77 | model = dic.get(model_name, None) 78 | return model -------------------------------------------------------------------------------- /module/Predictor.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from posixpath import sep 4 | import time 5 | import random 6 | import logging 7 | import math 8 | import numpy as np 9 | import pandas as pd 10 | import torch 11 | from apex import amp 12 | from tqdm.auto import tqdm 13 | from datasets import Dataset, load_dataset, load_metric 14 | from torch.utils.data import DataLoader 15 | from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler, get_linear_schedule_with_warmup 16 | from transformers import BertTokenizer, BertConfig, AutoConfig 17 | # from model.BertForMaskedLM import BertForMaskedLM 18 | from sklearn import metrics 19 | 20 | 21 | from Config import Config 22 | from utils.progressbar import ProgressBar 23 | from module.ModelMap import map_model, map_tokenizer 24 | from module.tokenizer.LMTextTokenizer import LMTextTokenizer 25 | 26 | 27 | 28 | 29 | class Predictor(object): 30 | 31 | def __init__(self, config): 32 | self.config = config 33 | # self.test_loader = test_loader 34 | self.device = torch.device(self.config.device) 35 | # 加载模型 36 | self.load_label() 37 | self.load_tokenizer() 38 | self.load_model() 39 | 40 | 41 | def load_label(self): 42 | """ 43 | 读取标签 44 | """ 45 | print('loading tag file ...') 46 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 47 | self.label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 48 | self.label2ids = {x:i for i,x in enumerate(self.label)} 49 | self.ids2label = {i:x for i,x in enumerate(self.label)} 50 | 51 | 52 | def load_tokenizer(self): 53 | """ 54 | 读取分词器 55 | """ 56 | print('loading tokenizer config ...') 57 | tokenizer = map_tokenizer(self.config.model_name) 58 | if not tokenizer: 59 | print('toknizer {} is null, please check your model name.'.format(self.config.model_name)) 60 | 61 | if 'Text' in self.config.model_name or 'Transformer' in self.config.model_name: 62 | path_token = os.path.join(self.config.path_datasets, 'vocab.pkl') 63 | self.tokenizer = tokenizer() 64 | # 若存在词表,则直接读取 65 | if os.path.exists(path_token): 66 | self.tokenizer.load(path_token) 67 | else: 68 | # 否则读取训练数据,并创建词表 69 | print('vacob file not exist: {}'.format(path_token)) 70 | else: 71 | tokenizer = tokenizer.from_pretrained(self.config.initial_pretrain_tokenizer) 72 | self.tokenizer = LMTextTokenizer(tokenizer) 73 | 74 | 75 | def load_model(self): 76 | """ 77 | 加载模型及初始化模型参数 78 | """ 79 | print('loading model...%s' %self.config.model_name) 80 | self.model = map_model(self.config.model_name) 81 | if not self.model: 82 | print('model {} is null, please check your model name.'.format(self.config.model_name)) 83 | 84 | if 'Text' in self.config.model_name or 'Transformer' in self.config.model_name: 85 | path_model = os.path.join(self.config.path_model_save, 'step_best/pytorch_model.bin') 86 | if not os.path.exists(path_model): 87 | print('model checkpoint file not exist: {}'.format(path_model)) 88 | return 89 
| self.model.load_state_dict(torch.load(path_model)) 90 | else: 91 | # 模型路径 92 | path_model = os.path.join(self.config.path_model_save, 'step_best/') 93 | if not os.path.exists(path_model): 94 | print('model checkpoint file not exist: {}'.format(path_model)) 95 | return 96 | path_config = os.path.join(path_model, 'config.json') 97 | model_config = AutoConfig.from_pretrained(path_config) #, num_labels=len(self.label) 98 | self.model = self.model.from_pretrained(path_model, config=model_config) 99 | # 将模型加载到CPU/GPU 100 | self.model.to(self.device) 101 | self.model.eval() 102 | 103 | 104 | def predict(self, test_loader): 105 | """ 106 | 预测 107 | """ 108 | print('predict start') 109 | 110 | # 混合精度 111 | if self.config.fp16: 112 | self.model = amp.initialize(self.model, opt_level='O3') 113 | 114 | # 初始化指标计算 115 | progress_bar = ProgressBar(n_total=len(test_loader), desc='Predict') 116 | src = [] 117 | label = np.array([], dtype=int) 118 | pred = np.array([], dtype=int) 119 | for i, batch in enumerate(test_loader): 120 | # 推断 121 | batch = {k:v.to(self.config.device) for k,v in batch.items()} 122 | with torch.no_grad(): 123 | output = self.model(**batch) 124 | output = output[0] 125 | # 输入文本转换 126 | input_ids = batch['input_ids'].cpu().numpy() 127 | tmp_src_string = self.ids2string(input_ids) 128 | 129 | # 获取标签 130 | tmp_pred = torch.max(output, 1)[1].cpu().numpy() 131 | tmp_label = batch['label'].cpu().numpy() 132 | # 添加到总列表 133 | src.extend(tmp_src_string) 134 | label = np.append(label, tmp_label) 135 | pred = np.append(pred, tmp_pred) 136 | progress_bar(i, {}) 137 | 138 | # 计算指标 139 | # report = metrics.classification_report(label, pred, target_names=self.label, digits=4) 140 | # confusion = metrics.confusion_matrix(label, pred) 141 | # print('Evaluate Classifier Performance') 142 | # print(report) 143 | 144 | # 保存 145 | data = {'src':src, 'label':label, 'pred':pred} 146 | data = pd.DataFrame(data) 147 | if not os.path.exists(self.config.path_output): 148 | os.mkdir(self.config.path_output) 149 | path_output = os.path.join(self.config.path_output, 'pred_data.csv') 150 | data.to_csv(path_output, sep='\t', index=False) 151 | print('predict result save: {}'.format(path_output)) 152 | 153 | 154 | 155 | def ids2string(self, input_ids): 156 | """ 157 | 将模型输出转换成中文 158 | """ 159 | # 获取特殊字符 160 | special_tokens = self.tokenizer.get_special_tokens() 161 | src = [] 162 | for line in input_ids: 163 | # 分开是否是预训练语言 164 | if self.config.model_name in self.config.lm_model_list: 165 | src_line = self.tokenizer.tokenizer.convert_ids_to_tokens(line) 166 | # 过滤特殊字符 167 | src_line = [x for x in src_line if x not in special_tokens] 168 | src_line = ' '.join(src_line) 169 | else: 170 | src_line = '' 171 | for x in line: 172 | tmp_x = self.tokenizer.index2token.get(x, '') 173 | # 跳过特殊字符 174 | if tmp_x not in special_tokens: 175 | src_line += tmp_x 176 | src.append(src_line) 177 | return src 178 | 179 | 180 | -------------------------------------------------------------------------------- /module/Trainer.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from posixpath import sep 4 | import time 5 | import random 6 | import logging 7 | import math 8 | import numpy as np 9 | import pandas as pd 10 | import pickle as pkl 11 | import torch 12 | import torch.nn as nn 13 | from apex import amp 14 | from tqdm.auto import tqdm 15 | from datasets import Dataset, load_dataset, load_metric 16 | from torch.utils.data import DataLoader 17 | from transformers import AdamW, 
AutoModelForSequenceClassification, get_scheduler, get_linear_schedule_with_warmup 18 | from transformers import BertTokenizer, BertConfig, AutoConfig 19 | # from model.BertForMaskedLM import BertForMaskedLM 20 | 21 | 22 | import torch.nn.functional as F 23 | from sklearn import metrics 24 | 25 | from utils.progressbar import ProgressBar 26 | from module.optimal.adversarial import FGM,PGD 27 | from module.ModelMap import map_model, map_config, map_tokenizer 28 | from module.LossManager import LossManager 29 | 30 | 31 | 32 | class Trainer(object): 33 | 34 | def __init__(self, config, train_loader, valid_loader, test_loader): 35 | self.config = config 36 | # 设置GPU环境 37 | self.device = torch.device(self.config.device) 38 | # 加载数据集 39 | self.train_loader = train_loader 40 | self.valid_loader = valid_loader 41 | self.test_loader = test_loader 42 | # 加载标签 43 | self.load_label() 44 | # 加载模型 45 | self.load_tokenizer() 46 | self.load_model() 47 | # 加载loss计算类 48 | self.loss_manager = LossManager(loss_type=config.loss_type, cl_option=config.cl_option, loss_cl_type=config.cl_method) 49 | 50 | 51 | 52 | def load_label(self): 53 | """ 54 | 读取标签 55 | """ 56 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 57 | self.label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 58 | self.label2ids = {x:i for i,x in enumerate(self.label)} 59 | self.ids2label = {i:x for i,x in enumerate(self.label)} 60 | 61 | 62 | def load_tokenizer(self): 63 | """ 64 | 读取分词器 65 | """ 66 | self.tokenizer = map_tokenizer(self.config.model_name) 67 | 68 | 69 | def load_model(self): 70 | """ 71 | 加载模型及初始化模型参数 72 | """ 73 | # 读取模型 74 | print('loading model...%s' %self.config.model_name) 75 | self.model = map_model(self.config.model_name) 76 | if not self.model: 77 | print('model {} is null, please check your model name.'.format(self.config.model_name)) 78 | 79 | if self.config.model_name not in self.config.lm_model_list: 80 | # self.model = map_model(self.config.model_name) 81 | model_config = map_config(self.config.model_name)(self.config) 82 | self.model = self.model(model_config) 83 | # 重新初始化模型参数 84 | self.init_network() 85 | else: 86 | # self.tokenizer = map_tokenizer(self.config.model_name).from_pretrained(self.config.model_pretrain_online_checkpoint) 87 | # self.tokenizer.save_pretrained(self.config.path_tokenizer) 88 | # self.func_index2token = self.tokenizer.convert_ids_to_tokens 89 | # 加载预训练模型 90 | model_config = AutoConfig.from_pretrained(self.config.initial_pretrain_model, num_labels=len(self.label)) #, num_labels=len(self.label2ids) 91 | self.model = self.model.from_pretrained(self.config.initial_pretrain_model, config=model_config) 92 | # 将模型加载到CPU/GPU 93 | self.model.to(self.device) 94 | 95 | 96 | def init_network(self, method='xavier', exclude='embedding', seed=123): 97 | """ 98 | # 权重初始化,默认xavier 99 | """ 100 | for name, w in self.model.named_parameters(): 101 | if exclude not in name: 102 | if 'weight' in name: 103 | if method == 'xavier': 104 | if 'transformer' in name: 105 | nn.init.uniform_(w, -0.1, 0.1) 106 | else: 107 | nn.init.xavier_normal_(w) 108 | elif method == 'kaiming': 109 | nn.init.kaiming_normal_(w) 110 | else: 111 | nn.init.normal_(w) 112 | elif 'bias' in name: 113 | nn.init.constant_(w, 0) 114 | else: 115 | pass 116 | 117 | 118 | def train(self): 119 | """ 120 | 预训练模型 121 | """ 122 | # weight decay 123 | # bert_parameters = self.model.bert.named_parameters() 124 | # start_parameters = self.model.start_fc.named_parameters() 125 | # end_parameters = 
self.model.end_fc.named_parameters() 126 | # no_decay = ["bias", "LayerNorm.weight"] 127 | # optimizer_grouped_parameters = [ 128 | # {"params": [p for n, p in bert_parameters if not any(nd in n for nd in no_decay)], 129 | # "weight_decay": 0.01, 'lr': self.config.learning_rate}, 130 | # {"params": [p for n, p in bert_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 131 | # , 'lr': self.config.learning_rate}, 132 | # {"params": [p for n, p in start_parameters if not any(nd in n for nd in no_decay)], 133 | # "weight_decay": 0.01, 'lr': 0.001}, 134 | # {"params": [p for n, p in start_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 135 | # , 'lr': 0.001}, 136 | # {"params": [p for n, p in end_parameters if not any(nd in n for nd in no_decay)], 137 | # "weight_decay": 0.01, 'lr': 0.001}, 138 | # {"params": [p for n, p in end_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 139 | # , 'lr': 0.001}] 140 | # step_total = self.config.num_epochs * len(self.train_loader) * self.config.batch_size 141 | # # step_total = 640 #len(train_ld)*config.batch_size // config.num_epochs 142 | # warmup_steps = int(step_total * self.config.num_warmup_steps) 143 | # self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.learning_rate, eps=1e-8) 144 | # self.lr_scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, 145 | # num_training_steps=step_total) 146 | 147 | # 定义优化器配置 148 | # num_training_steps = self.config.num_epochs * len(self.train_loader) 149 | # 总的训练次数 150 | step_total = self.config.num_epochs * len(self.train_loader) * self.config.batch_size 151 | # warm up的次数 152 | warmup_steps = int(step_total * self.config.num_warmup_steps) 153 | if self.config.model_name not in self.config.lm_model_list: 154 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate) 155 | else: 156 | self.optimizer = AdamW(self.model.parameters(), lr=self.config.learning_rate) 157 | self.lr_scheduler = get_scheduler( 158 | "linear", 159 | optimizer=self.optimizer, 160 | num_warmup_steps=self.config.num_warmup_steps, 161 | num_training_steps=step_total 162 | ) 163 | # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, 164 | # num_training_steps=t_total) 165 | 166 | # 混合精度训练 167 | if self.config.fp16: 168 | self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.config.fp16_opt_level) 169 | # 分布式训练 170 | if torch.cuda.device_count() > 1: 171 | self.model = torch.nn.parallel.DistributedDataParallel(self.model, find_unused_parameters=True) 172 | # 对抗训练 173 | if self.config.adv_option == 'FGM': 174 | self.fgm = FGM(self.model, emb_name=self.config.adv_name, epsilon=self.config.adv_epsilon) 175 | if self.config.adv_option == 'PGD': 176 | self.pgd = PGD(self.model, emb_name=self.config.adv_name, epsilon=self.config.adv_epsilon) 177 | 178 | # Train! 
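        # The block below first logs the run configuration and the model's parameter
        # shapes, then loops over the training data for num_epochs epochs: each batch
        # goes through self.step() (forward pass, loss computation, backward pass and
        # optimizer update), and every step_save steps the model is evaluated on the
        # validation set and checkpointed, keeping the best micro-F1 model under
        # checkpoints/step_best/. Note that step_total above is
        # num_epochs * len(train_loader) * batch_size, i.e. a sample count rather than
        # an optimizer-step count, so it is an upper bound on the number of scheduler
        # steps actually taken.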
179 | print("\n>>>>>>>> Running training >>>>>>>>") 180 | print(" Num examples = %d" %(len(self.train_loader)*self.config.batch_size)) 181 | print(" Num Epochs = %d" %self.config.num_epochs) 182 | print(" Batch size per GPU = %d"%self.config.batch_size) 183 | print(" GPU ids = %s" %self.config.cuda_visible_devices) 184 | print(" Total step = %d" %step_total) 185 | print(" Warm up step = %d" %warmup_steps) 186 | print(" FP16 Option = %s" %self.config.fp16) 187 | print(">>>>>>>> Running training >>>>>>>>\n") 188 | 189 | print(">>>>>>>> Model Structure >>>>>>>>") 190 | for name,parameters in self.model.named_parameters(): 191 | print(name,':',parameters.size()) 192 | print(">>>>>>>> Model Structure >>>>>>>>\n") 193 | 194 | # step_total = config.num_epochs * len(train_ld) 195 | step_current = 0 196 | f1_best = 0 197 | for epoch in range(self.config.num_epochs): 198 | progress_bar = ProgressBar(n_total=len(self.train_loader), desc='Training epoch:{0}'.format(epoch)) 199 | for i, batch in enumerate(self.train_loader): 200 | # 模型推断及计算损失 201 | self.model.train() 202 | loss = self.step(batch) 203 | progress_bar(i, {'loss': loss.item()}) 204 | # progress_bar(i, {'loss': loss.item(),'loss_ce': loss_ce.item(),'loss_cl': loss_nce.item()}) 205 | step_current += 1 206 | # 模型保存 207 | if step_current%self.config.step_save==0 and step_current>0: 208 | # 模型评估 209 | f1_eval = self.evaluate(self.valid_loader) 210 | # 模型保存 211 | f1_best = self.save_checkpoint(step_current, f1_eval, f1_best) 212 | print('\nEpoch:{} Iter:{}/{} loss:{:.4f}\n'.format(epoch, step_current, step_total, loss.item())) 213 | self.evaluate(self.test_loader, print_table=True) 214 | 215 | 216 | 217 | def step(self, batch): 218 | """ 219 | 每一个batch的训练过程 220 | """ 221 | 222 | # 数据操作 223 | batch = {k:v.to(self.device) for k,v in batch.items()} 224 | target = batch['label'] 225 | # 模型输入&输出 226 | outputs = self.model(**batch) 227 | output, hidden_emb = outputs 228 | # 对比学习 229 | if self.config.cl_option: 230 | # 重新获取一次模型输出 231 | outputs_etx = self.model(**batch) 232 | _, hidden_emb_etx = outputs_etx 233 | loss = self.loss_manager.compute(output, target, hidden_emb, hidden_emb_etx, alpha=self.config.cl_loss_weight) 234 | else: 235 | loss = self.loss_manager.compute(output, target) 236 | # 反向传播 237 | if torch.cuda.device_count() > 1: 238 | loss = loss.mean() 239 | if self.config.fp16: 240 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 241 | scaled_loss.backward() 242 | else: 243 | loss.backward() 244 | # 对抗训练 245 | self.attack_train(batch) 246 | # 梯度操作 247 | self.optimizer.step() 248 | if self.config.model_name in self.config.lm_model_list: 249 | self.lr_scheduler.step() 250 | self.model.zero_grad() 251 | # self.optimizer.zero_grad() 252 | return loss 253 | 254 | 255 | def attack_train(self, batch): 256 | """ 257 | 对抗训练 258 | """ 259 | # FGM 260 | if self.config.adv_option == 'FGM': 261 | self.fgm.attack() 262 | output = self.model(**batch)[0] 263 | loss_adv = self.loss_manager.compute(output, batch['label']) 264 | if torch.cuda.device_count() > 1: 265 | loss_adv = loss_adv.mean() 266 | loss_adv.backward() 267 | self.fgm.restore() 268 | # PGD 269 | if self.config.adv_option == 'PGD': 270 | self.pgd.backup_grad() 271 | K = 3 272 | for t in range(K): 273 | self.pgd.attack(is_first_attack=(t==0)) # 在embedding上添加对抗扰动, first attack时备份param.data 274 | if t != K-1: 275 | self.model.zero_grad() 276 | else: 277 | self.pgd.restore_grad() 278 | output = self.model(**batch)[0] 279 | loss_adv = self.loss_manager.compute(output, batch['label']) 280 
| loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 281 | self.pgd.restore() 282 | 283 | 284 | def save_checkpoint(self, step_current, f1_eval, f1_best): 285 | """ 286 | 模型保存 287 | """ 288 | if f1_eval != 0: 289 | # 保存路径 290 | path = os.path.join(self.config.path_model_save, 'step_{}'.format(step_current)) 291 | if not os.path.exists(path): 292 | os.makedirs(path) 293 | # 保存当前step的模型 294 | if self.config.model_name not in self.config.lm_model_list: 295 | path_model = os.path.join(path, 'pytorch_model.bin') 296 | torch.save(self.model.state_dict(), path_model) 297 | else: 298 | model_save = self.model.module if torch.cuda.device_count() > 1 else self.model 299 | model_save.save_pretrained(path) 300 | print('Saving model: {}'.format(path)) 301 | # 保存最优的模型 302 | if f1_eval > f1_best: 303 | # 创建文件夹 304 | path = os.path.join(self.config.path_model_save, 'step_best/') 305 | if not os.path.exists(path): 306 | os.makedirs(path) 307 | # 模型保存 308 | if self.config.model_name not in self.config.lm_model_list: 309 | path_model = os.path.join(path, 'pytorch_model.bin') 310 | torch.save(self.model.state_dict(), path_model) 311 | else: 312 | model_save = self.model.module if torch.cuda.device_count() > 1 else self.model 313 | model_save.save_pretrained(path) 314 | f1_best = f1_eval 315 | print('Saving best model: {}\n'.format(path)) 316 | return f1_best 317 | 318 | 319 | def evaluate(self, data, print_table=False): 320 | """ 321 | 模型测试集效果评估 322 | """ 323 | self.model.eval() 324 | loss_total = 0 325 | predict_all = np.array([], dtype=int) 326 | labels_all = np.array([], dtype=int) 327 | loss_manager = LossManager(loss_type=self.config.loss_type, cl_option=False) 328 | with torch.no_grad(): 329 | for i, batch in enumerate(data): 330 | batch = {k:v.to(self.device) for k,v in batch.items()} 331 | output = self.model(**batch)[0] 332 | # 计算loss 333 | # loss = F.cross_entropy(outputs, labels) 334 | # loss_total += outputx[0] 335 | target = batch['label'] 336 | loss = loss_manager.compute(output, target) 337 | loss_total += loss 338 | # 获取标签 339 | labels = batch['label'].cpu().numpy()#[:,1:-1] 340 | predic = torch.max(output, -1)[1].cpu().numpy() 341 | labels_all = np.append(labels_all, labels) 342 | predict_all = np.append(predict_all, predic) 343 | # 计算指标 344 | acc = metrics.accuracy_score(labels_all, predict_all) 345 | f1 = metrics.f1_score(labels_all, predict_all, average='micro') 346 | print('\n>>Eval Set>>: Loss:{:.4f} Acc:{} MicroF1:{:.4f}'.format(loss_total.item(), acc, f1)) 347 | # {'micro', 'macro', 'samples','weighted', 'binary'} 348 | if print_table: 349 | # 打印指标 350 | report = metrics.classification_report(labels_all, predict_all, target_names=self.label, digits=4) 351 | confusion = metrics.confusion_matrix(labels_all, predict_all) 352 | print('\nEvaluate Classifier Performance '+'#'*50) 353 | print(report) 354 | print('\nConfusion Matrix') 355 | print(confusion) 356 | print('#'*60) 357 | 358 | return f1 359 | 360 | -------------------------------------------------------------------------------- /module/loss/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/loss/.DS_Store -------------------------------------------------------------------------------- /module/loss/dice_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class DiceLoss(nn.Module): 5 | """DiceLoss 
implemented from 'Dice Loss for Data-imbalanced NLP Tasks' 6 | Useful in dealing with unbalanced data 7 | """ 8 | def __init__(self): 9 | super(DiceLoss, self).__init__() 10 | 11 | def forward(self,input, target): 12 | ''' 13 | input: [N, C] 14 | target: [N, ] 15 | ''' 16 | prob = torch.softmax(input, dim=1) 17 | prob = torch.gather(prob, dim=1, index=target.unsqueeze(1)) 18 | dsc_i = 1 - ((1 - prob) * prob) / ((1 - prob) * prob + 1) 19 | dice_loss = dsc_i.mean() 20 | return dice_loss 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /module/loss/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FocalLoss(nn.Module): 6 | '''Multi-class Focal loss implementation''' 7 | def __init__(self, gamma=2, weight=None,ignore_index=-100): 8 | super(FocalLoss, self).__init__() 9 | self.gamma = gamma 10 | self.weight = weight 11 | self.ignore_index=ignore_index 12 | 13 | def forward(self, input, target): 14 | """ 15 | input: [N, C] 16 | target: [N, ] 17 | """ 18 | logpt = F.log_softmax(input, dim=1) 19 | pt = torch.exp(logpt) 20 | logpt = (1-pt)**self.gamma * logpt 21 | loss = F.nll_loss(logpt, target, self.weight,ignore_index=self.ignore_index) 22 | return loss 23 | -------------------------------------------------------------------------------- /module/loss/infonce_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from torch._C import LongTensor, dtype 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class InfoNCELoss(nn.Module): 8 | '''InfoNCE loss implementation''' 9 | def __init__(self, temperature=0.999): 10 | super(InfoNCELoss, self).__init__() 11 | self.temperature = temperature 12 | 13 | 14 | def forward(self, input1, input2): 15 | """ 16 | input1: [N, C] 17 | input2: [N, C] 18 | """ 19 | 20 | # p_matrix_sim = torch.cosine_similarity(input1, input2, dim=1) 21 | 22 | # positive 2 norm 23 | norm_1 = torch.norm(input1,p=2,dim=1) # [N,] 24 | norm_2 = torch.norm(input2,p=2,dim=1) # [N,] 25 | norm_m = norm_1.mul(norm_2.t()) # [N,N] 26 | eps = torch.tensor(1e-8) 27 | norm = 1/torch.max(norm_m, eps) # [N,N] 28 | # norm = 1/norm_m 29 | 30 | # negative 2 norm 31 | norm_n_m = norm_1.mul(norm_1.t()) # [N,N] 32 | norm_n = 1/torch.max(norm_n_m, eps) # [N,N] 33 | # norm_n = 1/norm_n_m 34 | 35 | # positive sample 36 | p_matrix_sim = input1.mm(input2.t()) # [N,N] 37 | p_matrix_sim = p_matrix_sim.mul(norm) # [N,N] 38 | p_sim = torch.diag(p_matrix_sim) # [N,] 39 | p_sim_zero_matrix = torch.diag_embed(p_sim) # [N,N] 40 | # negative sample 41 | matrix_sim = input1.mm(input1.t()) # [N,N] 42 | matrix_sim = matrix_sim.mul(norm_n) # [N,N] 43 | drop_diag = torch.diag(matrix_sim) 44 | drop_diag_zero_matrix = torch.diag_embed(drop_diag) 45 | # 减去对角线元素 46 | matrix_sim_drop = matrix_sim - drop_diag_zero_matrix 47 | # 对角线加上新的元素 48 | n_matrix_sim = matrix_sim_drop + p_sim_zero_matrix # [N,N] 49 | 50 | # positive score 51 | p_exp = torch.exp(p_sim/self.temperature) # [N,] 52 | # total sample score 53 | total_exp = torch.exp(n_matrix_sim/self.temperature) # [N,N] 54 | total_exp_sum = total_exp.sum(dim=0) 55 | # loss 56 | loss = torch.log(p_exp/total_exp_sum) 57 | loss = -1 * loss 58 | loss = loss.mean() 59 | # print('positive exp:{} negative exp:{}'.format(p_exp.mean(),total_exp.mean())) 60 | return loss 61 | 62 | 63 | 64 | if __name__ == '__main__': 65 | 
infonce = InfoNCELoss() 66 | input1 = torch.randn([20,5]) 67 | input2 = torch.randn([20,5]) 68 | target = torch.randint(0,5,[20,]) 69 | input_ids_anti = torch.randn([20,50,5]) 70 | label_anti = torch.randint(0,5,[20,50,]) 71 | 72 | loss = infonce(input1=input1, input2=input2) 73 | print(1) 74 | 75 | 76 | -------------------------------------------------------------------------------- /module/loss/kl_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class KLLoss(nn.Module): 7 | 8 | 9 | def __init__(self, temperature=0.1): 10 | super(KLLoss, self).__init__() 11 | self.temperature = temperature 12 | 13 | 14 | def forward(self, p, q, reduce='mean'): 15 | """ 16 | 计算KL divergence loss 17 | p: [N, C] 18 | q: [N, C] 19 | """ 20 | # 转换成log probabilities 21 | p = F.softmax(p, dim=-1) 22 | q = F.softmax(q, dim=-1) 23 | # 计算损失 24 | loss_func = torch.nn.KLDivLoss(size_average=False, reduce=False) 25 | loss_pq = loss_func(p.log(), q) 26 | loss_qp = loss_func(q.log(), p) 27 | 28 | if reduce == 'sum': 29 | loss_pq = loss_pq.sum() 30 | loss_qp = loss_qp.sum() 31 | else: 32 | loss_pq = loss_pq.mean() 33 | loss_qp = loss_qp.mean() 34 | loss = (loss_pq + loss_qp) / 2 35 | return loss -------------------------------------------------------------------------------- /module/loss/label_smoothing.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class LabelSmoothingCrossEntropy(nn.Module): 5 | def __init__(self, eps=0.1, reduction='mean',ignore_index=-100): 6 | super(LabelSmoothingCrossEntropy, self).__init__() 7 | self.eps = eps 8 | self.reduction = reduction 9 | self.ignore_index = ignore_index 10 | 11 | def forward(self, output, target): 12 | c = output.size()[-1] 13 | log_preds = F.log_softmax(output, dim=-1) 14 | if self.reduction=='sum': 15 | loss = -log_preds.sum() 16 | else: 17 | loss = -log_preds.sum(dim=-1) 18 | if self.reduction=='mean': 19 | loss = loss.mean() 20 | return loss*self.eps/c + (1-self.eps) * F.nll_loss(log_preds, target, reduction=self.reduction, 21 | ignore_index=self.ignore_index) -------------------------------------------------------------------------------- /module/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/models/.DS_Store -------------------------------------------------------------------------------- /module/models/Albert.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import AlbertPreTrainedModel, AlbertModel 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | class Albert(AlbertPreTrainedModel): 9 | 10 | def __init__(self, config): 11 | super(Albert, self).__init__(config) 12 | self.albert = AlbertModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 16 | 17 | def forward(self, input_ids, attention_mask, label=None): 18 | output_albert = self.albert(input_ids, attention_mask=attention_mask) 19 | output = self.fc(output_albert.pooler_output) 20 | return [output,output_albert.pooler_output] 21 | 22 | 
-------------------------------------------------------------------------------- /module/models/Bert.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from transformers import BertPreTrainedModel, BertModel, BertTokenizer 6 | from torch.nn import CrossEntropyLoss 7 | 8 | 9 | class Bert(BertPreTrainedModel): 10 | 11 | def __init__(self, config): 12 | super(Bert, self).__init__(config) 13 | self.bert = BertModel(config) 14 | self.hidden_size = config.hidden_size 15 | self.num_classes = config.num_labels 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | 18 | def forward(self, 19 | input_ids, 20 | attention_mask, 21 | label=None, 22 | input_ids_anti=None, 23 | label_anti=None): 24 | # inference 25 | output_bert = self.bert(input_ids, attention_mask=attention_mask) #(batch_size, sen_length, hidden_size) 26 | output_pooler = output_bert.pooler_output 27 | output = self.fc(output_pooler) 28 | 29 | return [output, output_pooler] 30 | 31 | -------------------------------------------------------------------------------- /module/models/Distilbert.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | # from pytorch_pretrained_bert import BertModel, BertTokenizer 5 | # from pytorch_pretrained import BertModel, BertTokenizer 6 | # from transformers import BertPreTrainedModel, BertModel, BertTokenizer 7 | from transformers import DistilBertPreTrainedModel, DistilBertModel, DistilBertTokenizer 8 | from torch.nn import CrossEntropyLoss 9 | import torch.nn.functional as F 10 | 11 | 12 | 13 | class Distilbert(DistilBertPreTrainedModel): 14 | 15 | def __init__(self, config): 16 | super(Distilbert, self).__init__(config) 17 | self.distilbert = DistilBertModel(config) 18 | # self.pool_layer = BertPooler(config) 19 | self.hidden_size = config.hidden_size #768 20 | self.num_classes = config.num_labels 21 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 22 | 23 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 24 | # self.activation = nn.Tanh() 25 | 26 | 27 | def forward(self, input_ids, attention_mask, label=None): 28 | output = self.distilbert(input_ids, attention_mask=attention_mask) 29 | # out = self.fc(output.pooler_output) 30 | # pooling 31 | first_token_tensor = output.last_hidden_state[:, 0] 32 | pooler_output = self.dense(first_token_tensor) 33 | # pooler_output = self.activation(pooler_output) 34 | # pooler_output = self.pool_layer(pooler_output) 35 | # class 36 | output = self.fc(pooler_output) 37 | return [output,pooler_output] 38 | 39 | 40 | # class BertPooler(nn.Module): 41 | # def __init__(self, config): 42 | # super().__init__() 43 | # self.dense = nn.Linear(config.hidden_size, config.hidden_size) 44 | # self.activation = nn.Tanh() 45 | 46 | # def forward(self, hidden_states): 47 | # # We "pool" the model by simply taking the hidden state corresponding 48 | # # to the first token. 
49 | # first_token_tensor = hidden_states[:, 0] 50 | # pooled_output = self.dense(first_token_tensor) 51 | # pooled_output = self.activation(pooled_output) 52 | # return pooled_output -------------------------------------------------------------------------------- /module/models/Electra.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import ElectraPreTrainedModel, ElectraModel, ElectraTokenizer 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | class Electra(ElectraPreTrainedModel): 9 | 10 | def __init__(self, config): 11 | super(Electra, self).__init__(config) 12 | self.electra = ElectraModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.dense = nn.Linear(self.hidden_size, self.hidden_size) 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | self.activation = nn.Tanh() 18 | 19 | def forward(self, input_ids, attention_mask, label=None): 20 | output = self.electra(input_ids, attention_mask=attention_mask) 21 | 22 | first_token_tensor = output.last_hidden_state[:, 0] 23 | pooler_output = self.dense(first_token_tensor) 24 | pooler_output = self.activation(pooler_output) 25 | output = self.fc(pooler_output) 26 | return [output,pooler_output] 27 | 28 | -------------------------------------------------------------------------------- /module/models/FastText.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | from torch.nn import CrossEntropyLoss 9 | 10 | 11 | class Config(object): 12 | 13 | """配置参数""" 14 | def __init__(self, config): 15 | self.model_name = 'FastText' 16 | path_class = os.path.join(config.path_datasets, 'class.txt') 17 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 18 | self.num_classes = len(self.class_list) # 类别数 19 | # embedding config 20 | file_embedding = 'random' 21 | path_embedding = os.path.join(config.path_datasets, file_embedding) 22 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 23 | self.embed = self.embedding_pretrained.size(1)\ 24 | if self.embedding_pretrained is not None else 300 # 字向量维度 # 预训练词向量 25 | # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu') # 设备 26 | 27 | # vocab 28 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 29 | toekn2index = pkl.load(open(path_vocab, 'rb')) 30 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 31 | # model config 32 | self.dropout = 0.5 # 随机失活 33 | self.hidden_size = 256 34 | 35 | 36 | class FastText(nn.Module): 37 | def __init__(self, config): 38 | super(FastText, self).__init__() 39 | if config.embedding_pretrained is not None: 40 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 41 | else: 42 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 43 | self.dropout = nn.Dropout(config.dropout) 44 | self.fc1 = nn.Linear(config.embed, config.hidden_size) 45 | # self.dropout2 = nn.Dropout(config.dropout) 46 | self.fc2 = nn.Linear(config.hidden_size, config.num_classes) 47 | 48 | def forward(self, input_ids, label, attention_mask): 49 | 50 | out = 
self.embedding(input_ids) # size: (batch_size, seq_len, dim) 51 | out = out.mean(dim=1) # size: (batch_size, dim) 52 | out = self.dropout(out) 53 | out = self.fc1(out) # size: (batch_size, hidden_size) 54 | out_relu = F.relu(out) 55 | out = self.fc2(out_relu) # size: (batch_size, num_class) 56 | return [out,out_relu] 57 | # # 计算loss 58 | # loss = None 59 | # if label is not None: 60 | # loss_func = CrossEntropyLoss() 61 | # # out_softmax = F.softmax(out) 62 | # loss = loss_func(out, label) 63 | # # loss = F.cross_entropy(out, label) 64 | # output = (loss, out) 65 | # return output 66 | 67 | -------------------------------------------------------------------------------- /module/models/Roberta.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import FlaxRobertaPreTrainedModel, RobertaModel, BertModel, BertPreTrainedModel 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | # class Roberta(FlaxRobertaPreTrainedModel): 9 | class Roberta(BertPreTrainedModel): 10 | 11 | def __init__(self, config): 12 | super(Roberta, self).__init__(config) 13 | self.bert = BertModel(config) 14 | self.hidden_size = config.hidden_size 15 | self.num_classes = config.num_labels 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | 18 | def forward(self, input_ids, attention_mask, label=None): 19 | 20 | output = self.bert(input_ids, attention_mask=attention_mask) 21 | out = self.fc(output.pooler_output) 22 | return [out,output.pooler_output] 23 | 24 | -------------------------------------------------------------------------------- /module/models/TextCNN.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | from torch.nn import CrossEntropyLoss 9 | 10 | 11 | class Config(object): 12 | 13 | """配置参数""" 14 | def __init__(self, config): 15 | self.model_name = 'TextCNN' 16 | path_class = os.path.join(config.path_datasets, 'class.txt') 17 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 18 | self.num_classes = len(self.class_list) # 类别数 19 | # embedding config 20 | file_embedding = 'random' 21 | path_embedding = os.path.join(config.path_datasets, file_embedding) 22 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 23 | self.embed = self.embedding_pretrained.size(1)\ 24 | if self.embedding_pretrained is not None else 300 # 字向量维度 25 | # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu') # 设备 26 | # vocab 27 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 28 | toekn2index = pkl.load(open(path_vocab, 'rb')) 29 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 30 | # model config 31 | self.dropout = 0.5 # 随机失活 32 | self.filter_sizes = (2, 3, 4) # 卷积核尺寸 33 | self.num_filters = 256 # 卷积核数量(channels数) 34 | 35 | 36 | class TextCNN(nn.Module): 37 | def __init__(self, config): 38 | super(TextCNN, self).__init__() 39 | self.c = config 40 | if config.embedding_pretrained is not None: 41 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 42 | else: 43 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 44 | self.convs = nn.ModuleList( 45 | 
[nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes]) 46 | self.dropout = nn.Dropout(config.dropout) 47 | self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes) 48 | 49 | def conv_and_pool(self, x, conv): 50 | x = F.relu(conv(x)).squeeze(3) 51 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 52 | return x 53 | 54 | def forward(self, input_ids, label, attention_mask): 55 | out = self.embedding(input_ids) 56 | out = out.unsqueeze(1) 57 | out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) 58 | out_drop = self.dropout(out) 59 | out = self.fc(out_drop) 60 | return [out, out_drop] 61 | 62 | # # 计算loss 63 | # loss = None 64 | # if label is not None: 65 | # loss_func = CrossEntropyLoss() 66 | # # out_softmax = F.softmax(out) 67 | # loss = loss_func(out, label) 68 | # # loss = F.cross_entropy(out, label) 69 | # output = (loss, out) 70 | # return output 71 | -------------------------------------------------------------------------------- /module/models/TextRCNN.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | 9 | 10 | class Config(object): 11 | 12 | """配置参数""" 13 | def __init__(self, config): 14 | self.model_name = 'TextRCNN' 15 | path_class = os.path.join(config.path_datasets, 'class.txt') 16 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 17 | self.num_classes = len(self.class_list) # 类别数 18 | # embedding config 19 | file_embedding = 'random' 20 | path_embedding = os.path.join(config.path_datasets, file_embedding) 21 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 22 | self.embed = self.embedding_pretrained.size(1)\ 23 | if self.embedding_pretrained is not None else 300 # 字向量维度 24 | # vocab 25 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 26 | toekn2index = pkl.load(open(path_vocab, 'rb')) 27 | self.n_vocab = len(toekn2index.keys()) 28 | # model config 29 | self.dropout = 0.5 # 随机失活 30 | self.hidden_size = 256 # lstm隐藏层 31 | self.num_layers = 1 # lstm层数 32 | self.pad_size = config.sen_max_length 33 | 34 | 35 | class TextRCNN(nn.Module): 36 | def __init__(self, config): 37 | super(TextRCNN, self).__init__() 38 | if config.embedding_pretrained is not None: 39 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 40 | else: 41 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 42 | self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers, 43 | bidirectional=True, batch_first=True, dropout=config.dropout) 44 | self.maxpool = nn.MaxPool1d(config.pad_size) 45 | self.fc = nn.Linear(config.hidden_size * 2 + config.embed, config.num_classes) 46 | 47 | def forward(self, input_ids, label, attention_mask): 48 | embed = self.embedding(input_ids) # [batch_size, seq_len, embeding]=[64, 32, 64] 49 | out, _ = self.lstm(embed) 50 | out = torch.cat((embed, out), 2) 51 | out = F.relu(out) 52 | out = out.permute(0, 2, 1) 53 | out_squ = self.maxpool(out).squeeze() 54 | out = self.fc(out_squ) 55 | return [out, out_squ] 56 | -------------------------------------------------------------------------------- /module/models/TextRNN.py: 
--------------------------------------------------------------------------------
 1 | # coding: UTF-8
 2 | import os
 3 | import torch
 4 | import torch.nn as nn
 5 | import numpy as np
 6 | import pickle as pkl
 7 | 
 8 | 
 9 | class Config(object):
10 | 
11 |     """配置参数"""
12 |     def __init__(self, config):
13 |         self.model_name = 'TextRNN'
14 |         path_class = os.path.join(config.path_datasets, 'class.txt')
15 |         self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()]  # 类别名单
16 |         self.num_classes = len(self.class_list)  # 类别数
17 |         # embedding config
18 |         file_embedding = 'random'
19 |         path_embedding = os.path.join(config.path_datasets, file_embedding)
20 |         self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None  # 预训练词向量
21 |         self.embed = self.embedding_pretrained.size(1)\
22 |             if self.embedding_pretrained is not None else 300  # 字向量维度
23 |         # vocab
24 |         path_vocab = os.path.join(config.path_datasets, 'vocab.pkl')
25 |         token2index = pkl.load(open(path_vocab, 'rb'))
26 |         self.n_vocab = len(token2index.keys())
27 |         # model config
28 |         self.dropout = 0.5  # 随机失活
29 |         self.hidden_size = 128  # lstm隐藏层
30 |         self.num_layers = 2  # lstm层数
31 | 
32 | 
33 | class TextRNN(nn.Module):
34 |     def __init__(self, config):
35 |         super(TextRNN, self).__init__()
36 |         if config.embedding_pretrained is not None:
37 |             self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
38 |         else:
39 |             self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
40 |         self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
41 |                             bidirectional=True, batch_first=True, dropout=config.dropout)
42 |         self.fc = nn.Linear(config.hidden_size * 2, config.num_classes)
43 | 
44 |     def forward(self, input_ids, label, attention_mask):
45 |         out = self.embedding(input_ids)  # [batch_size, seq_len, embedding]=[128, 32, 300]
46 |         out_lstm, _ = self.lstm(out)
47 |         out = self.fc(out_lstm[:, -1, :])  # 句子最后时刻的 hidden state
48 |         return [out, out_lstm]
49 | 
--------------------------------------------------------------------------------
/module/models/Transformer.py:
--------------------------------------------------------------------------------
 1 | # coding: UTF-8
 2 | import os
 3 | import math
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | import numpy as np
 8 | import torch
 9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from torch.nn import TransformerEncoder, TransformerEncoderLayer
12 | from torch.nn.modules import dropout
13 | import pickle as pkl
14 | 
15 | 
16 | class Config(object):
17 | 
18 |     """配置参数"""
19 |     def __init__(self, config):
20 |         self.model_name = 'Transformer_base'
21 |         path_class = os.path.join(config.path_datasets, 'class.txt')
22 |         self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()]  # 类别名单
23 |         self.num_classes = len(self.class_list)  # 类别数
24 |         # embedding config
25 |         file_embedding = 'random'
26 |         path_embedding = os.path.join(config.path_datasets, file_embedding)
27 |         self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None  # 预训练词向量
28 |         self.embed = self.embedding_pretrained.size(1)\
29 |             if self.embedding_pretrained is not None else 768  # 字向量维度
30 |         # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')  # 设备
31 |         # vocab
32 |         path_vocab = 
os.path.join(config.path_datasets, 'vocab.pkl') 33 | toekn2index = pkl.load(open(path_vocab, 'rb')) 34 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 35 | # model config 36 | self.dropout = 0.3 # 随机失活 37 | self.nhead = 12 38 | self.hidden_size = 3072 39 | self.nlayer = 6 40 | self.sen_length = config.sen_max_length 41 | 42 | 43 | 44 | class Transformer(nn.Module): 45 | def __init__(self, config): 46 | super(Transformer, self).__init__() 47 | self.c = config 48 | if config.embedding_pretrained is not None: 49 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 50 | else: 51 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 52 | # transformer 53 | encoder_layer = nn.TransformerEncoderLayer(d_model=config.embed, nhead=config.nhead, dim_feedforward=config.hidden_size) 54 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.nlayer) 55 | self.pos_encoder = PositionalEncoding(d_model=config.embed, max_len=config.sen_length) 56 | self.dropout = nn.Dropout(config.dropout) 57 | self.fc = nn.Linear(config.sen_length, config.num_classes) 58 | 59 | def forward(self, input_ids, label, attention_mask): 60 | out = self.embedding(input_ids) # (batch_size, seq_len, emb_size) 61 | out = out.transpose(0,1) # (seq_len, batch_size, emb_size) 62 | out = self.pos_encoder(out) # (seq_len, batch_size, emb_size) 63 | out = self.transformer_encoder(out) # (seq_len, batch_size, emb_size) 64 | out = out.transpose(0,1) # (batch_size, seq_len, emb_size) 65 | out_pool = F.max_pool1d(out, out.size(2)).squeeze(2) 66 | out = self.fc(out_pool) 67 | return [out,out_pool] 68 | 69 | 70 | class PositionalEncoding(nn.Module): 71 | 72 | def __init__(self, d_model, dropout=0.1, max_len=5000): 73 | super(PositionalEncoding, self).__init__() 74 | self.dropout = nn.Dropout(p=dropout) 75 | 76 | pe = torch.zeros(max_len, d_model) 77 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 78 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 79 | pe[:, 0::2] = torch.sin(position * div_term) 80 | pe[:, 1::2] = torch.cos(position * div_term) 81 | pe = pe.unsqueeze(0).transpose(0, 1) 82 | self.register_buffer('pe', pe) 83 | 84 | def forward(self, x): 85 | x = x + self.pe[:x.size(0), :] 86 | return self.dropout(x) 87 | 88 | -------------------------------------------------------------------------------- /module/models/XLNet.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import XLNetPreTrainedModel, XLNetModel, AutoModel 5 | 6 | 7 | class XLNet(XLNetPreTrainedModel): 8 | 9 | def __init__(self, config): 10 | super(XLNet, self).__init__(config) 11 | self.xlnet = AutoModel.from_config(config) 12 | # self.xlnet = XLNetModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 16 | # self.activation = nn.Tanh() 17 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 18 | 19 | def forward(self, input_ids, attention_mask, label=None): 20 | output = self.xlnet(input_ids, attention_mask=attention_mask) 21 | # pooling 22 | first_token_tensor = output.last_hidden_state[:, 0] 23 | pooler_output = self.dense(first_token_tensor) 24 | # pooler_output = self.activation(pooler_output) 25 | out = self.fc(pooler_output) 26 | return [out,pooler_output] 27 | 
28 | -------------------------------------------------------------------------------- /module/optimal/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/optimal/.DS_Store -------------------------------------------------------------------------------- /module/optimal/adversarial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class FGM(): 4 | ''' 5 | Example 6 | # 初始化 7 | fgm = FGM(model,epsilon=1,emb_name='word_embeddings.') 8 | for batch_input, batch_label in data: 9 | # 正常训练 10 | loss = model(batch_input, batch_label) 11 | loss.backward() # 反向传播,得到正常的grad 12 | # 对抗训练 13 | fgm.attack() # 在embedding上添加对抗扰动 14 | loss_adv = model(batch_input, batch_label) 15 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 16 | fgm.restore() # 恢复embedding参数 17 | # 梯度下降,更新参数 18 | optimizer.step() 19 | model.zero_grad() 20 | ''' 21 | def __init__(self, model,emb_name,epsilon=1.0): 22 | # emb_name这个参数要换成你模型中embedding的参数名 23 | self.model = model 24 | self.epsilon = epsilon 25 | self.emb_name = emb_name 26 | self.backup = {} 27 | 28 | def attack(self): 29 | for name, param in self.model.named_parameters(): 30 | if param.requires_grad and self.emb_name in name: 31 | self.backup[name] = param.data.clone() 32 | norm = torch.norm(param.grad) 33 | if norm!=0 and not torch.isnan(norm): 34 | r_at = self.epsilon * param.grad / norm 35 | param.data.add_(r_at) 36 | 37 | def restore(self): 38 | for name, param in self.model.named_parameters(): 39 | if param.requires_grad and self.emb_name in name: 40 | assert name in self.backup 41 | param.data = self.backup[name] 42 | self.backup = {} 43 | 44 | class PGD(): 45 | ''' 46 | Example 47 | pgd = PGD(model,emb_name='word_embeddings.',epsilon=1.0,alpha=0.3) 48 | K = 3 49 | for batch_input, batch_label in data: 50 | # 正常训练 51 | loss = model(batch_input, batch_label) 52 | loss.backward() # 反向传播,得到正常的grad 53 | pgd.backup_grad() 54 | # 对抗训练 55 | for t in range(K): 56 | pgd.attack(is_first_attack=(t==0)) # 在embedding上添加对抗扰动, first attack时备份param.data 57 | if t != K-1: 58 | model.zero_grad() 59 | else: 60 | pgd.restore_grad() 61 | loss_adv = model(batch_input, batch_label) 62 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 63 | pgd.restore() # 恢复embedding参数 64 | # 梯度下降,更新参数 65 | optimizer.step() 66 | model.zero_grad() 67 | ''' 68 | def __init__(self, model,emb_name,epsilon=1.,alpha=0.3): 69 | # emb_name这个参数要换成你模型中embedding的参数名 70 | self.model = model 71 | self.emb_name = emb_name 72 | self.epsilon = epsilon 73 | self.alpha = alpha 74 | self.emb_backup = {} 75 | self.grad_backup = {} 76 | 77 | def attack(self,is_first_attack=False): 78 | for name, param in self.model.named_parameters(): 79 | if param.requires_grad and self.emb_name in name: 80 | if is_first_attack: 81 | self.emb_backup[name] = param.data.clone() 82 | norm = torch.norm(param.grad) 83 | if norm != 0: 84 | r_at = self.alpha * param.grad / norm 85 | param.data.add_(r_at) 86 | param.data = self.project(name, param.data, self.epsilon) 87 | 88 | def restore(self): 89 | for name, param in self.model.named_parameters(): 90 | if param.requires_grad and self.emb_name in name: 91 | assert name in self.emb_backup 92 | param.data = self.emb_backup[name] 93 | self.emb_backup = {} 94 | 95 | def project(self, param_name, param_data, epsilon): 96 | r = param_data - self.emb_backup[param_name] 97 | if torch.norm(r) > 
epsilon: 98 | r = epsilon * r / torch.norm(r) 99 | return self.emb_backup[param_name] + r 100 | 101 | def backup_grad(self): 102 | for name, param in self.model.named_parameters(): 103 | if param.requires_grad: 104 | if param.grad is None: 105 | self.grad_backup[name] = None 106 | else: 107 | self.grad_backup[name] = param.grad.clone() 108 | def restore_grad(self): 109 | for name, param in self.model.named_parameters(): 110 | if param.requires_grad: 111 | param.grad = self.grad_backup[name] -------------------------------------------------------------------------------- /module/tokenizer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/tokenizer/.DS_Store -------------------------------------------------------------------------------- /module/tokenizer/LMTextTokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import pickle as pkl 4 | 5 | 6 | 7 | class LMTextTokenizer(object): 8 | 9 | def __init__(self, tokenizer): 10 | 11 | self.tokenizer = tokenizer 12 | self.cls_token_id = tokenizer.cls_token_id 13 | self.pad_token_id = tokenizer.pad_token_id 14 | self.sep_token_id = tokenizer.sep_token_id 15 | self.unk_token_id = tokenizer.unk_token_id 16 | # self.convert_tokens_to_ids = '' 17 | self.load() 18 | 19 | 20 | 21 | def load(self): 22 | """ 23 | 读取分词器 24 | """ 25 | self.token2index = self.tokenizer.vocab 26 | self.index2token = { i:x for x,i in self.token2index.items()} 27 | 28 | 29 | def tokenizer(self, text): 30 | """ 31 | 分词,按字分词 32 | """ 33 | token = self.tokenizer(text, return_tensors="pt") 34 | return token 35 | 36 | 37 | def get_special_tokens(self): 38 | """ 39 | 获取特殊字符 40 | """ 41 | target_ids = [self.cls_token_id, self.pad_token_id, self.sep_token_id, self.unk_token_id] 42 | target = [self.index2token.get(x, '') for x in target_ids] 43 | target = [ x for x in target if x] 44 | return target 45 | 46 | -------------------------------------------------------------------------------- /module/tokenizer/TextTokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import pickle as pkl 4 | from utils.IOOption import open_file, write_file 5 | 6 | 7 | 8 | class TextTokenizer(object): 9 | 10 | def __init__(self): 11 | 12 | self.cls_token = '[CLS]' 13 | self.pad_token = '[PAD]' 14 | self.sep_token = '[SEP]' 15 | self.unk_token = '[UNK]' 16 | self.convert_tokens_to_ids = '' 17 | 18 | 19 | def load(self, path): 20 | """ 21 | 读取分词器 22 | """ 23 | self.token2index = pkl.load(open(path, 'rb')) 24 | self.index2token = { i:x for x,i in self.token2index.items()} 25 | self.cls_token_id = self.token2index.get(self.cls_token) 26 | self.pad_token_id = self.token2index.get(self.pad_token) 27 | self.sep_token_id = self.token2index.get(self.sep_token) 28 | self.unk_token_id = self.token2index.get(self.unk_token) 29 | 30 | 31 | def create(self, corpus): 32 | """ 33 | 创建分词字典,获取训练集词表 34 | """ 35 | # 按字分词 36 | words = [w for line in corpus for w in line if w != ''] 37 | words = list(set(words)) 38 | words = sorted(words, reverse=False) 39 | # 创建索引 40 | token2index = {x:i for i,x in enumerate(words)} 41 | index2token = {i:x for i,x in enumerate(words)} 42 | 43 | # 添加特殊字符 44 | if self.pad_token not in token2index.keys(): 45 | index2token[len(token2index)] = self.pad_token 46 | token2index[self.pad_token] = len(token2index) 47 | if self.unk_token not in 
token2index.keys(): 48 | index2token[len(token2index)] = self.unk_token 49 | token2index[self.unk_token] = len(token2index) 50 | if self.cls_token not in token2index.keys(): 51 | index2token[len(token2index)] = self.cls_token 52 | token2index[self.cls_token] = len(token2index) 53 | if self.sep_token not in token2index.keys(): 54 | index2token[len(token2index)] = self.sep_token 55 | token2index[self.sep_token] = len(token2index) 56 | self.token2index = token2index 57 | self.index2token = index2token 58 | return token2index, index2token 59 | 60 | 61 | def tokenizer(self, text): 62 | """ 63 | 分词,按字分词 64 | """ 65 | tokens = [ x for x in text] 66 | input_ids = [self.token2index.get(x, self.unk_token_id) for x in tokens] 67 | attention_mask = [0]*len(input_ids) 68 | token = { 69 | 'input_ids' : input_ids, 70 | 'attention_mask' : attention_mask 71 | } 72 | return token 73 | 74 | 75 | def get_special_tokens(self): 76 | """ 77 | 获取特殊字符 78 | """ 79 | target_ids = [self.cls_token_id, self.pad_token_id, self.sep_token_id, self.unk_token_id] 80 | target = [self.index2token.get(x, '') for x in target_ids] 81 | target = [ x for x in target if x] 82 | return target -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apex==0.1 2 | numpy==1.19.2 3 | pandas==1.1.5 4 | scikit_learn==1.0.2 5 | torch==1.8.0 6 | tqdm==4.62.3 7 | transformers==4.15.0 8 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/run.sh -------------------------------------------------------------------------------- /utils/IOOption.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def open_file(path, sep=' '): 6 | """读取文件""" 7 | src = [] 8 | tgt = [] 9 | with open(path, 'r', encoding='utf8') as f: 10 | for i, line in enumerate(f.readlines()): # 11 | line = line.strip().split(sep) 12 | tmp_src = str(line[0]) 13 | tmp_tgt = str(line[1]) 14 | # 若文本和标签都非空 15 | if tmp_src and tmp_tgt: 16 | src.append(tmp_src) 17 | tgt.append(tmp_tgt) 18 | return src, tgt 19 | 20 | 21 | 22 | def write_file(word2index, path): 23 | """写文件""" 24 | with open(path, 'w', encoding='utf8') as f: 25 | for k,v in word2index.items(): 26 | string = k + ' ' + str(v) + '\n' 27 | f.write(string) 28 | 29 | 30 | def write_text(text, path): 31 | """写文件""" 32 | with open(path, 'w', encoding='utf8') as f: 33 | for x in text: 34 | string = str(x) + '\n' 35 | f.write(string) 36 | 37 | 38 | -------------------------------------------------------------------------------- /utils/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class ProgressBar(object): 5 | ''' 6 | custom progress bar(进度条) 7 | Example: 8 | >>> pbar = ProgressBar(n_total=30,desc='training') 9 | >>> step = 2 10 | >>> pbar(step=step) 11 | ''' 12 | def __init__(self, n_total,width=30,desc = 'Training'): 13 | self.width = width 14 | self.n_total = n_total 15 | self.start_time = time.time() 16 | self.desc = desc 17 | 18 | def __call__(self, step, info={}): 19 | now = time.time() 20 | current = step + 1 21 | recv_per = current / self.n_total 22 | bar = f'[{self.desc}] {current}/{self.n_total} [' 23 | if recv_per >= 1: 24 | recv_per = 1 25 | 
prog_width = int(self.width * recv_per) 26 | if prog_width > 0: 27 | bar += '=' * (prog_width - 1) 28 | if current< self.n_total: 29 | bar += ">" 30 | else: 31 | bar += '=' 32 | bar += '.' * (self.width - prog_width) 33 | bar += ']' 34 | show_bar = f"\r{bar}" 35 | time_per_unit = (now - self.start_time) / current 36 | if current < self.n_total: 37 | eta = time_per_unit * (self.n_total - current) 38 | if eta > 3600: 39 | eta_format = ('%d:%02d:%02d' % 40 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 41 | elif eta > 60: 42 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 43 | else: 44 | eta_format = '%ds' % eta 45 | time_info = f' - ETA: {eta_format}' 46 | else: 47 | if time_per_unit >= 1: 48 | time_info = f' {time_per_unit:.1f}s/step' 49 | elif time_per_unit >= 1e-3: 50 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 51 | else: 52 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 53 | 54 | show_bar += time_info 55 | if len(info) != 0: 56 | show_info = f'{show_bar} ' + \ 57 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 58 | print(show_info, end='') 59 | else: 60 | print(show_bar, end='') 61 | --------------------------------------------------------------------------------
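
For readers who want to try the char-level tokenizer from `module/tokenizer/TextTokenizer.py` on its own, here is a minimal usage sketch. The corpus, vocab path, and sample sentence below are hypothetical placeholders; how the project itself builds and loads the vocab is handled elsewhere and is not shown in this file dump.
```python
# Illustrative sketch only -- corpus, path and sample text are made up.
import pickle as pkl
from module.tokenizer.TextTokenizer import TextTokenizer

corpus = ['今天股市大涨', '新款手机发布']               # hypothetical training sentences
tokenizer = TextTokenizer()
token2index, index2token = tokenizer.create(corpus)     # char-level vocab plus [PAD]/[UNK]/[CLS]/[SEP]
pkl.dump(token2index, open('vocab.pkl', 'wb'))          # persist so that load() can pick it up
tokenizer.load('vocab.pkl')                             # sets the ids of the special tokens
encoded = tokenizer.tokenizer('股市大涨')                # {'input_ids': [...], 'attention_mask': [...]}
print(encoded['input_ids'])
```
Note that `tokenizer()` in this file fills `attention_mask` with zeros; the non-pretrained models above ignore the mask, and any real masking is presumably handled by the rest of the pipeline.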