├── Config.py
├── README.md
├── datasets
│   ├── .DS_Store
│   └── THUCNews
│       ├── .DS_Store
│       ├── class.txt
│       ├── dev.txt
│       ├── test.txt
│       └── train.txt
├── main.py
├── module
│   ├── DataManager.py
│   ├── LossManager.py
│   ├── ModelMap.py
│   ├── Predictor.py
│   ├── Trainer.py
│   ├── loss
│   │   ├── .DS_Store
│   │   ├── dice_loss.py
│   │   ├── focal_loss.py
│   │   ├── infonce_loss.py
│   │   ├── kl_loss.py
│   │   └── label_smoothing.py
│   ├── models
│   │   ├── .DS_Store
│   │   ├── Albert.py
│   │   ├── Bert.py
│   │   ├── Distilbert.py
│   │   ├── Electra.py
│   │   ├── FastText.py
│   │   ├── Roberta.py
│   │   ├── TextCNN.py
│   │   ├── TextRCNN.py
│   │   ├── TextRNN.py
│   │   ├── Transformer.py
│   │   └── XLNet.py
│   ├── optimal
│   │   ├── .DS_Store
│   │   └── adversarial.py
│   └── tokenizer
│       ├── .DS_Store
│       ├── LMTextTokenizer.py
│       └── TextTokenizer.py
├── requirements.txt
├── run.sh
└── utils
    ├── IOOption.py
    └── progressbar.py

/Config.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import random
4 | 
5 | from module.models.Transformer import Transformer
6 | 
7 | 
8 | 
9 | 
10 | class Config(object):
11 | 
12 |     # 运行模式
13 |     mode = 'train'
14 | 
15 |     # GPU配置
16 |     cuda_visible_devices = '0'                      # 可见的GPU
17 |     device = 'cuda:0'                               # master GPU
18 |     port = str(random.randint(10000,60000))         # 多卡训练进程间通讯端口
19 |     init_method = 'tcp://localhost:' + port         # 多卡训练的通讯地址
20 |     world_size = 1                                  # 进程数,默认为1
21 | 
22 |     # 模型选型
23 |     # 基础模型:FastText/TextCNN/TextRNN/TextRCNN/Transformer
24 |     # 语言模型:Bert/Albert/Roberta/Distilbert/Electra/XLNet
25 |     model_name = 'Bert'
26 |     initial_pretrain_model = 'bert-base-chinese'        # 加载的预训练模型checkpoint
27 |     initial_pretrain_tokenizer = 'bert-base-chinese'    # 加载的预训练分词器checkpoint
28 |     lm_model_list = ['Bert','Albert','Roberta','Distilbert','Electra','XLNet']
29 | 
30 |     # 训练配置
31 |     num_epochs = 30                                 # 迭代次数
32 |     batch_size = 128                                # 每个批次的大小
33 |     learning_rate = 2e-5                            # 学习率
34 |     num_warmup_steps = 0.1                          # warm up步数占总步数的比例
35 |     sen_max_length = 32                             # 句子最长长度
36 |     padding = True                                  # 是否对输入进行padding
37 |     step_save = 1000                                # 多少步保存一次模型
38 |     loss_type = 'ce'
39 | 
40 |     # 对比学习
41 |     cl_option = True                                # 是否使用对比学习
42 |     cl_method = 'Rdrop'                             # Rdrop/InfoNCE
43 |     cl_loss_weight = 0.5                            # 对比学习loss比例
44 |     # 对抗训练
45 |     adv_option = 'None'                             # 是否引入对抗训练:None/FGM/PGD
46 |     adv_name = 'word_embeddings'
47 |     adv_epsilon = 1.0
48 |     # 混合精度训练
49 |     fp16 = False
50 |     fp16_opt_level = 'O1'                           # 训练可选'O1',测试可选'O3'
51 | 
52 |     # 模型及路径配置
53 |     path_root = os.getcwd()
54 |     path_model_save = os.path.join(path_root, 'checkpoints/')      # 模型保存路径
55 |     path_datasets = os.path.join(path_root, 'datasets/THUCNews')   # 数据集
56 |     path_log = os.path.join(path_root, 'logs')
57 |     path_output = os.path.join(path_datasets, 'outputs')
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text_Classifier_Pytorch
2 | 
3 | ## Info
4 | A text classification framework based on PyTorch.
5 | 
6 | Supports text classification on both Chinese and English datasets.
7 | 
8 | 
9 | ## Model
10 | - Non-pretrained models:
11 |     - FastText
12 |     - TextCNN
13 |     - TextRNN
14 |     - TextRCNN
15 |     - Transformer
16 | - Pretrained language models:
17 |     - Bert
18 |     - Albert
19 |     - Roberta
20 |     - Distilbert
21 |     - Electra
22 |     - XLNet
23 | 
24 | 
25 | ## Training Mode Support
26 | 
27 | - Supports training on both Chinese and English corpora
28 | - Supports Chinese and English text classification tasks
29 | - Multiple models to choose from
30 |     - The variable `model_name` in `Config.py` selects the model; change it to the name of the model you want to load (see the example snippet after this list).
31 |     - For pretrained models such as Bert, also update the variables `initial_pretrain_model` and `initial_pretrain_tokenizer` to the pretrained checkpoint you want to load.
32 | - Mixed-precision training
33 |     - Speeds up the training process and shortens training time.
34 |     - Set the variable `fp16` in `Config.py` to `True`.
35 | - Multi-GPU training
36 |     - Distributed training; supports single-GPU and multi-GPU training on a single machine.
37 |     - The variable `cuda_visible_devices` in `Config.py` sets the visible GPU ids; separate multiple ids with `,`.
38 | - Adversarial training
39 |     - Adds perturbations at the embedding layer so the model learns to resist them; can improve performance at the cost of extra training time.
40 |     - The variable `adv_option` in `Config.py` selects the adversarial training mode; FGM and PGD are currently supported.
41 | - Contrastive learning
42 |     - Strengthens the model's semantic feature extraction, borrowing ideas from R-Drop and SimCSE; KL loss and InfoNCE loss are currently supported.
43 |     - Set the variable `cl_option` in `Config.py` to `True` to enable contrastive learning; `cl_method` selects how the contrastive loss is computed.
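For example, switching from the default Bert setup to another pretrained model only requires editing a few fields in `Config.py`. The values below are illustrative (the checkpoint name is taken from the Experiments table); any compatible checkpoint from the Hugging Face hub should work:

```python
# Config.py -- illustrative values, adjust to your own setup
model_name = 'Roberta'                                        # any name from the Model list above
initial_pretrain_model = 'hfl/chinese-roberta-wwm-ext'        # pretrained model checkpoint
initial_pretrain_tokenizer = 'hfl/chinese-roberta-wwm-ext'    # pretrained tokenizer checkpoint

# model_name = 'TextCNN'    # non-pretrained models ignore the two checkpoint fields above
```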
44 | 
45 | 
46 | 
47 | ## Datasets
48 | * **THUCNews**
49 |     * Source: https://github.com/649453932/Chinese-Text-Classification-Pytorch
50 |     * News data drawn from THUCNews.
51 |     * The data covers 10 label classes: finance, realty, stocks, education, science, society, politics, sports, game, entertainment
52 | 
53 | * **Adding your own dataset**
54 |     * Follow the same processing convention as this project: split the data into three parts, train/dev/test, with the text and the label separated by a tab (`\t`).
55 |     * Create a new folder under the ./datasets directory and place the three data files (plus a `class.txt` listing the label names) in it; see the layout sketch at the end of this section.
56 | 
57 | * **Dataset example**
58 |     * Taking THUCNews as an example, each line stores the text and the label id separated by a tab (`\t`), in the following form:
59 | ```
60 | 午评沪指涨0.78%逼近2800 汽车家电农业领涨 2
61 | 卡佩罗:告诉你德国脚生猛的原因 不希望英德战踢点球 7
62 | ```
63 | 
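A minimal layout for a custom dataset might look as follows; the folder name `MyDataset` is a placeholder, and `path_datasets` in `Config.py` then needs to point at this folder. The label ids used in the data files correspond to the (0-based) line positions in `class.txt`:

```
datasets/
└── MyDataset/            # placeholder name; set path_datasets in Config.py accordingly
    ├── class.txt         # one label name per line; line order defines the label ids
    ├── train.txt         # one sample per line: text \t label_id
    ├── dev.txt           # validation split, same format
    └── test.txt          # test split, same format
```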
64 | 
65 | ## Experiments
66 | 
67 | Note: the pretrained models are loaded through the transformers library; to swap in other pretrained weights, see the [transformers model hub](https://huggingface.co/models).
68 | 
69 | | Model | MicroF1 | LearningRate | Pretrained weights |
70 | | :-----| :---- | :---- | :---- |
71 | | FastText | 0.8926 | 1e-3 | - |
72 | | TextCNN | 0.9009 | 1e-3 | - |
73 | | TextRNN | 0.9080 | 1e-3 | - |
74 | | TextRCNN | 0.9142 | 1e-3 | - |
75 | | Transformer (2 layer) | 0.8849 | 1e-3 | - |
76 | | Albert | 0.9124 | 2e-5 | [voidful/albert_chinese_tiny](https://huggingface.co/voidful/albert_chinese_tiny) |
77 | | Distilbert | 0.9209 | 2e-5 | [Geotrend/distilbert-base-zh-cased](https://huggingface.co/Geotrend/distilbert-base-zh-cased) |
78 | | Bert | 0.9401 | 2e-5 | [bert-base-chinese](https://huggingface.co/bert-base-chinese) |
79 | | Roberta | 0.9448 | 2e-5 | [hfl/chinese-roberta-wwm-ext](https://huggingface.co/hfl/chinese-roberta-wwm-ext) |
80 | | Electra | 0.9377 | 2e-5 | [hfl/chinese-electra-base-discriminator](https://huggingface.co/hfl/chinese-electra-base-discriminator) |
81 | | XLNet | 0.9051 | 2e-5 | no pretrained initialization |
82 | 
83 | 
84 | 
85 | 
86 | 
87 | ## Requirement
88 | Python 3.6.x is used; the other dependencies are as follows:
89 | ```
90 | numpy==1.19.2
91 | pandas==1.1.5
92 | scikit_learn==1.0.2
93 | torch==1.8.0
94 | tqdm==4.62.3
95 | transformers==4.15.0
96 | apex==0.1
97 | ```
98 | 
99 | Except for `apex`, which needs to be installed separately (see the official repo: https://github.com/NVIDIA/apex
100 | ), the other dependencies can be installed with:
101 | ```
102 | pip install -r requirements.txt
103 | ```
104 | 
105 | 
106 | ## Get Started
107 | ### 1. Training
108 | Once the training data is ready, run the following command in a terminal:
109 | ```
110 | python3 main.py
111 | ```
112 | ### 2. Evaluation
113 | Loads the trained model, runs it on the test set, and writes the predictions to ./datasets/${your_dataset}/outputs/pred_data.csv.
114 | 
115 | Set the variable `mode = 'test'` in `Config.py` and save the file.
116 | 
117 | Then run:
118 | ```
119 | python3 main.py
120 | ```
121 | 
122 | 
123 | ## Reference
124 | 
125 | [Github:transformers] https://github.com/huggingface/transformers
126 | 
127 | [Paper:Bert] https://arxiv.org/abs/1810.04805
128 | 
129 | [Paper:RDrop] https://arxiv.org/abs/2106.14448
130 | 
131 | [Paper:SimCSE] https://arxiv.org/abs/2104.08821
--------------------------------------------------------------------------------
/datasets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/datasets/.DS_Store
--------------------------------------------------------------------------------
/datasets/THUCNews/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/datasets/THUCNews/.DS_Store
--------------------------------------------------------------------------------
/datasets/THUCNews/class.txt:
--------------------------------------------------------------------------------
1 | finance
2 | realty
3 | stocks
4 | education
5 | science
6 | society
7 | politics
8 | sports
9 | game
10 | entertainment
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import os
4 | import time
5 | import numpy as np
6 | import torch
7 | import logging
8 | from Config import Config
9 | from module.DataManager import DataManager
10 | from module.Trainer import Trainer
11 | from module.Predictor import Predictor
12 | 
13 | 
14 | 
15 | if __name__ == '__main__':
16 | 
17 | 
18 |     config = Config()
19 |     os.environ["CUDA_VISIBLE_DEVICES"] = config.cuda_visible_devices
20 | 
21 |     # 设置随机种子,保证每次结果一样
22 |     np.random.seed(1)
23 |     torch.manual_seed(1)
24 |     torch.cuda.manual_seed_all(1)
25 |     torch.backends.cudnn.deterministic = True
26 |     start_time = time.time()
27 | 
28 |     # 数据处理
29 |     print('read data...')
30 |     dm = DataManager(config)
31 | 
32 |     # 模式
33 |     if config.mode == 'train':
34 |         # 获取数据
35 |         print('data process...')
36 |         train_loader = dm.get_dataset(data_type='train')
37 |         valid_loader = dm.get_dataset(data_type='dev')
38 |         test_loader = dm.get_dataset(data_type='test')
39 |         # 训练
40 |         trainer = Trainer(config, train_loader, valid_loader, test_loader)
41 |         trainer.train()
42 |     elif config.mode == 'test':
43 |         # 测试
44 |         test_loader = dm.get_dataset(data_type='test')
45 |         predictor = Predictor(config)
46 |         predictor.predict(test_loader)
47 |     else:
48 |         print("No task to run!")
49 |         print("You can set the variable 'mode' in Config.py to one of: 
['train', 'test', 'valid'] !") 50 | -------------------------------------------------------------------------------- /module/DataManager.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import os 4 | import random 5 | import math 6 | import numpy as np 7 | import pandas as pd 8 | import pickle as pkl 9 | import torch 10 | # from tqdm.auto import tqdm 11 | from datasets import Dataset, load_dataset, load_metric 12 | from torch.utils.data import DataLoader 13 | from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer 14 | from torch.utils.data import DataLoader, TensorDataset, RandomSampler 15 | from torch.utils.data.distributed import DistributedSampler 16 | from utils.IOOption import open_file, write_text, write_file 17 | 18 | from module.ModelMap import map_tokenizer 19 | from module.tokenizer.TextTokenizer import TextTokenizer 20 | from module.tokenizer.LMTextTokenizer import LMTextTokenizer 21 | 22 | 23 | 24 | 25 | 26 | class DataManager(object): 27 | 28 | def __init__(self, config): 29 | 30 | self.config = config 31 | self.init_gpu_config() # 初始化GPU配置 32 | self.load_label() # 读取标签 33 | self.load_tokenizer() # 读取tokenizer分词模型 34 | 35 | 36 | def init_gpu_config(self): 37 | """ 38 | 初始化GPU并行配置 39 | """ 40 | print('loading GPU config ...') 41 | if self.config.mode == 'train' and torch.cuda.device_count() > 1: 42 | torch.distributed.init_process_group(backend='nccl', 43 | init_method=self.config.init_method, 44 | rank=0, 45 | world_size=self.config.world_size) 46 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 47 | 48 | 49 | def load_label(self): 50 | """ 51 | 读取标签 52 | """ 53 | print('loading tag file ...') 54 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 55 | label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 56 | self.label2ids = {x:i for i,x in enumerate(label)} 57 | self.ids2label = {i:x for i,x in enumerate(label)} 58 | 59 | 60 | def load_tokenizer(self): 61 | """ 62 | 读取分词器 63 | """ 64 | print('loading tokenizer config ...') 65 | tokenizer = map_tokenizer(self.config.model_name) 66 | if not tokenizer: 67 | print('toknizer {} is null, please check your model name.'.format(self.config.model_name)) 68 | 69 | if self.config.model_name not in self.config.lm_model_list: 70 | path_token = os.path.join(self.config.path_datasets, 'vocab.pkl') 71 | self.tokenizer = tokenizer() 72 | # 若存在词表,则直接读取 73 | if os.path.exists(path_token): 74 | self.tokenizer.load(path_token) 75 | else: 76 | # 否则读取训练数据,并创建词表 77 | path_corpus = os.path.join(self.config.path_datasets, 'train.txt') 78 | corpus, _ = open_file(path_corpus, sep='\t') 79 | token2index, _ = self.tokenizer.create(corpus) 80 | # 标签映射表存到本地 81 | write_file(token2index, path_token + '.txt') 82 | pkl.dump(token2index, open(path_token, 'wb')) 83 | self.tokenizer.load(path_token) 84 | else: 85 | tokenizer = tokenizer.from_pretrained(self.config.initial_pretrain_tokenizer) 86 | self.tokenizer = LMTextTokenizer(tokenizer) 87 | print('Vocab size: {}'.format(len(self.tokenizer.token2index))) 88 | 89 | 90 | def get_dataset(self, data_type='train'): 91 | """ 92 | 获取数据集 93 | """ 94 | file = '{}.txt'.format(data_type) 95 | dataloader = self.data_process(file) 96 | return dataloader 97 | 98 | 99 | def data_process(self, file_name): 100 | """ 101 | 数据转换 102 | """ 103 | # 获取数据 104 | path = os.path.join(self.config.path_datasets, 
file_name) 105 | src, tgt = open_file(path, sep='\t') 106 | dataset = pd.DataFrame({'src':src, 'label':tgt}) 107 | # dataset.to_csv('./data/cache.csv', sep='\t', index=False) 108 | # dataframe to datasets 109 | raw_datasets = Dataset.from_pandas(dataset) 110 | # tokenizer. 111 | tokenized_datasets = raw_datasets.map(lambda x: self.tokenize_function(x), batched=True) # 对于样本中每条数据进行数据转换 112 | # data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) # 对数据进行padding 113 | tokenized_datasets = tokenized_datasets.remove_columns(["src"]) # 移除不需要的字段 114 | tokenized_datasets.set_format("torch", columns=["input_ids","attention_mask","label"]) 115 | # 转换成DataLoader类 116 | sampler = RandomSampler(tokenized_datasets) if not torch.cuda.device_count() > 1 else DistributedSampler(tokenized_datasets) 117 | dataloader = DataLoader(tokenized_datasets, sampler=sampler, batch_size=self.config.batch_size) #, collate_fn=data_collator 118 | 119 | return dataloader 120 | 121 | 122 | def tokenize_function(self, example): 123 | """ 124 | 数据转换 125 | """ 126 | # 分词 127 | token = {} 128 | # src = [self.tokenizer.convert_tokens_to_ids(x) for x in example["src"]] 129 | src_origin = [self.tokenizer.tokenizer(x) for x in example["src"]] 130 | src = [ x['input_ids'] for x in src_origin ] 131 | attention_mask = [ x['attention_mask'] for x in src_origin ] 132 | # paddding 133 | src = [self.padding(x) for x in src] 134 | attention_mask = [self.padding_attention(x) for x in attention_mask] 135 | label = [ int(x) for x in example["label"]] 136 | # 添加标签到样本中 137 | token = { 138 | 'input_ids':src, 139 | 'attention_mask':attention_mask, 140 | 'label':label 141 | } 142 | return token 143 | 144 | 145 | 146 | def padding(self, src): 147 | """ 148 | padding 149 | """ 150 | # 裁剪 151 | if len(src) > self.config.sen_max_length: 152 | src = src[:self.config.sen_max_length] 153 | # padding 154 | pad_size = self.config.sen_max_length-len(src) # 待padding的长度 155 | # 添加cls/pad/sep特殊字符 156 | # src = [self.tokenizer.cls_token_id] + src + [self.tokenizer.sep_token_id] + [self.tokenizer.pad_token_id]*pad_size 157 | src = src + [self.tokenizer.pad_token_id]*pad_size 158 | assert len(src) == self.config.sen_max_length, 'input no equal {}'.format(self.config.sen_max_length) 159 | return src 160 | 161 | 162 | def padding_attention(self, attention_mask): 163 | """ 164 | padding attention mask 165 | """ 166 | # 裁剪 167 | if len(attention_mask) > self.config.sen_max_length: 168 | attention_mask = attention_mask[:self.config.sen_max_length] 169 | # padding 170 | pad_size = self.config.sen_max_length-len(attention_mask) # 待padding的长度 171 | # 添加cls/pad/sep特殊字符 172 | attention_mask = attention_mask + [0]*pad_size 173 | assert len(attention_mask) == self.config.sen_max_length, 'input no equal {}'.format(self.config.sen_max_length) 174 | return attention_mask 175 | 176 | -------------------------------------------------------------------------------- /module/LossManager.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn import CrossEntropyLoss 3 | from module.loss.focal_loss import FocalLoss 4 | from module.loss.infonce_loss import InfoNCELoss 5 | from module.loss.kl_loss import KLLoss 6 | from module.loss.label_smoothing import LabelSmoothingCrossEntropy 7 | 8 | 9 | class LossManager(object): 10 | 11 | def __init__(self, loss_type, cl_option=False, loss_cl_type='InfoNCE'): 12 | self.loss_type = loss_type 13 | self.cl_option = cl_option 14 | self.loss_cl_type = loss_cl_type 15 | # 判断配置的loss类型 16 | 
if loss_type == 'focalloss': 17 | self.loss_func = FocalLoss() 18 | elif loss_type == 'LabelSmoothingCrossEntropy': 19 | self.loss_func = LabelSmoothingCrossEntropy() 20 | else: 21 | self.loss_func = CrossEntropyLoss() 22 | 23 | if cl_option: 24 | if loss_cl_type == 'Rdrop': 25 | self.loss_cl_func = KLLoss() 26 | else: 27 | self.loss_cl_func = InfoNCELoss() 28 | 29 | 30 | def compute(self, 31 | input_x, 32 | target, 33 | hidden_emb_x=None, 34 | hidden_emb_y=None, 35 | alpha=0.5): 36 | """ 37 | 计算loss 38 | Args: 39 | input: [N, C] 40 | target: [N, ] 41 | """ 42 | if hidden_emb_x is not None and hidden_emb_y is not None: 43 | loss_ce = (1-alpha) * self.loss_func(input_x, target) 44 | weight_etx = 1e+5 if self.loss_cl_type=='Rdrop' else 1 45 | loss_cl = alpha * weight_etx * self.loss_cl_func(hidden_emb_x, hidden_emb_y) 46 | loss = loss_ce + loss_cl 47 | return loss 48 | else: 49 | loss = self.loss_func(input_x, target) 50 | return loss 51 | 52 | 53 | 54 | # def compute(self, input, target): 55 | # """ 56 | # 计算loss 57 | # Args: 58 | # input: [N, C] 59 | # target: [N, ] 60 | # """ 61 | # loss = self.loss_func(input, target) 62 | # return loss 63 | 64 | 65 | # def compute(self, input1, input2, output_pooler1, output_pooler2, target, alpha=0.5): 66 | # """ 67 | # 计算loss 68 | # Args: 69 | # input: [N, C] 70 | # target: [N, ] 71 | # """ 72 | 73 | # loss_ce = alpha * self.loss_func(input1, target) 74 | # loss_nce = (1-alpha) * self.loss_func_nce(output_pooler1, output_pooler2) 75 | # # loss = alpha*loss_ce + (1-alpha)*loss_nce 76 | # loss = loss_ce + loss_nce 77 | # return loss, loss_ce, loss_nce -------------------------------------------------------------------------------- /module/ModelMap.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from module.models.FastText import FastText, Config as FastTextConfig 4 | from module.models.TextCNN import TextCNN, Config as TextCNNConfig 5 | from module.models.TextRNN import TextRNN, Config as TextRNNConfig 6 | from module.models.TextRCNN import TextRCNN, Config as TextRCNNConfig 7 | from module.models.Transformer import Transformer, Config as TransformerConfig 8 | from module.models.Bert import Bert 9 | from module.models.Albert import Albert 10 | from module.models.Roberta import Roberta 11 | from module.models.Distilbert import Distilbert 12 | from module.models.Electra import Electra 13 | from module.models.XLNet import XLNet 14 | from transformers import AutoTokenizer 15 | 16 | 17 | 18 | 19 | 20 | from transformers import AlbertTokenizer, BertTokenizer, DistilBertTokenizer, RobertaTokenizer, ElectraTokenizer, XLNetTokenizer 21 | from module.tokenizer.TextTokenizer import TextTokenizer 22 | 23 | 24 | def map_model(model_name): 25 | """ 26 | 模型映射函数 27 | """ 28 | dic = { 29 | 'FastText' : FastText, 30 | 'TextCNN' : TextCNN, 31 | 'TextRNN' : TextRNN, 32 | 'TextRCNN' : TextRCNN, 33 | 'Transformer' : Transformer, 34 | 'Bert' : Bert, 35 | 'Albert' : Albert, 36 | 'Roberta' : Roberta, 37 | 'Distilbert' : Distilbert, 38 | 'Electra' : Electra, 39 | 'XLNet' : XLNet 40 | } 41 | model = dic.get(model_name, None) 42 | return model 43 | 44 | 45 | def map_tokenizer(model_name): 46 | """ 47 | 分词器映射函数 48 | """ 49 | dic = { 50 | 'FastText' : TextTokenizer, 51 | 'TextCNN' : TextTokenizer, 52 | 'TextRNN' : TextTokenizer, 53 | 'TextRCNN' : TextTokenizer, 54 | 'Transformer' : TextTokenizer, 55 | 'Bert' : BertTokenizer, 56 | 'Albert' : AutoTokenizer, 57 | 'Roberta' : BertTokenizer, 58 | 'Distilbert' : DistilBertTokenizer, 59 | 
'Electra' : AutoTokenizer, 60 | 'XLNet' : AutoTokenizer 61 | } 62 | tokenizer = dic.get(model_name, None) 63 | return tokenizer 64 | 65 | 66 | def map_config(model_name): 67 | """ 68 | 模型配置映射 69 | """ 70 | dic = { 71 | 'FastText' : FastTextConfig, 72 | 'TextCNN' : TextCNNConfig, 73 | 'TextRNN' : TextRNNConfig, 74 | 'TextRCNN' : TextRCNNConfig, 75 | 'Transformer' : TransformerConfig 76 | } 77 | model = dic.get(model_name, None) 78 | return model -------------------------------------------------------------------------------- /module/Predictor.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from posixpath import sep 4 | import time 5 | import random 6 | import logging 7 | import math 8 | import numpy as np 9 | import pandas as pd 10 | import torch 11 | from apex import amp 12 | from tqdm.auto import tqdm 13 | from datasets import Dataset, load_dataset, load_metric 14 | from torch.utils.data import DataLoader 15 | from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler, get_linear_schedule_with_warmup 16 | from transformers import BertTokenizer, BertConfig, AutoConfig 17 | # from model.BertForMaskedLM import BertForMaskedLM 18 | from sklearn import metrics 19 | 20 | 21 | from Config import Config 22 | from utils.progressbar import ProgressBar 23 | from module.ModelMap import map_model, map_tokenizer 24 | from module.tokenizer.LMTextTokenizer import LMTextTokenizer 25 | 26 | 27 | 28 | 29 | class Predictor(object): 30 | 31 | def __init__(self, config): 32 | self.config = config 33 | # self.test_loader = test_loader 34 | self.device = torch.device(self.config.device) 35 | # 加载模型 36 | self.load_label() 37 | self.load_tokenizer() 38 | self.load_model() 39 | 40 | 41 | def load_label(self): 42 | """ 43 | 读取标签 44 | """ 45 | print('loading tag file ...') 46 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 47 | self.label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 48 | self.label2ids = {x:i for i,x in enumerate(self.label)} 49 | self.ids2label = {i:x for i,x in enumerate(self.label)} 50 | 51 | 52 | def load_tokenizer(self): 53 | """ 54 | 读取分词器 55 | """ 56 | print('loading tokenizer config ...') 57 | tokenizer = map_tokenizer(self.config.model_name) 58 | if not tokenizer: 59 | print('toknizer {} is null, please check your model name.'.format(self.config.model_name)) 60 | 61 | if 'Text' in self.config.model_name or 'Transformer' in self.config.model_name: 62 | path_token = os.path.join(self.config.path_datasets, 'vocab.pkl') 63 | self.tokenizer = tokenizer() 64 | # 若存在词表,则直接读取 65 | if os.path.exists(path_token): 66 | self.tokenizer.load(path_token) 67 | else: 68 | # 否则读取训练数据,并创建词表 69 | print('vacob file not exist: {}'.format(path_token)) 70 | else: 71 | tokenizer = tokenizer.from_pretrained(self.config.initial_pretrain_tokenizer) 72 | self.tokenizer = LMTextTokenizer(tokenizer) 73 | 74 | 75 | def load_model(self): 76 | """ 77 | 加载模型及初始化模型参数 78 | """ 79 | print('loading model...%s' %self.config.model_name) 80 | self.model = map_model(self.config.model_name) 81 | if not self.model: 82 | print('model {} is null, please check your model name.'.format(self.config.model_name)) 83 | 84 | if 'Text' in self.config.model_name or 'Transformer' in self.config.model_name: 85 | path_model = os.path.join(self.config.path_model_save, 'step_best/pytorch_model.bin') 86 | if not os.path.exists(path_model): 87 | print('model checkpoint file not exist: {}'.format(path_model)) 88 | return 89 
| self.model.load_state_dict(torch.load(path_model)) 90 | else: 91 | # 模型路径 92 | path_model = os.path.join(self.config.path_model_save, 'step_best/') 93 | if not os.path.exists(path_model): 94 | print('model checkpoint file not exist: {}'.format(path_model)) 95 | return 96 | path_config = os.path.join(path_model, 'config.json') 97 | model_config = AutoConfig.from_pretrained(path_config) #, num_labels=len(self.label) 98 | self.model = self.model.from_pretrained(path_model, config=model_config) 99 | # 将模型加载到CPU/GPU 100 | self.model.to(self.device) 101 | self.model.eval() 102 | 103 | 104 | def predict(self, test_loader): 105 | """ 106 | 预测 107 | """ 108 | print('predict start') 109 | 110 | # 混合精度 111 | if self.config.fp16: 112 | self.model = amp.initialize(self.model, opt_level='O3') 113 | 114 | # 初始化指标计算 115 | progress_bar = ProgressBar(n_total=len(test_loader), desc='Predict') 116 | src = [] 117 | label = np.array([], dtype=int) 118 | pred = np.array([], dtype=int) 119 | for i, batch in enumerate(test_loader): 120 | # 推断 121 | batch = {k:v.to(self.config.device) for k,v in batch.items()} 122 | with torch.no_grad(): 123 | output = self.model(**batch) 124 | output = output[0] 125 | # 输入文本转换 126 | input_ids = batch['input_ids'].cpu().numpy() 127 | tmp_src_string = self.ids2string(input_ids) 128 | 129 | # 获取标签 130 | tmp_pred = torch.max(output, 1)[1].cpu().numpy() 131 | tmp_label = batch['label'].cpu().numpy() 132 | # 添加到总列表 133 | src.extend(tmp_src_string) 134 | label = np.append(label, tmp_label) 135 | pred = np.append(pred, tmp_pred) 136 | progress_bar(i, {}) 137 | 138 | # 计算指标 139 | # report = metrics.classification_report(label, pred, target_names=self.label, digits=4) 140 | # confusion = metrics.confusion_matrix(label, pred) 141 | # print('Evaluate Classifier Performance') 142 | # print(report) 143 | 144 | # 保存 145 | data = {'src':src, 'label':label, 'pred':pred} 146 | data = pd.DataFrame(data) 147 | if not os.path.exists(self.config.path_output): 148 | os.mkdir(self.config.path_output) 149 | path_output = os.path.join(self.config.path_output, 'pred_data.csv') 150 | data.to_csv(path_output, sep='\t', index=False) 151 | print('predict result save: {}'.format(path_output)) 152 | 153 | 154 | 155 | def ids2string(self, input_ids): 156 | """ 157 | 将模型输出转换成中文 158 | """ 159 | # 获取特殊字符 160 | special_tokens = self.tokenizer.get_special_tokens() 161 | src = [] 162 | for line in input_ids: 163 | # 分开是否是预训练语言 164 | if self.config.model_name in self.config.lm_model_list: 165 | src_line = self.tokenizer.tokenizer.convert_ids_to_tokens(line) 166 | # 过滤特殊字符 167 | src_line = [x for x in src_line if x not in special_tokens] 168 | src_line = ' '.join(src_line) 169 | else: 170 | src_line = '' 171 | for x in line: 172 | tmp_x = self.tokenizer.index2token.get(x, '') 173 | # 跳过特殊字符 174 | if tmp_x not in special_tokens: 175 | src_line += tmp_x 176 | src.append(src_line) 177 | return src 178 | 179 | 180 | -------------------------------------------------------------------------------- /module/Trainer.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from posixpath import sep 4 | import time 5 | import random 6 | import logging 7 | import math 8 | import numpy as np 9 | import pandas as pd 10 | import pickle as pkl 11 | import torch 12 | import torch.nn as nn 13 | from apex import amp 14 | from tqdm.auto import tqdm 15 | from datasets import Dataset, load_dataset, load_metric 16 | from torch.utils.data import DataLoader 17 | from transformers import AdamW, 
AutoModelForSequenceClassification, get_scheduler, get_linear_schedule_with_warmup 18 | from transformers import BertTokenizer, BertConfig, AutoConfig 19 | # from model.BertForMaskedLM import BertForMaskedLM 20 | 21 | 22 | import torch.nn.functional as F 23 | from sklearn import metrics 24 | 25 | from utils.progressbar import ProgressBar 26 | from module.optimal.adversarial import FGM,PGD 27 | from module.ModelMap import map_model, map_config, map_tokenizer 28 | from module.LossManager import LossManager 29 | 30 | 31 | 32 | class Trainer(object): 33 | 34 | def __init__(self, config, train_loader, valid_loader, test_loader): 35 | self.config = config 36 | # 设置GPU环境 37 | self.device = torch.device(self.config.device) 38 | # 加载数据集 39 | self.train_loader = train_loader 40 | self.valid_loader = valid_loader 41 | self.test_loader = test_loader 42 | # 加载标签 43 | self.load_label() 44 | # 加载模型 45 | self.load_tokenizer() 46 | self.load_model() 47 | # 加载loss计算类 48 | self.loss_manager = LossManager(loss_type=config.loss_type, cl_option=config.cl_option, loss_cl_type=config.cl_method) 49 | 50 | 51 | 52 | def load_label(self): 53 | """ 54 | 读取标签 55 | """ 56 | path_label = os.path.join(self.config.path_datasets, 'class.txt') 57 | self.label = [ x.strip() for x in open(path_label, 'r', encoding='utf8').readlines()] 58 | self.label2ids = {x:i for i,x in enumerate(self.label)} 59 | self.ids2label = {i:x for i,x in enumerate(self.label)} 60 | 61 | 62 | def load_tokenizer(self): 63 | """ 64 | 读取分词器 65 | """ 66 | self.tokenizer = map_tokenizer(self.config.model_name) 67 | 68 | 69 | def load_model(self): 70 | """ 71 | 加载模型及初始化模型参数 72 | """ 73 | # 读取模型 74 | print('loading model...%s' %self.config.model_name) 75 | self.model = map_model(self.config.model_name) 76 | if not self.model: 77 | print('model {} is null, please check your model name.'.format(self.config.model_name)) 78 | 79 | if self.config.model_name not in self.config.lm_model_list: 80 | # self.model = map_model(self.config.model_name) 81 | model_config = map_config(self.config.model_name)(self.config) 82 | self.model = self.model(model_config) 83 | # 重新初始化模型参数 84 | self.init_network() 85 | else: 86 | # self.tokenizer = map_tokenizer(self.config.model_name).from_pretrained(self.config.model_pretrain_online_checkpoint) 87 | # self.tokenizer.save_pretrained(self.config.path_tokenizer) 88 | # self.func_index2token = self.tokenizer.convert_ids_to_tokens 89 | # 加载预训练模型 90 | model_config = AutoConfig.from_pretrained(self.config.initial_pretrain_model, num_labels=len(self.label)) #, num_labels=len(self.label2ids) 91 | self.model = self.model.from_pretrained(self.config.initial_pretrain_model, config=model_config) 92 | # 将模型加载到CPU/GPU 93 | self.model.to(self.device) 94 | 95 | 96 | def init_network(self, method='xavier', exclude='embedding', seed=123): 97 | """ 98 | # 权重初始化,默认xavier 99 | """ 100 | for name, w in self.model.named_parameters(): 101 | if exclude not in name: 102 | if 'weight' in name: 103 | if method == 'xavier': 104 | if 'transformer' in name: 105 | nn.init.uniform_(w, -0.1, 0.1) 106 | else: 107 | nn.init.xavier_normal_(w) 108 | elif method == 'kaiming': 109 | nn.init.kaiming_normal_(w) 110 | else: 111 | nn.init.normal_(w) 112 | elif 'bias' in name: 113 | nn.init.constant_(w, 0) 114 | else: 115 | pass 116 | 117 | 118 | def train(self): 119 | """ 120 | 预训练模型 121 | """ 122 | # weight decay 123 | # bert_parameters = self.model.bert.named_parameters() 124 | # start_parameters = self.model.start_fc.named_parameters() 125 | # end_parameters = 
self.model.end_fc.named_parameters() 126 | # no_decay = ["bias", "LayerNorm.weight"] 127 | # optimizer_grouped_parameters = [ 128 | # {"params": [p for n, p in bert_parameters if not any(nd in n for nd in no_decay)], 129 | # "weight_decay": 0.01, 'lr': self.config.learning_rate}, 130 | # {"params": [p for n, p in bert_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 131 | # , 'lr': self.config.learning_rate}, 132 | # {"params": [p for n, p in start_parameters if not any(nd in n for nd in no_decay)], 133 | # "weight_decay": 0.01, 'lr': 0.001}, 134 | # {"params": [p for n, p in start_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 135 | # , 'lr': 0.001}, 136 | # {"params": [p for n, p in end_parameters if not any(nd in n for nd in no_decay)], 137 | # "weight_decay": 0.01, 'lr': 0.001}, 138 | # {"params": [p for n, p in end_parameters if any(nd in n for nd in no_decay)], "weight_decay": 0.0 139 | # , 'lr': 0.001}] 140 | # step_total = self.config.num_epochs * len(self.train_loader) * self.config.batch_size 141 | # # step_total = 640 #len(train_ld)*config.batch_size // config.num_epochs 142 | # warmup_steps = int(step_total * self.config.num_warmup_steps) 143 | # self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.learning_rate, eps=1e-8) 144 | # self.lr_scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_steps, 145 | # num_training_steps=step_total) 146 | 147 | # 定义优化器配置 148 | # num_training_steps = self.config.num_epochs * len(self.train_loader) 149 | # 总的训练次数 150 | step_total = self.config.num_epochs * len(self.train_loader) * self.config.batch_size 151 | # warm up的次数 152 | warmup_steps = int(step_total * self.config.num_warmup_steps) 153 | if self.config.model_name not in self.config.lm_model_list: 154 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate) 155 | else: 156 | self.optimizer = AdamW(self.model.parameters(), lr=self.config.learning_rate) 157 | self.lr_scheduler = get_scheduler( 158 | "linear", 159 | optimizer=self.optimizer, 160 | num_warmup_steps=self.config.num_warmup_steps, 161 | num_training_steps=step_total 162 | ) 163 | # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, 164 | # num_training_steps=t_total) 165 | 166 | # 混合精度训练 167 | if self.config.fp16: 168 | self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.config.fp16_opt_level) 169 | # 分布式训练 170 | if torch.cuda.device_count() > 1: 171 | self.model = torch.nn.parallel.DistributedDataParallel(self.model, find_unused_parameters=True) 172 | # 对抗训练 173 | if self.config.adv_option == 'FGM': 174 | self.fgm = FGM(self.model, emb_name=self.config.adv_name, epsilon=self.config.adv_epsilon) 175 | if self.config.adv_option == 'PGD': 176 | self.pgd = PGD(self.model, emb_name=self.config.adv_name, epsilon=self.config.adv_epsilon) 177 | 178 | # Train! 
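        # The block below first logs the run configuration and the model's parameter
        # shapes, then loops over the training data for num_epochs epochs: each batch
        # goes through self.step() (forward pass, loss computation, backward pass and
        # optimizer update), and every step_save steps the model is evaluated on the
        # validation set and checkpointed, keeping the best micro-F1 model under
        # checkpoints/step_best/. Note that step_total above is
        # num_epochs * len(train_loader) * batch_size, i.e. a sample count rather than
        # an optimizer-step count, so it is an upper bound on the number of scheduler
        # steps actually taken.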
179 | print("\n>>>>>>>> Running training >>>>>>>>") 180 | print(" Num examples = %d" %(len(self.train_loader)*self.config.batch_size)) 181 | print(" Num Epochs = %d" %self.config.num_epochs) 182 | print(" Batch size per GPU = %d"%self.config.batch_size) 183 | print(" GPU ids = %s" %self.config.cuda_visible_devices) 184 | print(" Total step = %d" %step_total) 185 | print(" Warm up step = %d" %warmup_steps) 186 | print(" FP16 Option = %s" %self.config.fp16) 187 | print(">>>>>>>> Running training >>>>>>>>\n") 188 | 189 | print(">>>>>>>> Model Structure >>>>>>>>") 190 | for name,parameters in self.model.named_parameters(): 191 | print(name,':',parameters.size()) 192 | print(">>>>>>>> Model Structure >>>>>>>>\n") 193 | 194 | # step_total = config.num_epochs * len(train_ld) 195 | step_current = 0 196 | f1_best = 0 197 | for epoch in range(self.config.num_epochs): 198 | progress_bar = ProgressBar(n_total=len(self.train_loader), desc='Training epoch:{0}'.format(epoch)) 199 | for i, batch in enumerate(self.train_loader): 200 | # 模型推断及计算损失 201 | self.model.train() 202 | loss = self.step(batch) 203 | progress_bar(i, {'loss': loss.item()}) 204 | # progress_bar(i, {'loss': loss.item(),'loss_ce': loss_ce.item(),'loss_cl': loss_nce.item()}) 205 | step_current += 1 206 | # 模型保存 207 | if step_current%self.config.step_save==0 and step_current>0: 208 | # 模型评估 209 | f1_eval = self.evaluate(self.valid_loader) 210 | # 模型保存 211 | f1_best = self.save_checkpoint(step_current, f1_eval, f1_best) 212 | print('\nEpoch:{} Iter:{}/{} loss:{:.4f}\n'.format(epoch, step_current, step_total, loss.item())) 213 | self.evaluate(self.test_loader, print_table=True) 214 | 215 | 216 | 217 | def step(self, batch): 218 | """ 219 | 每一个batch的训练过程 220 | """ 221 | 222 | # 数据操作 223 | batch = {k:v.to(self.device) for k,v in batch.items()} 224 | target = batch['label'] 225 | # 模型输入&输出 226 | outputs = self.model(**batch) 227 | output, hidden_emb = outputs 228 | # 对比学习 229 | if self.config.cl_option: 230 | # 重新获取一次模型输出 231 | outputs_etx = self.model(**batch) 232 | _, hidden_emb_etx = outputs_etx 233 | loss = self.loss_manager.compute(output, target, hidden_emb, hidden_emb_etx, alpha=self.config.cl_loss_weight) 234 | else: 235 | loss = self.loss_manager.compute(output, target) 236 | # 反向传播 237 | if torch.cuda.device_count() > 1: 238 | loss = loss.mean() 239 | if self.config.fp16: 240 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 241 | scaled_loss.backward() 242 | else: 243 | loss.backward() 244 | # 对抗训练 245 | self.attack_train(batch) 246 | # 梯度操作 247 | self.optimizer.step() 248 | if self.config.model_name in self.config.lm_model_list: 249 | self.lr_scheduler.step() 250 | self.model.zero_grad() 251 | # self.optimizer.zero_grad() 252 | return loss 253 | 254 | 255 | def attack_train(self, batch): 256 | """ 257 | 对抗训练 258 | """ 259 | # FGM 260 | if self.config.adv_option == 'FGM': 261 | self.fgm.attack() 262 | output = self.model(**batch)[0] 263 | loss_adv = self.loss_manager.compute(output, batch['label']) 264 | if torch.cuda.device_count() > 1: 265 | loss_adv = loss_adv.mean() 266 | loss_adv.backward() 267 | self.fgm.restore() 268 | # PGD 269 | if self.config.adv_option == 'PGD': 270 | self.pgd.backup_grad() 271 | K = 3 272 | for t in range(K): 273 | self.pgd.attack(is_first_attack=(t==0)) # 在embedding上添加对抗扰动, first attack时备份param.data 274 | if t != K-1: 275 | self.model.zero_grad() 276 | else: 277 | self.pgd.restore_grad() 278 | output = self.model(**batch)[0] 279 | loss_adv = self.loss_manager.compute(output, batch['label']) 280 
| loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 281 | self.pgd.restore() 282 | 283 | 284 | def save_checkpoint(self, step_current, f1_eval, f1_best): 285 | """ 286 | 模型保存 287 | """ 288 | if f1_eval != 0: 289 | # 保存路径 290 | path = os.path.join(self.config.path_model_save, 'step_{}'.format(step_current)) 291 | if not os.path.exists(path): 292 | os.makedirs(path) 293 | # 保存当前step的模型 294 | if self.config.model_name not in self.config.lm_model_list: 295 | path_model = os.path.join(path, 'pytorch_model.bin') 296 | torch.save(self.model.state_dict(), path_model) 297 | else: 298 | model_save = self.model.module if torch.cuda.device_count() > 1 else self.model 299 | model_save.save_pretrained(path) 300 | print('Saving model: {}'.format(path)) 301 | # 保存最优的模型 302 | if f1_eval > f1_best: 303 | # 创建文件夹 304 | path = os.path.join(self.config.path_model_save, 'step_best/') 305 | if not os.path.exists(path): 306 | os.makedirs(path) 307 | # 模型保存 308 | if self.config.model_name not in self.config.lm_model_list: 309 | path_model = os.path.join(path, 'pytorch_model.bin') 310 | torch.save(self.model.state_dict(), path_model) 311 | else: 312 | model_save = self.model.module if torch.cuda.device_count() > 1 else self.model 313 | model_save.save_pretrained(path) 314 | f1_best = f1_eval 315 | print('Saving best model: {}\n'.format(path)) 316 | return f1_best 317 | 318 | 319 | def evaluate(self, data, print_table=False): 320 | """ 321 | 模型测试集效果评估 322 | """ 323 | self.model.eval() 324 | loss_total = 0 325 | predict_all = np.array([], dtype=int) 326 | labels_all = np.array([], dtype=int) 327 | loss_manager = LossManager(loss_type=self.config.loss_type, cl_option=False) 328 | with torch.no_grad(): 329 | for i, batch in enumerate(data): 330 | batch = {k:v.to(self.device) for k,v in batch.items()} 331 | output = self.model(**batch)[0] 332 | # 计算loss 333 | # loss = F.cross_entropy(outputs, labels) 334 | # loss_total += outputx[0] 335 | target = batch['label'] 336 | loss = loss_manager.compute(output, target) 337 | loss_total += loss 338 | # 获取标签 339 | labels = batch['label'].cpu().numpy()#[:,1:-1] 340 | predic = torch.max(output, -1)[1].cpu().numpy() 341 | labels_all = np.append(labels_all, labels) 342 | predict_all = np.append(predict_all, predic) 343 | # 计算指标 344 | acc = metrics.accuracy_score(labels_all, predict_all) 345 | f1 = metrics.f1_score(labels_all, predict_all, average='micro') 346 | print('\n>>Eval Set>>: Loss:{:.4f} Acc:{} MicroF1:{:.4f}'.format(loss_total.item(), acc, f1)) 347 | # {'micro', 'macro', 'samples','weighted', 'binary'} 348 | if print_table: 349 | # 打印指标 350 | report = metrics.classification_report(labels_all, predict_all, target_names=self.label, digits=4) 351 | confusion = metrics.confusion_matrix(labels_all, predict_all) 352 | print('\nEvaluate Classifier Performance '+'#'*50) 353 | print(report) 354 | print('\nConfusion Matrix') 355 | print(confusion) 356 | print('#'*60) 357 | 358 | return f1 359 | 360 | -------------------------------------------------------------------------------- /module/loss/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/loss/.DS_Store -------------------------------------------------------------------------------- /module/loss/dice_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class DiceLoss(nn.Module): 5 | """DiceLoss 
implemented from 'Dice Loss for Data-imbalanced NLP Tasks' 6 | Useful in dealing with unbalanced data 7 | """ 8 | def __init__(self): 9 | super(DiceLoss, self).__init__() 10 | 11 | def forward(self,input, target): 12 | ''' 13 | input: [N, C] 14 | target: [N, ] 15 | ''' 16 | prob = torch.softmax(input, dim=1) 17 | prob = torch.gather(prob, dim=1, index=target.unsqueeze(1)) 18 | dsc_i = 1 - ((1 - prob) * prob) / ((1 - prob) * prob + 1) 19 | dice_loss = dsc_i.mean() 20 | return dice_loss 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /module/loss/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FocalLoss(nn.Module): 6 | '''Multi-class Focal loss implementation''' 7 | def __init__(self, gamma=2, weight=None,ignore_index=-100): 8 | super(FocalLoss, self).__init__() 9 | self.gamma = gamma 10 | self.weight = weight 11 | self.ignore_index=ignore_index 12 | 13 | def forward(self, input, target): 14 | """ 15 | input: [N, C] 16 | target: [N, ] 17 | """ 18 | logpt = F.log_softmax(input, dim=1) 19 | pt = torch.exp(logpt) 20 | logpt = (1-pt)**self.gamma * logpt 21 | loss = F.nll_loss(logpt, target, self.weight,ignore_index=self.ignore_index) 22 | return loss 23 | -------------------------------------------------------------------------------- /module/loss/infonce_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # from torch._C import LongTensor, dtype 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class InfoNCELoss(nn.Module): 8 | '''InfoNCE loss implementation''' 9 | def __init__(self, temperature=0.999): 10 | super(InfoNCELoss, self).__init__() 11 | self.temperature = temperature 12 | 13 | 14 | def forward(self, input1, input2): 15 | """ 16 | input1: [N, C] 17 | input2: [N, C] 18 | """ 19 | 20 | # p_matrix_sim = torch.cosine_similarity(input1, input2, dim=1) 21 | 22 | # positive 2 norm 23 | norm_1 = torch.norm(input1,p=2,dim=1) # [N,] 24 | norm_2 = torch.norm(input2,p=2,dim=1) # [N,] 25 | norm_m = norm_1.mul(norm_2.t()) # [N,N] 26 | eps = torch.tensor(1e-8) 27 | norm = 1/torch.max(norm_m, eps) # [N,N] 28 | # norm = 1/norm_m 29 | 30 | # negative 2 norm 31 | norm_n_m = norm_1.mul(norm_1.t()) # [N,N] 32 | norm_n = 1/torch.max(norm_n_m, eps) # [N,N] 33 | # norm_n = 1/norm_n_m 34 | 35 | # positive sample 36 | p_matrix_sim = input1.mm(input2.t()) # [N,N] 37 | p_matrix_sim = p_matrix_sim.mul(norm) # [N,N] 38 | p_sim = torch.diag(p_matrix_sim) # [N,] 39 | p_sim_zero_matrix = torch.diag_embed(p_sim) # [N,N] 40 | # negative sample 41 | matrix_sim = input1.mm(input1.t()) # [N,N] 42 | matrix_sim = matrix_sim.mul(norm_n) # [N,N] 43 | drop_diag = torch.diag(matrix_sim) 44 | drop_diag_zero_matrix = torch.diag_embed(drop_diag) 45 | # 减去对角线元素 46 | matrix_sim_drop = matrix_sim - drop_diag_zero_matrix 47 | # 对角线加上新的元素 48 | n_matrix_sim = matrix_sim_drop + p_sim_zero_matrix # [N,N] 49 | 50 | # positive score 51 | p_exp = torch.exp(p_sim/self.temperature) # [N,] 52 | # total sample score 53 | total_exp = torch.exp(n_matrix_sim/self.temperature) # [N,N] 54 | total_exp_sum = total_exp.sum(dim=0) 55 | # loss 56 | loss = torch.log(p_exp/total_exp_sum) 57 | loss = -1 * loss 58 | loss = loss.mean() 59 | # print('positive exp:{} negative exp:{}'.format(p_exp.mean(),total_exp.mean())) 60 | return loss 61 | 62 | 63 | 64 | if __name__ == '__main__': 65 | 
infonce = InfoNCELoss() 66 | input1 = torch.randn([20,5]) 67 | input2 = torch.randn([20,5]) 68 | target = torch.randint(0,5,[20,]) 69 | input_ids_anti = torch.randn([20,50,5]) 70 | label_anti = torch.randint(0,5,[20,50,]) 71 | 72 | loss = infonce(input1=input1, input2=input2) 73 | print(1) 74 | 75 | 76 | -------------------------------------------------------------------------------- /module/loss/kl_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class KLLoss(nn.Module): 7 | 8 | 9 | def __init__(self, temperature=0.1): 10 | super(KLLoss, self).__init__() 11 | self.temperature = temperature 12 | 13 | 14 | def forward(self, p, q, reduce='mean'): 15 | """ 16 | 计算KL divergence loss 17 | p: [N, C] 18 | q: [N, C] 19 | """ 20 | # 转换成log probabilities 21 | p = F.softmax(p, dim=-1) 22 | q = F.softmax(q, dim=-1) 23 | # 计算损失 24 | loss_func = torch.nn.KLDivLoss(size_average=False, reduce=False) 25 | loss_pq = loss_func(p.log(), q) 26 | loss_qp = loss_func(q.log(), p) 27 | 28 | if reduce == 'sum': 29 | loss_pq = loss_pq.sum() 30 | loss_qp = loss_qp.sum() 31 | else: 32 | loss_pq = loss_pq.mean() 33 | loss_qp = loss_qp.mean() 34 | loss = (loss_pq + loss_qp) / 2 35 | return loss -------------------------------------------------------------------------------- /module/loss/label_smoothing.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class LabelSmoothingCrossEntropy(nn.Module): 5 | def __init__(self, eps=0.1, reduction='mean',ignore_index=-100): 6 | super(LabelSmoothingCrossEntropy, self).__init__() 7 | self.eps = eps 8 | self.reduction = reduction 9 | self.ignore_index = ignore_index 10 | 11 | def forward(self, output, target): 12 | c = output.size()[-1] 13 | log_preds = F.log_softmax(output, dim=-1) 14 | if self.reduction=='sum': 15 | loss = -log_preds.sum() 16 | else: 17 | loss = -log_preds.sum(dim=-1) 18 | if self.reduction=='mean': 19 | loss = loss.mean() 20 | return loss*self.eps/c + (1-self.eps) * F.nll_loss(log_preds, target, reduction=self.reduction, 21 | ignore_index=self.ignore_index) -------------------------------------------------------------------------------- /module/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/models/.DS_Store -------------------------------------------------------------------------------- /module/models/Albert.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import AlbertPreTrainedModel, AlbertModel 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | class Albert(AlbertPreTrainedModel): 9 | 10 | def __init__(self, config): 11 | super(Albert, self).__init__(config) 12 | self.albert = AlbertModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 16 | 17 | def forward(self, input_ids, attention_mask, label=None): 18 | output_albert = self.albert(input_ids, attention_mask=attention_mask) 19 | output = self.fc(output_albert.pooler_output) 20 | return [output,output_albert.pooler_output] 21 | 22 | 
-------------------------------------------------------------------------------- /module/models/Bert.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from transformers import BertPreTrainedModel, BertModel, BertTokenizer 6 | from torch.nn import CrossEntropyLoss 7 | 8 | 9 | class Bert(BertPreTrainedModel): 10 | 11 | def __init__(self, config): 12 | super(Bert, self).__init__(config) 13 | self.bert = BertModel(config) 14 | self.hidden_size = config.hidden_size 15 | self.num_classes = config.num_labels 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | 18 | def forward(self, 19 | input_ids, 20 | attention_mask, 21 | label=None, 22 | input_ids_anti=None, 23 | label_anti=None): 24 | # inference 25 | output_bert = self.bert(input_ids, attention_mask=attention_mask) #(batch_size, sen_length, hidden_size) 26 | output_pooler = output_bert.pooler_output 27 | output = self.fc(output_pooler) 28 | 29 | return [output, output_pooler] 30 | 31 | -------------------------------------------------------------------------------- /module/models/Distilbert.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | # from pytorch_pretrained_bert import BertModel, BertTokenizer 5 | # from pytorch_pretrained import BertModel, BertTokenizer 6 | # from transformers import BertPreTrainedModel, BertModel, BertTokenizer 7 | from transformers import DistilBertPreTrainedModel, DistilBertModel, DistilBertTokenizer 8 | from torch.nn import CrossEntropyLoss 9 | import torch.nn.functional as F 10 | 11 | 12 | 13 | class Distilbert(DistilBertPreTrainedModel): 14 | 15 | def __init__(self, config): 16 | super(Distilbert, self).__init__(config) 17 | self.distilbert = DistilBertModel(config) 18 | # self.pool_layer = BertPooler(config) 19 | self.hidden_size = config.hidden_size #768 20 | self.num_classes = config.num_labels 21 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 22 | 23 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 24 | # self.activation = nn.Tanh() 25 | 26 | 27 | def forward(self, input_ids, attention_mask, label=None): 28 | output = self.distilbert(input_ids, attention_mask=attention_mask) 29 | # out = self.fc(output.pooler_output) 30 | # pooling 31 | first_token_tensor = output.last_hidden_state[:, 0] 32 | pooler_output = self.dense(first_token_tensor) 33 | # pooler_output = self.activation(pooler_output) 34 | # pooler_output = self.pool_layer(pooler_output) 35 | # class 36 | output = self.fc(pooler_output) 37 | return [output,pooler_output] 38 | 39 | 40 | # class BertPooler(nn.Module): 41 | # def __init__(self, config): 42 | # super().__init__() 43 | # self.dense = nn.Linear(config.hidden_size, config.hidden_size) 44 | # self.activation = nn.Tanh() 45 | 46 | # def forward(self, hidden_states): 47 | # # We "pool" the model by simply taking the hidden state corresponding 48 | # # to the first token. 
49 | # first_token_tensor = hidden_states[:, 0] 50 | # pooled_output = self.dense(first_token_tensor) 51 | # pooled_output = self.activation(pooled_output) 52 | # return pooled_output -------------------------------------------------------------------------------- /module/models/Electra.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import ElectraPreTrainedModel, ElectraModel, ElectraTokenizer 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | class Electra(ElectraPreTrainedModel): 9 | 10 | def __init__(self, config): 11 | super(Electra, self).__init__(config) 12 | self.electra = ElectraModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.dense = nn.Linear(self.hidden_size, self.hidden_size) 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | self.activation = nn.Tanh() 18 | 19 | def forward(self, input_ids, attention_mask, label=None): 20 | output = self.electra(input_ids, attention_mask=attention_mask) 21 | 22 | first_token_tensor = output.last_hidden_state[:, 0] 23 | pooler_output = self.dense(first_token_tensor) 24 | pooler_output = self.activation(pooler_output) 25 | output = self.fc(pooler_output) 26 | return [output,pooler_output] 27 | 28 | -------------------------------------------------------------------------------- /module/models/FastText.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | from torch.nn import CrossEntropyLoss 9 | 10 | 11 | class Config(object): 12 | 13 | """配置参数""" 14 | def __init__(self, config): 15 | self.model_name = 'FastText' 16 | path_class = os.path.join(config.path_datasets, 'class.txt') 17 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 18 | self.num_classes = len(self.class_list) # 类别数 19 | # embedding config 20 | file_embedding = 'random' 21 | path_embedding = os.path.join(config.path_datasets, file_embedding) 22 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 23 | self.embed = self.embedding_pretrained.size(1)\ 24 | if self.embedding_pretrained is not None else 300 # 字向量维度 # 预训练词向量 25 | # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu') # 设备 26 | 27 | # vocab 28 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 29 | toekn2index = pkl.load(open(path_vocab, 'rb')) 30 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 31 | # model config 32 | self.dropout = 0.5 # 随机失活 33 | self.hidden_size = 256 34 | 35 | 36 | class FastText(nn.Module): 37 | def __init__(self, config): 38 | super(FastText, self).__init__() 39 | if config.embedding_pretrained is not None: 40 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 41 | else: 42 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 43 | self.dropout = nn.Dropout(config.dropout) 44 | self.fc1 = nn.Linear(config.embed, config.hidden_size) 45 | # self.dropout2 = nn.Dropout(config.dropout) 46 | self.fc2 = nn.Linear(config.hidden_size, config.num_classes) 47 | 48 | def forward(self, input_ids, label, attention_mask): 49 | 50 | out = 
self.embedding(input_ids) # size: (batch_size, seq_len, dim) 51 | out = out.mean(dim=1) # size: (batch_size, dim) 52 | out = self.dropout(out) 53 | out = self.fc1(out) # size: (batch_size, hidden_size) 54 | out_relu = F.relu(out) 55 | out = self.fc2(out_relu) # size: (batch_size, num_class) 56 | return [out,out_relu] 57 | # # 计算loss 58 | # loss = None 59 | # if label is not None: 60 | # loss_func = CrossEntropyLoss() 61 | # # out_softmax = F.softmax(out) 62 | # loss = loss_func(out, label) 63 | # # loss = F.cross_entropy(out, label) 64 | # output = (loss, out) 65 | # return output 66 | 67 | -------------------------------------------------------------------------------- /module/models/Roberta.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import FlaxRobertaPreTrainedModel, RobertaModel, BertModel, BertPreTrainedModel 5 | from torch.nn import CrossEntropyLoss 6 | 7 | 8 | # class Roberta(FlaxRobertaPreTrainedModel): 9 | class Roberta(BertPreTrainedModel): 10 | 11 | def __init__(self, config): 12 | super(Roberta, self).__init__(config) 13 | self.bert = BertModel(config) 14 | self.hidden_size = config.hidden_size 15 | self.num_classes = config.num_labels 16 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 17 | 18 | def forward(self, input_ids, attention_mask, label=None): 19 | 20 | output = self.bert(input_ids, attention_mask=attention_mask) 21 | out = self.fc(output.pooler_output) 22 | return [out,output.pooler_output] 23 | 24 | -------------------------------------------------------------------------------- /module/models/TextCNN.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | from torch.nn import CrossEntropyLoss 9 | 10 | 11 | class Config(object): 12 | 13 | """配置参数""" 14 | def __init__(self, config): 15 | self.model_name = 'TextCNN' 16 | path_class = os.path.join(config.path_datasets, 'class.txt') 17 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 18 | self.num_classes = len(self.class_list) # 类别数 19 | # embedding config 20 | file_embedding = 'random' 21 | path_embedding = os.path.join(config.path_datasets, file_embedding) 22 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 23 | self.embed = self.embedding_pretrained.size(1)\ 24 | if self.embedding_pretrained is not None else 300 # 字向量维度 25 | # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu') # 设备 26 | # vocab 27 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 28 | toekn2index = pkl.load(open(path_vocab, 'rb')) 29 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 30 | # model config 31 | self.dropout = 0.5 # 随机失活 32 | self.filter_sizes = (2, 3, 4) # 卷积核尺寸 33 | self.num_filters = 256 # 卷积核数量(channels数) 34 | 35 | 36 | class TextCNN(nn.Module): 37 | def __init__(self, config): 38 | super(TextCNN, self).__init__() 39 | self.c = config 40 | if config.embedding_pretrained is not None: 41 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 42 | else: 43 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 44 | self.convs = nn.ModuleList( 45 | 
[nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes]) 46 | self.dropout = nn.Dropout(config.dropout) 47 | self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes) 48 | 49 | def conv_and_pool(self, x, conv): 50 | x = F.relu(conv(x)).squeeze(3) 51 | x = F.max_pool1d(x, x.size(2)).squeeze(2) 52 | return x 53 | 54 | def forward(self, input_ids, label, attention_mask): 55 | out = self.embedding(input_ids) 56 | out = out.unsqueeze(1) 57 | out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1) 58 | out_drop = self.dropout(out) 59 | out = self.fc(out_drop) 60 | return [out, out_drop] 61 | 62 | # # 计算loss 63 | # loss = None 64 | # if label is not None: 65 | # loss_func = CrossEntropyLoss() 66 | # # out_softmax = F.softmax(out) 67 | # loss = loss_func(out, label) 68 | # # loss = F.cross_entropy(out, label) 69 | # output = (loss, out) 70 | # return output 71 | -------------------------------------------------------------------------------- /module/models/TextRCNN.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import os 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import pickle as pkl 8 | 9 | 10 | class Config(object): 11 | 12 | """配置参数""" 13 | def __init__(self, config): 14 | self.model_name = 'TextRCNN' 15 | path_class = os.path.join(config.path_datasets, 'class.txt') 16 | self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()] # 类别名单 17 | self.num_classes = len(self.class_list) # 类别数 18 | # embedding config 19 | file_embedding = 'random' 20 | path_embedding = os.path.join(config.path_datasets, file_embedding) 21 | self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None # 预训练词向量 22 | self.embed = self.embedding_pretrained.size(1)\ 23 | if self.embedding_pretrained is not None else 300 # 字向量维度 24 | # vocab 25 | path_vocab = os.path.join(config.path_datasets, 'vocab.pkl') 26 | toekn2index = pkl.load(open(path_vocab, 'rb')) 27 | self.n_vocab = len(toekn2index.keys()) 28 | # model config 29 | self.dropout = 0.5 # 随机失活 30 | self.hidden_size = 256 # lstm隐藏层 31 | self.num_layers = 1 # lstm层数 32 | self.pad_size = config.sen_max_length 33 | 34 | 35 | class TextRCNN(nn.Module): 36 | def __init__(self, config): 37 | super(TextRCNN, self).__init__() 38 | if config.embedding_pretrained is not None: 39 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 40 | else: 41 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 42 | self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers, 43 | bidirectional=True, batch_first=True, dropout=config.dropout) 44 | self.maxpool = nn.MaxPool1d(config.pad_size) 45 | self.fc = nn.Linear(config.hidden_size * 2 + config.embed, config.num_classes) 46 | 47 | def forward(self, input_ids, label, attention_mask): 48 | embed = self.embedding(input_ids) # [batch_size, seq_len, embeding]=[64, 32, 64] 49 | out, _ = self.lstm(embed) 50 | out = torch.cat((embed, out), 2) 51 | out = F.relu(out) 52 | out = out.permute(0, 2, 1) 53 | out_squ = self.maxpool(out).squeeze() 54 | out = self.fc(out_squ) 55 | return [out, out_squ] 56 | -------------------------------------------------------------------------------- /module/models/TextRNN.py: 
--------------------------------------------------------------------------------
 1 | # coding: UTF-8
 2 | import os
 3 | import torch
 4 | import torch.nn as nn
 5 | import numpy as np
 6 | import pickle as pkl
 7 | 
 8 | 
 9 | class Config(object):
10 | 
11 |     """配置参数"""
12 |     def __init__(self, config):
13 |         self.model_name = 'TextRNN'
14 |         path_class = os.path.join(config.path_datasets, 'class.txt')
15 |         self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()]  # 类别名单
16 |         self.num_classes = len(self.class_list)  # 类别数
17 |         # embedding config
18 |         file_embedding = 'random'
19 |         path_embedding = os.path.join(config.path_datasets, file_embedding)
20 |         self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None  # 预训练词向量
21 |         self.embed = self.embedding_pretrained.size(1)\
22 |             if self.embedding_pretrained is not None else 300  # 字向量维度
23 |         # vocab
24 |         path_vocab = os.path.join(config.path_datasets, 'vocab.pkl')
25 |         token2index = pkl.load(open(path_vocab, 'rb'))
26 |         self.n_vocab = len(token2index.keys())
27 |         # model config
28 |         self.dropout = 0.5  # 随机失活
29 |         self.hidden_size = 128  # lstm隐藏层
30 |         self.num_layers = 2  # lstm层数
31 | 
32 | 
33 | class TextRNN(nn.Module):
34 |     def __init__(self, config):
35 |         super(TextRNN, self).__init__()
36 |         if config.embedding_pretrained is not None:
37 |             self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
38 |         else:
39 |             self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
40 |         self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
41 |                             bidirectional=True, batch_first=True, dropout=config.dropout)
42 |         self.fc = nn.Linear(config.hidden_size * 2, config.num_classes)
43 | 
44 |     def forward(self, input_ids, label, attention_mask):
45 |         out = self.embedding(input_ids)  # [batch_size, seq_len, embedding]=[128, 32, 300]
46 |         out_lstm, _ = self.lstm(out)
47 |         out = self.fc(out_lstm[:, -1, :])  # 句子最后时刻的 hidden state
48 |         return [out, out_lstm]
49 | 
--------------------------------------------------------------------------------
/module/models/Transformer.py:
--------------------------------------------------------------------------------
 1 | # coding: UTF-8
 2 | import os
 3 | import math
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | import numpy as np
 8 | import torch
 9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | from torch.nn import TransformerEncoder, TransformerEncoderLayer
12 | from torch.nn.modules import dropout
13 | import pickle as pkl
14 | 
15 | 
16 | class Config(object):
17 | 
18 |     """配置参数"""
19 |     def __init__(self, config):
20 |         self.model_name = 'Transformer_base'
21 |         path_class = os.path.join(config.path_datasets, 'class.txt')
22 |         self.class_list = [x.strip() for x in open(path_class, encoding='utf-8').readlines()]  # 类别名单
23 |         self.num_classes = len(self.class_list)  # 类别数
24 |         # embedding config
25 |         file_embedding = 'random'
26 |         path_embedding = os.path.join(config.path_datasets, file_embedding)
27 |         self.embedding_pretrained = torch.tensor(np.load(path_embedding)["embeddings"].astype('float32')) if file_embedding != 'random' else None  # 预训练词向量
28 |         self.embed = self.embedding_pretrained.size(1)\
29 |             if self.embedding_pretrained is not None else 768  # 字向量维度
30 |         # self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')  # 设备
31 |         # vocab
32 |         path_vocab = 
os.path.join(config.path_datasets, 'vocab.pkl') 33 | toekn2index = pkl.load(open(path_vocab, 'rb')) 34 | self.n_vocab = len(toekn2index.keys()) # 词表大小,在运行时赋值 35 | # model config 36 | self.dropout = 0.3 # 随机失活 37 | self.nhead = 12 38 | self.hidden_size = 3072 39 | self.nlayer = 6 40 | self.sen_length = config.sen_max_length 41 | 42 | 43 | 44 | class Transformer(nn.Module): 45 | def __init__(self, config): 46 | super(Transformer, self).__init__() 47 | self.c = config 48 | if config.embedding_pretrained is not None: 49 | self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False) 50 | else: 51 | self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1) 52 | # transformer 53 | encoder_layer = nn.TransformerEncoderLayer(d_model=config.embed, nhead=config.nhead, dim_feedforward=config.hidden_size) 54 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.nlayer) 55 | self.pos_encoder = PositionalEncoding(d_model=config.embed, max_len=config.sen_length) 56 | self.dropout = nn.Dropout(config.dropout) 57 | self.fc = nn.Linear(config.sen_length, config.num_classes) 58 | 59 | def forward(self, input_ids, label, attention_mask): 60 | out = self.embedding(input_ids) # (batch_size, seq_len, emb_size) 61 | out = out.transpose(0,1) # (seq_len, batch_size, emb_size) 62 | out = self.pos_encoder(out) # (seq_len, batch_size, emb_size) 63 | out = self.transformer_encoder(out) # (seq_len, batch_size, emb_size) 64 | out = out.transpose(0,1) # (batch_size, seq_len, emb_size) 65 | out_pool = F.max_pool1d(out, out.size(2)).squeeze(2) 66 | out = self.fc(out_pool) 67 | return [out,out_pool] 68 | 69 | 70 | class PositionalEncoding(nn.Module): 71 | 72 | def __init__(self, d_model, dropout=0.1, max_len=5000): 73 | super(PositionalEncoding, self).__init__() 74 | self.dropout = nn.Dropout(p=dropout) 75 | 76 | pe = torch.zeros(max_len, d_model) 77 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 78 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 79 | pe[:, 0::2] = torch.sin(position * div_term) 80 | pe[:, 1::2] = torch.cos(position * div_term) 81 | pe = pe.unsqueeze(0).transpose(0, 1) 82 | self.register_buffer('pe', pe) 83 | 84 | def forward(self, x): 85 | x = x + self.pe[:x.size(0), :] 86 | return self.dropout(x) 87 | 88 | -------------------------------------------------------------------------------- /module/models/XLNet.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import torch 3 | import torch.nn as nn 4 | from transformers import XLNetPreTrainedModel, XLNetModel, AutoModel 5 | 6 | 7 | class XLNet(XLNetPreTrainedModel): 8 | 9 | def __init__(self, config): 10 | super(XLNet, self).__init__(config) 11 | self.xlnet = AutoModel.from_config(config) 12 | # self.xlnet = XLNetModel(config) 13 | self.hidden_size = config.hidden_size 14 | self.num_classes = config.num_labels 15 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 16 | # self.activation = nn.Tanh() 17 | self.fc = nn.Linear(self.hidden_size, self.num_classes) 18 | 19 | def forward(self, input_ids, attention_mask, label=None): 20 | output = self.xlnet(input_ids, attention_mask=attention_mask) 21 | # pooling 22 | first_token_tensor = output.last_hidden_state[:, 0] 23 | pooler_output = self.dense(first_token_tensor) 24 | # pooler_output = self.activation(pooler_output) 25 | out = self.fc(pooler_output) 26 | return [out,pooler_output] 27 | 
28 | -------------------------------------------------------------------------------- /module/optimal/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/optimal/.DS_Store -------------------------------------------------------------------------------- /module/optimal/adversarial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class FGM(): 4 | ''' 5 | Example 6 | # 初始化 7 | fgm = FGM(model,epsilon=1,emb_name='word_embeddings.') 8 | for batch_input, batch_label in data: 9 | # 正常训练 10 | loss = model(batch_input, batch_label) 11 | loss.backward() # 反向传播,得到正常的grad 12 | # 对抗训练 13 | fgm.attack() # 在embedding上添加对抗扰动 14 | loss_adv = model(batch_input, batch_label) 15 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 16 | fgm.restore() # 恢复embedding参数 17 | # 梯度下降,更新参数 18 | optimizer.step() 19 | model.zero_grad() 20 | ''' 21 | def __init__(self, model,emb_name,epsilon=1.0): 22 | # emb_name这个参数要换成你模型中embedding的参数名 23 | self.model = model 24 | self.epsilon = epsilon 25 | self.emb_name = emb_name 26 | self.backup = {} 27 | 28 | def attack(self): 29 | for name, param in self.model.named_parameters(): 30 | if param.requires_grad and self.emb_name in name: 31 | self.backup[name] = param.data.clone() 32 | norm = torch.norm(param.grad) 33 | if norm!=0 and not torch.isnan(norm): 34 | r_at = self.epsilon * param.grad / norm 35 | param.data.add_(r_at) 36 | 37 | def restore(self): 38 | for name, param in self.model.named_parameters(): 39 | if param.requires_grad and self.emb_name in name: 40 | assert name in self.backup 41 | param.data = self.backup[name] 42 | self.backup = {} 43 | 44 | class PGD(): 45 | ''' 46 | Example 47 | pgd = PGD(model,emb_name='word_embeddings.',epsilon=1.0,alpha=0.3) 48 | K = 3 49 | for batch_input, batch_label in data: 50 | # 正常训练 51 | loss = model(batch_input, batch_label) 52 | loss.backward() # 反向传播,得到正常的grad 53 | pgd.backup_grad() 54 | # 对抗训练 55 | for t in range(K): 56 | pgd.attack(is_first_attack=(t==0)) # 在embedding上添加对抗扰动, first attack时备份param.data 57 | if t != K-1: 58 | model.zero_grad() 59 | else: 60 | pgd.restore_grad() 61 | loss_adv = model(batch_input, batch_label) 62 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 63 | pgd.restore() # 恢复embedding参数 64 | # 梯度下降,更新参数 65 | optimizer.step() 66 | model.zero_grad() 67 | ''' 68 | def __init__(self, model,emb_name,epsilon=1.,alpha=0.3): 69 | # emb_name这个参数要换成你模型中embedding的参数名 70 | self.model = model 71 | self.emb_name = emb_name 72 | self.epsilon = epsilon 73 | self.alpha = alpha 74 | self.emb_backup = {} 75 | self.grad_backup = {} 76 | 77 | def attack(self,is_first_attack=False): 78 | for name, param in self.model.named_parameters(): 79 | if param.requires_grad and self.emb_name in name: 80 | if is_first_attack: 81 | self.emb_backup[name] = param.data.clone() 82 | norm = torch.norm(param.grad) 83 | if norm != 0: 84 | r_at = self.alpha * param.grad / norm 85 | param.data.add_(r_at) 86 | param.data = self.project(name, param.data, self.epsilon) 87 | 88 | def restore(self): 89 | for name, param in self.model.named_parameters(): 90 | if param.requires_grad and self.emb_name in name: 91 | assert name in self.emb_backup 92 | param.data = self.emb_backup[name] 93 | self.emb_backup = {} 94 | 95 | def project(self, param_name, param_data, epsilon): 96 | r = param_data - self.emb_backup[param_name] 97 | if torch.norm(r) > 
epsilon: 98 | r = epsilon * r / torch.norm(r) 99 | return self.emb_backup[param_name] + r 100 | 101 | def backup_grad(self): 102 | for name, param in self.model.named_parameters(): 103 | if param.requires_grad: 104 | if param.grad is None: 105 | self.grad_backup[name] = None 106 | else: 107 | self.grad_backup[name] = param.grad.clone() 108 | def restore_grad(self): 109 | for name, param in self.model.named_parameters(): 110 | if param.requires_grad: 111 | param.grad = self.grad_backup[name] -------------------------------------------------------------------------------- /module/tokenizer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/module/tokenizer/.DS_Store -------------------------------------------------------------------------------- /module/tokenizer/LMTextTokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import pickle as pkl 4 | 5 | 6 | 7 | class LMTextTokenizer(object): 8 | 9 | def __init__(self, tokenizer): 10 | 11 | self.tokenizer = tokenizer 12 | self.cls_token_id = tokenizer.cls_token_id 13 | self.pad_token_id = tokenizer.pad_token_id 14 | self.sep_token_id = tokenizer.sep_token_id 15 | self.unk_token_id = tokenizer.unk_token_id 16 | # self.convert_tokens_to_ids = '' 17 | self.load() 18 | 19 | 20 | 21 | def load(self): 22 | """ 23 | 读取分词器 24 | """ 25 | self.token2index = self.tokenizer.vocab 26 | self.index2token = { i:x for x,i in self.token2index.items()} 27 | 28 | 29 | def tokenizer(self, text): 30 | """ 31 | 分词,按字分词 32 | """ 33 | token = self.tokenizer(text, return_tensors="pt") 34 | return token 35 | 36 | 37 | def get_special_tokens(self): 38 | """ 39 | 获取特殊字符 40 | """ 41 | target_ids = [self.cls_token_id, self.pad_token_id, self.sep_token_id, self.unk_token_id] 42 | target = [self.index2token.get(x, '') for x in target_ids] 43 | target = [ x for x in target if x] 44 | return target 45 | 46 | -------------------------------------------------------------------------------- /module/tokenizer/TextTokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import pickle as pkl 4 | from utils.IOOption import open_file, write_file 5 | 6 | 7 | 8 | class TextTokenizer(object): 9 | 10 | def __init__(self): 11 | 12 | self.cls_token = '[CLS]' 13 | self.pad_token = '[PAD]' 14 | self.sep_token = '[SEP]' 15 | self.unk_token = '[UNK]' 16 | self.convert_tokens_to_ids = '' 17 | 18 | 19 | def load(self, path): 20 | """ 21 | 读取分词器 22 | """ 23 | self.token2index = pkl.load(open(path, 'rb')) 24 | self.index2token = { i:x for x,i in self.token2index.items()} 25 | self.cls_token_id = self.token2index.get(self.cls_token) 26 | self.pad_token_id = self.token2index.get(self.pad_token) 27 | self.sep_token_id = self.token2index.get(self.sep_token) 28 | self.unk_token_id = self.token2index.get(self.unk_token) 29 | 30 | 31 | def create(self, corpus): 32 | """ 33 | 创建分词字典,获取训练集词表 34 | """ 35 | # 按字分词 36 | words = [w for line in corpus for w in line if w != ''] 37 | words = list(set(words)) 38 | words = sorted(words, reverse=False) 39 | # 创建索引 40 | token2index = {x:i for i,x in enumerate(words)} 41 | index2token = {i:x for i,x in enumerate(words)} 42 | 43 | # 添加特殊字符 44 | if self.pad_token not in token2index.keys(): 45 | index2token[len(token2index)] = self.pad_token 46 | token2index[self.pad_token] = len(token2index) 47 | if self.unk_token not in 
token2index.keys(): 48 | index2token[len(token2index)] = self.unk_token 49 | token2index[self.unk_token] = len(token2index) 50 | if self.cls_token not in token2index.keys(): 51 | index2token[len(token2index)] = self.cls_token 52 | token2index[self.cls_token] = len(token2index) 53 | if self.sep_token not in token2index.keys(): 54 | index2token[len(token2index)] = self.sep_token 55 | token2index[self.sep_token] = len(token2index) 56 | self.token2index = token2index 57 | self.index2token = index2token 58 | return token2index, index2token 59 | 60 | 61 | def tokenizer(self, text): 62 | """ 63 | 分词,按字分词 64 | """ 65 | tokens = [ x for x in text] 66 | input_ids = [self.token2index.get(x, self.unk_token_id) for x in tokens] 67 | attention_mask = [0]*len(input_ids) 68 | token = { 69 | 'input_ids' : input_ids, 70 | 'attention_mask' : attention_mask 71 | } 72 | return token 73 | 74 | 75 | def get_special_tokens(self): 76 | """ 77 | 获取特殊字符 78 | """ 79 | target_ids = [self.cls_token_id, self.pad_token_id, self.sep_token_id, self.unk_token_id] 80 | target = [self.index2token.get(x, '') for x in target_ids] 81 | target = [ x for x in target if x] 82 | return target -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apex==0.1 2 | numpy==1.19.2 3 | pandas==1.1.5 4 | scikit_learn==1.0.2 5 | torch==1.8.0 6 | tqdm==4.62.3 7 | transformers==4.15.0 8 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wzzzd/text_classifier_pytorch/847c4565f0efa574d782de66311dc21f35c44f3e/run.sh -------------------------------------------------------------------------------- /utils/IOOption.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def open_file(path, sep=' '): 6 | """读取文件""" 7 | src = [] 8 | tgt = [] 9 | with open(path, 'r', encoding='utf8') as f: 10 | for i, line in enumerate(f.readlines()): # 11 | line = line.strip().split(sep) 12 | tmp_src = str(line[0]) 13 | tmp_tgt = str(line[1]) 14 | # 若文本和标签都非空 15 | if tmp_src and tmp_tgt: 16 | src.append(tmp_src) 17 | tgt.append(tmp_tgt) 18 | return src, tgt 19 | 20 | 21 | 22 | def write_file(word2index, path): 23 | """写文件""" 24 | with open(path, 'w', encoding='utf8') as f: 25 | for k,v in word2index.items(): 26 | string = k + ' ' + str(v) + '\n' 27 | f.write(string) 28 | 29 | 30 | def write_text(text, path): 31 | """写文件""" 32 | with open(path, 'w', encoding='utf8') as f: 33 | for x in text: 34 | string = str(x) + '\n' 35 | f.write(string) 36 | 37 | 38 | -------------------------------------------------------------------------------- /utils/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class ProgressBar(object): 5 | ''' 6 | custom progress bar(进度条) 7 | Example: 8 | >>> pbar = ProgressBar(n_total=30,desc='training') 9 | >>> step = 2 10 | >>> pbar(step=step) 11 | ''' 12 | def __init__(self, n_total,width=30,desc = 'Training'): 13 | self.width = width 14 | self.n_total = n_total 15 | self.start_time = time.time() 16 | self.desc = desc 17 | 18 | def __call__(self, step, info={}): 19 | now = time.time() 20 | current = step + 1 21 | recv_per = current / self.n_total 22 | bar = f'[{self.desc}] {current}/{self.n_total} [' 23 | if recv_per >= 1: 24 | recv_per = 1 25 | 
prog_width = int(self.width * recv_per) 26 | if prog_width > 0: 27 | bar += '=' * (prog_width - 1) 28 | if current< self.n_total: 29 | bar += ">" 30 | else: 31 | bar += '=' 32 | bar += '.' * (self.width - prog_width) 33 | bar += ']' 34 | show_bar = f"\r{bar}" 35 | time_per_unit = (now - self.start_time) / current 36 | if current < self.n_total: 37 | eta = time_per_unit * (self.n_total - current) 38 | if eta > 3600: 39 | eta_format = ('%d:%02d:%02d' % 40 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 41 | elif eta > 60: 42 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 43 | else: 44 | eta_format = '%ds' % eta 45 | time_info = f' - ETA: {eta_format}' 46 | else: 47 | if time_per_unit >= 1: 48 | time_info = f' {time_per_unit:.1f}s/step' 49 | elif time_per_unit >= 1e-3: 50 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 51 | else: 52 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 53 | 54 | show_bar += time_info 55 | if len(info) != 0: 56 | show_info = f'{show_bar} ' + \ 57 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 58 | print(show_info, end='') 59 | else: 60 | print(show_bar, end='') 61 | --------------------------------------------------------------------------------
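
For readers who want to try the char-level tokenizer from `module/tokenizer/TextTokenizer.py` on its own, here is a minimal usage sketch. The corpus, vocab path, and sample sentence below are hypothetical placeholders; how the project itself builds and loads the vocab is handled elsewhere and is not shown in this file dump.
```python
# Illustrative sketch only -- corpus, path and sample text are made up.
import pickle as pkl
from module.tokenizer.TextTokenizer import TextTokenizer

corpus = ['今天股市大涨', '新款手机发布']               # hypothetical training sentences
tokenizer = TextTokenizer()
token2index, index2token = tokenizer.create(corpus)     # char-level vocab plus [PAD]/[UNK]/[CLS]/[SEP]
pkl.dump(token2index, open('vocab.pkl', 'wb'))          # persist so that load() can pick it up
tokenizer.load('vocab.pkl')                             # sets the ids of the special tokens
encoded = tokenizer.tokenizer('股市大涨')                # {'input_ids': [...], 'attention_mask': [...]}
print(encoded['input_ids'])
```
Note that `tokenizer()` in this file fills `attention_mask` with zeros; the non-pretrained models above ignore the mask, and any real masking is presumably handled by the rest of the pipeline.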