├── Bert_pytorch ├── Data_analysis.ipynb ├── Generate_TTA.ipynb ├── bert_finetuning │ ├── .ipynb_checkpoints │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ ├── Config.py │ ├── NEZHA │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── ensemble_10fold.py │ ├── ensemble_single.py │ ├── generate_pseudo_label.py │ ├── main_bert_10fold.py │ ├── main_bert_all.py │ ├── model.py │ ├── predict.py │ ├── predict_tta.py │ ├── stacking.py │ └── utils.py └── pretrain │ ├── NLP_Utils.py │ ├── bert_model │ ├── vocab.txt │ ├── vocab_100w.txt │ └── vocab_3462.txt │ ├── train_bert.py │ └── transformers1.zip ├── Nezha_pytorch ├── finetuning │ ├── .DS_Store │ ├── .ipynb_checkpoints │ │ └── PyTorch_Bert-Squad_OnnxRuntime_GPU-checkpoint.ipynb │ ├── Config.py │ ├── NEZHA │ │ ├── configuration_nezha.py │ │ └── modeling_nezha.py │ ├── NEZHA_main.py │ ├── model.py │ ├── predict.py │ ├── submit │ │ └── submit_bert_5epoch-10fold-first.csv │ └── utils.py ├── nezha_model │ └── .ipynb_checkpoints │ │ └── config-checkpoint.json └── pretrain │ ├── .DS_Store │ ├── NEZHA │ ├── __pycache__ │ │ ├── configuration_nezha.cpython-36.pyc │ │ └── modeling_nezha.cpython-36.pyc │ ├── configuration_nezha.py │ └── modeling_nezha.py │ ├── NLP_Utils.py │ ├── __init__.py │ ├── nezha_model │ ├── config.json │ └── vocab.txt │ ├── train_nezha.py │ └── transformers1.zip ├── README.md ├── Traditional-DL ├── Voting.ipynb ├── main_DL.py └── utils │ ├── DL_model.py │ ├── __pycache__ │ ├── DL_model.cpython-36.pyc │ ├── adversarial_model.cpython-36.pyc │ ├── init_net.cpython-36.pyc │ ├── optimizer_lookahead.cpython-36.pyc │ └── spatial_dropout.cpython-36.pyc │ ├── adversarial_model.py │ ├── init_net.py │ ├── optimizer_lookahead.py │ └── spatial_dropout.py └── data ├── datagrand_2021_test.csv └── datagrand_2021_train.csv /Bert_pytorch/Data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "- 比赛题目:第五届“达观杯” 基于大规模预训练模型的风险事件标签识别\n", 8 | "- 比赛链接:https://www.datafountain.cn/competitions/512" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## 1 导入包" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import seaborn as sns\n", 28 | "from pylab import rcParams\n", 29 | "%matplotlib inline\n", 30 | "%config InlineBackend.figure_format = 'retina' # 主题\n", 31 | "\n", 32 | "plt.rcParams['font.sans-serif'] = ['SimHei'] # 步骤一(替换sans-serif字体)\n", 33 | "plt.rcParams['axes.unicode_minus'] = False # 步骤二(解决坐标轴负数的负号显示问题)\n", 34 | "train_data_file = pd.read_csv(\n", 35 | " 'data/pesudo_label_data_114009.csv')\n", 36 | "test_data_file = pd.read_csv('data/datagrand_2021_test.csv')\n", 37 | "# picture_file = './documentation/picture/'\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "train_data_file.info()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "test_data_file.info()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# 2 统计词数" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | 
"outputs": [], 70 | "source": [ 71 | "train_allwords = []\n", 72 | "for index, row in train_data_file.iterrows():\n", 73 | " text = row[\"text\"].split()\n", 74 | " train_allwords.extend(text)\n", 75 | "train_textdict = pd.value_counts(train_allwords)\n", 76 | " \n", 77 | "testallwords = []\n", 78 | "for index, row in test_data_file.iterrows():\n", 79 | " text = row[\"text\"].split()\n", 80 | " testallwords.extend(text)\n", 81 | "test_textdict = pd.value_counts(testallwords)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# 训练集的词数\n", 91 | "\n", 92 | "train_vocab_size = len(train_textdict.keys())\n", 93 | "print(\"训练集的vpcab_size: {}\".format(train_vocab_size))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# 测试集的词数\n", 103 | "test_vocab_size = len(test_textdict.keys())\n", 104 | "print(\"测试集的vocab_size: {}\".format(test_vocab_size))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "combined_keys = train_textdict.keys() | test_textdict.keys()\n", 114 | "d_comb = {key: train_textdict.get(key, 0) + test_textdict.get(key, 0) for key in combined_keys}\n", 115 | "print(\"总数据集的vocab_size: {}\".format(len(d_comb)))\n" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "sorted(d_comb.items(), key=lambda x: x[1], reverse=True)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## 2 统计句子长度分布" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "train_data_file['text'].map(len).describe()\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "train_data_file.text.apply(lambda x:len(x.strip().split())).describe()\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "test_data_file['text'].map(len).describe()\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "test_data_file.text.apply(lambda x: len(x.strip().split())).describe()\n" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### 2.1 查看缺失值" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# 查看是否有缺失值\n", 184 | "train_data_file.isnull().sum()\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "test_data_file.isnull().sum()\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### 2.2 标签分布" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "train_data_file['label'].value_counts()\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | 
"source": [ 218 | "labeldict = dict(train_data_file['label'].value_counts())\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "labellist = list(labeldict.keys())\n", 228 | "f_level = []\n", 229 | "s_level = []\n", 230 | "for i in range(len(labellist)):\n", 231 | " a, b = labellist[i].split(\"-\")\n", 232 | " f_level.append(a)\n", 233 | " s_level.append(b)\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "list(set(s_level))\n" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "n_s_level = [int(x) for x in list(set(s_level))]\n", 252 | "n_s_level.sort()\n", 253 | "n_s_level\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "plt.bar(labeldict.keys(), labeldict.values())\n", 263 | "plt.xlabel('label count')\n" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "interpreter": { 269 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 270 | }, 271 | "kernelspec": { 272 | "display_name": "Python 3.7.3 64-bit", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "name": "python", 277 | "version": "" 278 | } 279 | }, 280 | "nbformat": 4, 281 | "nbformat_minor": 4 282 | } -------------------------------------------------------------------------------- /Bert_pytorch/Generate_TTA.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_kg_hide-output":true,"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2021-07-25T14:19:59.760921Z","iopub.status.busy":"2021-07-25T14:19:59.760546Z","iopub.status.idle":"2021-07-25T14:20:02.084014Z","shell.execute_reply":"2021-07-25T14:20:02.083159Z","shell.execute_reply.started":"2021-07-25T14:19:59.760841Z"},"trusted":true},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from gensim.models import Word2Vec\n","import pandas as pd\n","import jieba\n","import os\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2021-07-25T14:23:43.452203Z","iopub.status.busy":"2021-07-25T14:23:43.451860Z","iopub.status.idle":"2021-07-25T14:23:44.937626Z","shell.execute_reply":"2021-07-25T14:23:44.936749Z","shell.execute_reply.started":"2021-07-25T14:23:43.452156Z"},"trusted":true},"outputs":[],"source":["#train = pd.read_csv('datagrand_2021_train.csv')\r\n","test = pd.read_csv('datagrand_2021_test.csv')\r\n","\r\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["#对测试集进行tta\r\n","#如果是对训练集进行tta,就把下面这句删除\r\n","train=test"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["#先统计每个符号出现的位置\r\n","#然后再将所有符号换成英文逗号\r\n","#最后再根据逗号进行还原\r\n","fuhao=[',','!','。','?']\r\n","tmp=train.text.tolist()\r\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["totalFuhao=[]\n","for text in tmp:\n"," tF=[]\n"," t=text.split()\n"," for j in t:\n"," if j in fuhao:\n"," tF.append(j)\n"," # print(\"ok\")\n"," 
totalFuhao.append(tF)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["print(len(totalFuhao))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def getClean(document):\r\n"," text = str(document)\r\n"," text = text.replace(',', ',')\r\n"," text = text.replace('!', ',')\r\n"," text = text.replace('?', ',')\r\n"," text = text.replace('。', ',')\r\n"," return text"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["def suffer(document):\r\n"," text=str(document)\r\n"," t=text.split(',')\r\n"," newT=t[::-1]\r\n"," return \" , \".join(newT)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["train['text']=train['text'].apply(lambda x:getClean(x))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["#句子逆序\r\n","train['text']=train['text'].apply(lambda x: suffer(x))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["#符号还原\r\n","def tranform(df):\r\n"," ixd=0\r\n"," totaldx=0\r\n"," ans=[]\r\n"," for text in df:\r\n"," arr=[]\r\n"," dinx=0\r\n"," t=text.split()\r\n"," if ixd==0:\r\n"," print(t)\r\n"," for j in t:\r\n"," if j==',':\r\n"," arr.append(totalFuhao[totaldx][dinx])\r\n"," dinx+=1\r\n"," else :\r\n"," arr.append(j)\r\n"," ixd+=1\r\n"," totaldx+=1\r\n"," if ixd==1 :\r\n"," print(\" \".join(arr))\r\n"," ans.append(\" \".join(arr))\r\n"," return ans\r\n"," "]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["#将倒序后的句子进行符号还原\r\n","newText=train['text'].tolist()\r\n","neT=tranform(newText)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["train['text']=neT"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["train.to_csv(\"./ttatest.csv\",index=False)"]}],"metadata":{"interpreter":{"hash":"2fc9f0689f2f32664301ce51aaed3853cc1802bb7b4d4a74b41993575fbadbc0"},"kernelspec":{"display_name":"Python 3.6.13 64-bit ('tf2': conda)","name":"python3"},"language_info":{"name":"python","version":""}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/Config.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 2 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ 3 | RobertaTokenizer, RobertaModel, RobertaConfig 4 | from NEZHA.modeling_nezha import NeZhaModel 5 | from NEZHA.configuration_nezha import NeZhaConfig 6 | 7 | 8 | MODELS = { 9 | 'BertForClass': BertModel, 10 | 'BertLstm': BertModel, 11 | 'BertForClass_MultiDropout': BertModel, 12 | 'BertLastTwoCls': BertModel, 13 | 'BertLastCls':BertModel, 14 | 'BertLastTwoClsPooler': BertModel, 15 | 'BertLastTwoEmbeddings': BertModel, 16 | 'BertLastTwoEmbeddingsPooler': BertModel, 17 | 'BertLastFourCls': BertModel, 18 | 'BertLastFourClsPooler': BertModel, 19 | 'BertLastFourEmbeddings': BertModel, 20 | 'BertLastFourEmbeddingsPooler': BertModel, 21 | 'BertDynCls': BertModel, 22 | 'BertDynEmbeddings': BertModel, 23 | 'BertRNN': BertModel, 24 | 'BertCNN': XLNetModel, 25 | 'BertRCNN': BertModel, 26 | 'XLNet': XLNetModel, 27 | 'Electra': ElectraModel, 28 | 'NEZHA': NeZhaModel 29 | } 30 | 31 | TOKENIZERS = { 32 | 'BertForClass': BertTokenizer, 33 | 'BertLstm': BertTokenizer, 34 | 
'BertForClass_MultiDropout': BertTokenizer, 35 | 'BertLastTwoCls': BertTokenizer, 36 | 'BertLastCls': BertTokenizer, 37 | 'BertLastTwoClsPooler': BertTokenizer, 38 | 'BertLastTwoEmbeddings': BertTokenizer, 39 | 'BertLastTwoEmbeddingsPooler': BertTokenizer, 40 | 'BertLastFourCls': BertTokenizer, 41 | 'BertLastFourClsPooler': BertTokenizer, 42 | 'BertLastFourEmbeddings': BertTokenizer, 43 | 'BertLastFourEmbeddingsPooler': BertTokenizer, 44 | 'BertDynCls': BertTokenizer, 45 | 'BertDynEmbeddings': BertTokenizer, 46 | 'BertRNN': BertTokenizer, 47 | 'BertCNN': BertTokenizer, 48 | 'BertRCNN': BertTokenizer, 49 | 'XLNet': XLNetTokenizer, 50 | 'Electra': ElectraTokenizer, 51 | 'NEZHA': BertTokenizer 52 | } 53 | 54 | CONFIGS = { 55 | 'BertForClass': BertConfig, 56 | 'BertLstm': BertConfig, 57 | 'BertForClass_MultiDropout': BertConfig, 58 | 'BertLastTwoCls': BertConfig, 59 | 'BertLastCls': BertConfig, 60 | 'BertLastTwoClsPooler': BertConfig, 61 | 'BertLastTwoEmbeddings': BertConfig, 62 | 'BertLastTwoEmbeddingsPooler': BertConfig, 63 | 'BertLastFourCls': BertConfig, 64 | 'BertLastFourClsPooler': BertConfig, 65 | 'BertLastFourEmbeddings': BertConfig, 66 | 'BertLastFourEmbeddingsPooler': BertConfig, 67 | 'BertDynCls': BertConfig, 68 | 'BertDynEmbeddings': BertConfig, 69 | 'BertRNN': BertConfig, 70 | 'BertCNN': BertConfig, 71 | 'BertRCNN': BertConfig, 72 | 'XLNet': XLNetConfig, 73 | 'Electra': ElectraConfig, 74 | 'NEZHA': NeZhaConfig 75 | 76 | } 77 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/NEZHA/configuration_nezha.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import PretrainedConfig 3 | 4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 5 | 6 | class NeZhaConfig(PretrainedConfig): 7 | r""" 8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 11 | the ALBERT `xxlarge `__ architecture. 12 | 13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 15 | for more information. 16 | 17 | 18 | Args: 19 | vocab_size (:obj:`int`, optional, defaults to 30000): 20 | Vocabulary size of the ALBERT model. Defines the different tokens that 21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 22 | embedding_size (:obj:`int`, optional, defaults to 128): 23 | Dimensionality of vocabulary embeddings. 24 | hidden_size (:obj:`int`, optional, defaults to 4096): 25 | Dimensionality of the encoder layers and the pooler layer. 26 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 27 | Number of hidden layers in the Transformer encoder. 28 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 29 | Number of groups for the hidden layers, parameters in the same group are shared. 30 | num_attention_heads (:obj:`int`, optional, defaults to 64): 31 | Number of attention heads for each attention layer in the Transformer encoder. 32 | intermediate_size (:obj:`int`, optional, defaults to 16384): 33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
34 | inner_group_num (:obj:`int`, optional, defaults to 1): 35 | The number of inner repetition of attention and ffn. 36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 37 | The non-linear activation function (function or string) in the encoder and pooler. 38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 42 | The dropout ratio for the attention probabilities. 43 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 44 | The maximum sequence length that this model might ever be used with. Typically set this to something 45 | large (e.g., 512 or 1024 or 2048). 46 | type_vocab_size (:obj:`int`, optional, defaults to 2): 47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 48 | initializer_range (:obj:`float`, optional, defaults to 0.02): 49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 51 | The epsilon used by the layer normalization layers. 52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 53 | The dropout ratio for attached classifiers. 54 | 55 | Example:: 56 | 57 | from transformers import AlbertConfig, AlbertModel 58 | # Initializing an ALBERT-xxlarge style configuration 59 | albert_xxlarge_configuration = AlbertConfig() 60 | 61 | # Initializing an ALBERT-base style configuration 62 | albert_base_configuration = AlbertConfig( 63 | hidden_size=768, 64 | num_attention_heads=12, 65 | intermediate_size=3072, 66 | ) 67 | 68 | # Initializing a model from the ALBERT-base style configuration 69 | model = AlbertModel(albert_xxlarge_configuration) 70 | 71 | # Accessing the model configuration 72 | configuration = model.config 73 | 74 | Attributes: 75 | pretrained_config_archive_map (Dict[str, str]): 76 | A dictionary containing all the available pre-trained checkpoints. 
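        NeZha example (a minimal sketch: it assumes the local ``NEZHA`` package used by the
        finetuning scripts in this repo, i.e. ``NEZHA.configuration_nezha`` and
        ``NEZHA.modeling_nezha``, and that ``NeZhaModel`` follows the usual config-based
        constructor; the defaults above were inherited from the ALBERT docstring, so pass
        explicit sizes or load the ``nezha_model/config.json`` shipped under
        ``Nezha_pytorch/pretrain`` instead)::

            from NEZHA.configuration_nezha import NeZhaConfig
            from NEZHA.modeling_nezha import NeZhaModel

            # A BERT-base style NeZha configuration with relative position encoding.
            # The sizes below are illustrative, not the class defaults.
            nezha_base_configuration = NeZhaConfig(
                hidden_size=768,
                num_hidden_layers=12,
                num_attention_heads=12,
                intermediate_size=3072,
                max_relative_position=64,
                use_relative_position=True,
            )

            # Initializing an (untrained) model from that configuration;
            # assumes NeZhaModel(config) mirrors the BertModel(config) interface.
            model = NeZhaModel(nezha_base_configuration)

            # Accessing the model configuration
            configuration = model.config
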
77 | """ 78 | 79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP 80 | model_type = "nezha" 81 | 82 | def __init__( 83 | self, 84 | vocab_size=30000, 85 | embedding_size=128, 86 | hidden_size=4096, 87 | num_hidden_layers=12, 88 | num_hidden_groups=1, 89 | num_attention_heads=64, 90 | intermediate_size=16384, 91 | inner_group_num=1, 92 | hidden_act="gelu_new", 93 | hidden_dropout_prob=0, 94 | attention_probs_dropout_prob=0, 95 | max_position_embeddings=512, 96 | max_relative_position=64, 97 | type_vocab_size=2, 98 | initializer_range=0.02, 99 | layer_norm_eps=1e-12, 100 | classifier_dropout_prob=0.1, 101 | use_relative_position=True, 102 | pad_token_id=0, 103 | bos_token_id=2, 104 | eos_token_id=3, 105 | **kwargs 106 | ): 107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 108 | 109 | self.vocab_size = vocab_size 110 | self.embedding_size = embedding_size 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_hidden_groups = num_hidden_groups 114 | self.num_attention_heads = num_attention_heads 115 | self.inner_group_num = inner_group_num 116 | self.hidden_act = hidden_act 117 | self.intermediate_size = intermediate_size 118 | self.hidden_dropout_prob = hidden_dropout_prob 119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 120 | self.max_position_embeddings = max_position_embeddings 121 | self.max_relative_position = max_relative_position 122 | self.type_vocab_size = type_vocab_size 123 | self.initializer_range = initializer_range 124 | self.layer_norm_eps = layer_norm_eps 125 | self.use_relative_position=use_relative_position 126 | self.classifier_dropout_prob = classifier_dropout_prob 127 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/ensemble_10fold.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn.preprocessing import StandardScaler 3 | from utils import * 4 | from model import * 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold, KFold 7 | from NEZHA.modeling_nezha import * 8 | from tqdm import tqdm, trange 9 | import numpy as np 10 | import pandas as pd 11 | import torch 12 | import random 13 | import os 14 | import re 15 | from tqdm import tqdm 16 | tqdm.pandas() 17 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 18 | random.seed(2021) 19 | np.random.seed(2021) 20 | torch.manual_seed(2021) 21 | torch.cuda.manual_seed_all(2021) 22 | 23 | MODEL_CLASSES = { 24 | 'BertForClass': BertForClass, 25 | 'BertLastCls': BertLastCls, 26 | 'BertLastTwoCls': BertLastTwoCls, 27 | 'BertLastTwoClsPooler': BertLastTwoClsPooler, 28 | 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 29 | 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 30 | 'BertLastFourCls': BertLastFourCls, 31 | 'BertLastFourClsPooler': BertLastFourClsPooler, 32 | 'BertLastFourEmbeddings': BertLastFourEmbeddings, 33 | 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 34 | 'BertDynCls': BertDynCls, 35 | 'BertDynEmbeddings': BertDynEmbeddings, 36 | 'BertRNN': BertRNN, 37 | 'BertCNN': BertCNN, 38 | 'BertRCNN': BertRCNN, 39 | 'XLNet': XLNet, 40 | 'Electra': Electra, 41 | 'NEZHA': NEZHA, 42 | } 43 | 44 | class Config: 45 | def __init__(self): 46 | # 预训练模型路径 47 | self.modelId = 2 48 | self.model = "BertForClass" 49 | self.Stratification = False 50 | 
self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Nezha_pytorch/nezha_model/' 51 | 52 | self.num_class = 35 53 | self.dropout = 0.2 54 | self.MAX_LEN = 100 55 | self.epoch = 6 56 | self.learn_rate = 2e-5 57 | self.normal_lr = 1e-4 58 | self.batch_size = 64 59 | self.k_fold = 10 60 | self.seed = 42 61 | self.device = torch.device('cuda') 62 | # self.device = torch.device('cpu') 63 | 64 | self.focalloss = False 65 | self.pgd = False 66 | self.fgm = True 67 | 68 | 69 | def preprocess_text(document): 70 | 71 | # 删除逗号 72 | text = str(document) 73 | text = text.replace(',', '') 74 | text = text.replace('!', '') 75 | text = text.replace('17281', '') 76 | # 用单个空格替换多个空格 77 | text = re.sub(r'\s+', ' ', text, flags=re.I) 78 | return text 79 | 80 | 81 | def build_data(): 82 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 83 | test_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv' 84 | train = pd.read_csv(train_clean) 85 | test = pd.read_csv(test_clean) 86 | train["text"].progress_apply(lambda x: preprocess_text(x)) 87 | test["text"].progress_apply(lambda x: preprocess_text(x)) 88 | id2label = list(train['label'].unique()) 89 | test_dataset = [] 90 | for i in tqdm(range(len(test))): 91 | test_dict = {} 92 | test_dict['text'] = test.loc[i, 'text'] 93 | test_dict['label'] = [-1]*35 94 | test_dataset.append(test_dict) 95 | return test_dataset, test, id2label 96 | 97 | 98 | def pre_ensemble(models_path, test_df, test_dataset,submit_name): 99 | config = Config() 100 | ensemble_list = [] 101 | for i, models_path in enumerate(path_li): 102 | print("正在测试{}模型".format(models_path)) 103 | test_prelist = [] 104 | for fold in range(10): 105 | print("[{}/10]fold".format(fold+1)) 106 | test_D = data_generator(test_dataset, config) 107 | # 每个模型的 108 | PATH = './{}/bert_{}.pth'.format(models_path,fold) 109 | model = torch.load(PATH) 110 | model.eval() 111 | with torch.no_grad(): 112 | test_logit = None 113 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 114 | y_pred = model(input_ids, input_masks, segment_ids) 115 | y_pred = F.softmax(y_pred, dim=1) 116 | y_pred = y_pred.detach().to("cpu").numpy() 117 | if test_logit is None: 118 | test_logit = y_pred 119 | else: 120 | test_logit = np.vstack((test_logit, y_pred)) 121 | test_prelist.append(test_logit) 122 | test_pre = np.sum(np.array(test_prelist), axis=0) / (np.array(test_prelist).shape[0]) 123 | ensemble_list.append(test_pre) 124 | test_preds = np.sum(np.array(ensemble_list), axis=0) / (np.array(ensemble_list).shape[0]) 125 | test_preds = np.argmax(test_preds, axis=1) 126 | pred_labels = [id2label[i] for i in test_preds] 127 | SUBMISSION_DIR = "submit" 128 | if not os.path.exists(SUBMISSION_DIR): 129 | os.makedirs(SUBMISSION_DIR) 130 | Name = "{}-ensemble".format(submit_name) 131 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 132 | 133 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 134 | submit_file, index=False) 135 | 136 | print() 137 | 138 | 139 | if __name__ == "__main__": 140 | 141 | path_li = ["models-bertlstm-5986", "models_bertlast4", "models-bertfor-5904"] 142 | test_dataset, test, id2label = build_data() 143 | pre_ensemble(path_li, test, test_dataset, "-".join(path_li)) 144 | 145 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/ensemble_single.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | 
from tqdm import tqdm, trange 3 | import numpy as np 4 | import pandas as pd 5 | import logging 6 | import torch 7 | import random 8 | import os 9 | from torch import nn, optim 10 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 11 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 12 | from transformers.optimization import get_linear_schedule_with_warmup 13 | from sklearn.model_selection import StratifiedKFold, KFold 14 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 15 | from model import * 16 | from utils import * 17 | import time 18 | from tqdm import tqdm 19 | import re 20 | import json 21 | tqdm.pandas() 22 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 23 | random.seed(123) 24 | np.random.seed(123) 25 | torch.manual_seed(123) 26 | torch.cuda.manual_seed_all(123) 27 | 28 | 29 | class Config: 30 | def __init__(self): 31 | # 预训练模型路径 32 | self.modelId = 2 33 | self.model = "BertLstm" 34 | self.Stratification = False 35 | # '/Bert_pytorch/bert_model_800/' 36 | self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Bert_pytorch/bert_model_1000/' 37 | self.num_class = 35 38 | self.dropout = 0.2 39 | self.MAX_LEN = 100 40 | self.epoch = 6 41 | self.learn_rate = 2e-5 42 | self.normal_lr = 1e-4 43 | self.batch_size = 64 44 | self.k_fold = 10 45 | self.seed = 42 46 | 47 | self.device = torch.device('cuda') 48 | # self.device = torch.device('cpu') 49 | 50 | self.focalloss = False 51 | self.pgd = False 52 | self.fgm = True 53 | 54 | 55 | class Config2: 56 | def __init__(self): 57 | # 预训练模型路径 58 | self.modelId = 2 59 | self.model = "BertLstm" 60 | self.Stratification = False 61 | # '/Bert_pytorch/bert_model_800/' 62 | self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Bert_pytorch/bert_model_650/' 63 | self.num_class = 35 64 | self.dropout = 0.2 65 | self.MAX_LEN = 100 66 | self.epoch = 6 67 | self.learn_rate = 2e-5 68 | self.normal_lr = 1e-4 69 | self.batch_size = 64 70 | self.k_fold = 10 71 | self.seed = 42 72 | 73 | self.device = torch.device('cuda') 74 | # self.device = torch.device('cpu') 75 | 76 | self.focalloss = False 77 | self.pgd = False 78 | self.fgm = True 79 | 80 | def preprocess_text(document): 81 | 82 | # 删除逗号 83 | text = str(document) 84 | text = text.replace(',', '') 85 | text = text.replace('!', '') 86 | text = text.replace('?', '') 87 | text = text.replace('。', '') 88 | # 用单个空格替换多个空格 89 | text = re.sub(r'\s+', ' ', text, flags=re.I) 90 | return text 91 | 92 | 93 | def preprocess_text2(document): 94 | # 删除逗号 95 | text = str(document) 96 | text = text.replace(',', '35001') 97 | text = text.replace('!', '35002') 98 | text = text.replace('?', '35003') 99 | text = text.replace('。', '35004') 100 | test = text.replace(',', '35001') 101 | # text = text.replace('17281', '') 102 | # 用单个空格替换多个空格 103 | text = re.sub(r'\s+', ' ', text, flags=re.I) 104 | 105 | return text 106 | def ensemble(pred, test_df, id2label,models): 107 | test_preds = np.sum(np.array(pred), axis=0) / (np.array(pred).shape[0]) 108 | test_preds = np.argmax(test_preds, axis=1) 109 | pred_labels = [id2label[i] for i in test_preds] 110 | SUBMISSION_DIR = "submit" 111 | if not os.path.exists(SUBMISSION_DIR): 112 | os.makedirs(SUBMISSION_DIR) 113 | Name = "{}_ensemble".format(models) 114 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 115 | 116 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv(submit_file, index=False) 117 | 118 | 119 | def ensemble_TTA(pred, 
pred_tta,test_df, id2label, models): 120 | test_pred= np.sum(np.array(pred), axis=0) / (np.array(pred).shape[0]) 121 | tta_pred = np.sum(np.array(pred_tta), axis=0) / (np.array(pred_tta).shape[0]) 122 | 123 | total = [] 124 | for i in range(tta_pred.shape[0]): 125 | t = test_pred[i]+tta_pred[i] 126 | total.append(t) 127 | all_preds = np.array(total) 128 | test_preds = np.argmax(all_preds, axis=1) 129 | pred_labels = [id2label[i] for i in test_preds] 130 | SUBMISSION_DIR = "submit" 131 | if not os.path.exists(SUBMISSION_DIR): 132 | os.makedirs(SUBMISSION_DIR) 133 | Name = "{}_ensemble".format(models) 134 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 135 | 136 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 137 | submit_file, index=False) 138 | 139 | 140 | def build_data(): 141 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 142 | test_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv' 143 | train = pd.read_csv(train_clean) 144 | test = pd.read_csv(test_clean) 145 | train["text"].progress_apply(lambda x: preprocess_text(x)) 146 | test["text"].progress_apply(lambda x: preprocess_text(x)) 147 | id2label = list(train['label'].unique()) 148 | test_dataset = [] 149 | for i in tqdm(range(len(test))): 150 | test_dict = {} 151 | test_dict['text'] = test.loc[i, 'text'] 152 | test_dict['label'] = [-1]*35 153 | test_dataset.append(test_dict) 154 | 155 | test_2 = pd.read_csv(test_clean) 156 | test_2["text"].progress_apply(lambda x: preprocess_text2(x)) 157 | test_dataset_2 = [] 158 | for i in tqdm(range(len(test))): 159 | test_dict = {} 160 | test_dict['text'] = test.loc[i, 'text'] 161 | test_dict['label'] = [-1]*35 162 | test_dataset_2.append(test_dict) 163 | return test_dataset, test_dataset_2, test, id2label 164 | def build_tta(): 165 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 166 | test_clean = '/media/mgege007/winType/DaGuan/data/ttatest.csv' 167 | train = pd.read_csv(train_clean) 168 | test = pd.read_csv(test_clean) 169 | train["text"].progress_apply(lambda x: preprocess_text(x)) 170 | test["text"].progress_apply(lambda x: preprocess_text(x)) 171 | id2label = list(train['label'].unique()) 172 | test_dataset = [] 173 | for i in tqdm(range(len(test))): 174 | test_dict = {} 175 | test_dict['text'] = test.loc[i, 'text'] 176 | test_dict['label'] = [-1]*35 177 | test_dataset.append(test_dict) 178 | 179 | test_2 = pd.read_csv(test_clean) 180 | test_2["text"].progress_apply(lambda x: preprocess_text2(x)) 181 | test_dataset_2 = [] 182 | for i in tqdm(range(len(test))): 183 | test_dict = {} 184 | test_dict['text'] = test.loc[i, 'text'] 185 | test_dict['label'] = [-1]*35 186 | test_dataset_2.append(test_dict) 187 | return test_dataset, test_dataset_2, test, id2label 188 | 189 | 190 | def pre_ensemble(model_li_1, model_li_2, test_dataset, test_dataset2): 191 | config = Config() 192 | config_2 = Config2() 193 | test_prelist = [] 194 | test_D = data_generator(test_dataset, config) 195 | test_D_2 = data_generator(test_dataset, config_2) 196 | for i,path in enumerate(model_li_1): 197 | # 每个模型的 198 | print("正在测试{}".format(path)) 199 | PATH = './ensemble_model/{}.pth'.format(path) 200 | model = torch.load(PATH) 201 | model.eval() 202 | n = 0 203 | with torch.no_grad(): 204 | train_logit = None 205 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 206 | print(n) 207 | n += 1 208 | y_pred = model(input_ids, input_masks, segment_ids) 209 | y_pred = F.softmax(y_pred, dim=1) 
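                # The per-batch softmax probabilities are moved to CPU and stacked
                # row-wise below, so after this loop `train_logit` is an
                # (n_test, num_class) probability matrix for the current checkpoint.
                # One such matrix per checkpoint is collected in `test_prelist` and
                # later averaged with equal weights in ensemble() / ensemble_TTA();
                # note that `test_prelist.append(train_logit)` occurs twice after the
                # second loop below, so the last model_li_2 checkpoint is effectively
                # counted twice in that average.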
210 | y_pred = y_pred.detach().to("cpu").numpy() 211 | if train_logit is None: 212 | train_logit = y_pred 213 | else: 214 | train_logit = np.vstack((train_logit, y_pred)) 215 | test_prelist.append(train_logit) 216 | for i, path in enumerate(model_li_2): 217 | # 每个模型的 218 | print("正在测试{}".format(path)) 219 | PATH = './ensemble_model/{}.pth'.format(path) 220 | model = torch.load(PATH) 221 | model.eval() 222 | n = 0 223 | with torch.no_grad(): 224 | train_logit = None 225 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D_2, disable=True): 226 | print(n) 227 | n += 1 228 | y_pred = model(input_ids, input_masks, segment_ids) 229 | y_pred = F.softmax(y_pred, dim=1) 230 | y_pred = y_pred.detach().to("cpu").numpy() 231 | if train_logit is None: 232 | train_logit = y_pred 233 | else: 234 | train_logit = np.vstack((train_logit, y_pred)) 235 | test_prelist.append(train_logit) 236 | test_prelist.append(train_logit) 237 | return test_prelist 238 | def submit(pred,pred2,test_df, id2label): 239 | 240 | test_preds_merge = np.sum(pred, axis=0) / (pred.shape[0]) 241 | test_pre_tensor = torch.tensor(test_preds_merge) 242 | test_preds_merge2 = np.sum(pred2, axis=0) / (pred2.shape[0]) 243 | test_pre_tensor2 = torch.tensor(test_preds_merge2) 244 | 245 | Len=len(test_preds_merge) 246 | total=[] 247 | print(Len) 248 | print(len(test_preds_merge2)) 249 | for i in range(Len): 250 | t=test_preds_merge[i]+test_preds_merge2[i] 251 | total.append(t) 252 | total=np.array(total) 253 | print(len(total)) 254 | test_pre_tensor3=torch.tensor(total) 255 | print(test_pre_tensor3[0]) 256 | test_pre = torch.max(test_pre_tensor3, 1)[1] 257 | pred_labels = [id2label[i] for i in test_pre] 258 | SUBMISSION_DIR = "submit" 259 | if not os.path.exists(SUBMISSION_DIR): 260 | os.makedirs(SUBMISSION_DIR) 261 | Name = "tta" 262 | submit_file = SUBMISSION_DIR+"/submit_{}.csv".format(Name) 263 | 264 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 265 | submit_file, index=False) 266 | # 不加TTA 267 | if __name__ == "__main__": 268 | 269 | model_li_1 = ["bertfor","bertlstm"] 270 | model_li_2 = ["model_0", "model_1", "model_2", "model_3"] 271 | # 不加TTA 272 | # test_dataset, test_dataset2, test, id2label = build_data() 273 | # test_prelist = pre_ensemble(model_li_1, model_li_2, test_dataset, test_dataset2) 274 | # ensemble(np.array(test_prelist), test, id2label, "3bert-4model-checkpoint") 275 | 276 | # 加入TTA 277 | test_dataset, test_dataset2, test, id2label = build_data() 278 | test_prelist_1 = pre_ensemble(model_li_1, model_li_2, test_dataset, test_dataset2) 279 | 280 | 281 | tta_dataset, tta_dataset2, test_tta, id2label = build_tta() 282 | test_prelist_2 = pre_ensemble(model_li_1, model_li_2, tta_dataset, tta_dataset2) 283 | ensemble_TTA(np.array(test_prelist_1),np.array(test_prelist_2), test, id2label, "TTA-3bert-4model-checkpoint") 284 | 285 | print() 286 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/generate_pseudo_label.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | from tqdm import tqdm, trange 3 | import numpy as np 4 | import pandas as pd 5 | import logging 6 | import torch 7 | import random 8 | import os 9 | from torch import nn, optim 10 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 11 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 12 | from transformers.optimization import 
get_linear_schedule_with_warmup 13 | from sklearn.model_selection import StratifiedKFold, KFold 14 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 15 | from model import * 16 | from utils import * 17 | import time 18 | from tqdm import tqdm 19 | import re 20 | import json 21 | tqdm.pandas() 22 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 23 | random.seed(123) 24 | np.random.seed(123) 25 | torch.manual_seed(123) 26 | torch.cuda.manual_seed_all(123) 27 | 28 | 29 | class Config: 30 | def __init__(self): 31 | # 预训练模型路径 32 | self.modelId = 2 33 | self.model = "BertLstm" 34 | self.Stratification = False 35 | # '/Bert_pytorch/bert_model_800/' 36 | self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Bert_pytorch/bert_model_1000/' 37 | 38 | self.num_class = 35 39 | self.dropout = 0.2 40 | self.MAX_LEN = 100 41 | self.epoch = 6 42 | self.learn_rate = 2e-5 43 | self.normal_lr = 1e-4 44 | self.batch_size = 256 45 | self.k_fold = 10 46 | self.seed = 42 47 | 48 | self.device = torch.device('cuda') 49 | # self.device = torch.device('cpu') 50 | 51 | self.focalloss = False 52 | self.pgd = False 53 | self.fgm = True 54 | 55 | 56 | def preprocess_text(document): 57 | 58 | # 删除逗号 59 | text = str(document) 60 | text = text.replace(',','') 61 | text = text.replace('!', '') 62 | text = text.replace('?', '') 63 | text = text.replace('。', '') 64 | # 用单个空格替换多个空格 65 | text = re.sub(r'\s+', ' ', text, flags=re.I) 66 | 67 | return text 68 | 69 | 70 | def generate_label(pred, test_df, id2label): 71 | test_pre = np.argmax(pred, axis=1) 72 | pred_labels = [id2label[i] for i in test_pre] 73 | SUBMISSION_DIR = "submit" 74 | if not os.path.exists(SUBMISSION_DIR): 75 | os.makedirs(SUBMISSION_DIR) 76 | Name = "pesudo_label_{}".format(len(test_df)) 77 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 78 | 79 | pd.DataFrame({"id": list(range(len(test_df))), "text": test_df['text'], "label": pred_labels}).to_csv( 80 | submit_file, index=False) 81 | 82 | 83 | def predict_label(pred, test_df, id2label): 84 | test_pre = np.argmax(pred, axis=1) 85 | pred_labels = [id2label[i] for i in test_pre] 86 | SUBMISSION_DIR = "submit" 87 | if not os.path.exists(SUBMISSION_DIR): 88 | os.makedirs(SUBMISSION_DIR) 89 | Name = "all_train_berforclass_{}".format(len(test_df)) 90 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 91 | 92 | pd.DataFrame({"id": list(range(len(test_df))), "text": test_df['text'], "label": pred_labels}).to_csv(submit_file, index=False) 93 | def merge(): 94 | train_f = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 95 | train_p = '/media/mgege007/winType/DaGuan/data/pesudo_label_100000.csv' 96 | 97 | train1 = pd.read_csv(train_f) 98 | train2 = pd.read_csv(train_p) 99 | pseudo_train_data = pd.concat([train1, train2]).reset_index().drop(columns=['index']).sample(frac=1) 100 | train_path = '/media/mgege007/winType/DaGuan/data/pesudo_label_data_{}.csv'.format(len(pseudo_train_data)) 101 | pseudo_train_data.to_csv(train_path,index=False) 102 | def pre_generate(): 103 | config = Config() 104 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 105 | train = pd.read_csv(train_clean) 106 | id2label = list(train['label'].unique()) 107 | test_dataset = [] 108 | path = "/media/mgege007/新加卷/Compition_data/datagrand_2021_unlabeled_data.json" 109 | num = 0 110 | test_text = [] 111 | with open(path, 'r', encoding='utf-8') as f: 112 | try: 113 | while True and num < 100000: 114 | line_data = f.readline() 115 | num += 1 116 | 
print(num) 117 | if line_data: 118 | data = json.loads(line_data) 119 | sentence_data = preprocess_text(data['title']+" "+data['content']).strip() 120 | test_dict = {} 121 | test_dict['text'] = sentence_data 122 | test_text.append(sentence_data) 123 | test_dict['label'] = [-1]*35 124 | test_dataset.append(test_dict) 125 | else: 126 | break 127 | except Exception as e: 128 | print(e) 129 | f.close() 130 | test_D = data_generator(test_dataset, config) 131 | PATH = './models/bertforclass.pth' 132 | model = torch.load(PATH) 133 | model.eval() 134 | n = 0 135 | with torch.no_grad(): 136 | train_logit = None 137 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 138 | print(n) 139 | n+=1 140 | y_pred = model(input_ids, input_masks, segment_ids) 141 | y_pred = F.softmax(y_pred, dim=1) 142 | y_pred = y_pred.detach().to("cpu").numpy() 143 | if train_logit is None: 144 | train_logit = y_pred 145 | else: 146 | train_logit = np.vstack((train_logit, y_pred)) 147 | 148 | generate_label(train_logit, test_text, id2label) 149 | 150 | 151 | def pre_predict(): 152 | config = Config() 153 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 154 | test_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv' 155 | train = pd.read_csv(train_clean) 156 | test = pd.read_csv(test_clean) 157 | test["text"].progress_apply(lambda x: preprocess_text(x)) 158 | id2label = list(train['label'].unique()) 159 | label2id = {id2label[i]: i for i in range(len(id2label))} 160 | test_dataset = [] 161 | for i in tqdm(range(len(test))): 162 | test_dict = {} 163 | test_dict['text'] = test.loc[i, 'text'] 164 | test_dict['label'] = [-1]*35 165 | test_dataset.append(test_dict) 166 | test_D = data_generator(test_dataset, config) 167 | PATH = './models/bertforclass.pth' 168 | model = torch.load(PATH) 169 | model.eval() 170 | n = 0 171 | with torch.no_grad(): 172 | train_logit = None 173 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 174 | print(n) 175 | n += 1 176 | y_pred = model(input_ids, input_masks, segment_ids) 177 | y_pred = F.softmax(y_pred, dim=1) 178 | y_pred = y_pred.detach().to("cpu").numpy() 179 | if train_logit is None: 180 | train_logit = y_pred 181 | else: 182 | train_logit = np.vstack((train_logit, y_pred)) 183 | 184 | predict_label(train_logit, test, id2label) 185 | if __name__ == "__main__": 186 | # pre_generate() 187 | # merge() 188 | # '/media/mgege007/winType/DaGuan/data/pesudo_label_data_114009.csv' 189 | pre_predict() 190 | 191 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/main_bert_all.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm, trange 2 | import numpy as np 3 | import pandas as pd 4 | import logging 5 | import torch 6 | import random 7 | import os 8 | import re 9 | from torch import nn, optim 10 | from torch.optim.optimizer import Optimizer 11 | from transformers import Adafactor 12 | from transformers.optimization import get_linear_schedule_with_warmup 13 | from sklearn.metrics import f1_score 14 | from model import * 15 | from utils import * 16 | import time 17 | import logging 18 | from tqdm import tqdm 19 | from torch.cuda import amp # 要求pytorch>=1.6 20 | from utils import * 21 | tqdm.pandas() 22 | from transformers.optimization import ( 23 | get_constant_schedule, 24 | get_constant_schedule_with_warmup, 25 | get_linear_schedule_with_warmup, 26 | 
get_cosine_schedule_with_warmup, 27 | get_cosine_with_hard_restarts_schedule_with_warmup, 28 | get_polynomial_decay_schedule_with_warmup, 29 | ) 30 | 31 | logging.basicConfig(level=logging.DEBUG, filename="train.log",filemode='a') 32 | 33 | from NEZHA.modeling_nezha import * 34 | MODEL_CLASSES = { 35 | 'BertForClass': BertForClass, 36 | 'BertLstm': BertLstm, 37 | 'BertLastCls': BertLastCls, 38 | 'BertLastTwoCls': BertLastTwoCls, 39 | 'BertLastTwoClsPooler': BertLastTwoClsPooler, 40 | 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 41 | 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 42 | 'BertLastFourCls': BertLastFourCls, 43 | 'BertLastFourClsPooler': BertLastFourClsPooler, 44 | 'BertLastFourEmbeddings': BertLastFourEmbeddings, 45 | 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 46 | 'BertDynCls': BertDynCls, 47 | 'BertDynEmbeddings': BertDynEmbeddings, 48 | 'BertRNN': BertRNN, 49 | 'BertCNN': BertCNN, 50 | 'BertRCNN': BertRCNN, 51 | 'XLNet': XLNet, 52 | 'Electra': Electra, 53 | 'NEZHA': NEZHA, 54 | 55 | } 56 | 57 | class Config: 58 | def __init__(self): 59 | # 预训练模型路径 60 | self.modelId = 2 61 | self.model = 'BertLastTwoClsPooler' # "BertForClass" 62 | self.Stratification = False 63 | self.model_path = 'Bert_pytorch/bert_model_1000/' 64 | 65 | self.num_class = 35 66 | self.dropout = 0.1 67 | self.MAX_LEN = 100 68 | self.epoch = 100 69 | self.learn_rate = 4e-5 70 | self.normal_lr = 1e-4 71 | self.batch_size = 32 72 | self.k_fold = 10 73 | self.seed = 42 74 | self.device = torch.device('cuda') 75 | self.optimizer = "AdamW" 76 | self.focalloss = False 77 | self.pgd = False 78 | self.fgm = False 79 | self.scheduler = "cosine_schedule_with_warmup" 80 | self.fp16 = True 81 | 82 | 83 | def preprocess_text(document): 84 | 85 | # 删除逗号 86 | text = str(document) 87 | text = text.replace(',', '') 88 | text = text.replace('!', '') 89 | text = text.replace('17281', '') 90 | # 用单个空格替换多个空格 91 | text = re.sub(r'\s+', ' ', text, flags=re.I) 92 | 93 | return text 94 | 95 | 96 | config = Config() 97 | os.environ['PYTHONHASHSEED']='0'#消除hash算法的随机性 98 | random.seed(config.seed) 99 | np.random.seed(config.seed) 100 | torch.manual_seed(config.seed) 101 | torch.cuda.manual_seed_all(config.seed) 102 | 103 | 104 | file_path = './log/' 105 | # 创建一个logger 106 | logger = logging.getLogger('mylogger') 107 | logger.setLevel(logging.DEBUG) 108 | 109 | 110 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 111 | # train_clean = '/media/mgege007/winType/DaGuan/data/pseudo_train_data_18330.csv' 112 | train = pd.read_csv(train_clean) 113 | # train = train[:1001] 114 | train["text"].progress_apply(lambda x: preprocess_text(x)) 115 | ylabel = [] 116 | id2label = list(train['label'].unique()) 117 | label2id = {id2label[i]: i for i in range(len(id2label))} 118 | y_train = np.zeros((len(train), len(id2label)), dtype=np.int8) 119 | train_dataset = [] 120 | for i in tqdm(range(len(train))): 121 | train_dict = {} 122 | train_dict['text'] = train.loc[i, 'text'] 123 | y_train[i][label2id[train.loc[i, 'label']]] = 1 124 | train_dict['label'] = y_train[i] 125 | ylabel.append(train.loc[i, 'label']) 126 | train_dataset.append(train_dict) 127 | scaler = amp.GradScaler() 128 | 129 | train_D = data_generator(train_dataset, config, shuffle=True) 130 | model = MODEL_CLASSES[config.model](config).to(config.device) 131 | 132 | if torch.cuda.device_count() > 1: 133 | print("Let's use", torch.cuda.device_count(), "GPUs!") 134 | model = torch.nn.DataParallel(model) 135 | 136 | 137 | if 
config.pgd: 138 | pgd = PGD(model) 139 | K = 3 140 | 141 | elif config.fgm: 142 | fgm = FGM(model) 143 | 144 | if config.focalloss: 145 | loss_fn = FocalLoss(config.num_class) 146 | else: 147 | loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 148 | 149 | 150 | num_train_steps = int(len(train) / config.batch_size * config.epoch) 151 | param_optimizer = list(model.named_parameters()) 152 | 153 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 154 | 155 | if config.Stratification: 156 | bert_params = [x for x in param_optimizer if 'bert' in x[0]] 157 | normal_params = [p for n, p in param_optimizer if 'bert' not in n] 158 | optimizer_parameters = [ 159 | {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 160 | {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, 161 | {'params': normal_params, 'lr': config.normal_lr}, 162 | ] 163 | else: 164 | optimizer_parameters = [ 165 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 166 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, 167 | ] 168 | adam_epsilon = 1e-6 169 | if config.optimizer == "AdamW": 170 | optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) 171 | elif config.optimizer == "lookahead": 172 | optimizer = AdamW(optimizer_parameters, lr=config.learn_rate, eps=adam_epsilon) 173 | optimizer = Lookahead(optimizer=optimizer, la_steps=5, la_alpha=0.6) 174 | 175 | elif config.optimizer == "Adafactor": 176 | optimizer = Adafactor( 177 | optimizer_parameters, 178 | lr=config.learn_rate, 179 | eps=(1e-30, 1e-3), 180 | clip_threshold=1.0, 181 | decay_rate=-0.8, 182 | beta1=None, 183 | weight_decay=0.0, 184 | scale_parameter = False, 185 | relative_step=False, 186 | warmup_init=False, 187 | ) 188 | warmup_steps = int(len(train) / config.batch_size / 2) 189 | if config.scheduler == "constant_schedule": 190 | scheduler = get_constant_schedule(optimizer) 191 | 192 | elif config.scheduler == "constant_schedule_with_warmup": 193 | scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps) 194 | elif config.scheduler == "linear_schedule_with_warmup": 195 | scheduler = get_linear_schedule_with_warmup( 196 | optimizer, 197 | num_warmup_steps=int(len(train) / config.batch_size / 2), 198 | num_training_steps=num_train_steps 199 | ) 200 | elif config.scheduler == "cosine_schedule_with_warmup": 201 | scheduler = get_cosine_schedule_with_warmup( 202 | optimizer, 203 | num_warmup_steps=warmup_steps, 204 | num_training_steps=num_train_steps, 205 | ) 206 | 207 | elif config.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": 208 | scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( 209 | optimizer, 210 | num_warmup_steps=warmup_steps, 211 | num_training_steps=num_train_steps, 212 | ) 213 | elif config.scheduler == "polynomial_decay_schedule_with_warmup": 214 | scheduler = get_polynomial_decay_schedule_with_warmup( 215 | optimizer, 216 | num_warmup_steps=warmup_steps, 217 | num_training_steps=num_train_steps, 218 | ) 219 | 220 | best_f1 = 0 221 | PATH = './{}/{}_all.pth'.format(config.model,config.model) 222 | save_model_path = './{}/'.format(config.model) 223 | if not os.path.exists(save_model_path): 224 | os.makedirs(save_model_path) 225 | score = [] 226 | train_len = 0 227 | loss_num = [] 228 | for e in range(config.epoch): 229 | 
print('\n------------epoch:{}------------'.format(e)) 230 | model.train() 231 | tq = tqdm(train_D,ncols=70,disable=True) 232 | last=time.time() 233 | for input_ids, input_masks, segment_ids, labels in tq: 234 | label_t = torch.tensor(labels, dtype=torch.float).to(config.device) 235 | if config.fp16: 236 | with amp.autocast(): 237 | y_pred = model(input_ids, input_masks, segment_ids) 238 | loss = loss_fn(y_pred, label_t) 239 | else: 240 | y_pred = model(input_ids, input_masks, segment_ids) 241 | loss = loss_fn(y_pred, label_t) 242 | loss = loss.mean() 243 | if config.fp16: 244 | scaler.scale(loss).backward() 245 | else: 246 | loss.backward() 247 | 248 | if config.pgd: 249 | pgd.backup_grad() 250 | # 对抗训练 251 | for t in range(K): 252 | pgd.attack(is_first_attack=(t == 0)) # 在embedding上添加对抗扰动, first attack时备份param.data 253 | if t != K - 1: 254 | model.zero_grad() 255 | else: 256 | pgd.restore_grad() 257 | y_pred = model(input_ids, input_masks, segment_ids) 258 | 259 | loss_adv = loss_fn(y_pred, label_t) 260 | loss_adv = loss_adv.mean() 261 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 262 | pgd.restore() # 恢复embedding参数 263 | 264 | elif config.fgm: 265 | # 对抗训练 266 | fgm.attack() # 在embedding上添加对抗扰动 267 | y_pred = model(input_ids, input_masks, segment_ids) 268 | loss_adv = loss_fn(y_pred, label_t) 269 | loss_adv = loss_adv.mean() 270 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 271 | fgm.restore() # 恢复embedding参数 272 | 273 | # 梯度下降,更新参数 274 | if config.fp16: 275 | scaler.unscale_(optimizer) 276 | scaler.step(optimizer) 277 | scaler.update() 278 | else: 279 | optimizer.step() 280 | scheduler.step() # Update learning rate schedule 281 | model.zero_grad() 282 | 283 | y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) 284 | label = np.argmax(labels, axis=1) 285 | score.append(f1_score(label, y_pred, average='macro')) 286 | loss_num.append(loss.item()) 287 | print(f"微调第{e}轮耗时:{time.time()-last}") 288 | torch.save(model.module if hasattr(model, "module") else model, PATH) 289 | print("train_loss={} train_f1={}".format(np.mean(loss_num), np.mean(score))) 290 | optimizer.zero_grad() 291 | 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/predict.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | from tqdm import tqdm, trange 3 | import numpy as np 4 | import pandas as pd 5 | import logging 6 | import torch 7 | import random 8 | import os 9 | from torch import nn, optim 10 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 11 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 12 | from transformers.optimization import get_linear_schedule_with_warmup 13 | from sklearn.model_selection import StratifiedKFold, KFold 14 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 15 | from model import * 16 | from utils import * 17 | import time 18 | from tqdm import tqdm 19 | import re 20 | tqdm.pandas() 21 | seed = 1949 22 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 23 | random.seed(seed) 24 | np.random.seed(seed) 25 | torch.manual_seed(seed) 26 | torch.cuda.manual_seed_all(seed) 27 | class Config: 28 | def __init__(self): 29 | # 预训练模型路径 30 | self.modelId = 2 31 | self.model = "BertLastFourCls" 32 | self.Stratification = False 33 | # '/Bert_pytorch/bert_model_800/' 34 | self.model_path = 
'/Bert_pytorch/bert_model_1000/' 35 | 36 | self.num_class = 35 37 | self.dropout = 0.2 38 | self.MAX_LEN = 100 39 | self.epoch = 6 40 | self.learn_rate = 2e-5 41 | self.normal_lr = 1e-4 42 | self.batch_size = 64 43 | self.k_fold = 10 44 | self.seed = 42 45 | 46 | self.device = torch.device('cuda') 47 | # self.device = torch.device('cpu') 48 | self.focalloss = False 49 | self.pgd = False 50 | self.fgm = True 51 | 52 | def preprocess_text(document): 53 | # 删除逗号 54 | text = str(document) 55 | text = text.replace(',', '') 56 | text = text.replace('!', '') 57 | text = text.replace('17281', '') 58 | # 用单个空格替换多个空格 59 | text = re.sub(r'\s+', ' ', text, flags=re.I) 60 | return text 61 | # 10fold的模型预测,生成提交文件 62 | def submit(pred, test_df, id2label): 63 | test_preds_merge = np.sum(pred, axis=0) / (pred.shape[0]) 64 | test_pre_tensor = torch.tensor(test_preds_merge) 65 | test_pre = torch.max(test_pre_tensor, 1)[1] 66 | pred_labels = [id2label[i] for i in test_pre] 67 | SUBMISSION_DIR = "submit" 68 | if not os.path.exists(SUBMISSION_DIR): 69 | os.makedirs(SUBMISSION_DIR) 70 | Name = "Bert_single_alltrain" 71 | submit_file = SUBMISSION_DIR+"/submit_{}.csv".format(Name) 72 | 73 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 74 | submit_file, index=False) 75 | 76 | # 单个模型的预测,生成提交文件 77 | def submit_single(pred, test_df, id2label): 78 | test_pre = np.argmax(pred,axis=1) 79 | pred_labels = [id2label[i] for i in test_pre] 80 | SUBMISSION_DIR = "submit" 81 | if not os.path.exists(SUBMISSION_DIR): 82 | os.makedirs(SUBMISSION_DIR) 83 | Name = "Bert_single_alltrain" 84 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 85 | 86 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 87 | submit_file, index=False) 88 | # 10fold的模型预测 89 | def fold10_predict(): 90 | config = Config() 91 | # train_clean = '/data/datagrand_2021_train.csv' 92 | train_clean = 'data/datagrand_2021_train.csv' 93 | test_clean = 'data/datagrand_2021_test.csv' 94 | train = pd.read_csv(train_clean) 95 | test = pd.read_csv(test_clean) 96 | test["text"].progress_apply(lambda x: preprocess_text(x)) 97 | id2label = list(train['label'].unique()) 98 | label2id = {id2label[i]: i for i in range(len(id2label))} 99 | test_dataset = [] 100 | for i in tqdm(range(len(test))): 101 | test_dict = {} 102 | test_dict['text'] = test.loc[i, 'text'] 103 | test_dict['label'] = [-1]*35 104 | test_dataset.append(test_dict) 105 | test_D = data_generator(test_dataset, config) 106 | model_pre = [] 107 | for fold in tqdm(range(config.k_fold)): 108 | PATH = './models/bert_{}.pth'.format(fold) 109 | #model = MODEL_CLASSES[config.model](config).to(config.device) 110 | model = torch.load(PATH) 111 | model.eval() 112 | with torch.no_grad(): 113 | y_p = [] 114 | y_l = [] 115 | val_y = [] 116 | train_logit = None 117 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 118 | y_pred = model(input_ids, input_masks, segment_ids) 119 | y_pred = F.softmax(y_pred,dim=1) 120 | y_pred = y_pred.detach().to("cpu").numpy() 121 | if train_logit is None: 122 | train_logit = y_pred 123 | else: 124 | train_logit = np.vstack((train_logit, y_pred)) 125 | model_pre.append(train_logit) 126 | 127 | submit(np.array(model_pre), test, id2label) 128 | 129 | # 单个模型的预测 130 | def single_predict(): 131 | config = Config() 132 | # train_clean = '/data/datagrand_2021_train.csv' 133 | train_clean = 'data/datagrand_2021_train.csv' 134 | test_clean = 'data/datagrand_2021_test.csv' 135 | train = pd.read_csv(train_clean) 136 | test = 
pd.read_csv(test_clean) 137 | test["text"].progress_apply(lambda x: preprocess_text(x)) 138 | id2label = list(train['label'].unique()) 139 | label2id = {id2label[i]: i for i in range(len(id2label))} 140 | test_dataset = [] 141 | for i in tqdm(range(len(test))): 142 | test_dict = {} 143 | test_dict['text'] = test.loc[i, 'text'] 144 | test_dict['label'] = [-1]*35 145 | test_dataset.append(test_dict) 146 | test_D = data_generator(test_dataset, config) 147 | model_pre = [] 148 | 149 | PATH = './models/bertforclass.pth' 150 | model = torch.load(PATH) 151 | model.eval() 152 | with torch.no_grad(): 153 | train_logit = None 154 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 155 | y_pred = model(input_ids, input_masks, segment_ids) 156 | y_pred = F.softmax(y_pred, dim=1) 157 | y_pred = y_pred.detach().to("cpu").numpy() 158 | if train_logit is None: 159 | train_logit = y_pred 160 | else: 161 | train_logit = np.vstack((train_logit, y_pred)) 162 | 163 | submit_single(train_logit, test, id2label) 164 | if __name__=="__main__": 165 | # 不划分验证集模型预测 166 | single_predict() 167 | # 划分验证集,10fold的模型预测 168 | fold10_predict() 169 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/predict_tta.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | from tqdm import tqdm, trange 3 | import numpy as np 4 | import pandas as pd 5 | import logging 6 | import torch 7 | import random 8 | import os 9 | from torch import nn, optim 10 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 11 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 12 | from transformers.optimization import get_linear_schedule_with_warmup 13 | from sklearn.model_selection import StratifiedKFold, KFold 14 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 15 | from model import * 16 | from utils import * 17 | import time 18 | from tqdm import tqdm 19 | import re 20 | import json 21 | tqdm.pandas() 22 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 23 | random.seed(123) 24 | np.random.seed(123) 25 | torch.manual_seed(123) 26 | torch.cuda.manual_seed_all(123) 27 | 28 | 29 | class Config: 30 | def __init__(self): 31 | # 预训练模型路径 32 | self.modelId = 2 33 | self.model = "BertLstm" 34 | self.Stratification = False 35 | # '/Bert_pytorch/bert_model_800/' 36 | self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Bert_pytorch/bert_model_1000/' 37 | 38 | self.num_class = 35 39 | self.dropout = 0.2 40 | self.MAX_LEN = 100 41 | self.epoch = 6 42 | self.learn_rate = 2e-5 43 | self.normal_lr = 1e-4 44 | self.batch_size = 64 45 | self.k_fold = 10 46 | self.seed = 42 47 | 48 | self.device = torch.device('cuda') 49 | # self.device = torch.device('cpu') 50 | 51 | self.focalloss = False 52 | self.pgd = False 53 | self.fgm = True 54 | 55 | 56 | def preprocess_text(document): 57 | 58 | # 删除逗号 59 | text = str(document) 60 | text = text.replace(',', '') 61 | text = text.replace('!', '') 62 | text = text.replace('?', '') 63 | text = text.replace('。', '') 64 | # 用单个空格替换多个空格 65 | text = re.sub(r'\s+', ' ', text, flags=re.I) 66 | 67 | return text 68 | 69 | 70 | def ensemble(pred,tta_pred, test_df, id2label, models): 71 | total = [] 72 | for i in range(pred.shape[0]): 73 | t = pred[i]+tta_pred[i] 74 | total.append(t) 75 | test_preds = np.argmax(np.array(total), axis=1) 76 | pred_labels = 
[id2label[i] for i in test_preds] 77 | SUBMISSION_DIR = "submit" 78 | if not os.path.exists(SUBMISSION_DIR): 79 | os.makedirs(SUBMISSION_DIR) 80 | Name = "{}_ensemble".format(models) 81 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 82 | 83 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 84 | submit_file, index=False) 85 | 86 | 87 | def build_data(): 88 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 89 | test_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv' 90 | tta_clean = '/media/mgege007/winType/DaGuan/data/tta_test.csv' 91 | train = pd.read_csv(train_clean) 92 | test = pd.read_csv(test_clean) 93 | tta = pd.read_csv(tta_clean) 94 | train["text"].progress_apply(lambda x: preprocess_text(x)) 95 | test["text"].progress_apply(lambda x: preprocess_text(x)) 96 | tta["text"].progress_apply(lambda x: preprocess_text(x)) 97 | id2label = list(train['label'].unique()) 98 | test_dataset = [] 99 | for i in tqdm(range(len(test))): 100 | test_dict = {} 101 | test_dict['text'] = test.loc[i, 'text'] 102 | test_dict['label'] = [-1]*35 103 | test_dataset.append(test_dict) 104 | tta_dataset = [] 105 | for i in tqdm(range(len(tta))): 106 | test_dict = {} 107 | test_dict['text'] = tta.loc[i, 'text'] 108 | test_dict['label'] = [-1]*35 109 | tta_dataset.append(test_dict) 110 | return test_dataset, tta_dataset, test, id2label 111 | 112 | 113 | def pre_ensemble(model_li, test_dataset): 114 | config = Config() 115 | test_D = data_generator(test_dataset, config) 116 | test_li =[] 117 | for i, path in enumerate(model_li): 118 | # 每个模型的 119 | print("正在测试{}".format(path)) 120 | PATH = './models/{}.pth'.format(path) 121 | model = torch.load(PATH) 122 | model.eval() 123 | n = 0 124 | with torch.no_grad(): 125 | train_logit = None 126 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 127 | print(n) 128 | n += 1 129 | y_pred = model(input_ids, input_masks, segment_ids) 130 | y_pred = F.softmax(y_pred, dim=1) 131 | y_pred = y_pred.detach().to("cpu").numpy() 132 | if train_logit is None: 133 | train_logit = y_pred 134 | else: 135 | train_logit = np.vstack((train_logit, y_pred)) 136 | test_li.append(train_logit) 137 | test_preds = np.sum(np.array(test_li), axis=0) / (np.array(test_li).shape[0]) 138 | 139 | return test_preds 140 | 141 | 142 | if __name__ == "__main__": 143 | # checkpoint 融合 144 | model_li = ["bertforclass", "nezha_all", "nezhalarge_all", "bertlastcls_all", 145 | "bertlastfourcls_all", "bertlasttwoclspooler_all", "bertlstm"] 146 | 147 | test_dataset, tta_dataset, test, id2label = build_data() 148 | test_arr = pre_ensemble(model_li, test_dataset) 149 | tta_arr = pre_ensemble(model_li, tta_dataset) 150 | # TTA 测试集数据增强 151 | ensemble(test_arr, tta_arr, test, id2label, "-".join(model_li)+"TTA") 152 | 153 | print() 154 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/stacking.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn.preprocessing import StandardScaler 3 | from utils import * 4 | from model import * 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold, KFold 7 | from NEZHA.modeling_nezha import * 8 | from tqdm import tqdm, trange 9 | import numpy as np 10 | import pandas as pd 11 | import torch 12 | import random 13 | import os 14 | import re 15 | from tqdm import tqdm 16 | 
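# Stacking overview: pre_stacking() reloads each base model's 10-fold checkpoints and
# saves their softmax probabilities on the fold validation splits (out-of-fold features)
# and on the test set as .npy files; stacking() then hstacks those probabilities per
# model, standardizes them, and fits a LogisticRegression meta-learner with 5-fold CV
# before averaging its test predictions into the final submission file.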
tqdm.pandas() 17 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 18 | random.seed(2021) 19 | np.random.seed(2021) 20 | torch.manual_seed(2021) 21 | torch.cuda.manual_seed_all(2021) 22 | 23 | MODEL_CLASSES = { 24 | 'BertForClass': BertForClass, 25 | 'BertLastCls': BertLastCls, 26 | 'BertLastTwoCls': BertLastTwoCls, 27 | 'BertLastTwoClsPooler': BertLastTwoClsPooler, 28 | 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 29 | 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 30 | 'BertLastFourCls': BertLastFourCls, 31 | 'BertLastFourClsPooler': BertLastFourClsPooler, 32 | 'BertLastFourEmbeddings': BertLastFourEmbeddings, 33 | 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 34 | 'BertDynCls': BertDynCls, 35 | 'BertDynEmbeddings': BertDynEmbeddings, 36 | 'BertRNN': BertRNN, 37 | 'BertCNN': BertCNN, 38 | 'BertRCNN': BertRCNN, 39 | 'XLNet': XLNet, 40 | 'Electra': Electra, 41 | 'NEZHA': NEZHA, 42 | 43 | } 44 | 45 | 46 | class Config: 47 | def __init__(self): 48 | # 预训练模型路径 49 | self.modelId = 2 50 | self.model = "BertForClass" 51 | self.Stratification = False 52 | self.model_path = '/media/mgege007/winType/DaGuan/Pytorch-pretrain/Nezha_pytorch/nezha_model/' 53 | 54 | self.num_class = 35 55 | self.dropout = 0.2 56 | self.MAX_LEN = 100 57 | self.epoch = 6 58 | self.learn_rate = 2e-5 59 | self.normal_lr = 1e-4 60 | self.batch_size = 512 61 | self.k_fold = 10 62 | self.seed = 42 63 | 64 | self.device = torch.device('cuda') 65 | # self.device = torch.device('cpu') 66 | 67 | self.focalloss = False 68 | self.pgd = False 69 | self.fgm = True 70 | 71 | 72 | def preprocess_text(document): 73 | 74 | # 删除逗号 75 | text = str(document) 76 | text = text.replace(',', '') 77 | text = text.replace('!', '') 78 | text = text.replace('17281', '') 79 | # 用单个空格替换多个空格 80 | text = re.sub(r'\s+', ' ', text, flags=re.I) 81 | 82 | return text 83 | 84 | 85 | def build_data(): 86 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 87 | test_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv' 88 | train = pd.read_csv(train_clean) 89 | test = pd.read_csv(test_clean) 90 | train["text"].progress_apply(lambda x: preprocess_text(x)) 91 | test["text"].progress_apply(lambda x: preprocess_text(x)) 92 | ylabel = [] 93 | id2label = list(train['label'].unique()) 94 | label2id = {id2label[i]: i for i in range(len(id2label))} 95 | y_train = np.zeros((len(train), len(id2label)), dtype=np.int8) 96 | test_dataset = [] 97 | for i in tqdm(range(len(test))): 98 | test_dict = {} 99 | test_dict['text'] = test.loc[i, 'text'] 100 | test_dict['label'] = [-1]*35 101 | test_dataset.append(test_dict) 102 | train_dataset = [] 103 | for i in tqdm(range(len(train))): 104 | train_dict = {} 105 | train_dict['text'] = train.loc[i, 'text'] 106 | y_train[i][label2id[train.loc[i, 'label']]] = 1 107 | train_dict['label'] = y_train[i] 108 | ylabel.append(train.loc[i, 'label']) 109 | train_dataset.append(train_dict) 110 | return train_dataset, test_dataset, ylabel, test, id2label 111 | 112 | 113 | def pre_stacking(models_path, train_dataset, test_dataset, ylabel): 114 | config = Config() 115 | 116 | kf = StratifiedKFold(n_splits=config.k_fold, 117 | shuffle=True, random_state=config.seed) 118 | val_pre = [] 119 | val_label = [] 120 | test_prelist = [] 121 | val_logit = None 122 | for fold, (train_index, valid_index) in enumerate(kf.split(np.arange(len(train_dataset)), ylabel)): 123 | print('\n\n------------fold:{}------------\n'.format(fold)) 124 | val = [train_dataset[index] for index in 
valid_index] 125 | test_D = data_generator(test_dataset, config) 126 | val_D = data_generator(val, config) 127 | # 每个模型的 128 | PATH = './{}/model_{}.pth'.format(models_path, fold) 129 | model = torch.load(PATH) 130 | model.eval() 131 | with torch.no_grad(): 132 | y_p = [] 133 | y_l = [] 134 | val_y = [] 135 | test_logit = None 136 | for input_ids, input_masks, segment_ids, labels in tqdm(val_D, disable=True): 137 | y_pred = model(input_ids, input_masks, segment_ids) 138 | y_pred = F.softmax(y_pred, dim=1) 139 | y_pred = y_pred.detach().to("cpu").numpy() 140 | val_label.extend(list(np.argmax(np.array(labels), axis=1))) 141 | if val_logit is None: 142 | val_logit = y_pred 143 | else: 144 | val_logit = np.vstack((val_logit, y_pred)) 145 | # val_pre.append(val_logit) 146 | 147 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 148 | y_pred = model(input_ids, input_masks, segment_ids) 149 | y_pred = F.softmax(y_pred, dim=1) 150 | y_pred = y_pred.detach().to("cpu").numpy() 151 | if test_logit is None: 152 | test_logit = y_pred 153 | else: 154 | test_logit = np.vstack((test_logit, y_pred)) 155 | test_prelist.append(test_logit) 156 | test_pre = np.sum(np.array(test_prelist), axis=0) / \ 157 | (np.array(test_prelist).shape[0]) 158 | val_path = "val_data" 159 | test_path = "test_data" 160 | if not os.path.exists(val_path): 161 | os.makedirs(val_path) 162 | os.makedirs(test_path) 163 | val_pre_name = "./val_data/{}_val_pre.npy".format(models_path) 164 | val_label_name = "./val_data/{}_val_label.npy".format(models_path) 165 | test_pre_name = "./test_data/{}_test_pre.npy".format(models_path) 166 | np.save(val_pre_name, np.array(val_logit)) 167 | np.save(val_label_name, np.array(val_label)) 168 | np.save(test_pre_name, np.array(test_pre)) 169 | 170 | 171 | def stacking(path_li, test_df, id2label): 172 | val_x = None 173 | val_y = [] 174 | test_x = None 175 | 176 | for i, p in enumerate(path_li): 177 | val_pre_name = "./val_data/{}_val_pre.npy".format(p) 178 | test_pre_name = "./test_data/{}_test_pre.npy".format(p) 179 | val_label_name = "./val_data/{}_val_label.npy".format(p) 180 | val_pre = np.load(val_pre_name, allow_pickle=True) 181 | val_y.append(np.load(val_label_name, allow_pickle=True)) 182 | test_pre = np.load(test_pre_name, allow_pickle=True) 183 | if val_x is None: 184 | val_x = val_pre 185 | test_x = test_pre 186 | else: 187 | val_x = np.hstack((val_x, val_pre)) 188 | test_x = np.hstack((test_x, test_pre)) 189 | scaler = StandardScaler() 190 | 191 | train_proba = val_x 192 | test_proba = test_x 193 | label = val_y[0] 194 | 195 | scaler.fit(train_proba) 196 | train_proba = scaler.transform(train_proba) 197 | test_proba = scaler.transform(test_proba) 198 | lr = LogisticRegression(tol=0.0001, C=0.5, random_state=98, max_iter=10000) 199 | 200 | kf = StratifiedKFold(n_splits=5, random_state=244, shuffle=True) 201 | pred_list = [] 202 | score = [] 203 | for fold, (train_index, val_index) in enumerate(kf.split(train_proba, label)): 204 | X_train = train_proba[train_index] 205 | y_train = label[train_index] 206 | X_val = train_proba[val_index] 207 | y_val = label[val_index] 208 | lr.fit(X_train, y_train) 209 | y_pred = lr.predict_proba(X_val) 210 | y_pred = np.argmax(y_pred, axis=1) 211 | f1 = f1_score(y_val, y_pred, average='macro') 212 | score.append(f1) 213 | print("{} fold f1 = {}".format(fold+1, f1)) 214 | y_testi = lr.predict_proba(test_proba) 215 | pred_list.append(y_testi) 216 | test_preds = np.sum(np.array(pred_list), axis=0) / \ 217 | 
(np.array(pred_list).shape[0]) 218 | test_preds = np.argmax(test_preds, axis=1) 219 | pred_labels = [id2label[i] for i in test_preds] 220 | SUBMISSION_DIR = "submit" 221 | if not os.path.exists(SUBMISSION_DIR): 222 | os.makedirs(SUBMISSION_DIR) 223 | Name = "NEZHA-Bert-Stacking" 224 | submit_file = SUBMISSION_DIR+"/{}.csv".format(Name) 225 | 226 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 227 | submit_file, index=False) 228 | # print(lr.coef_, lr.n_iter_) 229 | print("最终平均得分={}".format(np.mean(score))) 230 | print() 231 | 232 | 233 | if __name__ == "__main__": 234 | 235 | path_li = ["bert_model", "nezhalarge_model"] 236 | train_dataset, test_dataset, ylabel, test, id2label = build_data() 237 | for i, p in enumerate(path_li): 238 | pre_stacking(p, train_dataset, test_dataset, ylabel) 239 | stacking(path_li, test, id2label) 240 | -------------------------------------------------------------------------------- /Bert_pytorch/bert_finetuning/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 3 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 4 | import numpy as np 5 | import os 6 | import random 7 | from Config import * 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from collections import defaultdict 12 | from torch.optim.optimizer import Optimizer 13 | 14 | def paddingList(ls: list, val, returnTensor=False): 15 | ls = ls[:] # 不要改变了原list尺寸 16 | maxLen = max([len(i) for i in ls]) 17 | for i in range(len(ls)): 18 | ls[i] = ls[i]+[val]*(maxLen-len(ls[i])) 19 | return torch.tensor(ls, device='cuda') if returnTensor else ls 20 | 21 | 22 | def fastTokenizer(a: str, b: str, maxLen, tk): 23 | a, b = a.split(), b.split() 24 | a, b = tk.convert_tokens_to_ids(a), tk.convert_tokens_to_ids(b) 25 | maxLen -= 3 # 空留给cls sep sep 26 | assert maxLen >= 0 27 | len2 = maxLen//2 # 若为奇数,更长部分给左边 28 | len1 = maxLen-len2 29 | #一共就a超长与否,b超长与否,组合的四种情况 30 | if len(a)+len(b) > maxLen: # 需要截断 31 | if len(a) <= len1 and len(b) > len2: 32 | b = b[:maxLen-len(a)] 33 | elif len(a) > len1 and len(b) <= len2: 34 | a = a[:maxLen-len(b)] 35 | elif len(a) > len1 and len(b) > len2: 36 | a = a[:len1] 37 | b = b[:len2] 38 | input_ids = [tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] 39 | token_type_ids = [0]*(len(a)+2)+[1]*(len(b)+1) 40 | return {'input_ids': input_ids, 'token_type_ids': token_type_ids} 41 | 42 | 43 | class data_generator: 44 | def __init__(self, data, config, shuffle=False): 45 | self.data = data 46 | self.batch_size = config.batch_size 47 | self.max_length = config.MAX_LEN 48 | self.shuffle = shuffle 49 | 50 | vocab = 'vocab.txt' if os.path.exists( 51 | config.model_path + 'vocab.txt') else 'spiece.model' 52 | self.tokenizer = TOKENIZERS[config.model].from_pretrained( 53 | config.model_path + vocab) 54 | 55 | self.steps = len(self.data[0]) // self.batch_size 56 | if len(self.data[0]) % self.batch_size != 0: 57 | self.steps += 1 58 | 59 | def __len__(self): 60 | return self.steps 61 | 62 | def __iter__(self): 63 | input_ids, input_masks, segment_ids, labels = [], [], [], [] 64 | for index, data_li in enumerate(self.data): 65 | 66 | text = data_li['text'] 67 | label = data_li['label'] 68 | tkRes = self.tokenizer(text, max_length=self.max_length, truncation='longest_first', 69 | return_attention_mask=False) 70 | input_id = tkRes['input_ids'] 71 | segment_id = 
tkRes['token_type_ids'] 72 | assert len(segment_id) == len(input_id) 73 | input_ids.append(input_id) 74 | segment_ids.append(segment_id) 75 | labels.append(label) 76 | 77 | if len(input_ids) == self.batch_size or index == len(self.data)-1: 78 | input_ids = paddingList( 79 | input_ids, 0, returnTensor=True) # 动态padding 80 | segment_ids = paddingList(segment_ids, 0, returnTensor=True) 81 | input_masks = (input_ids != 0) 82 | yield input_ids, input_masks, segment_ids, labels 83 | input_ids, input_masks, segment_ids, labels = [], [], [], [] 84 | 85 | 86 | class PGD(): 87 | def __init__(self, model): 88 | self.model = model 89 | self.emb_backup = {} 90 | self.grad_backup = {} 91 | 92 | def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): 93 | # emb_name这个参数要换成你模型中embedding的参数名 94 | for name, param in self.model.named_parameters(): 95 | if param.requires_grad and emb_name in name: 96 | if is_first_attack: 97 | self.emb_backup[name] = param.data.clone() 98 | norm = torch.norm(param.grad) 99 | if norm != 0 and not torch.isnan(norm): 100 | r_at = alpha * param.grad / norm 101 | param.data.add_(r_at) 102 | param.data = self.project(name, param.data, epsilon) 103 | 104 | def restore(self, emb_name='word_embeddings'): 105 | # emb_name这个参数要换成你模型中embedding的参数名 106 | for name, param in self.model.named_parameters(): 107 | if param.requires_grad and emb_name in name: 108 | assert name in self.emb_backup 109 | param.data = self.emb_backup[name] 110 | self.emb_backup = {} 111 | 112 | def project(self, param_name, param_data, epsilon): 113 | r = param_data - self.emb_backup[param_name] 114 | if torch.norm(r) > epsilon: 115 | r = epsilon * r / torch.norm(r) 116 | return self.emb_backup[param_name] + r 117 | 118 | def backup_grad(self): 119 | for name, param in self.model.named_parameters(): 120 | if param.requires_grad: 121 | self.grad_backup[name] = param.grad.clone() 122 | 123 | def restore_grad(self): 124 | for name, param in self.model.named_parameters(): 125 | if param.requires_grad: 126 | param.grad = self.grad_backup[name] 127 | 128 | 129 | class FGM(): 130 | def __init__(self, model): 131 | self.model = model 132 | self.backup = {} 133 | 134 | def attack(self, epsilon=0.25, emb_name='word_embeddings'): 135 | # emb_name这个参数要换成你模型中embedding的参数名 136 | for name, param in self.model.named_parameters(): 137 | if param.requires_grad and emb_name in name: 138 | self.backup[name] = param.data.clone() 139 | norm = torch.norm(param.grad) 140 | if norm != 0: 141 | r_at = epsilon * param.grad / norm 142 | param.data.add_(r_at) 143 | 144 | def restore(self, emb_name='word_embeddings'): 145 | # emb_name这个参数要换成你模型中embedding的参数名 146 | for name, param in self.model.named_parameters(): 147 | if param.requires_grad and emb_name in name: 148 | assert name in self.backup 149 | param.data = self.backup[name] 150 | self.backup = {} 151 | 152 | 153 | # 支持多分类和二分类 154 | class FocalLoss(nn.Module): 155 | """ 156 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 157 | 'Focal Loss for Dense Object Detection. 
(https://arxiv.org/abs/1708.02002)' 158 | Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) 159 | :param num_class: 160 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 161 | :param gamma: (float,double) gamma > 0 reduces the relative loss 162 | for well-classified examples (p>0.5) putting more 163 | focus on hard misclassified example 164 | :param smooth: (float,double) smooth value when cross entropy 165 | :param balance_index: (int) balance class index, 166 | should be specific when alpha is float 167 | :param size_average: (bool, optional) By default, 168 | the losses are averaged over each loss element in the batch. 169 | """ 170 | 171 | def __init__(self, num_class, alpha=None, gamma=2, 172 | smooth=None, size_average=True): 173 | super(FocalLoss, self).__init__() 174 | self.num_class = num_class 175 | self.alpha = alpha 176 | self.gamma = gamma 177 | self.smooth = smooth 178 | self.size_average = size_average 179 | 180 | if self.alpha is None: 181 | self.alpha = torch.ones(self.num_class, 1) 182 | elif isinstance(self.alpha, (list, np.ndarray)): 183 | assert len(self.alpha) == self.num_class 184 | self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) 185 | self.alpha = self.alpha / self.alpha.sum() 186 | else: 187 | raise TypeError('Not support alpha type') 188 | if self.smooth is not None: 189 | if self.smooth < 0 or self.smooth > 1.0: 190 | raise ValueError('smooth value should be in [0,1]') 191 | 192 | def forward(self, input, target): 193 | logit = F.softmax(input, dim=1) 194 | 195 | if logit.dim() > 2: 196 | # N,C,d1,d2 -> N,C,m (m=d1*d2*...) 197 | logit = logit.view(logit.size(0), logit.size(1), -1) 198 | logit = logit.permute(0, 2, 1).contiguous() 199 | logit = logit.view(-1, logit.size(-1)) 200 | target = target.view(-1, 1) 201 | 202 | # N = input.size(0) 203 | # alpha = torch.ones(N, self.num_class) 204 | # alpha = alpha * (1 - self.alpha) 205 | # alpha = alpha.scatter_(1, target.long(), self.alpha) 206 | epsilon = 1e-10 207 | alpha = self.alpha 208 | if alpha.device != input.device: 209 | alpha = alpha.to(input.device) 210 | 211 | idx = target.cpu().long() 212 | one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() 213 | one_hot_key = one_hot_key.scatter_(1, idx, 1) 214 | if one_hot_key.device != logit.device: 215 | one_hot_key = one_hot_key.to(logit.device) 216 | 217 | if self.smooth: 218 | one_hot_key = torch.clamp( 219 | one_hot_key, self.smooth, 1.0 - self.smooth) 220 | pt = (one_hot_key * logit).sum(1) + epsilon 221 | logpt = pt.log() 222 | 223 | gamma = self.gamma 224 | 225 | alpha = alpha[idx] 226 | loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt 227 | 228 | if self.size_average: 229 | loss = loss.mean() 230 | else: 231 | loss = loss.sum() 232 | return loss 233 | 234 | 235 | def f1_match(y_true, y_pred): 236 | acc = sum(y_pred & y_true) / (sum(y_pred)) 237 | rec = sum(y_pred & y_true) / (sum(y_true)) 238 | 239 | return 2 * acc * rec / (acc + rec) 240 | class Lookahead(Optimizer): 241 | r"""PyTorch implementation of the lookahead wrapper. 242 | Lookahead Optimizer: https://arxiv.org/abs/1907.08610 243 | """ 244 | 245 | def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"): 246 | """optimizer: inner optimizer 247 | la_steps (int): number of lookahead steps 248 | la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer. 
249 | pullback_momentum (str): change to inner optimizer momentum on interpolation update 250 | """ 251 | self.optimizer = optimizer 252 | self._la_step = 0 # counter for inner optimizer 253 | self.la_alpha = la_alpha 254 | self._total_la_steps = la_steps 255 | pullback_momentum = pullback_momentum.lower() 256 | assert pullback_momentum in ["reset", "pullback", "none"] 257 | self.pullback_momentum = pullback_momentum 258 | 259 | self.state = defaultdict(dict) 260 | 261 | # Cache the current optimizer parameters 262 | for group in optimizer.param_groups: 263 | for p in group['params']: 264 | param_state = self.state[p] 265 | param_state['cached_params'] = torch.zeros_like(p.data) 266 | param_state['cached_params'].copy_(p.data) 267 | if self.pullback_momentum == "pullback": 268 | param_state['cached_mom'] = torch.zeros_like(p.data) 269 | 270 | def __getstate__(self): 271 | return { 272 | 'state': self.state, 273 | 'optimizer': self.optimizer, 274 | 'la_alpha': self.la_alpha, 275 | '_la_step': self._la_step, 276 | '_total_la_steps': self._total_la_steps, 277 | 'pullback_momentum': self.pullback_momentum 278 | } 279 | 280 | def zero_grad(self): 281 | self.optimizer.zero_grad() 282 | 283 | def get_la_step(self): 284 | return self._la_step 285 | 286 | def state_dict(self): 287 | return self.optimizer.state_dict() 288 | 289 | def load_state_dict(self, state_dict): 290 | self.optimizer.load_state_dict(state_dict) 291 | 292 | def _backup_and_load_cache(self): 293 | """Useful for performing evaluation on the slow weights (which typically generalize better) 294 | """ 295 | for group in self.optimizer.param_groups: 296 | for p in group['params']: 297 | param_state = self.state[p] 298 | param_state['backup_params'] = torch.zeros_like(p.data) 299 | param_state['backup_params'].copy_(p.data) 300 | p.data.copy_(param_state['cached_params']) 301 | 302 | def _clear_and_load_backup(self): 303 | for group in self.optimizer.param_groups: 304 | for p in group['params']: 305 | param_state = self.state[p] 306 | p.data.copy_(param_state['backup_params']) 307 | del param_state['backup_params'] 308 | 309 | @property 310 | def param_groups(self): 311 | return self.optimizer.param_groups 312 | 313 | def step(self, closure=None): 314 | """Performs a single Lookahead optimization step. 315 | Arguments: 316 | closure (callable, optional): A closure that reevaluates the model 317 | and returns the loss. 
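        Note: every `_total_la_steps` inner steps the fast weights are interpolated back
        toward the cached slow weights, p <- la_alpha * p + (1 - la_alpha) * slow, and the
        slow-weight cache is refreshed; `pullback_momentum` decides whether the inner
        optimizer's momentum buffer is pulled back toward its cache, reset to zero, or
        left unchanged.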
318 | """ 319 | loss = self.optimizer.step(closure) 320 | self._la_step += 1 321 | 322 | if self._la_step >= self._total_la_steps: 323 | self._la_step = 0 324 | # Lookahead and cache the current optimizer parameters 325 | for group in self.optimizer.param_groups: 326 | for p in group['params']: 327 | param_state = self.state[p] 328 | p.data.mul_(self.la_alpha).add_(param_state['cached_params'], alpha=1.0 - self.la_alpha) # crucial line 329 | param_state['cached_params'].copy_(p.data) 330 | if self.pullback_momentum == "pullback": 331 | internal_momentum = self.optimizer.state[p]["momentum_buffer"] 332 | self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.la_alpha).add_( 333 | 1.0 - self.la_alpha, param_state["cached_mom"]) 334 | param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] 335 | elif self.pullback_momentum == "reset": 336 | self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data) 337 | 338 | return loss -------------------------------------------------------------------------------- /Bert_pytorch/pretrain/NLP_Utils.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data.dataloader import _SingleProcessDataLoaderIter, _MultiProcessingDataLoaderIter 2 | import random 3 | import json 4 | import transformers as _ 5 | from transformers1 import BertTokenizer 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | import numpy as np 9 | from itertools import chain 10 | import os 11 | import pandas as pd 12 | import re 13 | from tqdm import tqdm 14 | tqdm.pandas() 15 | 16 | 17 | def writeToJsonFile(path: str, obj): 18 | with open(path, "w", encoding="utf-8") as f: 19 | f.write(json.dumps(obj, ensure_ascii=False, indent=0)) 20 | 21 | 22 | def readFromJsonFile(path: str): 23 | with open(path, "r", encoding="utf-8") as f: 24 | return json.loads(f.read()) 25 | 26 | 27 | def loadData(path): 28 | allData = [] 29 | with open(path, "r") as f: 30 | j = 0 31 | for i in f: 32 | i = i.strip().split(',') 33 | if j == 0: 34 | j += 1 35 | continue 36 | if len(i) == 0: # 防止空行 37 | break 38 | if len(i) == 3: # 训练集 39 | a, b, label = i 40 | b = b.split(' ') 41 | else: # 测试集,直接转为id形式 42 | a, b, label = i[0], i[1], -1 43 | b = b.split(' ') 44 | allData.append([b, int(label)]) 45 | j += 1 46 | return allData 47 | 48 | 49 | def calNegPos(ls): # 计算正负比例 50 | posNum, negNum = 0, 0 51 | for i in ls: 52 | if i[2] == 0: 53 | negNum += 1 54 | elif i[2] == 1: 55 | posNum += 1 56 | posNum = 1 if posNum == 0 else posNum 57 | return negNum, posNum, round(negNum/posNum, 4) 58 | 59 | 60 | def preprocess_text(document): 61 | 62 | # 删除逗号, 脱敏数据中最大值为30357 63 | text = str(document) 64 | text = text.replace(',', '35001') 65 | text = text.replace('!', '35002') 66 | text = text.replace('?', '35003') 67 | text = text.replace('。', '35004') 68 | # text = text.replace('17281', '') 69 | # 用单个空格替换多个空格 70 | text = re.sub(r'\s+', ' ', text, flags=re.I) 71 | return text 72 | 73 | train_clean = '/media/mgege007/winType/DaGuan/data/train_clean.csv' 74 | test_clean = '/media/mgege007/winType/DaGuan/data/test_clean.csv' 75 | if not os.path.exists(train_clean): 76 | train_df = pd.read_csv( 77 | '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv') 78 | train_df["text"] = train_df["text"].progress_apply( 79 | lambda x: preprocess_text(x)) 80 | id2label = list(train_df['label'].unique()) 81 | label2id = {id2label[i]: i for i in range(len(id2label))} 82 | train_df["label"] = train_df["label"].map(label2id) 83 | 
test_df = pd.read_csv( 84 | '/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv') 85 | test_df["text"] = test_df["text"].progress_apply( 86 | lambda x: preprocess_text(x)) 87 | train_df.to_csv(train_clean, index=False) 88 | test_df.to_csv(test_clean, index=False) 89 | 90 | allData = loadData(train_clean) + loadData(test_clean) 91 | testA_data = loadData(test_clean) 92 | # testB_data = loadData('/tcdata/gaiic_track3_round1_testB_20210317.tsv') 93 | random.shuffle(allData) 94 | 95 | train_data = allData # 全量 96 | valid_data = allData[-20000:] 97 | print("训练集样本数量:", len(train_data)) 98 | 99 | 100 | def paddingList(ls: list, val, returnTensor=False): 101 | ls = ls[:] # 不要改变了原list尺寸 102 | maxLen = max([len(i) for i in ls]) 103 | for i in range(len(ls)): 104 | ls[i] = ls[i]+[val]*(maxLen-len(ls[i])) 105 | return torch.tensor(ls, device='cuda') if returnTensor else ls 106 | 107 | 108 | def truncate(a: list, maxLen): 109 | maxLen -= 3 # 空留给cls sep sep 110 | assert maxLen >= 0 111 | #一共就a超长与否,b超长与否,组合的四种情况 112 | if len(a) > maxLen: # 需要截断 113 | # 尾截断 114 | # a=a[:maxLen] 115 | # 首截断 116 | # a = a[maxLen-len(a):] 117 | # 首尾截断 118 | outlen = (len(a)-maxLen) 119 | headid = int(outlen/2) 120 | a = a[headid:headid-outlen] 121 | return a 122 | 123 | 124 | class MLM_Data(Dataset): 125 | #传入句子对列表 126 | def __init__(self, textLs: list, maxLen: int, tk: BertTokenizer): 127 | super().__init__() 128 | self.data = textLs 129 | self.maxLen = maxLen 130 | self.tk = tk 131 | self.spNum = len(tk.all_special_tokens) 132 | self.tkNum = tk.vocab_size 133 | 134 | def __len__(self): 135 | return len(self.data) 136 | 137 | def random_mask(self, text_ids): 138 | input_ids, output_ids = [], [] 139 | rands = np.random.random(len(text_ids)) 140 | idx = 0 141 | while idx < len(rands): 142 | if rands[idx] < 0.3: # 需要mask 143 | # 若要mask,进行x_gram mask的概率 144 | ngram = np.random.choice([1, 2, 3], p=[0.7, 0.2, 0.1]) 145 | if ngram == 3 and len(rands) < 7: # 太大的gram不要应用于过短文本 146 | ngram = 2 147 | if ngram == 2 and len(rands) < 4: 148 | ngram = 1 149 | L = idx+1 150 | R = idx+ngram # 最终需要mask的右边界(开) 151 | while L < R and L < len(rands): 152 | rands[L] = np.random.random()*0.15 # 强制mask 153 | L += 1 154 | idx = R 155 | if idx < len(rands): 156 | rands[idx] = 1 # 禁止mask片段的下一个token被mask,防止一大片连续mask 157 | idx += 1 158 | 159 | for r, i in zip(rands, text_ids): 160 | if r < 0.15 * 0.8: 161 | input_ids.append(self.tk.mask_token_id) 162 | output_ids.append(i) # mask预测自己 163 | elif r < 0.15 * 0.9: 164 | input_ids.append(i) 165 | output_ids.append(i) # 自己预测自己 166 | elif r < 0.15: 167 | input_ids.append(np.random.randint(self.spNum, self.tkNum)) 168 | output_ids.append(i) # 随机的一个词预测自己,随机词不会从特殊符号中选取,有小概率抽到自己 169 | else: 170 | input_ids.append(i) 171 | output_ids.append(-100) # 保持原样不预测 172 | 173 | return input_ids, output_ids 174 | 175 | #耗时操作在此进行,可用上多进程 176 | def __getitem__(self, item): 177 | text1, _ = self.data[item] # 预处理,mask等操作 178 | 179 | text1 = truncate(text1, self.maxLen) 180 | text1_ids = self.tk.convert_tokens_to_ids(text1) 181 | text1_ids, out1_ids = self.random_mask(text1_ids) # 添加mask预测 182 | input_ids = [self.tk.cls_token_id] + \ 183 | text1_ids + [self.tk.sep_token_id] # 拼接 184 | token_type_ids = [0]*(len(text1_ids)+2) 185 | labels = [-100] + out1_ids + [-100] 186 | assert len(input_ids) == len(token_type_ids) == len(labels) 187 | return {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'labels': labels} 188 | 189 | @classmethod 190 | def collate(cls, batch): 191 | input_ids = [i['input_ids'] for i in batch] 
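        # Dynamic padding: each batch is padded only up to the longest sequence it
        # contains (paddingList), not to a global MAX_LEN. Labels are padded with -100
        # so padded positions are ignored by the MLM cross-entropy loss, and the
        # attention mask is recovered afterwards as (input_ids != 0).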
192 | token_type_ids = [i['token_type_ids'] for i in batch] 193 | labels = [i['labels'] for i in batch] 194 | input_ids = paddingList(input_ids, 0, returnTensor=True) 195 | token_type_ids = paddingList(token_type_ids, 0, returnTensor=True) 196 | labels = paddingList(labels, -100, returnTensor=True) 197 | attention_mask = (input_ids != 0) 198 | return {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': attention_mask, 'labels': labels} 199 | 200 | 201 | def unionList(ls): return list(chain(*ls)) # 按元素拼接 202 | 203 | 204 | def splitList(x, bs): return [x[i:i+bs] for i in range(0, len(x), bs)] # 按bs切分 205 | 206 | 207 | #sortBsNum:原序列按多少个bs块为单位排序,可用来增强随机性 208 | #比如如果每次打乱后都全体一起排序,那每次都是一样的 209 | def blockShuffle(data: list, bs: int, sortBsNum, key): 210 | random.shuffle(data) # 先打乱 211 | tail = len(data) % bs # 计算碎片长度 212 | tail = [] if tail == 0 else data[-tail:] 213 | data = data[:len(data)-len(tail)] 214 | assert len(data) % bs == 0 # 剩下的一定能被bs整除 215 | # 为None就是整体排序 216 | sortBsNum = len(data)//bs if sortBsNum is None else sortBsNum 217 | data = splitList(data, sortBsNum*bs) 218 | data = [sorted(i, key=key, reverse=True) for i in data] # 每个大块进行降排序 219 | data = unionList(data) 220 | data = splitList(data, bs) # 最后,按bs分块 221 | random.shuffle(data) # 块间打乱 222 | data = unionList(data)+tail 223 | return data 224 | 225 | 226 | #每轮迭代重新分块shuffle数据的DataLoader 227 | class blockShuffleDataLoader(DataLoader): 228 | def __init__(self, dataset: Dataset, sortBsNum, key, **kwargs): 229 | assert isinstance(dataset.data, list) # 需要有list类型的data属性 230 | super().__init__(dataset, **kwargs) # 父类的参数传过去 231 | self.sortBsNum = sortBsNum 232 | self.key = key 233 | 234 | def __iter__(self): 235 | #分块shuffle 236 | self.dataset.data = blockShuffle( 237 | self.dataset.data, self.batch_size, self.sortBsNum, self.key) 238 | if self.num_workers == 0: 239 | return _SingleProcessDataLoaderIter(self) 240 | else: 241 | return _MultiProcessingDataLoaderIter(self) 242 | -------------------------------------------------------------------------------- /Bert_pytorch/pretrain/train_bert.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import numpy as np 3 | import random 4 | import os 5 | random.seed(0) 6 | np.random.seed(0)#seed应该在main里尽早设置,以防万一 7 | os.environ['PYTHONHASHSEED'] =str(0)#消除hash算法的随机性 8 | from transformers import BertForMaskedLM#除nezha外模型用新版加载 9 | from transformers1 import Trainer, TrainingArguments,BertTokenizer,BertConfig 10 | from NLP_Utils import MLM_Data,train_data,blockShuffleDataLoader 11 | 12 | # 最大截断长度 13 | maxlen=100 14 | batch_size=32 15 | vocab_file_dir = 'Bert_pytorch/pretrain/bert_model/vocab_3462.txt' 16 | tokenizer = BertTokenizer.from_pretrained(vocab_file_dir) 17 | 18 | config = BertConfig( 19 | vocab_size=len(tokenizer), 20 | hidden_size=768, 21 | num_hidden_layers=12, 22 | num_attention_heads=12, 23 | max_position_embeddings=512, 24 | ) 25 | 26 | # 把层数改为8层 27 | # 必须是绝对路径 28 | model = BertForMaskedLM.from_pretrained("Bert_pytorch/bert_model_1000/") # './bert-base-chinese') 29 | 30 | model.resize_token_embeddings(len(tokenizer)) 31 | print(model) 32 | train_MLM_data=MLM_Data(train_data,maxlen,tokenizer) 33 | #自己定义dataloader,不要用huggingface的 34 | dl=blockShuffleDataLoader(train_MLM_data,None,key=lambda x:len(x[0])+1,shuffle=False 35 | ,batch_size=batch_size,collate_fn=train_MLM_data.collate) 36 | 37 | training_args = TrainingArguments( 38 | output_dir='Bert_pytorch/bert_output',#必须是绝对路径 39 | overwrite_output_dir=True, 40 | 
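    # Note: save_steps and logging_steps are measured in optimizer update steps, not
    # epochs; with logging_steps=len(dl) the trainer logs roughly once per epoch of
    # this DataLoader.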
num_train_epochs=1000, 41 | per_device_train_batch_size=batch_size, 42 | save_steps=1000,#每50个epoch save一次 43 | save_total_limit=3, 44 | logging_steps=len(dl),#每个epoch log一次 45 | seed=2021, 46 | learning_rate=5e-5, 47 | lr_end=1e-5,#学习率衰减的终点 48 | weight_decay=0.01, 49 | warmup_steps=int(450000*150/batch_size*0.03) 50 | ) 51 | 52 | trainer = Trainer( 53 | model=model, 54 | args=training_args, 55 | train_dataLoader=dl, 56 | prediction_loss_only=True, 57 | ) 58 | 59 | if __name__ == '__main__': 60 | trainer.train() 61 | trainer.save_model('./bert_model_3462') 62 | -------------------------------------------------------------------------------- /Bert_pytorch/pretrain/transformers1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Bert_pytorch/pretrain/transformers1.zip -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Nezha_pytorch/finetuning/.DS_Store -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/Config.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 2 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig, ElectraModel, ElectraConfig, ElectraTokenizer, \ 3 | RobertaTokenizer, RobertaModel, RobertaConfig 4 | from NEZHA.modeling_nezha import NeZhaModel 5 | from NEZHA.configuration_nezha import NeZhaConfig 6 | 7 | 8 | MODELS = { 9 | 'BertForClass': BertModel, 10 | 'BertForClass_MultiDropout': BertModel, 11 | 'BertLastTwoCls': BertModel, 12 | 'BertLastCls':BertModel, 13 | 'BertLastTwoClsPooler': BertModel, 14 | 'BertLastTwoEmbeddings': BertModel, 15 | 'BertLastTwoEmbeddingsPooler': BertModel, 16 | 'BertLastFourCls': BertModel, 17 | 'BertLastFourClsPooler': BertModel, 18 | 'BertLastFourEmbeddings': BertModel, 19 | 'BertLastFourEmbeddingsPooler': BertModel, 20 | 'BertDynCls': BertModel, 21 | 'BertDynEmbeddings': BertModel, 22 | 'BertRNN': BertModel, 23 | 'BertCNN': XLNetModel, 24 | 'BertRCNN': BertModel, 25 | 'XLNet': XLNetModel, 26 | 'Electra': ElectraModel, 27 | 'NEZHA': NeZhaModel 28 | } 29 | 30 | TOKENIZERS = { 31 | 'BertForClass': BertTokenizer, 32 | 'BertForClass_MultiDropout': BertTokenizer, 33 | 'BertLastTwoCls': BertTokenizer, 34 | 'BertLastCls': BertTokenizer, 35 | 'BertLastTwoClsPooler': BertTokenizer, 36 | 'BertLastTwoEmbeddings': BertTokenizer, 37 | 'BertLastTwoEmbeddingsPooler': BertTokenizer, 38 | 'BertLastFourCls': BertTokenizer, 39 | 'BertLastFourClsPooler': BertTokenizer, 40 | 'BertLastFourEmbeddings': BertTokenizer, 41 | 'BertLastFourEmbeddingsPooler': BertTokenizer, 42 | 'BertDynCls': BertTokenizer, 43 | 'BertDynEmbeddings': BertTokenizer, 44 | 'BertRNN': BertTokenizer, 45 | 'BertCNN': BertTokenizer, 46 | 'BertRCNN': BertTokenizer, 47 | 'XLNet': XLNetTokenizer, 48 | 'Electra': ElectraTokenizer, 49 | 'NEZHA': BertTokenizer 50 | } 51 | 52 | CONFIGS = { 53 | 'BertForClass': BertConfig, 54 | 'BertForClass_MultiDropout': BertConfig, 55 | 'BertLastTwoCls': BertConfig, 56 | 'BertLastCls': BertConfig, 57 | 'BertLastTwoClsPooler': BertConfig, 58 | 'BertLastTwoEmbeddings': BertConfig, 59 | 
'BertLastTwoEmbeddingsPooler': BertConfig, 60 | 'BertLastFourCls': BertConfig, 61 | 'BertLastFourClsPooler': BertConfig, 62 | 'BertLastFourEmbeddings': BertConfig, 63 | 'BertLastFourEmbeddingsPooler': BertConfig, 64 | 'BertDynCls': BertConfig, 65 | 'BertDynEmbeddings': BertConfig, 66 | 'BertRNN': BertConfig, 67 | 'BertCNN': BertConfig, 68 | 'BertRCNN': BertConfig, 69 | 'XLNet': XLNetConfig, 70 | 'Electra': ElectraConfig, 71 | 'NEZHA': NeZhaConfig 72 | 73 | } -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/NEZHA/configuration_nezha.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import PretrainedConfig 3 | 4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 5 | 6 | class NeZhaConfig(PretrainedConfig): 7 | r""" 8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 11 | the ALBERT `xxlarge `__ architecture. 12 | 13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 15 | for more information. 16 | 17 | 18 | Args: 19 | vocab_size (:obj:`int`, optional, defaults to 30000): 20 | Vocabulary size of the ALBERT model. Defines the different tokens that 21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 22 | embedding_size (:obj:`int`, optional, defaults to 128): 23 | Dimensionality of vocabulary embeddings. 24 | hidden_size (:obj:`int`, optional, defaults to 4096): 25 | Dimensionality of the encoder layers and the pooler layer. 26 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 27 | Number of hidden layers in the Transformer encoder. 28 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 29 | Number of groups for the hidden layers, parameters in the same group are shared. 30 | num_attention_heads (:obj:`int`, optional, defaults to 64): 31 | Number of attention heads for each attention layer in the Transformer encoder. 32 | intermediate_size (:obj:`int`, optional, defaults to 16384): 33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 34 | inner_group_num (:obj:`int`, optional, defaults to 1): 35 | The number of inner repetition of attention and ffn. 36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 37 | The non-linear activation function (function or string) in the encoder and pooler. 38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 42 | The dropout ratio for the attention probabilities. 43 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 44 | The maximum sequence length that this model might ever be used with. Typically set this to something 45 | large (e.g., 512 or 1024 or 2048). 46 | type_vocab_size (:obj:`int`, optional, defaults to 2): 47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 
48 | initializer_range (:obj:`float`, optional, defaults to 0.02): 49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 51 | The epsilon used by the layer normalization layers. 52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 53 | The dropout ratio for attached classifiers. 54 | 55 | Example:: 56 | 57 | from transformers import AlbertConfig, AlbertModel 58 | # Initializing an ALBERT-xxlarge style configuration 59 | albert_xxlarge_configuration = AlbertConfig() 60 | 61 | # Initializing an ALBERT-base style configuration 62 | albert_base_configuration = AlbertConfig( 63 | hidden_size=768, 64 | num_attention_heads=12, 65 | intermediate_size=3072, 66 | ) 67 | 68 | # Initializing a model from the ALBERT-base style configuration 69 | model = AlbertModel(albert_xxlarge_configuration) 70 | 71 | # Accessing the model configuration 72 | configuration = model.config 73 | 74 | Attributes: 75 | pretrained_config_archive_map (Dict[str, str]): 76 | A dictionary containing all the available pre-trained checkpoints. 77 | """ 78 | 79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP 80 | model_type = "nezha" 81 | 82 | def __init__( 83 | self, 84 | vocab_size=30000, 85 | embedding_size=128, 86 | hidden_size=4096, 87 | num_hidden_layers=12, 88 | num_hidden_groups=1, 89 | num_attention_heads=64, 90 | intermediate_size=16384, 91 | inner_group_num=1, 92 | hidden_act="gelu_new", 93 | hidden_dropout_prob=0, 94 | attention_probs_dropout_prob=0, 95 | max_position_embeddings=512, 96 | max_relative_position=64, 97 | type_vocab_size=2, 98 | initializer_range=0.02, 99 | layer_norm_eps=1e-12, 100 | classifier_dropout_prob=0.1, 101 | use_relative_position=True, 102 | pad_token_id=0, 103 | bos_token_id=2, 104 | eos_token_id=3, 105 | **kwargs 106 | ): 107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 108 | 109 | self.vocab_size = vocab_size 110 | self.embedding_size = embedding_size 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_hidden_groups = num_hidden_groups 114 | self.num_attention_heads = num_attention_heads 115 | self.inner_group_num = inner_group_num 116 | self.hidden_act = hidden_act 117 | self.intermediate_size = intermediate_size 118 | self.hidden_dropout_prob = hidden_dropout_prob 119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 120 | self.max_position_embeddings = max_position_embeddings 121 | self.max_relative_position = max_relative_position 122 | self.type_vocab_size = type_vocab_size 123 | self.initializer_range = initializer_range 124 | self.layer_norm_eps = layer_norm_eps 125 | self.use_relative_position=use_relative_position 126 | self.classifier_dropout_prob = classifier_dropout_prob 127 | -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/NEZHA_main.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | from transformers.optimization import ( 3 | get_constant_schedule, 4 | get_constant_schedule_with_warmup, 5 | get_linear_schedule_with_warmup, 6 | get_cosine_schedule_with_warmup, 7 | get_cosine_with_hard_restarts_schedule_with_warmup, 8 | get_polynomial_decay_schedule_with_warmup, 9 | ) 10 | from tqdm import tqdm, trange 11 | import numpy as np 12 | import pandas as pd 13 | import 
logging 14 | import torch 15 | import random 16 | import os 17 | import re 18 | from torch import nn, optim 19 | from torch.optim.optimizer import Optimizer 20 | from collections import defaultdict 21 | from transformers import BertTokenizer, Adafactor, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 22 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 23 | from transformers.optimization import get_linear_schedule_with_warmup 24 | from sklearn.model_selection import StratifiedKFold, KFold 25 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 26 | from model import * 27 | from utils import * 28 | from utils import Lookahead 29 | import time 30 | import logging 31 | from tqdm import tqdm 32 | from torch.cuda import amp # 要求pytorch>=1.6 33 | tqdm.pandas() 34 | 35 | 36 | class Config: 37 | def __init__(self): 38 | # 预训练模型路径 39 | self.modelId = 2 40 | self.model = 'NEZHA' # "BertForClass" 41 | self.Stratification = False 42 | self.model_path = 'Nezha_pytorch/nezha_model/' 43 | 44 | self.num_class = 35 45 | self.dropout = 0.2 46 | self.MAX_LEN = 100 47 | self.epoch = 50 48 | self.learn_rate = 4e-5 49 | self.normal_lr = 1e-4 50 | self.batch_size = 32 51 | self.k_fold = 10 52 | self.seed = 42 53 | self.device = torch.device('cuda') 54 | self.optimizer = "AdamW" 55 | self.focalloss = False 56 | self.pgd = False 57 | self.fgm = True 58 | self.scheduler = "cosine_schedule_with_warmup" 59 | self.fp16 = True 60 | 61 | 62 | MODEL_CLASSES = { 63 | 'BertForClass': BertForClass, 64 | 'BertLstm': BertLstm, 65 | 'BertLastCls': BertLastCls, 66 | 'BertLastTwoCls': BertLastTwoCls, 67 | 'BertLastTwoClsPooler': BertLastTwoClsPooler, 68 | 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 69 | 'BertForClass_MultiDropout': BertForClass_MultiDropout, 70 | 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 71 | 'BertLastFourCls': BertLastFourCls, 72 | 'BertLastFourClsPooler': BertLastFourClsPooler, 73 | 'BertLastFourEmbeddings': BertLastFourEmbeddings, 74 | 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 75 | 'BertDynCls': BertDynCls, 76 | 'BertDynEmbeddings': BertDynEmbeddings, 77 | 'BertRNN': BertRNN, 78 | 'BertCNN': BertCNN, 79 | 'BertRCNN': BertRCNN, 80 | 'XLNet': XLNet, 81 | 'Electra': Electra, 82 | 'NEZHA': NEZHA, 83 | 84 | } 85 | 86 | def preprocess_text(document): 87 | # 将符号替换为不在脱敏文本的词典中的词 88 | # 删除逗号, 脱敏数据中最大值为30357 89 | text = str(document) 90 | text = text.replace(',', '35001') 91 | text = text.replace('!', '35002') 92 | text = text.replace('?', '35003') 93 | text = text.replace('。', '35004') 94 | # text = text.replace('17281', '') 95 | # 用单个空格替换多个空格 96 | text = re.sub(r'\s+', ' ', text, flags=re.I) 97 | 98 | return text 99 | 100 | config = Config() 101 | os.environ['PYTHONHASHSEED'] = '0' # 消除hash算法的随机性 102 | random.seed(config.seed) 103 | np.random.seed(config.seed) 104 | torch.manual_seed(config.seed) 105 | torch.cuda.manual_seed_all(config.seed) 106 | 107 | # 数据预处理和加载 108 | train_clean = 'data/datagrand_2021_train.csv' 109 | train = pd.read_csv(train_clean) 110 | train["text"].progress_apply(lambda x: preprocess_text(x)) 111 | ylabel = [] 112 | id2label = list(train['label'].unique()) 113 | label2id = {id2label[i]: i for i in range(len(id2label))} 114 | y_train = np.zeros((len(train), len(id2label)), dtype=np.int8) 115 | train_dataset = [] 116 | for i in tqdm(range(len(train))): 117 | train_dict = {} 118 | train_dict['text'] = train.loc[i, 'text'] 119 | y_train[i][label2id[train.loc[i, 'label']]] = 1 120 | 
train_dict['label'] = y_train[i] 121 | ylabel.append(train.loc[i, 'label']) 122 | train_dataset.append(train_dict) 123 | 124 | # K折划分 125 | kf = StratifiedKFold(n_splits=config.k_fold, shuffle=True, 126 | random_state=config.seed) 127 | 128 | # FP16 混合精度训练 129 | scaler = amp.GradScaler() 130 | for fold, (train_index, valid_index) in enumerate(kf.split(np.arange(len(train_dataset)), ylabel)): 131 | print('\n\n------------fold:{}------------\n'.format(fold)) 132 | tra = [train_dataset[index] for index in train_index] 133 | val = [train_dataset[index] for index in valid_index] 134 | 135 | train_D = data_generator(tra, config, shuffle=True) 136 | val_D = data_generator(val, config) 137 | model = MODEL_CLASSES[config.model](config).to(config.device) 138 | 139 | if torch.cuda.device_count() > 1: 140 | print("Let's use", torch.cuda.device_count(), "GPUs!") 141 | model = torch.nn.DataParallel(model) 142 | # 是否PGD对抗训练 143 | if config.pgd: 144 | pgd = PGD(model) 145 | K = 3 146 | # 是否FGM对抗训练 147 | elif config.fgm: 148 | fgm = FGM(model) 149 | 150 | if config.focalloss: 151 | loss_fn = FocalLoss(config.num_class) 152 | else: 153 | loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss就是把Sigmoid-BCELoss合成一步 154 | 155 | num_train_steps = int(len(train) / config.batch_size * config.epoch) 156 | param_optimizer = list(model.named_parameters()) 157 | 158 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 159 | 160 | if config.Stratification: 161 | bert_params = [x for x in param_optimizer if 'bert' in x[0]] 162 | normal_params = [p for n, p in param_optimizer if 'bert' not in n] 163 | optimizer_parameters = [ 164 | {'params': [p for n, p in bert_params if not any( 165 | nd in n for nd in no_decay)], 'weight_decay': 0.01}, 166 | {'params': [p for n, p in bert_params if any( 167 | nd in n for nd in no_decay)], 'weight_decay': 0.0}, 168 | {'params': normal_params, 'lr': config.normal_lr}, 169 | ] 170 | else: 171 | optimizer_parameters = [ 172 | {'params': [p for n, p in param_optimizer if not any( 173 | nd in n for nd in no_decay)], 'weight_decay': 0.01}, 174 | {'params': [p for n, p in param_optimizer if any( 175 | nd in n for nd in no_decay)], 'weight_decay': 0.0}, 176 | ] 177 | # 优化器的选择 178 | adam_epsilon = 1e-6 179 | if config.optimizer == "AdamW": 180 | optimizer = AdamW(optimizer_parameters, lr=config.learn_rate) 181 | elif config.optimizer == "lookahead": 182 | optimizer = AdamW(optimizer_parameters, 183 | lr=config.learn_rate, eps=adam_epsilon) 184 | optimizer = Lookahead(optimizer=optimizer, la_steps=5, la_alpha=0.6) 185 | 186 | elif config.optimizer == "Adafactor": 187 | optimizer = Adafactor( 188 | optimizer_parameters, 189 | lr=config.learn_rate, 190 | eps=(1e-30, 1e-3), 191 | clip_threshold=1.0, 192 | decay_rate=-0.8, 193 | beta1=None, 194 | weight_decay=0.0, 195 | scale_parameter=False, 196 | relative_step=False, 197 | warmup_init=False, 198 | ) 199 | # scheduler学习率的选择 200 | warmup_steps = int(len(train) / config.batch_size / 2) 201 | if config.scheduler == "constant_schedule": 202 | scheduler = get_constant_schedule(optimizer) 203 | 204 | elif config.scheduler == "constant_schedule_with_warmup": 205 | scheduler = get_constant_schedule_with_warmup( 206 | optimizer, num_warmup_steps=warmup_steps) 207 | elif config.scheduler == "linear_schedule_with_warmup": 208 | scheduler = get_linear_schedule_with_warmup( 209 | optimizer, 210 | num_warmup_steps=int(len(train) / config.batch_size / 2), 211 | num_training_steps=num_train_steps 212 | ) 213 | elif config.scheduler == 
"cosine_schedule_with_warmup": 214 | scheduler = get_cosine_schedule_with_warmup( 215 | optimizer, 216 | num_warmup_steps=warmup_steps, 217 | num_training_steps=num_train_steps, 218 | ) 219 | 220 | elif config.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": 221 | scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( 222 | optimizer, 223 | num_warmup_steps=warmup_steps, 224 | num_training_steps=num_train_steps, 225 | ) 226 | elif config.scheduler == "polynomial_decay_schedule_with_warmup": 227 | scheduler = get_polynomial_decay_schedule_with_warmup( 228 | optimizer, 229 | num_warmup_steps=warmup_steps, 230 | num_training_steps=num_train_steps, 231 | ) 232 | 233 | best_f1 = 0 234 | best_epoch=-1 235 | best_valloss = 0 236 | # 每一个fold保存一个模型 237 | PATH = './{}_models/model_{}.pth'.format(config.model, fold) 238 | save_model_path = './{}_models/'.format(config.model) 239 | if not os.path.exists(save_model_path): 240 | os.makedirs(save_model_path) 241 | 242 | score = [] 243 | train_len = 0 244 | loss_num = [] 245 | for e in range(config.epoch): 246 | print('\n------------epoch:{}------------'.format(e)) 247 | model.train() 248 | tq = tqdm(train_D, ncols=70, disable=True) 249 | last = time.time() 250 | for input_ids, input_masks, segment_ids, labels in tq: 251 | label_t = torch.tensor(labels, dtype=torch.float).to(config.device) 252 | if config.fp16: 253 | with amp.autocast(): 254 | y_pred = model(input_ids, input_masks, segment_ids) 255 | loss = loss_fn(y_pred, label_t) 256 | else: 257 | y_pred = model(input_ids, input_masks, segment_ids) 258 | loss = loss_fn(y_pred, label_t) 259 | loss = loss.mean() 260 | if config.fp16: 261 | scaler.scale(loss).backward() 262 | else: 263 | loss.backward() 264 | 265 | if config.pgd: 266 | pgd.backup_grad() 267 | # 对抗训练 268 | for t in range(K): 269 | # 在embedding上添加对抗扰动, first attack时备份param.data 270 | pgd.attack(is_first_attack=(t == 0)) 271 | if t != K - 1: 272 | model.zero_grad() 273 | else: 274 | pgd.restore_grad() 275 | y_pred = model(input_ids, input_masks, segment_ids) 276 | 277 | loss_adv = loss_fn(y_pred, label_t) 278 | loss_adv = loss_adv.mean() 279 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 280 | pgd.restore() # 恢复embedding参数 281 | 282 | elif config.fgm: 283 | # 对抗训练 284 | fgm.attack() # 在embedding上添加对抗扰动 285 | y_pred = model(input_ids, input_masks, segment_ids) 286 | loss_adv = loss_fn(y_pred, label_t) 287 | loss_adv = loss_adv.mean() 288 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 289 | fgm.restore() # 恢复embedding参数 290 | 291 | # 梯度下降,更新参数 292 | if config.fp16: 293 | scaler.unscale_(optimizer) 294 | scaler.step(optimizer) 295 | scaler.update() 296 | else: 297 | optimizer.step() 298 | scheduler.step() # Update learning rate schedule 299 | model.zero_grad() 300 | 301 | y_pred = np.argmax(y_pred.detach().to("cpu").numpy(), axis=1) 302 | label = np.argmax(labels, axis=1) 303 | score.append(f1_score(label, y_pred, average='macro')) 304 | loss_num.append(loss.item()) 305 | tq.set_postfix(fold=fold, epoch=e, 306 | loss=loss_num[-1], acc=score[-1]) 307 | # 计算训练时间 308 | print(f"微调第{e}轮耗时:{time.time()-last}") 309 | # 验证集测试 310 | model.eval() 311 | with torch.no_grad(): 312 | y_p = [] 313 | y_l = [] 314 | val_y = [] 315 | val_loss = [] 316 | train_logit = None 317 | for input_ids, input_masks, segment_ids, labels in tqdm(val_D, disable=True): 318 | label_t = torch.tensor( 319 | labels, dtype=torch.float).to(config.device) 320 | y_pred = model(input_ids, input_masks, segment_ids) 321 | loss = loss_fn(y_pred, label_t) 
322 | val_loss.append(loss.item()) 323 | y_pred = F.softmax(y_pred, dim=1) 324 | y_pred = y_pred.detach().to("cpu").numpy() 325 | if train_logit is None: 326 | train_logit = y_pred 327 | else: 328 | train_logit = np.vstack((train_logit, y_pred)) 329 | 330 | y_p += list(y_pred[:, 1]) 331 | 332 | y_pred = np.argmax(y_pred, axis=1) 333 | y_l += list(y_pred) 334 | y_label = np.argmax(labels, axis=1) 335 | val_y += list(y_label) 336 | 337 | val_f1 = f1_score(val_y, y_l, average="macro") 338 | if val_f1 >= best_f1: 339 | best_f1 = val_f1 340 | best_epoch = e 341 | best_valloss = np.mean(val_loss) 342 | torch.save(model.module if hasattr( 343 | model, "module") else model, PATH) 344 | # 每一个epoch输出f1和loss 345 | print("fold [{}/{}] train_loss={} val_loss={} train_f1={} val_f1={}".format(fold + 346 | 1, config.k_fold, np.mean(loss_num), np.mean(val_loss), np.mean(score), val_f1)) 347 | # 打印每个fold中最佳f1的记录 348 | print("best_epoch={} val_loss={} val_f1={}".format(best_epoch,best_valloss,best_f1)) 349 | optimizer.zero_grad() 350 | -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/predict.py: -------------------------------------------------------------------------------- 1 | from NEZHA.modeling_nezha import * 2 | from tqdm import tqdm, trange 3 | import numpy as np 4 | import pandas as pd 5 | import logging 6 | import torch 7 | import random 8 | import os 9 | from torch import nn, optim 10 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 11 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 12 | from transformers.optimization import get_linear_schedule_with_warmup 13 | from sklearn.model_selection import StratifiedKFold, KFold 14 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, roc_auc_score 15 | from model import * 16 | from utils import * 17 | import time 18 | 19 | 20 | MODEL_CLASSES = { 21 | 'BertForClass': BertForClass, 22 | 'BertLastCls': BertLastCls, 23 | 'BertLastTwoCls': BertLastTwoCls, 24 | 'BertLastTwoClsPooler': BertLastTwoClsPooler, 25 | 'BertLastTwoEmbeddings': BertLastTwoEmbeddings, 26 | 'BertLastTwoEmbeddingsPooler': BertLastTwoEmbeddingsPooler, 27 | 'BertLastFourCls': BertLastFourCls, 28 | 'BertLastFourClsPooler': BertLastFourClsPooler, 29 | 'BertLastFourEmbeddings': BertLastFourEmbeddings, 30 | 'BertLastFourEmbeddingsPooler': BertLastFourEmbeddingsPooler, 31 | 'BertDynCls': BertDynCls, 32 | 'BertDynEmbeddings': BertDynEmbeddings, 33 | 'BertRNN': BertRNN, 34 | 'BertCNN': BertCNN, 35 | 'BertRCNN': BertRCNN, 36 | 'XLNet': XLNet, 37 | 'Electra': Electra, 38 | 'NEZHA': NEZHA, 39 | 40 | } 41 | 42 | 43 | class Config: 44 | def __init__(self): 45 | # 预训练模型路径 46 | self.modelId = 2 47 | self.model = "NEZHA" 48 | self.Stratification = False 49 | self.model_path = '/media/mgege007/winType/DaGuan/nezha-base-count3/nezha_model/' 50 | 51 | self.num_class = 35 52 | self.dropout = 0.2 53 | self.MAX_LEN = 100 54 | self.epoch = 3 55 | self.learn_rate = 4e-5 56 | self.normal_lr = 1e-4 57 | self.batch_size = 128 58 | self.k_fold = 10 59 | self.seed = 42 60 | 61 | self.device = torch.device('cuda') 62 | # self.device = torch.device('cpu') 63 | 64 | self.focalloss = False 65 | self.pgd = False 66 | self.fgm = False 67 | 68 | 69 | def submit(pred, test_df, id2label): 70 | test_preds_merge = np.sum(pred, axis=0) / (pred.shape[0]) 71 | test_pre_tensor = torch.tensor(test_preds_merge) 72 | test_pre = torch.max(test_pre_tensor, 1)[1] 73 | pred_labels = 
[id2label[i] for i in test_pre] 74 | SUBMISSION_DIR = "submit" 75 | if not os.path.exists(SUBMISSION_DIR): 76 | os.makedirs(SUBMISSION_DIR) 77 | Name = "Nezha-5epoch-10fold-adversarial" 78 | submit_file = SUBMISSION_DIR+"/submit_{}.csv".format(Name) 79 | 80 | pd.DataFrame({"id": test_df['id'], "label": pred_labels}).to_csv( 81 | submit_file, index=False) 82 | 83 | 84 | 85 | config = Config() 86 | os.environ['PYTHONHASHSEED'] = '0' # make Python's hash-based operations deterministic 87 | random.seed(config.seed) 88 | np.random.seed(config.seed) 89 | torch.manual_seed(config.seed) 90 | torch.cuda.manual_seed_all(config.seed) 91 | 92 | train_clean = '/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv' 93 | test_clean = '/media/mgege007/winType/DaGuan/data/test_clean.csv' 94 | train = pd.read_csv(train_clean) 95 | test = pd.read_csv(test_clean) 96 | id2label = list(train['label'].unique()) 97 | label2id = {id2label[i]: i for i in range(len(id2label))} 98 | test_dataset = [] 99 | for i in tqdm(range(len(test))): 100 | test_dict = {} 101 | test_dict['text'] = test.loc[i, 'text'] 102 | test_dict['label'] = [-1]*35 103 | test_dataset.append(test_dict) 104 | test_D = data_generator(test_dataset, config) 105 | model_pre = [] 106 | for fold in tqdm(range(config.k_fold)): 107 | PATH = './models/bert_{}.pth'.format(fold) 108 | #model = MODEL_CLASSES[config.model](config).to(config.device) 109 | model = torch.load(PATH) 110 | model.eval() 111 | with torch.no_grad(): 112 | y_p = [] 113 | y_l = [] 114 | val_y = [] 115 | train_logit = None 116 | for input_ids, input_masks, segment_ids, labels in tqdm(test_D, disable=True): 117 | y_pred = model(input_ids, input_masks, segment_ids) 118 | y_pred = F.softmax(y_pred, dim=1) 119 | y_pred = y_pred.detach().to("cpu").numpy() 120 | if train_logit is None: 121 | train_logit = y_pred 122 | else: 123 | train_logit = np.vstack((train_logit, y_pred)) 124 | model_pre.append(train_logit) 125 | 126 | 127 | submit(np.array(model_pre), test, id2label) 128 | 129 | -------------------------------------------------------------------------------- /Nezha_pytorch/finetuning/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig, \ 3 | get_linear_schedule_with_warmup, XLNetModel, XLNetTokenizer, XLNetConfig 4 | import numpy as np 5 | import os 6 | import random 7 | from Config import * 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from collections import defaultdict 12 | from torch.optim.optimizer import Optimizer 13 | 14 | def paddingList(ls:list,val,returnTensor=False): 15 | ls=ls[:]# copy so the caller's list is not resized in place 16 | maxLen=max([len(i) for i in ls]) 17 | for i in range(len(ls)): 18 | ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) 19 | return torch.tensor(ls,device='cuda') if returnTensor else ls 20 | 21 | def fastTokenizer(a:str,b:str,maxLen,tk): 22 | a,b=a.split(),b.split() 23 | a,b=tk.convert_tokens_to_ids(a),tk.convert_tokens_to_ids(b) 24 | maxLen-=3# reserve room for [CLS] and the two [SEP] tokens 25 | assert maxLen>=0 26 | len2=maxLen//2# if the budget is odd, the left segment gets the longer half 27 | len1=maxLen-len2 28 | # four cases in total, depending on whether a and/or b exceed their budgets 29 | if len(a)+len(b)>maxLen:# truncation needed 30 | if len(a)<=len1 and len(b)>len2: 31 | b=b[:maxLen-len(a)] 32 | elif len(a)>len1 and len(b)<=len2: 33 | a=a[:maxLen-len(b)] 34 | elif len(a)>len1 and len(b)>len2: 35 | a=a[:len1] 36 | b=b[:len2] 37 | input_ids=[tk.cls_token_id]+a+[tk.sep_token_id]+b+[tk.sep_token_id] 38 | token_type_ids=[0]*(len(a)+2)+[1]*(len(b)+1) 39 | return {'input_ids': 
input_ids, 'token_type_ids': token_type_ids} 40 | 41 | 42 | class data_generator: 43 | def __init__(self, data, config, shuffle=False): 44 | self.data = data 45 | self.batch_size = config.batch_size 46 | self.max_length = config.MAX_LEN 47 | self.shuffle = shuffle 48 | 49 | vocab = 'vocab.txt' if os.path.exists(config.model_path + 'vocab.txt') else 'spiece.model' 50 | self.tokenizer = TOKENIZERS[config.model].from_pretrained(config.model_path + vocab) 51 | 52 | self.steps = len(self.data[0]) // self.batch_size 53 | if len(self.data[0]) % self.batch_size != 0: 54 | self.steps += 1 55 | 56 | def __len__(self): 57 | return self.steps 58 | 59 | def __iter__(self): 60 | input_ids, input_masks, segment_ids, labels = [], [], [], [] 61 | for index, data_li in enumerate(self.data): 62 | 63 | text = data_li['text'] 64 | label = data_li['label'] 65 | tkRes = self.tokenizer(text, max_length=self.max_length, truncation='longest_first', 66 | return_attention_mask=False) 67 | input_id = tkRes['input_ids'] 68 | segment_id = tkRes['token_type_ids'] 69 | assert len(segment_id) == len(input_id) 70 | input_ids.append(input_id) 71 | segment_ids.append(segment_id) 72 | labels.append(label) 73 | 74 | if len(input_ids) == self.batch_size or index == len(self.data)-1: 75 | input_ids = paddingList(input_ids, 0, returnTensor=True) # 动态padding 76 | segment_ids = paddingList(segment_ids, 0, returnTensor=True) 77 | input_masks = (input_ids != 0) 78 | yield input_ids, input_masks, segment_ids, labels 79 | input_ids, input_masks, segment_ids, labels = [], [], [], [] 80 | 81 | 82 | 83 | class PGD(): 84 | def __init__(self, model): 85 | self.model = model 86 | self.emb_backup = {} 87 | self.grad_backup = {} 88 | 89 | def attack(self, epsilon=0.3, alpha=0.1, emb_name='word_embeddings', is_first_attack=False): 90 | # emb_name这个参数要换成你模型中embedding的参数名 91 | for name, param in self.model.named_parameters(): 92 | if param.requires_grad and emb_name in name: 93 | if is_first_attack: 94 | self.emb_backup[name] = param.data.clone() 95 | norm = torch.norm(param.grad) 96 | if norm != 0 and not torch.isnan(norm): 97 | r_at = alpha * param.grad / norm 98 | param.data.add_(r_at) 99 | param.data = self.project(name, param.data, epsilon) 100 | 101 | def restore(self, emb_name='word_embeddings'): 102 | # emb_name这个参数要换成你模型中embedding的参数名 103 | for name, param in self.model.named_parameters(): 104 | if param.requires_grad and emb_name in name: 105 | assert name in self.emb_backup 106 | param.data = self.emb_backup[name] 107 | self.emb_backup = {} 108 | 109 | def project(self, param_name, param_data, epsilon): 110 | r = param_data - self.emb_backup[param_name] 111 | if torch.norm(r) > epsilon: 112 | r = epsilon * r / torch.norm(r) 113 | return self.emb_backup[param_name] + r 114 | 115 | def backup_grad(self): 116 | for name, param in self.model.named_parameters(): 117 | if param.requires_grad: 118 | self.grad_backup[name] = param.grad.clone() 119 | 120 | def restore_grad(self): 121 | for name, param in self.model.named_parameters(): 122 | if param.requires_grad: 123 | param.grad = self.grad_backup[name] 124 | 125 | 126 | 127 | class FGM(): 128 | def __init__(self, model): 129 | self.model = model 130 | self.backup = {} 131 | 132 | def attack(self, epsilon=0.25, emb_name='word_embeddings'): 133 | # emb_name这个参数要换成你模型中embedding的参数名 134 | for name, param in self.model.named_parameters(): 135 | if param.requires_grad and emb_name in name: 136 | self.backup[name] = param.data.clone() 137 | norm = torch.norm(param.grad) 138 | if norm != 0: 139 | 
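# FGM adversarial step: perturb the embedding weights by r_adv = epsilon * grad / ||grad||_2,
# i.e. the L2-normalized gradient direction scaled to length epsilon; restore() below
# puts the backed-up weights back after the adversarial backward pass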
r_at = epsilon * param.grad / norm 140 | param.data.add_(r_at) 141 | 142 | def restore(self, emb_name='word_embeddings'): 143 | # emb_name这个参数要换成你模型中embedding的参数名 144 | for name, param in self.model.named_parameters(): 145 | if param.requires_grad and emb_name in name: 146 | assert name in self.backup 147 | param.data = self.backup[name] 148 | self.backup = {} 149 | 150 | 151 | # 支持多分类和二分类 152 | class FocalLoss(nn.Module): 153 | """ 154 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 155 | 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' 156 | Focal_Loss= -1*alpha*(1-pt)^gamma*log(pt) 157 | :param num_class: 158 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 159 | :param gamma: (float,double) gamma > 0 reduces the relative loss 160 | for well-classified examples (p>0.5) putting more 161 | focus on hard misclassified example 162 | :param smooth: (float,double) smooth value when cross entropy 163 | :param balance_index: (int) balance class index, 164 | should be specific when alpha is float 165 | :param size_average: (bool, optional) By default, 166 | the losses are averaged over each loss element in the batch. 167 | """ 168 | def __init__(self, num_class, alpha=None, gamma=2, 169 | smooth=None, size_average=True): 170 | super(FocalLoss, self).__init__() 171 | self.num_class = num_class 172 | self.alpha = alpha 173 | self.gamma = gamma 174 | self.smooth = smooth 175 | self.size_average = size_average 176 | 177 | if self.alpha is None: 178 | self.alpha = torch.ones(self.num_class, 1) 179 | elif isinstance(self.alpha, (list, np.ndarray)): 180 | assert len(self.alpha) == self.num_class 181 | self.alpha = torch.FloatTensor(alpha).view(self.num_class, 1) 182 | self.alpha = self.alpha / self.alpha.sum() 183 | else: 184 | raise TypeError('Not support alpha type') 185 | if self.smooth is not None: 186 | if self.smooth < 0 or self.smooth > 1.0: 187 | raise ValueError('smooth value should be in [0,1]') 188 | 189 | def forward(self, input, target): 190 | logit = F.softmax(input, dim=1) 191 | 192 | if logit.dim() > 2: 193 | # N,C,d1,d2 -> N,C,m (m=d1*d2*...) 
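# flatten any extra spatial dimensions so each position is scored as an independent
# classification; for this task the model output is already (N, num_class), so this branch is skipped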
194 | logit = logit.view(logit.size(0), logit.size(1), -1) 195 | logit = logit.permute(0, 2, 1).contiguous() 196 | logit = logit.view(-1, logit.size(-1)) 197 | target = target.view(-1, 1) 198 | 199 | # N = input.size(0) 200 | # alpha = torch.ones(N, self.num_class) 201 | # alpha = alpha * (1 - self.alpha) 202 | # alpha = alpha.scatter_(1, target.long(), self.alpha) 203 | epsilon = 1e-10 204 | alpha = self.alpha 205 | if alpha.device != input.device: 206 | alpha = alpha.to(input.device) 207 | 208 | idx = target.cpu().long() 209 | one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() 210 | one_hot_key = one_hot_key.scatter_(1, idx, 1) 211 | if one_hot_key.device != logit.device: 212 | one_hot_key = one_hot_key.to(logit.device) 213 | 214 | if self.smooth: 215 | one_hot_key = torch.clamp( 216 | one_hot_key, self.smooth, 1.0 - self.smooth) 217 | pt = (one_hot_key * logit).sum(1) + epsilon 218 | logpt = pt.log() 219 | 220 | gamma = self.gamma 221 | 222 | alpha = alpha[idx] 223 | loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt 224 | 225 | if self.size_average: 226 | loss = loss.mean() 227 | else: 228 | loss = loss.sum() 229 | return loss 230 | 231 | 232 | def f1_match(y_true,y_pred): 233 | acc = sum(y_pred & y_true) / (sum(y_pred)) 234 | rec = sum(y_pred & y_true) / (sum(y_true)) 235 | 236 | return 2 * acc * rec /(acc + rec) 237 | 238 | class Lookahead(Optimizer): 239 | r"""PyTorch implementation of the lookahead wrapper. 240 | Lookahead Optimizer: https://arxiv.org/abs/1907.08610 241 | """ 242 | 243 | def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"): 244 | """optimizer: inner optimizer 245 | la_steps (int): number of lookahead steps 246 | la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer. 
247 | pullback_momentum (str): change to inner optimizer momentum on interpolation update 248 | """ 249 | self.optimizer = optimizer 250 | self._la_step = 0 # counter for inner optimizer 251 | self.la_alpha = la_alpha 252 | self._total_la_steps = la_steps 253 | pullback_momentum = pullback_momentum.lower() 254 | assert pullback_momentum in ["reset", "pullback", "none"] 255 | self.pullback_momentum = pullback_momentum 256 | 257 | self.state = defaultdict(dict) 258 | 259 | # Cache the current optimizer parameters 260 | for group in optimizer.param_groups: 261 | for p in group['params']: 262 | param_state = self.state[p] 263 | param_state['cached_params'] = torch.zeros_like(p.data) 264 | param_state['cached_params'].copy_(p.data) 265 | if self.pullback_momentum == "pullback": 266 | param_state['cached_mom'] = torch.zeros_like(p.data) 267 | 268 | def __getstate__(self): 269 | return { 270 | 'state': self.state, 271 | 'optimizer': self.optimizer, 272 | 'la_alpha': self.la_alpha, 273 | '_la_step': self._la_step, 274 | '_total_la_steps': self._total_la_steps, 275 | 'pullback_momentum': self.pullback_momentum 276 | } 277 | 278 | def zero_grad(self): 279 | self.optimizer.zero_grad() 280 | 281 | def get_la_step(self): 282 | return self._la_step 283 | 284 | def state_dict(self): 285 | return self.optimizer.state_dict() 286 | 287 | def load_state_dict(self, state_dict): 288 | self.optimizer.load_state_dict(state_dict) 289 | 290 | def _backup_and_load_cache(self): 291 | """Useful for performing evaluation on the slow weights (which typically generalize better) 292 | """ 293 | for group in self.optimizer.param_groups: 294 | for p in group['params']: 295 | param_state = self.state[p] 296 | param_state['backup_params'] = torch.zeros_like(p.data) 297 | param_state['backup_params'].copy_(p.data) 298 | p.data.copy_(param_state['cached_params']) 299 | 300 | def _clear_and_load_backup(self): 301 | for group in self.optimizer.param_groups: 302 | for p in group['params']: 303 | param_state = self.state[p] 304 | p.data.copy_(param_state['backup_params']) 305 | del param_state['backup_params'] 306 | 307 | @property 308 | def param_groups(self): 309 | return self.optimizer.param_groups 310 | 311 | def step(self, closure=None): 312 | """Performs a single Lookahead optimization step. 313 | Arguments: 314 | closure (callable, optional): A closure that reevaluates the model 315 | and returns the loss. 
316 | """ 317 | loss = self.optimizer.step(closure) 318 | self._la_step += 1 319 | 320 | if self._la_step >= self._total_la_steps: 321 | self._la_step = 0 322 | # Lookahead and cache the current optimizer parameters 323 | for group in self.optimizer.param_groups: 324 | for p in group['params']: 325 | param_state = self.state[p] 326 | p.data.mul_(self.la_alpha).add_( 327 | param_state['cached_params'], alpha=1.0 - self.la_alpha) # crucial line 328 | param_state['cached_params'].copy_(p.data) 329 | if self.pullback_momentum == "pullback": 330 | internal_momentum = self.optimizer.state[p]["momentum_buffer"] 331 | self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.la_alpha).add_( 332 | 1.0 - self.la_alpha, param_state["cached_mom"]) 333 | param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"] 334 | elif self.pullback_momentum == "reset": 335 | self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like( 336 | p.data) 337 | 338 | return loss 339 | -------------------------------------------------------------------------------- /Nezha_pytorch/nezha_model/.ipynb_checkpoints/config-checkpoint.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "../nezha-cn-base/", 3 | "architectures": [ 4 | "NeZhaForMaskedLM" 5 | ], 6 | "attention_probs_dropout_prob": 0.1, 7 | "classifier_dropout": null, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 768, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 3072, 14 | "layer_norm_eps": 1e-12, 15 | "max_position_embeddings": 512, 16 | "max_relative_position": 64, 17 | "model_type": "bert", 18 | "num_attention_heads": 12, 19 | "num_hidden_layers": 12, 20 | "pad_token_id": 0, 21 | "position_embedding_type": "absolute", 22 | "torch_dtype": "float32", 23 | "transformers_version": "4.10.0", 24 | "type_vocab_size": 2, 25 | "use_cache": true, 26 | "use_relative_position": true, 27 | "vocab_size": 3459 28 | } 29 | -------------------------------------------------------------------------------- /Nezha_pytorch/pretrain/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Nezha_pytorch/pretrain/.DS_Store -------------------------------------------------------------------------------- /Nezha_pytorch/pretrain/NEZHA/__pycache__/configuration_nezha.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Nezha_pytorch/pretrain/NEZHA/__pycache__/configuration_nezha.cpython-36.pyc -------------------------------------------------------------------------------- /Nezha_pytorch/pretrain/NEZHA/__pycache__/modeling_nezha.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Nezha_pytorch/pretrain/NEZHA/__pycache__/modeling_nezha.cpython-36.pyc -------------------------------------------------------------------------------- /Nezha_pytorch/pretrain/NEZHA/configuration_nezha.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import PretrainedConfig 3 | 4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 5 | 6 | class 
NeZhaConfig(PretrainedConfig): 7 | r""" 8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 11 | the ALBERT `xxlarge `__ architecture. 12 | 13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 15 | for more information. 16 | 17 | 18 | Args: 19 | vocab_size (:obj:`int`, optional, defaults to 30000): 20 | Vocabulary size of the ALBERT model. Defines the different tokens that 21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 22 | embedding_size (:obj:`int`, optional, defaults to 128): 23 | Dimensionality of vocabulary embeddings. 24 | hidden_size (:obj:`int`, optional, defaults to 4096): 25 | Dimensionality of the encoder layers and the pooler layer. 26 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 27 | Number of hidden layers in the Transformer encoder. 28 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 29 | Number of groups for the hidden layers, parameters in the same group are shared. 30 | num_attention_heads (:obj:`int`, optional, defaults to 64): 31 | Number of attention heads for each attention layer in the Transformer encoder. 32 | intermediate_size (:obj:`int`, optional, defaults to 16384): 33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 34 | inner_group_num (:obj:`int`, optional, defaults to 1): 35 | The number of inner repetition of attention and ffn. 36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 37 | The non-linear activation function (function or string) in the encoder and pooler. 38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 42 | The dropout ratio for the attention probabilities. 43 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 44 | The maximum sequence length that this model might ever be used with. Typically set this to something 45 | large (e.g., 512 or 1024 or 2048). 46 | type_vocab_size (:obj:`int`, optional, defaults to 2): 47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 48 | initializer_range (:obj:`float`, optional, defaults to 0.02): 49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 51 | The epsilon used by the layer normalization layers. 52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 53 | The dropout ratio for attached classifiers. 
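        Note: the argument descriptions above and the example below are carried over from the
        ALBERT configuration docstring. The NEZHA-base checkpoint shipped with this repo
        (nezha_model/config.json) actually loads hidden_size=768, num_attention_heads=12,
        intermediate_size=3072 and use_relative_position=True.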
54 | 55 | Example:: 56 | 57 | from transformers import AlbertConfig, AlbertModel 58 | # Initializing an ALBERT-xxlarge style configuration 59 | albert_xxlarge_configuration = AlbertConfig() 60 | 61 | # Initializing an ALBERT-base style configuration 62 | albert_base_configuration = AlbertConfig( 63 | hidden_size=768, 64 | num_attention_heads=12, 65 | intermediate_size=3072, 66 | ) 67 | 68 | # Initializing a model from the ALBERT-base style configuration 69 | model = AlbertModel(albert_xxlarge_configuration) 70 | 71 | # Accessing the model configuration 72 | configuration = model.config 73 | 74 | Attributes: 75 | pretrained_config_archive_map (Dict[str, str]): 76 | A dictionary containing all the available pre-trained checkpoints. 77 | """ 78 | 79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP 80 | model_type = "nezha" 81 | 82 | def __init__( 83 | self, 84 | vocab_size=30000, 85 | embedding_size=128, 86 | hidden_size=4096, 87 | num_hidden_layers=12, 88 | num_hidden_groups=1, 89 | num_attention_heads=64, 90 | intermediate_size=16384, 91 | inner_group_num=1, 92 | hidden_act="gelu_new", 93 | hidden_dropout_prob=0, 94 | attention_probs_dropout_prob=0, 95 | max_position_embeddings=512, 96 | max_relative_position=64, 97 | type_vocab_size=2, 98 | initializer_range=0.02, 99 | layer_norm_eps=1e-12, 100 | classifier_dropout_prob=0.1, 101 | use_relative_position=True, 102 | pad_token_id=0, 103 | bos_token_id=2, 104 | eos_token_id=3, 105 | **kwargs 106 | ): 107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 108 | 109 | self.vocab_size = vocab_size 110 | self.embedding_size = embedding_size 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_hidden_groups = num_hidden_groups 114 | self.num_attention_heads = num_attention_heads 115 | self.inner_group_num = inner_group_num 116 | self.hidden_act = hidden_act 117 | self.intermediate_size = intermediate_size 118 | self.hidden_dropout_prob = hidden_dropout_prob 119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 120 | self.max_position_embeddings = max_position_embeddings 121 | self.max_relative_position = max_relative_position 122 | self.type_vocab_size = type_vocab_size 123 | self.initializer_range = initializer_range 124 | self.layer_norm_eps = layer_norm_eps 125 | self.use_relative_position=use_relative_position 126 | self.classifier_dropout_prob = classifier_dropout_prob 127 | -------------------------------------------------------------------------------- /Nezha_pytorch/pretrain/NLP_Utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | import transformers as _ 4 | from transformers1 import BertTokenizer 5 | import torch 6 | from torch.utils.data import Dataset,DataLoader 7 | import numpy as np 8 | from itertools import chain 9 | import os 10 | import pandas as pd 11 | import re 12 | from tqdm import tqdm 13 | tqdm.pandas() 14 | 15 | def writeToJsonFile(path: str, obj): 16 | with open(path, "w", encoding="utf-8") as f: 17 | f.write(json.dumps(obj, ensure_ascii=False,indent=0)) 18 | def readFromJsonFile(path: str): 19 | with open(path, "r", encoding="utf-8") as f: 20 | return json.loads(f.read()) 21 | 22 | def loadData(path): 23 | allData=[] 24 | with open(path,"r") as f: 25 | j = 0 26 | for i in f: 27 | i=i.strip().split(',') 28 | if j == 0: 29 | j += 1 30 | continue 31 | if len(i)==0:#防止空行 32 | break 33 | if len(i)==3:#训练集 34 | 
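# training rows split into (id, text, label); preprocess_text() already replaced any ','
# inside the text column when the *_clean.csv files were written, so a plain split(',')
# keeps the three fields aligned here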
a,b,label=i 35 | b=b.split(' ') 36 | else:#测试集,直接转为id形式 37 | a,b,label=i[0],i[1],-1 38 | b=b.split(' ') 39 | allData.append([b, int(label)]) 40 | j+=1 41 | return allData 42 | 43 | def calNegPos(ls):#计算正负比例 44 | posNum,negNum=0,0 45 | for i in ls: 46 | if i[2]==0: 47 | negNum+=1 48 | elif i[2]==1: 49 | posNum+=1 50 | posNum=1 if posNum==0 else posNum 51 | return negNum,posNum,round(negNum/posNum,4) 52 | 53 | def preprocess_text(document): 54 | 55 | # 删除逗号, 脱敏数据中最大值为30357 56 | text = str(document) 57 | text = text.replace(',', '35001') 58 | text = text.replace('!', '35002') 59 | text = text.replace('?', '35003') 60 | text = text.replace('。', '35004') 61 | # text = text.replace('17281', '') 62 | # 用单个空格替换多个空格 63 | text = re.sub(r'\s+', ' ', text, flags=re.I) 64 | 65 | return text 66 | 67 | 68 | train_clean = '/media/mgege007/winType/DaGuan/data/train_clean.csv' 69 | test_clean = '/media/mgege007/winType/DaGuan/data/test_clean.csv' 70 | if not os.path.exists(train_clean): 71 | train_df = pd.read_csv('/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv') 72 | train_df["text"] = train_df["text"].progress_apply(lambda x: preprocess_text(x)) 73 | id2label = list(train_df['label'].unique()) 74 | label2id = {id2label[i]: i for i in range(len(id2label))} 75 | train_df["label"] = train_df["label"].map(label2id) 76 | test_df = pd.read_csv('/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv') 77 | test_df["text"] = test_df["text"].progress_apply(lambda x: preprocess_text(x)) 78 | train_df.to_csv(train_clean, index=False) 79 | test_df.to_csv(test_clean,index = False) 80 | 81 | allData = loadData(train_clean) + loadData(test_clean) 82 | testA_data = loadData(test_clean) 83 | # testB_data = loadData('/tcdata/gaiic_track3_round1_testB_20210317.tsv') 84 | random.shuffle(allData) 85 | 86 | train_data=allData#全量 87 | valid_data=allData[-20000:] 88 | print("训练集样本数量:", len(train_data)) 89 | 90 | def paddingList(ls:list,val,returnTensor=False): 91 | ls=ls[:]#不要改变了原list尺寸 92 | maxLen=max([len(i) for i in ls]) 93 | for i in range(len(ls)): 94 | ls[i]=ls[i]+[val]*(maxLen-len(ls[i])) 95 | return torch.tensor(ls,device='cuda') if returnTensor else ls 96 | 97 | def truncate(a:list,maxLen): 98 | maxLen-=3#空留给cls sep sep 99 | assert maxLen>=0 100 | #一共就a超长与否,b超长与否,组合的四种情况 101 | if len(a)>maxLen:#需要截断 102 | # 尾截断 103 | # a=a[:maxLen] 104 | # 首截断 105 | # a = a[maxLen-len(a):] 106 | # 首尾截断 107 | outlen = (len(a)-maxLen) 108 | headid = int(outlen/2) 109 | a = a[headid:headid-outlen] 110 | 111 | return a 112 | 113 | class MLM_Data(Dataset): 114 | #传入句子对列表 115 | def __init__(self,textLs:list,maxLen:int,tk:BertTokenizer): 116 | super().__init__() 117 | self.data=textLs 118 | self.maxLen=maxLen 119 | self.tk=tk 120 | self.spNum=len(tk.all_special_tokens) 121 | self.tkNum=tk.vocab_size 122 | 123 | def __len__(self): 124 | return len(self.data) 125 | 126 | def random_mask(self,text_ids): 127 | input_ids, output_ids = [], [] 128 | rands = np.random.random(len(text_ids)) 129 | idx=0 130 | mask_p = 0.5 # 原始是0.15,加大mask_p就会加大预训练难度 131 | while idx python 3.6+ 6 | >pytorch 1.7.1+ 7 | >cuda 11.2 8 | >transformers 4.9.2+ 9 | >tqdm 4.61.2 10 | 11 | (2)代码结构 12 | ``` 13 | ├── Bert_pytorch # Bert 方案 14 | │   ├── bert-base-chinese # 初始权重,下载地址https://huggingface.co/bert-base-chinese# 15 | │   ├── bert_finetuning # Bert微调 16 | │   │   ├── Config.py # Bert配置文件 17 | │   │   ├── ensemble_10fold.py # 10折checkpoint融合 18 | │   │   ├── ensemble_single.py #每种模型不划分验证集只生成的一个模型,用这些模型进行checkpoint融合 19 | │   │   ├── 
generate_pseudo_label.py # 利用做高分模型 给无标注数据做伪标签 20 | │   │   ├── main_bert_10fold.py # 划分10折的Bert,这种会存储10个模型,每一个fold一个模型 21 | │   │   ├── main_bert_all.py # 不划分验证集的Bert,这种只会存储一个模型 22 | │   │   ├── model.py # 17种魔改Bert,和其他网络的具体实现部分 23 | │   │   ├── models 24 | │   │   ├── NEZHA # 网络结构实现文件,来源于官网 25 | │   │   │   ├── configuration_nezha.py 26 | │   │   │   └── modeling_nezha.py 27 | │   │   ├── predict.py # 用模型模型进行预测测试集 28 | │   │   ├── predict_tta.py # 用模型进行预测测试集,并使用TTA 测试集增强 29 | │   │   ├── stacking.py # Stacking集成方法 30 | │   │   └── utils.py # 工具函数 31 | │   ├── bert_model_1000 # 存储预训练模型,下载地址https://drive.google.com/file/d/1rpWe5ec_buORvu8-ezvvAk9jrUZkOsIr/view?usp=sharing 32 | │   ├── Data_analysis.ipynb # 数据分析 33 | │   ├── Generate_TTA.ipynb # 生成TTA测试集增强的文件 34 | │   └── pretrain # Bert预训练 35 | │   ├── bert_model 36 | │   │   ├── vocab_100w.txt # 100W未标注数据语料的词典,有18544个词 37 | │   │   ├── vocab_3462.txt # 整个训练集和测试集的词典,不包括未标注数据 38 | │   │   └── vocab.txt 39 | │   ├── NLP_Utils.py 40 | │   ├── train_bert.py # Bert预训练主函数 41 | │   └── transformers1.zip # transformes较高的版本 42 | ├── data 43 | │   ├── datagrand_2021_test.csv # 测试集 44 | │   └── datagrand_2021_train.csv # 训练集 45 | ├── Nezha_pytorch #NEZHA预训练方案 46 | │   ├── finetuning # Nezha微调 47 | │   │   ├── Config.py  48 | │   │   ├── model.py #模型实现文件 49 | │   │   ├── models 50 | │   │   ├── NEZHA 51 | │   │   │   ├── configuration_nezha.py 52 | │   │   │   └── modeling_nezha.py 53 | │   │   ├── NEZHA_main.py #微调主函数 54 | │   │   ├── predict.py # 10折模型预测 55 | │   │   ├── submit 56 | │   │   │   └── submit_bert_5epoch-10fold-first.csv 57 | │   │   └── utils.py 58 | │   ├── nezha-cn-base #nezha-base初始权重,下载地址https://github.com/lonePatient/NeZha_Chinese_PyTorch 59 | │   ├── nezha_model #存放预训练生成的模型 60 | │   ├── NEZHA_models 61 | │   ├── nezha_output #预训练的checkpoint 62 | │   ├── pretrain #nezha预训练 63 | │   │   ├── __init__.py 64 | │   │   ├── NEZHA 65 | │   │   │   ├── configuration_nezha.py 66 | │   │   │   ├── modeling_nezha.py 67 | │   │   ├── nezha_model 68 | │   │   │   └── vocab.txt # 预训练时,所需要的训练集的词典 69 | │   │   ├── NLP_Utils.py 70 | │   │   ├── train_nezha.py #预训练NEZHA的主函数 71 | │   │   └── transformers1.zip # 更高版本的transformers 72 | │   └── submit 73 | ``` 74 | 75 | (3)下载 76 | + [nezha-base-chinese 权重下载](https://github.com/lonePatient/NeZha_Chinese_PyTorch) 77 | + [bert-base-chinese 权重下载](https://drive.google.com/file/d/1rpWe5ec_buORvu8-ezvvAk9jrUZkOsIr/view?usp=sharing) 78 | + [NEZHA预训练模型下载](https://drive.google.com/file/d/121KlWkc4PPOfjojFo-4LAYSNKpgigMQ-/view?usp=sharing) 79 | + [Bert 预训练模型下载](https://drive.google.com/file/d/1rpWe5ec_buORvu8-ezvvAk9jrUZkOsIr/view?usp=sharing) 80 | + [Word2vec和Fasttext训练好的词向量下载](https://drive.google.com/file/d/1jcQR4E7AvkYwiGj8SzI6D4TmKt4-5Wqa/view?usp=sharing) 81 | + 82 | (4)博客详细介绍 83 | [【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】1 初赛Rank12的总结与分析](https://zhuanlan.zhihu.com/p/412897603/) 84 | [【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】2 DPCNN、HAN、RCNN等传统深度学习方案](https://zhuanlan.zhihu.com/p/413250318) 85 | [【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】3Bert和Nezha方案](https://zhuanlan.zhihu.com/p/413475410) 86 | 87 | 88 | # 2 引言 89 | 90 | ​ 2021年的暑假,与博远、禹飞、沛恒刚打完科大讯飞的比赛,又续上类似的赛题2021的”达观杯“,继续经过一个多月,连续的战斗,比赛终于落下帷幕。A榜我们最高成绩可以达到0.62+,原本可以排名到第7,但是提交次数限制,未能提交最高得分文件。导致A榜只达到第12名。以及对于这种赛制的不理解,导致B榜滑落到21名。对我们的打击巨大。第一次打这种赛制的比赛,被恶心到了。但是也是学习到了很多东西,吸取教训,下次还能再接再厉。 91 | 92 | ​ 
该赛题和[2021年天池举办的全球人工智能大赛](https://tianchi.aliyun.com/competition/entrance/531852/introduction?spm=5176.12281957.1004.6.38b03eafApg5Vq)的赛道一几乎一样,就是标签性质不一样,天池赛题是多标签多分类,该赛题是多分类单标签。和[赛道三](https://tianchi.aliyun.com/competition/entrance/531851/information)也是类似,以及天池举办的新手赛-[新闻文本分类](https://tianchi.aliyun.com/competition/entrance/531810/introduction)都是一样的性质,脱敏数据的文本分类赛题。这个比赛我们参考了赛道一和赛道三的许多资料和方案。 93 | 94 | ​ 7月26号已经开赛,8月16号这天才决定参赛,比赛花了36天。比赛过程分为三个阶段,第阶段钻研传统DL模型、第二阶段使用NEZHA和Bert实现预训练模型、第三阶段微调和预训练改进,以及多种提分技巧的应用。第一阶段,完全不需要云服务器,我本地的显卡就足够使用,但是来到第二阶段的开始使用预训练模型,我们必须使用恒源云上更大显存,更快运行速度的3090云服务器。苦战一个月,每天100左右的开销,邀请了所有周围的同学朋友,帮忙注册并认证,才送了不少的使用券,比赛的最后有一个星期,几个程序在跑,GPU不够用,我们成绩达到0.62+,排名也来得历史最高第三,租下了一个包周的3090连续跑了三天。队友还租的是恒源云的V100S,32G显存的显卡,跑nezha_large,都占满了,跑了好几天,开销巨大,预训练模型成本太高了,GPU也实在是太贵了。 95 | 96 | 97 | # 3 方案 98 | 99 | ![3](https://img-blog.csdnimg.cn/03cf355f9a61447191ec67a16215dc49.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAQmV0dGVyIEJlbmNo,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center) 100 | 101 | ## 2.1 传统DL方案 102 | 103 | 详细的方案代码解析见[【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】3 DPCNN、HAN、RCNN等传统深度学习方案]() 104 | 105 | 我们的Baseline采用的是胶囊网络Capsule Net,进行线上提交就有0.55+的成绩,首次提交就排名在30+。传统的DL训练时间较短,仅有2小时左右。无标签的数据,我们未利用在传统的DL模型上。通过word2vec和fastext词向量的拼接、优化器的选择、sheduler学习率的选择,句子的最大长度选择截断,10折分层划分,用早停的方式去控制过拟合等方式,突破0.56+。同理实现DPCNN、RCNN、HAN,投票产生部分伪标签加入模型进行重新训练,单个模型DPCNN效果最高,达到传统DL模型的最高0.5779。再次投票得到0.5828的最高得分。单个模型最佳的参数如下 106 | 107 | | 模型 | 词向量、维度 | Max_len | BS | sheduler学习率 | 优化器 | Fold | 训练时间 | 108 | | :---------: | -------------------- | ------- | ---- | ---------------------------- | ------ | ---- | -------- | 109 | | Capsule Net | word2vc+fastext、128 | 100 | 32 | CosineAnnealingWarmRestrarts | Adamw | 10 | 2.0小时 | 110 | | RCNN | word2vc+fastext、128 | 100 | 32 | CosineAnnealingWarmRestrarts | Adamw | 10 | 2.5小时 | 111 | | HAN | word2vc+fastext、128 | 100 | 32 | CosineAnnealingLR | Adamw | 10 | 2.5小时 | 112 | | DPCNN | word2vc+fastext、128 | 100 | 32 | CosineAnnealingWarmRestrarts | Adamw | 10 | 2.0小时 | 113 | 114 | 对比过的选择 115 | 116 | + Scheduler学习率 117 | + Constant_shedule 118 | + CosineAnnealingWarmRestarts 最佳 119 | + CosineAnnealing 较好 120 | + 优化器 121 | + Lookhead 122 | + AdamW 最佳 123 | + 对抗训练 124 | + FGM 效果不佳 125 | + PGD 效果不佳 126 | + K折划分方式 127 | + Kfold 128 | + MutilabelStratifiedKfold 129 | + StratifiedKfold 最佳 130 | 131 | 132 | 133 | ## 2.2 预训练方案 134 | 135 | 详细代码解析见[【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】3 Bert和Nezha方案]() 136 | 137 | 这种方案花费了我们四个星期的时间,训练迭代优化过程非常缓慢,但效果显著。预训练和微调训练参数对应训练时间如下。(Batch size 简称BS) 138 | 139 | | 模型 | 预训练Epoch | 预训练BS | 微调Epoch | 微调BS | 对抗训练 | GPU设备 | 训练时间 | 占用显存 | 140 | | ----------- | :---------: | :------: | :-------: | :----: | :------: | :-----: | :----------: | :------: | 141 | | 魔改Bert | 1000 | 32 | 50 | 32 | 无 | 3090 | 12+7=19小时 | 7G | 142 | | Nezha-base | 480 | 32 | 50 | 32 | FGM | 3090 | 6+13=19小时 | 7G | 143 | | Nezha-large | 300 | 64 | 50 | 32 | 无 | V100S | 4+9 = 13小时 | 31G | 144 | 145 | + 总共预训练只训练14009+6004条样本数据。未标注数据,我们有加入40万的语料去训练NEZHA,用3090训练了5天5夜,用来微调测试的效果并不佳,时间成本太高最终放弃该方案。词典也尝试过用10W 语料1.8w+的词典大小,去预训练,发现线上效果都不如只使用标注数据词典的效果。最终还是选择3456个词个数的词典和只使用标注的训练集。 146 | 147 | + Bert模型 148 | 149 | + 模型并不是使用传统的bert,使用多种魔改的Bert方案,最终在Bert后接上一个LSTM,效果最佳,次之是最后一层向量取平均后与最后一层cls拼接的魔改Bert最佳 150 | + 其他魔改,比如只是用最后一层cls,最后四层cls拼接等等17种魔改Bert,具体见实现代码[model.py]() 151 | 152 | + Bert预训练的技巧有 153 | 154 | + 首尾截断方式:首部阶段和尾部截断方式并没有时间进行对比,预训练的调参时间成本太高。 155 | 156 | + 动态MASK 策略:可以每次迭代都随机生成新的mask文本,增强模型泛化能力。 157 | 158 | + Mask概率 :0.15,在NEZHA上尝试加大训练难度,改为过0.5,但是在Bert上并没有带来增益 159 | 160 | + N-gram 
掩码策略:以Mask掩码概率选中token,为增加训练难度,选中部分以70%、20%、10%的概率进行1-gram、2-gram、3-gram片段的mask(选中token使用[MASK]、随机词、自身替换的概率和原版Bert一致) 161 | 162 | + 权重衰退:weight_decay=0.01 163 | 164 | + Warmup学习率 165 | 166 | + 数据预处理:将逗号、感叹号、问号中文符号删除,且删除最高词频的17281。 167 | 168 | 169 | 170 | + NEZHA预训练技巧和Bert类似,区别在于数据预处理不同、掩码概率不同,选取的是0.5,且尝试了冻结word_Eembedding之外的所有层,只训练该层,加快预训练时间,缩短了一半的时间。但是这种冻结参数的方式,在Bert和nazha_large上,预训练的loss下降非常缓慢,最终只在nezha上进行了实验。 171 | 172 | + 数据预处理:并未删除中文符号,还将其替换为大于词典最大数的脱敏数据 173 | 174 | + Mask概率:0.5 175 | + 冻结word_Embedding以外的所有层数。 176 | 177 | + NEZHA和Bert的微调几乎类似,唯一的区别就是在于数据预处理的方式不一样,具体实现,查看[【2021 第五届“达观杯” 基于大规模预训练模型的风险事件标签识别】3 Bert和Nezha方案]() 178 | 179 | 180 | 181 | # 3 提分技巧 182 | 183 | + 训练集数据增广 184 | + 尝试过EDA的数据增广,效果并不佳 185 | + 在比赛后期,用在TTA测试集数据增强的上一种方式,还未在训练集上尝试,就是Shuffle每个样本的句子级别。把每个样本的句子进行调动顺序,并不是EDA中词级别的shuffle 186 | + 伪标签 187 | + 利用多模型方案投票的原理,选出测试集的高质量伪标签,加入训练集,重新训练模型。在此任务中,只在传统DL方案中有效果,在预训练方案中无效,反而降低了模型效果,具体原因分析,可能是因为该任务的本身计算的准确率只有60%不到。做出来的伪标签质量并不高。 188 | + 可以利用主办方提供的未标注数据,生成伪标签进行训练,但是由于该任务的准确率实在太低,A榜第一都只有0.63+的准确率,生成的伪标签质量并不高,这种方案在该任务中行不通。 189 | + 投票融合 190 | + 利用不同模型能学习到的不同特征,多模型的结果进行投票,能提升4个千分点。但是仅限于模型之间线上得分差异较小。比如我们Nezha单模达到了0.62+的时候,Bert和其他方案还在0.59+徘徊,这样的投票融合,反而会拉低最高单模的分数,加权也不行,血的教训。 191 | + checkpoint融合 192 | + 每个fold都存储一个模型,等程序跑完将这些模型一起加载都预测一遍测试集,得到多个6004行35列的矩阵,每行取算术平均后再进行计算标签。同样要求模型之间线上得分差异小,差异大了,加权也无法带来增益,反而会拉低最高单模的效果。具体实现,可以查看代码[predict.py]() 193 | + TTA测试集数据增强 194 | + 对测试集的样本进行句子级别的shuffle,作为新的测试集,用模型预测,得到6004行35列的矩阵,与原始测试集用模型预测得到6004行35列的矩阵,相加后取算术平均作为最终测试集的预测结果。线上能带来3个千分点的增益。具体实现,可以查看代码[predict.py]() 195 | 196 | # 4 加快训练 197 | 198 | + FP16:混合精度训练,每个epoch加快10S左右 199 | + 预训练只训练word_embedding,在其他赛题的任务中,有人提出过冻结word_embedding和position_embedding,但是我们在复现该方法时,查看NEZHA模型只有word_embedding层,并未找到position_embedding层。训练时间缩短一半。但是,该方法在Bert上导致预训练Loss下降缓慢,甚至不下降,最终只在Nezha-base上尝试使用。 200 | + 用更高配置的GPU:我们通过使用不同的显卡设备,发现设备配置越高,训练速度越快,即使占用的显存都一样。3090真香。 201 | 202 | 203 | 204 | # 5 总结和反思 205 | 206 | (1)总结 207 | 208 | + 在比赛中,做预训练模型,选用初始设置跑出来一个预训练模型后,再去固定了微调方案,反过来去对预训练方案进行改进和调参。不要着急去做微调,我们这次的比赛中,就犯了这个错误,预训练方案到比赛的最后一天都没有最终确定下来,最后一天还在跑预训练。导致比赛的最后阶段没有去做好微调方案,还有很多微调方案没来得及尝试和对比。 209 | + 我们团队虽然使用了语雀来维护一个文档,但是代码并没有管理,导致经常出现队友之前代码不一致,沟通和任务安排经常出现偏差。应该使用Git去管我们的代码 210 | + 队友之间配合还欠缺默契,经常传递信息不够明确,过程中出现了,队友之间跑着一样的程序,占用着两个GPU,或者说用GPU跑着一个没有实验意义的程序。团队中还出现,跑的程序不知道和哪个程序是对比实验,跑出来的结果没有实验对比性,无法判断跑的某个点是否带来增益,白白浪费GPU和时间。 211 | 212 | (2)继续提升方向 213 | 214 | + 预训练 215 | 216 | + 参考roberta,将句子复制若干份,让模型见到更多的句子遮罩方法,提高模型见到token的数量,提升预训练模型的鲁棒性 217 | + 句子数据增广后再预训练 218 | + TF-IDF Predict Task:提取TFIDF权重,显示的告诉模型权重,让模型学习不同词权重在中的分布关系(来源[[2021天池全球人工智能大赛赛道一](https://gaiic.tianchi.aliyun.com/)冠军方案提出) 219 | + 掩码策略改进(思路来源:https://github.com/nilboy/gaic_track3_pair_sim) 220 | + WWM 完全掩码 221 | + 动态Mask 222 | + n-gram Mask 223 | + 混合Maks 224 | + similar ngram mask 225 | + 加入主办方提供的未标注数据,足足有72G,如果时间允许,设备足够高,预训练充分后,这将会带来巨大的增益。 226 | + 通过Bert实现同义词替换(思路来源:天池-全球人工智能大赛赛道一-rank2-炼丹术士) 227 | 228 | ![tong](https://img-blog.csdnimg.cn/b734261699f040f5ad78c8f1ef1d8b94.png?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAQmV0dGVyIEJlbmNo,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center) 229 | 230 | + 问题优化(思路来源:[小布助手问题匹配-冠军方案](https://yiwise-algo.yuque.com/docs/share/5a1e3b76-4d04-4127-979a-496d7bc8c1b8?#%20%E3%80%8A%E7%9F%AD%E6%96%87%E6%9C%AC%E7%9B%B8%E4%BC%BC%E5%8C%B9%E9%85%8D%E3%80%8B)) 231 | 232 | ![qe](https://img-blog.csdnimg.cn/ccfd61aef0574b859cbb156265a0dfc5.jpg?x-oss-process=image/watermark,type_ZHJvaWRzYW5zZmFsbGJhY2s,shadow_50,text_Q1NETiBAQmV0dGVyIEJlbmNo,size_20,color_FFFFFF,t_70,g_se,x_16#pic_center) 233 | 234 | - 微调 235 | + EDA 
数据增广在脱敏数据上表现不佳,但是AEDA这个方法还未尝试过,就是在句子中随机插入标点符号。(来源资料:https://mp.weixin.qq.com/s/R6uDbn3CqxFkOye73Rqpqg) 236 | 237 | + 模型融合 238 | 239 | + Stacking:我实现过,单个模型都上了0.58+,但是本地验证只有0.55+左右,理论上不应该的,应该是未能正确实现 240 | + Checkpoint融合:这种方案得到的结果最为稳重,我们在B榜没有经验,提交的文件只是单模的,我们未能提交融合后的方案。 241 | 242 | + 伪标签 243 | 244 | + 由于该任务本身准确率不高,就连A榜第一都只有63%的准确率,做出来的标签不佳,但是如果在其他准确率高的任务中,这将会是一个大杀器。 245 | + 做伪标签的数据除了是测试集,还可以是未标注的数据,未标注的数据有足够大,足够训练模型。 246 | 247 | + 新方案 248 | 249 | + ELETRA-Pytorch版本,并没有尝试 250 | 251 | - https://github.com/richarddwang/electra_pytorch 252 | - https://github.com/lucidrains/electra-pytorch 253 | 254 | - 知识蒸馏的预训练模型 255 | - 训练加速 256 | - 华为 TinyBert fine-tune阶段采用了数据增强的策略(mask之后预测 并使用余弦相似度来选择对应的N个候选词最后以概率p选择是否替换这个单词,从而产生更多的文本数据) 257 | - 百度**ERNIE** pytorch 258 | - https://github.com/649453932/Bert-Chinese-Text-Classification-Pytorch 259 | - ConVBert 260 | - https://github.com/codertimo/ConveRT-pytorch 261 | 262 | # 6 参考资料 263 | 264 | + [AEDA:随机插入符号的数据增强](https://mp.weixin.qq.com/s/R6uDbn3CqxFkOye73Rqpqg) 265 | 266 | + [天池入门赛-新闻文本分类官网首页](https://tianchi.aliyun.com/competition/entrance/531810/introduction) 267 | + [天池-全球人工智能大赛赛道一官网首页](https://tianchi.aliyun.com/competition/entrance/531852/introduction?spm=5176.12281957.1004.6.38b03eafApg5Vq) 268 | + [天池-全球人工智能大赛赛道三官网首页](https://tianchi.aliyun.com/competition/entrance/531851/information) 269 | + [天池-全球人工智能大赛赛道一-rank 4详解博客](http://www.nsytsqdtn.cn/2021/06/10/gaic1/) 270 | + [天池-全球人工智能大赛赛道一-HAN 和Capsule网络开源方案-github](https://github.com/nsytsqdtn/competition-baseline/tree/main/%E5%85%A8%E7%90%83%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E6%8A%80%E6%9C%AF%E5%88%9B%E6%96%B0%E5%A4%A7%E8%B5%9B/%E8%B5%9B%E9%81%93%E4%B8%80) 271 | + [天池-全球人工智能大赛赛道一NEZHA方案-github](https://github.com/daniellibin/gaiic2021_track3_querySim) 272 | 273 | - [天池-全球人工智能大赛赛道一Seq2Seq方案-github](https://github.com/lollipop97/2021-GAIIC-phase1-seq2seq) 274 | - [天池-全球人工智能大赛赛道一-TextCNN +encoder方案 ](https://blog.51cto.com/u_15194128/2795986) 275 | - [天池-全球人工智能大赛赛道一-Rank3 方案博客详解](https://yihaochan.github.io/post/c516f0e3.html#more) 276 | - [天池-全球人工智能大赛赛道一周周星方案1](https://github.com/chizhu/tianchi-gaic-track3-share) 277 | - [天池-全球人工智能大赛赛道一周星星方案2](https://github.com/lonePatient/2021-GAIIC-Track1-idea) 278 | - 新闻文本分类-Rank 1 [博客](https://tianchi.aliyun.com/forum/postDetail?postId=128789) [代码](https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION?spm=5176.21852664.0.0.36fa49f5I7OwXj) 279 | - 新闻文本分类-Rank5 [博客](https://tianchi.aliyun.com/forum/postDetail?postId=132549) [代码](https://github.com/Goldgaruda/Tianchi-NLP-News-Text-Classification-Rank-5-solution?spm=5176.21852664.0.0.772a5d31tOpZ4r) 280 | - 新闻文本分类-Rank4 [博客](https://zhuanlan.zhihu.com/p/231180925?spm=5176.21852664.0.0.57183248jdRAwm) [代码](https://github.com/MM-IR/rank4_NLP_textclassification?spm=5176.21852664.0.0.57183248jdRAwm) 281 | - 新闻文本分类-Rank11 [代码](https://github.com/NorthblueM/NLP_NewsTextClassification) 282 | - 新闻文本分类-Rank6 [博客](https://tianchi.aliyun.com/forum/postDetail?postId=132518) [代码](https://github.com/Warrenheww/rank6_NLP_newstextclassification?spm=5176.21852664.0.0.23e119a2HVLK0q) 283 | - [天池-全球人工智能大赛赛道三 RANK1](https://github.com/nilboy/gaic_track3_pair_sim) 284 | - [天池-全球人工智能大赛赛道三 RANK3]( https://github.com/daniellibin/gaiic2021_track3_querySim?spm=5176.21852664.0.0.3edb2448GYhSKg) 285 | - [天池-全球人工智能大赛赛道三 NEZHA-Pytorch方案](https://github.com/Jackory/TCGAIIC) 286 | -------------------------------------------------------------------------------- /Traditional-DL/Voting.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import os\n", 12 | "from pprint import pprint\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# 1 加载数据" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_DIR ='./submit/'\n", 29 | "files = os.listdir(DATA_DIR)\n", 30 | "files = [i for i in files if i[0]!='.']\n", 31 | "print(len(files))\n", 32 | "pprint(files)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "train = pd.read_csv('./data/datagrand_2021_train.csv')\n", 42 | "#将标签进行转换\n", 43 | "label_id2cate = dict(enumerate(train.label.unique()))\n", 44 | "label_cate2id = {value: key for key, value in label_id2cate.items()}\n", 45 | "\n", 46 | "# 合并所有结果\n", 47 | "sub_exp_df = pd.read_csv('./data/datagrand_2021_test.csv')\n", 48 | "df_merged = sub_exp_df.drop(['text'], axis=1)\n", 49 | "for file in files:\n", 50 | " tmp_df = pd.read_csv(DATA_DIR + file)\n", 51 | " tmp_df['ylabel'] = tmp_df['label'].map(label_cate2id)\n", 52 | " df_merged[file] = tmp_df['label'].map(label_cate2id)\n", 53 | "df_merged.head()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 1.1 计算pearson相关系数" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "x = df_merged['a.csv']\n", 70 | "for i in range(len(files)):\n", 71 | " y = df_merged[files[i]]\n", 72 | " r = x.corr(y,method=\"kendall\") #-0.2611165\n", 73 | " print(files[i],r)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# 2 投票融合" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "\n", 90 | "# 计票\n", 91 | "def work(pres):\n", 92 | " count = [0]*35\n", 93 | " for i in pres:\n", 94 | " count[i] += 1\n", 95 | " out = count.index(max(count))\n", 96 | " return out" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "tmp_arr = np.array(df_merged.iloc[:,1:])\n", 106 | "label_voted = [work(line) for line in tmp_arr]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "\n", 116 | "submit_df = sub_exp_df.drop(['text'], axis=1)\n", 117 | "submit_df['label'] = label_voted\n", 118 | "submit_df['label'] = submit_df['label'].map(label_id2cate)\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "submit_df.head()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "model_name = \"voting-a-b\"\n", 137 | "submit_df.to_csv('./voting_data/{}.csv'.format(model_name), index=False)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "# 3 投票生成伪标签" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | 
"execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# 计票\n", 154 | "def work_high(pres):\n", 155 | " count = [0]*35\n", 156 | " for i in pres:\n", 157 | " count[i] += 1\n", 158 | " p = len(files)-1 # 该数字根据融合的文件自定义的,如果有11个文件,设该值为11表示,11个结果都投票才可以作为伪标签数据\n", 159 | " if max(count) >p:\n", 160 | " out = count.index(max(count))\n", 161 | " else:\n", 162 | " out = -1\n", 163 | " return out\n", 164 | "\n", 165 | "tmp_arr = np.array(df_merged.iloc[:,1:])\n", 166 | "label_voted = [work_high(line) for line in tmp_arr]\n", 167 | "# 没有做伪标签的数据数量\n", 168 | "print(label_voted.count(-1))\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "test_data = sub_exp_df\n", 178 | "test_data['label'] = label_voted\n", 179 | "# 删除不能作为伪标签的数据\n", 180 | "test_data = test_data.drop(test_data[test_data['label']==-1].index)\n", 181 | "len(test_data)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# 反编码映射\n", 191 | "model_name = \"pseudo_label_data\"\n", 192 | "test_data['label'] = test_data['label'].map(label_id2cate)\n", 193 | "test_data" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "test_data" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# 读取原始训练集\n", 212 | "pseudo_train = train\n", 213 | "# 合并原始训练集和伪标签数据\n", 214 | "pseudo_label_train = pd.concat([pseudo_train,test_data]).reset_index().drop(columns=['index'])\n", 215 | "model_name = \"./data/pseudo_train_data_{}\".format(len(pseudo_label_train))\n", 216 | "pseudo_label_train.to_csv('{}.csv'.format(model_name), index=False)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "pseudo_label_train" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "pseudo_label_train" 235 | ] 236 | } 237 | ], 238 | "metadata": { 239 | "interpreter": { 240 | "hash": "d0af45aabaa4bdc75d90fcfc8fc229e38c92ba6df4ad10e64e4ab597fb95609a" 241 | }, 242 | "kernelspec": { 243 | "display_name": "Python 3.8.5 64-bit ('base': conda)", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "name": "python", 248 | "version": "" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } -------------------------------------------------------------------------------- /Traditional-DL/main_DL.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | import torch 5 | from transformers.optimization import get_constant_schedule 6 | from sklearn.model_selection import KFold, StratifiedKFold 7 | from iterstrat.ml_stratifiers import MultilabelStratifiedKFold 8 | import warnings 9 | import torch.nn as nn 10 | from tqdm import tqdm 11 | import random 12 | import gensim 13 | import argparse 14 | import os 15 | from torch.utils import data 16 | from sklearn.metrics import f1_score 17 | from torch import nn 18 | from torch.optim import AdamW 19 | from utils.adversarial_model import FGM, PGD 20 | from utils.init_net import init_network 21 | from 
utils.optimizer_lookahead import Lookahead 22 | from utils.DL_model import * 23 | torch.set_printoptions(edgeitems=768) 24 | warnings.filterwarnings("ignore") 25 | np.set_printoptions(threshold=np.inf) 26 | DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu" 27 | MODELS = { 28 | 'CapsuleNet': CapsuleNet, 29 | 'HAN': HAN, 30 | 'DPCNN': DPCNN, 31 | 'TextRCNNAttn': TextRCNNAttn, 32 | 'TextRCNN': TextRCNN 33 | } 34 | def basic_setting(SEED, DEVICE): 35 | random.seed(SEED) 36 | os.environ["PYTHONHASHSEED"] = str(SEED) 37 | np.random.seed(SEED) 38 | torch.manual_seed(SEED) 39 | if DEVICE != 'cpu': 40 | torch.cuda.manual_seed(SEED) 41 | torch.backends.cudnn.deterministic = True 42 | torch.backends.cudnn.benchmark = False 43 | 44 | # 数据预处理和训练词向量 45 | def data_process(): 46 | train_data = pd.read_csv("data/datagrand_2021_train.csv") 47 | test_data = pd.read_csv("/data/datagrand_2021_test.csv") 48 | id2label = list(train_data['label'].unique()) 49 | label2id = {id2label[i]: i for i in range(len(id2label))} 50 | y_train = np.zeros((len(train_data), len(id2label)), dtype=np.int8) 51 | 52 | all_sentences = pd.concat( 53 | [train_data['text'], test_data['text']]).reset_index(drop=True) 54 | all_sentences.drop_duplicates().reset_index(drop=True, inplace=True) 55 | all_sentences = all_sentences.apply(lambda x: x.split(' ')).tolist() 56 | if not os.path.exists('./embedding/w2v.model'): 57 | w2v_model = gensim.models.word2vec.Word2Vec( 58 | all_sentences, sg=1, vector_size=300, window=7, min_count=1, negative=3, sample=0.001, hs=1, seed=452) 59 | w2v_model.save('./embedding/w2v.model') 60 | else: 61 | w2v_model = gensim.models.word2vec.Word2Vec.load( 62 | "./embedding/w2v.model") 63 | 64 | if not os.path.exists('./embedding/fasttext.model'): 65 | fasttext_model = gensim.models.FastText( 66 | all_sentences, seed=452, vector_size=100, min_count=1, epochs=20, window=2) 67 | fasttext_model.save('./embedding/fasttext.model') 68 | else: 69 | fasttext_model = gensim.models.word2vec.Word2Vec.load( 70 | "./embedding/fasttext.model") 71 | train_dataset = [] 72 | ylabel = [] 73 | for i in tqdm(range(len(train_data))): 74 | train_dict = {} 75 | train_dict['text'] = train_data.loc[i, 'text'] 76 | y_train[i][label2id[train_data.loc[i, 'label']]] = 1 77 | train_dict['label'] = y_train[i] 78 | ylabel.append(train_data.loc[i, 'label']) 79 | train_dataset.append(train_dict) 80 | test_dataset = [] 81 | for i in tqdm(range(len(test_data))): 82 | test_dict = {} 83 | test_dict['text'] = test_data.loc[i, 'text'] 84 | test_dict['label'] = -1 85 | test_dataset.append(test_dict) 86 | return test_data, train_dataset, test_dataset, w2v_model, fasttext_model, id2label, ylabel 87 | 88 | 89 | class DataSet(data.Dataset): 90 | def __init__(self, args, data, mode='train'): 91 | self.data = data 92 | self.mode = mode 93 | self.dataset = self.get_data(self.data, self.mode) 94 | 95 | def get_data(self, data, mode): 96 | dataset = [] 97 | global s 98 | for data_li in tqdm(data): 99 | text = data_li['text'].split(' ') 100 | text = [w2v_model.wv.key_to_index[s] + 101 | 1 if s in w2v_model.wv else 0 for s in text] 102 | if len(text) < args.MAX_LEN: 103 | text += [0] * (args.MAX_LEN - len(text)) 104 | else: 105 | text = text[:args.MAX_LEN] 106 | label = data_li['label'] 107 | dataset_dict = {'text': text, 'label': label} 108 | dataset.append(dataset_dict) 109 | return dataset 110 | 111 | def __len__(self): 112 | return len(self.dataset) 113 | 114 | def __getitem__(self, idx): 115 | data = self.dataset[idx] 116 | text = 
torch.tensor(data['text']) 117 | if self.mode == 'test': 118 | return text 119 | else: 120 | label = torch.tensor(data['label']) 121 | return text, label 122 | 123 | # 封装数据集 124 | def get_dataloader(args, dataset, mode): 125 | torchdata = DataSet(args, dataset, mode=mode) 126 | if mode == 'train': 127 | dataloader = torch.utils.data.DataLoader( 128 | torchdata, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=True) 129 | elif mode == 'test': 130 | dataloader = torch.utils.data.DataLoader( 131 | torchdata, batch_size=args.batch_size*2, shuffle=False, num_workers=0, drop_last=False) 132 | elif mode == 'valid': 133 | dataloader = torch.utils.data.DataLoader( 134 | torchdata, batch_size=args.batch_size*2, shuffle=False, num_workers=0, drop_last=True) 135 | return dataloader, torchdata 136 | 137 | 138 | loss_fun = nn.BCEWithLogitsLoss() 139 | def validation_funtion(model, valid_dataloader, valid_torchdata, mode='valid'): 140 | model.eval() 141 | pred_list = [] 142 | labels_list = [] 143 | if mode == 'valid': 144 | for i, (description, label) in enumerate(tqdm(valid_dataloader)): 145 | output = model(description.to(DEVICE)) 146 | pred_list.extend(output.sigmoid().detach().cpu().numpy()) 147 | labels_list.extend(label.detach().cpu().numpy()) 148 | labels_arr = np.array(labels_list) 149 | pred_arr = np.array(pred_list) 150 | labels = np.argmax(labels_arr, axis=1) 151 | pred = np.argmax(pred_arr, axis=1) 152 | auc = f1_score(labels, pred, average='macro') 153 | loss = loss_fun(torch.FloatTensor(labels_arr), 154 | torch.FloatTensor(pred_arr)) 155 | return auc, loss 156 | else: 157 | for i, (description) in enumerate(tqdm(valid_dataloader)): 158 | output = model(description.to(DEVICE)) 159 | pred_list += output.sigmoid().detach().cpu().numpy().tolist() 160 | return pred_list 161 | 162 | 163 | def train(args, fold, model, train_dataloader, valid_dataloader, valid_torchdata, model_num,early_stop=None): 164 | 165 | param_optimizer = list(model.named_parameters()) 166 | embed_pa = ['embedding.weight'] 167 | # 不训练embedding层 168 | optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in embed_pa)]}, 169 | {'params': model.embedding.parameters(), 'lr': 5e-5}] 170 | num_train_steps = int(len(train_dataloader) * args.epochs) 171 | if args.optimizer == "AdamW": 172 | optimizer = AdamW(optimizer_grouped_parameters,lr=args.lr, amsgrad=True, weight_decay=5e-4) 173 | elif args.optimizer == "lookahead": 174 | optimizer = AdamW(optimizer_grouped_parameters,lr=args.lr, eps=args.adam_epsilon) 175 | # optimizer = AdamW(optimizer_grouped_parameters,lr=args.lr, amsgrad=True, weight_decay=5e-4) 176 | optimizer = Lookahead(optimizer=optimizer, la_steps=5, la_alpha=0.6) 177 | if args.scheduler == "constant_schedule": 178 | scheduler = get_constant_schedule(optimizer) 179 | elif args.scheduler == "CosineAnnealingWarmRestarts": 180 | scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=1e-5, last_epoch=-1) 181 | elif args.scheduler == "CosineAnnealingLR": 182 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 183 | optimizer, T_max=5, eta_min=1e-5, last_epoch=-1) 184 | train_loss = [] 185 | best_val_loss = np.inf 186 | best_f1 = 0 187 | best_loss = np.inf 188 | no_improve = 0 189 | for epoch in range(args.epochs): 190 | model.train() 191 | if args.Model == "HAN" and epoch > 2: 192 | for param in model.named_parameters(): 193 | if param[0] == 'embedding.weight': 194 | param[1].requires_grad = True 195 | break 
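# for HAN, the block above re-enables gradients on embedding.weight once epoch > 2;
# the loop below runs one backward pass and one optimizer/scheduler step per mini-batch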
        bar = tqdm(train_dataloader)
        # Iterate over the training batches
        for i, (description, label) in enumerate(bar):
            optimizer.zero_grad()
            output = model(description.to(DEVICE), label.to(DEVICE))
            loss = output
            loss.backward()
            train_loss.append(loss.item())
            optimizer.step()
            scheduler.step()
        # Evaluate on the validation set once per epoch
        f1, val_loss = validation_function(model, valid_dataloader, valid_torchdata, 'valid')
        print('Epoch:[{}/{}] train_loss: {:.5f}, val_loss: {:.5f}, f1-score: {:.5f}\n'.format(
            epoch + 1, args.epochs, np.mean(train_loss), val_loss, f1))

        if early_stop:
            if f1 > best_f1:
                best_val_loss = val_loss
                best_f1 = f1
                best_loss = train_loss[-1]
                no_improve = 0
                torch.save(model.state_dict(), './saved/{}_model_{}.bin'.format(args.Model, fold))
            else:
                no_improve += 1
                if no_improve == early_stop:
                    break
        else:
            if epoch >= args.epochs - 1:
                # Save the weights of the final epoch
                torch.save(model.state_dict(), './saved/{}_model_{}.bin'.format(args.Model, fold))
    print('Fold:[{}/{}] best_trainloss: {:.5f}, best_valloss: {:.5f}, best_f1score: {:.5f}\n'.format(
        fold, args.FOLD, best_loss, best_val_loss, best_f1))
    return best_val_loss, best_f1, best_loss


def run(args, train_dataset, w2v_model, fasttext_model, ylabel):
    kf = StratifiedKFold(n_splits=args.FOLD, shuffle=True, random_state=args.SEED)
    # kf = MultilabelStratifiedKFold(n_splits=args.FOLD, shuffle=True, random_state=2021)
    val_loss = []
    best_f1 = []
    train_loss = []
    model_num = 1
    for i, (train_index, test_index) in enumerate(kf.split(np.arange(len(train_dataset)), ylabel)):
        model = MODELS[args.Model](args,
                                   w2v_model.wv.vectors.shape[0] + 1,
                                   w2v_model.wv.vectors.shape[1] + fasttext_model.wv.vectors.shape[1],
                                   embeddings=True)
        # init_network(model)
        model.to(DEVICE)
        print(str(i + 1), '-' * 50)
        tra = [train_dataset[index] for index in train_index]
        val = [train_dataset[index] for index in test_index]
        print(len(tra))
        print(len(val))
        train_dataloader, train_torchdata = get_dataloader(args, tra, mode='train')
        valid_dataloader, valid_torchdata = get_dataloader(args, val, mode='valid')
        valloss, f1, trainloss = train(args, i, model, train_dataloader,
                                       valid_dataloader,
                                       valid_torchdata,
                                       model_num,
                                       early_stop=args.early_stop)
        torch.cuda.empty_cache()
        val_loss.append(valloss)
        train_loss.append(trainloss)
        best_f1.append(f1)
    # Print the best f1 and losses of each fold
    for i in range(args.FOLD):
        print('- Fold {}: best valloss: {} best f1: {} best trainloss: {}'.format(
            i + 1, val_loss[i], best_f1[i], train_loss[i]))


# Generate the submission file
def get_submit(args, test_data, test_dataset, id2label):
    # w2v_model and fasttext_model are read from module scope (set in the __main__ block below)
    model = MODELS[args.Model](
        args, w2v_model.wv.vectors.shape[0] + 1,
        w2v_model.wv.vectors.shape[1] + fasttext_model.wv.vectors.shape[1], embeddings=True)
    model.to(DEVICE)
    test_preds_total = []
    test_dataloader, test_torchdata = get_dataloader(args, test_dataset, mode='test')
    for i in range(0, args.FOLD):
        model.load_state_dict(torch.load('./saved/{}_model_{}.bin'.format(args.Model, i)))
        test_pred_results = validation_function(
            model, test_dataloader, test_torchdata, 'test')
        test_preds_total.append(test_pred_results)
    # Average the fold probabilities, then take the argmax as the predicted class
    test_preds_merge = np.sum(test_preds_total, axis=0) / (args.FOLD)
    test_pre_tensor = torch.tensor(test_preds_merge)
    test_pre = torch.max(test_pre_tensor, 1)[1]

    pred_labels = [id2label[i] for i in test_pre]
    submit_file = "./submit/{}.csv".format(args.Model)
    pd.DataFrame({"id": test_data['id'], "label": pred_labels}).to_csv(submit_file, index=False)


def arg_setting():
    parser = argparse.ArgumentParser()
    parser.add_argument('--Model', default='TextRCNN', type=str, help="")
    # Available models: CapsuleNet, HAN, DPCNN, TextRCNNAttn, TextRCNN
    parser.add_argument('--MAX_LEN', default=100, type=int, help='max length of sentence')
    parser.add_argument('--batch_size', default=32, type=int, help='')
    parser.add_argument('--SEED', default=9797, type=int, help='')
    parser.add_argument('--FOLD', default=10, type=int, help="k fold")
    parser.add_argument('--epochs', default=40, type=int, help="")
    parser.add_argument('--early_stop', default=40, type=int, help="")
    parser.add_argument('--lr', default=1e-3, type=float, help="")
    parser.add_argument('--scheduler', default='CosineAnnealingWarmRestarts', type=str, help="")
    parser.add_argument('--optimizer', default='AdamW', type=str, help="")
    parser.add_argument('--adam_epsilon', default=1e-6, type=float, help="")

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # Parse command-line arguments
    args = arg_setting()
    # Fix the random seed
    basic_setting(args.SEED, DEVICE)
    # Data preprocessing
    test_data, train_dataset, test_dataset, w2v_model, fasttext_model, id2label, ylabel = data_process()
    # Train the k-fold models
    run(args, train_dataset, w2v_model, fasttext_model, ylabel)
    # Generate the submission file
    get_submit(args, test_data, test_dataset, id2label)

--------------------------------------------------------------------------------
/Traditional-DL/utils/__pycache__/DL_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Traditional-DL/utils/__pycache__/DL_model.cpython-36.pyc
--------------------------------------------------------------------------------
/Traditional-DL/utils/__pycache__/adversarial_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Traditional-DL/utils/__pycache__/adversarial_model.cpython-36.pyc
--------------------------------------------------------------------------------
/Traditional-DL/utils/__pycache__/init_net.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Traditional-DL/utils/__pycache__/init_net.cpython-36.pyc
--------------------------------------------------------------------------------
/Traditional-DL/utils/__pycache__/optimizer_lookahead.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Traditional-DL/utils/__pycache__/optimizer_lookahead.cpython-36.pyc
--------------------------------------------------------------------------------
/Traditional-DL/utils/__pycache__/spatial_dropout.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BetterBench/2021-Daguan-Cup/9a140b7c8cba20965628d9ac9d9a11e04afe5aa8/Traditional-DL/utils/__pycache__/spatial_dropout.cpython-36.pyc
--------------------------------------------------------------------------------
/Traditional-DL/utils/adversarial_model.py:
--------------------------------------------------------------------------------
import torch


class FGM(object):
    def __init__(self, model, emb_name, epsilon=1.0):
        # emb_name: the parameter name of the embedding layer in your model
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.backup = {}

    def attack(self, epsilon=1):
        # Back up the embedding weights and add an adversarial perturbation along the gradient
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm  # the perturbation (noise)
                    param.data.add_(r_at)

    def restore(self):
        # Restore the original embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}


class PGD(object):
    def __init__(self, model, emb_name, epsilon=1., alpha=0.3):
        # emb_name: the parameter name of the embedding layer in your model
        self.model = model
        self.emb_name = emb_name
        self.epsilon = epsilon
        self.alpha = alpha
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, is_first_attack=False):
        # One projected gradient step on the embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = self.alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data, self.epsilon)

    def restore(self):
        # Restore the original embedding weights
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.emb_backup
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, param_name, param_data, epsilon):
        # Project the perturbed weights back onto the epsilon-ball around the original weights
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > epsilon:
            r = epsilon * r / torch.norm(r)
        return self.emb_backup[param_name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]
--------------------------------------------------------------------------------
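For reference, a minimal sketch of how FGM is typically wired into a training step. This is not taken from this repository's training scripts; model, loss_fn, optimizer, the dataloader, and the emb_name='embedding.' prefix are all assumptions for illustration:

    fgm = FGM(model, emb_name='embedding.')   # emb_name must match your embedding parameter's name
    for text, label in train_dataloader:
        loss = loss_fn(model(text), label)
        loss.backward()                       # gradients on the clean batch
        fgm.attack()                          # perturb the embedding weights along the gradient
        loss_adv = loss_fn(model(text), label)
        loss_adv.backward()                   # accumulate gradients on the perturbed batch
        fgm.restore()                         # put the original embedding weights back
        optimizer.step()
        optimizer.zero_grad()

PGD follows the same pattern, except that attack() is called several times per batch (with is_first_attack=True on the first call) and backup_grad()/restore_grad() are used to preserve the gradients of the clean batch.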
/Traditional-DL/utils/init_net.py:
--------------------------------------------------------------------------------
import torch.nn as nn


def init_network(model, method='kaiming', exclude='embedding', seed=123):
    # Initialize all weights except the (pre-trained) embedding layer
    for name, w in model.named_parameters():
        if exclude not in name:
            if 'weight' in name and w.ndim > 1:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                nn.init.constant_(w, 0)
--------------------------------------------------------------------------------
/Traditional-DL/utils/optimizer_lookahead.py:
--------------------------------------------------------------------------------

from torch.optim.optimizer import Optimizer
from collections import defaultdict
import torch


class Lookahead(Optimizer):
    r"""PyTorch implementation of the lookahead wrapper.
    Lookahead Optimizer: https://arxiv.org/abs/1907.08610
    """

    def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"):
        """optimizer: inner optimizer
        la_steps (int): number of lookahead steps
        la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer.
        pullback_momentum (str): change to inner optimizer momentum on interpolation update
        """
        self.optimizer = optimizer
        self._la_step = 0  # counter for inner optimizer
        self.la_alpha = la_alpha
        self._total_la_steps = la_steps
        pullback_momentum = pullback_momentum.lower()
        assert pullback_momentum in ["reset", "pullback", "none"]
        self.pullback_momentum = pullback_momentum

        self.state = defaultdict(dict)

        # Cache the current optimizer parameters
        for group in optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = torch.zeros_like(p.data)
                param_state['cached_params'].copy_(p.data)
                if self.pullback_momentum == "pullback":
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'la_alpha': self.la_alpha,
            '_la_step': self._la_step,
            '_total_la_steps': self._total_la_steps,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self):
        self.optimizer.zero_grad()

    def get_la_step(self):
        return self._la_step

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = torch.zeros_like(p.data)
                param_state['backup_params'].copy_(p.data)
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = self.optimizer.step(closure)
        self._la_step += 1

        if self._la_step >= self._total_la_steps:
            self._la_step = 0
            # Lookahead step: interpolate towards the cached (slow) parameters, then update the cache
            for group in self.optimizer.param_groups:
                for p in group['params']:
                    param_state = self.state[p]
                    p.data.mul_(self.la_alpha).add_(
                        param_state['cached_params'], alpha=1.0 - self.la_alpha)  # crucial line
                    param_state['cached_params'].copy_(p.data)
                    if self.pullback_momentum == "pullback":
                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
                        self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.la_alpha).add_(
                            param_state["cached_mom"], alpha=1.0 - self.la_alpha)
                        param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"]
                    elif self.pullback_momentum == "reset":
                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss
--------------------------------------------------------------------------------
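The train() function in main_DL.py already wraps AdamW with this class when --optimizer lookahead is selected. As the _backup_and_load_cache docstring notes, validation can optionally be run on the slow (cached) weights; a minimal sketch of that pattern, where base_opt, model, and an evaluate() helper are assumed to exist elsewhere:

    optimizer = Lookahead(base_opt, la_steps=5, la_alpha=0.6)
    # ... run a training epoch, calling optimizer.step() as usual ...
    optimizer._backup_and_load_cache()    # temporarily swap in the slow (cached) weights
    val_score = evaluate(model)           # evaluate on the slow weights, which often generalize better
    optimizer._clear_and_load_backup()    # restore the fast weights before training continues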
/Traditional-DL/utils/spatial_dropout.py:
--------------------------------------------------------------------------------
from itertools import repeat
import torch.nn as nn


class SpatialDropout(nn.Module):
    def __init__(self, drop_prob):
        super(SpatialDropout, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, inputs):
        output = inputs.clone()
        if not self.training or self.drop_prob == 0:
            return inputs
        else:
            noise = self._make_noise(inputs)
            if self.drop_prob == 1:
                noise.fill_(0)
            else:
                noise.bernoulli_(1 - self.drop_prob).div_(1 - self.drop_prob)
            noise = noise.expand_as(inputs)
            output.mul_(noise)
        return output

    @staticmethod
    def _make_noise(inputs):
        # One noise value per (batch, channel) position, shared across the middle (sequence) dimensions
        return inputs.new().resize_(inputs.size(0), *repeat(1, inputs.dim() - 2), inputs.size(2))
--------------------------------------------------------------------------------
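SpatialDropout zeroes whole embedding channels (shared across the sequence dimension) rather than independent elements, so it is typically applied right after the embedding lookup. A short illustrative sketch; the layer sizes and the embedding/LSTM stack below are made up for the example, not taken from this repository's DL_model.py:

    import torch
    import torch.nn as nn

    embedding = nn.Embedding(num_embeddings=5000, embedding_dim=128)
    spatial_dropout = SpatialDropout(drop_prob=0.2)
    lstm = nn.LSTM(input_size=128, hidden_size=64, batch_first=True)

    tokens = torch.randint(0, 5000, (8, 100))   # (batch, seq_len)
    x = embedding(tokens)                       # (batch, seq_len, embed_dim)
    x = spatial_dropout(x)                      # drops entire embedding channels per sample
    out, _ = lstm(x)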