├── CRF分词 └── main.py ├── LSTM_GRU ├── .DS_Store ├── Picture │ ├── .DS_Store │ ├── GRU.JPG │ ├── GRU2.JPG │ ├── LSTM.JPG │ ├── LSTM2.JPG │ ├── accuracy.png │ ├── loss.png │ └── rnn_architecture.png ├── README.md ├── data │ └── data_clean.py └── demo │ ├── __pycache__ │ ├── process_data.cpython-36.pyc │ └── rnn_model.cpython-36.pyc │ ├── process_data.py │ ├── rnn_model.py │ └── rnn_run.py ├── README.md ├── Text_CNN ├── .idea │ ├── Text_CNN.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── Result │ ├── 分类结果.jpg │ ├── 模型.png │ └── 流程.jpg ├── process_data.py ├── text_cnn_main.py └── text_cnn_model.py └── picture ├── README-1cd4ff0f.png ├── README-282eca2f.png ├── README-4fcc65db.png ├── README-61a8bed9.png ├── README-7ea1b04c.png ├── README-857a805b.png ├── README-85cdfcb9.png ├── README-85ffa053.png ├── README-8f6b1559.png ├── README-95176db7.png ├── README-d1c2b10b.png ├── README-f68d8b8d.png └── 分词数据集.png /CRF分词/main.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[81]: 5 | 6 | 7 | import re 8 | import numpy as np 9 | from 10 | 11 | 12 | # In[89]: 13 | 14 | 15 | sents = open(r'H:\分词数据\training\pku_training.utf8',encoding='utf-8').read() 16 | sents = sents.strip() 17 | sents = sents.split('\n') # 这个语料的换行符是\r\n 18 | 19 | 20 | # In[105]: 21 | 22 | 23 | sents = [re.split(' +', s) for s in sents] # 词之间以空格隔开 24 | sents = [[w for w in s if w] for s in sents] # 去掉空字符串 25 | np.random.shuffle(sents) # 打乱语料,以便后面划分验证集 26 | 27 | 28 | # In[91]: 29 | 30 | 31 | chars = {} # 统计字表 32 | for s in sents: 33 | for c in ''.join(s): 34 | if c in chars: 35 | chars[c] += 1 36 | else: 37 | chars[c] = 1 38 | 39 | min_count = 2 # 过滤低频字 40 | chars = {i:j for i,j in chars.items() if j >= min_count} # 过滤低频字 低频字的id是0 41 | id2char = {i+1:j for i,j in enumerate(chars)} # id到字的映射 42 | char2id = {j:i for i,j in id2char.items()} # 字到id的映射 43 | 44 | id2tag = {0:'s', 1:'b', 2:'m', 3:'e'} # 标签(sbme)与id之间的映射 45 | tag2id = {j:i for i,j in id2tag.items()} 46 | 47 | train_sents = sents[:-5000] # 留下5000个句子做验证,剩下的都用来训练 48 | valid_sents = sents[-5000:] 49 | 50 | 51 | # In[97]: 52 | 53 | 54 | batch_size = 128 55 | 56 | 57 | # In[98]: 58 | 59 | 60 | train_sents[0] 61 | 62 | 63 | # In[123]: 64 | 65 | 66 | def train_generator(): #定义数据生成器 67 | X, Y = [], [] 68 | while True: 69 | for i,text in enumerate(train_sents): 70 | sx,sy = [], [] 71 | for s in text: 72 | sx.extend([char2id.get(c,0) for c in s]) 73 | if len(s) == 1: 74 | sy.append(0) 75 | elif len(s) == 2: 76 | sy.extend([1,3]) 77 | else: 78 | sy.extend([1] + [2]*(len(s) - 2) + [3]) 79 | X.append(sx) 80 | Y.append(sy) 81 | if len(X) == batch_size or i == len(train_sents)-1: 82 | maxlen = max([len(t) for t in X]) 83 | X = [x+[4]*(maxlen-len(x)) for x in X] 84 | Y = [y+[4]*(maxlen-len(y)) for y in Y] 85 | yield np.array(X), to_categorical(Y, 5) 86 | X, Y = [], [] 87 | 88 | -------------------------------------------------------------------------------- /LSTM_GRU/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/.DS_Store -------------------------------------------------------------------------------- /LSTM_GRU/Picture/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/.DS_Store 
-------------------------------------------------------------------------------- /LSTM_GRU/Picture/GRU.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/GRU.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/GRU2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/GRU2.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/LSTM.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/LSTM.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/LSTM2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/LSTM2.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/accuracy.png -------------------------------------------------------------------------------- /LSTM_GRU/Picture/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/loss.png -------------------------------------------------------------------------------- /LSTM_GRU/Picture/rnn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/rnn_architecture.png -------------------------------------------------------------------------------- /LSTM_GRU/README.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 利用LSTM做文本分类 3 | 4 | ## Usage 5 | 6 | ### 1. 数据预处理 7 | 在data文件中,先使用`data_clean.py`对文本数据进行预处理,包括步骤如下: 8 | #### 1.1 原数据数据清洗 9 | 对所给文本文件进行去停用词、去异常文本、去超长文本操作 10 | 11 | #### 1.2 训练词向量 12 | 对将进行的文本信息(banner.txt)利用word2vec模型训练词向量 13 | ``` 14 | word2vec -train banner.txt -output vec1.bin -cbow 0 -hs 1 -threads 12 -binary 1 15 | ``` 16 | 17 | #### 1.3 词向量表示 18 | 对原文本中所有词建立词向量字典,未登录词采用正态分布随机表示. 
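A minimal sketch of this lookup step (the helper name `build_word_vectors` is illustrative; in `data_clean.py` the same idea is implemented by `load_binary_vec` plus `add_unexist_word_vec`, which draws OOV vectors from a small uniform range):
```python
import numpy as np

def build_word_vectors(vocab, pretrained, dim=100):
    """Map every corpus word to a vector; unseen (OOV) words get a random one."""
    word_vecs = {}
    for word in vocab:
        if word in pretrained:                      # found in the word2vec table (vec1.bin)
            word_vecs[word] = pretrained[word]
        else:                                       # OOV word: random initialization
            word_vecs[word] = np.random.uniform(-0.1, 0.1, dim)
    return word_vecs
```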
19 | 20 | --- 21 | 22 | 最后处理的格式信息如下: 23 | ``` 24 | df, word_vecs, word_cab_num, sentence_max_len, class_num 25 | ``` 26 | `df`:句子字典列表。其中包括句子的text、分类、split等辅助信息 27 | ``` 28 | { 29 | "label": #标签 30 | "num_words":int #句子长度 31 | "text":str #句子 32 | "split":[0,10] #十折交叉使用 33 | } 34 | ``` 35 | `word_vecs`:文本中所有词的词向量表示 36 | `word_cab_num`:文本中共有多少不同的词汇 37 | `sentence_max_len`:句子的最大长度 38 | `class_num`:多分类问题分几类 39 | 40 | 41 | ### 2.模型超参 42 | 模型参数在`rnn_model.py`进行相关的设置。其中需要修改的包括: 43 | ```python 44 | class TRNNConfig(object): 45 | self.embedding_dim = 100 # 词向量维度 46 | self.num_layers= 2 # 隐藏层层数 47 | self.hidden_dim = 128 # 隐藏层神经元 48 | self.rnn = 'lstm' # lstm 或 gru 49 | 50 | self.dropout_keep_prob = 0.8 # dropout保留比例 51 | self.learning_rate = 1e-3 # 学习率 52 | 53 | self.batch_size = 128 # 每批训练大小 54 | self.num_epochs = 10 # 总迭代轮次 55 | ``` 56 | 启动参数包括`rnn_run.py`的一些路径等配置信息 57 | ``` 58 | train_data = "../data/word_vec.p" #配置数据清洗后生成的数据路径 59 | label = "brand" #1中所述df的类别标签名 60 | ``` 61 | 62 | ### 3.运行 63 | ```python 64 | rnn_run.py train #训练&验证 65 | rnn_run.py test #测试 66 | ``` 67 | 68 | ## 模型介绍 69 | ### 1.LSTM 70 | lstm作为加入了attention机制的rnn网络,对长文本具有很好的记忆效果,其主要归功于模型结构。 71 | ![模型](./Picture/LSTM2.JPG) 72 | 73 | 以下是一个lstm单元的结构(**一个lstm单元也就是网络中的一层,即由上述num_layers控制**) 74 | ![模型](./Picture/LSTM.JPG) 75 | 其中输出即是一个`hidden_dim`的向量,以上两个参数控制lstm最核心的网络架构。 76 | 77 | ### 2.GRU 78 | gru可以说是lstm的初代版本,一个GRU单元如下所示 79 | ![模型](./Picture/GRU.JPG) 80 | 81 | ### 3.整体模型结构 82 | ![模型](./Picture/rnn_architecture.png) 83 | 84 | ## 实验结果 85 | 本次实验是帮师兄做了的一个关于设备识别分类的工作。从50W条设备banner信息中对设备品牌和型号进行识别。 86 | 因为数据相对规整,用lstm处理得到的效果也非常好,正确率能达到99% 87 | ![模型](./Picture/accuracy.png) 88 | 89 | ![模型](./Picture/loss.png) 90 | 91 | 92 | 93 | ## LSTM和GRU的区别 94 | 先给出一些结论: 95 | - GRU和LSTM的性能在很多任务上不分伯仲。 96 | - GRU 参数更少因此更容易收敛,但是数据集很大的情况下,LSTM表达性能更好。 97 | - 从结构上来说,GRU只有两个门(update和reset),LSTM有三个门(forget,input,output),GRU直接将hidden state 传给下一个单元,而LSTM则用memory cell 把hidden state 包装起来。 98 | -------------------------------------------------------------------------------- /LSTM_GRU/data/data_clean.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[10]: 5 | 6 | 7 | import pickle 8 | import numpy as np 9 | from collections import defaultdict,OrderedDict 10 | import re 11 | from tqdm import tqdm 12 | import pandas as pd 13 | from bitarray import bitarray 14 | 15 | 16 | # In[185]: 17 | 18 | 19 | def clean_string(string,TREC=False): 20 | string = re.sub(r"[^A-Za-z0-9,!?.]", " ", string) 21 | string = re.sub(r",", " ", string) 22 | string = re.sub(r"!", " ", string) 23 | string = re.sub(r"\(", " ", string) 24 | string = re.sub(r"\)", " ", string) 25 | string = re.sub(r"\?", " ", string) 26 | string = re.sub(r"(?<=\s)\w(?=\s)", " ", string) 27 | string = re.sub(r"\s{2,}", " ", string) 28 | return string.strip() if TREC else string.strip().lower() 29 | 30 | 31 | # In[245]: 32 | 33 | 34 | def load_data_k_cv(folder,cv=10,miniData = True): 35 | """struct : text 36 | device 37 | brand 38 | model 39 | split 40 | word_cab : 词频字典 41 | 42 | """ 43 | word_cab=defaultdict(int) 44 | df = [] 45 | num = 0 46 | with open(folder,'rb') as f: 47 | for line in tqdm(f): 48 | line = line.decode(encoding='ISO-8859-1') 49 | row = list(map(lambda x : x.strip(),line.strip().split("|")))[1:] 50 | if not (5 <= len(row) <= 6) : 51 | continue 52 | row = row[:3] + row[3].split(",") + row[4:] 53 | if len(row) != 6: 54 | continue 55 | row = list(map(lambda x : clean_string(x), row)) 56 | row.append(np.random.randint(0, cv)) 57 | 
df.append({"text":str(row[5]) +" "+ row[0]+" " + row[1],"device":row[2],"brand":row[3],"model":row[4],"split":row[6]}) 58 | num += 1 59 | if miniData and num == 10000: 60 | break 61 | 62 | word_cab = defaultdict(int) 63 | sentence_max_len = 0 64 | final_df = [] 65 | 66 | print("cleaning data") 67 | for struct in tqdm(df): 68 | length = len(struct["text"].split()) 69 | if length <= 200: 70 | struct["text"] = clean_string(struct["text"]) 71 | sentence_max_len = max(sentence_max_len, len(struct["text"].split())) 72 | final_df.append(struct) 73 | for word in struct["text"].split(): 74 | word_cab[word] += 1 75 | print("cleaning data finish!") 76 | return final_df, word_cab, sentence_max_len 77 | 78 | 79 | # In[246]: 80 | 81 | 82 | def load_binary_vec(fname, vocab): 83 | word_vecs = {} 84 | with open(fname, 'rb') as fin: 85 | header = fin.readline() 86 | vocab_size, vector_size = list(map(int, header.split())) 87 | binary_len = np.dtype(np.float32).itemsize * vector_size 88 | # vectors = [] 89 | for i in tqdm(range(vocab_size)): 90 | # read word 91 | word = b'' 92 | while True: 93 | ch = fin.read(1) 94 | if ch == b' ': 95 | break 96 | word += ch 97 | # print(str(word)) 98 | word = word.decode(encoding='ISO-8859-1') 99 | if word in vocab: 100 | word_vecs[word] = np.fromstring(fin.read(binary_len), dtype=np.float32) 101 | else: 102 | fin.read(binary_len) 103 | fin.read(1) # newline 104 | return word_vecs 105 | 106 | 107 | # In[247]: 108 | 109 | 110 | def add_unexist_word_vec(word_vecs, word_cab): 111 | for word in tqdm(set(word_cab.keys() -word_vecs.keys())): 112 | word_vecs[word] = np.random.uniform(-0.1,0.1,100) 113 | 114 | 115 | # In[248]: 116 | 117 | 118 | data_folder = r"all.txt" 119 | w2v_file = r'vec1.bin' 120 | 121 | 122 | # In[265]: 123 | 124 | 125 | print("load text") 126 | df, word_cab, sentence_max_len = load_data_k_cv(data_folder, 10, False) 127 | print("finish text load !!!") 128 | 129 | 130 | # In[266]: 131 | 132 | 133 | brandCount = defaultdict(int) 134 | for struct in df: 135 | brandCount[(struct['brand'])] += 1 136 | 137 | 138 | # In[267]: 139 | 140 | 141 | usefulBrand = set() 142 | for k, v in brandCount.items(): 143 | if v > 50: 144 | usefulBrand.add(k) 145 | 146 | 147 | # In[268]: 148 | 149 | 150 | for i in range(len(df)-1,-1,-1): 151 | if df[i]['brand'] not in usefulBrand: 152 | df.pop(i) 153 | 154 | 155 | # In[271]: 156 | 157 | 158 | len(df) 159 | 160 | 161 | # In[282]: 162 | 163 | 164 | with open("banner.txt","wb") as f: 165 | for struct in df: 166 | f.write(bytes(struct['text']+'\n', encoding="utf8")) 167 | 168 | 169 | 170 | # In[283]: 171 | 172 | 173 | print("load word2vec") 174 | word_vecs = load_binary_vec(w2v_file, word_cab) 175 | print("finish word2vec load !!!") 176 | 177 | 178 | # In[285]: 179 | 180 | 181 | add_unexist_word_vec(word_vecs,word_cab) 182 | 183 | 184 | # In[286]: 185 | 186 | 187 | len(word_vecs) 188 | 189 | 190 | # In[287]: 191 | 192 | 193 | pickle.dump([df,word_vecs,word_cab,sentence_max_len],open(r'word_vec.p','wb')) 194 | 195 | 196 | -------------------------------------------------------------------------------- /LSTM_GRU/demo/__pycache__/process_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/demo/__pycache__/process_data.cpython-36.pyc -------------------------------------------------------------------------------- /LSTM_GRU/demo/__pycache__/rnn_model.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/demo/__pycache__/rnn_model.cpython-36.pyc -------------------------------------------------------------------------------- /LSTM_GRU/demo/process_data.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import pandas as pd 3 | import numpy as np 4 | import tensorflow.contrib.keras as kr 5 | 6 | def getWordsVect(config, W): 7 | word_ids = defaultdict(int) 8 | W_list = [] 9 | W_list.append([0.0] * config.embedding_dim) 10 | count = 1 11 | for word,vector in W.items(): 12 | W_list.append(vector.tolist()) 13 | word_ids[word] = count 14 | count = count + 1 15 | return word_ids,W_list 16 | 17 | 18 | def get_train_test_data(word_ids, data_set_df, label, sentence_max_len, cv_id=9): 19 | """将句子转换为id表示""" 20 | s = set() 21 | for struct in data_set_df: 22 | s.add(struct[label]) 23 | cat = sorted(list(s)) 24 | cat_to_id = dict(zip(cat, range(len(cat)))) 25 | 26 | data_id, label_id = [], [] 27 | for i in range(len(data_set_df)): 28 | data_id.append([word_ids[x] for x in data_set_df[i]['text'] if x in word_ids]) 29 | label_id.append(cat_to_id[data_set_df[i][label]]) 30 | 31 | # 使用keras提供的pad_sequences来将文本pad为固定长度 32 | 33 | x_pad = kr.preprocessing.sequence.pad_sequences(data_id, sentence_max_len, padding="pre") 34 | y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id)) # 将标签转换为one-hot表示 35 | 36 | train_index, test_index = [], [] 37 | if cv_id >= 0: 38 | for x in range(len(data_set_df)): 39 | if int(data_set_df[x]["split"]) < cv_id: 40 | train_index.append(x) 41 | else: 42 | test_index.append(x) 43 | 44 | print("************") 45 | print("train_Num",len(train_index)) 46 | print("test_Num", len(test_index)) 47 | return x_pad[train_index], y_pad[train_index], x_pad[test_index], y_pad[test_index] 48 | else: 49 | return x_pad, y_pad, cat -------------------------------------------------------------------------------- /LSTM_GRU/demo/rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | import pandas as pd 6 | class TRNNConfig(object): 7 | """RNN配置参数""" 8 | 9 | def __init__(self, sentence_max_len, class_num, vocab_size): 10 | # 模型参数 11 | self.embedding_dim = 100 # 词向量维度 12 | self.num_classes = class_num # 类别数 13 | self.vocab_size = vocab_size # 词汇表达小 14 | self.sentence_max_len = sentence_max_len 15 | 16 | self.num_layers= 2 # 隐藏层层数 17 | self.hidden_dim = 128 # 隐藏层神经元 18 | self.rnn = 'lstm' # lstm 或 gru 19 | 20 | self.dropout_keep_prob = 0.8 # dropout保留比例 21 | self.learning_rate = 1e-3 # 学习率 22 | 23 | self.batch_size = 128 # 每批训练大小 24 | self.num_epochs = 10 # 总迭代轮次 25 | 26 | self.print_per_batch = 100 # 每多少轮输出一次结果 27 | self.save_per_batch = 100 # 每多少轮存入tensorboard 28 | 29 | 30 | class TextRNN(object): 31 | """文本分类,RNN模型""" 32 | def __init__(self, config, W_list, trainWordVec = False): 33 | self.config = config 34 | self.W_list = W_list 35 | self.trainWordVec = trainWordVec 36 | # 三个待输入的数据 37 | self.input_x = tf.placeholder(tf.int32, [None, self.config.sentence_max_len], name='input_x') 38 | self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y') 39 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') 40 | 41 | self.rnn() 42 | 43 | def rnn(self): 44 | """rnn模型""" 45 | 
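        # Overview of the graph built below:
        #   1. lstm_cell()/gru_cell() create one recurrent cell per layer (selected by
        #      config.rnn), and dropout() wraps each cell in a DropoutWrapper.
        #   2. The embedding layer is either a trainable variable or the fixed,
        #      pre-trained matrix W_list (trainable=False), looked up on input_x.
        #   3. num_layers wrapped cells are stacked with MultiRNNCell and unrolled by
        #      dynamic_rnn; only the last time step's output feeds the classifier.
        #   4. A dense layer (+ dropout + relu) and a softmax output layer produce the
        #      logits; Adam minimizes the cross-entropy loss and accuracy is tracked.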
46 | def lstm_cell(): # lstm核 47 | return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True) 48 | 49 | def gru_cell(): # gru核 50 | return tf.contrib.rnn.GRUCell(self.config.hidden_dim) 51 | 52 | def dropout(): # 为每一个rnn核后面加一个dropout层 53 | if (self.config.rnn == 'lstm'): 54 | cell = lstm_cell() 55 | else: 56 | cell = gru_cell() 57 | return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 58 | 59 | # 词向量映射 60 | with tf.device('/cpu:0'): 61 | if self.trainWordVec: 62 | embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim]) 63 | else: 64 | 65 | embedding = tf.Variable(initial_value=self.W_list, dtype=tf.float32, trainable=False, name='embedding_layer_W') 66 | embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x) 67 | 68 | with tf.name_scope("rnn"): 69 | # 多层rnn网络 70 | cells = [dropout() for _ in range(self.config.num_layers)] 71 | rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True) 72 | 73 | _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32) 74 | last = _outputs[:, -1, :] # 取最后一个时序输出作为结果 75 | 76 | with tf.name_scope("score"): 77 | # 全连接层,后面接dropout以及relu激活 78 | fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1') 79 | fc = tf.contrib.layers.dropout(fc, self.keep_prob) 80 | fc = tf.nn.relu(fc) 81 | 82 | # 分类器 83 | self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2') 84 | self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别 85 | 86 | with tf.name_scope("optimize"): 87 | # 损失函数,交叉熵 88 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 89 | self.loss = tf.reduce_mean(cross_entropy) 90 | # 优化器 91 | self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 92 | 93 | with tf.name_scope("accuracy"): 94 | # 准确率 95 | correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls) 96 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 97 | -------------------------------------------------------------------------------- /LSTM_GRU/demo/rnn_run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import time 8 | from datetime import timedelta 9 | import pickle 10 | import numpy as np 11 | import tensorflow as tf 12 | from sklearn import metrics 13 | from tqdm import tqdm 14 | from rnn_model import TRNNConfig, TextRNN 15 | import process_data 16 | 17 | tensorboard_dir = '../model/tensorboard/textrnn' # 可视化路径 18 | save_dir = '../model/checkpoints/textrnn' 19 | save_path = os.path.join(save_dir, 'best_validation') # 最佳验证结果保存路径 20 | 21 | train_data = "../data/word_vec.p" 22 | label = "brand" 23 | 24 | def get_time_dif(start_time): 25 | """获取已使用时间""" 26 | end_time = time.time() 27 | time_dif = end_time - start_time 28 | return timedelta(seconds=int(round(time_dif))) 29 | 30 | 31 | def feed_data(model, x_batch, y_batch, keep_prob): 32 | feed_dict = { 33 | model.input_x: x_batch, 34 | model.input_y: y_batch, 35 | model.keep_prob: keep_prob 36 | } 37 | return feed_dict 38 | 39 | def batch_iter(x, y, batch_size=128): 40 | """生成批次数据""" 41 | data_len = len(x) 42 | num_batch = int((data_len - 1) / batch_size) + 1 43 | 44 | indices = np.random.permutation(np.arange(data_len)) 45 | x_shuffle = x[indices] #乱序 46 | y_shuffle = y[indices] 47 | 48 | for i in range(num_batch): 49 | start_id = i * 
batch_size 50 | end_id = min((i + 1) * batch_size, data_len) 51 | yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id] 52 | 53 | 54 | def evaluate(model, sess, x_, y_): 55 | """评估在某一数据上的准确率和损失""" 56 | data_len = len(x_) 57 | batch_eval = batch_iter(x_, y_, 128) 58 | total_loss = 0.0 59 | total_acc = 0.0 60 | for x_batch, y_batch in batch_eval: 61 | batch_len = len(x_batch) 62 | feed_dict = feed_data(model, x_batch, y_batch, 1.0) 63 | loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict) 64 | total_loss += loss * batch_len 65 | total_acc += acc * batch_len 66 | 67 | return total_loss / data_len, total_acc / data_len 68 | 69 | 70 | def train(): 71 | print("Configuring TensorBoard and Saver...") 72 | # 配置 Tensorboard,重新训练时,请将tensorboard文件夹删除,不然图会覆盖 73 | 74 | if not os.path.exists(tensorboard_dir): 75 | os.makedirs(tensorboard_dir) 76 | 77 | tf.summary.scalar("loss", model.loss) 78 | tf.summary.scalar("accuracy", model.acc) 79 | merged_summary = tf.summary.merge_all() 80 | writer = tf.summary.FileWriter(tensorboard_dir) 81 | 82 | # 配置 Saver 83 | saver = tf.train.Saver() 84 | if not os.path.exists(save_dir): 85 | os.makedirs(save_dir) 86 | 87 | print("Loading training and validation data...") 88 | # 载入训练集与验证集 89 | start_time = time.time() 90 | x_train, y_train, x_val, y_val = process_data.get_train_test_data(word_ids, df, label, sentence_max_len, 9) 91 | 92 | time_dif = get_time_dif(start_time) 93 | print("Time usage:", time_dif) 94 | 95 | # 创建session 96 | session = tf.Session() 97 | session.run(tf.global_variables_initializer()) 98 | writer.add_graph(session.graph) 99 | 100 | print('Training and evaluating...') 101 | start_time = time.time() 102 | total_batch = 0 # 总批次 103 | best_acc_val = 0.0 # 最佳验证集准确率 104 | last_improved = 0 # 记录上一次提升批次 105 | require_improvement = 1000 # 如果超过1000轮未提升,提前结束训练 106 | 107 | flag = False 108 | for epoch in range(config.num_epochs): 109 | print('Epoch:', epoch + 1) 110 | batch_train = batch_iter(x_train, y_train, config.batch_size) 111 | for x_batch, y_batch in tqdm(batch_train): 112 | feed_dict = feed_data(model, x_batch, y_batch, config.dropout_keep_prob) 113 | if total_batch % config.save_per_batch == 0: 114 | # 每多少轮次将训练结果写入tensorboard scalar 115 | s = session.run(merged_summary, feed_dict=feed_dict) 116 | writer.add_summary(s, total_batch) 117 | 118 | if total_batch % config.print_per_batch == 0: 119 | # 每多少轮次输出在训练集和验证集上的性能 120 | feed_dict[model.keep_prob] = 1.0 121 | loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict) 122 | loss_val, acc_val = evaluate(model, session, x_val, y_val) # todo 123 | 124 | if acc_val > best_acc_val: 125 | # 保存最好结果 126 | best_acc_val = acc_val 127 | last_improved = total_batch 128 | saver.save(sess=session, save_path=save_path) 129 | improved_str = '*' 130 | else: 131 | improved_str = '' 132 | 133 | time_dif = get_time_dif(start_time) 134 | msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \ 135 | + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}' 136 | print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str)) 137 | 138 | session.run(model.optim, feed_dict=feed_dict) # 运行优化 139 | total_batch += 1 140 | 141 | if total_batch - last_improved > require_improvement: 142 | # 验证集正确率长期不提升,提前结束训练 143 | print("No optimization for a long time, auto-stopping...") 144 | flag = True 145 | break # 跳出循环 146 | if flag: # 同上 147 | break 148 | session.close() 149 | 150 | 151 | def test(): 152 | print("Loading test data...") 153 | 
start_time = time.time() 154 | 155 | x_test, y_test , categories = process_data.get_train_test_data(word_ids, df, label, sentence_max_len, -1) 156 | 157 | 158 | session = tf.Session() 159 | session.run(tf.global_variables_initializer()) 160 | saver = tf.train.Saver() 161 | saver.restore(sess=session, save_path=save_path) # 读取保存的模型 162 | 163 | print('Testing...') 164 | loss_test, acc_test = evaluate(session, x_test, y_test) 165 | msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}' 166 | print(msg.format(loss_test, acc_test)) 167 | 168 | batch_size = 128 169 | data_len = len(x_test) 170 | num_batch = int((data_len - 1) / batch_size) + 1 171 | 172 | y_test_cls = np.argmax(y_test, 1) 173 | y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32) # 保存预测结果 174 | for i in range(num_batch): # 逐批次处理 175 | start_id = i * batch_size 176 | end_id = min((i + 1) * batch_size, data_len) 177 | feed_dict = { 178 | model.input_x: x_test[start_id:end_id], 179 | model.keep_prob: 1.0 180 | } 181 | y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict) 182 | 183 | # 评估 184 | print("Precision, Recall and F1-Score...") 185 | print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) 186 | 187 | # 混淆矩阵 188 | print("Confusion Matrix...") 189 | cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) 190 | print(cm) 191 | 192 | time_dif = get_time_dif(start_time) 193 | print("Time usage:", time_dif) 194 | 195 | 196 | if __name__ == '__main__': 197 | if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: 198 | raise ValueError("""usage: python run_rnn.py [train / test]""") 199 | 200 | print('Configuring RNN model...') 201 | 202 | print('load data. . .') 203 | X = pickle.load(open(train_data, 'rb')) 204 | df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4] 205 | 206 | config = TRNNConfig(sentence_max_len, class_num, word_cab_num) 207 | 208 | word_ids, W_list = process_data.getWordsVect(config, word_vecs) 209 | 210 | model = TextRNN(config, W_list, False) #默认不训练词向量 211 | 212 | 213 | if sys.argv[1] == 'train': 214 | train() 215 | else: 216 | test() 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Model 2 | Learn and demonstrate some classical model 3 | 4 | 5 | 6 | ## 目录 7 | 8 | * [Text-CNN](#text-cnn) 9 | * [LSTM&GRU](#lstm) 10 | * [HAN](#HAN) 11 | * [Glove](#glove) 12 | 13 | 14 | ## Text-CNN 15 | ### 1. 模型展示 16 | ![模型](./Text_CNN/Result/模型.png) 17 | 18 | ### 2. 参数与超参数 19 | 20 | **sequence_length** 21 | Q: 对于CNN, 输入与输出都是固定的,可每个句子长短不一, 怎么处理? 22 | A: 需要做定长处理, 比如定为n, 超过的截断, 不足的补0. 注意补充的0对后面的结果没有影响,因为后面的max-pooling只会输出最大值,补零的项会被过滤掉. 23 | 24 | **num_classes** 25 | 多分类, 分为几类. 26 | 27 | **vocabulary_size** 28 | 语料库的词典大小, 记为|D|. 29 | 30 | 31 | **embedding_size** 32 | 将词向量的维度, 由原始的 |D| 降维到 embedding_size. 33 | 34 | 35 | **filter_size_arr** 36 | 多个不同size的filter. 37 | 38 | 39 | ### 3. demo流程 40 | ```C 41 | str_length = 36 42 | word_vec = 128 43 | filter_size = [2,3,4] 每种尺寸2个filter 44 | ``` 45 | 46 | ![流程](./Text_CNN/Result/流程.jpg) 47 | 48 | ### 3.实验部分 49 | #### 1 数据集介绍 50 | 1.1 实验的过程中只使用了[MR数据集](https://www.cs.cornell.edu/people/pabo/movie-review-data/),验证方式是10 folds的交叉验证方式。 51 | > 数据集中包含了5331 positive and 5331 negative processed sentences / snippets. Introduced in Pang/Lee ACL 2005. Released July 2005. 
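A minimal sketch of how one round of this 10-fold evaluation can be carved out of the preprocessed sentence list (`revs` and `fold_id` are illustrative names; the `split` field, a random fold index in [0, 10), is described later under process_data.py):
```python
def split_by_fold(revs, fold_id):
    """Hold out the sentences whose split equals fold_id; train on the rest."""
    train = [r for r in revs if r["split"] != fold_id]
    test = [r for r in revs if r["split"] == fold_id]
    return train, test

# Repeating this for fold_id = 0..9 and averaging the test accuracy gives the
# 10-fold cross-validation result.
```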
52 | 53 | 2.1 词向量包含以下三种(**可以任意选一种或多种累加当作一个词不同的channel**): 54 | + **CNN-rand**:句子中的的word vector都是随机初始化的,同时当做CNN训练过程中需要优化的参数; 55 | + **CNN-static**:句子中的word vector是使用word2vec预先对Google News dataset(about 100 billion words)进行训练好的词向量表中的词向量。且在CNN训练过程中作为固定的输入,不作为优化的参数; 56 | + **CNN-non-static**:句子中的word vector是使用word2vec预先对Google News dataset(about 100 billion words)进行训练好的词向量表中的词向量。在CNN训练过程中作为固定的输入,做为CNN训练过程中**需要优化**的参数; 57 | 58 | 说明: 59 | 60 | > + GoogleNews-vectors-negative300.bin.gz词向量表是通过word2vec使用命令预先训练好,花费时间较长。 61 | 已经训练好的:[GoogleNews-vectors-negative300.bin.gz百度云盘下载地址](https://pan.baidu.com/share/init?surl=OglaQBBO30d5KdzZNNdRSg) 密码:18yf 62 | > + word2vec预先训练命令如:```./word2vec -train text8(语料) -output vectors.bin(输出词向量表) -cbow(训练使用模型方式) 0 -size 48 -window 5 -negative 0 -hs 1 -sample 1e-4 -threads 20 -binary 1 -iter 100``` 63 | > + 除了使用word2vec对语料库进行预先训练外,也可以使用glove或FastText进行词向量训练。 64 | 65 | 66 | #### 2.文件介绍 67 | 68 | 2.1 **process\_data.py**:加载Google训练的词向量表GoogleNews-vectors-negative300.bin,并对文本数据做一些预处理,使其转化为NN易用的形式,并将其存储在文件中。 69 | 最终存储为一个word\_vec.p,其文件存储的内容是[**随机词向量表,已训练好的词向量表, 词频字典, 最大句子长度, revs**]; 70 | 其中revs是一个结构体列表,列表中的每个元素如下所示: 71 | ``` 72 | { 73 | "y":0/1 #标签 74 | "num_words":int #句子长度 75 | "text":str #句子 76 | "split":[0,10] #十折交叉使用 77 | } 78 | ``` 79 | 2.2 **text_cnn_main.py**: 主程序文件。读取以上word_vec.p文件内容,设置一些配置信息并设置一些网络运行时需要的参数。 80 | 2.3 **text_cnn_model.py**:text-cnn模型文件。 81 | 82 | 83 | #### 3.实验结果展示 84 | ![结果](./Text_CNN/Result/分类结果.jpg) 85 | 86 | 87 | ### 4.经验分享 88 | 89 | 在工作用到TextCNN做query推荐,并结合先关的文献,谈几点经验: 90 | 1、TextCNN是一个n-gram特征提取器,对于训练集中没有的n-gram不能很好的提取。对于有些n-gram,可能过于强烈,反而会干扰模型,造成误分类。 91 | 2、TextCNN对词语的**顺序不敏感**,在query推荐中,我把正样本分词后得到的term做随机排序,正确率并没有降低太多,当然,其中一方面的原因短query本身对term的顺序要求不敏感。 92 | 3、TextCNN擅长长本文分类,在这一方面可以做到很高正确率。 93 | 4、TextCNN在模型结构方面有很多参数可调,具体参看文末的文献。 94 | 95 | 参考文献 96 | 《Convolutional Neural Networks for Sentence Classification》 97 | 《A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification》 98 | 99 | --- 100 | > [参考博客](https://jianwenjun.xyz/2018/03/16/%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C-TextCNN-%E5%9C%A8%E5%8F%A5%E5%AD%90%E5%88%86%E7%B1%BB%E4%B8%8A%E7%9A%84%E5%AE%9E%E7%8E%B0/) 101 | > [参考博客](https://blog.csdn.net/u012762419/article/details/79561441) 102 | 特此感谢 103 | 104 | --- 105 | 106 | ## LSTM&GRU 107 | 利用LSTM做文本分类 108 | 109 | ### 1.Usage 110 | 111 | #### 1.1 数据预处理 112 | 在data文件中,先使用`data_clean.py`对文本数据进行预处理 113 | 114 | 最后处理的格式信息如下: 115 | ``` 116 | df, word_vecs, word_cab_num, sentence_max_len, class_num 117 | ``` 118 | `df`:句子字典列表。其中包括句子的text、分类、split等辅助信息 119 | ``` 120 | { 121 | "label": #标签 122 | "num_words":int #句子长度 123 | "text":str #句子 124 | "split":[0,10] #十折交叉使用 125 | } 126 | ``` 127 | `word_vecs`:文本中所有词的词向量表示 128 | `word_cab_num`:文本中共有多少不同的词汇 129 | `sentence_max_len`:句子的最大长度 130 | `class_num`:多分类问题分几类 131 | 132 | 133 | #### 1.2 模型超参 134 | 模型参数在`rnn_model.py`进行相关的设置。其中需要修改的包括: 135 | ```python 136 | class TRNNConfig(object): 137 | self.embedding_dim = 100 # 词向量维度 138 | self.num_layers= 2 # 隐藏层层数 139 | self.hidden_dim = 128 # 隐藏层神经元 140 | self.rnn = 'lstm' # lstm 或 gru 141 | 142 | self.dropout_keep_prob = 0.8 # dropout保留比例 143 | self.learning_rate = 1e-3 # 学习率 144 | 145 | self.batch_size = 128 # 每批训练大小 146 | self.num_epochs = 10 # 总迭代轮次 147 | ``` 148 | 启动参数包括`rnn_run.py`的一些路径等配置信息 149 | ``` 150 | train_data = "../data/word_vec.p" #配置数据清洗后生成的数据路径 151 | label = "brand" #1中所述df的类别标签名 152 | ``` 153 | 154 | #### 1.3 运行 155 | ```python 156 | rnn_run.py train #训练&验证 157 | 
rnn_run.py test #测试 158 | ``` 159 | 160 | ### 2.模型介绍 161 | #### 2.1 LSTM 162 | lstm作为加入了attention机制的rnn网络,对长文本具有很好的记忆效果,其主要归功于模型结构。 163 | ![模型](LSTM_GRU/Picture/LSTM2.JPG) 164 | 165 | 以下是一个lstm单元的结构(**一个lstm单元也就是网络中的一层,即由上述num_layers控制**) 166 | ![模型](LSTM_GRU/Picture/LSTM.JPG) 167 | 其中输出即是一个`hidden_dim`的向量,以上两个参数控制lstm最核心的网络架构。 168 | 169 | #### 2.2 GRU 170 | gru可以说是lstm的初代版本,一个GRU单元如下所示 171 | ![模型](LSTM_GRU/Picture/GRU.JPG) 172 | 173 | 174 | ### 3.实验结果 175 | 本次实验是帮师兄做了的一个关于设备识别分类的工作。从50W条设备banner信息中对设备品牌和型号进行识别。 176 | 因为数据相对规整,用lstm处理得到的效果也非常好,正确率能达到99% 177 | ![模型](LSTM_GRU/Picture/accuracy.png) 178 | 179 | ![模型](LSTM_GRU/Picture/loss.png) 180 | 181 | ### 4.LSTM和GRU的区别 182 | 先给出一些结论: 183 | - GRU和LSTM的性能在很多任务上不分伯仲。 184 | - GRU 参数更少因此更容易收敛,但是数据集很大的情况下,LSTM表达性能更好。 185 | - 从结构上来说,GRU只有两个门(update和reset),LSTM有三个门(forget,input,output),GRU直接将hidden state 传给下一个单元,而LSTM则用memory cell 把hidden state 包装起来。 186 | 187 | 188 | ## HAN 189 | ### 1.模型介绍 190 | #### 1.1 特点 191 | (1)可以直观的看出用这个模型构建文本表示时各个句子和单词的重要程度,增强了可解释性。 192 | (2)文本中不同句子对文本的主旨影响程度不同,一个句子中不同的词语对句子主旨的影响程度也不同,因此HAN在**词语层面**和**句子层面**分别添加了注意力机制。 193 | 194 | #### 1.2 结构 195 | 它包括四个部分:一个词序列编码器,一个词级注意层,一个句子编码器和一个句子层注意层。具体结构如下图所示: 196 | ![](picture/README-1cd4ff0f.png) 197 | --- 198 | (1)词序列编码器是通过一个双向GRU实现的 199 | ![](picture/README-61a8bed9.png) 200 | 其中: 201 | $w_{it}: 第i个句子的第t个词语$ 202 | $W_e$ : embedding_matrix 203 | 前向和后向结果拼接得到词序列编码: 204 | ![](picture/README-85ffa053.png) 205 | 206 | (2)词级Attention层 207 | ![](picture/README-8f6b1559.png) 208 | 其中 $6$ 式得到权重向量 209 | 210 | (3)句子编码器和词编码器类似 211 | ![](picture/README-85cdfcb9.png) 212 | 拼接后得到句子编码结果 213 | ![](picture/README-7ea1b04c.png) 214 | 215 | (4)句子级Attention层 216 | ![](picture/README-857a805b.png) 217 | 得到 $v$ 即为文档的向量表示,可以通过一个全连接层,然后softmax输出,进行文档分类。 218 | ![](picture/README-4fcc65db.png) 219 | 220 | #### 1.3 可视化 221 | ![](picture/README-f68d8b8d.png) 222 | 图中蓝色颜色深浅表示word level的attention权重,红色颜色深浅表示sentence level的attention权重。 223 | 224 | 225 | 226 | 227 | ## Glove 228 | -------------------------------------------------------------------------------- /Text_CNN/.idea/Text_CNN.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /Text_CNN/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /Text_CNN/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Text_CNN/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | data_dir 61 | load_binary_vec 62 | 63 | 64 | 65 | 71 | 72 | 73 | 74 | 75 | true 76 | DEFINITION_ORDER 77 | 78 | 79 | 80 | 81 | 82 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 |