├── CRF分词 └── main.py ├── LSTM_GRU ├── .DS_Store ├── Picture │ ├── .DS_Store │ ├── GRU.JPG │ ├── GRU2.JPG │ ├── LSTM.JPG │ ├── LSTM2.JPG │ ├── accuracy.png │ ├── loss.png │ └── rnn_architecture.png ├── README.md ├── data │ └── data_clean.py └── demo │ ├── __pycache__ │ ├── process_data.cpython-36.pyc │ └── rnn_model.cpython-36.pyc │ ├── process_data.py │ ├── rnn_model.py │ └── rnn_run.py ├── README.md ├── Text_CNN ├── .idea │ ├── Text_CNN.iml │ ├── misc.xml │ ├── modules.xml │ └── workspace.xml ├── Result │ ├── 分类结果.jpg │ ├── 模型.png │ └── 流程.jpg ├── process_data.py ├── text_cnn_main.py └── text_cnn_model.py └── picture ├── README-1cd4ff0f.png ├── README-282eca2f.png ├── README-4fcc65db.png ├── README-61a8bed9.png ├── README-7ea1b04c.png ├── README-857a805b.png ├── README-85cdfcb9.png ├── README-85ffa053.png ├── README-8f6b1559.png ├── README-95176db7.png ├── README-d1c2b10b.png ├── README-f68d8b8d.png └── 分词数据集.png /CRF分词/main.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[81]: 5 | 6 | 7 | import re 8 | import numpy as np 9 | from 10 | 11 | 12 | # In[89]: 13 | 14 | 15 | sents = open(r'H:\分词数据\training\pku_training.utf8',encoding='utf-8').read() 16 | sents = sents.strip() 17 | sents = sents.split('\n') # 这个语料的换行符是\r\n 18 | 19 | 20 | # In[105]: 21 | 22 | 23 | sents = [re.split(' +', s) for s in sents] # 词之间以空格隔开 24 | sents = [[w for w in s if w] for s in sents] # 去掉空字符串 25 | np.random.shuffle(sents) # 打乱语料,以便后面划分验证集 26 | 27 | 28 | # In[91]: 29 | 30 | 31 | chars = {} # 统计字表 32 | for s in sents: 33 | for c in ''.join(s): 34 | if c in chars: 35 | chars[c] += 1 36 | else: 37 | chars[c] = 1 38 | 39 | min_count = 2 # 过滤低频字 40 | chars = {i:j for i,j in chars.items() if j >= min_count} # 过滤低频字 低频字的id是0 41 | id2char = {i+1:j for i,j in enumerate(chars)} # id到字的映射 42 | char2id = {j:i for i,j in id2char.items()} # 字到id的映射 43 | 44 | id2tag = {0:'s', 1:'b', 2:'m', 3:'e'} # 标签(sbme)与id之间的映射 45 | tag2id = {j:i for i,j in id2tag.items()} 46 | 47 | train_sents = sents[:-5000] # 留下5000个句子做验证,剩下的都用来训练 48 | valid_sents = sents[-5000:] 49 | 50 | 51 | # In[97]: 52 | 53 | 54 | batch_size = 128 55 | 56 | 57 | # In[98]: 58 | 59 | 60 | train_sents[0] 61 | 62 | 63 | # In[123]: 64 | 65 | 66 | def train_generator(): #定义数据生成器 67 | X, Y = [], [] 68 | while True: 69 | for i,text in enumerate(train_sents): 70 | sx,sy = [], [] 71 | for s in text: 72 | sx.extend([char2id.get(c,0) for c in s]) 73 | if len(s) == 1: 74 | sy.append(0) 75 | elif len(s) == 2: 76 | sy.extend([1,3]) 77 | else: 78 | sy.extend([1] + [2]*(len(s) - 2) + [3]) 79 | X.append(sx) 80 | Y.append(sy) 81 | if len(X) == batch_size or i == len(train_sents)-1: 82 | maxlen = max([len(t) for t in X]) 83 | X = [x+[4]*(maxlen-len(x)) for x in X] 84 | Y = [y+[4]*(maxlen-len(y)) for y in Y] 85 | yield np.array(X), to_categorical(Y, 5) 86 | X, Y = [], [] 87 | 88 | -------------------------------------------------------------------------------- /LSTM_GRU/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/.DS_Store -------------------------------------------------------------------------------- /LSTM_GRU/Picture/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/.DS_Store 
-------------------------------------------------------------------------------- /LSTM_GRU/Picture/GRU.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/GRU.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/GRU2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/GRU2.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/LSTM.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/LSTM.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/LSTM2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/LSTM2.JPG -------------------------------------------------------------------------------- /LSTM_GRU/Picture/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/accuracy.png -------------------------------------------------------------------------------- /LSTM_GRU/Picture/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/loss.png -------------------------------------------------------------------------------- /LSTM_GRU/Picture/rnn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/Picture/rnn_architecture.png -------------------------------------------------------------------------------- /LSTM_GRU/README.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 利用LSTM做文本分类 3 | 4 | ## Usage 5 | 6 | ### 1. 数据预处理 7 | 在data文件中,先使用`data_clean.py`对文本数据进行预处理,包括步骤如下: 8 | #### 1.1 原数据数据清洗 9 | 对所给文本文件进行去停用词、去异常文本、去超长文本操作 10 | 11 | #### 1.2 训练词向量 12 | 对将进行的文本信息(banner.txt)利用word2vec模型训练词向量 13 | ``` 14 | word2vec -train banner.txt -output vec1.bin -cbow 0 -hs 1 -threads 12 -binary 1 15 | ``` 16 | 17 | #### 1.3 词向量表示 18 | 对原文本中所有词建立词向量字典,未登录词采用正态分布随机表示. 
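A minimal sketch of this lookup step (the helper name `build_word_vectors` is illustrative; in `data_clean.py` the same idea is implemented by `load_binary_vec` plus `add_unexist_word_vec`, which draws OOV vectors from a small uniform range):
```python
import numpy as np

def build_word_vectors(vocab, pretrained, dim=100):
    """Map every corpus word to a vector; unseen (OOV) words get a random one."""
    word_vecs = {}
    for word in vocab:
        if word in pretrained:                      # found in the word2vec table (vec1.bin)
            word_vecs[word] = pretrained[word]
        else:                                       # OOV word: random initialization
            word_vecs[word] = np.random.uniform(-0.1, 0.1, dim)
    return word_vecs
```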
19 | 20 | --- 21 | 22 | 最后处理的格式信息如下: 23 | ``` 24 | df, word_vecs, word_cab_num, sentence_max_len, class_num 25 | ``` 26 | `df`:句子字典列表。其中包括句子的text、分类、split等辅助信息 27 | ``` 28 | { 29 | "label": #标签 30 | "num_words":int #句子长度 31 | "text":str #句子 32 | "split":[0,10] #十折交叉使用 33 | } 34 | ``` 35 | `word_vecs`:文本中所有词的词向量表示 36 | `word_cab_num`:文本中共有多少不同的词汇 37 | `sentence_max_len`:句子的最大长度 38 | `class_num`:多分类问题分几类 39 | 40 | 41 | ### 2.模型超参 42 | 模型参数在`rnn_model.py`进行相关的设置。其中需要修改的包括: 43 | ```python 44 | class TRNNConfig(object): 45 | self.embedding_dim = 100 # 词向量维度 46 | self.num_layers= 2 # 隐藏层层数 47 | self.hidden_dim = 128 # 隐藏层神经元 48 | self.rnn = 'lstm' # lstm 或 gru 49 | 50 | self.dropout_keep_prob = 0.8 # dropout保留比例 51 | self.learning_rate = 1e-3 # 学习率 52 | 53 | self.batch_size = 128 # 每批训练大小 54 | self.num_epochs = 10 # 总迭代轮次 55 | ``` 56 | 启动参数包括`rnn_run.py`的一些路径等配置信息 57 | ``` 58 | train_data = "../data/word_vec.p" #配置数据清洗后生成的数据路径 59 | label = "brand" #1中所述df的类别标签名 60 | ``` 61 | 62 | ### 3.运行 63 | ```python 64 | rnn_run.py train #训练&验证 65 | rnn_run.py test #测试 66 | ``` 67 | 68 | ## 模型介绍 69 | ### 1.LSTM 70 | lstm作为加入了attention机制的rnn网络,对长文本具有很好的记忆效果,其主要归功于模型结构。 71 | ![模型](./Picture/LSTM2.JPG) 72 | 73 | 以下是一个lstm单元的结构(**一个lstm单元也就是网络中的一层,即由上述num_layers控制**) 74 | ![模型](./Picture/LSTM.JPG) 75 | 其中输出即是一个`hidden_dim`的向量,以上两个参数控制lstm最核心的网络架构。 76 | 77 | ### 2.GRU 78 | gru可以说是lstm的初代版本,一个GRU单元如下所示 79 | ![模型](./Picture/GRU.JPG) 80 | 81 | ### 3.整体模型结构 82 | ![模型](./Picture/rnn_architecture.png) 83 | 84 | ## 实验结果 85 | 本次实验是帮师兄做了的一个关于设备识别分类的工作。从50W条设备banner信息中对设备品牌和型号进行识别。 86 | 因为数据相对规整,用lstm处理得到的效果也非常好,正确率能达到99% 87 | ![模型](./Picture/accuracy.png) 88 | 89 | ![模型](./Picture/loss.png) 90 | 91 | 92 | 93 | ## LSTM和GRU的区别 94 | 先给出一些结论: 95 | - GRU和LSTM的性能在很多任务上不分伯仲。 96 | - GRU 参数更少因此更容易收敛,但是数据集很大的情况下,LSTM表达性能更好。 97 | - 从结构上来说,GRU只有两个门(update和reset),LSTM有三个门(forget,input,output),GRU直接将hidden state 传给下一个单元,而LSTM则用memory cell 把hidden state 包装起来。 98 | -------------------------------------------------------------------------------- /LSTM_GRU/data/data_clean.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[10]: 5 | 6 | 7 | import pickle 8 | import numpy as np 9 | from collections import defaultdict,OrderedDict 10 | import re 11 | from tqdm import tqdm 12 | import pandas as pd 13 | from bitarray import bitarray 14 | 15 | 16 | # In[185]: 17 | 18 | 19 | def clean_string(string,TREC=False): 20 | string = re.sub(r"[^A-Za-z0-9,!?.]", " ", string) 21 | string = re.sub(r",", " ", string) 22 | string = re.sub(r"!", " ", string) 23 | string = re.sub(r"\(", " ", string) 24 | string = re.sub(r"\)", " ", string) 25 | string = re.sub(r"\?", " ", string) 26 | string = re.sub(r"(?<=\s)\w(?=\s)", " ", string) 27 | string = re.sub(r"\s{2,}", " ", string) 28 | return string.strip() if TREC else string.strip().lower() 29 | 30 | 31 | # In[245]: 32 | 33 | 34 | def load_data_k_cv(folder,cv=10,miniData = True): 35 | """struct : text 36 | device 37 | brand 38 | model 39 | split 40 | word_cab : 词频字典 41 | 42 | """ 43 | word_cab=defaultdict(int) 44 | df = [] 45 | num = 0 46 | with open(folder,'rb') as f: 47 | for line in tqdm(f): 48 | line = line.decode(encoding='ISO-8859-1') 49 | row = list(map(lambda x : x.strip(),line.strip().split("|")))[1:] 50 | if not (5 <= len(row) <= 6) : 51 | continue 52 | row = row[:3] + row[3].split(",") + row[4:] 53 | if len(row) != 6: 54 | continue 55 | row = list(map(lambda x : clean_string(x), row)) 56 | row.append(np.random.randint(0, cv)) 57 | 
df.append({"text":str(row[5]) +" "+ row[0]+" " + row[1],"device":row[2],"brand":row[3],"model":row[4],"split":row[6]}) 58 | num += 1 59 | if miniData and num == 10000: 60 | break 61 | 62 | word_cab = defaultdict(int) 63 | sentence_max_len = 0 64 | final_df = [] 65 | 66 | print("cleaning data") 67 | for struct in tqdm(df): 68 | length = len(struct["text"].split()) 69 | if length <= 200: 70 | struct["text"] = clean_string(struct["text"]) 71 | sentence_max_len = max(sentence_max_len, len(struct["text"].split())) 72 | final_df.append(struct) 73 | for word in struct["text"].split(): 74 | word_cab[word] += 1 75 | print("cleaning data finish!") 76 | return final_df, word_cab, sentence_max_len 77 | 78 | 79 | # In[246]: 80 | 81 | 82 | def load_binary_vec(fname, vocab): 83 | word_vecs = {} 84 | with open(fname, 'rb') as fin: 85 | header = fin.readline() 86 | vocab_size, vector_size = list(map(int, header.split())) 87 | binary_len = np.dtype(np.float32).itemsize * vector_size 88 | # vectors = [] 89 | for i in tqdm(range(vocab_size)): 90 | # read word 91 | word = b'' 92 | while True: 93 | ch = fin.read(1) 94 | if ch == b' ': 95 | break 96 | word += ch 97 | # print(str(word)) 98 | word = word.decode(encoding='ISO-8859-1') 99 | if word in vocab: 100 | word_vecs[word] = np.fromstring(fin.read(binary_len), dtype=np.float32) 101 | else: 102 | fin.read(binary_len) 103 | fin.read(1) # newline 104 | return word_vecs 105 | 106 | 107 | # In[247]: 108 | 109 | 110 | def add_unexist_word_vec(word_vecs, word_cab): 111 | for word in tqdm(set(word_cab.keys() -word_vecs.keys())): 112 | word_vecs[word] = np.random.uniform(-0.1,0.1,100) 113 | 114 | 115 | # In[248]: 116 | 117 | 118 | data_folder = r"all.txt" 119 | w2v_file = r'vec1.bin' 120 | 121 | 122 | # In[265]: 123 | 124 | 125 | print("load text") 126 | df, word_cab, sentence_max_len = load_data_k_cv(data_folder, 10, False) 127 | print("finish text load !!!") 128 | 129 | 130 | # In[266]: 131 | 132 | 133 | brandCount = defaultdict(int) 134 | for struct in df: 135 | brandCount[(struct['brand'])] += 1 136 | 137 | 138 | # In[267]: 139 | 140 | 141 | usefulBrand = set() 142 | for k, v in brandCount.items(): 143 | if v > 50: 144 | usefulBrand.add(k) 145 | 146 | 147 | # In[268]: 148 | 149 | 150 | for i in range(len(df)-1,-1,-1): 151 | if df[i]['brand'] not in usefulBrand: 152 | df.pop(i) 153 | 154 | 155 | # In[271]: 156 | 157 | 158 | len(df) 159 | 160 | 161 | # In[282]: 162 | 163 | 164 | with open("banner.txt","wb") as f: 165 | for struct in df: 166 | f.write(bytes(struct['text']+'\n', encoding="utf8")) 167 | 168 | 169 | 170 | # In[283]: 171 | 172 | 173 | print("load word2vec") 174 | word_vecs = load_binary_vec(w2v_file, word_cab) 175 | print("finish word2vec load !!!") 176 | 177 | 178 | # In[285]: 179 | 180 | 181 | add_unexist_word_vec(word_vecs,word_cab) 182 | 183 | 184 | # In[286]: 185 | 186 | 187 | len(word_vecs) 188 | 189 | 190 | # In[287]: 191 | 192 | 193 | pickle.dump([df,word_vecs,word_cab,sentence_max_len],open(r'word_vec.p','wb')) 194 | 195 | 196 | -------------------------------------------------------------------------------- /LSTM_GRU/demo/__pycache__/process_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/demo/__pycache__/process_data.cpython-36.pyc -------------------------------------------------------------------------------- /LSTM_GRU/demo/__pycache__/rnn_model.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Happy-zyy/NLP-Model/f5fefb747f5c21fc8d348b8622846156197d21d6/LSTM_GRU/demo/__pycache__/rnn_model.cpython-36.pyc -------------------------------------------------------------------------------- /LSTM_GRU/demo/process_data.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import pandas as pd 3 | import numpy as np 4 | import tensorflow.contrib.keras as kr 5 | 6 | def getWordsVect(config, W): 7 | word_ids = defaultdict(int) 8 | W_list = [] 9 | W_list.append([0.0] * config.embedding_dim) 10 | count = 1 11 | for word,vector in W.items(): 12 | W_list.append(vector.tolist()) 13 | word_ids[word] = count 14 | count = count + 1 15 | return word_ids,W_list 16 | 17 | 18 | def get_train_test_data(word_ids, data_set_df, label, sentence_max_len, cv_id=9): 19 | """将句子转换为id表示""" 20 | s = set() 21 | for struct in data_set_df: 22 | s.add(struct[label]) 23 | cat = sorted(list(s)) 24 | cat_to_id = dict(zip(cat, range(len(cat)))) 25 | 26 | data_id, label_id = [], [] 27 | for i in range(len(data_set_df)): 28 | data_id.append([word_ids[x] for x in data_set_df[i]['text'] if x in word_ids]) 29 | label_id.append(cat_to_id[data_set_df[i][label]]) 30 | 31 | # 使用keras提供的pad_sequences来将文本pad为固定长度 32 | 33 | x_pad = kr.preprocessing.sequence.pad_sequences(data_id, sentence_max_len, padding="pre") 34 | y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id)) # 将标签转换为one-hot表示 35 | 36 | train_index, test_index = [], [] 37 | if cv_id >= 0: 38 | for x in range(len(data_set_df)): 39 | if int(data_set_df[x]["split"]) < cv_id: 40 | train_index.append(x) 41 | else: 42 | test_index.append(x) 43 | 44 | print("************") 45 | print("train_Num",len(train_index)) 46 | print("test_Num", len(test_index)) 47 | return x_pad[train_index], y_pad[train_index], x_pad[test_index], y_pad[test_index] 48 | else: 49 | return x_pad, y_pad, cat -------------------------------------------------------------------------------- /LSTM_GRU/demo/rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | import pandas as pd 6 | class TRNNConfig(object): 7 | """RNN配置参数""" 8 | 9 | def __init__(self, sentence_max_len, class_num, vocab_size): 10 | # 模型参数 11 | self.embedding_dim = 100 # 词向量维度 12 | self.num_classes = class_num # 类别数 13 | self.vocab_size = vocab_size # 词汇表达小 14 | self.sentence_max_len = sentence_max_len 15 | 16 | self.num_layers= 2 # 隐藏层层数 17 | self.hidden_dim = 128 # 隐藏层神经元 18 | self.rnn = 'lstm' # lstm 或 gru 19 | 20 | self.dropout_keep_prob = 0.8 # dropout保留比例 21 | self.learning_rate = 1e-3 # 学习率 22 | 23 | self.batch_size = 128 # 每批训练大小 24 | self.num_epochs = 10 # 总迭代轮次 25 | 26 | self.print_per_batch = 100 # 每多少轮输出一次结果 27 | self.save_per_batch = 100 # 每多少轮存入tensorboard 28 | 29 | 30 | class TextRNN(object): 31 | """文本分类,RNN模型""" 32 | def __init__(self, config, W_list, trainWordVec = False): 33 | self.config = config 34 | self.W_list = W_list 35 | self.trainWordVec = trainWordVec 36 | # 三个待输入的数据 37 | self.input_x = tf.placeholder(tf.int32, [None, self.config.sentence_max_len], name='input_x') 38 | self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y') 39 | self.keep_prob = tf.placeholder(tf.float32, name='keep_prob') 40 | 41 | self.rnn() 42 | 43 | def rnn(self): 44 | """rnn模型""" 45 | 
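        # Overview of the graph built below:
        #   1. lstm_cell()/gru_cell() create one recurrent cell per layer (selected by
        #      config.rnn), and dropout() wraps each cell in a DropoutWrapper.
        #   2. The embedding layer is either a trainable variable or the fixed,
        #      pre-trained matrix W_list (trainable=False), looked up on input_x.
        #   3. num_layers wrapped cells are stacked with MultiRNNCell and unrolled by
        #      dynamic_rnn; only the last time step's output feeds the classifier.
        #   4. A dense layer (+ dropout + relu) and a softmax output layer produce the
        #      logits; Adam minimizes the cross-entropy loss and accuracy is tracked.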
46 | def lstm_cell(): # lstm核 47 | return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True) 48 | 49 | def gru_cell(): # gru核 50 | return tf.contrib.rnn.GRUCell(self.config.hidden_dim) 51 | 52 | def dropout(): # 为每一个rnn核后面加一个dropout层 53 | if (self.config.rnn == 'lstm'): 54 | cell = lstm_cell() 55 | else: 56 | cell = gru_cell() 57 | return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 58 | 59 | # 词向量映射 60 | with tf.device('/cpu:0'): 61 | if self.trainWordVec: 62 | embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim]) 63 | else: 64 | 65 | embedding = tf.Variable(initial_value=self.W_list, dtype=tf.float32, trainable=False, name='embedding_layer_W') 66 | embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x) 67 | 68 | with tf.name_scope("rnn"): 69 | # 多层rnn网络 70 | cells = [dropout() for _ in range(self.config.num_layers)] 71 | rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True) 72 | 73 | _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32) 74 | last = _outputs[:, -1, :] # 取最后一个时序输出作为结果 75 | 76 | with tf.name_scope("score"): 77 | # 全连接层,后面接dropout以及relu激活 78 | fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1') 79 | fc = tf.contrib.layers.dropout(fc, self.keep_prob) 80 | fc = tf.nn.relu(fc) 81 | 82 | # 分类器 83 | self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2') 84 | self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别 85 | 86 | with tf.name_scope("optimize"): 87 | # 损失函数,交叉熵 88 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) 89 | self.loss = tf.reduce_mean(cross_entropy) 90 | # 优化器 91 | self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss) 92 | 93 | with tf.name_scope("accuracy"): 94 | # 准确率 95 | correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls) 96 | self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 97 | -------------------------------------------------------------------------------- /LSTM_GRU/demo/rnn_run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import time 8 | from datetime import timedelta 9 | import pickle 10 | import numpy as np 11 | import tensorflow as tf 12 | from sklearn import metrics 13 | from tqdm import tqdm 14 | from rnn_model import TRNNConfig, TextRNN 15 | import process_data 16 | 17 | tensorboard_dir = '../model/tensorboard/textrnn' # 可视化路径 18 | save_dir = '../model/checkpoints/textrnn' 19 | save_path = os.path.join(save_dir, 'best_validation') # 最佳验证结果保存路径 20 | 21 | train_data = "../data/word_vec.p" 22 | label = "brand" 23 | 24 | def get_time_dif(start_time): 25 | """获取已使用时间""" 26 | end_time = time.time() 27 | time_dif = end_time - start_time 28 | return timedelta(seconds=int(round(time_dif))) 29 | 30 | 31 | def feed_data(model, x_batch, y_batch, keep_prob): 32 | feed_dict = { 33 | model.input_x: x_batch, 34 | model.input_y: y_batch, 35 | model.keep_prob: keep_prob 36 | } 37 | return feed_dict 38 | 39 | def batch_iter(x, y, batch_size=128): 40 | """生成批次数据""" 41 | data_len = len(x) 42 | num_batch = int((data_len - 1) / batch_size) + 1 43 | 44 | indices = np.random.permutation(np.arange(data_len)) 45 | x_shuffle = x[indices] #乱序 46 | y_shuffle = y[indices] 47 | 48 | for i in range(num_batch): 49 | start_id = i * 
batch_size 50 | end_id = min((i + 1) * batch_size, data_len) 51 | yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id] 52 | 53 | 54 | def evaluate(model, sess, x_, y_): 55 | """评估在某一数据上的准确率和损失""" 56 | data_len = len(x_) 57 | batch_eval = batch_iter(x_, y_, 128) 58 | total_loss = 0.0 59 | total_acc = 0.0 60 | for x_batch, y_batch in batch_eval: 61 | batch_len = len(x_batch) 62 | feed_dict = feed_data(model, x_batch, y_batch, 1.0) 63 | loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict) 64 | total_loss += loss * batch_len 65 | total_acc += acc * batch_len 66 | 67 | return total_loss / data_len, total_acc / data_len 68 | 69 | 70 | def train(): 71 | print("Configuring TensorBoard and Saver...") 72 | # 配置 Tensorboard,重新训练时,请将tensorboard文件夹删除,不然图会覆盖 73 | 74 | if not os.path.exists(tensorboard_dir): 75 | os.makedirs(tensorboard_dir) 76 | 77 | tf.summary.scalar("loss", model.loss) 78 | tf.summary.scalar("accuracy", model.acc) 79 | merged_summary = tf.summary.merge_all() 80 | writer = tf.summary.FileWriter(tensorboard_dir) 81 | 82 | # 配置 Saver 83 | saver = tf.train.Saver() 84 | if not os.path.exists(save_dir): 85 | os.makedirs(save_dir) 86 | 87 | print("Loading training and validation data...") 88 | # 载入训练集与验证集 89 | start_time = time.time() 90 | x_train, y_train, x_val, y_val = process_data.get_train_test_data(word_ids, df, label, sentence_max_len, 9) 91 | 92 | time_dif = get_time_dif(start_time) 93 | print("Time usage:", time_dif) 94 | 95 | # 创建session 96 | session = tf.Session() 97 | session.run(tf.global_variables_initializer()) 98 | writer.add_graph(session.graph) 99 | 100 | print('Training and evaluating...') 101 | start_time = time.time() 102 | total_batch = 0 # 总批次 103 | best_acc_val = 0.0 # 最佳验证集准确率 104 | last_improved = 0 # 记录上一次提升批次 105 | require_improvement = 1000 # 如果超过1000轮未提升,提前结束训练 106 | 107 | flag = False 108 | for epoch in range(config.num_epochs): 109 | print('Epoch:', epoch + 1) 110 | batch_train = batch_iter(x_train, y_train, config.batch_size) 111 | for x_batch, y_batch in tqdm(batch_train): 112 | feed_dict = feed_data(model, x_batch, y_batch, config.dropout_keep_prob) 113 | if total_batch % config.save_per_batch == 0: 114 | # 每多少轮次将训练结果写入tensorboard scalar 115 | s = session.run(merged_summary, feed_dict=feed_dict) 116 | writer.add_summary(s, total_batch) 117 | 118 | if total_batch % config.print_per_batch == 0: 119 | # 每多少轮次输出在训练集和验证集上的性能 120 | feed_dict[model.keep_prob] = 1.0 121 | loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict) 122 | loss_val, acc_val = evaluate(model, session, x_val, y_val) # todo 123 | 124 | if acc_val > best_acc_val: 125 | # 保存最好结果 126 | best_acc_val = acc_val 127 | last_improved = total_batch 128 | saver.save(sess=session, save_path=save_path) 129 | improved_str = '*' 130 | else: 131 | improved_str = '' 132 | 133 | time_dif = get_time_dif(start_time) 134 | msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \ 135 | + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}' 136 | print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str)) 137 | 138 | session.run(model.optim, feed_dict=feed_dict) # 运行优化 139 | total_batch += 1 140 | 141 | if total_batch - last_improved > require_improvement: 142 | # 验证集正确率长期不提升,提前结束训练 143 | print("No optimization for a long time, auto-stopping...") 144 | flag = True 145 | break # 跳出循环 146 | if flag: # 同上 147 | break 148 | session.close() 149 | 150 | 151 | def test(): 152 | print("Loading test data...") 153 | 
start_time = time.time() 154 | 155 | x_test, y_test , categories = process_data.get_train_test_data(word_ids, df, label, sentence_max_len, -1) 156 | 157 | 158 | session = tf.Session() 159 | session.run(tf.global_variables_initializer()) 160 | saver = tf.train.Saver() 161 | saver.restore(sess=session, save_path=save_path) # 读取保存的模型 162 | 163 | print('Testing...') 164 | loss_test, acc_test = evaluate(session, x_test, y_test) 165 | msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}' 166 | print(msg.format(loss_test, acc_test)) 167 | 168 | batch_size = 128 169 | data_len = len(x_test) 170 | num_batch = int((data_len - 1) / batch_size) + 1 171 | 172 | y_test_cls = np.argmax(y_test, 1) 173 | y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32) # 保存预测结果 174 | for i in range(num_batch): # 逐批次处理 175 | start_id = i * batch_size 176 | end_id = min((i + 1) * batch_size, data_len) 177 | feed_dict = { 178 | model.input_x: x_test[start_id:end_id], 179 | model.keep_prob: 1.0 180 | } 181 | y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict) 182 | 183 | # 评估 184 | print("Precision, Recall and F1-Score...") 185 | print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)) 186 | 187 | # 混淆矩阵 188 | print("Confusion Matrix...") 189 | cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) 190 | print(cm) 191 | 192 | time_dif = get_time_dif(start_time) 193 | print("Time usage:", time_dif) 194 | 195 | 196 | if __name__ == '__main__': 197 | if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: 198 | raise ValueError("""usage: python run_rnn.py [train / test]""") 199 | 200 | print('Configuring RNN model...') 201 | 202 | print('load data. . .') 203 | X = pickle.load(open(train_data, 'rb')) 204 | df, word_vecs, word_cab_num, sentence_max_len, class_num = X[0], X[1], X[2], X[3], X[4] 205 | 206 | config = TRNNConfig(sentence_max_len, class_num, word_cab_num) 207 | 208 | word_ids, W_list = process_data.getWordsVect(config, word_vecs) 209 | 210 | model = TextRNN(config, W_list, False) #默认不训练词向量 211 | 212 | 213 | if sys.argv[1] == 'train': 214 | train() 215 | else: 216 | test() 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Model 2 | Learn and demonstrate some classical model 3 | 4 | 5 | 6 | ## 目录 7 | 8 | * [Text-CNN](#text-cnn) 9 | * [LSTM&GRU](#lstm) 10 | * [HAN](#HAN) 11 | * [Glove](#glove) 12 | 13 | 14 | ## Text-CNN 15 | ### 1. 模型展示 16 | ![模型](./Text_CNN/Result/模型.png) 17 | 18 | ### 2. 参数与超参数 19 | 20 | **sequence_length** 21 | Q: 对于CNN, 输入与输出都是固定的,可每个句子长短不一, 怎么处理? 22 | A: 需要做定长处理, 比如定为n, 超过的截断, 不足的补0. 注意补充的0对后面的结果没有影响,因为后面的max-pooling只会输出最大值,补零的项会被过滤掉. 23 | 24 | **num_classes** 25 | 多分类, 分为几类. 26 | 27 | **vocabulary_size** 28 | 语料库的词典大小, 记为|D|. 29 | 30 | 31 | **embedding_size** 32 | 将词向量的维度, 由原始的 |D| 降维到 embedding_size. 33 | 34 | 35 | **filter_size_arr** 36 | 多个不同size的filter. 37 | 38 | 39 | ### 3. demo流程 40 | ```C 41 | str_length = 36 42 | word_vec = 128 43 | filter_size = [2,3,4] 每种尺寸2个filter 44 | ``` 45 | 46 | ![流程](./Text_CNN/Result/流程.jpg) 47 | 48 | ### 3.实验部分 49 | #### 1 数据集介绍 50 | 1.1 实验的过程中只使用了[MR数据集](https://www.cs.cornell.edu/people/pabo/movie-review-data/),验证方式是10 folds的交叉验证方式。 51 | > 数据集中包含了5331 positive and 5331 negative processed sentences / snippets. Introduced in Pang/Lee ACL 2005. Released July 2005. 
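A minimal sketch of how one round of this 10-fold evaluation can be carved out of the preprocessed sentence list (`revs` and `fold_id` are illustrative names; the `split` field, a random fold index in [0, 10), is described later under process_data.py):
```python
def split_by_fold(revs, fold_id):
    """Hold out the sentences whose split equals fold_id; train on the rest."""
    train = [r for r in revs if r["split"] != fold_id]
    test = [r for r in revs if r["split"] == fold_id]
    return train, test

# Repeating this for fold_id = 0..9 and averaging the test accuracy gives the
# 10-fold cross-validation result.
```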
52 | 53 | 2.1 词向量包含以下三种(**可以任意选一种或多种累加当作一个词不同的channel**): 54 | + **CNN-rand**:句子中的的word vector都是随机初始化的,同时当做CNN训练过程中需要优化的参数; 55 | + **CNN-static**:句子中的word vector是使用word2vec预先对Google News dataset(about 100 billion words)进行训练好的词向量表中的词向量。且在CNN训练过程中作为固定的输入,不作为优化的参数; 56 | + **CNN-non-static**:句子中的word vector是使用word2vec预先对Google News dataset(about 100 billion words)进行训练好的词向量表中的词向量。在CNN训练过程中作为固定的输入,做为CNN训练过程中**需要优化**的参数; 57 | 58 | 说明: 59 | 60 | > + GoogleNews-vectors-negative300.bin.gz词向量表是通过word2vec使用命令预先训练好,花费时间较长。 61 | 已经训练好的:[GoogleNews-vectors-negative300.bin.gz百度云盘下载地址](https://pan.baidu.com/share/init?surl=OglaQBBO30d5KdzZNNdRSg) 密码:18yf 62 | > + word2vec预先训练命令如:```./word2vec -train text8(语料) -output vectors.bin(输出词向量表) -cbow(训练使用模型方式) 0 -size 48 -window 5 -negative 0 -hs 1 -sample 1e-4 -threads 20 -binary 1 -iter 100``` 63 | > + 除了使用word2vec对语料库进行预先训练外,也可以使用glove或FastText进行词向量训练。 64 | 65 | 66 | #### 2.文件介绍 67 | 68 | 2.1 **process\_data.py**:加载Google训练的词向量表GoogleNews-vectors-negative300.bin,并对文本数据做一些预处理,使其转化为NN易用的形式,并将其存储在文件中。 69 | 最终存储为一个word\_vec.p,其文件存储的内容是[**随机词向量表,已训练好的词向量表, 词频字典, 最大句子长度, revs**]; 70 | 其中revs是一个结构体列表,列表中的每个元素如下所示: 71 | ``` 72 | { 73 | "y":0/1 #标签 74 | "num_words":int #句子长度 75 | "text":str #句子 76 | "split":[0,10] #十折交叉使用 77 | } 78 | ``` 79 | 2.2 **text_cnn_main.py**: 主程序文件。读取以上word_vec.p文件内容,设置一些配置信息并设置一些网络运行时需要的参数。 80 | 2.3 **text_cnn_model.py**:text-cnn模型文件。 81 | 82 | 83 | #### 3.实验结果展示 84 | ![结果](./Text_CNN/Result/分类结果.jpg) 85 | 86 | 87 | ### 4.经验分享 88 | 89 | 在工作用到TextCNN做query推荐,并结合先关的文献,谈几点经验: 90 | 1、TextCNN是一个n-gram特征提取器,对于训练集中没有的n-gram不能很好的提取。对于有些n-gram,可能过于强烈,反而会干扰模型,造成误分类。 91 | 2、TextCNN对词语的**顺序不敏感**,在query推荐中,我把正样本分词后得到的term做随机排序,正确率并没有降低太多,当然,其中一方面的原因短query本身对term的顺序要求不敏感。 92 | 3、TextCNN擅长长本文分类,在这一方面可以做到很高正确率。 93 | 4、TextCNN在模型结构方面有很多参数可调,具体参看文末的文献。 94 | 95 | 参考文献 96 | 《Convolutional Neural Networks for Sentence Classification》 97 | 《A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification》 98 | 99 | --- 100 | > [参考博客](https://jianwenjun.xyz/2018/03/16/%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C-TextCNN-%E5%9C%A8%E5%8F%A5%E5%AD%90%E5%88%86%E7%B1%BB%E4%B8%8A%E7%9A%84%E5%AE%9E%E7%8E%B0/) 101 | > [参考博客](https://blog.csdn.net/u012762419/article/details/79561441) 102 | 特此感谢 103 | 104 | --- 105 | 106 | ## LSTM&GRU 107 | 利用LSTM做文本分类 108 | 109 | ### 1.Usage 110 | 111 | #### 1.1 数据预处理 112 | 在data文件中,先使用`data_clean.py`对文本数据进行预处理 113 | 114 | 最后处理的格式信息如下: 115 | ``` 116 | df, word_vecs, word_cab_num, sentence_max_len, class_num 117 | ``` 118 | `df`:句子字典列表。其中包括句子的text、分类、split等辅助信息 119 | ``` 120 | { 121 | "label": #标签 122 | "num_words":int #句子长度 123 | "text":str #句子 124 | "split":[0,10] #十折交叉使用 125 | } 126 | ``` 127 | `word_vecs`:文本中所有词的词向量表示 128 | `word_cab_num`:文本中共有多少不同的词汇 129 | `sentence_max_len`:句子的最大长度 130 | `class_num`:多分类问题分几类 131 | 132 | 133 | #### 1.2 模型超参 134 | 模型参数在`rnn_model.py`进行相关的设置。其中需要修改的包括: 135 | ```python 136 | class TRNNConfig(object): 137 | self.embedding_dim = 100 # 词向量维度 138 | self.num_layers= 2 # 隐藏层层数 139 | self.hidden_dim = 128 # 隐藏层神经元 140 | self.rnn = 'lstm' # lstm 或 gru 141 | 142 | self.dropout_keep_prob = 0.8 # dropout保留比例 143 | self.learning_rate = 1e-3 # 学习率 144 | 145 | self.batch_size = 128 # 每批训练大小 146 | self.num_epochs = 10 # 总迭代轮次 147 | ``` 148 | 启动参数包括`rnn_run.py`的一些路径等配置信息 149 | ``` 150 | train_data = "../data/word_vec.p" #配置数据清洗后生成的数据路径 151 | label = "brand" #1中所述df的类别标签名 152 | ``` 153 | 154 | #### 1.3 运行 155 | ```python 156 | rnn_run.py train #训练&验证 157 | 
rnn_run.py test #测试 158 | ``` 159 | 160 | ### 2.模型介绍 161 | #### 2.1 LSTM 162 | lstm作为加入了attention机制的rnn网络,对长文本具有很好的记忆效果,其主要归功于模型结构。 163 | ![模型](LSTM_GRU/Picture/LSTM2.JPG) 164 | 165 | 以下是一个lstm单元的结构(**一个lstm单元也就是网络中的一层,即由上述num_layers控制**) 166 | ![模型](LSTM_GRU/Picture/LSTM.JPG) 167 | 其中输出即是一个`hidden_dim`的向量,以上两个参数控制lstm最核心的网络架构。 168 | 169 | #### 2.2 GRU 170 | gru可以说是lstm的初代版本,一个GRU单元如下所示 171 | ![模型](LSTM_GRU/Picture/GRU.JPG) 172 | 173 | 174 | ### 3.实验结果 175 | 本次实验是帮师兄做了的一个关于设备识别分类的工作。从50W条设备banner信息中对设备品牌和型号进行识别。 176 | 因为数据相对规整,用lstm处理得到的效果也非常好,正确率能达到99% 177 | ![模型](LSTM_GRU/Picture/accuracy.png) 178 | 179 | ![模型](LSTM_GRU/Picture/loss.png) 180 | 181 | ### 4.LSTM和GRU的区别 182 | 先给出一些结论: 183 | - GRU和LSTM的性能在很多任务上不分伯仲。 184 | - GRU 参数更少因此更容易收敛,但是数据集很大的情况下,LSTM表达性能更好。 185 | - 从结构上来说,GRU只有两个门(update和reset),LSTM有三个门(forget,input,output),GRU直接将hidden state 传给下一个单元,而LSTM则用memory cell 把hidden state 包装起来。 186 | 187 | 188 | ## HAN 189 | ### 1.模型介绍 190 | #### 1.1 特点 191 | (1)可以直观的看出用这个模型构建文本表示时各个句子和单词的重要程度,增强了可解释性。 192 | (2)文本中不同句子对文本的主旨影响程度不同,一个句子中不同的词语对句子主旨的影响程度也不同,因此HAN在**词语层面**和**句子层面**分别添加了注意力机制。 193 | 194 | #### 1.2 结构 195 | 它包括四个部分:一个词序列编码器,一个词级注意层,一个句子编码器和一个句子层注意层。具体结构如下图所示: 196 | ![](picture/README-1cd4ff0f.png) 197 | --- 198 | (1)词序列编码器是通过一个双向GRU实现的 199 | ![](picture/README-61a8bed9.png) 200 | 其中: 201 | $w_{it}: 第i个句子的第t个词语$ 202 | $W_e$ : embedding_matrix 203 | 前向和后向结果拼接得到词序列编码: 204 | ![](picture/README-85ffa053.png) 205 | 206 | (2)词级Attention层 207 | ![](picture/README-8f6b1559.png) 208 | 其中 $6$ 式得到权重向量 209 | 210 | (3)句子编码器和词编码器类似 211 | ![](picture/README-85cdfcb9.png) 212 | 拼接后得到句子编码结果 213 | ![](picture/README-7ea1b04c.png) 214 | 215 | (4)句子级Attention层 216 | ![](picture/README-857a805b.png) 217 | 得到 $v$ 即为文档的向量表示,可以通过一个全连接层,然后softmax输出,进行文档分类。 218 | ![](picture/README-4fcc65db.png) 219 | 220 | #### 1.3 可视化 221 | ![](picture/README-f68d8b8d.png) 222 | 图中蓝色颜色深浅表示word level的attention权重,红色颜色深浅表示sentence level的attention权重。 223 | 224 | 225 | 226 | 227 | ## Glove 228 | -------------------------------------------------------------------------------- /Text_CNN/.idea/Text_CNN.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /Text_CNN/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /Text_CNN/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Text_CNN/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | data_dir 61 | load_binary_vec 62 | 63 | 64 | 65 | 71 | 72 | 73 | 74 | 75 | true 76 | DEFINITION_ORDER 77 | 78 | 79 | 80 | 81 | 82 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 |