├── .idea
│   ├── csdnSMP.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── workspace.xml
├── cut_lines.py
├── preprocess.py
├── readme.txt
├── seg_data.py
├── train_spyder.py
├── train_word2vec.py
├── training.py
├── utils
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   └── data_path.cpython-35.pyc
│   └── data_path.py
└── word2vec_test.py
/cut_lines.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/cut_lines.py
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/preprocess.py
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
CSDN user profiling
--------------------------------------------------------------------------------
/seg_data.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/seg_data.py
--------------------------------------------------------------------------------
/train_spyder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 13 21:14:40 2017

@author: chauncy
"""
"""
The word vectors have already been trained; how should they be used?
"""
import itertools
import pandas as pd
import utils.data_path as dp

"""
Use rules to find each user's tags:
1. For each user in the test set, find all associated blog posts in the data set,
   then check how large the overlap between the posts is.
"""
pd_train = pd.read_csv(dp.TrainCsv, index_col=0, encoding="utf8")
pd_train["blog_uid"] = pd_train["blog_uid"].apply(eval)
train_blog_uids = list(itertools.chain(*list(pd_train["blog_uid"])))
print("train_blog_uids length---", len(train_blog_uids))
train_unique_blog_uids = list(set(train_blog_uids))
print("train_unique_blog_uids length---", len(train_unique_blog_uids))

# voca = {word: index for index, word in enumerate(train_unique_blog_uids)}

pd_dev = pd.read_csv(dp.DevCsv, index_col=0, encoding="utf8")
pd_dev["blog_uid"] = pd_dev["blog_uid"].apply(eval)
dev_blog_uids = list(itertools.chain(*list(pd_dev["blog_uid"])))
print("dev_blog_uids length---", len(dev_blog_uids))
dev_unique_blog_uids = list(set(dev_blog_uids))
print("dev_unique_blog_uids length---", len(dev_unique_blog_uids))

print("in train but not in dev blog length---", len(set(train_unique_blog_uids) - set(dev_unique_blog_uids)))
print("in train also in dev blog length---", len(set(train_unique_blog_uids) & set(dev_unique_blog_uids)))

# Collect every blog_id that is needed and write it to a file.
with open("test.txt", 'w') as f:
    blog_id_list = []
    union = list(set(train_unique_blog_uids) | set(dev_unique_blog_uids))
    for i in union:
        blog_id_list.append(i)
        f.write(str(i) + "\n")

# Read the blog content file once to collect the blog id of every line.
blog_source = open(dp.BlogContentTxt, encoding="utf8")
line = blog_source.readline()
total_blog_ids = []
while line:
    line = line.strip()
    blog_id_temp = line[:line.index("\001")]
    print("blog id:", blog_id_temp)
    total_blog_ids.append(blog_id_temp)
    line = blog_source.readline()
blog_source.close()

# Find the line index of every blog id that we need.
blog_id_index = []
for i in blog_id_list:
    print("blog id:", i)
    print("blog id index:", total_blog_ids.index(i))
    blog_id_index.append(total_blog_ids.index(i))

blog_id_index_sort = sorted(blog_id_index)
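
# Illustrative alternative (a sketch, not the original approach): each list.index()
# call above scans the whole list; building a dict of first occurrences once makes
# every lookup O(1) and yields the same sorted index list.
id_to_index = {}
for idx, bid in enumerate(total_blog_ids):
    id_to_index.setdefault(bid, idx)
blog_id_index_fast = sorted(id_to_index[i] for i in blog_id_list if i in id_to_index)
assert blog_id_index_fast == blog_id_index_sort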

target_file = open("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.txt", 'w')
# Extract the blog posts that correspond to the selected blog ids.
blog_source = open(dp.BlogContentTxt, encoding="utf8")
line = blog_source.readline()
count_temp = 0
count_stop = 0
while line:
    if count_temp < len(blog_id_index_sort) and blog_id_index_sort[count_temp] == count_stop:
        print("count_temp:", count_temp)
        line = line.strip()
        target_file.write(line + "\n")
        count_temp += 1
    line = blog_source.readline()
    count_stop += 1
blog_source.close()

if target_file:
    target_file.flush()
    target_file.close()

"""
There are 8,193,593 blog references including duplicates,
but only 23,507 unique blogs.
"""
"""
train blog_uids length--- 8193593
train unique blog_uids length--- 23507
test blog_uids length--- 8948850
test unique blog_uids length--- 26703
in train also in dev blog length--- 4139
in train but not in dev blog length--- 19368
"""

"""
Now find the users associated with the blogs each user interacted with.
"""
import pandas as pd

pd_post = pd.read_csv(dp.PostTxt, encoding="utf8", sep="\001")
pd_post.columns = ['uid', 'blog_id', 'time']
temp = set(train_unique_blog_uids) - set(list(pd_post['blog_id']))  # 0

pd_brows = pd.read_csv(dp.BrowseTxt, encoding="utf8", sep="\001")
pd_brows.columns = ['uid', 'blog_id', 'time']

pd_comment = pd.read_csv(dp.CommentTxt, encoding="utf8", sep="\001")
pd_comment.columns = ['uid', 'blog_id', 'time']

pd_voteup = pd.read_csv(dp.VoteupTxt, encoding="utf8", sep="\001")
pd_voteup.columns = ['uid', 'blog_id', 'time']

pd_votedown = pd.read_csv(dp.VotedownTxt, encoding="utf8", sep="\001")
pd_votedown.columns = ['uid', 'blog_id', 'time']

pd_favorite = pd.read_csv(dp.FavoriteTxt, encoding="utf8", sep="\001")
pd_favorite.columns = ['uid', 'blog_id', 'time']
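
# A minimal sketch of the lookup described in the docstring above (illustrative only;
# it assumes the 'uid'/'blog_id' column names assigned to the interaction tables here):
# given a list of blog ids, collect every uid that posted, browsed, commented on,
# voted on or favorited one of those blogs.
def users_for_blogs(blog_ids, *tables):
    """Return the set of uids that appear together with any of the given blog ids."""
    blog_ids = set(blog_ids)
    uids = set()
    for table in tables:
        uids |= set(table.loc[table['blog_id'].isin(blog_ids), 'uid'])
    return uids

# e.g. users related to the blogs of the first training user:
# related_uids = users_for_blogs(pd_train["blog_uid"].iloc[0],
#                                pd_post, pd_brows, pd_comment,
#                                pd_voteup, pd_votedown, pd_favorite)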

"""
Read the selected blog content (blogs that appear in both the train and test data),
segment it with jieba, and write the result to the some_blogcontent csv file.
"""
pd_blog_content = pd.read_csv("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.txt",
                              encoding="utf8", sep="\001")
pd_blog_content.columns = ['blog_id', 'title', 'content']
pd_blog_content['blog_jieba'] = [None] * len(pd_blog_content)

import jieba

for index, row in pd_blog_content.iterrows():
    print("index", index)
    # write back via .at: mutating `row` inside iterrows() does not modify the DataFrame
    pd_blog_content.at[index, 'blog_jieba'] = list(jieba.cut(row['title'] + row['content']))

pd_blog_content.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv', encoding="utf8")

# read the some_blogcontent csv file with pandas
pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv',
                              index_col=0, encoding="utf8")
pd_blog_content['content'] = [None] * len(pd_blog_content)
pd_blog_content["blog_jieba"] = pd_blog_content["blog_jieba"].apply(eval)

"""
Build the some_blogcontent vocabulary, because the full corpus is too big to load at once.
Look up the word vector for every vocabulary word and write them to a csv file.
"""
import itertools

pd_voca = pd.DataFrame(list(set(list(itertools.chain(*list(pd_blog_content["blog_jieba"]))))))
pd_voca.columns = ['word']

from imp import reload
import gensim
import utils.data_path as dp

reload(dp)
model = gensim.models.Word2Vec.load(dp.CSDNMODEL)

vector_len = len(model['print'])  # dimensionality of the trained word vectors
pd_voca['vector'] = [[]] * len(pd_voca)


def f(x):
    # return the word's vector, or a zero vector for out-of-vocabulary words
    if x in model:
        return list(model[x])
    else:
        return [0] * vector_len


pd_voca['vector'] = pd_voca['word'].apply(f)
pd_voca.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/blog_voca.csv', encoding="utf8")

"""
Read the blog vocabulary back from the csv file and
convert each vector from its string form back to a list.
"""
import pandas as pd

pd_voca = pd.read_csv("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/blog_voca.csv", index_col=0,
                      encoding="utf8")
pd_voca['vector'] = pd_voca['vector'].apply(eval)
pd_voca = pd_voca.set_index('word')

# read the some_blogcontent csv file with pandas
import numpy as np

pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv',
                              index_col=0, encoding="utf8")
pd_blog_content['content'] = [None] * len(pd_blog_content)
pd_blog_content["blog_jieba"] = pd_blog_content["blog_jieba"].apply(eval)

pd_blog_content["some_blog_jieba"] = [[]] * len(pd_blog_content)
# keep only the words that are in the vocabulary index, because the full set is too big to process at once
for index, row in pd_blog_content.iterrows():
    print("index", index)
    # temp = np.array([float(0)] * 100)  # each vector has 100 features
    temp = []
    for word in row['blog_jieba']:
        if word in pd_voca.index:
            temp.append(word)

    pd_blog_content.at[index, "some_blog_jieba"] = temp  # .at writes the value back to the DataFrame
    temp = None

# calculate the document vector as the sum of its word vectors
pd_blog_content["blog_jieba_vector"] = [[]] * len(pd_blog_content)
for index, row in pd_blog_content.iterrows():
    print("index", index)
    temp = list(np.sum(list(pd_voca.loc[row["some_blog_jieba"]]['vector']), axis=0))
    # print(temp)
    pd_blog_content.at[index, "blog_jieba_vector"] = temp

pd_blog_content.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector.csv',
                       encoding="utf8")

"""
The document vectors are now computed.
Next, normalise each vector and turn the training and test data into matrices.
"""
pd_train_data = pd.read_csv(dp.TrainCsv, encoding="utf8", index_col=0)
pd_train_data["blog_uid"] = pd_train_data["blog_uid"].apply(eval)

pd_dev_data = pd.read_csv(dp.DevCsv, encoding="utf8", index_col=0)
pd_dev_data["blog_uid"] = pd_dev_data["blog_uid"].apply(eval)

# load the saved blog-vector csv data
import utils.data_path as dp
import pandas as pd

pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector.csv',
                              encoding="utf8", index_col=0)
pd_blog_content['blog_jieba_vector'] = pd_blog_content['blog_jieba_vector'].apply(eval)

# min-max normalise each document vector
import numpy as np


def f(a):
    a = np.array(a)
    return (a - a.min()) / (a.max() - a.min())


pd_blog_content['blog_jieba_vector'] = pd_blog_content['blog_jieba_vector'].apply(f)

"""
The word segmentation result is no longer needed here; we only need the vector of each blog.
Build the embedding indices that point into the blog vector matrix.
"""


def g(x):
    # row indices (in pd_blog_content) of the blogs whose blog_id appears in x
    return list(pd_blog_content['blog_id'][pd_blog_content['blog_id'].isin(x)].index)


pd_train_data["embedding_index"] = pd_train_data["blog_uid"].apply(g)
pd_dev_data["embedding_index"] = pd_dev_data["blog_uid"].apply(g)
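
# What g() produces, on a toy example (illustrative): if pd_blog_content['blog_id'] held
# ['b1', 'b2', 'b3'] at rows 0-2 and a user's blog_uid list were ['b3', 'b1'], then
# g(['b3', 'b1']) returns [0, 2] -- the row positions of that user's blogs in
# pd_blog_content. training.py later feeds these index lists to its Embedding layer,
# whose weights are the normalised blog_jieba_vector rows in the same order.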

from imp import reload

reload(dp)
pd_train_data.to_csv(dp.TrainCsv, encoding="utf8")
pd_dev_data.to_csv(dp.DevCsv, encoding="utf8")

# sanity check: every normalised document vector should be 100 floats
count = 0
for vector in pd_blog_content['blog_jieba_vector']:
    count += 1
    print("vector---", count)
    for i in vector:
        if not isinstance(i, float) or len(vector) != 100:
            print("error")

pd_blog_content.to_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')

"""
Read the blog content back from the pickle and prepare the training and test data.
"""
pd_blog_content = pd.read_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')

pd_train_data = pd.read_csv(dp.TrainCsv, encoding="utf8", index_col=0)

pd_dev_data = pd.read_csv(dp.DevCsv, encoding="utf8", index_col=0)

# columns to convert back from their stringified-list form (one column per frame is excluded)
columns_train = list(pd_train_data.columns)
del columns_train[1]

columns_dev = list(pd_dev_data.columns)
del columns_dev[0]

import math
import numpy as np


def f(x):
    # turn a stringified list back into a list; NaN / empty cells become []
    if x and not (type(x) == float and math.isnan(x)):
        # print("x:", x)
        # print("type:", type(x))
        return eval(x)
    else:
        return []


for i in columns_train:
    print("column name:", i)
    pd_train_data[i] = pd_train_data[i].apply(f)

for i in columns_dev:
    print("column name:", i)
    pd_dev_data[i] = pd_dev_data[i].apply(f)

from imp import reload
import utils.data_path as dp

reload(dp)
pd_train_data.to_pickle(dp.TrainPKL)
pd_dev_data.to_pickle(dp.DevPKL)

pd_train_data_pkl = pd.read_pickle(dp.TrainPKL)
pd_dev_data_pkl = pd.read_pickle(dp.DevPKL)

pd_blog_content = pd.read_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')
--------------------------------------------------------------------------------
/train_word2vec.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/train_word2vec.py
--------------------------------------------------------------------------------
/training.py:
--------------------------------------------------------------------------------
# coding=utf-8
"""
Train a neural network on the prepared data.
"""
import pandas as pd
import numpy as np
import utils.data_path as dp
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dropout, Dense, Activation
from keras.utils import np_utils  # not yet sure how this is used

"""
Implement an LSTM network with keras; once the labels, inputs and vocabulary are fixed,
the network can be trained.
"""


def get_pickle_data(pd_pkl_name, columns_name):
    """
    Load a pickled DataFrame and keep only the requested columns
    (used to build the embedding matrix and the train/test frames).
    :param pd_pkl_name: path of the pickle file to load
    :param columns_name: columns to keep
    :return: the filtered DataFrame
    """
    # read the DataFrame from pd_pkl_name
    pd_pkl = pd.read_pickle(pd_pkl_name)

    non_selected = list(set(pd_pkl.columns) - set(columns_name))
    pd_pkl = pd_pkl.drop(non_selected, axis=1)  # drop the unselected columns, keeping only the ids and their vectors

    # print(pd_pkl)
    return pd_pkl


def lstm(trainData, trainMark, testData, embedding_dim, embedding_matrix, maxlen, output_len):
    # Pad the data so every sequence has the same length.
    # pad_sequences returns a numpy array: sequences longer than maxlen are truncated,
    # shorter ones are padded with 0 (index 0 also corresponds to a zero value, so this is acceptable).
    trainData = list(sequence.pad_sequences(trainData, maxlen=maxlen, dtype='float64'))
    testData = list(sequence.pad_sequences(testData, maxlen=maxlen, dtype='float64'))

    # Build the LSTM model. Sequential is a linear stack of layers: pass a list of
    # layers to the constructor, or add them one by one with .add().
    model = Sequential()
    # model.add(Dense(256, input_shape=(train_total_vova_len,)))  # fully connected input layer
    # Embedding input layer: maps high-dimensional one-hot indices to low-dimensional embeddings.
    # The first argument is the largest input index + 1, the second is the embedding dimension.
    model.add(Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix], mask_zero=False,
                        input_length=maxlen))
    # LSTM layer, the core of the model; its input dimension is inferred from the Embedding output
    model.add(LSTM(256))
    model.add(Dropout(0.5))  # randomly drop connections during updates to reduce overfitting
    model.add(Dense(output_len))  # fully connected output layer, one unit per label
    model.add(Activation('softmax'))  # softmax activation on the output
    # Compile the model: categorical_crossentropy (a.k.a. logloss) with the SGD optimizer.
    model.compile(loss='categorical_crossentropy', optimizer='sgd')

    # Run the model. The arrays must be padded first: without padding each array has a
    # different length and fitting fails.
    X = np.array(list(trainData))  # input data
    print("X:", X)
    Y = np.array(list(trainMark))  # labels
    print("Y:", Y)
    # batch_size: number of samples per gradient-descent batch
    # nb_epoch: number of training epochs (the data is iterated nb_epoch times)
    model.fit(X, Y, batch_size=200, nb_epoch=10)

    # Predict on the test data.
    A = np.array(list(testData))  # input data
    print("A:", A)
    classes = model.predict(A)  # predicted label scores
    return classes


if __name__ == '__main__':
    """
    Prepare the label data.
    """
    embedding_dim = 100
    maxlen = 15772

    pd_embedding = get_pickle_data(dp.SOME_BLOGCONTENT_VECTOR_NORMALIZE, columns_name=['blog_id', 'blog_jieba_vector'])

    # load the training and test data
    pd_train = get_pickle_data(dp.TrainPKL, columns_name=['labels', 'uid', 'embedding_index'])
    pd_test = get_pickle_data(dp.DevPKL, columns_name=['uid', 'embedding_index'])

    # load the full label space
    # attention: the file is encoded as 'gbk', not 'utf8'
    with open(dp.LabelSpace, encoding='gbk') as f:
        labels_name = [i.strip() for i in f]
    # print("labels_name---", labels_name)

    labels_len = len(labels_name)


    def f(x):
        # turn a '\001'-separated label string into a multi-hot vector over the label space
        a = [0] * labels_len
        x = x[0].split('\001')
        for i in x:
            a[labels_name.index(i)] = 1
        return a


    pd_train['labels'] = pd_train['labels'].apply(f)
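
    # Worked example of the encoding above (illustrative): if labels_name were
    # ['python', 'java', 'linux'] and a row's raw value were ['java\001linux'],
    # f() would return [0, 1, 1] -- a multi-hot vector over the whole label space,
    # which is what the softmax/categorical_crossentropy output is trained against.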

    # build the embedding matrix from the normalised document vectors
    embedding = np.array(list(pd_embedding['blog_jieba_vector'].apply(list)))

    print("pd_train:\n", pd_train)
    print("pd_test:\n", pd_test)

    # begin training (only the first 100 users are used here)
    dev_classes = lstm(list(pd_train['embedding_index'])[:100], list(pd_train['labels'])[:100],
                       list(pd_test['embedding_index'])[:100],
                       embedding_dim, embedding, maxlen, labels_len)

    print("dev classes:", dev_classes)

    # pick the top-3 labels for each test user with bottleneck
    import bottleneck as bl

    result = []
    labels_name = np.array(labels_name)
    for classes in dev_classes:
        result.append(labels_name[bl.argpartition(-classes, 3)[:3]])
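
    # bl.argpartition(-classes, 3) moves the indices of the three largest scores into
    # the first three positions (in no particular order), so each row appended to
    # `result` holds the top-3 predicted labels for one test user.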

    pd_result = pd.DataFrame(result)
    pd_result.to_csv(dp.ResultTxt, sep="\001", header=False, index=False, encoding='utf8')
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__init__.py
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/data_path.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__pycache__/data_path.cpython-35.pyc
--------------------------------------------------------------------------------
/utils/data_path.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/data_path.py
--------------------------------------------------------------------------------
/word2vec_test.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/word2vec_test.py
--------------------------------------------------------------------------------