├── .idea
│   ├── csdnSMP.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── workspace.xml
├── cut_lines.py
├── preprocess.py
├── readme.txt
├── seg_data.py
├── train_spyder.py
├── train_word2vec.py
├── training.py
├── utils
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   └── data_path.cpython-35.pyc
│   └── data_path.py
└── word2vec_test.py

/cut_lines.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/cut_lines.py
--------------------------------------------------------------------------------

/preprocess.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/preprocess.py
--------------------------------------------------------------------------------

/readme.txt:
--------------------------------------------------------------------------------
CSDN user profiling (CSDN用户画像)
--------------------------------------------------------------------------------

/seg_data.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/seg_data.py
--------------------------------------------------------------------------------

/train_spyder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 13 21:14:40 2017

@author: chauncy
"""
"""
The word vectors have already been trained -- how should they be used?
"""
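# --- hedged example: inspecting the trained word2vec model ------------------
# A minimal sketch of how the trained vectors could be used, assuming the
# model written by train_word2vec.py lives at dp.CSDNMODEL (the same path
# that is loaded again further below).  The query word "java" is only an
# illustrative example; the original script does not look it up.
import gensim
import utils.data_path as dp

w2v = gensim.models.Word2Vec.load(dp.CSDNMODEL)
if "java" in w2v.wv:
    print("vector for 'java', first 10 dims:", list(w2v.wv["java"][:10]))
    print("nearest neighbours of 'java':", w2v.wv.most_similar("java", topn=5))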
import itertools
import pandas as pd
import utils.data_path as dp

"""
Use simple rules to find each user's labels:
1. For every user in the test set, collect all blog posts linked to that user
   in the data set, then check how large the overlap with the training users is.
"""
pd_train = pd.read_csv(dp.TrainCsv, index_col=0, encoding="utf8")
pd_train["blog_uid"] = pd_train["blog_uid"].apply(eval)
train_blog_uids = list(itertools.chain(*list(pd_train["blog_uid"])))
print("train_blog_uids length---", len(train_blog_uids))
train_unique_blog_uids = list(set(train_blog_uids))
print("train_unique_blog_uids length---", len(train_unique_blog_uids))

# voca = {word: index for index, word in enumerate(train_unique_blog_uids)}

pd_dev = pd.read_csv(dp.DevCsv, index_col=0, encoding="utf8")
pd_dev["blog_uid"] = pd_dev["blog_uid"].apply(eval)
dev_blog_uids = list(itertools.chain(*list(pd_dev["blog_uid"])))
print("dev_blog_uids length---", len(dev_blog_uids))
dev_unique_blog_uids = list(set(dev_blog_uids))
print("dev_unique_blog_uids length---", len(dev_unique_blog_uids))

print("in train but not in dev blog length---", len(set(train_unique_blog_uids) - set(dev_unique_blog_uids)))
print("in train also in dev blog length---", len(set(train_unique_blog_uids) & set(dev_unique_blog_uids)))

# Collect every blog id we need (train plus dev) and write the list to a file.
with open("test.txt", 'w') as f:
    blog_id_list = []
    union = list(set(train_unique_blog_uids) | set(dev_unique_blog_uids))
    for i in union:
        blog_id_list.append(i)
        f.write(i + "\n")

# Read every blog id that occurs in the blog content file.
blog_source = open(dp.BlogContentTxt, encoding="utf8")
line = blog_source.readline()
total_blog_ids = []
while line:
    line = line.strip()
    blog_id_temp = line[:line.index("\001")]
    print("blog id:", blog_id_temp)
    total_blog_ids.append(blog_id_temp)
    line = blog_source.readline()

blog_id_index = []
for i in blog_id_list:
    print("blog id:", i)
    print("blog id:", total_blog_ids.index(i))
    blog_id_index.append(total_blog_ids.index(i))

blog_id_index_sort = sorted(blog_id_index)

target_file = open("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.txt", 'w')
# Copy the blog posts whose ids we need into a smaller file.
blog_source = open(dp.BlogContentTxt, encoding="utf8")
line = blog_source.readline()
count_temp = 0
count_stop = 0
while line:
    if count_temp < len(blog_id_index_sort) and blog_id_index_sort[count_temp] == count_stop:
        print("count_temp:", count_temp)
        line = line.strip()
        target_file.write(line + "\n")
        count_temp += 1
    line = blog_source.readline()
    count_stop += 1

if target_file:
    target_file.flush()
    target_file.close()

"""
There are 8,193,593 blog references in total, 23,507 of them unique.
"""
"""
train blog_uids length---8193593
train unique blog_uids length---23507
test blog_uids length---8948850
test unique blog_uids length---26703
in train also in dev blog length--- 4139
in train but not in dev blog length--- 19368
"""
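# --- hedged example: the blog-overlap rule from the header docstring --------
# The rule stated at the top of this script (compare how much a test user's
# blogs overlap with each training user's blogs) is not spelled out below,
# so here is one minimal way it could look.  The Jaccard measure and the
# names jaccard / dev_blogs / overlaps are assumptions for illustration,
# not part of the original pipeline; pd_train and pd_dev come from above.
def jaccard(a, b):
    """Overlap of two blog-id collections as |a & b| / |a | b|."""
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0

# for one dev user, find the most similar training user by blog overlap
dev_blogs = pd_dev["blog_uid"].iloc[0]
overlaps = pd_train["blog_uid"].apply(lambda blogs: jaccard(blogs, dev_blogs))
print("best matching train user:", overlaps.idxmax(), "overlap:", overlaps.max())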
"""
Now find, for each user, the other users attached to the blogs that user interacted with.
"""
import pandas as pd

pd_post = pd.read_csv(dp.PostTxt, encoding="utf8", sep="\001")
pd_post.columns = ['uid', 'blog_id', 'time']
temp = set(train_unique_blog_uids) - set(list(pd_post['blog_id']))  # empty set: every train blog id appears in Post

pd_brows = pd.read_csv(dp.BrowseTxt, encoding="utf8", sep="\001")
pd_brows.columns = ['uid', 'blog_id', 'time']

pd_comment = pd.read_csv(dp.CommentTxt, encoding="utf8", sep="\001")
pd_comment.columns = ['uid', 'blog_id', 'time']

pd_voteup = pd.read_csv(dp.VoteupTxt, encoding="utf8", sep="\001")
pd_voteup.columns = ['uid', 'blog_id', 'time']

pd_votedown = pd.read_csv(dp.VotedownTxt, encoding="utf8", sep="\001")
pd_votedown.columns = ['uid', 'blog_id', 'time']

pd_favorite = pd.read_csv(dp.FavoriteTxt, encoding="utf8", sep="\001")
pd_favorite.columns = ['uid', 'blog_id', 'time']

"""
Read the selected blog content (posts that occur in both train and test data),
segment it with jieba and write it to the some_blogcontent csv.
"""
pd_blog_content = pd.read_csv("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.txt",
                              encoding="utf8", sep="\001")
pd_blog_content.columns = ['blog_id', 'title', 'content']
pd_blog_content['blog_jieba'] = [None] * len(pd_blog_content)

import jieba

for index, row in pd_blog_content.iterrows():
    print("index", index)
    # assigning to the row returned by iterrows() does not write back to the
    # DataFrame, so store the tokens through .at instead
    pd_blog_content.at[index, 'blog_jieba'] = list(jieba.cut(row['title'] + row['content']))

pd_blog_content.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv', encoding="utf8")

# read the some_blogcontent csv file back with pandas
pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv',
                              index_col=0, encoding="utf8")
pd_blog_content['content'] = [None] * len(pd_blog_content)
pd_blog_content["blog_jieba"] = pd_blog_content["blog_jieba"].apply(eval)

"""
Build the some_blogcontent vocabulary (the data is too big to hold in memory all at once),
look up the word vector for every word, and write the result to a csv file.
"""
import itertools

pd_voca = pd.DataFrame(list(set(list(itertools.chain(*list(pd_blog_content["blog_jieba"]))))))
pd_voca.columns = ['word']

from imp import reload
import gensim
import utils.data_path as dp

reload(dp)
model = gensim.models.Word2Vec.load(dp.CSDNMODEL)

vector_len = len(model.wv['print'])  # go through .wv; indexing the model object directly is deprecated
pd_voca['vector'] = [[]] * (len(pd_voca))


def f(x):
    # words missing from the word2vec vocabulary fall back to an all-zero vector
    if x in model.wv:
        return list(model.wv[x])
    else:
        return [0] * vector_len


pd_voca['vector'] = pd_voca['word'].apply(f)
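# --- hedged example: vocabulary coverage of the word2vec model --------------
# Not part of the original script.  Words missing from the model fall back to
# an all-zero vector in f() above, so it is worth checking how many of the
# segmented words that concerns; pd_voca and model are the objects built above.
oov_count = int(pd_voca['word'].apply(lambda w: w not in model.wv).sum())
print("vocabulary size:", len(pd_voca), "--- words without a vector:", oov_count)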
pd_voca.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/blog_voca.csv', encoding="utf8")

"""
Read the blog vocabulary back from the csv file and
convert each vector from its string form back into a list.
"""
import pandas as pd

pd_voca = pd.read_csv("/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/blog_voca.csv", index_col=0,
                      encoding="utf8")
pd_voca['vector'] = pd_voca['vector'].apply(eval)
pd_voca = pd_voca.set_index('word')

# read the some_blogcontent csv file back with pandas
import numpy as np

pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent.csv',
                              index_col=0, encoding="utf8")
pd_blog_content['content'] = [None] * len(pd_blog_content)
pd_blog_content["blog_jieba"] = pd_blog_content["blog_jieba"].apply(eval)

pd_blog_content["some_blog_jieba"] = [[]] * len(pd_blog_content)
# keep only the words that appear in the vocabulary index, because the full
# token lists are too big to process in one go
for index, row in pd_blog_content.iterrows():
    print("index", index)
    # temp = np.array([float(0)] * 100)  # each vector has 100 features
    temp = []
    for word in row['blog_jieba']:
        if word in pd_voca.index:
            temp.append(word)

    # write back through .at; assigning to the iterrows() row has no effect
    pd_blog_content.at[index, "some_blog_jieba"] = temp
    temp = None

# compute the document vector as the sum of its word vectors
pd_blog_content["blog_jieba_vector"] = [[]] * len(pd_blog_content)
for index, row in pd_blog_content.iterrows():
    print("index", index)
    temp = list(np.sum(list(pd_voca.loc[row["some_blog_jieba"]]['vector']), axis=0))
    # print(temp)
    pd_blog_content.at[index, "blog_jieba_vector"] = temp

pd_blog_content.to_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector.csv',
                       encoding="utf8")
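# --- hedged variant: mean pooling instead of summation -----------------------
# The blog vector computed above is the raw sum of its word vectors, so long
# posts get larger magnitudes.  Averaging the word vectors is a common
# length-independent alternative; this is only an illustration, not what the
# rest of the pipeline uses.  Shown for a single row using the objects above.
row0 = pd_blog_content.iloc[0]
if row0["some_blog_jieba"]:
    mean_vec = np.mean(list(pd_voca.loc[row0["some_blog_jieba"]]['vector']), axis=0)
    print("mean-pooled vector, first 5 dims:", list(mean_vec[:5]))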
"""
The document vectors are now computed.
Normalize every vector and turn the training and test data into matrices.
"""
pd_train_data = pd.read_csv(dp.TrainCsv, encoding="utf8", index_col=0)
pd_train_data["blog_uid"] = pd_train_data["blog_uid"].apply(eval)

pd_dev_data = pd.read_csv(dp.DevCsv, encoding="utf8", index_col=0)
pd_dev_data["blog_uid"] = pd_dev_data["blog_uid"].apply(eval)

# load the document vectors computed above
import utils.data_path as dp
import pandas as pd

pd_blog_content = pd.read_csv('/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector.csv',
                              encoding="utf8", index_col=0)
pd_blog_content['blog_jieba_vector'] = pd_blog_content['blog_jieba_vector'].apply(eval)

# min-max normalize each document vector
import numpy as np


def f(a):
    a = np.array(a)
    return (a - a.min()) / (a.max() - a.min())


pd_blog_content['blog_jieba_vector'] = pd_blog_content['blog_jieba_vector'].apply(f)

"""
The segmentation result is no longer needed from here on; we only need the
vector that belongs to each blog, i.e. the embedding matrix of the blog vocabulary.
"""


def g(x):
    return list(pd_blog_content['blog_id'][pd_blog_content['blog_id'].isin(x)].index)


pd_train_data["embedding_index"] = pd_train_data["blog_uid"].apply(g)
pd_dev_data["embedding_index"] = pd_dev_data["blog_uid"].apply(g)

from imp import reload

reload(dp)
pd_train_data.to_csv(dp.TrainCsv, encoding="utf8")
pd_dev_data.to_csv(dp.DevCsv, encoding="utf8")

count = 0
for vector in pd_blog_content['blog_jieba_vector']:
    count += 1
    print("vector---", count)
    for i in vector:
        if not isinstance(i, float) or len(vector) != 100:
            print("error")

pd_blog_content.to_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')

"""
Read the blog content back from the pickle and prepare the train and test data for training.
"""
pd_blog_content = pd.read_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')

pd_train_data = pd.read_csv(dp.TrainCsv, encoding="utf8", index_col=0)

pd_dev_data = pd.read_csv(dp.DevCsv, encoding="utf8", index_col=0)

columns_train = list(pd_train_data.columns)
del columns_train[1]

columns_dev = list(pd_dev_data.columns)
del columns_dev[0]

import math
import numpy as np


def f(x):
    # list-valued columns come back from csv as strings (or NaN for empty cells)
    if x and not (type(x) == float and math.isnan(x)):
        # print("x:", x)
        # print("type:", type(x))
        return eval(x)
    else:
        return []


for i in columns_train:
    print("column name:", i)
    pd_train_data[i] = pd_train_data[i].apply(f)

for i in columns_dev:
    print("column name:", i)
    pd_dev_data[i] = pd_dev_data[i].apply(f)

from imp import reload
import utils.data_path as dp

reload(dp)
pd_train_data.to_pickle(dp.TrainPKL)
pd_dev_data.to_pickle(dp.DevPKL)

pd_train_data_pkl = pd.read_pickle(dp.TrainPKL)
pd_dev_data_pkl = pd.read_pickle(dp.DevPKL)

pd_blog_content = pd.read_pickle(
    '/home/cike/PycharmProjects/pythondata/csdnSMP/Train_DATA/some_blogcontent_vector_normalize.pkl')
--------------------------------------------------------------------------------

/train_word2vec.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/train_word2vec.py
--------------------------------------------------------------------------------

/training.py:
--------------------------------------------------------------------------------
# coding=utf-8
"""
Train on the prepared data with a neural network.
"""
import pandas as pd
import numpy as np
import utils.data_path as dp
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dropout, Dense, Activation
from keras.utils import np_utils  # not entirely clear yet how this is used

"""
Implement an LSTM network with Keras.  Once the labels, the inputs and the
vocabulary are fixed, the network can be trained.
"""


def get_pickle_data(pd_pkl_name, columns_name):
    """
    Produce the data used to build the embedding matrix.
    :param pd_pkl_name: pickle file to load
    :param columns_name: columns to keep
    :return:
    """
    # load the pickle file
    pd_pkl = pd.read_pickle(pd_pkl_name)

    non_selected = list(set(pd_pkl.columns) - set(columns_name))
    pd_pkl = pd_pkl.drop(non_selected, axis=1)  # drop the unselected columns; only the id and its vector remain

    # print(pd_pkl)
    return pd_pkl


def lstm(trainData, trainMark, testData, embedding_dim, embedding_matrix, maxlen, output_len):
    # pad the data so that every sequence has the same length
    trainData = list(sequence.pad_sequences(trainData, maxlen=maxlen,
                                            dtype='float64'))  # pad_sequences returns a numpy array: longer sequences are truncated, shorter ones padded with 0; index 0 also maps to an all-zero vector, so this is safe
    testData = list(sequence.pad_sequences(testData, maxlen=maxlen,
                                           dtype='float64'))  # same padding for the test data

    # build the LSTM model
    model = Sequential()  # a linear stack of layers; layers can be passed as a list or added one by one with .add()
    # model.add(Dense(256, input_shape=(train_total_vova_len,)))  # fully connected input layer (unused)
    model.add(Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix], mask_zero=False,
                        input_length=maxlen))  # input layer: maps sparse indices to dense embeddings; first argument is the largest index + 1, second is the embedding dimension
    # LSTM layer, the core of the model
    model.add(LSTM(256))  # 256 hidden units; the input dimension is inferred from the Embedding output
    model.add(Dropout(0.5))  # randomly drop connections during updates to reduce overfitting
    model.add(Dense(output_len))  # fully connected output layer, one unit per label; the input dimension comes from the LSTM layer
    model.add(Activation('softmax'))  # softmax activation on the output
    # compile the model: categorical_crossentropy is the log loss; plain SGD is used as the optimizer here
    model.compile(loss='categorical_crossentropy', optimizer='sgd')

    # run the model; the earlier shape error happened because the sequences were not padded and the arrays had different lengths
    X = np.array(list(trainData))  # inputs
    print("X:", X)
    Y = np.array(list(trainMark))  # targets
    print("Y:", Y)
    # batch_size: number of samples per gradient update
    # nb_epoch: number of passes over the training data (named epochs in newer Keras)
    model.fit(X, Y, batch_size=200, nb_epoch=10)  # X and Y are numpy arrays (or lists of arrays for multi-input models)

    # predict
    A = np.array(list(testData))  # inputs
    print("A:", A)
    classes = model.predict(A)  # predicted label scores
    return classes
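# --- hedged alternative: multi-label output layer ----------------------------
# Each user carries several labels at once (the helper f() below builds a
# multi-hot target vector), while softmax + categorical_crossentropy in
# lstm() above models exactly one label per sample.  A common multi-label
# setup is shown here only as a sketch, not as the configuration this
# repository actually trains with:
#
#   model.add(Dense(output_len))
#   model.add(Activation('sigmoid'))              # independent per-label scores
#   model.compile(loss='binary_crossentropy', optimizer='sgd')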
if __name__ == '__main__':
    """
    Prepare the label data.
    """
    embedding_dim = 100
    maxlen = 15772

    pd_embedding = get_pickle_data(dp.SOME_BLOGCONTENT_VECTOR_NORMALIZE, columns_name=['blog_id', 'blog_jieba_vector'])

    # load the training and test data
    pd_train = get_pickle_data(dp.TrainPKL, columns_name=['labels', 'uid', 'embedding_index'])
    pd_test = get_pickle_data(dp.DevPKL, columns_name=['uid', 'embedding_index'])

    # load the full label set
    # attention: the file is encoded as 'gbk', not 'utf8'
    with open(dp.LabelSpace, encoding='gbk') as f:
        labels_name = [i.strip() for i in f]
    # print("labels_name---", labels_name)

    labels_len = len(labels_name)


    def f(x):
        # turn the '\001'-separated label string into a multi-hot vector
        a = [0] * labels_len
        x = x[0].split('\001')
        for i in x:
            a[labels_name.index(i)] = 1
        return a


    pd_train['labels'] = pd_train['labels'].apply(f)

    # build the embedding matrix
    embedding = np.array(list(pd_embedding['blog_jieba_vector'].apply(list)))

    print("pd_train:\n", pd_train)
    print("pd_test:\n", pd_test)

    # begin training (only the first 100 samples for now)
    dev_classes = lstm(list(pd_train['embedding_index'])[:100], list(pd_train['labels'])[:100],
                       list(pd_test['embedding_index'])[:100],
                       embedding_dim, embedding, maxlen, labels_len)

    print("dev classes:", dev_classes)

    # pick the top-3 labels per user with bottleneck
    import bottleneck as bl

    result = []
    labels_name = np.array(labels_name)
    for classes in dev_classes:
        result.append(labels_name[bl.argpartition(-classes, 3)[:3]])

    pd_result = pd.DataFrame(result)
    pd_result.to_csv(dp.ResultTxt, sep="\001", header=False, index=False, encoding='utf8')
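    # --- hedged note: ordering of the top-3 labels ---------------------------
    # bl.argpartition(-classes, 3)[:3] above returns the three highest-scoring
    # labels, but in no particular order.  If each user's three labels should
    # be written in descending score order, a plain argsort gives that
    # ordering; this is a variant for illustration, not what the script does.
    result_sorted = [labels_name[np.argsort(-classes)[:3]] for classes in dev_classes]
    print("top-3 labels in descending score order (first user):", list(result_sorted[0]))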
--------------------------------------------------------------------------------

/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__init__.py
--------------------------------------------------------------------------------

/utils/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------

/utils/__pycache__/data_path.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/__pycache__/data_path.cpython-35.pyc
--------------------------------------------------------------------------------

/utils/data_path.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/utils/data_path.py
--------------------------------------------------------------------------------

/word2vec_test.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/changxizhu/csdnSMP/ce9bec9bdce0c5949e0d500ad33539cdb5e4160a/word2vec_test.py
--------------------------------------------------------------------------------