├── data_helper.py
├── Deep_FM
│   ├── README.md
│   ├── utilities.py
│   ├── step1.py
│   ├── step2.py
│   └── DeepFM.py
├── submit
│   └── clean_message.py
├── Deep-Semantic-Similarity-Model
│   ├── README.md
│   ├── LICENSE
│   └── deep_semantic_similarity_keras.py
└── zhaopin_round1_train_20190716
    └── clean_message.py

/data_helper.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Deep_FM/README.md:
--------------------------------------------------------------------------------
According to the paper, this model can reach about 80% accuracy.

So for this competition I think this model can serve as our backbone.
--------------------------------------------------------------------------------
/submit/clean_message.py:
--------------------------------------------------------------------------------
import pandas as pd

submit_message: pd.DataFrame = pd.read_csv("zhaopin_round1_submit_20190716 (1).csv")
print(submit_message.count())
# drop duplicate user_id rows and compare the counts
df3 = submit_message.drop_duplicates(['user_id'])
print(df3.count())
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/README.md:
--------------------------------------------------------------------------------
# Deep Semantic Similarity Model
My Keras implementation of the Deep Semantic Similarity Model (DSSM)/Convolutional Latent Semantic Model (CLSM) described [here](http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf). As search data sets are generally proprietary, you will have to provide your own data to use with the code.

## Additional References
1. http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf - slides giving a high-level overview of the DSSM and how it can be used for information retrieval.
2. http://research.microsoft.com/en-us/projects/dssm/ - Microsoft Research's summary of the DSSM (includes many more references).
--------------------------------------------------------------------------------
/zhaopin_round1_train_20190716/clean_message.py:
--------------------------------------------------------------------------------
# f = open("table1_user", "r", encoding='utf-8')
# fil = []
# for i in f.readlines():
#     if i not in fil:
#         fil.append(i)
# print('number of users', len(fil))
#
# jd = open('table2_jd', "r", encoding='utf-8')
# j = []
# print('number of job postings (before deduplication)', len(jd.readlines()))
#
# action = open('table3_action', encoding='utf-8')
# print('number of user-job matches', len(action.readlines()))

import pandas as pd

table_user = pd.read_csv('table1_user', sep="\t")
print(table_user.count())

table_jd = pd.read_csv('table2_jd', sep="\t")
print(table_jd.count())

table_action = pd.read_csv('table3_action', sep="\t")
print(table_action.count())
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Michael A. Alcorn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Deep_FM/utilities.py:
--------------------------------------------------------------------------------
# coding:utf-8
import pickle

import numpy as np
import pandas as pd


def one_hot_representation(sample, fields_dict, array_length):
    """
    One-hot representation for a single sample.
    :param fields_dict: maps each field's values to one-hot array indices
    :param sample: sample data, type of pd.Series
    :param array_length: length of the one-hot representation
    :return: one-hot representation (np.array) and the indices of its non-zero entries
    """
    array = np.zeros([array_length])
    idx = []
    for field in fields_dict:
        # get index of array
        if field == 'hour':
            field_value = int(str(sample[field])[-2:])
        else:
            field_value = sample[field]
        ind = fields_dict[field][field_value]
        array[ind] = 1
        idx.append(ind)
    # keep only the first 21 field indices (field_cnt); for training samples this drops the trailing click index
    return array, idx[:21]


if __name__ == '__main__':
    fields_train = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                    'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                    'app_id', 'app_category', 'device_model', 'device_type', 'device_id',
                    'device_conn_type', 'click']

    fields_test = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                   'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                   'app_id', 'device_id', 'app_category', 'device_model', 'device_type',
                   'device_conn_type']

    train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                        chunksize=100)
    test = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/test.csv',
                       chunksize=100)
    # loading dicts
    fields_train_dict = {}
    for field in fields_train:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_train_dict[field] = pickle.load(f)

    fields_test_dict = {}
    for field in fields_test:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_test_dict[field] = pickle.load(f)

    train_array_length = max(fields_train_dict['click'].values()) + 1
    test_array_length = train_array_length - 2
    # initialize the model

    for data in test:
        # data['click'] = np.zeros(100)
        # data.to_csv('a.csv', mode='a')
        sample = data.iloc[3, :]
        print(one_hot_representation(sample, fields_test_dict, test_array_length))

        break
--------------------------------------------------------------------------------
/Deep_FM/step1.py:
--------------------------------------------------------------------------------
import pickle

import pandas as pd

# for the site_id, site_domain, app_id, app_domain, device_model,
# device_ip, device_id, C14, C17, C19, C21 fields, one-hot encode by frequency;
# for the other fields, one-hot encode directly

# one-hot encoding directly
click = set()
hour = set()
C1 = set()
banner_pos = set()
site_category = set()
app_category = set()
device_type = set()
device_conn_type = set()
C15 = set()
C16 = set()
C18 = set()
C20 = set()

hour = set(range(24))

# one-hot encoding by frequency bucket
C14 = []
C17 = []
C19 = []
C21 = []
site_id = []
site_domain = []
app_id = []
app_domain = []
device_model = []
device_ip = []
device_id = []

train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                    chunksize=10000)

for data in train:
    click_v = set(data['click'].values)
    click = click | click_v

    C1_v = set(data['C1'].values)
    C1 = C1 | C1_v

    C15_v = set(data['C15'].values)
    C15 = C15 | C15_v

    C16_v = set(data['C16'].values)
    C16 = C16 | C16_v

    C18_v = set(data['C18'].values)
    C18 = C18 | C18_v

    C20_v = set(data['C20'].values)
    C20 = C20 | C20_v

    banner_pos_v = set(data['banner_pos'].values)
    banner_pos = banner_pos | banner_pos_v

    site_category_v = set(data['site_category'].values)
    site_category = site_category | site_category_v

    app_category_v = set(data['app_category'].values)
    app_category = app_category | app_category_v

    device_type_v = set(data['device_type'].values)
    device_type = device_type | device_type_v

    device_conn_type_v = set(data['device_conn_type'].values)
    device_conn_type = device_conn_type | device_conn_type_v

# save the value sets
with open('sets/click.pkl', 'wb') as f:
    pickle.dump(click, f)

with open('sets/hour.pkl', 'wb') as f:
    pickle.dump(hour, f)

with open('sets/C1.pkl', 'wb') as f:
    pickle.dump(C1, f)

with open('sets/C15.pkl', 'wb') as f:
    pickle.dump(C15, f)

with open('sets/C16.pkl', 'wb') as f:
    pickle.dump(C16, f)

with open('sets/C18.pkl', 'wb') as f:
    pickle.dump(C18, f)

with open('sets/C20.pkl', 'wb') as f:
    pickle.dump(C20, f)

with open('sets/banner_pos.pkl', 'wb') as f:
    pickle.dump(banner_pos, f)

with open('sets/site_category.pkl', 'wb') as f:
    pickle.dump(site_category, f)

with open('sets/app_category.pkl', 'wb') as f:
    pickle.dump(app_category, f)

with open('sets/device_type.pkl', 'wb') as f:
    pickle.dump(device_type, f)

with open('sets/device_conn_type.pkl', 'wb') as f:
    pickle.dump(device_conn_type, f)
--------------------------------------------------------------------------------
/Deep_FM/step2.py:
--------------------------------------------------------------------------------
import pickle

direct_encoding_fields = ['hour', 'C1', 'C15', 'C16', 'C18', 'C20',
                          'banner_pos', 'site_category', 'app_category',
                          'device_type', 'device_conn_type']

frequency_encoding_fields = ['C14', 'C17', 'C19', 'C21',
                             'site_id', 'site_domain', 'app_id', 'app_domain',
                             'device_model', 'device_id']

# load direct encoding fields
with open('sets/click.pkl', 'rb') as f:
    click = pickle.load(f)

with open('sets/hour.pkl', 'rb') as f:
    hour = pickle.load(f)

with open('sets/C1.pkl', 'rb') as f:
    C1 = pickle.load(f)

with open('sets/C15.pkl', 'rb') as f:
    C15 = pickle.load(f)

with open('sets/C16.pkl', 'rb') as f:
    C16 = pickle.load(f)

with open('sets/C18.pkl', 'rb') as f:
    C18 = pickle.load(f)

with open('sets/C20.pkl', 'rb') as f:
    C20 = pickle.load(f)

with open('sets/banner_pos.pkl', 'rb') as f:
    banner_pos = pickle.load(f)

with open('sets/site_category.pkl', 'rb') as f:
    site_category = pickle.load(f)

with open('sets/app_category.pkl', 'rb') as f:
    app_category = pickle.load(f)

with open('sets/device_type.pkl', 'rb') as f:
    device_type = pickle.load(f)

with open('sets/device_conn_type.pkl', 'rb') as f:
    device_conn_type = pickle.load(f)

# loading frequency encoding fields
# field2count dictionaries
with open('field2count/C14.pkl', 'rb') as f:
    C14 = pickle.load(f)

with open('field2count/C17.pkl', 'rb') as f:
    C17 = pickle.load(f)

with open('field2count/C19.pkl', 'rb') as f:
    C19 = pickle.load(f)

with open('field2count/C21.pkl', 'rb') as f:
    C21 = pickle.load(f)

with open('field2count/site_id.pkl', 'rb') as f:
    site_id = pickle.load(f)

with open('field2count/site_domain.pkl', 'rb') as f:
    site_domain = pickle.load(f)

with open('field2count/app_id.pkl', 'rb') as f:
    app_id = pickle.load(f)

with open('field2count/app_domain.pkl', 'rb') as f:
    app_domain = pickle.load(f)

with open('field2count/device_model.pkl', 'rb') as f:
    device_model = pickle.load(f)

with open('field2count/device_id.pkl', 'rb') as f:
    device_id = pickle.load(f)

ind = 0
for field in direct_encoding_fields:
    # value to one-hot-encoding index dict
    field_dict = {}
    field_sets = eval(field)
    for value in list(field_sets):
        field_dict[value] = ind
        ind += 1
    with open('dicts/' + field + '.pkl', 'wb') as f:
        pickle.dump(field_dict, f)

for field in frequency_encoding_fields:
    # value to one-hot-encoding index dict
    field_dict = {}
    field2count = eval(field)
    # values seen fewer than 10 times share a single "rare" index;
    # reserve it up front so it cannot collide with the next field's indices
    index_rare = ind
    ind += 1
    for k, count in field2count.items():
        if count < 10:
            field_dict[k] = index_rare
        else:
            field_dict[k] = ind
            ind += 1

    with open('dicts/' + field + '.pkl', 'wb') as f:
        pickle.dump(field_dict, f)

# click is encoded last, so its two indicator columns sit at the end of the one-hot array
field_dict = {}
field_sets = click
for value in list(field_sets):
    field_dict[value] = ind
    ind += 1
with open('dicts/' + 'click' + '.pkl', 'wb') as f:
    pickle.dump(field_dict, f)
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/deep_semantic_similarity_keras.py:
--------------------------------------------------------------------------------
# Michael A. Alcorn (malcorn@redhat.com)
# An implementation of the Deep Semantic Similarity Model (DSSM) found in [1].
# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model
#     with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.
#     http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf
# [2] http://research.microsoft.com/en-us/projects/dssm/
# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf

import numpy as np
from keras import backend
from keras.layers import Activation, Input
from keras.layers.convolutional import Convolution1D
from keras.layers.core import Dense, Lambda, Reshape
from keras.layers.merge import concatenate, dot
from keras.models import Model

# length of the letter n-grams
LETTER_GRAM_SIZE = 3  # See section 3.2.
# sliding window length
WINDOW_SIZE = 3  # See section 3.2.
TOTAL_LETTER_GRAMS = int(3 * 1e4)  # Determined from data. See section 3.2.
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS  # See equation (1).
K = 300  # Dimensionality of the max-pooling layer. See section 3.4.
L = 128  # Dimensionality of latent semantic space. See section 3.5.
J = 4  # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1  # We only consider one time step for convolutions.

# Input tensors holding the query, positive (clicked) document, and negative (unclicked) documents.
# The first dimension is None because the queries and documents can vary in length.
query = Input(shape=(None, WORD_DEPTH))
pos_doc = Input(shape=(None, WORD_DEPTH))
neg_docs = [Input(shape=(None, WORD_DEPTH)) for j in range(J)]

# Query model. The paper uses separate neural nets for queries and documents (see section 5.2).

# In this step, we transform each word vector with WORD_DEPTH dimensions into its
# convolved representation with K dimensions. K is the number of kernels/filters
# being used in the operation. Essentially, the operation is taking the dot product
# of a single weight matrix (W_c) with each of the word vectors (l_t) from the
# query matrix (l_Q), adding a bias vector (b_c), and then applying the tanh activation.
# That is, h_Q = tanh(W_c • l_Q + b_c). With that being said, that's not actually
# how the operation is being calculated here. To tie the weights of the weight
# matrix (W_c) together, we have to use a one-dimensional convolutional layer.
# Further, we have to transpose our query matrix (l_Q) so that time is the first
# dimension rather than the second (as described in the paper). That is, l_Q[0, :]
# represents our first word vector rather than l_Q[:, 0]. We can think of the weight
# matrix (W_c) as being similarly transposed such that each kernel is a column
# of W_c. Therefore, h_Q = tanh(l_Q • W_c + b_c) with l_Q, W_c, and b_c being
# the transposes of the matrices described in the paper. Note: the paper does not
# include bias units.
query_conv = Convolution1D(K, FILTER_LENGTH, padding="same", input_shape=(None, WORD_DEPTH), activation="tanh")(
    query)  # See equation (2).

# Next, we apply a max-pooling layer to the convolved query matrix. Keras provides
# its own max-pooling layers, but they cannot handle variable length input (as
# far as I can tell). As a result, I define my own max-pooling layer here. In the
# paper, the operation selects the maximum value for each row of h_Q, but, because
# we're using the transpose, we're selecting the maximum value for each column.
query_max = Lambda(lambda x: backend.max(x, axis=1), output_shape=(K,))(query_conv)  # See section 3.4.

# In this step, we generate the semantic vector representation of the query. This
# is a standard neural network dense layer, i.e., y = tanh(W_s • v + b_s). Again,
# the paper does not include bias units.
query_sem = Dense(L, activation="tanh", input_dim=K)(query_max)  # See section 3.5.

# The document equivalent of the above query model.
doc_conv = Convolution1D(K, FILTER_LENGTH, padding="same", input_shape=(None, WORD_DEPTH), activation="tanh")
# max-pooling and dense layers shared by the positive and negative documents
doc_max = Lambda(lambda x: backend.max(x, axis=1), output_shape=(K,))
doc_sem = Dense(L, activation="tanh", input_dim=K)

pos_doc_conv = doc_conv(pos_doc)
neg_doc_convs = [doc_conv(neg_doc) for neg_doc in neg_docs]

pos_doc_max = doc_max(pos_doc_conv)
neg_doc_maxes = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs]

pos_doc_sem = doc_sem(pos_doc_max)
neg_doc_sems = [doc_sem(neg_doc_max) for neg_doc_max in neg_doc_maxes]

# This layer calculates the cosine similarity between the semantic representations of
# a query and a document.
R_Q_D_p = dot([query_sem, pos_doc_sem], axes=1, normalize=True)  # See equation (4).
R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes=1, normalize=True) for neg_doc_sem in neg_doc_sems]  # See equation (4).

concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns)
concat_Rs = Reshape((J + 1, 1))(concat_Rs)

# In this step, we multiply each R(Q, D) value by gamma. In the paper, gamma is
# described as a smoothing factor for the softmax function, and it's set empirically
# on a held-out data set. We're going to learn gamma's value by pretending it's
# a single 1 x 1 convolution kernel.
weight = np.array([1]).reshape(1, 1)
with_gamma = Convolution1D(1, 1, padding="same", input_shape=(J + 1, 1), activation="linear", use_bias=False,
                           weights=[weight])(concat_Rs)  # See equation (5).
with_gamma = Reshape((J + 1,))(with_gamma)

# Finally, we use the softmax function to calculate P(D+|Q).
prob = Activation("softmax")(with_gamma)  # See equation (5).

# We now have everything we need to define our model.
model = Model(inputs=[query, pos_doc] + neg_docs, outputs=prob)
model.compile(optimizer="adadelta", loss="categorical_crossentropy")

# Build a random data set.
sample_size = 10
l_Qs = []
pos_l_Ds = []

# Variable length input must be handled differently from padded input.
BATCH = True

(query_len, doc_len) = (5, 100)

for i in range(sample_size):

    if BATCH:
        l_Q = np.random.rand(query_len, WORD_DEPTH)
        l_Qs.append(l_Q)

        l_D = np.random.rand(doc_len, WORD_DEPTH)
        pos_l_Ds.append(l_D)
    else:
        query_len = np.random.randint(1, 10)
        l_Q = np.random.rand(1, query_len, WORD_DEPTH)
        l_Qs.append(l_Q)

        doc_len = np.random.randint(50, 500)
        l_D = np.random.rand(1, doc_len, WORD_DEPTH)
        pos_l_Ds.append(l_D)

# For each sample, draw J negative documents from the other samples' positive documents.
neg_l_Ds = [[] for j in range(J)]
for i in range(sample_size):
    possibilities = list(range(sample_size))
    possibilities.remove(i)
    negatives = np.random.choice(possibilities, J, replace=False)
    for j in range(J):
        negative = negatives[j]
        neg_l_Ds[j].append(pos_l_Ds[negative])

if BATCH:
    y = np.zeros((sample_size, J + 1))
    y[:, 0] = 1

    l_Qs = np.array(l_Qs)
    pos_l_Ds = np.array(pos_l_Ds)
    for j in range(J):
        neg_l_Ds[j] = np.array(neg_l_Ds[j])

    history = model.fit([l_Qs, pos_l_Ds] + [neg_l_Ds[j] for j in range(J)], y, epochs=1, verbose=0)
else:
    y = np.zeros(J + 1).reshape(1, J + 1)
    y[0, 0] = 1

    for i in range(sample_size):
        history = model.fit([l_Qs[i], pos_l_Ds[i]] + [neg_l_Ds[j][i] for j in range(J)], y, epochs=1, verbose=0)

# Here, I walk through how to define a function for calculating output from the
# computational graph. Let's define a function that calculates R(Q, D+) for a given
# query and clicked document. The function depends on two inputs, query and pos_doc.
# That is, if you start at the point in the graph where R(Q, D+) is calculated
# and then work backwards as far as possible, you'll end up at two different starting
# points: query and pos_doc. As a result, we supply those inputs in a list to the
# function. This particular function only calculates a single output, but multiple
# outputs are possible (see the next example).
get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p])
if BATCH:
    get_R_Q_D_p([l_Qs, pos_l_Ds])
else:
    get_R_Q_D_p([l_Qs[0], pos_l_Ds[0]])

# A slightly more complex function. Notice that both neg_docs and the output are
# lists.
get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns)
if BATCH:
    get_R_Q_D_ns([l_Qs] + [neg_l_Ds[j] for j in range(J)])
else:
    get_R_Q_D_ns([l_Qs[0]] + [neg_l_Ds[j][0] for j in range(J)])
--------------------------------------------------------------------------------
/Deep_FM/DeepFM.py:
--------------------------------------------------------------------------------
# coding:utf-8
import os
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
import tensorflow as tf
from Deep_FM.utilities import *
import pandas as pd
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class DeepFM(object):
    """
    Deep Factorization Machine (DeepFM) with FTRL optimization
    """

    def __init__(self, config):
        """
        :param config: configuration of hyperparameters, type of dict
        """
        # number of latent factors
        self.k = config['k']
        self.lr = config['lr']
        # number of samples fed to the model per step
        self.batch_size = config['batch_size']
        self.reg_l1 = config['reg_l1']
        self.reg_l2 = config['reg_l2']
        # num of features
        self.p = feature_length
        # num of fields
        self.field_cnt = field_cnt
        self.global_step = tf.Variable(0, trainable=False)

    def add_placeholders(self):
        self.X = tf.placeholder('float32', [None, self.p])
        self.y = tf.placeholder('int64', [None, ])
        # indices of non-zero features
        self.feature_inds = tf.placeholder('int64', [None, self.field_cnt])
        self.keep_prob = tf.placeholder('float32')

    def inference(self):
        """
        forward propagation
        :return: labels for each sample
        """
        v = tf.Variable(tf.truncated_normal(shape=[self.p, self.k], mean=0, stddev=0.01), dtype='float32')

        # Factorization Machine
        with tf.variable_scope('FM'):
            b = tf.get_variable('bias', shape=[2],
                                initializer=tf.zeros_initializer())
            w1 = tf.get_variable('w1', shape=[self.p, 2],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            # shape of [None, 2]
            self.linear_terms = tf.add(tf.matmul(self.X, w1), b)

            # shape of [None, 1]
            self.interaction_terms = tf.multiply(0.5,
                                                 tf.reduce_mean(
                                                     tf.subtract(
                                                         tf.pow(tf.matmul(self.X, v), 2),
                                                         tf.matmul(tf.pow(self.X, 2), tf.pow(v, 2))),
                                                     1, keep_dims=True))
            # shape of [None, 2]
            self.y_fm = tf.add(self.linear_terms, self.interaction_terms)

        # three-hidden-layer neural network, network shape of (200-200-200)
        with tf.variable_scope('DNN', reuse=False):
            # embedding layer
            y_embedding_input = tf.reshape(tf.gather(v, self.feature_inds), [-1, self.field_cnt * self.k])
            # first hidden layer
            w1 = tf.get_variable('w1_dnn', shape=[self.field_cnt * self.k, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b1 = tf.get_variable('b1_dnn', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l1 = tf.nn.relu(tf.matmul(y_embedding_input, w1) + b1)
            # second hidden layer
            w2 = tf.get_variable('w2', shape=[200, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b2 = tf.get_variable('b2', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l2 = tf.nn.relu(tf.matmul(y_hidden_l1, w2) + b2)
            # third hidden layer
            w3 = tf.get_variable('w3', shape=[200, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b3 = tf.get_variable('b3', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l3 = tf.nn.relu(tf.matmul(y_hidden_l2, w3) + b3)
            # output layer
            w_out = tf.get_variable('w_out', shape=[200, 2],
                                    initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b_out = tf.get_variable('b_out', shape=[2],
                                    initializer=tf.constant_initializer(0.001))
            self.y_dnn = tf.nn.relu(tf.matmul(y_hidden_l3, w_out) + b_out)
        # add FM output and DNN output
        self.y_out = tf.add(self.y_fm, self.y_dnn)
        self.y_out_prob = tf.nn.softmax(self.y_out)

    def add_loss(self):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
        mean_loss = tf.reduce_mean(cross_entropy)
        self.loss = mean_loss
        tf.summary.scalar('loss', self.loss)

    def add_accuracy(self):
        # correct predictions and accuracy, built from the logits produced by inference()
        self.correct_prediction = tf.equal(tf.cast(tf.argmax(self.y_out, 1), tf.int64), self.y)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        # add summary to accuracy
        tf.summary.scalar('accuracy', self.accuracy)

    def train(self):
        # define optimizer (FTRL with L1/L2 regularization)
        optimizer = tf.train.FtrlOptimizer(self.lr, l1_regularization_strength=self.reg_l1,
                                           l2_regularization_strength=self.reg_l2)
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    def build_graph(self):
        """build graph for model"""
        self.add_placeholders()
        self.inference()
        self.add_loss()
        self.add_accuracy()
        self.train()


def check_restore_parameters(sess, saver):
    """Restore the previously trained parameters if there are any."""
    ckpt = tf.train.get_checkpoint_state("checkpoints")
    if ckpt and ckpt.model_checkpoint_path:
        logging.info("Loading parameters for the DeepFM model...")
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        logging.info("Initializing fresh parameters for the DeepFM model")


def train_model(sess, model, epochs=10, print_every=500):
    """training model"""
    num_samples = 0
    losses = []
    # Merge all the summaries and write them out to train_logs
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('train_logs', sess.graph)
    for e in range(epochs):
        # get training data, iterable
        train_data = pd.read_csv('../avazu_CTR/train.csv',
                                 chunksize=model.batch_size)
        # batch_size data
        for data in train_data:
            actual_batch_size = len(data)
            batch_X = []
            batch_y = []
            batch_idx = []
            for i in range(actual_batch_size):
                sample = data.iloc[i, :]
                array, idx = one_hot_representation(sample, fields_train_dict, train_array_length)
                # the last two positions of the array are the click indicators; use the click==1 column as the label
                batch_X.append(array[:-2])
                batch_y.append(array[-1])
                batch_idx.append(idx)
            batch_X = np.array(batch_X)
            batch_y = np.array(batch_y)
            batch_idx = np.array(batch_idx)
            # create a feed dictionary for this batch
            feed_dict = {model.X: batch_X, model.y: batch_y,
                         model.feature_inds: batch_idx, model.keep_prob: 1}
            loss, accuracy, summary, global_step, _ = sess.run([model.loss, model.accuracy,
                                                                merged, model.global_step,
                                                                model.train_op], feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(loss * actual_batch_size)

            num_samples += actual_batch_size
            # Record summaries and train.csv-set accuracy
            train_writer.add_summary(summary, global_step=global_step)
            # print training loss and accuracy
            if global_step % print_every == 0:
                logging.info("Iteration {0}: with minibatch training loss = {1} and accuracy of {2}"
                             .format(global_step, loss, accuracy))
                saver.save(sess, "checkpoints/model", global_step=global_step)

        # print loss of one epoch
        total_loss = np.sum(losses) / num_samples
        print("Epoch {1}, Overall loss = {0:.3g}".format(total_loss, e + 1))


def validation_model(sess, model, print_every=50):
    """validation model"""
    # num samples
    num_samples = 0
    # num of correct predictions
    num_corrects = 0
    losses = []
    # Merge all the summaries and write them out to test_logs
    merged = tf.summary.merge_all()
    test_writer = tf.summary.FileWriter('test_logs', sess.graph)
    # get validation data, iterable
    validation_data = pd.read_csv('/home/katy/CTR_prediction/avazu_CTR/train.csv',
                                  chunksize=model.batch_size)
    # validation step
    valid_step = 1
    # batch_size data
    for data in validation_data:
        actual_batch_size = len(data)
        batch_X = []
        batch_y = []
        batch_idx = []
        for i in range(actual_batch_size):
            sample = data.iloc[i, :]
            array, idx = one_hot_representation(sample, fields_train_dict, train_array_length)
            batch_X.append(array[:-2])
            batch_y.append(array[-1])
            batch_idx.append(idx)
        batch_X = np.array(batch_X)
        batch_y = np.array(batch_y)
        batch_idx = np.array(batch_idx)
        # create a feed dictionary for this batch
        feed_dict = {model.X: batch_X, model.y: batch_y,
                     model.feature_inds: batch_idx, model.keep_prob: 1}
        loss, accuracy, correct, summary = sess.run([model.loss, model.accuracy,
                                                     model.correct_prediction, merged],
                                                    feed_dict=feed_dict)
        # aggregate performance stats
        losses.append(loss * actual_batch_size)
        num_corrects += np.sum(correct)
        num_samples += actual_batch_size
        # Record summaries and validation-set accuracy
        test_writer.add_summary(summary, global_step=valid_step)
        # print validation loss and accuracy
        if valid_step % print_every == 0:
            logging.info("Iteration {0}: with minibatch validation loss = {1} and accuracy of {2}"
                         .format(valid_step, loss, accuracy))
        valid_step += 1
    # print loss and accuracy of one epoch
    total_correct = num_corrects / num_samples
    total_loss = np.sum(losses) / num_samples
    print("Overall test loss = {0:.3g} and accuracy of {1:.3g}"
          .format(total_loss, total_correct))


def test_model(sess, model, print_every=50):
    """testing model"""
    # get testing data, iterable
    test_data = pd.read_csv('/home/katy/CTR_prediction/avazu_CTR/test.csv',
                            chunksize=model.batch_size)
    test_step = 1
    # batch_size data
    for data in test_data:
        actual_batch_size = len(data)
        batch_X = []
        batch_idx = []
        for i in range(actual_batch_size):
            sample = data.iloc[i, :]
            array, idx = one_hot_representation(sample, fields_test_dict, test_array_length)
            batch_X.append(array)
            batch_idx.append(idx)

        batch_X = np.array(batch_X)
        batch_idx = np.array(batch_idx)
        # create a feed dictionary for this batch
        feed_dict = {model.X: batch_X, model.keep_prob: 1, model.feature_inds: batch_idx}
        # shape of [None, 2]
        y_out_prob = sess.run([model.y_out_prob], feed_dict=feed_dict)
        # write to csv files
        data['click'] = y_out_prob[0][:, -1]
        if test_step == 1:
            data[['id', 'click']].to_csv('Deep_FM_FTRL_v1.csv', mode='a', index=False, header=True)
        else:
            data[['id', 'click']].to_csv('Deep_FM_FTRL_v1.csv', mode='a', index=False, header=False)

        test_step += 1
        if test_step % print_every == 0:
            logging.info("Iteration {0} has finished".format(test_step))


if __name__ == '__main__':
    '''launching TensorBoard: tensorboard --logdir=path/to/log-directory'''
    # setting fields
    fields_train = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                    'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                    'app_id', 'app_category', 'device_model', 'device_type', 'device_id',
                    'device_conn_type', 'click']

    fields_test = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                   'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                   'app_id', 'device_id', 'app_category', 'device_model', 'device_type',
                   'device_conn_type']
    # loading dicts
    fields_train_dict = {}
    for field in fields_train:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_train_dict[field] = pickle.load(f)
    fields_test_dict = {}
    for field in fields_test:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_test_dict[field] = pickle.load(f)

    # length of representation
    train_array_length = max(fields_train_dict['click'].values()) + 1
    test_array_length = train_array_length - 2
    # initialize the model
    config = {}
    config['lr'] = 0.01
    config['batch_size'] = 512
    config['reg_l1'] = 2e-3
    config['reg_l2'] = 0
    config['k'] = 40
    # get feature length
    feature_length = test_array_length
    # num of fields
    field_cnt = 21

    model = DeepFM(config)
    # build graph for model
    model.build_graph()

    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        # TODO: after every epoch, print training accuracy and validation accuracy
        sess.run(tf.global_variables_initializer())
        # restore trained parameters
        check_restore_parameters(sess, saver)
        print('start training...')
        train_model(sess, model, epochs=10, print_every=500)
        # print('start validation...')
        # validation_model(sess, model, print_every=100)
        # print('start testing...')
        # test_model(sess, model)
--------------------------------------------------------------------------------
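
A note on the Deep_FM pipeline: step2.py loads per-field value-to-count dictionaries from field2count/<field>.pkl, but step1.py only writes the sets/ pickles, so nothing in this repository produces them. Below is a minimal sketch of how they could be generated; the use of collections.Counter, the chunk size, and the field2count/ output directory mirror step1.py and step2.py but are assumptions rather than code from the original project.

import pickle
from collections import Counter

import pandas as pd

frequency_encoding_fields = ['C14', 'C17', 'C19', 'C21',
                             'site_id', 'site_domain', 'app_id', 'app_domain',
                             'device_model', 'device_id']

counters = {field: Counter() for field in frequency_encoding_fields}

# Count how often every value of each frequency-encoded field occurs, chunk by chunk.
train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                    chunksize=10000)
for data in train:
    for field in frequency_encoding_fields:
        counters[field].update(data[field].values)

# Save one value-to-count dict per field; step2.py maps values with count < 10 to a shared "rare" index.
for field in frequency_encoding_fields:
    with open('field2count/' + field + '.pkl', 'wb') as f:
        pickle.dump(dict(counters[field]), f)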