├── data_helper.py
├── Deep_FM
│   ├── README.md
│   ├── utilities.py
│   ├── step1.py
│   ├── step2.py
│   └── DeepFM.py
├── submit
│   └── clean_message.py
├── Deep-Semantic-Similarity-Model
│   ├── README.md
│   ├── LICENSE
│   └── deep_semantic_similarity_keras.py
└── zhaopin_round1_train_20190716
    └── clean_message.py

/data_helper.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Deep_FM/README.md:
--------------------------------------------------------------------------------
According to the paper, this model can reach about 80% accuracy.

So for this competition I think this model can serve as our backbone.
--------------------------------------------------------------------------------
/submit/clean_message.py:
--------------------------------------------------------------------------------
import pandas as pd

submit_message: pd.DataFrame = pd.read_csv("zhaopin_round1_submit_20190716 (1).csv")
print(submit_message.count())
# drop duplicate user_id rows and compare the counts
df3 = submit_message.drop_duplicates(['user_id'])
print(df3.count())
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/README.md:
--------------------------------------------------------------------------------
# Deep Semantic Similarity Model
My Keras implementation of the Deep Semantic Similarity Model (DSSM)/Convolutional Latent Semantic Model (CLSM) described [here](http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf). As search data sets are generally proprietary, you will have to provide your own data to use with the code.

## Additional References
1. http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf - slides giving a high-level overview of the DSSM and how it can be used for information retrieval.
2. http://research.microsoft.com/en-us/projects/dssm/ - Microsoft Research's summary of the DSSM (includes many more references).
--------------------------------------------------------------------------------
/zhaopin_round1_train_20190716/clean_message.py:
--------------------------------------------------------------------------------
# f = open("table1_user", "r", encoding='utf-8')
# fil = []
# for i in f.readlines():
#     if i not in fil:
#         fil.append(i)
# print('number of users', len(fil))
#
# jd = open('table2_jd', "r", encoding='utf-8')
# j = []
# print('number of job postings (before deduplication)', len(jd.readlines()))
#
# action = open('table3_action', encoding='utf-8')
# print('number of user-job matches', len(action.readlines()))

import pandas as pd

table_user = pd.read_csv('table1_user', sep="\t")
print(table_user.count())

table_jd = pd.read_csv('table2_jd', sep="\t")
print(table_jd.count())

table_action = pd.read_csv('table3_action', sep="\t")
print(table_action.count())
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Michael A. Alcorn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Deep_FM/utilities.py:
--------------------------------------------------------------------------------
# coding:utf-8
import pickle

import numpy as np
import pandas as pd


def one_hot_representation(sample, fields_dict, array_length):
    """
    One-hot representation for a single sample.
    :param fields_dict: maps each field's values to one-hot array indices
    :param sample: sample data, type of pd.Series
    :param array_length: length of the one-hot representation
    :return: one-hot representation (np.array) and the indices of its non-zero entries
    """
    array = np.zeros([array_length])
    idx = []
    for field in fields_dict:
        # get index of array
        if field == 'hour':
            field_value = int(str(sample[field])[-2:])
        else:
            field_value = sample[field]
        ind = fields_dict[field][field_value]
        array[ind] = 1
        idx.append(ind)
    # keep only the first 21 field indices (field_cnt); for training samples this drops the trailing click index
    return array, idx[:21]


if __name__ == '__main__':
    fields_train = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                    'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                    'app_id', 'app_category', 'device_model', 'device_type', 'device_id',
                    'device_conn_type', 'click']

    fields_test = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                   'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                   'app_id', 'device_id', 'app_category', 'device_model', 'device_type',
                   'device_conn_type']

    train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                        chunksize=100)
    test = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/test.csv',
                       chunksize=100)
    # loading dicts
    fields_train_dict = {}
    for field in fields_train:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_train_dict[field] = pickle.load(f)

    fields_test_dict = {}
    for field in fields_test:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_test_dict[field] = pickle.load(f)

    train_array_length = max(fields_train_dict['click'].values()) + 1
    test_array_length = train_array_length - 2
    # initialize the model

    for data in test:
        # data['click'] = np.zeros(100)
        # data.to_csv('a.csv', mode='a')
        sample = data.iloc[3, :]
        print(one_hot_representation(sample, fields_test_dict, test_array_length))

        break
--------------------------------------------------------------------------------
/Deep_FM/step1.py:
--------------------------------------------------------------------------------
import pickle

import pandas as pd

# for the site_id, site_domain, app_id, app_domain, device_model,
# device_ip, device_id, C14, C17, C19, C21 fields, one-hot encode by frequency;
# for the other fields, one-hot encode directly

# one-hot encoding directly
click = set()
hour = set()
C1 = set()
banner_pos = set()
site_category = set()
app_category = set()
device_type = set()
device_conn_type = set()
C15 = set()
C16 = set()
C18 = set()
C20 = set()

hour = set(range(24))

# one-hot encoding by frequency bucket
C14 = []
C17 = []
C19 = []
C21 = []
site_id = []
site_domain = []
app_id = []
app_domain = []
device_model = []
device_ip = []
device_id = []

train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                    chunksize=10000)

for data in train:
    click_v = set(data['click'].values)
    click = click | click_v

    C1_v = set(data['C1'].values)
    C1 = C1 | C1_v

    C15_v = set(data['C15'].values)
    C15 = C15 | C15_v

    C16_v = set(data['C16'].values)
    C16 = C16 | C16_v

    C18_v = set(data['C18'].values)
    C18 = C18 | C18_v

    C20_v = set(data['C20'].values)
    C20 = C20 | C20_v

    banner_pos_v = set(data['banner_pos'].values)
    banner_pos = banner_pos | banner_pos_v

    site_category_v = set(data['site_category'].values)
    site_category = site_category | site_category_v

    app_category_v = set(data['app_category'].values)
    app_category = app_category | app_category_v

    device_type_v = set(data['device_type'].values)
    device_type = device_type | device_type_v

    device_conn_type_v = set(data['device_conn_type'].values)
    device_conn_type = device_conn_type | device_conn_type_v

# save the value sets
with open('sets/click.pkl', 'wb') as f:
    pickle.dump(click, f)

with open('sets/hour.pkl', 'wb') as f:
    pickle.dump(hour, f)

with open('sets/C1.pkl', 'wb') as f:
    pickle.dump(C1, f)

with open('sets/C15.pkl', 'wb') as f:
    pickle.dump(C15, f)

with open('sets/C16.pkl', 'wb') as f:
    pickle.dump(C16, f)

with open('sets/C18.pkl', 'wb') as f:
    pickle.dump(C18, f)

with open('sets/C20.pkl', 'wb') as f:
    pickle.dump(C20, f)

with open('sets/banner_pos.pkl', 'wb') as f:
    pickle.dump(banner_pos, f)

with open('sets/site_category.pkl', 'wb') as f:
    pickle.dump(site_category, f)

with open('sets/app_category.pkl', 'wb') as f:
    pickle.dump(app_category, f)

with open('sets/device_type.pkl', 'wb') as f:
    pickle.dump(device_type, f)

with open('sets/device_conn_type.pkl', 'wb') as f:
    pickle.dump(device_conn_type, f)
--------------------------------------------------------------------------------
/Deep_FM/step2.py:
--------------------------------------------------------------------------------
import pickle

direct_encoding_fields = ['hour', 'C1', 'C15', 'C16', 'C18', 'C20',
                          'banner_pos', 'site_category', 'app_category',
                          'device_type', 'device_conn_type']

frequency_encoding_fields = ['C14', 'C17', 'C19', 'C21',
                             'site_id', 'site_domain', 'app_id', 'app_domain',
                             'device_model', 'device_id']

# load direct encoding fields
with open('sets/click.pkl', 'rb') as f:
    click = pickle.load(f)

with open('sets/hour.pkl', 'rb') as f:
    hour = pickle.load(f)

with open('sets/C1.pkl', 'rb') as f:
    C1 = pickle.load(f)

with open('sets/C15.pkl', 'rb') as f:
    C15 = pickle.load(f)

with open('sets/C16.pkl', 'rb') as f:
    C16 = pickle.load(f)

with open('sets/C18.pkl', 'rb') as f:
    C18 = pickle.load(f)

with open('sets/C20.pkl', 'rb') as f:
    C20 = pickle.load(f)

with open('sets/banner_pos.pkl', 'rb') as f:
    banner_pos = pickle.load(f)

with open('sets/site_category.pkl', 'rb') as f:
    site_category = pickle.load(f)

with open('sets/app_category.pkl', 'rb') as f:
    app_category = pickle.load(f)

with open('sets/device_type.pkl', 'rb') as f:
    device_type = pickle.load(f)

with open('sets/device_conn_type.pkl', 'rb') as f:
    device_conn_type = pickle.load(f)

# loading frequency encoding fields
# field2count dictionaries
with open('field2count/C14.pkl', 'rb') as f:
    C14 = pickle.load(f)

with open('field2count/C17.pkl', 'rb') as f:
    C17 = pickle.load(f)

with open('field2count/C19.pkl', 'rb') as f:
    C19 = pickle.load(f)

with open('field2count/C21.pkl', 'rb') as f:
    C21 = pickle.load(f)

with open('field2count/site_id.pkl', 'rb') as f:
    site_id = pickle.load(f)

with open('field2count/site_domain.pkl', 'rb') as f:
    site_domain = pickle.load(f)

with open('field2count/app_id.pkl', 'rb') as f:
    app_id = pickle.load(f)

with open('field2count/app_domain.pkl', 'rb') as f:
    app_domain = pickle.load(f)

with open('field2count/device_model.pkl', 'rb') as f:
    device_model = pickle.load(f)

with open('field2count/device_id.pkl', 'rb') as f:
    device_id = pickle.load(f)

ind = 0
for field in direct_encoding_fields:
    # value to one-hot-encoding index dict
    field_dict = {}
    field_sets = eval(field)
    for value in list(field_sets):
        field_dict[value] = ind
        ind += 1
    with open('dicts/' + field + '.pkl', 'wb') as f:
        pickle.dump(field_dict, f)

for field in frequency_encoding_fields:
    # value to one-hot-encoding index dict
    field_dict = {}
    field2count = eval(field)
    # values seen fewer than 10 times share a single "rare" index;
    # reserve it up front so it cannot collide with the next field's indices
    index_rare = ind
    ind += 1
    for k, count in field2count.items():
        if count < 10:
            field_dict[k] = index_rare
        else:
            field_dict[k] = ind
            ind += 1

    with open('dicts/' + field + '.pkl', 'wb') as f:
        pickle.dump(field_dict, f)

# click is encoded last, so its two indicator columns sit at the end of the one-hot array
field_dict = {}
field_sets = click
for value in list(field_sets):
    field_dict[value] = ind
    ind += 1
with open('dicts/' + 'click' + '.pkl', 'wb') as f:
    pickle.dump(field_dict, f)
--------------------------------------------------------------------------------
/Deep-Semantic-Similarity-Model/deep_semantic_similarity_keras.py:
--------------------------------------------------------------------------------
# Michael A. Alcorn (malcorn@redhat.com)
# An implementation of the Deep Semantic Similarity Model (DSSM) found in [1].
# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model
#     with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.
#     http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf
# [2] http://research.microsoft.com/en-us/projects/dssm/
# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf

import numpy as np
from keras import backend
from keras.layers import Activation, Input
from keras.layers.convolutional import Convolution1D
from keras.layers.core import Dense, Lambda, Reshape
from keras.layers.merge import concatenate, dot
from keras.models import Model

# length of the letter n-grams
LETTER_GRAM_SIZE = 3  # See section 3.2.
# sliding window length
WINDOW_SIZE = 3  # See section 3.2.
TOTAL_LETTER_GRAMS = int(3 * 1e4)  # Determined from data. See section 3.2.
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS  # See equation (1).
K = 300  # Dimensionality of the max-pooling layer. See section 3.4.
L = 128  # Dimensionality of latent semantic space. See section 3.5.
J = 4  # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1  # We only consider one time step for convolutions.

# Input tensors holding the query, positive (clicked) document, and negative (unclicked) documents.
# The first dimension is None because the queries and documents can vary in length.
query = Input(shape=(None, WORD_DEPTH))
pos_doc = Input(shape=(None, WORD_DEPTH))
neg_docs = [Input(shape=(None, WORD_DEPTH)) for j in range(J)]

# Query model. The paper uses separate neural nets for queries and documents (see section 5.2).

# In this step, we transform each word vector with WORD_DEPTH dimensions into its
# convolved representation with K dimensions. K is the number of kernels/filters
# being used in the operation. Essentially, the operation is taking the dot product
# of a single weight matrix (W_c) with each of the word vectors (l_t) from the
# query matrix (l_Q), adding a bias vector (b_c), and then applying the tanh activation.
# That is, h_Q = tanh(W_c • l_Q + b_c). With that being said, that's not actually
# how the operation is being calculated here. To tie the weights of the weight
# matrix (W_c) together, we have to use a one-dimensional convolutional layer.
# Further, we have to transpose our query matrix (l_Q) so that time is the first
# dimension rather than the second (as described in the paper). That is, l_Q[0, :]
# represents our first word vector rather than l_Q[:, 0]. We can think of the weight
# matrix (W_c) as being similarly transposed such that each kernel is a column
# of W_c. Therefore, h_Q = tanh(l_Q • W_c + b_c) with l_Q, W_c, and b_c being
# the transposes of the matrices described in the paper. Note: the paper does not
# include bias units.
query_conv = Convolution1D(K, FILTER_LENGTH, padding="same", input_shape=(None, WORD_DEPTH), activation="tanh")(
    query)  # See equation (2).

# Next, we apply a max-pooling layer to the convolved query matrix. Keras provides
# its own max-pooling layers, but they cannot handle variable length input (as
# far as I can tell). As a result, I define my own max-pooling layer here. In the
# paper, the operation selects the maximum value for each row of h_Q, but, because
# we're using the transpose, we're selecting the maximum value for each column.
query_max = Lambda(lambda x: backend.max(x, axis=1), output_shape=(K,))(query_conv)  # See section 3.4.

# In this step, we generate the semantic vector representation of the query. This
# is a standard neural network dense layer, i.e., y = tanh(W_s • v + b_s). Again,
# the paper does not include bias units.
query_sem = Dense(L, activation="tanh", input_dim=K)(query_max)  # See section 3.5.

# The document equivalent of the above query model.
doc_conv = Convolution1D(K, FILTER_LENGTH, padding="same", input_shape=(None, WORD_DEPTH), activation="tanh")
# max-pooling and dense layers shared by the positive and negative documents
doc_max = Lambda(lambda x: backend.max(x, axis=1), output_shape=(K,))
doc_sem = Dense(L, activation="tanh", input_dim=K)

pos_doc_conv = doc_conv(pos_doc)
neg_doc_convs = [doc_conv(neg_doc) for neg_doc in neg_docs]

pos_doc_max = doc_max(pos_doc_conv)
neg_doc_maxes = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs]

pos_doc_sem = doc_sem(pos_doc_max)
neg_doc_sems = [doc_sem(neg_doc_max) for neg_doc_max in neg_doc_maxes]

# This layer calculates the cosine similarity between the semantic representations of
# a query and a document.
R_Q_D_p = dot([query_sem, pos_doc_sem], axes=1, normalize=True)  # See equation (4).
R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes=1, normalize=True) for neg_doc_sem in neg_doc_sems]  # See equation (4).

concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns)
concat_Rs = Reshape((J + 1, 1))(concat_Rs)

# In this step, we multiply each R(Q, D) value by gamma. In the paper, gamma is
# described as a smoothing factor for the softmax function, and it's set empirically
# on a held-out data set. We're going to learn gamma's value by pretending it's
# a single 1 x 1 convolution kernel.
weight = np.array([1]).reshape(1, 1)
with_gamma = Convolution1D(1, 1, padding="same", input_shape=(J + 1, 1), activation="linear", use_bias=False,
                           weights=[weight])(concat_Rs)  # See equation (5).
with_gamma = Reshape((J + 1,))(with_gamma)

# Finally, we use the softmax function to calculate P(D+|Q).
prob = Activation("softmax")(with_gamma)  # See equation (5).

# We now have everything we need to define our model.
model = Model(inputs=[query, pos_doc] + neg_docs, outputs=prob)
model.compile(optimizer="adadelta", loss="categorical_crossentropy")

# Build a random data set.
sample_size = 10
l_Qs = []
pos_l_Ds = []

# Variable length input must be handled differently from padded input.
BATCH = True

(query_len, doc_len) = (5, 100)

for i in range(sample_size):

    if BATCH:
        l_Q = np.random.rand(query_len, WORD_DEPTH)
        l_Qs.append(l_Q)

        l_D = np.random.rand(doc_len, WORD_DEPTH)
        pos_l_Ds.append(l_D)
    else:
        query_len = np.random.randint(1, 10)
        l_Q = np.random.rand(1, query_len, WORD_DEPTH)
        l_Qs.append(l_Q)

        doc_len = np.random.randint(50, 500)
        l_D = np.random.rand(1, doc_len, WORD_DEPTH)
        pos_l_Ds.append(l_D)

# For each sample, draw J negative documents from the other samples' positive documents.
neg_l_Ds = [[] for j in range(J)]
for i in range(sample_size):
    possibilities = list(range(sample_size))
    possibilities.remove(i)
    negatives = np.random.choice(possibilities, J, replace=False)
    for j in range(J):
        negative = negatives[j]
        neg_l_Ds[j].append(pos_l_Ds[negative])

if BATCH:
    y = np.zeros((sample_size, J + 1))
    y[:, 0] = 1

    l_Qs = np.array(l_Qs)
    pos_l_Ds = np.array(pos_l_Ds)
    for j in range(J):
        neg_l_Ds[j] = np.array(neg_l_Ds[j])

    history = model.fit([l_Qs, pos_l_Ds] + [neg_l_Ds[j] for j in range(J)], y, epochs=1, verbose=0)
else:
    y = np.zeros(J + 1).reshape(1, J + 1)
    y[0, 0] = 1

    for i in range(sample_size):
        history = model.fit([l_Qs[i], pos_l_Ds[i]] + [neg_l_Ds[j][i] for j in range(J)], y, epochs=1, verbose=0)

# Here, I walk through how to define a function for calculating output from the
# computational graph. Let's define a function that calculates R(Q, D+) for a given
# query and clicked document. The function depends on two inputs, query and pos_doc.
# That is, if you start at the point in the graph where R(Q, D+) is calculated
# and then work backwards as far as possible, you'll end up at two different starting
# points: query and pos_doc. As a result, we supply those inputs in a list to the
# function. This particular function only calculates a single output, but multiple
# outputs are possible (see the next example).
get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p])
if BATCH:
    get_R_Q_D_p([l_Qs, pos_l_Ds])
else:
    get_R_Q_D_p([l_Qs[0], pos_l_Ds[0]])

# A slightly more complex function. Notice that both neg_docs and the output are
# lists.
get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns)
if BATCH:
    get_R_Q_D_ns([l_Qs] + [neg_l_Ds[j] for j in range(J)])
else:
    get_R_Q_D_ns([l_Qs[0]] + [neg_l_Ds[j][0] for j in range(J)])
--------------------------------------------------------------------------------
/Deep_FM/DeepFM.py:
--------------------------------------------------------------------------------
# coding:utf-8
import os
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
import tensorflow as tf
from Deep_FM.utilities import *
import pandas as pd
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class DeepFM(object):
    """
    Deep Factorization Machine (DeepFM) with FTRL optimization
    """

    def __init__(self, config):
        """
        :param config: configuration of hyperparameters, type of dict
        """
        # number of latent factors
        self.k = config['k']
        self.lr = config['lr']
        # number of samples fed to the model per step
        self.batch_size = config['batch_size']
        self.reg_l1 = config['reg_l1']
        self.reg_l2 = config['reg_l2']
        # num of features
        self.p = feature_length
        # num of fields
        self.field_cnt = field_cnt
        self.global_step = tf.Variable(0, trainable=False)

    def add_placeholders(self):
        self.X = tf.placeholder('float32', [None, self.p])
        self.y = tf.placeholder('int64', [None, ])
        # indices of non-zero features
        self.feature_inds = tf.placeholder('int64', [None, self.field_cnt])
        self.keep_prob = tf.placeholder('float32')

    def inference(self):
        """
        forward propagation
        :return: labels for each sample
        """
        v = tf.Variable(tf.truncated_normal(shape=[self.p, self.k], mean=0, stddev=0.01), dtype='float32')

        # Factorization Machine
        with tf.variable_scope('FM'):
            b = tf.get_variable('bias', shape=[2],
                                initializer=tf.zeros_initializer())
            w1 = tf.get_variable('w1', shape=[self.p, 2],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            # shape of [None, 2]
            self.linear_terms = tf.add(tf.matmul(self.X, w1), b)

            # shape of [None, 1]
            self.interaction_terms = tf.multiply(0.5,
                                                 tf.reduce_mean(
                                                     tf.subtract(
                                                         tf.pow(tf.matmul(self.X, v), 2),
                                                         tf.matmul(tf.pow(self.X, 2), tf.pow(v, 2))),
                                                     1, keep_dims=True))
            # shape of [None, 2]
            self.y_fm = tf.add(self.linear_terms, self.interaction_terms)

        # three-hidden-layer neural network, network shape of (200-200-200)
        with tf.variable_scope('DNN', reuse=False):
            # embedding layer
            y_embedding_input = tf.reshape(tf.gather(v, self.feature_inds), [-1, self.field_cnt * self.k])
            # first hidden layer
            w1 = tf.get_variable('w1_dnn', shape=[self.field_cnt * self.k, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b1 = tf.get_variable('b1_dnn', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l1 = tf.nn.relu(tf.matmul(y_embedding_input, w1) + b1)
            # second hidden layer
            w2 = tf.get_variable('w2', shape=[200, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b2 = tf.get_variable('b2', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l2 = tf.nn.relu(tf.matmul(y_hidden_l1, w2) + b2)
            # third hidden layer
            w3 = tf.get_variable('w3', shape=[200, 200],
                                 initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b3 = tf.get_variable('b3', shape=[200],
                                 initializer=tf.constant_initializer(0.001))
            y_hidden_l3 = tf.nn.relu(tf.matmul(y_hidden_l2, w3) + b3)
            # output layer
            w_out = tf.get_variable('w_out', shape=[200, 2],
                                    initializer=tf.truncated_normal_initializer(mean=0, stddev=1e-2))
            b_out = tf.get_variable('b_out', shape=[2],
                                    initializer=tf.constant_initializer(0.001))
            self.y_dnn = tf.nn.relu(tf.matmul(y_hidden_l3, w_out) + b_out)
        # add FM output and DNN output
        self.y_out = tf.add(self.y_fm, self.y_dnn)
        self.y_out_prob = tf.nn.softmax(self.y_out)

    def add_loss(self):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.y_out)
        mean_loss = tf.reduce_mean(cross_entropy)
        self.loss = mean_loss
        tf.summary.scalar('loss', self.loss)

    def add_accuracy(self):
        # correct predictions and accuracy, built from the logits produced by inference()
        self.correct_prediction = tf.equal(tf.cast(tf.argmax(self.y_out, 1), tf.int64), self.y)
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        # add summary to accuracy
        tf.summary.scalar('accuracy', self.accuracy)

    def train(self):
        # define optimizer (FTRL with L1/L2 regularization)
        optimizer = tf.train.FtrlOptimizer(self.lr, l1_regularization_strength=self.reg_l1,
                                           l2_regularization_strength=self.reg_l2)
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    def build_graph(self):
        """build graph for model"""
        self.add_placeholders()
        self.inference()
        self.add_loss()
        self.add_accuracy()
        self.train()


def check_restore_parameters(sess, saver):
    """Restore the previously trained parameters if there are any."""
    ckpt = tf.train.get_checkpoint_state("checkpoints")
    if ckpt and ckpt.model_checkpoint_path:
        logging.info("Loading parameters for the DeepFM model...")
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        logging.info("Initializing fresh parameters for the DeepFM model")


def train_model(sess, model, epochs=10, print_every=500):
    """training model"""
    num_samples = 0
    losses = []
    # Merge all the summaries and write them out to train_logs
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('train_logs', sess.graph)
    for e in range(epochs):
        # get training data, iterable
        train_data = pd.read_csv('../avazu_CTR/train.csv',
                                 chunksize=model.batch_size)
        # batch_size data
        for data in train_data:
            actual_batch_size = len(data)
            batch_X = []
            batch_y = []
            batch_idx = []
            for i in range(actual_batch_size):
                sample = data.iloc[i, :]
                array, idx = one_hot_representation(sample, fields_train_dict, train_array_length)
                # the last two positions of the array are the click indicators; use the click==1 column as the label
                batch_X.append(array[:-2])
                batch_y.append(array[-1])
                batch_idx.append(idx)
            batch_X = np.array(batch_X)
            batch_y = np.array(batch_y)
            batch_idx = np.array(batch_idx)
            # create a feed dictionary for this batch
            feed_dict = {model.X: batch_X, model.y: batch_y,
                         model.feature_inds: batch_idx, model.keep_prob: 1}
            loss, accuracy, summary, global_step, _ = sess.run([model.loss, model.accuracy,
                                                                merged, model.global_step,
                                                                model.train_op], feed_dict=feed_dict)
            # aggregate performance stats
            losses.append(loss * actual_batch_size)

            num_samples += actual_batch_size
            # Record summaries and train.csv-set accuracy
            train_writer.add_summary(summary, global_step=global_step)
            # print training loss and accuracy
            if global_step % print_every == 0:
                logging.info("Iteration {0}: with minibatch training loss = {1} and accuracy of {2}"
                             .format(global_step, loss, accuracy))
                saver.save(sess, "checkpoints/model", global_step=global_step)

        # print loss of one epoch
        total_loss = np.sum(losses) / num_samples
        print("Epoch {1}, Overall loss = {0:.3g}".format(total_loss, e + 1))


def validation_model(sess, model, print_every=50):
    """validation model"""
    # num samples
    num_samples = 0
    # num of correct predictions
    num_corrects = 0
    losses = []
    # Merge all the summaries and write them out to test_logs
    merged = tf.summary.merge_all()
    test_writer = tf.summary.FileWriter('test_logs', sess.graph)
    # get validation data, iterable
    validation_data = pd.read_csv('/home/katy/CTR_prediction/avazu_CTR/train.csv',
                                  chunksize=model.batch_size)
    # validation step
    valid_step = 1
    # batch_size data
    for data in validation_data:
        actual_batch_size = len(data)
        batch_X = []
        batch_y = []
        batch_idx = []
        for i in range(actual_batch_size):
            sample = data.iloc[i, :]
            array, idx = one_hot_representation(sample, fields_train_dict, train_array_length)
            batch_X.append(array[:-2])
            batch_y.append(array[-1])
            batch_idx.append(idx)
        batch_X = np.array(batch_X)
        batch_y = np.array(batch_y)
        batch_idx = np.array(batch_idx)
        # create a feed dictionary for this batch
        feed_dict = {model.X: batch_X, model.y: batch_y,
                     model.feature_inds: batch_idx, model.keep_prob: 1}
        loss, accuracy, correct, summary = sess.run([model.loss, model.accuracy,
                                                     model.correct_prediction, merged],
                                                    feed_dict=feed_dict)
        # aggregate performance stats
        losses.append(loss * actual_batch_size)
        num_corrects += np.sum(correct)
        num_samples += actual_batch_size
        # Record summaries and validation-set accuracy
        test_writer.add_summary(summary, global_step=valid_step)
        # print validation loss and accuracy
        if valid_step % print_every == 0:
            logging.info("Iteration {0}: with minibatch validation loss = {1} and accuracy of {2}"
                         .format(valid_step, loss, accuracy))
        valid_step += 1
    # print loss and accuracy of one epoch
    total_correct = num_corrects / num_samples
    total_loss = np.sum(losses) / num_samples
    print("Overall test loss = {0:.3g} and accuracy of {1:.3g}"
          .format(total_loss, total_correct))


def test_model(sess, model, print_every=50):
    """testing model"""
    # get testing data, iterable
    test_data = pd.read_csv('/home/katy/CTR_prediction/avazu_CTR/test.csv',
                            chunksize=model.batch_size)
    test_step = 1
    # batch_size data
    for data in test_data:
        actual_batch_size = len(data)
        batch_X = []
        batch_idx = []
        for i in range(actual_batch_size):
            sample = data.iloc[i, :]
            array, idx = one_hot_representation(sample, fields_test_dict, test_array_length)
            batch_X.append(array)
            batch_idx.append(idx)

        batch_X = np.array(batch_X)
        batch_idx = np.array(batch_idx)
        # create a feed dictionary for this batch
        feed_dict = {model.X: batch_X, model.keep_prob: 1, model.feature_inds: batch_idx}
        # shape of [None, 2]
        y_out_prob = sess.run([model.y_out_prob], feed_dict=feed_dict)
        # write to csv files
        data['click'] = y_out_prob[0][:, -1]
        if test_step == 1:
            data[['id', 'click']].to_csv('Deep_FM_FTRL_v1.csv', mode='a', index=False, header=True)
        else:
            data[['id', 'click']].to_csv('Deep_FM_FTRL_v1.csv', mode='a', index=False, header=False)

        test_step += 1
        if test_step % print_every == 0:
            logging.info("Iteration {0} has finished".format(test_step))


if __name__ == '__main__':
    '''launching TensorBoard: tensorboard --logdir=path/to/log-directory'''
    # setting fields
    fields_train = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                    'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                    'app_id', 'app_category', 'device_model', 'device_type', 'device_id',
                    'device_conn_type', 'click']

    fields_test = ['hour', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
                   'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_domain',
                   'app_id', 'device_id', 'app_category', 'device_model', 'device_type',
                   'device_conn_type']
    # loading dicts
    fields_train_dict = {}
    for field in fields_train:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_train_dict[field] = pickle.load(f)
    fields_test_dict = {}
    for field in fields_test:
        with open('dicts/' + field + '.pkl', 'rb') as f:
            fields_test_dict[field] = pickle.load(f)

    # length of representation
    train_array_length = max(fields_train_dict['click'].values()) + 1
    test_array_length = train_array_length - 2
    # initialize the model
    config = {}
    config['lr'] = 0.01
    config['batch_size'] = 512
    config['reg_l1'] = 2e-3
    config['reg_l2'] = 0
    config['k'] = 40
    # get feature length
    feature_length = test_array_length
    # num of fields
    field_cnt = 21

    model = DeepFM(config)
    # build graph for model
    model.build_graph()

    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        # TODO: after every epoch, print training accuracy and validation accuracy
        sess.run(tf.global_variables_initializer())
        # restore trained parameters
        check_restore_parameters(sess, saver)
        print('start training...')
        train_model(sess, model, epochs=10, print_every=500)
        # print('start validation...')
        # validation_model(sess, model, print_every=100)
        # print('start testing...')
        # test_model(sess, model)
--------------------------------------------------------------------------------
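
A note on the Deep_FM pipeline: step2.py loads per-field value-to-count dictionaries from field2count/<field>.pkl, but step1.py only writes the sets/ pickles, so nothing in this repository produces them. Below is a minimal sketch of how they could be generated; the use of collections.Counter, the chunk size, and the field2count/ output directory mirror step1.py and step2.py but are assumptions rather than code from the original project.

import pickle
from collections import Counter

import pandas as pd

frequency_encoding_fields = ['C14', 'C17', 'C19', 'C21',
                             'site_id', 'site_domain', 'app_id', 'app_domain',
                             'device_model', 'device_id']

counters = {field: Counter() for field in frequency_encoding_fields}

# Count how often every value of each frequency-encoded field occurs, chunk by chunk.
train = pd.read_csv('/home/johnso/PycharmProjects/News_recommendation/CTR_prediction/avazu_CTR/train.csv',
                    chunksize=10000)
for data in train:
    for field in frequency_encoding_fields:
        counters[field].update(data[field].values)

# Save one value-to-count dict per field; step2.py maps values with count < 10 to a shared "rare" index.
for field in frequency_encoding_fields:
    with open('field2count/' + field + '.pkl', 'wb') as f:
        pickle.dump(dict(counters[field]), f)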