├── README.md ├── rerank_paper.tar.gz └── rerank_paper ├── .DS_Store ├── DCN_model ├── __pycache__ │ ├── config.cpython-37.pyc │ ├── data_input.cpython-37.pyc │ ├── layers.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── tools.cpython-37.pyc │ └── util.cpython-37.pyc ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── config.pyc ├── data_input.py ├── data_input.pyc ├── evaluate.py ├── layers.py ├── layers.pyc ├── main.py ├── model.py ├── model.pyc ├── model_load.py ├── restore.py ├── run.sh ├── tools.py ├── tools.pyc ├── util.py └── util.pyc ├── DNN_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── extr_model ├── __pycache__ │ ├── config.cpython-37.pyc │ ├── data_input.cpython-37.pyc │ ├── layers.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── tools.cpython-37.pyc │ └── util.cpython-37.pyc ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── kuaishou_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_whole_framework ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_without_oam_atten ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_without_page_atten ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py └── prm_model ├── avg_std ├── delivery ├── poi └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py /README.md: -------------------------------------------------------------------------------- 1 | # PIER_code 2 |
PIER code contains the DNN, DCN, PRM, EXTR, KuaiShou re-ranking, and PIER models 3 |
Run Env: Python 3 + TensorFlow 1.5 4 |
Run code: sh run.sh (from inside the corresponding model directory) 5 |
Dataset: https://drive.google.com/drive/folders/1BRkP9YPiU1bdviLjo3jrYdXNTqrJAvJM?usp=share_link 6 | -------------------------------------------------------------------------------- /rerank_paper.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper.tar.gz -------------------------------------------------------------------------------- /rerank_paper/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/.DS_Store -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/data_input.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/data_input.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/layers.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 
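Note: the two rows in each avg_std file are the per-feature mean (row 1) and standard deviation (row 2); config.py points at these files through MEAN_VAR_PATH_POI and MEAN_VAR_PATH_DELIVERY. Below is a minimal loading/normalization sketch under that reading — load_avg_std and normalize are illustrative names, not helpers from this repo, and the actual consumption inside data_input.py may differ:

import numpy as np

def load_avg_std(path):
    # Row 1 = per-feature means, row 2 = per-feature standard deviations.
    with open(path) as f:
        rows = [list(map(float, line.split())) for line in f if line.strip()]
    return np.array(rows[0]), np.array(rows[1])

def normalize(dense_feats, mean, std, eps=1e-8):
    # Z-score the dense features; eps guards against a zero entry in the std row.
    return (dense_feats - mean) / (std + eps)

mean, std = load_avg_std("./avg_std/poi")
x = np.array([[1359., 30.146147, 26., 5., 4.85]])  # one POI's dense features
print(normalize(x, mean, std))

The five poi columns line up with FEATURE_DENSE_NUM = 5 in config.py and with the screen_dense_feature example values used in model_load.py.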
-------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/DCN_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "dcn_pointwise_model_v3" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | CROSS_LAYERS = [56, 56, 56] 41 | CROSS_LAYERS_NUM = 3 42 | 43 | PLACE_HOLDER_NUM = 11 44 | DENSE_FEAT_NUM = 439 45 | 46 | 47 | # embedding_look_up维度 48 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 49 | 50 | # 网络结构参数 51 | MODEL_PARAMS = { 52 | 
'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 53 | 'INPUT_TENSOR_LAYERS_B': [64, 32], 54 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 56 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 57 | } 58 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 1 59 | 60 | DIN_CONF = {} 61 | 62 | # train data 63 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 64 | if DATA_MODE == 1: 65 | # TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 66 | TRAIN_FILE = ['/users/lemonace/Downloads/docker_data/part-r-00018'] 67 | VALID_FILE = TRAIN_FILE 68 | PREDICT_FILE = VALID_FILE 69 | TEST_FILE = PREDICT_FILE 70 | elif DATA_MODE == 2: 71 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 72 | VALID_FILE = TRAIN_FILE 73 | TEST_FILE = VALID_FILE 74 | elif DATA_MODE == 3: 75 | TRAIN_FILE = [ 76 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 77 | VALID_FILE = [ 78 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 79 | TEST_FILE = [ 80 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 81 | elif DATA_MODE == 4: 82 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 83 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 84 | TRAIN_LIST = ["20220123"] 85 | VALID_LIST = ["20220124"] 86 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 87 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 88 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 89 | 90 | # 辅助脚本 91 | MEAN_VAR_PATH_POI = "./avg_std/poi" 92 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 93 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 94 | MODEL_SAVE_PB_EPOCH_ON = False 95 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 96 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/config.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/data_input.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/data_input.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = 
tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/layers.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | 
pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, 66 | ctr), np.sum( 67 | pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr) 68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp)) 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 | estimator, nn_model = create_estimator() 74 | 75 | with tick_tock("DATA_INPUT") as _: 76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1) 77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1) 78 | 79 | if TRAIN_MODE == 1: 80 | for i in range(EPOCH): 81 | for idx, data in enumerate(TRAIN_FILE): 82 | with tick_tock("DATA_INPUT") as _: 83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 84 | with tick_tock("TRAIN") as _: 85 | estimator.train(train_input_fn) 86 | if MODEL_SAVE_PB_EPOCH_ON: 87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 89 | ep_insert_index = i * len(TRAIN_FILE) + idx 90 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 91 | while os.path.exists(target_dir): 92 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 93 | shutil.move(export_dir, target_dir) 94 | print(time.strftime("%m-%d %H:%M:%S ", 95 | time.localtime(time.time())) + "export model PB: " + target_dir) 96 | # with tick_tock("PREDICT") as _: 97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 98 | # calculate_result(result_generator) 99 | 100 | 101 | 102 | elif TRAIN_MODE == 2: 103 | with tick_tock("PREDICT") as _: 104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 105 | calculate_result(result_generator) 106 | 107 | elif TRAIN_MODE == 3: 108 | for i in range(EPOCH): 109 | for idx, data in enumerate(TRAIN_FILE): 110 | with tick_tock("DATA_INPUT") as _: 111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 112 | with tick_tock("TRAIN") as _: 113 | estimator.train(train_input_fn) 114 | with tick_tock("PREDICT") as _: 115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 116 | print("valid_data") 117 | calculate_result(result_generator) 118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 119 | print("train_data") 120 | # calculate_result(result_generator) 121 | # save pb 122 | 123 | 124 | elif TRAIN_MODE == 4: 125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 127 | ep_insert_index = 0 128 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 129 | while os.path.exists(target_dir): 130 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 131 | shutil.move(export_dir, target_dir) 132 | print(time.strftime("%m-%d %H:%M:%S ", 133 | time.localtime(time.time())) + "export model PB: " + target_dir) 134 | 135 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/model.pyc 
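The calculate_result helper above flattens each batch, keeps only the slots where mask == 1, and reports four numbers: the re-ranker's CTR AUC (ctr_auc), the AUC of the upstream pCTR carried in cxr_feature (ctr_auc_jp), and the two calibration biases ctr_cb = sum(pred) / sum(label) and ctr_cb_jp for the upstream scores. The following is a self-contained toy re-computation under those definitions — the arrays are invented stand-ins for one batch of generator output, not real data:

import numpy as np
from sklearn import metrics

mask      = np.array([[1, 1, 1, 0, 0]])               # valid slots out of POI_NUM = 5
ctr_label = np.array([[1, 0, 0, 0, 0]])               # click labels
ctr_out   = np.array([[0.60, 0.20, 0.10, 0.0, 0.0]])  # re-ranker predictions
base_ctr  = np.array([[0.30, 0.40, 0.10, 0.0, 0.0]])  # upstream pCTR (cxr_feature[..., 0])

idx = np.where(mask.reshape(-1) == 1)
y  = ctr_label.reshape(-1)[idx]
p  = ctr_out.reshape(-1)[idx]
jp = base_ctr.reshape(-1)[idx]

ctr_auc    = metrics.roc_auc_score(y, p)   # ranking quality of the re-ranker
ctr_auc_jp = metrics.roc_auc_score(y, jp)  # ranking quality of the upstream scores
ctr_cb     = p.sum() / y.sum()             # calibration bias: predicted clicks / actual clicks
ctr_cb_jp  = jp.sum() / y.sum()
print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))

A calibration bias above 1.0 means the model over-predicts clicks on the masked-in slots; an AUC near 0.5 means the scores barely order clicked items above unclicked ones.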
-------------------------------------------------------------------------------- /rerank_paper/DCN_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/DCN_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/DCN_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! -d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... 
time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | 87 | if __name__ == "__main__": 88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 89 | print(feature_mask_list) 90 | print(len(feature_mask_list)) 91 | print(parse_feature_mask_flag) 92 | print(feature_hold_cnt) 93 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/tools.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" 
+ tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/util.pyc -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/DNN_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "dnn_pointwise_model_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 51 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 1 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 64 | VALID_FILE = TRAIN_FILE 65 | PREDICT_FILE = VALID_FILE 66 | TEST_FILE = PREDICT_FILE 67 | elif DATA_MODE == 2: 68 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 69 | VALID_FILE = TRAIN_FILE 70 | TEST_FILE = VALID_FILE 71 | elif DATA_MODE == 3: 72 | TRAIN_FILE = [ 73 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 78 | elif 
DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && 
bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, 66 | ctr), np.sum( 67 | pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr) 68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp)) 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 | estimator, nn_model = create_estimator() 74 | 75 | with tick_tock("DATA_INPUT") as _: 76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1) 77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1) 78 | 79 | if TRAIN_MODE == 1: 80 | for i in range(EPOCH): 81 | for idx, data in enumerate(TRAIN_FILE): 82 | with tick_tock("DATA_INPUT") as _: 83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 84 | with tick_tock("TRAIN") as _: 85 | estimator.train(train_input_fn) 86 | if MODEL_SAVE_PB_EPOCH_ON: 87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 89 | ep_insert_index = i * len(TRAIN_FILE) + idx 90 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 91 | while os.path.exists(target_dir): 92 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 93 | shutil.move(export_dir, target_dir) 94 | print(time.strftime("%m-%d %H:%M:%S ", 95 | time.localtime(time.time())) + "export model PB: " + target_dir) 96 | # with tick_tock("PREDICT") as _: 97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 98 | # calculate_result(result_generator) 99 | 100 | 101 | 102 | elif TRAIN_MODE == 2: 103 | with tick_tock("PREDICT") as _: 104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 105 | calculate_result(result_generator) 106 | 107 | elif TRAIN_MODE == 3: 108 | for i in range(EPOCH): 109 | for idx, data in enumerate(TRAIN_FILE): 110 | with tick_tock("DATA_INPUT") as _: 111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 112 | with tick_tock("TRAIN") as _: 113 | estimator.train(train_input_fn) 114 | with tick_tock("PREDICT") as _: 115 | result_generator = 
estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 116 | print("valid_data") 117 | calculate_result(result_generator) 118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 119 | print("train_data") 120 | # calculate_result(result_generator) 121 | # save pb 122 | 123 | 124 | elif TRAIN_MODE == 4: 125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 127 | ep_insert_index = 0 128 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 129 | while os.path.exists(target_dir): 130 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 131 | shutil.move(export_dir, target_dir) 132 | print(time.strftime("%m-%d %H:%M:%S ", 133 | time.localtime(time.time())) + "export model PB: " + target_dir) 134 | 135 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/DNN_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DNN_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/DNN_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! 
-d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | 87 | if __name__ == "__main__": 88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 89 | print(feature_mask_list) 90 | print(len(feature_mask_list)) 91 | print(parse_feature_mask_flag) 92 | print(feature_hold_cnt) 93 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/data_input.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/data_input.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/layers.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/util.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/extr_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "extr_model_v3" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 3 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 5 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 51 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | # TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 64 | TRAIN_FILE = ['/Users/lemonace/Downloads/docker_data/part-r-00049'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = [ 74 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 75 | VALID_FILE = [ 76 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 77 | TEST_FILE = [ 78 | 
"/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 79 | elif DATA_MODE == 4: 80 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 81 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 82 | TRAIN_LIST = ["20220123"] 83 | VALID_LIST = ["20220124"] 84 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 85 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 86 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 87 | 88 | # 辅助脚本 89 | MEAN_VAR_PATH_POI = "./avg_std/poi" 90 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 91 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 92 | MODEL_SAVE_PB_EPOCH_ON = False 93 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 94 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, 
66 | ctr_auc_jp = metrics.roc_auc_score(y_ctr, ctr)
67 | ctr_cb, ctr_cb_jp = np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 | estimator, nn_model = create_estimator()
74 | 
75 | with tick_tock("DATA_INPUT") as _:
76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 | if TRAIN_MODE == 1:
80 | for i in range(EPOCH):
81 | for idx, data in enumerate(TRAIN_FILE):
82 | with tick_tock("DATA_INPUT") as _:
83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 | with tick_tock("TRAIN") as _:
85 | estimator.train(train_input_fn)
86 | if MODEL_SAVE_PB_EPOCH_ON:
87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 | ep_insert_index = i * len(TRAIN_FILE) + idx
90 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 | while os.path.exists(target_dir):
92 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 | shutil.move(export_dir, target_dir)
94 | print(time.strftime("%m-%d %H:%M:%S ",
95 | time.localtime(time.time())) + "export model PB: " + target_dir)
96 | # with tick_tock("PREDICT") as _:
97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 | # calculate_result(result_generator)
99 | 
100 | 
101 | 
102 | elif TRAIN_MODE == 2:
103 | with tick_tock("PREDICT") as _:
104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 | calculate_result(result_generator)
106 | 
107 | elif TRAIN_MODE == 3:
108 | for i in range(EPOCH):
109 | for idx, data in enumerate(TRAIN_FILE):
110 | with tick_tock("DATA_INPUT") as _:
111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 | with tick_tock("TRAIN") as _:
113 | estimator.train(train_input_fn)
114 | with tick_tock("PREDICT") as _:
115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 | print("valid_data")
117 | calculate_result(result_generator)
118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 | print("train_data")
120 | # calculate_result(result_generator)
121 | # save pb
122 | 
123 | 
124 | elif TRAIN_MODE == 4:
125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 | ep_insert_index = 0
128 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 | while os.path.exists(target_dir):
130 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 | shutil.move(export_dir, target_dir)
132 | print(time.strftime("%m-%d %H:%M:%S ",
133 | time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
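The four numbers printed by calculate_result above are easy to misread, so here is a minimal, self-contained sketch of what they measure (my reading of the code: the "_jp" variants score the upstream pCTR carried in cxr_feature[..., 0] as a baseline, and "_cb" is the calibration ratio sum(pred) / sum(label), which is 1.0 for a perfectly calibrated model):

import numpy as np
from sklearn import metrics

y_ctr = np.array([1, 0, 0, 1, 0, 1])           # click labels on unmasked slots
pred_ctr = np.array([.8, .2, .3, .7, .1, .6])  # reranker outputs (ctr_out)
base_ctr = np.array([.6, .4, .2, .5, .3, .4])  # upstream pCTR feature

print(metrics.roc_auc_score(y_ctr, pred_ctr))  # ctr_auc
print(metrics.roc_auc_score(y_ctr, base_ctr))  # ctr_auc_jp
print(pred_ctr.sum() / y_ctr.sum())            # ctr_cb
print(base_ctr.sum() / y_ctr.sum())            # ctr_cb_jp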
-------------------------------------------------------------------------------- /rerank_paper/extr_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/extr_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/extr_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | 
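tools.py below reads an optional feature_mask file whose layout is only implied by FeatureInfo: each non-empty line is "<feature_name> <feature_size> <feature_mask>", whitespace-separated. A hypothetical file (not shipped with the repo) and what parse_mask_file would return for it, assuming my reading of the expansion loop is right:

price 1 1
category 6 1
debug_id 2 0

feature_mask_list -> [1, 1, 1, 1, 1, 1, 1, 0, 0]   # feature_size copies of each mask
feature_hold_cnt  -> 7                             # count of non-zero slots kept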
-------------------------------------------------------------------------------- /rerank_paper/extr_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | 
87 | if __name__ == "__main__":
88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 | print(feature_mask_list)
90 | print(len(feature_mask_list))
91 | print(parse_feature_mask_flag)
92 | print(feature_hold_cnt)
93 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 | # [[0, 2], [1, 3], [2, 1]]]
6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 | rank = len(index_matrix.get_shape())
8 | if rank == 2:
9 | replicated_first_indices = tf.tile(
10 | tf.expand_dims(replicated_first_indices, axis=1),
11 | [1, tf.shape(index_matrix)[1]])
12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 | return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 | return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 | return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 | wc = 10 ** order
26 | return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 | tensor = float_to_string_with_key(tensor, key, precision)
30 | tensor =
string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 1 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 2 15 | MODEL_NAME = "kuaishou_pointwise_model_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 51 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 3 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 64 | VALID_FILE = TRAIN_FILE 65 | PREDICT_FILE = VALID_FILE 66 | TEST_FILE = PREDICT_FILE 67 | elif DATA_MODE == 2: 68 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 69 | VALID_FILE = TRAIN_FILE 70 | TEST_FILE = VALID_FILE 71 | elif DATA_MODE == 3: 72 | TRAIN_FILE = [ 73 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 78 | 
elif DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr 
cvr gmv predictions && bid
34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
35 | name='screen_predict_feature'),
36 | # dense features (price, rating)
37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
38 | name='screen_dense_feature'),
39 | # categorical features (category ids)
40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
41 | name='screen_cate_feature'),
42 | # environment features (whether a platinum ad is present)
43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
44 | name='dense_feature')
45 | }
46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
47 | 
48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
49 | serving_input_receiver_fn=_serving_input_receiver_fn)
50 | estimator._params.pop('save_model')
51 | return export_dir.decode()
52 | 
53 | 
54 | def calculate_result(result_generator):
55 | y_ctr, pred_ctr, ctr = [], [], []
56 | for result in result_generator:
57 | cxr_feature = result['cxr_feature']
58 | mask = result['mask']
59 | # ctr_label
60 | idx = np.where(mask.reshape(-1) == 1)
61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist()
64 | 
65 | ctr_auc = metrics.roc_auc_score(y_ctr, pred_ctr)
66 | ctr_auc_jp = metrics.roc_auc_score(y_ctr, ctr)
67 | ctr_cb, ctr_cb_jp = np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 | estimator, nn_model = create_estimator()
74 | 
75 | with tick_tock("DATA_INPUT") as _:
76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 | if TRAIN_MODE == 1:
80 | for i in range(EPOCH):
81 | for idx, data in enumerate(TRAIN_FILE):
82 | with tick_tock("DATA_INPUT") as _:
83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 | with tick_tock("TRAIN") as _:
85 | estimator.train(train_input_fn)
86 | if MODEL_SAVE_PB_EPOCH_ON:
87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 | ep_insert_index = i * len(TRAIN_FILE) + idx
90 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 | while os.path.exists(target_dir):
92 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 | shutil.move(export_dir, target_dir)
94 | print(time.strftime("%m-%d %H:%M:%S ",
95 | time.localtime(time.time())) + "export model PB: " + target_dir)
96 | # with tick_tock("PREDICT") as _:
97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 | # calculate_result(result_generator)
99 | 
100 | 
101 | 
102 | elif TRAIN_MODE == 2:
103 | with tick_tock("PREDICT") as _:
104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 | calculate_result(result_generator)
106 | 
107 | elif TRAIN_MODE == 3:
108 | for i in range(EPOCH):
109 | for idx, data in enumerate(TRAIN_FILE):
110 | with tick_tock("DATA_INPUT") as _:
111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 | with tick_tock("TRAIN") as _:
113 | estimator.train(train_input_fn)
114 | with tick_tock("PREDICT") as _:
115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 | print("valid_data")
117 | calculate_result(result_generator)
118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 | print("train_data")
120 | # calculate_result(result_generator)
121 | # save pb
122 | 
123 | 
124 | elif TRAIN_MODE == 4:
125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 | ep_insert_index = 0
128 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 | while os.path.exists(target_dir):
130 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 | shutil.move(export_dir, target_dir)
132 | print(time.strftime("%m-%d %H:%M:%S ",
133 | time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
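export_saved_model in the listing above writes each PB into a fresh timestamped directory, which main.py then renames to a stable epN slot. A minimal standalone sketch of that rotation logic (the ep_insert_index += 1 inside the loop is what guarantees termination once epN already exists):

import os
import shutil

def rotate_export(export_dir, ep_insert_index=0):
    # find the first free "epN" sibling directory and move the export there
    target_dir = export_dir + "/../ep" + str(ep_insert_index)
    while os.path.exists(target_dir):
        ep_insert_index += 1
        target_dir = export_dir + "/../ep" + str(ep_insert_index)
    shutil.move(export_dir, target_dir)
    return target_dir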
-------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model()
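The hard-coded feed in model_load.py above looks sized for a different configuration (4 slots with 4 predicted values each, and a 2-float dense_feature), while with this folder's config.py the signature exported by main.py expects [batch, POI_NUM, FEATURE_CXR_NUM] = [batch, 5, 3] for screen_predict_feature and [batch, DENSE_FEAT_NUM] for dense_feature. A small hypothetical guard that validates a feed against the configured shapes before calling predict_fn:

import numpy as np
from config import POI_NUM, FEATURE_CXR_NUM, FEATURE_DENSE_NUM, FEATURE_CATE_NUM, DENSE_FEAT_NUM

EXPECTED_SHAPES = {
    'screen_predict_feature': (POI_NUM, FEATURE_CXR_NUM),
    'screen_dense_feature': (POI_NUM, FEATURE_DENSE_NUM),
    'screen_cate_feature': (POI_NUM, FEATURE_CATE_NUM),
    'dense_feature': (DENSE_FEAT_NUM,),
}

def check_feed(feed):
    for name, want in EXPECTED_SHAPES.items():
        got = np.asarray(feed[name]).shape[1:]  # drop the batch dimension
        assert got == want, "{}: got {}, expected {}".format(name, got, want)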
-------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/kuaishou_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | 
87 | if __name__ == "__main__":
88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 | print(feature_mask_list)
90 | print(len(feature_mask_list))
91 | print(parse_feature_mask_flag)
92 | print(feature_hold_cnt)
93 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/util.py:
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_listwise_model_v4" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 2 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= 
["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock,allPermutation 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # FEATURE_CATE_NUM:品类相关特征 65 | # FEATURE_DENSE_NUM:连续值特征 66 | # FEATURE_CXR_NUM:模型预估值特征 67 | 68 | full_permuation_index = allPermutation(POI_NUM) 69 | 70 | # current page 71 | current_page_start = 0 72 | current_page_end = current_page_start + POI_NUM * FEATURE_NUM 73 | 74 | pre_page_start = current_page_end 75 | pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 76 | 77 | 78 | cur_page_features = tf.reshape(tf.gather(feature_buffer,list(range(current_page_start,current_page_end)),axis=1), 
[-1, POI_NUM, FEATURE_NUM]) 79 | pre_page_features = tf.reshape(tf.gather(feature_buffer,list(range(pre_page_start,pre_page_end)),axis=1), [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE]) 80 | 81 | position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2) 82 | adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2) 83 | obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2) 84 | hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2) 85 | locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2) 86 | categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2) 87 | price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2) 88 | iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2) 89 | userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2) 90 | 91 | pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3) 92 | pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3) 93 | pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3) 94 | pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3) 95 | pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3) 96 | pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3) 97 | pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3) 98 | pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3) 99 | pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3) 100 | 101 | features_result = { 102 | 103 | 'dense_feature': hist_ctr_fea, 104 | # 离散特征(品类) 105 | 'cate_feature': tf.cast(tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea], axis=2), tf.int64), 106 | # ctr_label 107 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 108 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 109 | 'page_mask':tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 110 | 'behavior_dense_feature': pre_hist_ctr_fea, 111 | # 离散特征(品类) 112 | 'behavior_cate_feature': tf.cast(tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea, 113 | pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea],axis=3), tf.int64), 114 | 115 | 'full_permuation_index': tf.constant(full_permuation_index,tf.int32) 116 | 117 | } 118 | return features_result 119 | 120 | 121 | # num_parallel 表示cpu的核数,用于控制 map的并行度 122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4): 123 | def input_fn(): 124 | _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn() 125 | files = tf.data.Dataset.list_files(file_names) 126 | # print(files) 127 | dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10)) 128 | dataset = dataset.prefetch(buffer_size=batch_size * 10) 129 | dataset = dataset.repeat(epoch) 130 | dataset = dataset.batch(batch_size) 131 | dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel) 132 | iterator = dataset.make_one_shot_iterator() 133 | return iterator.get_next() 134 | 135 | return input_fn 136 | 137 | 138 | # 从hive表统计得到均值和方差文件 139 | def get_normalization_parameter(mean_var_path): 140 | with tf.gfile.Open(mean_var_path) as f: 141 | fea_mean = f.readline().strip().split(' ') 142 | fea_var = f.readline().strip().split(' ') 143 | cont_fea_mean = list(map(float, 
fea_mean)) 144 | cont_fea_var = list(map(float, fea_var)) 145 | f.close() 146 | return cont_fea_mean, cont_fea_var 147 | 148 | 149 | def get_bias_weight_parameter(bias_weight_path): 150 | with tf.gfile.Open(bias_weight_path) as f2: 151 | fea_mean = f2.readline().strip().split('\t') 152 | cont_fea_mean = list(map(float, fea_mean)) 153 | f2.close() 154 | return cont_fea_mean 155 | 156 | 157 | 158 | if __name__ == '__main__': 159 | train_file = TRAIN_FILE 160 | # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"] 161 | train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=16, epoch=1) 162 | features, labels = train_input_fn() 163 | 164 | sess = tf.Session() 165 | try: 166 | with tick_tock("DATA_INPUT") as _: 167 | features_np, labels_np = sess.run([features, labels]) 168 | 169 | print("*" * 100, "features_np") 170 | for key in features_np: 171 | print("=" * 50, key, np.shape(features_np[key])) 172 | print(features_np[key]) 173 | 174 | 175 | print("*" * 100, "labels_np") 176 | for key in labels_np: 177 | print("=" * 50, key, np.shape(labels_np[key])) 178 | print(labels_np[key]) 179 | 180 | except Exception as e: 181 | print(e) 182 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | def create_estimator(): 7 | tf.logging.set_verbosity(tf.logging.INFO) 8 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 9 | session_config.gpu_options.allow_growth = True 10 | config = tf.estimator.RunConfig( 11 | tf_random_seed=RANDOM_SEED, 12 | save_summary_steps=100, 13 | save_checkpoints_steps=1000, 14 | model_dir=MODEL_SAVE_PATH, 15 | keep_checkpoint_max=2, 16 | log_step_count_steps=1000, 17 | session_config=session_config) 18 | nn_model = DNN() 
19 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 | return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 | estimator._params['save_model'] = params['save_model']
25 | 
26 | def _serving_input_receiver_fn():
27 | # env_feature => dense_feature
28 | # cxr_feature => screen_predict_feature
29 | # cat_feature => screen_cate_feature
30 | # dense_feature => screen_dense_feature
31 | receiver_tensors = {
32 | # ctr cvr gmv predictions && bid
33 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 | name='screen_predict_feature'),
35 | # dense features (price, rating)
36 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 | name='screen_dense_feature'),
38 | # categorical features (category ids)
39 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 | name='screen_cate_feature'),
41 | # environment features (whether a platinum ad is present)
42 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 | name='dense_feature')
44 | }
45 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 | serving_input_receiver_fn=_serving_input_receiver_fn)
49 | estimator._params.pop('save_model')
50 | return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 | y_ctr, pred_ctr, ctr = [], [], []
55 | for result in result_generator:
56 | cxr_feature = result['cxr_feature']
57 | mask = result['mask']
58 | # ctr_label
59 | idx = np.where(mask.reshape(-1) == 1)
60 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 | ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
67 | if __name__ == '__main__':
68 | 
69 | estimator, nn_model = create_estimator()
70 | 
71 | with tick_tock("DATA_INPUT") as _:
72 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
73 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
74 | 
75 | if TRAIN_MODE == 1:
76 | for i in range(EPOCH):
77 | for idx, data in enumerate(TRAIN_FILE):
78 | with tick_tock("DATA_INPUT") as _:
79 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
80 | with tick_tock("TRAIN") as _:
81 | estimator.train(train_input_fn)
82 | if MODEL_SAVE_PB_EPOCH_ON:
83 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
84 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
85 | ep_insert_index = i * len(TRAIN_FILE) + idx
86 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
87 | while os.path.exists(target_dir):
88 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
89 | shutil.move(export_dir, target_dir)
90 | print(time.strftime("%m-%d %H:%M:%S ",
91 | time.localtime(time.time())) + "export model PB: " + target_dir)
92 | #with tick_tock("PREDICT") as _:
93 | #result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
94 | #calculate_result(result_generator)
95 | 
96 | 
97 | 
98 | elif TRAIN_MODE == 2:
99 | with tick_tock("PREDICT") as _:
100 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
101 | calculate_result(result_generator)
102 | 
103 | elif TRAIN_MODE == 3:
104 | for i in range(EPOCH):
105 | for idx, data in enumerate(TRAIN_FILE):
106 | with tick_tock("DATA_INPUT") as _:
107 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
108 | with tick_tock("TRAIN") as _:
109 | estimator.train(train_input_fn)
110 | with tick_tock("PREDICT") as _:
111 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
112 | print("valid_data")
113 | calculate_result(result_generator)
114 | #result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
115 | print("train_data")
116 | #calculate_result(result_generator)
117 | # save pb
118 | 
119 | 
120 | elif TRAIN_MODE == 4:
121 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
122 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
123 | ep_insert_index = 0
124 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
125 | while os.path.exists(target_dir):
126 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
127 | shutil.move(export_dir, target_dir)
128 | print(time.strftime("%m-%d %H:%M:%S ",
129 | time.localtime(time.time())) + "export model PB: " + target_dir)
130 | 
131 | 
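PIER scores whole arrangements rather than single slots: data_input.py attaches full_permuation_index = allPermutation(POI_NUM), all 5! = 120 orderings of the page (PERMUATION_SIZE in config.py). A minimal numpy sketch of how such an index fans per-slot features out into per-permutation candidates (names are mine; note tools.allPermutation is 1-based, so indexing real model tensors needs an extra -1):

import itertools
import numpy as np

POI_NUM, FEAT = 5, 9
perms = np.array(list(itertools.permutations(range(POI_NUM))))  # [120, 5], 0-based here
page = np.random.rand(POI_NUM, FEAT)                            # one page's slot features
candidates = page[perms]                                        # [120, 5, FEAT]
# a scorer can now rate all 120 arrangements in one batched forward pass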
-------------------------------------------------------------------------------- /rerank_paper/pier_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model/run.sh:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | import itertools
87 | 
88 | 
89 | # Use itertools.permutations to enumerate every ordering of a given arrangement
90 | def allPermutation(n):
91 | permutation = []
92 | # first initialise the arrangement 1..n
93 | for i in range(n):
94 | permutation.append(i+1)
95 | # itertools.permutations returns an iterator, so convert it to a list
96 | # each permutation is stored as a tuple
97 | 
all_permutation = list(itertools.permutations(permutation)) 98 | return all_permutation 99 | 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 106 | # print(feature_mask_list) 107 | # print(len(feature_mask_list)) 108 | # print(parse_feature_mask_flag) 109 | # print(feature_hold_cnt) 110 | print(allPermutation(5)) 111 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! 
-d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.001 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_listwise_whole_v4" 16 | 17 | USE_CONSTRATIVE_LOSS = True 18 | CONSTRATIVE_LOSS_K = 0.01 19 | 20 | # poi类别特征 21 | FEATURE_CATE_NUM = 7 # v1r3:19 22 | # dense特征 23 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 24 | # 预估值特征 25 | FEATURE_CXR_NUM = 3 26 | # 环境特征 27 | FEATURE_ENV_NUM = 2 28 | # 自然poi 29 | FEATURE_NATURE_POI = 25 30 | 31 | # embedding_look_up维度 32 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 33 | 34 | # hash vector 35 | HASH_VECTOR = [[0.4011729380078948, 0.9907933488085712, 0.889170658914898, 0.015553701343792192, 0.2269008585804, 0.889609750280199, 0.19280345796299014, 0.33362195188257815], 36 | [0.1174580997476552, 0.4475570392889796, 0.9925939893135071, 0.7296640075216434, 0.6436313332140967, 0.8568068597644793, 0.9018604021483339, 0.28774343258170776], 37 | [0.4374993384759095, 0.6807149381274915, 0.20502126763268802, 0.6968687323602859, 0.6449029002244834, 0.08732080642447282, 0.6119476780855001, 0.8616788453789646], 38 | [0.19344266090314144, 0.4268233179146762, 0.10951597767480326, 0.9867838283258178, 0.8340011944969644, 0.7992329879482085, 0.005303560724105649, 0.9662924610057512], 39 | [0.5824282763301396, 0.5090708710943849, 0.4462703076663568, 0.5482943153972023, 0.06782829736851825, 0.08907408658464577, 0.13400486343251583, 0.1848228429919272]] 40 | 41 | POSITION_ENCODING = [[0.08790239841717873, 0.033267486152506076, 0.5495130189114207, 0.17419777583517537, 0.6838981992197484, 0.07935154925635501, 0.02665372302227631, 0.4181008411574786], 42 | [0.02038159041970189, 0.9310485215006198, 0.723661313625571, 
0.5110323516977285, 0.7812851438212606, 0.40722710713947474, 0.04646490014890503, 0.9565353323381218], 43 | [0.17279361698259843, 0.12968275664201512, 0.3183103529758954, 0.6360591081256931, 0.1558507653689548, 0.5972802646455662, 0.4380619835390329, 0.6088094249662641], 44 | [0.038262298606345335, 0.9999545626217287, 0.7113776275017341, 0.5434618368150265, 0.6853527957705402, 0.9662653254145415, 0.9641592716989676, 0.3443813983264], 45 | [0.7903243938847678, 0.9952713339078417, 0.8741415264071601, 0.45665348276461737, 0.7693872696125916, 0.772509599868299, 0.2540369924156157, 0.24781240400239857]] 46 | 47 | 48 | TIME_AWARE_WEIGHT = [[1/2,1/2,1/3,1/3,1/4]] 49 | PERMUATION_SIZE = 120 50 | TOP_K = 20 51 | 52 | 53 | EXPOSE_RATE_FOR_BEAM_SEARCH = [[1.0,0.9,0.8,0.7,0.6]] 54 | 55 | # N: Cut Number of POI For Train 56 | POI_NUM = 5 57 | FEATURE_NUM = 9 58 | PAGE_NUM = 5 59 | FEATURE_NUM_FOR_PAGE = 11 60 | 61 | # 属性特征:KA AOR BRAND 62 | FEATURE_ATTR_NUM = 3 63 | 64 | # DELIVERY_FEAT 65 | DELIVERY_FEAT_NUM = 4 66 | 67 | # OUT NUM 68 | OUT_NUM = 1 69 | 70 | PLACE_HOLDER_NUM = 11 71 | DENSE_FEAT_NUM = 439 72 | 73 | 74 | # 网络结构参数 75 | MODEL_PARAMS = { 76 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 77 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 78 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 79 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 80 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 81 | } 82 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 83 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 3 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 84 | DIN_CONF = {} 85 | 86 | # train data 87 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 88 | if DATA_MODE == 1: 89 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 90 | VALID_FILE = TRAIN_FILE 91 | PREDICT_FILE = VALID_FILE 92 | TEST_FILE = PREDICT_FILE 93 | elif DATA_MODE == 2: 94 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 95 | VALID_FILE = TRAIN_FILE 96 | TEST_FILE = VALID_FILE 97 | elif DATA_MODE == 3: 98 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-000*1"] 99 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-000*1"] 100 | TEST_FILE= ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-000*1"] 101 | elif DATA_MODE == 4: 102 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 103 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 104 | TRAIN_LIST = ["20220123"] 105 | VALID_LIST = ["20220124"] 106 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 107 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 108 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 109 | 110 | # 辅助脚本 111 | MEAN_VAR_PATH_POI = "./avg_std/poi" 112 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 113 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 114 | MODEL_SAVE_PB_EPOCH_ON = False 115 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 116 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq 
BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_whole_framework/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! 
-d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_whole_framework/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | import math
7 | import random
8 | 
9 | class tick_tock:
10 |     def __init__(self, process_name, verbose=1):
11 |         self.process_name = process_name
12 |         self.verbose = verbose
13 | 
14 |     def __enter__(self):
15 |         if self.verbose:
16 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
17 |         self.begin_time = time.time()
18 | 
19 |     def __exit__(self, type, value, traceback):
20 |         if self.verbose:
21 |             end_time = time.time()
22 |             duration_seconds = end_time - self.begin_time
23 |             duration = str(datetime.timedelta(seconds=duration_seconds))
24 | 
25 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
26 | 
27 | 
28 | class FeatureInfo:
29 |     def __init__(self, feature_info_str):
30 |         self.feature_info_str = feature_info_str
31 | 
32 |         self.feature_name = "NonFeaName"
33 |         self.feature_size = 0
34 |         self.feature_mask = 1
35 |         self.parse_info_flag = False
36 |         self.part_num = 3
37 | 
38 |         self._parse_info()
39 | 
40 |     def _parse_info(self):
41 |         infoList = self.feature_info_str.split()
42 | 
43 |         if len(infoList) == self.part_num:
44 |             self.feature_name = infoList[0]
45 |             self.feature_size = int(infoList[1])
46 |             self.feature_mask = int(infoList[2])
47 |             self.parse_info_flag = True
48 | 
49 | 
50 | def parse_mask_file(feature_mask_file):
51 |     try:
52 |         if not os.path.exists(feature_mask_file):
53 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
54 |             return [], False, 0
55 |         # feature_name_list = []
56 |         feature_mask_list = []
57 |         feature_hold_cnt = 0
58 | 
59 |         with open(feature_mask_file) as f:
60 |             str_list = f.readlines()
61 | 
62 |         for i in range(0, len(str_list)):
63 |             str_list[i] = str_list[i].strip('\n').strip()
64 |             if str_list[i] == "":
65 |                 continue
66 | 
67 |             info = FeatureInfo(str_list[i])
68 |             if not info.parse_info_flag:
69 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
70 |                 parse_mask_flag = False
71 |                 return [], parse_mask_flag, 0
72 | 
73 |             for j in range(info.feature_size):
74 |                 feature_mask_list.append(info.feature_mask)
75 |                 if info.feature_mask != 0:
76 |                     feature_hold_cnt += 1
77 |             # if info.feature_size > 1:
78 |             #     feature_name_list.append(info.feature_name + "_" + str(j))
79 |             # else:
80 |             #     feature_name_list.append(info.feature_name)
81 | 
82 |         parse_mask_flag = True
83 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
84 |     except Exception as e:
85 |         print("parse_mask_file fail - Exception:", e)
86 |         return [], False, 0
87 | 
88 | import itertools
89 | 
90 | 
91 | # Enumerate all permutations of a given sequence using itertools.permutations
92 | def allPermutation(n):
93 |     permutation = []
94 |     # first initialise the base permutation 0..n-1
95 |     for i in range(n):
96 |         permutation.append(i)
97 |     # itertools.permutations returns a lazy iterator, so materialise it as a list;
98 |     # each permutation is stored as a tuple
99 |     all_permutation = list(itertools.permutations(permutation))
100 |     return all_permutation
101 | 
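# --- Hedged sketch (illustrative addition, not part of the original file): with
# POI_NUM = 5 and PERMUATION_SIZE = 120 in config.py, allPermutation enumerates
# every ordering of a 5-item page; 5! = 120, so the two constants are consistent.
def _demo_all_permutation():
    perms = allPermutation(5)           # 120 tuples, each a permutation of 0..4
    assert len(perms) == 120
    assert perms[0] == (0, 1, 2, 3, 4)  # itertools yields the identity ordering first
    return perms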
102 | 103 | def random_vector(): 104 | print([[random.random() for x in list(range(0,8))] for y in list(range(0,5))]) 105 | 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 111 | # print(feature_mask_list) 112 | # print(len(feature_mask_list)) 113 | # print(parse_feature_mask_flag) 114 | # print(feature_hold_cnt) 115 | # print(allPermutation(5)) 116 | random_vector() 117 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo 
${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_model_without_oam_atten_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 1 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = 
['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock,allPermutation 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # 
FEATURE_CATE_NUM: category-related features
65 |     # FEATURE_DENSE_NUM: continuous-valued features
66 |     # FEATURE_CXR_NUM: model-predicted-value features
67 | 
68 |     full_permuation_index = allPermutation(POI_NUM)
69 | 
70 |     # current page
71 |     current_page_start = 0
72 |     current_page_end = current_page_start + POI_NUM * FEATURE_NUM
73 | 
74 |     pre_page_start = current_page_end
75 |     pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
76 | 
77 | 
78 |     cur_page_features = tf.reshape(tf.gather(feature_buffer, list(range(current_page_start, current_page_end)), axis=1), [-1, POI_NUM, FEATURE_NUM])
79 |     pre_page_features = tf.reshape(tf.gather(feature_buffer, list(range(pre_page_start, pre_page_end)), axis=1), [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE])
80 | 
81 |     position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2)
82 |     adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2)
83 |     obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2)
84 |     hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2)
85 |     locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2)
86 |     categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2)
87 |     price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2)
88 |     iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2)
89 |     userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2)
90 | 
91 |     pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3)
92 |     pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3)
93 |     pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3)
94 |     pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3)
95 |     pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3)
96 |     pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3)
97 |     pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3)
98 |     pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3)
99 |     pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3)
100 | 
101 |     features_result = {
102 | 
103 |         'dense_feature': hist_ctr_fea,
104 |         # categorical features (category)
105 |         'cate_feature': tf.cast(tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea], axis=2), tf.int64),
106 |         # ctr_label
107 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
108 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
109 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
110 |         'behavior_dense_feature': pre_hist_ctr_fea,
111 |         # categorical features (category)
112 |         'behavior_cate_feature': tf.cast(tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea,
113 |                                                     pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea], axis=3), tf.int64),
114 | 
115 |         'full_permuation_index': tf.constant(full_permuation_index, tf.int32)
116 | 
117 |     }
118 |     return features_result
119 | 
120 | 
121 | # num_parallel is the number of CPU cores, controlling the parallelism of map
122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4):
123 |     def input_fn():
124 |         _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn()
125 |         files = tf.data.Dataset.list_files(file_names)
126 |         # print(files)
127 |         dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10))
128 |         dataset = dataset.prefetch(buffer_size=batch_size * 10)
129 |         dataset = 
dataset.repeat(epoch) 130 | dataset = dataset.batch(batch_size) 131 | dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel) 132 | iterator = dataset.make_one_shot_iterator() 133 | return iterator.get_next() 134 | 135 | return input_fn 136 | 137 | 138 | # 从hive表统计得到均值和方差文件 139 | def get_normalization_parameter(mean_var_path): 140 | with tf.gfile.Open(mean_var_path) as f: 141 | fea_mean = f.readline().strip().split(' ') 142 | fea_var = f.readline().strip().split(' ') 143 | cont_fea_mean = list(map(float, fea_mean)) 144 | cont_fea_var = list(map(float, fea_var)) 145 | f.close() 146 | return cont_fea_mean, cont_fea_var 147 | 148 | 149 | def get_bias_weight_parameter(bias_weight_path): 150 | with tf.gfile.Open(bias_weight_path) as f2: 151 | fea_mean = f2.readline().strip().split('\t') 152 | cont_fea_mean = list(map(float, fea_mean)) 153 | f2.close() 154 | return cont_fea_mean 155 | 156 | 157 | 158 | if __name__ == '__main__': 159 | train_file = TRAIN_FILE 160 | # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"] 161 | train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=16, epoch=1) 162 | features, labels = train_input_fn() 163 | 164 | sess = tf.Session() 165 | try: 166 | with tick_tock("DATA_INPUT") as _: 167 | features_np, labels_np = sess.run([features, labels]) 168 | 169 | print("*" * 100, "features_np") 170 | for key in features_np: 171 | print("=" * 50, key, np.shape(features_np[key])) 172 | print(features_np[key]) 173 | 174 | 175 | print("*" * 100, "labels_np") 176 | for key in labels_np: 177 | print("=" * 50, key, np.shape(labels_np[key])) 178 | print(labels_np[key]) 179 | 180 | except Exception as e: 181 | print(e) 182 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | 
from model import *
4 | from sklearn import metrics
5 | 
6 | def create_estimator():
7 |     tf.logging.set_verbosity(tf.logging.INFO)
8 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
9 |     session_config.gpu_options.allow_growth = True
10 |     config = tf.estimator.RunConfig(
11 |         tf_random_seed=RANDOM_SEED,
12 |         save_summary_steps=100,
13 |         save_checkpoints_steps=1000,
14 |         model_dir=MODEL_SAVE_PATH,
15 |         keep_checkpoint_max=2,
16 |         log_step_count_steps=1000,
17 |         session_config=session_config)
18 |     nn_model = DNN()
19 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 |     return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 |     estimator._params['save_model'] = params['save_model']
25 | 
26 |     def _serving_input_receiver_fn():
27 |         # env_feature   => dense_feature
28 |         # cxr_feature   => screen_predict_feature
29 |         # cat_feature   => screen_cate_feature
30 |         # dense_feature => screen_dense_feature
31 |         receiver_tensors = {
32 |             # predicted ctr / cvr / gmv values && bid
33 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 |                                                      name='screen_predict_feature'),
35 |             # dense features (price, rating)
36 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 |                                                    name='screen_dense_feature'),
38 |             # categorical features (category)
39 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 |                                                   name='screen_cate_feature'),
41 |             # environment features (whether a platinum ad is present)
42 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 |                                             name='dense_feature')
44 |         }
45 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
49 |     estimator._params.pop('save_model')
50 |     return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 |     y_ctr, pred_ctr, ctr = [], [], []
55 |     for result in result_generator:
56 |         cxr_feature = result['cxr_feature']
57 |         mask = result['mask']
58 |         # ctr_label
59 |         idx = np.where(mask.reshape(-1) == 1)
60 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
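# --- Hedged sketch (illustrative addition, not part of the original file): how
# calculate_result uses `mask` above -- predictions are flattened and only slots
# with mask == 1 (real POIs rather than padding) enter the AUC; toy shapes assumed.
def _demo_mask_selection():
    mask = np.array([[1., 1., 0.], [1., 0., 0.]])   # 2 requests, 3 slots each
    preds = np.array([[.9, .2, .5], [.7, .3, .1]])
    idx = np.where(mask.reshape(-1) == 1)           # keep the 3 unpadded slots
    return preds.reshape(-1)[idx].tolist()          # -> [0.9, 0.2, 0.7]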
+ "/../ep" + str(ep_insert_index) 87 | while os.path.exists(target_dir): 88 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 89 | shutil.move(export_dir, target_dir) 90 | print(time.strftime("%m-%d %H:%M:%S ", 91 | time.localtime(time.time())) + "export model PB: " + target_dir) 92 | #with tick_tock("PREDICT") as _: 93 | #result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 94 | #calculate_result(result_generator) 95 | 96 | 97 | 98 | elif TRAIN_MODE == 2: 99 | with tick_tock("PREDICT") as _: 100 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 101 | calculate_result(result_generator) 102 | 103 | elif TRAIN_MODE == 3: 104 | for i in range(EPOCH): 105 | for idx, data in enumerate(TRAIN_FILE): 106 | with tick_tock("DATA_INPUT") as _: 107 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 108 | with tick_tock("TRAIN") as _: 109 | estimator.train(train_input_fn) 110 | with tick_tock("PREDICT") as _: 111 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 112 | print("valid_data") 113 | calculate_result(result_generator) 114 | #result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 115 | print("train_data") 116 | #calculate_result(result_generator) 117 | # save pb 118 | 119 | 120 | elif TRAIN_MODE == 4: 121 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 122 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 123 | ep_insert_index = 0 124 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 125 | while os.path.exists(target_dir): 126 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 127 | shutil.move(export_dir, target_dir) 128 | print(time.strftime("%m-%d %H:%M:%S ", 129 | time.localtime(time.time())) + "export model PB: " + target_dir) 130 | 131 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', 
predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_without_oam_atten/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! -d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... 
time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | import itertools 87 | 88 | 89 | # 利用itertools库中的permutations函数,给定一个排列,输出他的全排列 90 | def allPermutation(n): 91 | permutation = [] 92 | # 首先需要初始化一个1-n的排列 93 | for i in range(n): 94 | permutation.append(i+1) 95 | # itertools.permutations返回的只是一个对象,需要将其转化成list 96 | # 每一种排列情况以元组类型存储 97 | all_permutation = list(itertools.permutations(permutation)) 98 | return all_permutation 99 | 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 106 | # print(feature_mask_list) 107 | # print(len(feature_mask_list)) 108 | # print(parse_feature_mask_flag) 109 | # print(feature_hold_cnt) 110 | print(allPermutation(5)) 111 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, 
key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_model_without_page_atten" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= 
["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | def create_estimator(): 7 | tf.logging.set_verbosity(tf.logging.INFO) 8 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 9 | session_config.gpu_options.allow_growth = True 10 | config = tf.estimator.RunConfig( 11 | tf_random_seed=RANDOM_SEED, 12 | save_summary_steps=100, 13 | save_checkpoints_steps=1000, 14 | model_dir=MODEL_SAVE_PATH, 15 | keep_checkpoint_max=2, 16 | log_step_count_steps=1000, 17 | session_config=session_config) 18 | nn_model = DNN() 19 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 20 | return estimator, nn_model 21 | 22 | 23 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 24 | estimator._params['save_model'] = params['save_model'] 25 | 26 | def _serving_input_receiver_fn(): 27 | # env_feature = > dense_feature 28 | # cxr_feature = > 
/rerank_paper/pier_model_without_page_atten/layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def DIN(seq, seq_len, target, conf, scope="DIN"):
4 |     # seq    BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H
5 |     # target BATCH_SIZE * 1 * FEAT_NUM : N * 1 * H (tiled across the sequence below)
6 |     # return : BATCH_SIZE * H
7 | 
8 |     with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
9 |         seq_shape = tf.shape(seq)
10 |         target = tf.tile(target, [1, seq_shape[1], 1])
11 | 
12 |         input = tf.concat([seq, target, seq - target, seq * target], axis=-1)
13 | 
14 |         layers = conf.get("layers", [64, 32])
15 |         for layer in layers:
16 |             input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_" + str(layer))
17 | 
18 |         input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1)  # N * M
19 | 
20 |         # Mask: padded positions get a large negative logit so softmax assigns them ~0 weight
21 |         seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1]))
22 |         # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100)
23 |         padding = tf.ones_like(input) * (-2 ** 32 + 1)
24 |         attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1)  # N * M
25 |         # attention = tf.Print(attention, [attention], message="attention", summarize=100)
26 |         attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]])
27 |         output = tf.reduce_sum(attention * seq, axis=1)  # sum over the sequence axis M -> N * H, matching the contract above
28 |         return output
29 | 
--------------------------------------------------------------------------------
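A minimal smoke test for the DIN block above (a sketch assuming TF 1.x, as elsewhere in this repo; the shapes and the conf dict are illustrative, not taken from the training pipeline):

import tensorflow as tf
from layers import DIN

seq = tf.random_normal([2, 4, 8])     # N=2 examples, M=4 sequence items, H=8 features
seq_len = tf.constant([[4], [2]])     # valid sequence length per example
target = tf.random_normal([2, 1, 8])  # one target item per example
out = DIN(seq, seq_len, target, conf={"layers": [16, 8]})
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(out).shape)        # -> (2, 8), i.e. N * H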
/rerank_paper/pier_model_without_page_atten/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from config import *
3 | from model import *
4 | from sklearn import metrics
5 | import numpy as np  # used in calculate_result
6 | def create_estimator():
7 |     tf.logging.set_verbosity(tf.logging.INFO)
8 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
9 |     session_config.gpu_options.allow_growth = True
10 |     config = tf.estimator.RunConfig(
11 |         tf_random_seed=RANDOM_SEED,
12 |         save_summary_steps=100,
13 |         save_checkpoints_steps=1000,
14 |         model_dir=MODEL_SAVE_PATH,
15 |         keep_checkpoint_max=2,
16 |         log_step_count_steps=1000,
17 |         session_config=session_config)
18 |     nn_model = DNN()
19 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 |     return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 |     estimator._params['save_model'] = params['save_model']
25 | 
26 |     def _serving_input_receiver_fn():
27 |         # env_feature   -> dense_feature
28 |         # cxr_feature   -> screen_predict_feature
29 |         # cat_feature   -> screen_cate_feature
30 |         # dense_feature -> screen_dense_feature
31 |         receiver_tensors = {
32 |             # predicted ctr/cvr/gmv && bid
33 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 |                                                      name='screen_predict_feature'),
35 |             # dense features (price, rating)
36 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 |                                                    name='screen_dense_feature'),
38 |             # categorical features (category ids)
39 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 |                                                   name='screen_cate_feature'),
41 |             # context features (e.g. whether a platinum ad is present)
42 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 |                                             name='dense_feature')
44 |         }
45 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
49 |     estimator._params.pop('save_model')
50 |     return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 |     y_ctr, pred_ctr, ctr = [], [], []
55 |     for result in result_generator:
56 |         cxr_feature = result['cxr_feature']
57 |         mask = result['mask']
58 |         # keep only the unpadded positions
59 |         idx = np.where(mask.reshape(-1) == 1)
60 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     estimator, nn_model = create_estimator()
70 | 
71 |     with tick_tock("DATA_INPUT") as _:
72 |         valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
73 |         test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
74 | 
75 |     if TRAIN_MODE == 1:
76 |         for i in range(EPOCH):
77 |             for idx, data in enumerate(TRAIN_FILE):
78 |                 with tick_tock("DATA_INPUT") as _:
79 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
80 |                 with tick_tock("TRAIN") as _:
81 |                     estimator.train(train_input_fn)
82 |                 if MODEL_SAVE_PB_EPOCH_ON:
83 |                     export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
84 |                                                               export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
85 |                     ep_insert_index = i * len(TRAIN_FILE) + idx
86 |                     target_dir = export_dir + "/../ep" + str(ep_insert_index)
87 |                     while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
88 |                         ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
89 |                     shutil.move(export_dir, target_dir)
90 |                     print(time.strftime("%m-%d %H:%M:%S ",
91 |                                         time.localtime(time.time())) + "export model PB: " + target_dir)
92 |         # with tick_tock("PREDICT") as _:
93 |         #     result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
94 |         #     calculate_result(result_generator)
95 | 
96 | 
97 | 
98 |     elif TRAIN_MODE == 2:
99 |         with tick_tock("PREDICT") as _:
100 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
101 |             calculate_result(result_generator)
102 | 
103 |     elif TRAIN_MODE == 3:
104 |         for i in range(EPOCH):
105 |             for idx, data in enumerate(TRAIN_FILE):
106 |                 with tick_tock("DATA_INPUT") as _:
107 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
108 |                 with tick_tock("TRAIN") as _:
109 |                     estimator.train(train_input_fn)
110 |         with tick_tock("PREDICT") as _:
111 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
112 |             print("valid_data")
113 |             calculate_result(result_generator)
114 |             # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
115 |             print("train_data")
116 |             # calculate_result(result_generator)
117 |         # save pb
118 | 
119 | 
120 |     elif TRAIN_MODE == 4:
121 |         export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
122 |                                                   export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
123 |         ep_insert_index = 0
124 |         target_dir = export_dir + "/../ep" + str(ep_insert_index)
125 |         while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
126 |             ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
127 |         shutil.move(export_dir, target_dir)
128 |         print(time.strftime("%m-%d %H:%M:%S ",
129 |                             time.localtime(time.time())) + "export model PB: " + target_dir)
130 | 
131 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/model_load.py:
--------------------------------------------------------------------------------
1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 |     model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 |     predict_fn = predictor.from_saved_model(model_filename_dir)
9 |     # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 |     # env_feature   -> dense_feature
12 |     # cxr_feature   -> screen_predict_feature
13 |     # cat_feature   -> screen_cate_feature
14 |     # dense_feature -> screen_dense_feature
15 |     predictions = predict_fn({  # sample payload; the shapes below are illustrative and may not match the current config constants
16 |         'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 |                                     [0.027565, 0.07474336, 0.04988268, 0.53],
18 |                                     [0.024815, 0.1775544, 0.12052802, 0.24],
19 |                                     [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 |         # dense features (price, rating)
21 |         'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 |                                   [318., 14.675659, 0., 5., 4.94],
23 |                                   [637., 24.784016, 0., 5., 4.65],
24 |                                   [185., 25.333273, 0., 5., 4.75]]],
25 |         # categorical features (category ids)
26 |         'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 |                                  [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 |                                  [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 |                                  [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 |         # context features (e.g. whether a platinum ad is present)
31 |         'dense_feature': [[0., 0.]]
32 |     })
33 | 
34 |     print('Q_network_output:', predictions['Q_network_output'])
35 |     print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 |     # load_pg_model()
39 |     load_listwise_model()
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/restore.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_without_page_atten/restore.py
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime "${begin_time}" > ${LOG_PATH}${LOG_FILENAME} 2>&1 &  # begin_time is quoted because it contains a space
15 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 |     def __init__(self, process_name, verbose=1):
9 |         self.process_name = process_name
10 |         self.verbose = verbose
11 | 
12 |     def __enter__(self):
13 |         if self.verbose:
14 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 |         self.begin_time = time.time()
16 | 
17 |     def __exit__(self, type, value, traceback):
18 |         if self.verbose:
19 |             end_time = time.time()
20 |             duration_seconds = end_time - self.begin_time
21 |             duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 |     def __init__(self, feature_info_str):
28 |         self.feature_info_str = feature_info_str
29 | 
30 |         self.feature_name = "NonFeaName"
31 |         self.feature_size = 0
32 |         self.feature_mask = 1
33 |         self.parse_info_flag = False
34 |         self.part_num = 3
35 | 
36 |         self._parse_info()
37 | 
38 |     def _parse_info(self):
39 |         infoList = self.feature_info_str.split()
40 | 
41 |         if len(infoList) == self.part_num:
42 |             self.feature_name = infoList[0]
43 |             self.feature_size = int(infoList[1])
44 |             self.feature_mask = int(infoList[2])
45 |             self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 |     try:
50 |         if not os.path.exists(feature_mask_file):
51 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
52 |             return [], False, 0  # keep the 3-tuple contract: (mask_list, ok_flag, hold_cnt)
53 |         # feature_name_list = []
54 |         feature_mask_list = []
55 |         feature_hold_cnt = 0
56 | 
57 |         with open(feature_mask_file) as f:
58 |             str_list = f.readlines()
59 | 
60 |         for i in range(0, len(str_list)):
61 |             str_list[i] = str_list[i].strip('\n').strip()
62 |             if str_list[i] == "":
63 |                 continue
64 | 
65 |             info = FeatureInfo(str_list[i])
66 |             if not info.parse_info_flag:
67 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
68 |                 parse_mask_flag = False
69 |                 return [], parse_mask_flag, 0
70 | 
71 |             for j in range(info.feature_size):
72 |                 feature_mask_list.append(info.feature_mask)
73 |                 if info.feature_mask != 0:
74 |                     feature_hold_cnt += 1
75 |                 # if info.feature_size > 1:
76 |                 #     feature_name_list.append(info.feature_name + "_" + str(j))
77 |                 # else:
78 |                 #     feature_name_list.append(info.feature_name)
79 | 
80 |         parse_mask_flag = True
81 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 |     except Exception as e:
83 |         print("parse_mask_file fail - Exception:", e)
84 |         return [], False, 0  # keep the 3-tuple contract
85 | 
86 | import itertools
87 | 
88 | 
89 | # Use itertools.permutations to enumerate every ordering of the sequence 1..n
90 | def allPermutation(n):
91 |     permutation = []
92 |     # initialize the identity permutation 1..n
93 |     for i in range(n):
94 |         permutation.append(i+1)
95 |     # itertools.permutations returns a lazy object, so materialize it as a list;
96 |     # each permutation is stored as a tuple
97 |     all_permutation = list(itertools.permutations(permutation))
98 |     return all_permutation
99 | 
100 | 
101 | 
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
106 |     # print(feature_mask_list)
107 |     # print(len(feature_mask_list))
108 |     # print(parse_feature_mask_flag)
109 |     # print(feature_hold_cnt)
110 |     print(allPermutation(5))
111 | 
--------------------------------------------------------------------------------
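For reference, a hypothetical input for parse_mask_file above (the feature names and sizes are invented for illustration): each line of the mask file is "<name> <size> <mask>", expanded to one mask entry per slot.

# contents of a toy "feature_mask" file:
#   price 3 1
#   brand 2 0
feature_mask_list, parse_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
# -> feature_mask_list == [1, 1, 1, 0, 0], parse_mask_flag == True, feature_hold_cnt == 3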
/rerank_paper/pier_model_without_page_atten/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 |     # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 |     #                        [[0, 2], [1, 3], [2, 1]]]
6 |     replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 |     rank = len(index_matrix.get_shape())
8 |     if rank == 2:
9 |         replicated_first_indices = tf.tile(
10 |             tf.expand_dims(replicated_first_indices, axis=1),
11 |             [1, tf.shape(index_matrix)[1]])
12 |     replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 |     return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 |     return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 |     return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 |     return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 |     wc = 10 ** order
26 |     return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 |     tensor = float_to_string_with_key(tensor, key, precision)
30 |     tensor = string_hash_to_index(tensor, bucket)
31 |     return tensor
32 | 
33 | def int_custom_hash(tensor, key, bucket=1<<22):
34 |     tensor = int_to_string_with_key(tensor, key)
35 |     tensor = string_hash_to_index(tensor, bucket)
36 |     return tensor
37 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/delivery:
--------------------------------------------------------------------------------
1 | 4.668 40.692 17.227 2.616
2 | 2.722 10.623 13.181 1.742
3 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/poi:
--------------------------------------------------------------------------------
1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584
2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/user:
--------------------------------------------------------------------------------
1 | 0 0
2 | 1 1
--------------------------------------------------------------------------------
"/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 78 | elif DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/prm_model/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # FEATURE_CATE_NUM:品类相关特征 65 | # FEATURE_DENSE_NUM:连续值特征 66 | # FEATURE_CXR_NUM:模型预估值特征 67 | 68 | # current page 69 | current_page_start = 0 70 | current_page_end = current_page_start + POI_NUM * FEATURE_NUM 71 | 72 | pre_page_start = current_page_end 73 
/rerank_paper/prm_model/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import shutil
3 | import time
4 | import os
5 | 
6 | # result
7 | RANDOM_SEED = 2021
8 | BATCH_SIZE = 1024
9 | IMP_LOSS_WEIGHT = 0.02
10 | # basic config
11 | EPOCH = 1
12 | LEARNING_RATE = 0.005
13 | DATA_MODE = 1  # 1: local train, 2: local test, 3: docker evaluate, 4: docker multi-day tfrecord dirs
14 | TRAIN_MODE = 3  # 1: train (+ optional pb export), 2: predict, 3: train + evaluate, 4: export pb (see main.py)
15 | MODEL_NAME = "avito_listwise_model"
16 | # POI categorical features
17 | FEATURE_CATE_NUM = 7  # v1r3:19
18 | # POI dense features
19 | FEATURE_DENSE_NUM = 5  # v1:28 v1r2:79 v1r3:83
20 | # predicted-score features (ctr/cvr/gmv)
21 | FEATURE_CXR_NUM = 3
22 | # context (environment) features
23 | FEATURE_ENV_NUM = 2
24 | # organic (non-ad) POIs
25 | FEATURE_NATURE_POI = 25
26 | 
27 | # N: cut number of POIs for training
28 | POI_NUM = 5
29 | FEATURE_NUM = 9
30 | PAGE_NUM = 5
31 | FEATURE_NUM_FOR_PAGE = 11
32 | # attribute features: KA, AOR, BRAND
33 | FEATURE_ATTR_NUM = 3
34 | 
35 | # DELIVERY_FEAT
36 | DELIVERY_FEAT_NUM = 4
37 | 
38 | # OUT NUM
39 | OUT_NUM = 1
40 | 
41 | PLACE_HOLDER_NUM = 11
42 | DENSE_FEAT_NUM = 439
43 | 
44 | 
45 | # embedding lookup table shape: [bucket_size, embedding_dim]
46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8]
47 | 
48 | # network architecture parameters
49 | MODEL_PARAMS = {
50 |     'INPUT_TENSOR_LAYERS_A': [60, 32, 10],
51 |     'INPUT_TENSOR_LAYERS_B': [50, 20],
52 |     'INPUT_TENSOR_LAYERS_C': [50, 20],
53 |     'INPUT_TENSOR_LAYERS_D': [50, 20],
54 |     'INPUT_TENSOR_LAYERS_E': [50, 20]
55 | }
56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1)
57 | 
58 | DIN_CONF = {}
59 | 
60 | # train data
61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1
62 | if DATA_MODE == 1:
63 |     TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046']
64 |     VALID_FILE = TRAIN_FILE
65 |     PREDICT_FILE = VALID_FILE
66 |     TEST_FILE = PREDICT_FILE
67 | elif DATA_MODE == 2:
68 |     TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*']
69 |     VALID_FILE = TRAIN_FILE
70 |     TEST_FILE = VALID_FILE
71 | elif DATA_MODE == 3:
72 |     TRAIN_FILE = [
73 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"]
74 |     VALID_FILE = [
75 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"]
76 |     TEST_FILE = [
77 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"]
78 | elif DATA_MODE == 4:
79 |     DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/"
80 |     # TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"]
81 |     TRAIN_LIST = ["20220123"]
82 |     VALID_LIST = ["20220124"]
83 |     TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST]
84 |     VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST]
85 |     TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST]
86 | 
87 | # auxiliary paths (normalization stats and model export dirs)
88 | MEAN_VAR_PATH_POI = "./avg_std/poi"
89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery"
90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME
91 | MODEL_SAVE_PB_EPOCH_ON = False
92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs"
93 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/data_input.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | from config import *
4 | import numpy as np
5 | from tools import tick_tock
6 | 
7 | 
8 | def generate_parse_tfrecord_local_fn():
9 |     def _parse_function(batch_examples):
10 |         common_features, sequence_features = feature_parse_scheme()
11 |         parsed_features = tf.parse_example(
12 |             serialized=batch_examples,
13 |             features=common_features
14 |         )
15 |         features = feature_product(parsed_features)
16 |         labels = label_product(parsed_features)
17 |         return features, labels
18 | 
19 |     return _parse_function
20 | 
21 | 
22 | def generate_parse_valid_tfrecord_local_fn():  # identical to the train parser; kept separate for symmetry
23 |     def _parse_function(batch_examples):
24 |         common_features, sequence_features = feature_parse_scheme()
25 |         parsed_features = tf.parse_example(
26 |             serialized=batch_examples,
27 |             features=common_features
28 |         )
29 |         features = feature_product(parsed_features)
30 |         labels = label_product(parsed_features)
31 |         return features, labels
32 | 
33 |     return _parse_function
34 | 
35 | 
36 | def feature_parse_scheme():
37 |     label_len = POI_NUM * 2 + PAGE_NUM
38 |     feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
39 |     common_features = {
40 |         "label": tf.FixedLenFeature([label_len], dtype=tf.float32),
41 |         "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32),
42 |     }
43 | 
44 |     sequence_features = {}
45 |     return common_features, sequence_features
46 | 
47 | 
48 | def label_product(parsed_features):
49 |     labels = parsed_features['label']
50 | 
51 |     labels_result = {
52 |         # click labels for the current screen
53 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
54 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
55 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
56 |     }
57 |     return labels_result
58 | 
59 | 
60 | def feature_product(parsed_features):
61 |     feature_buffer = parsed_features['feature']
62 |     labels = parsed_features['label']
63 |     # slice the flat feature buffer into groups:
64 |     # FEATURE_CATE_NUM: category-related features
65 |     # FEATURE_DENSE_NUM: continuous-valued features
66 |     # FEATURE_CXR_NUM: model-predicted score features
67 | 
68 |     # current page
69 |     current_page_start = 0
70 |     current_page_end = current_page_start + POI_NUM * FEATURE_NUM
71 | 
72 |     pre_page_start = current_page_end
73 |     pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
74 | 
75 |     cur_page_features = tf.reshape(tf.gather(feature_buffer, list(range(current_page_start, current_page_end)), axis=1),
76 |                                    [-1, POI_NUM, FEATURE_NUM])
77 |     pre_page_features = tf.reshape(tf.gather(feature_buffer, list(range(pre_page_start, pre_page_end)), axis=1),
78 |                                    [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE])
79 | 
80 |     position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2)
81 |     adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2)
82 |     obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2)
83 |     hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2)
84 |     locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2)
85 |     categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2)
86 |     price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2)
87 |     iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2)
88 |     userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2)
89 | 
90 |     pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3)
91 |     pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3)
92 |     pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3)
93 |     pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3)
94 |     pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3)
95 |     pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3)
96 |     pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3)
97 |     pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3)
98 |     pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3)
99 | 
100 |     features_result = {
101 | 
102 |         'dense_feature': hist_ctr_fea,
103 |         # categorical features (category ids)
104 |         'cate_feature': tf.cast(
105 |             tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea],
106 |                       axis=2), tf.int64),
107 |         # ctr_label
108 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
109 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
110 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
111 |         'behavior_dense_feature': pre_hist_ctr_fea,
112 |         # categorical features of the previous pages
113 |         'behavior_cate_feature': tf.cast(
114 |             tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea,
115 |                        pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea], axis=3), tf.int64)
116 | 
117 |     }
118 |     return features_result
119 | 
120 | 
121 | # num_parallel is the number of CPU cores used for the parallel map
122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4):
123 |     def input_fn():
124 |         _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn()
125 |         files = tf.data.Dataset.list_files(file_names)
126 |         # print(files)
127 |         dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10))
128 |         dataset = dataset.prefetch(buffer_size=batch_size * 10)
129 |         dataset = dataset.repeat(epoch)
130 |         dataset = dataset.batch(batch_size)
131 |         dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel)
132 |         iterator = dataset.make_one_shot_iterator()
133 |         return iterator.get_next()
134 | 
135 |     return input_fn
136 | 
137 | 
138 | # mean/std files computed from a Hive table
139 | def get_normalization_parameter(mean_var_path):
140 |     with tf.gfile.Open(mean_var_path) as f:
141 |         fea_mean = f.readline().strip().split(' ')
142 |         fea_var = f.readline().strip().split(' ')
143 |         cont_fea_mean = list(map(float, fea_mean))
144 |         cont_fea_var = list(map(float, fea_var))
145 |         f.close()  # redundant inside the with-block, but harmless
146 |     return cont_fea_mean, cont_fea_var
147 | 
148 | 
149 | def get_bias_weight_parameter(bias_weight_path):
150 |     with tf.gfile.Open(bias_weight_path) as f2:
151 |         fea_mean = f2.readline().strip().split('\t')
152 |         cont_fea_mean = list(map(float, fea_mean))
153 |         f2.close()  # redundant inside the with-block, but harmless
154 |     return cont_fea_mean
155 | 
156 | 
157 | if __name__ == '__main__':
158 |     train_file = TRAIN_FILE
159 |     # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"]
160 |     train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=1000, epoch=1)
161 |     features, labels = train_input_fn()
162 | 
163 |     sess = tf.Session()
164 |     try:
165 |         with tick_tock("DATA_INPUT") as _:
166 |             features_np, labels_np = sess.run([features, labels])
167 | 
168 |         # print("*" * 100, "features_np")
169 |         # for key in features_np:
170 |         #     print("=" * 50, key, np.shape(features_np[key]))
171 |         #     print(features_np[key])
172 | 
173 |         print("*" * 100, "labels_np")
174 |         for key in labels_np:
175 |             print("=" * 50, key, np.shape(labels_np[key]))
176 |             print(labels_np[key])
177 | 
178 |     except Exception as e:
179 |         print(e)
180 | 
--------------------------------------------------------------------------------
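To make the label slicing above concrete, a small NumPy illustration of how one example's flat label vector decomposes (constants as in config.py; the values are placeholders):

import numpy as np

POI_NUM, PAGE_NUM = 5, 5
label = np.arange(2 * POI_NUM + PAGE_NUM)  # 15 label slots per example
ctr_label = label[:POI_NUM]                # click labels for the current screen's POIs
mask      = label[POI_NUM:2 * POI_NUM]     # which of the POI_NUM slots are real (not padding)
page_mask = label[2 * POI_NUM:]            # which of the PAGE_NUM history pages are real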
/rerank_paper/prm_model/layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def DIN(seq, seq_len, target, conf, scope="DIN"):
4 |     # seq    BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H
5 |     # target BATCH_SIZE * 1 * FEAT_NUM : N * 1 * H (tiled across the sequence below)
6 |     # return : BATCH_SIZE * H
7 | 
8 |     with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
9 |         seq_shape = tf.shape(seq)
10 |         target = tf.tile(target, [1, seq_shape[1], 1])
11 | 
12 |         input = tf.concat([seq, target, seq - target, seq * target], axis=-1)
13 | 
14 |         layers = conf.get("layers", [64, 32])
15 |         for layer in layers:
16 |             input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_" + str(layer))
17 | 
18 |         input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1)  # N * M
19 | 
20 |         # Mask: padded positions get a large negative logit so softmax assigns them ~0 weight
21 |         seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1]))
22 |         # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100)
23 |         padding = tf.ones_like(input) * (-2 ** 32 + 1)
24 |         attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1)  # N * M
25 |         # attention = tf.Print(attention, [attention], message="attention", summarize=100)
26 |         attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]])
27 |         output = tf.reduce_sum(attention * seq, axis=1)  # sum over the sequence axis M -> N * H, matching the contract above
28 |         return output
29 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from config import *
3 | from model import *
4 | from sklearn import metrics
5 | import numpy as np  # used in calculate_result
6 | 
7 | def create_estimator():
8 |     tf.logging.set_verbosity(tf.logging.INFO)
9 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
10 |     session_config.gpu_options.allow_growth = True
11 |     config = tf.estimator.RunConfig(
12 |         tf_random_seed=RANDOM_SEED,
13 |         save_summary_steps=100,
14 |         save_checkpoints_steps=1000,
15 |         model_dir=MODEL_SAVE_PATH,
16 |         keep_checkpoint_max=2,
17 |         log_step_count_steps=1000,
18 |         session_config=session_config)
19 |     nn_model = DNN()
20 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
21 |     return estimator, nn_model
22 | 
23 | 
24 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
25 |     estimator._params['save_model'] = params['save_model']
26 | 
27 |     def _serving_input_receiver_fn():
28 |         # env_feature   -> dense_feature
29 |         # cxr_feature   -> screen_predict_feature
30 |         # cat_feature   -> screen_cate_feature
31 |         # dense_feature -> screen_dense_feature
32 |         receiver_tensors = {
33 |             # predicted ctr/cvr/gmv && bid
34 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
35 |                                                      name='screen_predict_feature'),
36 |             # dense features (price, rating)
37 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
38 |                                                    name='screen_dense_feature'),
39 |             # categorical features (category ids)
40 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
41 |                                                   name='screen_cate_feature'),
42 |             # context features (e.g. whether a platinum ad is present)
43 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
44 |                                             name='dense_feature')
45 |         }
46 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
47 | 
48 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
49 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
50 |     estimator._params.pop('save_model')
51 |     return export_dir.decode()
52 | 
53 | 
54 | def calculate_result(result_generator):
55 |     y_ctr, pred_ctr, ctr = [], [], []
56 |     for result in result_generator:
57 |         cxr_feature = result['cxr_feature']
58 |         mask = result['mask']
59 |         # keep only the unpadded positions
60 |         idx = np.where(mask.reshape(-1) == 1)
61 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
62 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
63 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
64 | 
65 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = (metrics.roc_auc_score(y_ctr, pred_ctr),
66 |                                               metrics.roc_auc_score(y_ctr, ctr),
67 |                                               np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr))
68 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 |     estimator, nn_model = create_estimator()
74 | 
75 |     with tick_tock("DATA_INPUT") as _:
76 |         valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 |         test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 |     if TRAIN_MODE == 1:
80 |         for i in range(EPOCH):
81 |             for idx, data in enumerate(TRAIN_FILE):
82 |                 with tick_tock("DATA_INPUT") as _:
83 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 |                 with tick_tock("TRAIN") as _:
85 |                     estimator.train(train_input_fn)
86 |                 if MODEL_SAVE_PB_EPOCH_ON:
87 |                     export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 |                                                               export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 |                     ep_insert_index = i * len(TRAIN_FILE) + idx
90 |                     target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 |                     while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
92 |                         ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 |                     shutil.move(export_dir, target_dir)
94 |                     print(time.strftime("%m-%d %H:%M:%S ",
95 |                                         time.localtime(time.time())) + "export model PB: " + target_dir)
96 |         # with tick_tock("PREDICT") as _:
97 |         #     result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 |         #     calculate_result(result_generator)
99 | 
100 | 
101 | 
102 |     elif TRAIN_MODE == 2:
103 |         with tick_tock("PREDICT") as _:
104 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 |             calculate_result(result_generator)
106 | 
107 |     elif TRAIN_MODE == 3:
108 |         for i in range(EPOCH):
109 |             for idx, data in enumerate(TRAIN_FILE):
110 |                 with tick_tock("DATA_INPUT") as _:
111 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 |                 with tick_tock("TRAIN") as _:
113 |                     estimator.train(train_input_fn)
114 |         with tick_tock("PREDICT") as _:
115 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 |             print("valid_data")
117 |             calculate_result(result_generator)
118 |             # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 |             print("train_data")
120 |             # calculate_result(result_generator)
121 |         # save pb
122 | 
123 | 
124 |     elif TRAIN_MODE == 4:
125 |         export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 |                                                   export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 |         ep_insert_index = 0
128 |         target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 |         while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
130 |             ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 |         shutil.move(export_dir, target_dir)
132 |         print(time.strftime("%m-%d %H:%M:%S ",
133 |                             time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/model_load.py:
--------------------------------------------------------------------------------
1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 |     model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 |     predict_fn = predictor.from_saved_model(model_filename_dir)
9 |     # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 |     # env_feature   -> dense_feature
12 |     # cxr_feature   -> screen_predict_feature
13 |     # cat_feature   -> screen_cate_feature
14 |     # dense_feature -> screen_dense_feature
15 |     predictions = predict_fn({  # sample payload; the shapes below are illustrative and may not match the current config constants
16 |         'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 |                                     [0.027565, 0.07474336, 0.04988268, 0.53],
18 |                                     [0.024815, 0.1775544, 0.12052802, 0.24],
19 |                                     [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 |         # dense features (price, rating)
21 |         'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 |                                   [318., 14.675659, 0., 5., 4.94],
23 |                                   [637., 24.784016, 0., 5., 4.65],
24 |                                   [185., 25.333273, 0., 5., 4.75]]],
25 |         # categorical features (category ids)
26 |         'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 |                                  [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 |                                  [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 |                                  [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 |         # context features (e.g. whether a platinum ad is present)
31 |         'dense_feature': [[0., 0.]]
32 |     })
33 | 
34 |     print('Q_network_output:', predictions['Q_network_output'])
35 |     print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 |     # load_pg_model()
39 |     load_listwise_model()
--------------------------------------------------------------------------------
/rerank_paper/prm_model/restore.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/prm_model/restore.py
--------------------------------------------------------------------------------
/rerank_paper/prm_model/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime "${begin_time}" > ${LOG_PATH}${LOG_FILENAME} 2>&1 &  # begin_time is quoted because it contains a space
15 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 |     def __init__(self, process_name, verbose=1):
9 |         self.process_name = process_name
10 |         self.verbose = verbose
11 | 
12 |     def __enter__(self):
13 |         if self.verbose:
14 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 |         self.begin_time = time.time()
16 | 
17 |     def __exit__(self, type, value, traceback):
18 |         if self.verbose:
19 |             end_time = time.time()
20 |             duration_seconds = end_time - self.begin_time
21 |             duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 |     def __init__(self, feature_info_str):
28 |         self.feature_info_str = feature_info_str
29 | 
30 |         self.feature_name = "NonFeaName"
31 |         self.feature_size = 0
32 |         self.feature_mask = 1
33 |         self.parse_info_flag = False
34 |         self.part_num = 3
35 | 
36 |         self._parse_info()
37 | 
38 |     def _parse_info(self):
39 |         infoList = self.feature_info_str.split()
40 | 
41 |         if len(infoList) == self.part_num:
42 |             self.feature_name = infoList[0]
43 |             self.feature_size = int(infoList[1])
44 |             self.feature_mask = int(infoList[2])
45 |             self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 |     try:
50 |         if not os.path.exists(feature_mask_file):
51 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
52 |             return [], False, 0  # keep the 3-tuple contract: (mask_list, ok_flag, hold_cnt)
53 |         # feature_name_list = []
54 |         feature_mask_list = []
55 |         feature_hold_cnt = 0
56 | 
57 |         with open(feature_mask_file) as f:
58 |             str_list = f.readlines()
59 | 
60 |         for i in range(0, len(str_list)):
61 |             str_list[i] = str_list[i].strip('\n').strip()
62 |             if str_list[i] == "":
63 |                 continue
64 | 
65 |             info = FeatureInfo(str_list[i])
66 |             if not info.parse_info_flag:
67 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
68 |                 parse_mask_flag = False
69 |                 return [], parse_mask_flag, 0
70 | 
71 |             for j in range(info.feature_size):
72 |                 feature_mask_list.append(info.feature_mask)
73 |                 if info.feature_mask != 0:
74 |                     feature_hold_cnt += 1
75 |                 # if info.feature_size > 1:
76 |                 #     feature_name_list.append(info.feature_name + "_" + str(j))
77 |                 # else:
78 |                 #     feature_name_list.append(info.feature_name)
79 | 
80 |         parse_mask_flag = True
81 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 |     except Exception as e:
83 |         print("parse_mask_file fail - Exception:", e)
84 |         return [], False, 0  # keep the 3-tuple contract
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 |     print(feature_mask_list)
90 |     print(len(feature_mask_list))
91 |     print(parse_feature_mask_flag)
92 |     print(feature_hold_cnt)
93 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 |     # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 |     #                        [[0, 2], [1, 3], [2, 1]]]
6 |     replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 |     rank = len(index_matrix.get_shape())
8 |     if rank == 2:
9 |         replicated_first_indices = tf.tile(
10 |             tf.expand_dims(replicated_first_indices, axis=1),
11 |             [1, tf.shape(index_matrix)[1]])
12 |     replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 |     return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 |     return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 |     return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 |     return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 |     wc = 10 ** order
26 |     return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 |     tensor = float_to_string_with_key(tensor, key, precision)
30 |     tensor = string_hash_to_index(tensor, bucket)
31 |     return tensor
32 | 
33 | def int_custom_hash(tensor, key, bucket=1<<22):
34 |     tensor = int_to_string_with_key(tensor, key)
35 |     tensor = string_hash_to_index(tensor, bucket)
36 |     return tensor
37 | 
--------------------------------------------------------------------------------
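A short usage sketch for the hashing helpers in util.py (TF 1.x; the "adid"/"price" keys are illustrative). The default bucket of 1 << 22 matches CATE_FEATURE_EMBEDDINGS_SHAPE[0] in config.py, so the hashed ids can index the embedding table directly.

import tensorflow as tf
from util import int_custom_hash, float_custom_hash

adid = tf.constant([[101, 202], [303, 404]], dtype=tf.int64)
price = tf.constant([[12.5, 3.0], [7.25, 0.0]])
adid_idx = int_custom_hash(adid, key="adid")       # hashes "adid_101", ... into [0, 2^22)
price_idx = float_custom_hash(price, key="price")  # rounds to precision=0, then hashes "price_" + value
with tf.Session() as sess:
    print(sess.run([adid_idx, price_idx]))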