├── README.md ├── rerank_paper.tar.gz └── rerank_paper ├── .DS_Store ├── DCN_model ├── __pycache__ │ ├── config.cpython-37.pyc │ ├── data_input.cpython-37.pyc │ ├── layers.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── tools.cpython-37.pyc │ └── util.cpython-37.pyc ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── config.pyc ├── data_input.py ├── data_input.pyc ├── evaluate.py ├── layers.py ├── layers.pyc ├── main.py ├── model.py ├── model.pyc ├── model_load.py ├── restore.py ├── run.sh ├── tools.py ├── tools.pyc ├── util.py └── util.pyc ├── DNN_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── extr_model ├── __pycache__ │ ├── config.cpython-37.pyc │ ├── data_input.cpython-37.pyc │ ├── layers.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── tools.cpython-37.pyc │ └── util.cpython-37.pyc ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── kuaishou_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_whole_framework ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_without_oam_atten ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py ├── pier_model_without_page_atten ├── avg_std │ ├── delivery │ ├── poi │ └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py └── prm_model ├── avg_std ├── delivery ├── poi └── user ├── build_model.sh ├── config.py ├── data_input.py ├── evaluate.py ├── layers.py ├── main.py ├── model.py ├── model_load.py ├── restore.py ├── run.sh ├── tools.py └── util.py /README.md: -------------------------------------------------------------------------------- 1 | # PIER_code 2 |
PIER code contains the DNN, DCN, PRM, EXTR, KuaiShou re-ranking, and PIER models 3 |
Run Env: Python 3 + TensorFlow 1.5 4 |
Run code: sh run.sh (from inside the corresponding model directory) 5 |
Dataset: https://drive.google.com/drive/folders/1BRkP9YPiU1bdviLjo3jrYdXNTqrJAvJM?usp=share_link 6 | -------------------------------------------------------------------------------- /rerank_paper.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper.tar.gz -------------------------------------------------------------------------------- /rerank_paper/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/.DS_Store -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/data_input.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/data_input.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/layers.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 
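Note: the two rows in each avg_std file are the per-feature mean (row 1) and standard deviation (row 2); config.py points at these files through MEAN_VAR_PATH_POI and MEAN_VAR_PATH_DELIVERY. Below is a minimal loading/normalization sketch under that reading — load_avg_std and normalize are illustrative names, not helpers from this repo, and the actual consumption inside data_input.py may differ:

import numpy as np

def load_avg_std(path):
    # Row 1 = per-feature means, row 2 = per-feature standard deviations.
    with open(path) as f:
        rows = [list(map(float, line.split())) for line in f if line.strip()]
    return np.array(rows[0]), np.array(rows[1])

def normalize(dense_feats, mean, std, eps=1e-8):
    # Z-score the dense features; eps guards against a zero entry in the std row.
    return (dense_feats - mean) / (std + eps)

mean, std = load_avg_std("./avg_std/poi")
x = np.array([[1359., 30.146147, 26., 5., 4.85]])  # one POI's dense features
print(normalize(x, mean, std))

The five poi columns line up with FEATURE_DENSE_NUM = 5 in config.py and with the screen_dense_feature example values used in model_load.py.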
-------------------------------------------------------------------------------- /rerank_paper/DCN_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/DCN_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "dcn_pointwise_model_v3" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | CROSS_LAYERS = [56, 56, 56] 41 | CROSS_LAYERS_NUM = 3 42 | 43 | PLACE_HOLDER_NUM = 11 44 | DENSE_FEAT_NUM = 439 45 | 46 | 47 | # embedding_look_up维度 48 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 49 | 50 | # 网络结构参数 51 | MODEL_PARAMS = { 52 | 
'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 53 | 'INPUT_TENSOR_LAYERS_B': [64, 32], 54 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 56 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 57 | } 58 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 1 59 | 60 | DIN_CONF = {} 61 | 62 | # train data 63 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 64 | if DATA_MODE == 1: 65 | # TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 66 | TRAIN_FILE = ['/users/lemonace/Downloads/docker_data/part-r-00018'] 67 | VALID_FILE = TRAIN_FILE 68 | PREDICT_FILE = VALID_FILE 69 | TEST_FILE = PREDICT_FILE 70 | elif DATA_MODE == 2: 71 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 72 | VALID_FILE = TRAIN_FILE 73 | TEST_FILE = VALID_FILE 74 | elif DATA_MODE == 3: 75 | TRAIN_FILE = [ 76 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 77 | VALID_FILE = [ 78 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 79 | TEST_FILE = [ 80 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 81 | elif DATA_MODE == 4: 82 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 83 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 84 | TRAIN_LIST = ["20220123"] 85 | VALID_LIST = ["20220124"] 86 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 87 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 88 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 89 | 90 | # 辅助脚本 91 | MEAN_VAR_PATH_POI = "./avg_std/poi" 92 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 93 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 94 | MODEL_SAVE_PB_EPOCH_ON = False 95 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 96 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/config.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/data_input.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/data_input.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = 
tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/layers.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | 
pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, 66 | ctr), np.sum( 67 | pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr) 68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp)) 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 | estimator, nn_model = create_estimator() 74 | 75 | with tick_tock("DATA_INPUT") as _: 76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1) 77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1) 78 | 79 | if TRAIN_MODE == 1: 80 | for i in range(EPOCH): 81 | for idx, data in enumerate(TRAIN_FILE): 82 | with tick_tock("DATA_INPUT") as _: 83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 84 | with tick_tock("TRAIN") as _: 85 | estimator.train(train_input_fn) 86 | if MODEL_SAVE_PB_EPOCH_ON: 87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 89 | ep_insert_index = i * len(TRAIN_FILE) + idx 90 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 91 | while os.path.exists(target_dir): 92 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 93 | shutil.move(export_dir, target_dir) 94 | print(time.strftime("%m-%d %H:%M:%S ", 95 | time.localtime(time.time())) + "export model PB: " + target_dir) 96 | # with tick_tock("PREDICT") as _: 97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 98 | # calculate_result(result_generator) 99 | 100 | 101 | 102 | elif TRAIN_MODE == 2: 103 | with tick_tock("PREDICT") as _: 104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 105 | calculate_result(result_generator) 106 | 107 | elif TRAIN_MODE == 3: 108 | for i in range(EPOCH): 109 | for idx, data in enumerate(TRAIN_FILE): 110 | with tick_tock("DATA_INPUT") as _: 111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 112 | with tick_tock("TRAIN") as _: 113 | estimator.train(train_input_fn) 114 | with tick_tock("PREDICT") as _: 115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 116 | print("valid_data") 117 | calculate_result(result_generator) 118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 119 | print("train_data") 120 | # calculate_result(result_generator) 121 | # save pb 122 | 123 | 124 | elif TRAIN_MODE == 4: 125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 127 | ep_insert_index = 0 128 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 129 | while os.path.exists(target_dir): 130 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 131 | shutil.move(export_dir, target_dir) 132 | print(time.strftime("%m-%d %H:%M:%S ", 133 | time.localtime(time.time())) + "export model PB: " + target_dir) 134 | 135 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/model.pyc 
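The calculate_result helper above flattens each batch, keeps only the slots where mask == 1, and reports four numbers: the re-ranker's CTR AUC (ctr_auc), the AUC of the upstream pCTR carried in cxr_feature (ctr_auc_jp), and the two calibration biases ctr_cb = sum(pred) / sum(label) and ctr_cb_jp for the upstream scores. The following is a self-contained toy re-computation under those definitions — the arrays are invented stand-ins for one batch of generator output, not real data:

import numpy as np
from sklearn import metrics

mask      = np.array([[1, 1, 1, 0, 0]])               # valid slots out of POI_NUM = 5
ctr_label = np.array([[1, 0, 0, 0, 0]])               # click labels
ctr_out   = np.array([[0.60, 0.20, 0.10, 0.0, 0.0]])  # re-ranker predictions
base_ctr  = np.array([[0.30, 0.40, 0.10, 0.0, 0.0]])  # upstream pCTR (cxr_feature[..., 0])

idx = np.where(mask.reshape(-1) == 1)
y  = ctr_label.reshape(-1)[idx]
p  = ctr_out.reshape(-1)[idx]
jp = base_ctr.reshape(-1)[idx]

ctr_auc    = metrics.roc_auc_score(y, p)   # ranking quality of the re-ranker
ctr_auc_jp = metrics.roc_auc_score(y, jp)  # ranking quality of the upstream scores
ctr_cb     = p.sum() / y.sum()             # calibration bias: predicted clicks / actual clicks
ctr_cb_jp  = jp.sum() / y.sum()
print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))

A calibration bias above 1.0 means the model over-predicts clicks on the masked-in slots; an AUC near 0.5 means the scores barely order clicked items above unclicked ones.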
-------------------------------------------------------------------------------- /rerank_paper/DCN_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/DCN_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/DCN_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! -d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... 
time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | 87 | if __name__ == "__main__": 88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 89 | print(feature_mask_list) 90 | print(len(feature_mask_list)) 91 | print(parse_feature_mask_flag) 92 | print(feature_hold_cnt) 93 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/tools.pyc -------------------------------------------------------------------------------- /rerank_paper/DCN_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" 
+ tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/DCN_model/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DCN_model/util.pyc -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/DNN_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/DNN_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "dnn_pointwise_model_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 51 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 1 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 64 | VALID_FILE = TRAIN_FILE 65 | PREDICT_FILE = VALID_FILE 66 | TEST_FILE = PREDICT_FILE 67 | elif DATA_MODE == 2: 68 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 69 | VALID_FILE = TRAIN_FILE 70 | TEST_FILE = VALID_FILE 71 | elif DATA_MODE == 3: 72 | TRAIN_FILE = [ 73 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 78 | elif 
DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && 
bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, 66 | ctr), np.sum( 67 | pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr) 68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp)) 69 | 70 | 71 | if __name__ == '__main__': 72 | 73 | estimator, nn_model = create_estimator() 74 | 75 | with tick_tock("DATA_INPUT") as _: 76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1) 77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1) 78 | 79 | if TRAIN_MODE == 1: 80 | for i in range(EPOCH): 81 | for idx, data in enumerate(TRAIN_FILE): 82 | with tick_tock("DATA_INPUT") as _: 83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 84 | with tick_tock("TRAIN") as _: 85 | estimator.train(train_input_fn) 86 | if MODEL_SAVE_PB_EPOCH_ON: 87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 89 | ep_insert_index = i * len(TRAIN_FILE) + idx 90 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 91 | while os.path.exists(target_dir): 92 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 93 | shutil.move(export_dir, target_dir) 94 | print(time.strftime("%m-%d %H:%M:%S ", 95 | time.localtime(time.time())) + "export model PB: " + target_dir) 96 | # with tick_tock("PREDICT") as _: 97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 98 | # calculate_result(result_generator) 99 | 100 | 101 | 102 | elif TRAIN_MODE == 2: 103 | with tick_tock("PREDICT") as _: 104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 105 | calculate_result(result_generator) 106 | 107 | elif TRAIN_MODE == 3: 108 | for i in range(EPOCH): 109 | for idx, data in enumerate(TRAIN_FILE): 110 | with tick_tock("DATA_INPUT") as _: 111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 112 | with tick_tock("TRAIN") as _: 113 | estimator.train(train_input_fn) 114 | with tick_tock("PREDICT") as _: 115 | result_generator = 
estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 116 | print("valid_data") 117 | calculate_result(result_generator) 118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 119 | print("train_data") 120 | # calculate_result(result_generator) 121 | # save pb 122 | 123 | 124 | elif TRAIN_MODE == 4: 125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 127 | ep_insert_index = 0 128 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 129 | while os.path.exists(target_dir): 130 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 131 | shutil.move(export_dir, target_dir) 132 | print(time.strftime("%m-%d %H:%M:%S ", 133 | time.localtime(time.time())) + "export model PB: " + target_dir) 134 | 135 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/DNN_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/DNN_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/DNN_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! 
-d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | 87 | if __name__ == "__main__": 88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 89 | print(feature_mask_list) 90 | print(len(feature_mask_list)) 91 | print(parse_feature_mask_flag) 92 | print(feature_hold_cnt) 93 | -------------------------------------------------------------------------------- /rerank_paper/DNN_model/util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/data_input.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/data_input.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/layers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/layers.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/__pycache__/util.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/extr_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/extr_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "extr_model_v3" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 3 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 5 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 51 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | # TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 64 | TRAIN_FILE = ['/Users/lemonace/Downloads/docker_data/part-r-00049'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = [ 74 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 75 | VALID_FILE = [ 76 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 77 | TEST_FILE = [ 78 | 
"/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 79 | elif DATA_MODE == 4: 80 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 81 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 82 | TRAIN_LIST = ["20220123"] 83 | VALID_LIST = ["20220124"] 84 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 85 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 86 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 87 | 88 | # 辅助脚本 89 | MEAN_VAR_PATH_POI = "./avg_std/poi" 90 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 91 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 92 | MODEL_SAVE_PB_EPOCH_ON = False 93 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 94 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr cvr gmv预估值 && bid 34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM], 35 | name='screen_predict_feature'), 36 | # dense 特征 (价格,评分) 37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM], 38 | name='screen_dense_feature'), 39 | # 离散特征(品类) 40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM], 41 | name='screen_cate_feature'), 42 | # 环境特征(是否有铂金) 43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM], 44 | name='dense_feature') 45 | } 46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors) 47 | 48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base, 49 | serving_input_receiver_fn=_serving_input_receiver_fn) 50 | estimator._params.pop('save_model') 51 | return export_dir.decode() 52 | 53 | 54 | def calculate_result(result_generator): 55 | y_ctr, pred_ctr, ctr = [], [], [] 56 | for result in result_generator: 57 | cxr_feature = result['cxr_feature'] 58 | mask = result['mask'] 59 | # ctr_label 60 | idx = np.where(mask.reshape(-1) == 1) 61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist() 62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist() 63 | ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist() 64 | 65 | ctr_auc, ctr_auc_jp, ctr_cb, 
66 | ctr_auc_jp = metrics.roc_auc_score(y_ctr, ctr)
67 | ctr_cb, ctr_cb_jp = np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 | estimator, nn_model = create_estimator()
74 | 
75 | with tick_tock("DATA_INPUT") as _:
76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 | if TRAIN_MODE == 1:
80 | for i in range(EPOCH):
81 | for idx, data in enumerate(TRAIN_FILE):
82 | with tick_tock("DATA_INPUT") as _:
83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 | with tick_tock("TRAIN") as _:
85 | estimator.train(train_input_fn)
86 | if MODEL_SAVE_PB_EPOCH_ON:
87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 | ep_insert_index = i * len(TRAIN_FILE) + idx
90 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 | while os.path.exists(target_dir):
92 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 | shutil.move(export_dir, target_dir)
94 | print(time.strftime("%m-%d %H:%M:%S ",
95 | time.localtime(time.time())) + "export model PB: " + target_dir)
96 | # with tick_tock("PREDICT") as _:
97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 | # calculate_result(result_generator)
99 | 
100 | 
101 | 
102 | elif TRAIN_MODE == 2:
103 | with tick_tock("PREDICT") as _:
104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 | calculate_result(result_generator)
106 | 
107 | elif TRAIN_MODE == 3:
108 | for i in range(EPOCH):
109 | for idx, data in enumerate(TRAIN_FILE):
110 | with tick_tock("DATA_INPUT") as _:
111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 | with tick_tock("TRAIN") as _:
113 | estimator.train(train_input_fn)
114 | with tick_tock("PREDICT") as _:
115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 | print("valid_data")
117 | calculate_result(result_generator)
118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 | print("train_data")
120 | # calculate_result(result_generator)
121 | # save pb
122 | 
123 | 
124 | elif TRAIN_MODE == 4:
125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 | ep_insert_index = 0
128 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 | while os.path.exists(target_dir):
130 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 | shutil.move(export_dir, target_dir)
132 | print(time.strftime("%m-%d %H:%M:%S ",
133 | time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
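The four numbers printed by calculate_result above are easy to misread, so here is a minimal, self-contained sketch of what they measure (my reading of the code: the "_jp" variants score the upstream pCTR carried in cxr_feature[..., 0] as a baseline, and "_cb" is the calibration ratio sum(pred) / sum(label), which is 1.0 for a perfectly calibrated model):

import numpy as np
from sklearn import metrics

y_ctr = np.array([1, 0, 0, 1, 0, 1])           # click labels on unmasked slots
pred_ctr = np.array([.8, .2, .3, .7, .1, .6])  # reranker outputs (ctr_out)
base_ctr = np.array([.6, .4, .2, .5, .3, .4])  # upstream pCTR feature

print(metrics.roc_auc_score(y_ctr, pred_ctr))  # ctr_auc
print(metrics.roc_auc_score(y_ctr, base_ctr))  # ctr_auc_jp
print(pred_ctr.sum() / y_ctr.sum())            # ctr_cb
print(base_ctr.sum() / y_ctr.sum())            # ctr_cb_jp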
-------------------------------------------------------------------------------- /rerank_paper/extr_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/extr_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/extr_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/extr_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | 
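tools.py below reads an optional feature_mask file whose layout is only implied by FeatureInfo: each non-empty line is "<feature_name> <feature_size> <feature_mask>", whitespace-separated. A hypothetical file (not shipped with the repo) and what parse_mask_file would return for it, assuming my reading of the expansion loop is right:

price 1 1
category 6 1
debug_id 2 0

feature_mask_list -> [1, 1, 1, 1, 1, 1, 1, 0, 0]   # feature_size copies of each mask
feature_hold_cnt  -> 7                             # count of non-zero slots kept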
-------------------------------------------------------------------------------- /rerank_paper/extr_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | 
87 | if __name__ == "__main__":
88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 | print(feature_mask_list)
90 | print(len(feature_mask_list))
91 | print(parse_feature_mask_flag)
92 | print(feature_hold_cnt)
93 | -------------------------------------------------------------------------------- /rerank_paper/extr_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 | # [[0, 2], [1, 3], [2, 1]]]
6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 | rank = len(index_matrix.get_shape())
8 | if rank == 2:
9 | replicated_first_indices = tf.tile(
10 | tf.expand_dims(replicated_first_indices, axis=1),
11 | [1, tf.shape(index_matrix)[1]])
12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 | return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 | return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 | return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 | wc = 10 ** order
26 | return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 | tensor = float_to_string_with_key(tensor, key, precision)
30 | tensor =
string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2021 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 1 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 2 15 | MODEL_NAME = "kuaishou_pointwise_model_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # N: Cut Number of POI For Train 28 | POI_NUM = 5 29 | FEATURE_NUM = 9 30 | PAGE_NUM = 5 31 | FEATURE_NUM_FOR_PAGE = 11 32 | # 属性特征:KA AOR BRAND 33 | FEATURE_ATTR_NUM = 3 34 | 35 | # DELIVERY_FEAT 36 | DELIVERY_FEAT_NUM = 4 37 | 38 | # OUT NUM 39 | OUT_NUM = 1 40 | 41 | PLACE_HOLDER_NUM = 11 42 | DENSE_FEAT_NUM = 439 43 | 44 | 45 | # embedding_look_up维度 46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 47 | 48 | # 网络结构参数 49 | MODEL_PARAMS = { 50 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 51 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 52 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 54 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 55 | } 56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) * 3 57 | 58 | DIN_CONF = {} 59 | 60 | # train data 61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 62 | if DATA_MODE == 1: 63 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00047'] 64 | VALID_FILE = TRAIN_FILE 65 | PREDICT_FILE = VALID_FILE 66 | TEST_FILE = PREDICT_FILE 67 | elif DATA_MODE == 2: 68 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 69 | VALID_FILE = TRAIN_FILE 70 | TEST_FILE = VALID_FILE 71 | elif DATA_MODE == 3: 72 | TRAIN_FILE = [ 73 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new_point_wise/test_data/part-r-*"] 78 | 
elif DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | 7 | def create_estimator(): 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 10 | session_config.gpu_options.allow_growth = True 11 | config = tf.estimator.RunConfig( 12 | tf_random_seed=RANDOM_SEED, 13 | save_summary_steps=100, 14 | save_checkpoints_steps=1000, 15 | model_dir=MODEL_SAVE_PATH, 16 | keep_checkpoint_max=2, 17 | log_step_count_steps=1000, 18 | session_config=session_config) 19 | nn_model = DNN() 20 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 21 | return estimator, nn_model 22 | 23 | 24 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 25 | estimator._params['save_model'] = params['save_model'] 26 | 27 | def _serving_input_receiver_fn(): 28 | # env_feature = > dense_feature 29 | # cxr_feature = > screen_predict_feature 30 | # cat_feature = > screen_cate_feature 31 | # dense_feature = > screen_dense_feature 32 | receiver_tensors = { 33 | # ctr 
cvr gmv predictions && bid
34 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
35 | name='screen_predict_feature'),
36 | # dense features (price, rating)
37 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
38 | name='screen_dense_feature'),
39 | # categorical features (category ids)
40 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
41 | name='screen_cate_feature'),
42 | # environment features (whether a platinum ad is present)
43 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
44 | name='dense_feature')
45 | }
46 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
47 | 
48 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
49 | serving_input_receiver_fn=_serving_input_receiver_fn)
50 | estimator._params.pop('save_model')
51 | return export_dir.decode()
52 | 
53 | 
54 | def calculate_result(result_generator):
55 | y_ctr, pred_ctr, ctr = [], [], []
56 | for result in result_generator:
57 | cxr_feature = result['cxr_feature']
58 | mask = result['mask']
59 | # ctr_label
60 | idx = np.where(mask.reshape(-1) == 1)
61 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
62 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
63 | ctr += cxr_feature[:, 0].reshape(-1)[idx].tolist()
64 | 
65 | ctr_auc = metrics.roc_auc_score(y_ctr, pred_ctr)
66 | ctr_auc_jp = metrics.roc_auc_score(y_ctr, ctr)
67 | ctr_cb, ctr_cb_jp = np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
68 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 | estimator, nn_model = create_estimator()
74 | 
75 | with tick_tock("DATA_INPUT") as _:
76 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 | if TRAIN_MODE == 1:
80 | for i in range(EPOCH):
81 | for idx, data in enumerate(TRAIN_FILE):
82 | with tick_tock("DATA_INPUT") as _:
83 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 | with tick_tock("TRAIN") as _:
85 | estimator.train(train_input_fn)
86 | if MODEL_SAVE_PB_EPOCH_ON:
87 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 | ep_insert_index = i * len(TRAIN_FILE) + idx
90 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 | while os.path.exists(target_dir):
92 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 | shutil.move(export_dir, target_dir)
94 | print(time.strftime("%m-%d %H:%M:%S ",
95 | time.localtime(time.time())) + "export model PB: " + target_dir)
96 | # with tick_tock("PREDICT") as _:
97 | # result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 | # calculate_result(result_generator)
99 | 
100 | 
101 | 
102 | elif TRAIN_MODE == 2:
103 | with tick_tock("PREDICT") as _:
104 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 | calculate_result(result_generator)
106 | 
107 | elif TRAIN_MODE == 3:
108 | for i in range(EPOCH):
109 | for idx, data in enumerate(TRAIN_FILE):
110 | with tick_tock("DATA_INPUT") as _:
111 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 | with tick_tock("TRAIN") as _:
113 | estimator.train(train_input_fn)
114 | with tick_tock("PREDICT") as _:
115 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 | print("valid_data")
117 | calculate_result(result_generator)
118 | # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 | print("train_data")
120 | # calculate_result(result_generator)
121 | # save pb
122 | 
123 | 
124 | elif TRAIN_MODE == 4:
125 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 | ep_insert_index = 0
128 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 | while os.path.exists(target_dir):
130 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 | shutil.move(export_dir, target_dir)
132 | print(time.strftime("%m-%d %H:%M:%S ",
133 | time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
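export_saved_model in the listing above writes each PB into a fresh timestamped directory, which main.py then renames to a stable epN slot. A minimal standalone sketch of that rotation logic (the ep_insert_index += 1 inside the loop is what guarantees termination once epN already exists):

import os
import shutil

def rotate_export(export_dir, ep_insert_index=0):
    # find the first free "epN" sibling directory and move the export there
    target_dir = export_dir + "/../ep" + str(ep_insert_index)
    while os.path.exists(target_dir):
        ep_insert_index += 1
        target_dir = export_dir + "/../ep" + str(ep_insert_index)
    shutil.move(export_dir, target_dir)
    return target_dir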
-------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model()
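The hard-coded feed in model_load.py above looks sized for a different configuration (4 slots with 4 predicted values each, and a 2-float dense_feature), while with this folder's config.py the signature exported by main.py expects [batch, POI_NUM, FEATURE_CXR_NUM] = [batch, 5, 3] for screen_predict_feature and [batch, DENSE_FEAT_NUM] for dense_feature. A small hypothetical guard that validates a feed against the configured shapes before calling predict_fn:

import numpy as np
from config import POI_NUM, FEATURE_CXR_NUM, FEATURE_DENSE_NUM, FEATURE_CATE_NUM, DENSE_FEAT_NUM

EXPECTED_SHAPES = {
    'screen_predict_feature': (POI_NUM, FEATURE_CXR_NUM),
    'screen_dense_feature': (POI_NUM, FEATURE_DENSE_NUM),
    'screen_cate_feature': (POI_NUM, FEATURE_CATE_NUM),
    'dense_feature': (DENSE_FEAT_NUM,),
}

def check_feed(feed):
    for name, want in EXPECTED_SHAPES.items():
        got = np.asarray(feed[name]).shape[1:]  # drop the batch dimension
        assert got == want, "{}: got {}, expected {}".format(name, got, want)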
-------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/kuaishou_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | 
87 | if __name__ == "__main__":
88 | feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 | print(feature_mask_list)
90 | print(len(feature_mask_list))
91 | print(parse_feature_mask_flag)
92 | print(feature_hold_cnt)
93 | -------------------------------------------------------------------------------- /rerank_paper/kuaishou_model/util.py:
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_listwise_model_v4" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 2 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= 
["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock,allPermutation 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # FEATURE_CATE_NUM:品类相关特征 65 | # FEATURE_DENSE_NUM:连续值特征 66 | # FEATURE_CXR_NUM:模型预估值特征 67 | 68 | full_permuation_index = allPermutation(POI_NUM) 69 | 70 | # current page 71 | current_page_start = 0 72 | current_page_end = current_page_start + POI_NUM * FEATURE_NUM 73 | 74 | pre_page_start = current_page_end 75 | pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 76 | 77 | 78 | cur_page_features = tf.reshape(tf.gather(feature_buffer,list(range(current_page_start,current_page_end)),axis=1), 
[-1, POI_NUM, FEATURE_NUM]) 79 | pre_page_features = tf.reshape(tf.gather(feature_buffer,list(range(pre_page_start,pre_page_end)),axis=1), [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE]) 80 | 81 | position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2) 82 | adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2) 83 | obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2) 84 | hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2) 85 | locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2) 86 | categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2) 87 | price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2) 88 | iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2) 89 | userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2) 90 | 91 | pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3) 92 | pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3) 93 | pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3) 94 | pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3) 95 | pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3) 96 | pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3) 97 | pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3) 98 | pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3) 99 | pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3) 100 | 101 | features_result = { 102 | 103 | 'dense_feature': hist_ctr_fea, 104 | # 离散特征(品类) 105 | 'cate_feature': tf.cast(tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea], axis=2), tf.int64), 106 | # ctr_label 107 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 108 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 109 | 'page_mask':tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 110 | 'behavior_dense_feature': pre_hist_ctr_fea, 111 | # 离散特征(品类) 112 | 'behavior_cate_feature': tf.cast(tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea, 113 | pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea],axis=3), tf.int64), 114 | 115 | 'full_permuation_index': tf.constant(full_permuation_index,tf.int32) 116 | 117 | } 118 | return features_result 119 | 120 | 121 | # num_parallel 表示cpu的核数,用于控制 map的并行度 122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4): 123 | def input_fn(): 124 | _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn() 125 | files = tf.data.Dataset.list_files(file_names) 126 | # print(files) 127 | dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10)) 128 | dataset = dataset.prefetch(buffer_size=batch_size * 10) 129 | dataset = dataset.repeat(epoch) 130 | dataset = dataset.batch(batch_size) 131 | dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel) 132 | iterator = dataset.make_one_shot_iterator() 133 | return iterator.get_next() 134 | 135 | return input_fn 136 | 137 | 138 | # 从hive表统计得到均值和方差文件 139 | def get_normalization_parameter(mean_var_path): 140 | with tf.gfile.Open(mean_var_path) as f: 141 | fea_mean = f.readline().strip().split(' ') 142 | fea_var = f.readline().strip().split(' ') 143 | cont_fea_mean = list(map(float, 
fea_mean)) 144 | cont_fea_var = list(map(float, fea_var)) 145 | f.close() 146 | return cont_fea_mean, cont_fea_var 147 | 148 | 149 | def get_bias_weight_parameter(bias_weight_path): 150 | with tf.gfile.Open(bias_weight_path) as f2: 151 | fea_mean = f2.readline().strip().split('\t') 152 | cont_fea_mean = list(map(float, fea_mean)) 153 | f2.close() 154 | return cont_fea_mean 155 | 156 | 157 | 158 | if __name__ == '__main__': 159 | train_file = TRAIN_FILE 160 | # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"] 161 | train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=16, epoch=1) 162 | features, labels = train_input_fn() 163 | 164 | sess = tf.Session() 165 | try: 166 | with tick_tock("DATA_INPUT") as _: 167 | features_np, labels_np = sess.run([features, labels]) 168 | 169 | print("*" * 100, "features_np") 170 | for key in features_np: 171 | print("=" * 50, key, np.shape(features_np[key])) 172 | print(features_np[key]) 173 | 174 | 175 | print("*" * 100, "labels_np") 176 | for key in labels_np: 177 | print("=" * 50, key, np.shape(labels_np[key])) 178 | print(labels_np[key]) 179 | 180 | except Exception as e: 181 | print(e) 182 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | def create_estimator(): 7 | tf.logging.set_verbosity(tf.logging.INFO) 8 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 9 | session_config.gpu_options.allow_growth = True 10 | config = tf.estimator.RunConfig( 11 | tf_random_seed=RANDOM_SEED, 12 | save_summary_steps=100, 13 | save_checkpoints_steps=1000, 14 | model_dir=MODEL_SAVE_PATH, 15 | keep_checkpoint_max=2, 16 | log_step_count_steps=1000, 17 | session_config=session_config) 18 | nn_model = DNN() 
19 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 | return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 | estimator._params['save_model'] = params['save_model']
25 | 
26 | def _serving_input_receiver_fn():
27 | # env_feature => dense_feature
28 | # cxr_feature => screen_predict_feature
29 | # cat_feature => screen_cate_feature
30 | # dense_feature => screen_dense_feature
31 | receiver_tensors = {
32 | # ctr cvr gmv predictions && bid
33 | 'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 | name='screen_predict_feature'),
35 | # dense features (price, rating)
36 | 'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 | name='screen_dense_feature'),
38 | # categorical features (category ids)
39 | 'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 | name='screen_cate_feature'),
41 | # environment features (whether a platinum ad is present)
42 | 'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 | name='dense_feature')
44 | }
45 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 | export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 | serving_input_receiver_fn=_serving_input_receiver_fn)
49 | estimator._params.pop('save_model')
50 | return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 | y_ctr, pred_ctr, ctr = [], [], []
55 | for result in result_generator:
56 | cxr_feature = result['cxr_feature']
57 | mask = result['mask']
58 | # ctr_label
59 | idx = np.where(mask.reshape(-1) == 1)
60 | y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 | pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 | ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 | ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 | print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
67 | if __name__ == '__main__':
68 | 
69 | estimator, nn_model = create_estimator()
70 | 
71 | with tick_tock("DATA_INPUT") as _:
72 | valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
73 | test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
74 | 
75 | if TRAIN_MODE == 1:
76 | for i in range(EPOCH):
77 | for idx, data in enumerate(TRAIN_FILE):
78 | with tick_tock("DATA_INPUT") as _:
79 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
80 | with tick_tock("TRAIN") as _:
81 | estimator.train(train_input_fn)
82 | if MODEL_SAVE_PB_EPOCH_ON:
83 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
84 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
85 | ep_insert_index = i * len(TRAIN_FILE) + idx
86 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
87 | while os.path.exists(target_dir):
88 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
89 | shutil.move(export_dir, target_dir)
90 | print(time.strftime("%m-%d %H:%M:%S ",
91 | time.localtime(time.time())) + "export model PB: " + target_dir)
92 | #with tick_tock("PREDICT") as _:
93 | #result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
94 | #calculate_result(result_generator)
95 | 
96 | 
97 | 
98 | elif TRAIN_MODE == 2:
99 | with tick_tock("PREDICT") as _:
100 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
101 | calculate_result(result_generator)
102 | 
103 | elif TRAIN_MODE == 3:
104 | for i in range(EPOCH):
105 | for idx, data in enumerate(TRAIN_FILE):
106 | with tick_tock("DATA_INPUT") as _:
107 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
108 | with tick_tock("TRAIN") as _:
109 | estimator.train(train_input_fn)
110 | with tick_tock("PREDICT") as _:
111 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
112 | print("valid_data")
113 | calculate_result(result_generator)
114 | #result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
115 | print("train_data")
116 | #calculate_result(result_generator)
117 | # save pb
118 | 
119 | 
120 | elif TRAIN_MODE == 4:
121 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
122 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
123 | ep_insert_index = 0
124 | target_dir = export_dir + "/../ep" + str(ep_insert_index)
125 | while os.path.exists(target_dir):
126 | ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
127 | shutil.move(export_dir, target_dir)
128 | print(time.strftime("%m-%d %H:%M:%S ",
129 | time.localtime(time.time())) + "export model PB: " + target_dir)
130 | 
131 | 
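PIER scores whole arrangements rather than single slots: data_input.py attaches full_permuation_index = allPermutation(POI_NUM), all 5! = 120 orderings of the page (PERMUATION_SIZE in config.py). A minimal numpy sketch of how such an index fans per-slot features out into per-permutation candidates (names are mine; note tools.allPermutation is 1-based, so indexing real model tensors needs an extra -1):

import itertools
import numpy as np

POI_NUM, FEAT = 5, 9
perms = np.array(list(itertools.permutations(range(POI_NUM))))  # [120, 5], 0-based here
page = np.random.rand(POI_NUM, FEAT)                            # one page's slot features
candidates = page[perms]                                        # [120, 5, FEAT]
# a scorer can now rate all 120 arrangements in one batched forward pass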
-------------------------------------------------------------------------------- /rerank_paper/pier_model/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 | predict_fn = predictor.from_saved_model(model_filename_dir)
9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 | # env_feature => dense_feature
12 | # cxr_feature => screen_predict_feature
13 | # cat_feature => screen_cate_feature
14 | # dense_feature => screen_dense_feature
15 | predictions = predict_fn({
16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 | [0.027565, 0.07474336, 0.04988268, 0.53],
18 | [0.024815, 0.1775544, 0.12052802, 0.24],
19 | [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 | # dense features (price, rating)
21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 | [318., 14.675659, 0., 5., 4.94],
23 | [637., 24.784016, 0., 5., 4.65],
24 | [185., 25.333273, 0., 5., 4.75]]],
25 | # categorical features (category ids)
26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 | # environment features (whether a platinum ad is present)
31 | 'dense_feature': [[0., 0.]]
32 | })
33 | 
34 | print('Q_network_output:', predictions['Q_network_output'])
35 | print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 | # load_pg_model()
39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model/run.sh:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 | mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 | def __init__(self, process_name, verbose=1):
9 | self.process_name = process_name
10 | self.verbose = verbose
11 | 
12 | def __enter__(self):
13 | if self.verbose:
14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 | self.begin_time = time.time()
16 | 
17 | def __exit__(self, type, value, traceback):
18 | if self.verbose:
19 | end_time = time.time()
20 | duration_seconds = end_time - self.begin_time
21 | duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 | print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 | def __init__(self, feature_info_str):
28 | self.feature_info_str = feature_info_str
29 | 
30 | self.feature_name = "NonFeaName"
31 | self.feature_size = 0
32 | self.feature_mask = 1
33 | self.parse_info_flag = False
34 | self.part_num = 3
35 | 
36 | self._parse_info()
37 | 
38 | def _parse_info(self):
39 | infoList = self.feature_info_str.split()
40 | 
41 | if len(infoList) == self.part_num:
42 | self.feature_name = infoList[0]
43 | self.feature_size = int(infoList[1])
44 | self.feature_mask = int(infoList[2])
45 | self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 | try:
50 | if not os.path.exists(feature_mask_file):
51 | print("parse_mask_file fail - file not exists:", feature_mask_file)
52 | return [], False, 0
53 | # feature_name_list = []
54 | feature_mask_list = []
55 | feature_hold_cnt = 0
56 | 
57 | with open(feature_mask_file) as f:
58 | str_list = f.readlines()
59 | 
60 | for i in range(0, len(str_list)):
61 | str_list[i] = str_list[i].strip('\n').strip()
62 | if str_list[i] == "":
63 | continue
64 | 
65 | info = FeatureInfo(str_list[i])
66 | if not info.parse_info_flag:
67 | print("parse_mask_file fail - parse_info fail:", str_list[i])
68 | parse_mask_flag = False
69 | return [], parse_mask_flag, 0
70 | 
71 | for j in range(info.feature_size):
72 | feature_mask_list.append(info.feature_mask)
73 | if info.feature_mask != 0:
74 | feature_hold_cnt += 1
75 | # if info.feature_size > 1:
76 | # feature_name_list.append(info.feature_name + "_" + str(j))
77 | # else:
78 | # feature_name_list.append(info.feature_name)
79 | 
80 | parse_mask_flag = True
81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 | except Exception as e:
83 | print("parse_mask_file fail - Exception:", e)
84 | return [], False, 0
85 | 
86 | import itertools
87 | 
88 | 
89 | # Use itertools.permutations to enumerate every ordering of a given arrangement
90 | def allPermutation(n):
91 | permutation = []
92 | # first initialise the arrangement 1..n
93 | for i in range(n):
94 | permutation.append(i+1)
95 | # itertools.permutations returns an iterator, so convert it to a list
96 | # each permutation is stored as a tuple
97 | 
all_permutation = list(itertools.permutations(permutation)) 98 | return all_permutation 99 | 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 106 | # print(feature_mask_list) 107 | # print(len(feature_mask_list)) 108 | # print(parse_feature_mask_flag) 109 | # print(feature_hold_cnt) 110 | print(allPermutation(5)) 111 | -------------------------------------------------------------------------------- /rerank_paper/pier_model/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! 
-d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.001 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_listwise_whole_v4" 16 | 17 | USE_CONSTRATIVE_LOSS = True 18 | CONSTRATIVE_LOSS_K = 0.01 19 | 20 | # poi类别特征 21 | FEATURE_CATE_NUM = 7 # v1r3:19 22 | # dense特征 23 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 24 | # 预估值特征 25 | FEATURE_CXR_NUM = 3 26 | # 环境特征 27 | FEATURE_ENV_NUM = 2 28 | # 自然poi 29 | FEATURE_NATURE_POI = 25 30 | 31 | # embedding_look_up维度 32 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 33 | 34 | # hash vector 35 | HASH_VECTOR = [[0.4011729380078948, 0.9907933488085712, 0.889170658914898, 0.015553701343792192, 0.2269008585804, 0.889609750280199, 0.19280345796299014, 0.33362195188257815], 36 | [0.1174580997476552, 0.4475570392889796, 0.9925939893135071, 0.7296640075216434, 0.6436313332140967, 0.8568068597644793, 0.9018604021483339, 0.28774343258170776], 37 | [0.4374993384759095, 0.6807149381274915, 0.20502126763268802, 0.6968687323602859, 0.6449029002244834, 0.08732080642447282, 0.6119476780855001, 0.8616788453789646], 38 | [0.19344266090314144, 0.4268233179146762, 0.10951597767480326, 0.9867838283258178, 0.8340011944969644, 0.7992329879482085, 0.005303560724105649, 0.9662924610057512], 39 | [0.5824282763301396, 0.5090708710943849, 0.4462703076663568, 0.5482943153972023, 0.06782829736851825, 0.08907408658464577, 0.13400486343251583, 0.1848228429919272]] 40 | 41 | POSITION_ENCODING = [[0.08790239841717873, 0.033267486152506076, 0.5495130189114207, 0.17419777583517537, 0.6838981992197484, 0.07935154925635501, 0.02665372302227631, 0.4181008411574786], 42 | [0.02038159041970189, 0.9310485215006198, 0.723661313625571, 
0.5110323516977285, 0.7812851438212606, 0.40722710713947474, 0.04646490014890503, 0.9565353323381218], 43 | [0.17279361698259843, 0.12968275664201512, 0.3183103529758954, 0.6360591081256931, 0.1558507653689548, 0.5972802646455662, 0.4380619835390329, 0.6088094249662641], 44 | [0.038262298606345335, 0.9999545626217287, 0.7113776275017341, 0.5434618368150265, 0.6853527957705402, 0.9662653254145415, 0.9641592716989676, 0.3443813983264], 45 | [0.7903243938847678, 0.9952713339078417, 0.8741415264071601, 0.45665348276461737, 0.7693872696125916, 0.772509599868299, 0.2540369924156157, 0.24781240400239857]] 46 | 47 | 48 | TIME_AWARE_WEIGHT = [[1/2,1/2,1/3,1/3,1/4]] 49 | PERMUATION_SIZE = 120 50 | TOP_K = 20 51 | 52 | 53 | EXPOSE_RATE_FOR_BEAM_SEARCH = [[1.0,0.9,0.8,0.7,0.6]] 54 | 55 | # N: Cut Number of POI For Train 56 | POI_NUM = 5 57 | FEATURE_NUM = 9 58 | PAGE_NUM = 5 59 | FEATURE_NUM_FOR_PAGE = 11 60 | 61 | # 属性特征:KA AOR BRAND 62 | FEATURE_ATTR_NUM = 3 63 | 64 | # DELIVERY_FEAT 65 | DELIVERY_FEAT_NUM = 4 66 | 67 | # OUT NUM 68 | OUT_NUM = 1 69 | 70 | PLACE_HOLDER_NUM = 11 71 | DENSE_FEAT_NUM = 439 72 | 73 | 74 | # 网络结构参数 75 | MODEL_PARAMS = { 76 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 20], 77 | 'INPUT_TENSOR_LAYERS_B': [128, 32], 78 | 'INPUT_TENSOR_LAYERS_C': [50, 20], 79 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 80 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 81 | } 82 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 83 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 3 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 84 | DIN_CONF = {} 85 | 86 | # train data 87 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 88 | if DATA_MODE == 1: 89 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 90 | VALID_FILE = TRAIN_FILE 91 | PREDICT_FILE = VALID_FILE 92 | TEST_FILE = PREDICT_FILE 93 | elif DATA_MODE == 2: 94 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 95 | VALID_FILE = TRAIN_FILE 96 | TEST_FILE = VALID_FILE 97 | elif DATA_MODE == 3: 98 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-000*1"] 99 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-000*1"] 100 | TEST_FILE= ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-000*1"] 101 | elif DATA_MODE == 4: 102 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 103 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 104 | TRAIN_LIST = ["20220123"] 105 | VALID_LIST = ["20220124"] 106 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 107 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 108 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 109 | 110 | # 辅助脚本 111 | MEAN_VAR_PATH_POI = "./avg_std/poi" 112 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 113 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 114 | MODEL_SAVE_PB_EPOCH_ON = False 115 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 116 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq 
BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_whole_framework/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! 
-d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 &
15 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_whole_framework/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | import math
7 | import random
8 | 
9 | class tick_tock:
10 |     def __init__(self, process_name, verbose=1):
11 |         self.process_name = process_name
12 |         self.verbose = verbose
13 | 
14 |     def __enter__(self):
15 |         if self.verbose:
16 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
17 |         self.begin_time = time.time()
18 | 
19 |     def __exit__(self, type, value, traceback):
20 |         if self.verbose:
21 |             end_time = time.time()
22 |             duration_seconds = end_time - self.begin_time
23 |             duration = str(datetime.timedelta(seconds=duration_seconds))
24 | 
25 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
26 | 
27 | 
28 | class FeatureInfo:
29 |     def __init__(self, feature_info_str):
30 |         self.feature_info_str = feature_info_str
31 | 
32 |         self.feature_name = "NonFeaName"
33 |         self.feature_size = 0
34 |         self.feature_mask = 1
35 |         self.parse_info_flag = False
36 |         self.part_num = 3
37 | 
38 |         self._parse_info()
39 | 
40 |     def _parse_info(self):
41 |         infoList = self.feature_info_str.split()
42 | 
43 |         if len(infoList) == self.part_num:
44 |             self.feature_name = infoList[0]
45 |             self.feature_size = int(infoList[1])
46 |             self.feature_mask = int(infoList[2])
47 |             self.parse_info_flag = True
48 | 
49 | 
50 | def parse_mask_file(feature_mask_file):
51 |     try:
52 |         if not os.path.exists(feature_mask_file):
53 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
54 |             return [], False, 0
55 |         # feature_name_list = []
56 |         feature_mask_list = []
57 |         feature_hold_cnt = 0
58 | 
59 |         with open(feature_mask_file) as f:
60 |             str_list = f.readlines()
61 | 
62 |         for i in range(0, len(str_list)):
63 |             str_list[i] = str_list[i].strip('\n').strip()
64 |             if str_list[i] == "":
65 |                 continue
66 | 
67 |             info = FeatureInfo(str_list[i])
68 |             if not info.parse_info_flag:
69 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
70 |                 parse_mask_flag = False
71 |                 return [], parse_mask_flag, 0
72 | 
73 |             for j in range(info.feature_size):
74 |                 feature_mask_list.append(info.feature_mask)
75 |                 if info.feature_mask != 0:
76 |                     feature_hold_cnt += 1
77 |             # if info.feature_size > 1:
78 |             #     feature_name_list.append(info.feature_name + "_" + str(j))
79 |             # else:
80 |             #     feature_name_list.append(info.feature_name)
81 | 
82 |         parse_mask_flag = True
83 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
84 |     except Exception as e:
85 |         print("parse_mask_file fail - Exception:", e)
86 |         return [], False, 0
87 | 
88 | import itertools
89 | 
90 | 
91 | # Enumerate all permutations of a given sequence using itertools.permutations
92 | def allPermutation(n):
93 |     permutation = []
94 |     # first initialise the base permutation 0..n-1
95 |     for i in range(n):
96 |         permutation.append(i)
97 |     # itertools.permutations returns a lazy iterator, so materialise it as a list;
98 |     # each permutation is stored as a tuple
99 |     all_permutation = list(itertools.permutations(permutation))
100 |     return all_permutation
101 | 
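# --- Hedged sketch (illustrative addition, not part of the original file): with
# POI_NUM = 5 and PERMUATION_SIZE = 120 in config.py, allPermutation enumerates
# every ordering of a 5-item page; 5! = 120, so the two constants are consistent.
def _demo_all_permutation():
    perms = allPermutation(5)           # 120 tuples, each a permutation of 0..4
    assert len(perms) == 120
    assert perms[0] == (0, 1, 2, 3, 4)  # itertools yields the identity ordering first
    return perms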
102 | 103 | def random_vector(): 104 | print([[random.random() for x in list(range(0,8))] for y in list(range(0,5))]) 105 | 106 | 107 | 108 | 109 | if __name__ == "__main__": 110 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 111 | # print(feature_mask_list) 112 | # print(len(feature_mask_list)) 113 | # print(parse_feature_mask_flag) 114 | # print(feature_hold_cnt) 115 | # print(allPermutation(5)) 116 | random_vector() 117 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_whole_framework/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo 
${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! -d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_model_without_oam_atten_v1" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] * 1 + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = 
['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock,allPermutation 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # 
FEATURE_CATE_NUM: category-related features
65 |     # FEATURE_DENSE_NUM: continuous-valued features
66 |     # FEATURE_CXR_NUM: model-predicted-value features
67 | 
68 |     full_permuation_index = allPermutation(POI_NUM)
69 | 
70 |     # current page
71 |     current_page_start = 0
72 |     current_page_end = current_page_start + POI_NUM * FEATURE_NUM
73 | 
74 |     pre_page_start = current_page_end
75 |     pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
76 | 
77 | 
78 |     cur_page_features = tf.reshape(tf.gather(feature_buffer, list(range(current_page_start, current_page_end)), axis=1), [-1, POI_NUM, FEATURE_NUM])
79 |     pre_page_features = tf.reshape(tf.gather(feature_buffer, list(range(pre_page_start, pre_page_end)), axis=1), [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE])
80 | 
81 |     position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2)
82 |     adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2)
83 |     obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2)
84 |     hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2)
85 |     locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2)
86 |     categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2)
87 |     price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2)
88 |     iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2)
89 |     userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2)
90 | 
91 |     pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3)
92 |     pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3)
93 |     pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3)
94 |     pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3)
95 |     pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3)
96 |     pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3)
97 |     pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3)
98 |     pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3)
99 |     pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3)
100 | 
101 |     features_result = {
102 | 
103 |         'dense_feature': hist_ctr_fea,
104 |         # categorical features (category)
105 |         'cate_feature': tf.cast(tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea], axis=2), tf.int64),
106 |         # ctr_label
107 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
108 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
109 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
110 |         'behavior_dense_feature': pre_hist_ctr_fea,
111 |         # categorical features (category)
112 |         'behavior_cate_feature': tf.cast(tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea,
113 |                                                     pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea], axis=3), tf.int64),
114 | 
115 |         'full_permuation_index': tf.constant(full_permuation_index, tf.int32)
116 | 
117 |     }
118 |     return features_result
119 | 
120 | 
121 | # num_parallel is the number of CPU cores, controlling the parallelism of map
122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4):
123 |     def input_fn():
124 |         _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn()
125 |         files = tf.data.Dataset.list_files(file_names)
126 |         # print(files)
127 |         dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10))
128 |         dataset = dataset.prefetch(buffer_size=batch_size * 10)
129 |         dataset = 
dataset.repeat(epoch) 130 | dataset = dataset.batch(batch_size) 131 | dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel) 132 | iterator = dataset.make_one_shot_iterator() 133 | return iterator.get_next() 134 | 135 | return input_fn 136 | 137 | 138 | # 从hive表统计得到均值和方差文件 139 | def get_normalization_parameter(mean_var_path): 140 | with tf.gfile.Open(mean_var_path) as f: 141 | fea_mean = f.readline().strip().split(' ') 142 | fea_var = f.readline().strip().split(' ') 143 | cont_fea_mean = list(map(float, fea_mean)) 144 | cont_fea_var = list(map(float, fea_var)) 145 | f.close() 146 | return cont_fea_mean, cont_fea_var 147 | 148 | 149 | def get_bias_weight_parameter(bias_weight_path): 150 | with tf.gfile.Open(bias_weight_path) as f2: 151 | fea_mean = f2.readline().strip().split('\t') 152 | cont_fea_mean = list(map(float, fea_mean)) 153 | f2.close() 154 | return cont_fea_mean 155 | 156 | 157 | 158 | if __name__ == '__main__': 159 | train_file = TRAIN_FILE 160 | # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"] 161 | train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=16, epoch=1) 162 | features, labels = train_input_fn() 163 | 164 | sess = tf.Session() 165 | try: 166 | with tick_tock("DATA_INPUT") as _: 167 | features_np, labels_np = sess.run([features, labels]) 168 | 169 | print("*" * 100, "features_np") 170 | for key in features_np: 171 | print("=" * 50, key, np.shape(features_np[key])) 172 | print(features_np[key]) 173 | 174 | 175 | print("*" * 100, "labels_np") 176 | for key in labels_np: 177 | print("=" * 50, key, np.shape(labels_np[key])) 178 | print(labels_np[key]) 179 | 180 | except Exception as e: 181 | print(e) 182 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | 
from model import *
4 | from sklearn import metrics
5 | 
6 | def create_estimator():
7 |     tf.logging.set_verbosity(tf.logging.INFO)
8 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
9 |     session_config.gpu_options.allow_growth = True
10 |     config = tf.estimator.RunConfig(
11 |         tf_random_seed=RANDOM_SEED,
12 |         save_summary_steps=100,
13 |         save_checkpoints_steps=1000,
14 |         model_dir=MODEL_SAVE_PATH,
15 |         keep_checkpoint_max=2,
16 |         log_step_count_steps=1000,
17 |         session_config=session_config)
18 |     nn_model = DNN()
19 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 |     return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 |     estimator._params['save_model'] = params['save_model']
25 | 
26 |     def _serving_input_receiver_fn():
27 |         # env_feature   => dense_feature
28 |         # cxr_feature   => screen_predict_feature
29 |         # cat_feature   => screen_cate_feature
30 |         # dense_feature => screen_dense_feature
31 |         receiver_tensors = {
32 |             # predicted ctr / cvr / gmv values && bid
33 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 |                                                      name='screen_predict_feature'),
35 |             # dense features (price, rating)
36 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 |                                                    name='screen_dense_feature'),
38 |             # categorical features (category)
39 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 |                                                   name='screen_cate_feature'),
41 |             # environment features (whether a platinum ad is present)
42 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 |                                             name='dense_feature')
44 |         }
45 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
49 |     estimator._params.pop('save_model')
50 |     return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 |     y_ctr, pred_ctr, ctr = [], [], []
55 |     for result in result_generator:
56 |         cxr_feature = result['cxr_feature']
57 |         mask = result['mask']
58 |         # ctr_label
59 |         idx = np.where(mask.reshape(-1) == 1)
60 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
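# --- Hedged sketch (illustrative addition, not part of the original file): how
# calculate_result uses `mask` above -- predictions are flattened and only slots
# with mask == 1 (real POIs rather than padding) enter the AUC; toy shapes assumed.
def _demo_mask_selection():
    mask = np.array([[1., 1., 0.], [1., 0., 0.]])   # 2 requests, 3 slots each
    preds = np.array([[.9, .2, .5], [.7, .3, .1]])
    idx = np.where(mask.reshape(-1) == 1)           # keep the 3 unpadded slots
    return preds.reshape(-1)[idx].tolist()          # -> [0.9, 0.2, 0.7]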
+ "/../ep" + str(ep_insert_index) 87 | while os.path.exists(target_dir): 88 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 89 | shutil.move(export_dir, target_dir) 90 | print(time.strftime("%m-%d %H:%M:%S ", 91 | time.localtime(time.time())) + "export model PB: " + target_dir) 92 | #with tick_tock("PREDICT") as _: 93 | #result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 94 | #calculate_result(result_generator) 95 | 96 | 97 | 98 | elif TRAIN_MODE == 2: 99 | with tick_tock("PREDICT") as _: 100 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 101 | calculate_result(result_generator) 102 | 103 | elif TRAIN_MODE == 3: 104 | for i in range(EPOCH): 105 | for idx, data in enumerate(TRAIN_FILE): 106 | with tick_tock("DATA_INPUT") as _: 107 | train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1) 108 | with tick_tock("TRAIN") as _: 109 | estimator.train(train_input_fn) 110 | with tick_tock("PREDICT") as _: 111 | result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False) 112 | print("valid_data") 113 | calculate_result(result_generator) 114 | #result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False) 115 | print("train_data") 116 | #calculate_result(result_generator) 117 | # save pb 118 | 119 | 120 | elif TRAIN_MODE == 4: 121 | export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'}, 122 | export_dir_base=MODEL_SAVE_PB_EPOCH_PATH) 123 | ep_insert_index = 0 124 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 125 | while os.path.exists(target_dir): 126 | target_dir = export_dir + "/../ep" + str(ep_insert_index) 127 | shutil.move(export_dir, target_dir) 128 | print(time.strftime("%m-%d %H:%M:%S ", 129 | time.localtime(time.time())) + "export model PB: " + target_dir) 130 | 131 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/model_load.py: -------------------------------------------------------------------------------- 1 | import tensorflow 2 | 3 | from config import * 4 | import tensorflow_core.contrib.predictor as predictor 5 | 6 | def load_listwise_model(): 7 | model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/" 8 | predict_fn = predictor.from_saved_model(model_filename_dir) 9 | # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir) 10 | 11 | # env_feature = > dense_feature 12 | # cxr_feature = > screen_predict_feature 13 | # cat_feature = > screen_cate_feature 14 | # dense_feature = > screen_dense_feature 15 | predictions = predict_fn({ 16 | 'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2], 17 | [0.027565, 0.07474336, 0.04988268, 0.53], 18 | [0.024815, 0.1775544, 0.12052802, 0.24], 19 | [0.023316, 0.12283709, 0.10298113, 0.1]]], 20 | # dense 特征 (价格,评分) 21 | 'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85], 22 | [318., 14.675659, 0., 5., 4.94], 23 | [637., 24.784016, 0., 5., 4.65], 24 | [185., 25.333273, 0., 5., 4.75]]], 25 | # 离散特征(品类) 26 | 'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284], 27 | [2638824, 3905410, 3212599, 3985407, 1997821, 3019284], 28 | [2638824, 4148885, 3622545, 3985407, 1997821, 3019284], 29 | [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]], 30 | # 环境特征(是否有铂金) 31 | 'dense_feature': [[0., 0.]] 32 | }) 33 | 34 | print('Q_network_output:', predictions['Q_network_output']) 35 | print('out:', 
predictions['out']) 36 | 37 | if __name__ == '__main__': 38 | # load_pg_model() 39 | load_listwise_model() -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/restore.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_without_oam_atten/restore.py -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | LOG_PATH="./" 4 | if [[ ! -d ${LOG_PATH} ]]; then 5 | mkdir ${LOG_PATH} 6 | fi 7 | 8 | project_path=$(cd `dirname $0`; pwd) 9 | project_name="${project_path##*/}" 10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S") 11 | author="yangfan129" 12 | 13 | LOG_FILENAME="log" 14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime ${begin_time} > ${LOG_PATH}${LOG_FILENAME} 2>&1 & 15 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import time 4 | import os 5 | 6 | 7 | class tick_tock: 8 | def __init__(self, process_name, verbose=1): 9 | self.process_name = process_name 10 | self.verbose = verbose 11 | 12 | def __enter__(self): 13 | if self.verbose: 14 | print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50)) 15 | self.begin_time = time.time() 16 | 17 | def __exit__(self, type, value, traceback): 18 | if self.verbose: 19 | end_time = time.time() 20 | duration_seconds = end_time - self.begin_time 21 | duration = str(datetime.timedelta(seconds=duration_seconds)) 22 | 23 | print(("#" * 50 + " {} END... 
time lapsing {} ".format(self.process_name, duration) + "#" * 50)) 24 | 25 | 26 | class FeatureInfo: 27 | def __init__(self, feature_info_str): 28 | self.feature_info_str = feature_info_str 29 | 30 | self.feature_name = "NonFeaName" 31 | self.feature_size = 0 32 | self.feature_mask = 1 33 | self.parse_info_flag = False 34 | self.part_num = 3 35 | 36 | self._parse_info() 37 | 38 | def _parse_info(self): 39 | infoList = self.feature_info_str.split() 40 | 41 | if len(infoList) == self.part_num: 42 | self.feature_name = infoList[0] 43 | self.feature_size = int(infoList[1]) 44 | self.feature_mask = int(infoList[2]) 45 | self.parse_info_flag = True 46 | 47 | 48 | def parse_mask_file(feature_mask_file): 49 | try: 50 | if not os.path.exists(feature_mask_file): 51 | print("parse_mask_file fail - file not exists:", feature_mask_file) 52 | return [], False 53 | # feature_name_list = [] 54 | feature_mask_list = [] 55 | feature_hold_cnt = 0 56 | 57 | with open(feature_mask_file) as f: 58 | str_list = f.readlines() 59 | 60 | for i in range(0, len(str_list)): 61 | str_list[i] = str_list[i].strip('\n').strip() 62 | if str_list[i] == "": 63 | continue 64 | 65 | info = FeatureInfo(str_list[i]) 66 | if not info.parse_info_flag: 67 | print("parse_mask_file fail - parse_info fail:", str_list[i]) 68 | parse_mask_flag = False 69 | return [], parse_mask_flag 70 | 71 | for j in range(info.feature_size): 72 | feature_mask_list.append(info.feature_mask) 73 | if info.feature_mask != 0: 74 | feature_hold_cnt += 1 75 | # if info.feature_size > 1: 76 | # feature_name_list.append(info.feature_name + "_" + str(j)) 77 | # else: 78 | # feature_name_list.append(info.feature_name) 79 | 80 | parse_mask_flag = True 81 | return feature_mask_list, parse_mask_flag, feature_hold_cnt 82 | except Exception as e: 83 | print("parse_mask_file fail - Exception:", e) 84 | return [], False 85 | 86 | import itertools 87 | 88 | 89 | # 利用itertools库中的permutations函数,给定一个排列,输出他的全排列 90 | def allPermutation(n): 91 | permutation = [] 92 | # 首先需要初始化一个1-n的排列 93 | for i in range(n): 94 | permutation.append(i+1) 95 | # itertools.permutations返回的只是一个对象,需要将其转化成list 96 | # 每一种排列情况以元组类型存储 97 | all_permutation = list(itertools.permutations(permutation)) 98 | return all_permutation 99 | 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask") 106 | # print(feature_mask_list) 107 | # print(len(feature_mask_list)) 108 | # print(parse_feature_mask_flag) 109 | # print(feature_hold_cnt) 110 | print(allPermutation(5)) 111 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_oam_atten/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def index_matrix_to_pairs(index_matrix): 4 | # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]], 5 | # [[0, 2], [1, 3], [2, 1]]] 6 | replicated_first_indices = tf.range(tf.shape(index_matrix)[0]) 7 | rank = len(index_matrix.get_shape()) 8 | if rank == 2: 9 | replicated_first_indices = tf.tile( 10 | tf.expand_dims(replicated_first_indices, axis=1), 11 | [1, tf.shape(index_matrix)[1]]) 12 | replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64) 13 | return tf.stack([replicated_first_indices, index_matrix], axis=rank) 14 | 15 | def string_hash_to_index(tensor, bucket=1<<22): 16 | return tf.strings.to_hash_bucket_fast(tensor, bucket) 17 | 18 | def int_to_string_with_key(tensor, 
key): 19 | return key + "_" + tf.strings.as_string(tensor) 20 | 21 | def float_to_string_with_key(tensor, key, precision=1): 22 | return key + "_" + tf.strings.as_string(tensor, precision) 23 | 24 | def float_to_int(tensor, order): 25 | wc = 10 ** order 26 | return tf.cast(tensor * wc, tf.int64) 27 | 28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22): 29 | tensor = float_to_string_with_key(tensor, key, precision) 30 | tensor = string_hash_to_index(tensor, bucket) 31 | return tensor 32 | 33 | def int_custom_hash(tensor, key, bucket=1<<22): 34 | tensor = int_to_string_with_key(tensor, key) 35 | tensor = string_hash_to_index(tensor, bucket) 36 | return tensor 37 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/delivery: -------------------------------------------------------------------------------- 1 | 4.668 40.692 17.227 2.616 2 | 2.722 10.623 13.181 1.742 3 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/poi: -------------------------------------------------------------------------------- 1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584 2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/avg_std/user: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/build_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | # 共建模型服务模版的路径 ------------------------ 修改模版配置 ---------------------- 5 | template_path="../model_template/" 6 | 7 | # 待上传模型pb文件的路径 ------------------------ 修改模型配置 ---------------------- 8 | model_path="../model/" 9 | model_name=$1 10 | ep=$2 11 | model_file=${model_path}${model_name}/${ep} 12 | echo ${model_file} 13 | target_path="../model_zip/" 14 | if [[ ! -d ${target_path} ]]; then 15 | mkdir ${target_path} 16 | fi 17 | target_name=${model_name}_${ep} 18 | target_file=${target_path}${target_name} 19 | target_name_zip=${target_name}.zip 20 | echo ${target_name} 21 | echo "---> 待上传模型的目标路径: "${target_file} 22 | if [[ -d ${target_file} ]]; then 23 | cmd_del="rm -fr ${target_file}" 24 | echo "---> 删除已存在的目标文件: "${cmd_del} 25 | ${cmd_del} 26 | fi 27 | if [[ ! 
-d ${target_file} ]]; then 28 | mkdir ${target_file} 29 | fi 30 | 31 | cmd_cp_model_pb="cp -fr ${model_file} ${target_file}/${target_name}" 32 | echo "---> 拷贝模型pb文件到目标目录: "${cmd_cp_model_pb} 33 | ${cmd_cp_model_pb} 34 | 35 | cmd_mv1="cp ${template_path}/model.properties ${target_file}/${target_name}.properties" 36 | cmd_mv2="cp ${template_path}/model.xml ${target_file}/${target_name}.xml" 37 | cmd_mv3="cp ${template_path}/tensors.xml ${target_file}/tensors.xml" 38 | cmd_mv4="cp ${template_path}/poi_feature.xml ${target_file}/poi_feature.xml" 39 | cmd_mv5="cp ${template_path}/poi_feature.tensor ${target_file}/poi_feature.tensor" 40 | echo "---> 修改properties文件名: "${cmd_mv1} 41 | echo "---> 修改xml文件名: "${cmd_mv2} 42 | ${cmd_mv1} 43 | ${cmd_mv2} 44 | ${cmd_mv3} 45 | ${cmd_mv4} 46 | ${cmd_mv5} 47 | 48 | cmd_zip1="cd ${target_path}" 49 | cmd_zip2="zip -r ${target_name_zip} ${target_name}" 50 | echo "---> 压缩上传模型文件: "${cmd_zip1} ${cmd_zip2} 51 | ${cmd_zip1} 52 | ${cmd_zip2} 53 | cd - 54 | 55 | echo ${target_name}.zip 56 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/config.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 -*- 2 | import shutil 3 | import time 4 | import os 5 | 6 | # result 7 | RANDOM_SEED = 2022 8 | BATCH_SIZE = 1024 9 | IMP_LOSS_WEIGHT = 0.02 10 | # basic config 11 | EPOCH = 1 12 | LEARNING_RATE = 0.005 13 | DATA_MODE = 3 # 1:local train,2:local test, 3:docker evaluate 14 | TRAIN_MODE = 3 15 | MODEL_NAME = "pier_model_without_page_atten" 16 | # poi类别特征 17 | FEATURE_CATE_NUM = 7 # v1r3:19 18 | # dense特征 19 | FEATURE_DENSE_NUM = 5 # v1:28 v1r2:79 v1r3:83 20 | # 预估值特征 21 | FEATURE_CXR_NUM = 3 22 | # 环境特征 23 | FEATURE_ENV_NUM = 2 24 | # 自然poi 25 | FEATURE_NATURE_POI = 25 26 | 27 | # embedding_look_up维度 28 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8] 29 | 30 | # N: Cut Number of POI For Train 31 | POI_NUM = 5 32 | FEATURE_NUM = 9 33 | PAGE_NUM = 5 34 | FEATURE_NUM_FOR_PAGE = 11 35 | PERMUATION_SIZE = 120 36 | # 属性特征:KA AOR BRAND 37 | FEATURE_ATTR_NUM = 3 38 | 39 | # DELIVERY_FEAT 40 | DELIVERY_FEAT_NUM = 4 41 | 42 | # OUT NUM 43 | OUT_NUM = 1 44 | 45 | PLACE_HOLDER_NUM = 11 46 | DENSE_FEAT_NUM = 439 47 | 48 | 49 | # 网络结构参数 50 | MODEL_PARAMS = { 51 | 'INPUT_TENSOR_LAYERS_A': [60, 32, 10], 52 | 'INPUT_TENSOR_LAYERS_B': [50, 20], 53 | 'INPUT_TENSOR_LAYERS_C': [60, 32, 10], 54 | 'INPUT_TENSOR_LAYERS_D': [50, 20], 55 | 'INPUT_TENSOR_LAYERS_E': [50, 20] 56 | } 57 | # A_INPUT_DIM = POI_NUM * (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1) 58 | MLP_INPUT_DIM = CATE_FEATURE_EMBEDDINGS_SHAPE[1] + 1 + MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] 59 | DIN_CONF = {} 60 | 61 | # train data 62 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1 63 | if DATA_MODE == 1: 64 | TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046'] 65 | VALID_FILE = TRAIN_FILE 66 | PREDICT_FILE = VALID_FILE 67 | TEST_FILE = PREDICT_FILE 68 | elif DATA_MODE == 2: 69 | TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*'] 70 | VALID_FILE = TRAIN_FILE 71 | TEST_FILE = VALID_FILE 72 | elif DATA_MODE == 3: 73 | TRAIN_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 75 | TEST_FILE= 
["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | elif DATA_MODE == 4: 77 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 78 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 79 | TRAIN_LIST = ["20220123"] 80 | VALID_LIST = ["20220124"] 81 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 82 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 83 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 84 | 85 | # 辅助脚本 86 | MEAN_VAR_PATH_POI = "./avg_std/poi" 87 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 88 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 89 | MODEL_SAVE_PB_EPOCH_ON = False 90 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 91 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def DIN(seq, seq_len, target, conf, scope="DIN"): 4 | # seq BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H 5 | # target BATCH_SIZE * FEAT_NUM : N * H 6 | # return : BATCH_SIZE * H 7 | 8 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 9 | seq_shape = tf.shape(seq) 10 | target = tf.tile(target, [1, seq_shape[1], 1]) 11 | 12 | input = tf.concat([seq, target, seq - target, seq * target], axis=-1) 13 | 14 | layers = conf.get("layers", [64, 32]) 15 | for layer in layers: 16 | input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_"+str(layer)) 17 | 18 | input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1) # N * M 19 | 20 | # Mask 21 | seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1])) 22 | # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100) 23 | padding = tf.ones_like(input) * (-2 ** 32 + 1) 24 | attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1) # N * M 25 | # attention = tf.Print(attention, [attention], message="attention", summarize=100) 26 | attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]]) 27 | output = tf.reduce_sum(tf.transpose(attention * seq, [0, 2, 1]), axis=1) 28 | return output 29 | -------------------------------------------------------------------------------- /rerank_paper/pier_model_without_page_atten/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from config import * 3 | from model import * 4 | from sklearn import metrics 5 | 6 | def create_estimator(): 7 | tf.logging.set_verbosity(tf.logging.INFO) 8 | session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 9 | session_config.gpu_options.allow_growth = True 10 | config = tf.estimator.RunConfig( 11 | tf_random_seed=RANDOM_SEED, 12 | save_summary_steps=100, 13 | save_checkpoints_steps=1000, 14 | model_dir=MODEL_SAVE_PATH, 15 | keep_checkpoint_max=2, 16 | log_step_count_steps=1000, 17 | session_config=session_config) 18 | nn_model = DNN() 19 | estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config) 20 | return estimator, nn_model 21 | 22 | 23 | def save_model_pb_with_estimator(estimator, params, export_dir_base): 24 | estimator._params['save_model'] = params['save_model'] 25 | 26 | def _serving_input_receiver_fn(): 27 | # env_feature = > dense_feature 28 | # cxr_feature = > 
/rerank_paper/pier_model_without_page_atten/layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def DIN(seq, seq_len, target, conf, scope="DIN"):
4 |     # seq    BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H
5 |     # target BATCH_SIZE * 1 * FEAT_NUM : N * 1 * H (tiled across the sequence below)
6 |     # return : BATCH_SIZE * H
7 | 
8 |     with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
9 |         seq_shape = tf.shape(seq)
10 |         target = tf.tile(target, [1, seq_shape[1], 1])
11 | 
12 |         input = tf.concat([seq, target, seq - target, seq * target], axis=-1)
13 | 
14 |         layers = conf.get("layers", [64, 32])
15 |         for layer in layers:
16 |             input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_" + str(layer))
17 | 
18 |         input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1)  # N * M
19 | 
20 |         # Mask: padded positions get a large negative logit so softmax assigns them ~0 weight
21 |         seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1]))
22 |         # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100)
23 |         padding = tf.ones_like(input) * (-2 ** 32 + 1)
24 |         attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1)  # N * M
25 |         # attention = tf.Print(attention, [attention], message="attention", summarize=100)
26 |         attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]])
27 |         output = tf.reduce_sum(attention * seq, axis=1)  # sum over the sequence axis M -> N * H, matching the contract above
28 |         return output
29 | 
--------------------------------------------------------------------------------
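A minimal smoke test for the DIN block above (a sketch assuming TF 1.x, as elsewhere in this repo; the shapes and the conf dict are illustrative, not taken from the training pipeline):

import tensorflow as tf
from layers import DIN

seq = tf.random_normal([2, 4, 8])     # N=2 examples, M=4 sequence items, H=8 features
seq_len = tf.constant([[4], [2]])     # valid sequence length per example
target = tf.random_normal([2, 1, 8])  # one target item per example
out = DIN(seq, seq_len, target, conf={"layers": [16, 8]})
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(out).shape)        # -> (2, 8), i.e. N * H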
/rerank_paper/pier_model_without_page_atten/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from config import *
3 | from model import *
4 | from sklearn import metrics
5 | import numpy as np  # used in calculate_result
6 | def create_estimator():
7 |     tf.logging.set_verbosity(tf.logging.INFO)
8 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
9 |     session_config.gpu_options.allow_growth = True
10 |     config = tf.estimator.RunConfig(
11 |         tf_random_seed=RANDOM_SEED,
12 |         save_summary_steps=100,
13 |         save_checkpoints_steps=1000,
14 |         model_dir=MODEL_SAVE_PATH,
15 |         keep_checkpoint_max=2,
16 |         log_step_count_steps=1000,
17 |         session_config=session_config)
18 |     nn_model = DNN()
19 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
20 |     return estimator, nn_model
21 | 
22 | 
23 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
24 |     estimator._params['save_model'] = params['save_model']
25 | 
26 |     def _serving_input_receiver_fn():
27 |         # env_feature   -> dense_feature
28 |         # cxr_feature   -> screen_predict_feature
29 |         # cat_feature   -> screen_cate_feature
30 |         # dense_feature -> screen_dense_feature
31 |         receiver_tensors = {
32 |             # predicted ctr/cvr/gmv && bid
33 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
34 |                                                      name='screen_predict_feature'),
35 |             # dense features (price, rating)
36 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
37 |                                                    name='screen_dense_feature'),
38 |             # categorical features (category ids)
39 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
40 |                                                   name='screen_cate_feature'),
41 |             # context features (e.g. whether a platinum ad is present)
42 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
43 |                                             name='dense_feature')
44 |         }
45 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
46 | 
47 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
48 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
49 |     estimator._params.pop('save_model')
50 |     return export_dir.decode()
51 | 
52 | def calculate_result(result_generator):
53 | 
54 |     y_ctr, pred_ctr, ctr = [], [], []
55 |     for result in result_generator:
56 |         cxr_feature = result['cxr_feature']
57 |         mask = result['mask']
58 |         # keep only the unpadded positions
59 |         idx = np.where(mask.reshape(-1) == 1)
60 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
61 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
62 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
63 | 
64 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = metrics.roc_auc_score(y_ctr, pred_ctr), metrics.roc_auc_score(y_ctr, ctr), np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr)
65 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     estimator, nn_model = create_estimator()
70 | 
71 |     with tick_tock("DATA_INPUT") as _:
72 |         valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
73 |         test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
74 | 
75 |     if TRAIN_MODE == 1:
76 |         for i in range(EPOCH):
77 |             for idx, data in enumerate(TRAIN_FILE):
78 |                 with tick_tock("DATA_INPUT") as _:
79 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
80 |                 with tick_tock("TRAIN") as _:
81 |                     estimator.train(train_input_fn)
82 |                 if MODEL_SAVE_PB_EPOCH_ON:
83 |                     export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
84 |                                                               export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
85 |                     ep_insert_index = i * len(TRAIN_FILE) + idx
86 |                     target_dir = export_dir + "/../ep" + str(ep_insert_index)
87 |                     while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
88 |                         ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
89 |                     shutil.move(export_dir, target_dir)
90 |                     print(time.strftime("%m-%d %H:%M:%S ",
91 |                                         time.localtime(time.time())) + "export model PB: " + target_dir)
92 |         # with tick_tock("PREDICT") as _:
93 |         #     result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
94 |         #     calculate_result(result_generator)
95 | 
96 | 
97 | 
98 |     elif TRAIN_MODE == 2:
99 |         with tick_tock("PREDICT") as _:
100 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
101 |             calculate_result(result_generator)
102 | 
103 |     elif TRAIN_MODE == 3:
104 |         for i in range(EPOCH):
105 |             for idx, data in enumerate(TRAIN_FILE):
106 |                 with tick_tock("DATA_INPUT") as _:
107 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
108 |                 with tick_tock("TRAIN") as _:
109 |                     estimator.train(train_input_fn)
110 |         with tick_tock("PREDICT") as _:
111 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
112 |             print("valid_data")
113 |             calculate_result(result_generator)
114 |             # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
115 |             print("train_data")
116 |             # calculate_result(result_generator)
117 |         # save pb
118 | 
119 | 
120 |     elif TRAIN_MODE == 4:
121 |         export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
122 |                                                   export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
123 |         ep_insert_index = 0
124 |         target_dir = export_dir + "/../ep" + str(ep_insert_index)
125 |         while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
126 |             ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
127 |         shutil.move(export_dir, target_dir)
128 |         print(time.strftime("%m-%d %H:%M:%S ",
129 |                             time.localtime(time.time())) + "export model PB: " + target_dir)
130 | 
131 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/model_load.py:
--------------------------------------------------------------------------------
1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 |     model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 |     predict_fn = predictor.from_saved_model(model_filename_dir)
9 |     # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 |     # env_feature   -> dense_feature
12 |     # cxr_feature   -> screen_predict_feature
13 |     # cat_feature   -> screen_cate_feature
14 |     # dense_feature -> screen_dense_feature
15 |     predictions = predict_fn({  # sample payload; the shapes below are illustrative and may not match the current config constants
16 |         'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 |                                     [0.027565, 0.07474336, 0.04988268, 0.53],
18 |                                     [0.024815, 0.1775544, 0.12052802, 0.24],
19 |                                     [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 |         # dense features (price, rating)
21 |         'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 |                                   [318., 14.675659, 0., 5., 4.94],
23 |                                   [637., 24.784016, 0., 5., 4.65],
24 |                                   [185., 25.333273, 0., 5., 4.75]]],
25 |         # categorical features (category ids)
26 |         'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 |                                  [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 |                                  [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 |                                  [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 |         # context features (e.g. whether a platinum ad is present)
31 |         'dense_feature': [[0., 0.]]
32 |     })
33 | 
34 |     print('Q_network_output:', predictions['Q_network_output'])
35 |     print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 |     # load_pg_model()
39 |     load_listwise_model()
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/restore.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/pier_model_without_page_atten/restore.py
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime "${begin_time}" > ${LOG_PATH}${LOG_FILENAME} 2>&1 &  # begin_time is quoted because it contains a space
15 | 
--------------------------------------------------------------------------------
/rerank_paper/pier_model_without_page_atten/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 |     def __init__(self, process_name, verbose=1):
9 |         self.process_name = process_name
10 |         self.verbose = verbose
11 | 
12 |     def __enter__(self):
13 |         if self.verbose:
14 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 |         self.begin_time = time.time()
16 | 
17 |     def __exit__(self, type, value, traceback):
18 |         if self.verbose:
19 |             end_time = time.time()
20 |             duration_seconds = end_time - self.begin_time
21 |             duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 |     def __init__(self, feature_info_str):
28 |         self.feature_info_str = feature_info_str
29 | 
30 |         self.feature_name = "NonFeaName"
31 |         self.feature_size = 0
32 |         self.feature_mask = 1
33 |         self.parse_info_flag = False
34 |         self.part_num = 3
35 | 
36 |         self._parse_info()
37 | 
38 |     def _parse_info(self):
39 |         infoList = self.feature_info_str.split()
40 | 
41 |         if len(infoList) == self.part_num:
42 |             self.feature_name = infoList[0]
43 |             self.feature_size = int(infoList[1])
44 |             self.feature_mask = int(infoList[2])
45 |             self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 |     try:
50 |         if not os.path.exists(feature_mask_file):
51 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
52 |             return [], False, 0  # keep the 3-tuple contract: (mask_list, ok_flag, hold_cnt)
53 |         # feature_name_list = []
54 |         feature_mask_list = []
55 |         feature_hold_cnt = 0
56 | 
57 |         with open(feature_mask_file) as f:
58 |             str_list = f.readlines()
59 | 
60 |         for i in range(0, len(str_list)):
61 |             str_list[i] = str_list[i].strip('\n').strip()
62 |             if str_list[i] == "":
63 |                 continue
64 | 
65 |             info = FeatureInfo(str_list[i])
66 |             if not info.parse_info_flag:
67 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
68 |                 parse_mask_flag = False
69 |                 return [], parse_mask_flag, 0
70 | 
71 |             for j in range(info.feature_size):
72 |                 feature_mask_list.append(info.feature_mask)
73 |                 if info.feature_mask != 0:
74 |                     feature_hold_cnt += 1
75 |                 # if info.feature_size > 1:
76 |                 #     feature_name_list.append(info.feature_name + "_" + str(j))
77 |                 # else:
78 |                 #     feature_name_list.append(info.feature_name)
79 | 
80 |         parse_mask_flag = True
81 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 |     except Exception as e:
83 |         print("parse_mask_file fail - Exception:", e)
84 |         return [], False, 0  # keep the 3-tuple contract
85 | 
86 | import itertools
87 | 
88 | 
89 | # Use itertools.permutations to enumerate every ordering of the sequence 1..n
90 | def allPermutation(n):
91 |     permutation = []
92 |     # initialize the identity permutation 1..n
93 |     for i in range(n):
94 |         permutation.append(i+1)
95 |     # itertools.permutations returns a lazy object, so materialize it as a list;
96 |     # each permutation is stored as a tuple
97 |     all_permutation = list(itertools.permutations(permutation))
98 |     return all_permutation
99 | 
100 | 
101 | 
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     # feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
106 |     # print(feature_mask_list)
107 |     # print(len(feature_mask_list))
108 |     # print(parse_feature_mask_flag)
109 |     # print(feature_hold_cnt)
110 |     print(allPermutation(5))
111 | 
--------------------------------------------------------------------------------
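For reference, a hypothetical input for parse_mask_file above (the feature names and sizes are invented for illustration): each line of the mask file is "<name> <size> <mask>", expanded to one mask entry per slot.

# contents of a toy "feature_mask" file:
#   price 3 1
#   brand 2 0
feature_mask_list, parse_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
# -> feature_mask_list == [1, 1, 1, 0, 0], parse_mask_flag == True, feature_hold_cnt == 3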
/rerank_paper/pier_model_without_page_atten/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 |     # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 |     #                        [[0, 2], [1, 3], [2, 1]]]
6 |     replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 |     rank = len(index_matrix.get_shape())
8 |     if rank == 2:
9 |         replicated_first_indices = tf.tile(
10 |             tf.expand_dims(replicated_first_indices, axis=1),
11 |             [1, tf.shape(index_matrix)[1]])
12 |     replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 |     return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 |     return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 |     return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 |     return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 |     wc = 10 ** order
26 |     return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 |     tensor = float_to_string_with_key(tensor, key, precision)
30 |     tensor = string_hash_to_index(tensor, bucket)
31 |     return tensor
32 | 
33 | def int_custom_hash(tensor, key, bucket=1<<22):
34 |     tensor = int_to_string_with_key(tensor, key)
35 |     tensor = string_hash_to_index(tensor, bucket)
36 |     return tensor
37 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/delivery:
--------------------------------------------------------------------------------
1 | 4.668 40.692 17.227 2.616
2 | 2.722 10.623 13.181 1.742
3 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/poi:
--------------------------------------------------------------------------------
1 | 2066.773487 23.97615642 26.98331926 3.040902147 4.622442584
2 | 2239.863594 17.22234242 18.71115826 1.824693613 0.597578887
--------------------------------------------------------------------------------
/rerank_paper/prm_model/avg_std/user:
--------------------------------------------------------------------------------
1 | 0 0
2 | 1 1
--------------------------------------------------------------------------------
"/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"] 74 | VALID_FILE = [ 75 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 76 | TEST_FILE = [ 77 | "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"] 78 | elif DATA_MODE == 4: 79 | DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/" 80 | #TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"] 81 | TRAIN_LIST = ["20220123"] 82 | VALID_LIST = ["20220124"] 83 | TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST] 84 | VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST] 85 | TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST] 86 | 87 | # 辅助脚本 88 | MEAN_VAR_PATH_POI = "./avg_std/poi" 89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery" 90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME 91 | MODEL_SAVE_PB_EPOCH_ON = False 92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs" 93 | -------------------------------------------------------------------------------- /rerank_paper/prm_model/data_input.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | from config import * 4 | import numpy as np 5 | from tools import tick_tock 6 | 7 | 8 | def generate_parse_tfrecord_local_fn(): 9 | def _parse_function(batch_examples): 10 | common_features, sequence_features = feature_parse_scheme() 11 | parsed_features = tf.parse_example( 12 | serialized=batch_examples, 13 | features=common_features 14 | ) 15 | features = feature_product(parsed_features) 16 | labels = label_product(parsed_features) 17 | return features, labels 18 | 19 | return _parse_function 20 | 21 | 22 | def generate_parse_valid_tfrecord_local_fn(): 23 | def _parse_function(batch_examples): 24 | common_features, sequence_features = feature_parse_scheme() 25 | parsed_features = tf.parse_example( 26 | serialized=batch_examples, 27 | features=common_features 28 | ) 29 | features = feature_product(parsed_features) 30 | labels = label_product(parsed_features) 31 | return features, labels 32 | 33 | return _parse_function 34 | 35 | 36 | def feature_parse_scheme(): 37 | label_len = POI_NUM * 2 + PAGE_NUM 38 | feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE 39 | common_features = { 40 | "label": tf.FixedLenFeature([label_len], dtype=tf.float32), 41 | "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32), 42 | } 43 | 44 | sequence_features = {} 45 | return common_features, sequence_features 46 | 47 | 48 | def label_product(parsed_features): 49 | labels = parsed_features['label'] 50 | 51 | labels_result = { 52 | # ctr_label 53 | 'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1), 54 | 'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1), 55 | 'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1), 56 | } 57 | return labels_result 58 | 59 | 60 | def feature_product(parsed_features): 61 | feature_buffer = parsed_features['feature'] 62 | labels = parsed_features['label'] 63 | # 获取特征 64 | # FEATURE_CATE_NUM:品类相关特征 65 | # FEATURE_DENSE_NUM:连续值特征 66 | # FEATURE_CXR_NUM:模型预估值特征 67 | 68 | # current page 69 | current_page_start = 0 70 | current_page_end = current_page_start + POI_NUM * FEATURE_NUM 71 | 72 | pre_page_start = current_page_end 73 
/rerank_paper/prm_model/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import shutil
3 | import time
4 | import os
5 | 
6 | # result
7 | RANDOM_SEED = 2021
8 | BATCH_SIZE = 1024
9 | IMP_LOSS_WEIGHT = 0.02
10 | # basic config
11 | EPOCH = 1
12 | LEARNING_RATE = 0.005
13 | DATA_MODE = 1  # 1: local train, 2: local test, 3: docker evaluate, 4: docker multi-day tfrecord dirs
14 | TRAIN_MODE = 3  # 1: train (+ optional pb export), 2: predict, 3: train + evaluate, 4: export pb (see main.py)
15 | MODEL_NAME = "avito_listwise_model"
16 | # POI categorical features
17 | FEATURE_CATE_NUM = 7  # v1r3:19
18 | # POI dense features
19 | FEATURE_DENSE_NUM = 5  # v1:28 v1r2:79 v1r3:83
20 | # predicted-score features (ctr/cvr/gmv)
21 | FEATURE_CXR_NUM = 3
22 | # context (environment) features
23 | FEATURE_ENV_NUM = 2
24 | # organic (non-ad) POIs
25 | FEATURE_NATURE_POI = 25
26 | 
27 | # N: cut number of POIs for training
28 | POI_NUM = 5
29 | FEATURE_NUM = 9
30 | PAGE_NUM = 5
31 | FEATURE_NUM_FOR_PAGE = 11
32 | # attribute features: KA, AOR, BRAND
33 | FEATURE_ATTR_NUM = 3
34 | 
35 | # DELIVERY_FEAT
36 | DELIVERY_FEAT_NUM = 4
37 | 
38 | # OUT NUM
39 | OUT_NUM = 1
40 | 
41 | PLACE_HOLDER_NUM = 11
42 | DENSE_FEAT_NUM = 439
43 | 
44 | 
45 | # embedding lookup table shape: [bucket_size, embedding_dim]
46 | CATE_FEATURE_EMBEDDINGS_SHAPE = [1 << 22, 8]
47 | 
48 | # network architecture parameters
49 | MODEL_PARAMS = {
50 |     'INPUT_TENSOR_LAYERS_A': [60, 32, 10],
51 |     'INPUT_TENSOR_LAYERS_B': [50, 20],
52 |     'INPUT_TENSOR_LAYERS_C': [50, 20],
53 |     'INPUT_TENSOR_LAYERS_D': [50, 20],
54 |     'INPUT_TENSOR_LAYERS_E': [50, 20]
55 | }
56 | A_INPUT_DIM = (MODEL_PARAMS['INPUT_TENSOR_LAYERS_A'][-1] + 1)
57 | 
58 | DIN_CONF = {}
59 | 
60 | # train data
61 | # /users/lemonace/Downloads/tfrecord-rl-limit5-v1
62 | if DATA_MODE == 1:
63 |     TRAIN_FILE = ['/users/meituan_sxw/Downloads/part-r-00046']
64 |     VALID_FILE = TRAIN_FILE
65 |     PREDICT_FILE = VALID_FILE
66 |     TEST_FILE = PREDICT_FILE
67 | elif DATA_MODE == 2:
68 |     TRAIN_FILE = ['/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/test_data/part-r-*']
69 |     VALID_FILE = TRAIN_FILE
70 |     TEST_FILE = VALID_FILE
71 | elif DATA_MODE == 3:
72 |     TRAIN_FILE = [
73 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/train_data/part-r-*"]
74 |     VALID_FILE = [
75 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"]
76 |     TEST_FILE = [
77 |         "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v3_new/test_data/part-r-*"]
78 | elif DATA_MODE == 4:
79 |     DATA_FILE = "/home/hadoop-hmart-waimaiad/cephfs/data/yangfan129/train_data/tfrecord-multi-channel-v1/"
80 |     # TRAIN_LIST = ["20211222", "20211223", "20211224", "20211225"]
81 |     TRAIN_LIST = ["20220123"]
82 |     VALID_LIST = ["20220124"]
83 |     TRAIN_FILE = [DATA_FILE + x + "/part-r-*" for x in TRAIN_LIST]
84 |     VALID_FILE = [DATA_FILE + x + "/part-r-0001*" for x in VALID_LIST]
85 |     TEST_FILE = [DATA_FILE + x + "/part-r-00011" for x in TRAIN_LIST]
86 | 
87 | # auxiliary paths (normalization stats and model export dirs)
88 | MEAN_VAR_PATH_POI = "./avg_std/poi"
89 | MEAN_VAR_PATH_DELIVERY = "./avg_std/delivery"
90 | MODEL_SAVE_PATH = "../model/" + MODEL_NAME
91 | MODEL_SAVE_PB_EPOCH_ON = False
92 | MODEL_SAVE_PB_EPOCH_PATH = MODEL_SAVE_PATH + "_pbs"
93 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/data_input.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | from config import *
4 | import numpy as np
5 | from tools import tick_tock
6 | 
7 | 
8 | def generate_parse_tfrecord_local_fn():
9 |     def _parse_function(batch_examples):
10 |         common_features, sequence_features = feature_parse_scheme()
11 |         parsed_features = tf.parse_example(
12 |             serialized=batch_examples,
13 |             features=common_features
14 |         )
15 |         features = feature_product(parsed_features)
16 |         labels = label_product(parsed_features)
17 |         return features, labels
18 | 
19 |     return _parse_function
20 | 
21 | 
22 | def generate_parse_valid_tfrecord_local_fn():  # identical to the train parser; kept separate for symmetry
23 |     def _parse_function(batch_examples):
24 |         common_features, sequence_features = feature_parse_scheme()
25 |         parsed_features = tf.parse_example(
26 |             serialized=batch_examples,
27 |             features=common_features
28 |         )
29 |         features = feature_product(parsed_features)
30 |         labels = label_product(parsed_features)
31 |         return features, labels
32 | 
33 |     return _parse_function
34 | 
35 | 
36 | def feature_parse_scheme():
37 |     label_len = POI_NUM * 2 + PAGE_NUM
38 |     feature_len = POI_NUM * FEATURE_NUM + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
39 |     common_features = {
40 |         "label": tf.FixedLenFeature([label_len], dtype=tf.float32),
41 |         "feature": tf.FixedLenFeature([feature_len], dtype=tf.float32),
42 |     }
43 | 
44 |     sequence_features = {}
45 |     return common_features, sequence_features
46 | 
47 | 
48 | def label_product(parsed_features):
49 |     labels = parsed_features['label']
50 | 
51 |     labels_result = {
52 |         # click labels for the current screen
53 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
54 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
55 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
56 |     }
57 |     return labels_result
58 | 
59 | 
60 | def feature_product(parsed_features):
61 |     feature_buffer = parsed_features['feature']
62 |     labels = parsed_features['label']
63 |     # slice the flat feature buffer into groups:
64 |     # FEATURE_CATE_NUM: category-related features
65 |     # FEATURE_DENSE_NUM: continuous-valued features
66 |     # FEATURE_CXR_NUM: model-predicted score features
67 | 
68 |     # current page
69 |     current_page_start = 0
70 |     current_page_end = current_page_start + POI_NUM * FEATURE_NUM
71 | 
72 |     pre_page_start = current_page_end
73 |     pre_page_end = pre_page_start + PAGE_NUM * POI_NUM * FEATURE_NUM_FOR_PAGE
74 | 
75 |     cur_page_features = tf.reshape(tf.gather(feature_buffer, list(range(current_page_start, current_page_end)), axis=1),
76 |                                    [-1, POI_NUM, FEATURE_NUM])
77 |     pre_page_features = tf.reshape(tf.gather(feature_buffer, list(range(pre_page_start, pre_page_end)), axis=1),
78 |                                    [-1, PAGE_NUM, POI_NUM, FEATURE_NUM_FOR_PAGE])
79 | 
80 |     position_fea = tf.gather(cur_page_features, list(range(0, 1)), axis=2)
81 |     adid_fea = tf.gather(cur_page_features, list(range(1, 2)), axis=2)
82 |     obj_type_fea = tf.gather(cur_page_features, list(range(2, 3)), axis=2)
83 |     hist_ctr_fea = tf.gather(cur_page_features, list(range(3, 4)), axis=2)
84 |     locationid_fea = tf.gather(cur_page_features, list(range(4, 5)), axis=2)
85 |     categoryid_fea = tf.gather(cur_page_features, list(range(5, 6)), axis=2)
86 |     price_fea = tf.gather(cur_page_features, list(range(6, 7)), axis=2)
87 |     iscontext_fea = tf.gather(cur_page_features, list(range(7, 8)), axis=2)
88 |     userid_fea = tf.gather(cur_page_features, list(range(8, 9)), axis=2)
89 | 
90 |     pre_position_fea = tf.gather(pre_page_features, list(range(0, 1)), axis=3)
91 |     pre_adid_fea = tf.gather(pre_page_features, list(range(1, 2)), axis=3)
92 |     pre_obj_type_fea = tf.gather(pre_page_features, list(range(2, 3)), axis=3)
93 |     pre_hist_ctr_fea = tf.gather(pre_page_features, list(range(3, 4)), axis=3)
94 |     pre_locationid_fea = tf.gather(pre_page_features, list(range(4, 5)), axis=3)
95 |     pre_categoryid_fea = tf.gather(pre_page_features, list(range(5, 6)), axis=3)
96 |     pre_price_fea = tf.gather(pre_page_features, list(range(6, 7)), axis=3)
97 |     pre_iscontext_fea = tf.gather(pre_page_features, list(range(7, 8)), axis=3)
98 |     pre_userid_fea = tf.gather(pre_page_features, list(range(8, 9)), axis=3)
99 | 
100 |     features_result = {
101 | 
102 |         'dense_feature': hist_ctr_fea,
103 |         # categorical features (category ids)
104 |         'cate_feature': tf.cast(
105 |             tf.concat([position_fea, adid_fea, obj_type_fea, locationid_fea, iscontext_fea, categoryid_fea, userid_fea],
106 |                       axis=2), tf.int64),
107 |         # ctr_label
108 |         'ctr_label': tf.gather(labels, list(range(0, POI_NUM)), axis=1),
109 |         'mask': tf.gather(labels, list(range(POI_NUM, 2 * POI_NUM)), axis=1),
110 |         'page_mask': tf.gather(labels, list(range(2 * POI_NUM, 2 * POI_NUM + PAGE_NUM)), axis=1),
111 |         'behavior_dense_feature': pre_hist_ctr_fea,
112 |         # categorical features of the previous pages
113 |         'behavior_cate_feature': tf.cast(
114 |             tf.concat([pre_position_fea, pre_adid_fea, pre_obj_type_fea, pre_locationid_fea,
115 |                        pre_iscontext_fea, pre_categoryid_fea, pre_userid_fea], axis=3), tf.int64)
116 | 
117 |     }
118 |     return features_result
119 | 
120 | 
121 | # num_parallel is the number of CPU cores used for the parallel map
122 | def input_fn_maker(file_names, is_train, batch_size, epoch=None, num_parallel=4):
123 |     def input_fn():
124 |         _parse_fn = generate_parse_tfrecord_local_fn() if is_train else generate_parse_valid_tfrecord_local_fn()
125 |         files = tf.data.Dataset.list_files(file_names)
126 |         # print(files)
127 |         dataset = files.apply(tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4 * 10))
128 |         dataset = dataset.prefetch(buffer_size=batch_size * 10)
129 |         dataset = dataset.repeat(epoch)
130 |         dataset = dataset.batch(batch_size)
131 |         dataset = dataset.map(_parse_fn, num_parallel_calls=num_parallel)
132 |         iterator = dataset.make_one_shot_iterator()
133 |         return iterator.get_next()
134 | 
135 |     return input_fn
136 | 
137 | 
138 | # mean/std files computed from a Hive table
139 | def get_normalization_parameter(mean_var_path):
140 |     with tf.gfile.Open(mean_var_path) as f:
141 |         fea_mean = f.readline().strip().split(' ')
142 |         fea_var = f.readline().strip().split(' ')
143 |         cont_fea_mean = list(map(float, fea_mean))
144 |         cont_fea_var = list(map(float, fea_var))
145 |         f.close()  # redundant inside the with-block, but harmless
146 |     return cont_fea_mean, cont_fea_var
147 | 
148 | 
149 | def get_bias_weight_parameter(bias_weight_path):
150 |     with tf.gfile.Open(bias_weight_path) as f2:
151 |         fea_mean = f2.readline().strip().split('\t')
152 |         cont_fea_mean = list(map(float, fea_mean))
153 |         f2.close()  # redundant inside the with-block, but harmless
154 |     return cont_fea_mean
155 | 
156 | 
157 | if __name__ == '__main__':
158 |     train_file = TRAIN_FILE
159 |     # train_file = ["/mnt/dolphinfs/hdd_pool/docker/user/hadoop-hmart-waimaiad/yangfan129/train_data/avito_v1_new/avito_v1_new/train_data/part-r-00000"]
160 |     train_input_fn = input_fn_maker(train_file, is_train=True, batch_size=1000, epoch=1)
161 |     features, labels = train_input_fn()
162 | 
163 |     sess = tf.Session()
164 |     try:
165 |         with tick_tock("DATA_INPUT") as _:
166 |             features_np, labels_np = sess.run([features, labels])
167 | 
168 |         # print("*" * 100, "features_np")
169 |         # for key in features_np:
170 |         #     print("=" * 50, key, np.shape(features_np[key]))
171 |         #     print(features_np[key])
172 | 
173 |         print("*" * 100, "labels_np")
174 |         for key in labels_np:
175 |             print("=" * 50, key, np.shape(labels_np[key]))
176 |             print(labels_np[key])
177 | 
178 |     except Exception as e:
179 |         print(e)
180 | 
--------------------------------------------------------------------------------
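To make the label slicing above concrete, a small NumPy illustration of how one example's flat label vector decomposes (constants as in config.py; the values are placeholders):

import numpy as np

POI_NUM, PAGE_NUM = 5, 5
label = np.arange(2 * POI_NUM + PAGE_NUM)  # 15 label slots per example
ctr_label = label[:POI_NUM]                # click labels for the current screen's POIs
mask      = label[POI_NUM:2 * POI_NUM]     # which of the POI_NUM slots are real (not padding)
page_mask = label[2 * POI_NUM:]            # which of the PAGE_NUM history pages are real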
/rerank_paper/prm_model/layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def DIN(seq, seq_len, target, conf, scope="DIN"):
4 |     # seq    BATCH_SIZE * SEQ_LEN * FEAT_NUM : N * M * H
5 |     # target BATCH_SIZE * 1 * FEAT_NUM : N * 1 * H (tiled across the sequence below)
6 |     # return : BATCH_SIZE * H
7 | 
8 |     with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
9 |         seq_shape = tf.shape(seq)
10 |         target = tf.tile(target, [1, seq_shape[1], 1])
11 | 
12 |         input = tf.concat([seq, target, seq - target, seq * target], axis=-1)
13 | 
14 |         layers = conf.get("layers", [64, 32])
15 |         for layer in layers:
16 |             input = tf.layers.dense(input, layer, activation=tf.nn.sigmoid, name="att_" + str(layer))
17 | 
18 |         input = tf.squeeze(tf.layers.dense(input, 1, activation=None, name="att_final"), axis=-1)  # N * M
19 | 
20 |         # Mask: padded positions get a large negative logit so softmax assigns them ~0 weight
21 |         seq_mask = tf.squeeze(tf.sequence_mask(seq_len, seq_shape[1]))
22 |         # seq_mask = tf.Print(seq_mask, [seq_mask], message="seq_mask", summarize=100)
23 |         padding = tf.ones_like(input) * (-2 ** 32 + 1)
24 |         attention = tf.nn.softmax(tf.where(seq_mask, input, padding), axis=-1)  # N * M
25 |         # attention = tf.Print(attention, [attention], message="attention", summarize=100)
26 |         attention = tf.tile(tf.expand_dims(attention, axis=2), [1, 1, seq_shape[2]])
27 |         output = tf.reduce_sum(attention * seq, axis=1)  # sum over the sequence axis M -> N * H, matching the contract above
28 |         return output
29 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from config import *
3 | from model import *
4 | from sklearn import metrics
5 | import numpy as np  # used in calculate_result
6 | 
7 | def create_estimator():
8 |     tf.logging.set_verbosity(tf.logging.INFO)
9 |     session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
10 |     session_config.gpu_options.allow_growth = True
11 |     config = tf.estimator.RunConfig(
12 |         tf_random_seed=RANDOM_SEED,
13 |         save_summary_steps=100,
14 |         save_checkpoints_steps=1000,
15 |         model_dir=MODEL_SAVE_PATH,
16 |         keep_checkpoint_max=2,
17 |         log_step_count_steps=1000,
18 |         session_config=session_config)
19 |     nn_model = DNN()
20 |     estimator = tf.estimator.Estimator(model_fn=nn_model.model_fn_estimator, config=config)
21 |     return estimator, nn_model
22 | 
23 | 
24 | def save_model_pb_with_estimator(estimator, params, export_dir_base):
25 |     estimator._params['save_model'] = params['save_model']
26 | 
27 |     def _serving_input_receiver_fn():
28 |         # env_feature   -> dense_feature
29 |         # cxr_feature   -> screen_predict_feature
30 |         # cat_feature   -> screen_cate_feature
31 |         # dense_feature -> screen_dense_feature
32 |         receiver_tensors = {
33 |             # predicted ctr/cvr/gmv && bid
34 |             'screen_predict_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_CXR_NUM],
35 |                                                      name='screen_predict_feature'),
36 |             # dense features (price, rating)
37 |             'screen_dense_feature': tf.placeholder(tf.float32, [None, POI_NUM, FEATURE_DENSE_NUM],
38 |                                                    name='screen_dense_feature'),
39 |             # categorical features (category ids)
40 |             'screen_cate_feature': tf.placeholder(tf.int64, [None, POI_NUM, FEATURE_CATE_NUM],
41 |                                                   name='screen_cate_feature'),
42 |             # context features (e.g. whether a platinum ad is present)
43 |             'dense_feature': tf.placeholder(tf.float32, [None, DENSE_FEAT_NUM],
44 |                                             name='dense_feature')
45 |         }
46 |         return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, features=receiver_tensors)
47 | 
48 |     export_dir = estimator.export_saved_model(export_dir_base=export_dir_base,
49 |                                               serving_input_receiver_fn=_serving_input_receiver_fn)
50 |     estimator._params.pop('save_model')
51 |     return export_dir.decode()
52 | 
53 | 
54 | def calculate_result(result_generator):
55 |     y_ctr, pred_ctr, ctr = [], [], []
56 |     for result in result_generator:
57 |         cxr_feature = result['cxr_feature']
58 |         mask = result['mask']
59 |         # keep only the unpadded positions
60 |         idx = np.where(mask.reshape(-1) == 1)
61 |         y_ctr += result['ctr_label'].reshape(-1)[idx].tolist()
62 |         pred_ctr += result['ctr_out'].reshape(-1)[idx].tolist()
63 |         ctr += cxr_feature[:, :, 0].reshape(-1)[idx].tolist()
64 | 
65 |     ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp = (metrics.roc_auc_score(y_ctr, pred_ctr),
66 |                                               metrics.roc_auc_score(y_ctr, ctr),
67 |                                               np.sum(pred_ctr) / np.sum(y_ctr), np.sum(ctr) / np.sum(y_ctr))
68 |     print("ctr_auc:{}, ctr_auc_jp:{}, ctr_cb:{}, ctr_cb_jp:{}".format(ctr_auc, ctr_auc_jp, ctr_cb, ctr_cb_jp))
69 | 
70 | 
71 | if __name__ == '__main__':
72 | 
73 |     estimator, nn_model = create_estimator()
74 | 
75 |     with tick_tock("DATA_INPUT") as _:
76 |         valid_input_fn = input_fn_maker(VALID_FILE, False, batch_size=1024, epoch=1)
77 |         test_input_fn = input_fn_maker(TEST_FILE, False, batch_size=1024, epoch=1)
78 | 
79 |     if TRAIN_MODE == 1:
80 |         for i in range(EPOCH):
81 |             for idx, data in enumerate(TRAIN_FILE):
82 |                 with tick_tock("DATA_INPUT") as _:
83 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
84 |                 with tick_tock("TRAIN") as _:
85 |                     estimator.train(train_input_fn)
86 |                 if MODEL_SAVE_PB_EPOCH_ON:
87 |                     export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
88 |                                                               export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
89 |                     ep_insert_index = i * len(TRAIN_FILE) + idx
90 |                     target_dir = export_dir + "/../ep" + str(ep_insert_index)
91 |                     while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
92 |                         ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
93 |                     shutil.move(export_dir, target_dir)
94 |                     print(time.strftime("%m-%d %H:%M:%S ",
95 |                                         time.localtime(time.time())) + "export model PB: " + target_dir)
96 |         # with tick_tock("PREDICT") as _:
97 |         #     result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
98 |         #     calculate_result(result_generator)
99 | 
100 | 
101 | 
102 |     elif TRAIN_MODE == 2:
103 |         with tick_tock("PREDICT") as _:
104 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
105 |             calculate_result(result_generator)
106 | 
107 |     elif TRAIN_MODE == 3:
108 |         for i in range(EPOCH):
109 |             for idx, data in enumerate(TRAIN_FILE):
110 |                 with tick_tock("DATA_INPUT") as _:
111 |                     train_input_fn = input_fn_maker([data], True, batch_size=BATCH_SIZE, epoch=1)
112 |                 with tick_tock("TRAIN") as _:
113 |                     estimator.train(train_input_fn)
114 |         with tick_tock("PREDICT") as _:
115 |             result_generator = estimator.predict(input_fn=valid_input_fn, yield_single_examples=False)
116 |             print("valid_data")
117 |             calculate_result(result_generator)
118 |             # result_generator = estimator.predict(input_fn=test_input_fn, yield_single_examples=False)
119 |             print("train_data")
120 |             # calculate_result(result_generator)
121 |         # save pb
122 | 
123 | 
124 |     elif TRAIN_MODE == 4:
125 |         export_dir = save_model_pb_with_estimator(estimator, params={'save_model': 'listwise'},
126 |                                                   export_dir_base=MODEL_SAVE_PB_EPOCH_PATH)
127 |         ep_insert_index = 0
128 |         target_dir = export_dir + "/../ep" + str(ep_insert_index)
129 |         while os.path.exists(target_dir):  # bump the index until a free epoch dir is found
130 |             ep_insert_index += 1; target_dir = export_dir + "/../ep" + str(ep_insert_index)
131 |         shutil.move(export_dir, target_dir)
132 |         print(time.strftime("%m-%d %H:%M:%S ",
133 |                             time.localtime(time.time())) + "export model PB: " + target_dir)
134 | 
135 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/model_load.py:
--------------------------------------------------------------------------------
1 | import tensorflow
2 | 
3 | from config import *
4 | import tensorflow_core.contrib.predictor as predictor
5 | 
6 | def load_listwise_model():
7 |     model_filename_dir = MODEL_SAVE_PATH + "_pbs/ep0/"
8 |     predict_fn = predictor.from_saved_model(model_filename_dir)
9 |     # predict_fn = tf.compat.v2.saved_model.load(model_filename_dir)
10 | 
11 |     # env_feature   -> dense_feature
12 |     # cxr_feature   -> screen_predict_feature
13 |     # cat_feature   -> screen_cate_feature
14 |     # dense_feature -> screen_dense_feature
15 |     predictions = predict_fn({  # sample payload; the shapes below are illustrative and may not match the current config constants
16 |         'screen_predict_feature': [[[0.036115, 0.05427262, 0.09489095, 0.2],
17 |                                     [0.027565, 0.07474336, 0.04988268, 0.53],
18 |                                     [0.024815, 0.1775544, 0.12052802, 0.24],
19 |                                     [0.023316, 0.12283709, 0.10298113, 0.1]]],
20 |         # dense features (price, rating)
21 |         'screen_dense_feature': [[[1359., 30.146147, 26., 5., 4.85],
22 |                                   [318., 14.675659, 0., 5., 4.94],
23 |                                   [637., 24.784016, 0., 5., 4.65],
24 |                                   [185., 25.333273, 0., 5., 4.75]]],
25 |         # categorical features (category ids)
26 |         'screen_cate_feature': [[[2638824, 4148885, 432243, 3985407, 3385100, 3019284],
27 |                                  [2638824, 3905410, 3212599, 3985407, 1997821, 3019284],
28 |                                  [2638824, 4148885, 3622545, 3985407, 1997821, 3019284],
29 |                                  [2638824, 4148885, 432243, 3985407, 1997821, 3019284]]],
30 |         # context features (e.g. whether a platinum ad is present)
31 |         'dense_feature': [[0., 0.]]
32 |     })
33 | 
34 |     print('Q_network_output:', predictions['Q_network_output'])
35 |     print('out:', predictions['out'])
36 | 
37 | if __name__ == '__main__':
38 |     # load_pg_model()
39 |     load_listwise_model()
--------------------------------------------------------------------------------
/rerank_paper/prm_model/restore.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lemonace/PIER_code/9c47f80af24570154aab4a811dd82d1f1d436fc3/rerank_paper/prm_model/restore.py
--------------------------------------------------------------------------------
/rerank_paper/prm_model/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | LOG_PATH="./"
4 | if [[ ! -d ${LOG_PATH} ]]; then
5 |     mkdir ${LOG_PATH}
6 | fi
7 | 
8 | project_path=$(cd `dirname $0`; pwd)
9 | project_name="${project_path##*/}"
10 | begin_time=$(date "+%Y-%m-%d %H:%M:%S")
11 | author="yangfan129"
12 | 
13 | LOG_FILENAME="log"
14 | CUDA_VISIBLE_DEVICES=0 nohup python -u main.py --author ${author} --project ${project_name} --begintime "${begin_time}" > ${LOG_PATH}${LOG_FILENAME} 2>&1 &  # begin_time is quoted because it contains a space
15 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import time
4 | import os
5 | 
6 | 
7 | class tick_tock:
8 |     def __init__(self, process_name, verbose=1):
9 |         self.process_name = process_name
10 |         self.verbose = verbose
11 | 
12 |     def __enter__(self):
13 |         if self.verbose:
14 |             print(("*" * 50 + " {} START!!!! ".format(self.process_name) + "*" * 50))
15 |         self.begin_time = time.time()
16 | 
17 |     def __exit__(self, type, value, traceback):
18 |         if self.verbose:
19 |             end_time = time.time()
20 |             duration_seconds = end_time - self.begin_time
21 |             duration = str(datetime.timedelta(seconds=duration_seconds))
22 | 
23 |             print(("#" * 50 + " {} END... time lapsing {} ".format(self.process_name, duration) + "#" * 50))
24 | 
25 | 
26 | class FeatureInfo:
27 |     def __init__(self, feature_info_str):
28 |         self.feature_info_str = feature_info_str
29 | 
30 |         self.feature_name = "NonFeaName"
31 |         self.feature_size = 0
32 |         self.feature_mask = 1
33 |         self.parse_info_flag = False
34 |         self.part_num = 3
35 | 
36 |         self._parse_info()
37 | 
38 |     def _parse_info(self):
39 |         infoList = self.feature_info_str.split()
40 | 
41 |         if len(infoList) == self.part_num:
42 |             self.feature_name = infoList[0]
43 |             self.feature_size = int(infoList[1])
44 |             self.feature_mask = int(infoList[2])
45 |             self.parse_info_flag = True
46 | 
47 | 
48 | def parse_mask_file(feature_mask_file):
49 |     try:
50 |         if not os.path.exists(feature_mask_file):
51 |             print("parse_mask_file fail - file not exists:", feature_mask_file)
52 |             return [], False, 0  # keep the 3-tuple contract: (mask_list, ok_flag, hold_cnt)
53 |         # feature_name_list = []
54 |         feature_mask_list = []
55 |         feature_hold_cnt = 0
56 | 
57 |         with open(feature_mask_file) as f:
58 |             str_list = f.readlines()
59 | 
60 |         for i in range(0, len(str_list)):
61 |             str_list[i] = str_list[i].strip('\n').strip()
62 |             if str_list[i] == "":
63 |                 continue
64 | 
65 |             info = FeatureInfo(str_list[i])
66 |             if not info.parse_info_flag:
67 |                 print("parse_mask_file fail - parse_info fail:", str_list[i])
68 |                 parse_mask_flag = False
69 |                 return [], parse_mask_flag, 0
70 | 
71 |             for j in range(info.feature_size):
72 |                 feature_mask_list.append(info.feature_mask)
73 |                 if info.feature_mask != 0:
74 |                     feature_hold_cnt += 1
75 |                 # if info.feature_size > 1:
76 |                 #     feature_name_list.append(info.feature_name + "_" + str(j))
77 |                 # else:
78 |                 #     feature_name_list.append(info.feature_name)
79 | 
80 |         parse_mask_flag = True
81 |         return feature_mask_list, parse_mask_flag, feature_hold_cnt
82 |     except Exception as e:
83 |         print("parse_mask_file fail - Exception:", e)
84 |         return [], False, 0  # keep the 3-tuple contract
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     feature_mask_list, parse_feature_mask_flag, feature_hold_cnt = parse_mask_file("feature_mask")
89 |     print(feature_mask_list)
90 |     print(len(feature_mask_list))
91 |     print(parse_feature_mask_flag)
92 |     print(feature_hold_cnt)
93 | 
--------------------------------------------------------------------------------
/rerank_paper/prm_model/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def index_matrix_to_pairs(index_matrix):
4 |     # [[3,1,2], [2,3,1]] -> [[[0, 3], [1, 1], [2, 2]],
5 |     #                        [[0, 2], [1, 3], [2, 1]]]
6 |     replicated_first_indices = tf.range(tf.shape(index_matrix)[0])
7 |     rank = len(index_matrix.get_shape())
8 |     if rank == 2:
9 |         replicated_first_indices = tf.tile(
10 |             tf.expand_dims(replicated_first_indices, axis=1),
11 |             [1, tf.shape(index_matrix)[1]])
12 |     replicated_first_indices = tf.cast(replicated_first_indices, dtype=tf.int64)
13 |     return tf.stack([replicated_first_indices, index_matrix], axis=rank)
14 | 
15 | def string_hash_to_index(tensor, bucket=1<<22):
16 |     return tf.strings.to_hash_bucket_fast(tensor, bucket)
17 | 
18 | def int_to_string_with_key(tensor, key):
19 |     return key + "_" + tf.strings.as_string(tensor)
20 | 
21 | def float_to_string_with_key(tensor, key, precision=1):
22 |     return key + "_" + tf.strings.as_string(tensor, precision)
23 | 
24 | def float_to_int(tensor, order):
25 |     wc = 10 ** order
26 |     return tf.cast(tensor * wc, tf.int64)
27 | 
28 | def float_custom_hash(tensor, key, precision=0, bucket=1<<22):
29 |     tensor = float_to_string_with_key(tensor, key, precision)
30 |     tensor = string_hash_to_index(tensor, bucket)
31 |     return tensor
32 | 
33 | def int_custom_hash(tensor, key, bucket=1<<22):
34 |     tensor = int_to_string_with_key(tensor, key)
35 |     tensor = string_hash_to_index(tensor, bucket)
36 |     return tensor
37 | 
--------------------------------------------------------------------------------
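A short usage sketch for the hashing helpers in util.py (TF 1.x; the "adid"/"price" keys are illustrative). The default bucket of 1 << 22 matches CATE_FEATURE_EMBEDDINGS_SHAPE[0] in config.py, so the hashed ids can index the embedding table directly.

import tensorflow as tf
from util import int_custom_hash, float_custom_hash

adid = tf.constant([[101, 202], [303, 404]], dtype=tf.int64)
price = tf.constant([[12.5, 3.0], [7.25, 0.0]])
adid_idx = int_custom_hash(adid, key="adid")       # hashes "adid_101", ... into [0, 2^22)
price_idx = float_custom_hash(price, key="price")  # rounds to precision=0, then hashes "price_" + value
with tf.Session() as sess:
    print(sess.run([adid_idx, price_idx]))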