├── code ├── v1 │ ├── src │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── estimator_builder.py │ │ │ ├── helper.py │ │ │ ├── att_kn.py │ │ │ ├── att.py │ │ │ └── att_predict.py │ │ ├── reader │ │ │ ├── __init__.py │ │ │ └── data_set.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── json_reader.py │ │ │ └── flag_setup.py │ │ ├── conf │ │ │ └── model │ │ │ │ ├── local_att_kn_pointwise.json │ │ │ │ ├── local_att_pointwise.json │ │ │ │ ├── local_att_interid_pointwise.json │ │ │ │ ├── cluster_att_pointwise.json │ │ │ │ └── cluster_att_interid_pointwise.json │ │ ├── predict.py │ │ ├── predict_vec.py │ │ ├── run.py │ │ ├── official_eval_ndcg.py │ │ └── eval_ndcg.py │ ├── run_cv_predict.sh │ ├── constants.py │ ├── gen_interid_dict.py │ ├── main_v1.sh │ ├── gen_dict.py │ ├── gen_word_vec.py │ ├── cv_lgb_predict.py │ ├── utils.py │ ├── generate_tf_record.py │ └── cv_lgb_train.py ├── main.sh ├── gen_submit.py ├── file_merge_score.py ├── cv_merge_score.py ├── file_merge_score_wei.py └── ensemble_merge_score.sh ├── README.md ├── .gitignore └── LICENSE /code/v1/src/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/v1/src/reader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/v1/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/main.sh: -------------------------------------------------------------------------------- 1 | cd v1 2 | ./main_v1.sh 3 | cd .. 
"""
Build the submission CSV from a per-pair score file.

@input:
    score_file format: qid,pid,score   (one candidate per line, no header)
@output:
    submit_file: a header row followed by one row per query holding the
    query id and its top-EVAL_NUM product ids, best score first.

Usage: python gen_submit.py <score_file> <submit_file>
"""
import math
import sys
import json
from collections import defaultdict

EVAL_NUM = 5


def gen_submit(score_file, submit_file, eval_num=EVAL_NUM):
    """Write the top ``eval_num`` products of every query to ``submit_file``.

    Args:
        score_file: path of a header-less CSV with ``qid,pid,score`` rows.
        submit_file: path of the submission CSV to create (overwritten).
        eval_num: number of products kept per query.

    Returns:
        The number of distinct queries written.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            comma-separated fields or its score is not a float.
    """
    predict = defaultdict(list)
    with open(score_file, 'r') as score_fin:
        for line in score_fin:
            line = line.strip()
            if not line:
                # A trailing blank line is not a malformed record.
                continue
            qid, pid, s = line.split(",")
            predict[qid].append([pid, float(s)])

    print('[INFO] length of predition: ', len(predict))

    # For the default eval_num this is exactly
    # 'query-id,product1,product2,product3,product4,product5'.
    header = 'query-id,' + ','.join(
        'product{}'.format(i) for i in range(1, eval_num + 1))

    # The context manager guarantees the submission is flushed and closed
    # even when a write fails part-way through.
    with open(submit_file, 'w') as submit_fout:
        submit_fout.write(header + '\n')
        for qid in predict:
            p = sorted(predict[qid], key=lambda x: x[1], reverse=True)[:eval_num]
            submit_fout.write(qid + ',' + ','.join([x[0] for x in p]) + '\n')
    print("[INFO] gen submit done.")
    return len(predict)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit('usage: python gen_submit.py <score_file> <submit_file>')
    gen_submit(sys.argv[1], sys.argv[2])
def mergeScore(input_files, output):
    """Average the scores of several prediction files into one file.

    Every input is read as a header-less CSV and handed to ``fill``, which
    collects each row's score under a ``"<qid>,<pid>"`` key. The output has
    one ``qid,pid,mean_score`` line per key.

    Args:
        input_files: comma-separated list of score file paths.
        output: path of the merged score file to write (overwritten).

    Raises:
        ValueError: if ``input_files`` names no file at all.
    """
    # Tolerate "a.csv, b.csv" and trailing commas rather than handing an
    # empty or space-padded path to pandas.
    paths = [path.strip() for path in input_files.split(',') if path.strip()]
    if not paths:
        raise ValueError('input_files must name at least one score file')

    res = defaultdict(list)
    for path in paths:
        print("current file: {}".format(path))
        df = pd.read_csv(path, sep=',', header=None)
        print(df.shape)

        res = fill(res, df)
    print('res length: {}'.format(len(res)))

    with open(output, 'w') as fout:
        for key, scores in res.items():
            fout.write(key + ',' + str(sum(scores) / len(scores)) + '\n')
def fill(res, df, wei):
    """Append every row's score, scaled by ``wei``, under its "qid,pid" key.

    Args:
        res: mapping from ``"<qid>,<pid>"`` to a list of weighted scores;
            updated in place and also returned.
        df: frame whose rows unpack into exactly (qid, pid, score).
        wei: multiplier applied to each score of this frame.
    """
    for data in df.values:
        q, p, v = data
        q, p = int(q), int(p)
        key = str(q) + ',' + str(p)
        res[key].append(v*wei)
    return res


def mergeScore(input_files, wei, output):
    """Blend several score files with a weighted sum and write the result.

    The weights are normalised to sum to one, each file's scores are scaled
    by its weight, and the output holds one ``qid,pid,weighted_sum`` line
    per (qid, pid) pair.

    Args:
        input_files: comma-separated list of header-less score CSV paths.
        wei: comma-separated list of weights, one per input file.
        output: path of the merged score file to write (overwritten).

    Raises:
        ValueError: if the number of weights differs from the number of
            files, or the weights sum to zero.
    """
    files = input_files.split(',')
    weights = np.array([float(i) for i in wei.split(',')])

    # The normalisation below spans every weight, so a surplus weight would
    # silently shrink all the others; a missing one would fail mid-run.
    if len(weights) != len(files):
        raise ValueError(
            'got {} weights for {} input files'.format(len(weights), len(files)))
    total = weights.sum()
    if total == 0:
        raise ValueError('weights must not sum to zero')
    weights = weights / total

    res = defaultdict(list)
    for file, weight in zip(files, weights):
        print("current file: {}".format(file))
        df = pd.read_csv(file, sep=',', header=None)
        print(df.shape)

        res = fill(res, df, weight)
    print('res length: {}'.format(len(res)))

    with open(output, 'w') as fout:
        for key in res:
            fout.write(key + ',' + str(sum(res[key])) + '\n')
input_dir+'trichar_dict.pkl' 38 | # trichar2vec_path = input_dir+'trichar2vec.pkl' 39 | 40 | 41 | word2cluster_path = input_dir+'word2cluster_{}.pkl' 42 | 43 | 44 | valid_query_prodcut_pkl = input_dir+'valid_query_product.pkl' 45 | 46 | 47 | 48 | SEED = 2020 49 | 50 | THREAD = 8 -------------------------------------------------------------------------------- /code/v1/gen_interid_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import numpy as np 5 | import pickle 6 | from constants import * 7 | import utils 8 | 9 | test_query = [] 10 | test_product_id = set() 11 | with open(testA_tsv, 'r') as fin: 12 | header = fin.readline() 13 | for line in fin: 14 | features = line.strip('\n').split('\t') 15 | product_id = features[5] 16 | test_product_id.add(product_id) 17 | test_query.append(utils.get_sorted_split_text(features[-2])) 18 | 19 | 20 | 21 | product_id_set = set() 22 | train_query = [] 23 | 24 | 25 | # with open(train_tsv, 'r') as fin: 26 | # header = fin.readline() 27 | # for line in fin: 28 | # features = line.strip('\n').split('\t') 29 | # product_id = features[5] 30 | # if product_id in test_product_id: 31 | # product_id_set.add(product_id) 32 | 33 | # train_query.append(utils.get_sorted_split_text(features[-2])) 34 | 35 | with open(valid_tsv, 'r') as fin: 36 | header = fin.readline() 37 | for line in fin: 38 | features = line.strip('\n').split('\t') 39 | product_id = features[5] 40 | if product_id in test_product_id: 41 | product_id_set.add(product_id) 42 | train_query.append(utils.get_sorted_split_text(features[-2])) 43 | 44 | 45 | 46 | product_ids = sorted(product_id_set) 47 | 48 | product_id_dict = {image_id:(i+1) for i,image_id in enumerate(product_ids)} 49 | 50 | 51 | query_set = set(test_query) & set(train_query) 52 | queries = sorted(query_set) 53 | 54 | print('product_id_dict',len(product_id_dict)) 55 | 56 | query_id_dict = {query:(i+1) for i,query in enumerate(queries)} 
57 | 58 | print('query_id_dict',len(query_id_dict)) 59 | 60 | pickle.dump(product_id_dict,open(interid_product_dict_path,'wb')) 61 | pickle.dump(query_id_dict,open(interid_query_dict_path,'wb')) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KDD CUP 2020: Multimodalities Recall 2 | ### Team: aister 3 | *** 4 | + Members: Jianqiang Huang, Yi Qi, Ke Hu, Bohang Zheng, Mingjian Chen, Xingyuan Tang, Tan Qu, Jun Lei 5 | + Team Introduction: Most of our members come from the Search Ads Algorithm Team of the Meituan Dianping Advertising Platform Department. We participated in three of the five competitions held by KDD CUP 2020 and achieved promising results. We won first place in Debiasing(1/1895), first place in AutoGraph(1/149), and third place in Multimodalities Recall(3/1433). 6 | + Based on the business scenario of Meituan and Dianping App, the Search Ads Algorithm Team of Meituan Dianping has rich expertise in innovation and algorithm optimization in the field of cutting-edge technology, including but not limited to, conducting algorithm research and application in the fields of Debiasing, Graph Learning and Multimodalities. 7 | + If you are interested in our team or would like to communicate with our team (b.t.w., we are hiring), you can email huangjianqiang@meituan.com. 8 | 9 | ### Introduction 10 | *** 11 | + For this competition, the organizers have prepared real-scenario multimodal data from mobile Taobao, one of the largest e-commerce platforms. The dataset consists of Taobao search queries and product image features, which is organized into a query-based multimodal retrieval task. 12 | Given a search query in natural language form, the participating teams are required to implement a model to rank a collection of candidate products based on their image features. 
Most of these queries are noun phrases searching for products with specific characteristics. The images of the candidate products are provided by the sellers displaying the product features. Candidate products most relevant to the query are regarded as the ground truth of the query, which are expected to be top-ranked by the participating models. 13 | Please refer to the competition official website for more details: https://tianchi.aliyun.com/competition/entrance/231786/information 14 | 15 | -------------------------------------------------------------------------------- /code/v1/main_v1.sh: -------------------------------------------------------------------------------- 1 | #python generate_tf_record.py 2 | 3 | 4 | models=("20200605102104" "20200605102155" "20200605102210" "20200605103300" "20200605103317" "20200605103338" "20200605103358" "20200605104920" "20200605104935" "20200605104949" "20200605105004" "20200605110558" "20200605110612" "20200605110629" "20200605135828_20200528202620" "20200605135844_20200527214553" "20200605135855_20200528023614" "20200605135905_20200527235736" "20200605141133_20200529114748" "20200605141145_20200528154733" "20200605141200_20200528125945" "20200605141213_20200529120120" "20200605142455_20200529120105" "20200605142514_20200529115623" "20200605142526_20200530144344" "20200605142545_20200529014325" "20200605150057_20200530085657" "20200605150632_20200530085707") 5 | 6 | for model in ${models[@]} 7 | do 8 | echo $model 9 | ./run_cv_predict.sh --predict_file ../../../user_data/v1/officialTestA.tfrecord --tag $model 10 | done 11 | 12 | 13 | 14 | models=("20200605113210" "20200605113456" "20200605113523" "20200605113629" "20200605113728" "20200605113821" "20200605114059" "20200605114112" "20200605114201" "20200605114245" "20200605114313" "20200605114346" "20200605114450" "20200605114603") 15 | for model in ${models[@]} 16 | do 17 | echo $model 18 | ./run.sh --mode local_predict_export_test --tag $model --model 
local_att_pointwise.json --test_file ../../../user_data/v1/officialTestA.tfrecord 19 | done 20 | 21 | 22 | i=0 23 | lgb_models=("0605115205" "0605115428" "0605115503" "0605115521" "0605115547" "0605122723" "0605122757" "0605122825" "0605122841" "0605122902" "0605130536" "0605130558" "0605130617" "0605130634") 24 | for model in ${lgb_models[@]} 25 | do 26 | 27 | echo $model 28 | python cv_lgb_predict.py $model ${models[i]} 0 1 0 2020 29 | i=`expr $i + 1` 30 | done 31 | 32 | 33 | i=0 34 | lgb_models=("0605151550" "0605151621" "0605151638" "0605151654" "0605154950" "0605155010" "0605155026" "0605155044" "0605165244" "0605165302" "0605165318" "0605165336" "0605175627" "0605175650") 35 | for model in ${lgb_models[@]} 36 | do 37 | 38 | echo $model 39 | python cv_lgb_predict.py $model ${models[i]} 0 0 0 2020 40 | i=`expr $i + 1` 41 | done -------------------------------------------------------------------------------- /code/v1/src/conf/model/local_att_kn_pointwise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_name": "att_kn", 4 | "hidden_layers": [300, 128], 5 | "query_embedding_size": 300, 6 | "neg_num": 0, 7 | "eval_neg_num": 0, 8 | "batch_size": 128, 9 | "max_step": 10000000, 10 | "epoch": 15, 11 | "learning_rate": 0.001, 12 | "decay_steps": 200, 13 | "decay_rate": 0.8, 14 | "extra_pred_num": 1 15 | }, 16 | 17 | "data_schema": { 18 | "features": [ 19 | {"name": "height", "type": "embedding", "max": 1000}, 20 | {"name": "width", "type": "embedding", "max": 1000}, 21 | {"name": "num_boxes", "type": "embedding", "max": 100}, 22 | {"name": "boxes", "type": "countinous", "last_dim": 4}, 23 | {"name": "boxes_features", "type": "countinous", "last_dim": 2048}, 24 | {"name": "boxes_labels", "type": "embedding_seq", "max": 100}, 25 | {"name": "query_id", "type": "embedding", "max": 1352000}, 26 | {"name": "query", "type": "embedding_seq", "max": 22000}, 27 | {"name": "query_words_num", "type": "embedding", 
"max": 100}, 28 | {"name": "last_word", "type": "embedding", "max": 22000}, 29 | {"name": "image_area", "type": "embedding", "max": 1000}, 30 | {"name": "boxes_position", "type": "embedding_seq", "max": 30}, 31 | {"name": "boxes_height", "type": "embedding_seq", "max": 1000}, 32 | {"name": "boxes_width", "type": "embedding_seq", "max": 1000}, 33 | {"name": "boxes_area", "type": "embedding_seq", "max": 1000} 34 | ], 35 | "label": {"name": "peudo-label", "type": "embedding", "max": 1}, 36 | "extra_preds": {"name": "extra_preds", "type": "embedding", "max": 1}, 37 | "item_features": ["height", "width", "num_boxes", "boxes", "boxes_features", "boxes_labels","image_area","boxes_position","boxes_height","boxes_width","boxes_area"], 38 | "query_features": ["query_id", "query", "query_words_num","last_word"] 39 | }, 40 | 41 | 42 | "export": { 43 | "model_dir": "../../../user_data/export/checkpoint", 44 | "savedmodel_dir": "../../../user_data/export/savedmodel", 45 | "checkpoint_secs": 6000, 46 | "checkpoint_steps": -1, 47 | "summary_steps": 100, 48 | "max_checkpoints": 10, 49 | "checkpoint_interval_secs": 6000 50 | }, 51 | 52 | "mode": "local" 53 | } -------------------------------------------------------------------------------- /code/v1/src/conf/model/local_att_pointwise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_name": "att", 4 | "hidden_layers": [300, 128], 5 | "query_embedding_size": 300, 6 | "neg_num": 0, 7 | "eval_neg_num": 0, 8 | "batch_size": 128, 9 | "max_step": 10000000, 10 | "epoch": 15, 11 | "learning_rate": 0.001, 12 | "decay_steps": 200, 13 | "decay_rate": 0.8, 14 | "use_sample_type": 0, 15 | "extra_pred_num": 1 16 | }, 17 | 18 | "data_schema": { 19 | "features": [ 20 | {"name": "height", "type": "embedding", "max": 1000}, 21 | {"name": "width", "type": "embedding", "max": 1000}, 22 | {"name": "num_boxes", "type": "embedding", "max": 100}, 23 | {"name": "boxes", "type": "countinous", 
"last_dim": 4}, 24 | {"name": "boxes_features", "type": "countinous", "last_dim": 2048}, 25 | {"name": "boxes_labels", "type": "embedding_seq", "max": 100}, 26 | {"name": "query_id", "type": "embedding", "max": 2000}, 27 | {"name": "query", "type": "embedding_seq", "max": 22000}, 28 | {"name": "query_words_num", "type": "embedding", "max": 100}, 29 | {"name": "last_word", "type": "embedding", "max": 22000}, 30 | {"name": "image_area", "type": "embedding", "max": 1000}, 31 | {"name": "boxes_position", "type": "embedding_seq", "max": 30}, 32 | {"name": "boxes_height", "type": "embedding_seq", "max": 1000}, 33 | {"name": "boxes_width", "type": "embedding_seq", "max": 1000}, 34 | {"name": "boxes_area", "type": "embedding_seq", "max": 1000}, 35 | {"name": "product_id", "type": "embedding", "max": 10000} 36 | ], 37 | "sample_type": {"name": "sample_type", "type": "embedding", "max": 10}, 38 | "extra_preds": {"name": "extra_preds", "type": "embedding", "max": 1}, 39 | "label": {"name": "peudo-label", "type": "embedding", "max": 1}, 40 | "item_features": ["product_id","height", "width", "num_boxes", "boxes", "boxes_features", "boxes_labels","image_area","boxes_position","boxes_height","boxes_width","boxes_area"], 41 | "query_features": ["query_id", "query", "query_words_num","last_word"] 42 | }, 43 | 44 | 45 | "export": { 46 | "model_dir": "../../../user_data/export/checkpoint", 47 | "savedmodel_dir": "../../../user_data/export/savedmodel", 48 | "checkpoint_secs": 6000, 49 | "checkpoint_steps": -1, 50 | "summary_steps": 100, 51 | "max_checkpoints": 10, 52 | "checkpoint_interval_secs": 6000 53 | }, 54 | 55 | "mode": "local" 56 | } -------------------------------------------------------------------------------- /code/v1/src/conf/model/local_att_interid_pointwise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_name": "att_interid", 4 | "hidden_layers": [300, 128], 5 | "query_embedding_size": 300, 6 | 
"neg_num": 0, 7 | "eval_neg_num": 0, 8 | "batch_size": 128, 9 | "max_step": 10000000, 10 | "epoch": 15, 11 | "learning_rate": 0.001, 12 | "decay_steps": 200, 13 | "decay_rate": 0.8, 14 | "use_sample_type": 0, 15 | "extra_pred_num": 1 16 | }, 17 | 18 | "data_schema": { 19 | "features": [ 20 | {"name": "height", "type": "embedding", "max": 1000}, 21 | {"name": "width", "type": "embedding", "max": 1000}, 22 | {"name": "num_boxes", "type": "embedding", "max": 100}, 23 | {"name": "boxes", "type": "countinous", "last_dim": 4}, 24 | {"name": "boxes_features", "type": "countinous", "last_dim": 2048}, 25 | {"name": "boxes_labels", "type": "embedding_seq", "max": 100}, 26 | {"name": "query_id", "type": "embedding", "max": 2000}, 27 | {"name": "query", "type": "embedding_seq", "max": 22000}, 28 | {"name": "query_words_num", "type": "embedding", "max": 100}, 29 | {"name": "last_word", "type": "embedding", "max": 22000}, 30 | {"name": "image_area", "type": "embedding", "max": 1000}, 31 | {"name": "boxes_position", "type": "embedding_seq", "max": 30}, 32 | {"name": "boxes_height", "type": "embedding_seq", "max": 1000}, 33 | {"name": "boxes_width", "type": "embedding_seq", "max": 1000}, 34 | {"name": "boxes_area", "type": "embedding_seq", "max": 1000}, 35 | {"name": "product_id", "type": "embedding", "max": 10000} 36 | ], 37 | "sample_type": {"name": "sample_type", "type": "embedding", "max": 10}, 38 | "extra_preds": {"name": "extra_preds", "type": "embedding", "max": 1}, 39 | "label": {"name": "peudo-label", "type": "embedding", "max": 1}, 40 | "item_features": ["product_id","height", "width", "num_boxes", "boxes", "boxes_features", "boxes_labels","image_area","boxes_position","boxes_height","boxes_width","boxes_area"], 41 | "query_features": ["query_id", "query", "query_words_num","last_word"] 42 | }, 43 | 44 | 45 | "export": { 46 | "model_dir": "../../../user_data/export/checkpoint", 47 | "savedmodel_dir": "../../../user_data/export/savedmodel", 48 | "checkpoint_secs": 6000, 
49 | "checkpoint_steps": -1, 50 | "summary_steps": 100, 51 | "max_checkpoints": 10, 52 | "checkpoint_interval_secs": 6000 53 | }, 54 | 55 | "mode": "local" 56 | } -------------------------------------------------------------------------------- /code/v1/src/conf/model/cluster_att_pointwise.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_name": "att", 4 | "hidden_layers": [300, 128], 5 | "query_embedding_size": 300, 6 | "neg_num": 0, 7 | "eval_neg_num": 0, 8 | "batch_size": 1024, 9 | "max_step": 10000000, 10 | "epoch": 2, 11 | "learning_rate": 0.0006, 12 | "decay_steps": 5000, 13 | "decay_rate": 0.95, 14 | "use_sample_type": 0, 15 | "sample_type_weight_decay": 1, 16 | "sample_type_num": 3, 17 | "sample_type_weight_stepsize": 350000 18 | }, 19 | 20 | 21 | "data_schema": { 22 | "features": [ 23 | {"name": "height", "type": "embedding", "max": 1000}, 24 | {"name": "width", "type": "embedding", "max": 1000}, 25 | {"name": "num_boxes", "type": "embedding", "max": 100}, 26 | {"name": "boxes", "type": "countinous", "last_dim": 4}, 27 | {"name": "boxes_features", "type": "countinous", "last_dim": 2048}, 28 | {"name": "boxes_labels", "type": "embedding_seq", "max": 100}, 29 | {"name": "query_id", "type": "embedding", "max": 1352000}, 30 | {"name": "query", "type": "embedding_seq", "max": 22000}, 31 | {"name": "query_words_num", "type": "embedding", "max": 100}, 32 | {"name": "last_word", "type": "embedding", "max": 22000}, 33 | {"name": "image_area", "type": "embedding", "max": 1000}, 34 | {"name": "boxes_position", "type": "embedding_seq", "max": 30}, 35 | {"name": "boxes_height", "type": "embedding_seq", "max": 1000}, 36 | {"name": "boxes_width", "type": "embedding_seq", "max": 1000}, 37 | {"name": "boxes_area", "type": "embedding_seq", "max": 1000} 38 | ], 39 | "label": {"name": "peudo-label", "type": "embedding", "max": 1}, 40 | "item_features": ["height", "width", "num_boxes", "boxes", 
"boxes_features", "boxes_labels","image_area","boxes_position","boxes_height","boxes_width","boxes_area"], 41 | "query_features": ["query_id", "query", "query_words_num","last_word"] 42 | }, 43 | 44 | "export": { 45 | "model_dir": "viewfs://hadoop-meituan/user/hadoop-mining/huangjianqiang/data/kdd/model/", 46 | "savedmodel_dir": "viewfs://hadoop-meituan/user/hadoop-mining/huangjianqiang/data/kdd/model/saved_model", 47 | "checkpoint_secs": -1, 48 | "checkpoint_steps": -1, 49 | "summary_steps": 1000, 50 | "max_checkpoints": 100, 51 | "checkpoint_interval_secs": 60 52 | }, 53 | 54 | "distribute": { 55 | "strategy": "ps", 56 | "gpu_per_worker": 0 57 | }, 58 | "mode": "afo" 59 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
def predict(model_conf, src_file, out_file_path):
    """Score every example of a TFRecord file with an exported SavedModel.

    The SavedModel directory is taken from ``FLAGS.timestamped_saved_model``.
    Serialized examples are read in batches, passed to the predictor under
    the ``"examples"`` key, and the first element of each ``'prediction'``
    row is written as one ``query_id,product_id,score`` line.

    Args:
        model_conf: parsed model config; ``model_conf['model']['batch_size']``
            sets the batch size.
        src_file: TFRecord file to score; each record must carry the int64
            features ``ori_query_id`` and ``ori_product_id_0``.
        out_file_path: path of the score file to write (overwritten).
    """
    savedmodel_path = flag_setup.FLAGS.timestamped_saved_model
    clf = predictor.from_saved_model(savedmodel_path)
    dataset = tf.data.TFRecordDataset(src_file, num_parallel_reads=8)
    dataset = dataset.batch(model_conf['model']['batch_size'])
    iterator = dataset.make_one_shot_iterator()
    one_batch = iterator.get_next()
    # The ids are parsed separately from the raw batch so they can be written
    # next to the prediction made from that same batch.
    pid_batch = tf.parse_example(one_batch, {'ori_product_id_0': tf.FixedLenFeature([], tf.int64)})
    qid_batch = tf.parse_example(one_batch, {'ori_query_id': tf.FixedLenFeature([], tf.int64)})

    # Opening the output in a context manager closes (and flushes) it on
    # every exit path, including a failure inside the session.
    with open(out_file_path, 'w') as out_file, tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                batch_data, pid, qid = sess.run([one_batch, pid_batch, qid_batch])
                pid = pid['ori_product_id_0']
                qid = qid['ori_query_id']
                predicts = clf({"examples": batch_data})['prediction']
                for query_id, product_id, row in zip(qid, pid, predicts):
                    ctr = row[0]
                    out_file.write(
                        str(query_id) + ',' + str(product_id) + ',' + str(ctr) + "\n")
            print("Evaluate Finished!")
        except tf.errors.OutOfRangeError:
            # The one-shot iterator signals the end of the data this way.
            print('Done evaluating -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
"countinous", "last_dim": 4}, 27 | {"name": "boxes_features", "type": "countinous", "last_dim": 2048}, 28 | {"name": "boxes_labels", "type": "embedding_seq", "max": 100}, 29 | {"name": "query_id", "type": "embedding", "max": 2000}, 30 | {"name": "query", "type": "embedding_seq", "max": 22000}, 31 | {"name": "query_words_num", "type": "embedding", "max": 100}, 32 | {"name": "last_word", "type": "embedding", "max": 22000}, 33 | {"name": "image_area", "type": "embedding", "max": 1000}, 34 | {"name": "boxes_position", "type": "embedding_seq", "max": 30}, 35 | {"name": "boxes_height", "type": "embedding_seq", "max": 1000}, 36 | {"name": "boxes_width", "type": "embedding_seq", "max": 1000}, 37 | {"name": "boxes_area", "type": "embedding_seq", "max": 1000}, 38 | {"name": "product_id", "type": "embedding", "max": 10000} 39 | ], 40 | "label": {"name": "peudo-label", "type": "embedding", "max": 1}, 41 | "item_features": ["product_id","height", "width", "num_boxes", "boxes", "boxes_features", "boxes_labels","image_area","boxes_position","boxes_height","boxes_width","boxes_area"], 42 | "query_features": ["query_id", "query", "query_words_num","last_word"] 43 | }, 44 | 45 | "export": { 46 | "model_dir": "viewfs://hadoop-meituan/user/hadoop-mining/huangjianqiang/data/kdd/model/", 47 | "savedmodel_dir": "viewfs://hadoop-meituan/user/hadoop-mining/huangjianqiang/data/kdd/model/saved_model", 48 | "checkpoint_secs": -1, 49 | "checkpoint_steps": -1, 50 | "summary_steps": 1000, 51 | "max_checkpoints": 100, 52 | "checkpoint_interval_secs": 60 53 | }, 54 | 55 | "distribute": { 56 | "strategy": "ps", 57 | "gpu_per_worker": 0 58 | }, 59 | "mode": "afo" 60 | } -------------------------------------------------------------------------------- /code/v1/src/utils/flag_setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import os 5 | import json 6 | 7 | flags = tf.app.flags 8 | FLAGS = 
flags.FLAGS 9 | 10 | flags.DEFINE_string("script_mode", "local", "local or afo") 11 | flags.DEFINE_string("run_id", "", "if not given, a `yyyyMMddHHmmSS` will be generated automatically") 12 | flags.DEFINE_string("model_conf", "", "name of model definition json file") 13 | flags.DEFINE_integer("epoch_samples", -1, "epoch_samples") 14 | flags.DEFINE_integer("epochs", -1, "epochs") 15 | 16 | # 由AFO喂入的参数 17 | flags.DEFINE_string("ps_hosts", "", "comma-separated hosts list with host format in host:port") 18 | flags.DEFINE_string("worker_hosts", "", "comma-separated hosts list with host format in host:port") 19 | flags.DEFINE_string("chief_hosts", "", "comma-separated hosts list with host format in host:port") 20 | flags.DEFINE_string("evaluator_hosts", "", "comma-separated hosts list with host format in host:port") 21 | flags.DEFINE_string("job_name", "", "role") 22 | flags.DEFINE_integer("task_index", 0, "task index") 23 | flags.DEFINE_string("eval_file", "NULL", "task index") 24 | flags.DEFINE_string("output_file", "NULL", "task index") 25 | flags.DEFINE_string("timestamped_saved_model", "NULL", "task index") 26 | flags.DEFINE_string("train_train_data", "NULL", "train train") 27 | flags.DEFINE_string("train_valid_data", "NULL", "train valid") 28 | flags.DEFINE_string("valid_data", "NULL", "train valid") 29 | flags.DEFINE_string("train_data", "NULL", "train valid") 30 | flags.DEFINE_string("model_ckpt_dir", "NULL", "train valid") 31 | flags.DEFINE_string("batch_size", "120", "batch size") 32 | flags.DEFINE_bool("use_socket", True, "data agent use socket or not.") 33 | flags.DEFINE_integer("data_port", 0, "data agent port if agent use socket .") 34 | flags.DEFINE_integer("shm_size", 0, "data agent shr mem size if agent use shr mem.") 35 | flags.DEFINE_string("shm_name", "", "data agent shr name size if agent use shr mem.") 36 | flags.DEFINE_bool ("async_save_ckpt", False, "async_save_ckpt") 37 | flags.DEFINE_string("warm_start_id", "", "") 38 | def 
def flag_setup():
    """Build ``TF_CONFIG`` from the command-line flags when running on AFO.

    Does nothing unless ``--script_mode`` is ``"afo"``. In AFO mode the
    cluster layout is assembled from the ``chief_hosts``, ``worker_hosts``,
    ``ps_hosts`` and ``evaluator_hosts`` flags and written, together with
    this process's ``job_name`` and ``task_index``, to
    ``os.environ['TF_CONFIG']`` as a JSON string. Any previous value is
    overwritten.
    """
    if FLAGS.script_mode != "afo":
        return

    # Show the value that is about to be replaced. ``.get`` is used so a
    # missing variable prints ``None`` instead of raising KeyError.
    print(os.environ.get('TF_CONFIG'))

    cluster_config = {
        'chief': parse_hosts(FLAGS.chief_hosts),
        'worker': parse_hosts(FLAGS.worker_hosts),
        'ps': parse_hosts(FLAGS.ps_hosts),
        'evaluator': parse_hosts(FLAGS.evaluator_hosts),
    }

    os.environ['TF_CONFIG'] = json.dumps({
        'cluster': cluster_config,
        'task': {
            'type': FLAGS.job_name,
            'index': FLAGS.task_index,
        },
    })
def get_dict(train_tsv, valid_tsv, testA_tsv, testB_tsv=""):
    """Build the query/product/word dictionaries and pickle the word dictionary.

    Each TSV file is loaded with ``read_csv`` and fed to ``add_to_dict`` in
    the order train, valid, testA, testB. The train and valid frames are
    passed with ``train=True``, the test frames with ``train=False``.

    Args:
        train_tsv: path of the training TSV file.
        valid_tsv: path of the validation TSV file.
        testA_tsv: path of the test-A TSV file.
        testB_tsv: optional path of the test-B TSV file; the empty string
            (default) means there is no test-B file.
    """
    # (path, is_train) pairs, kept in the order ids must be assigned.
    sources = [(train_tsv, True), (valid_tsv, True), (testA_tsv, False)]
    if testB_tsv != "":
        sources.append((testB_tsv, False))

    # Load every frame before touching the dictionaries, as before.
    frames = [read_csv(path) for path, _ in sources]

    query_dict = {}
    product_dict = {}
    word_dict = {}

    for (_, is_train), df in zip(sources, frames):
        add_to_dict(df, query_dict, product_dict, word_dict, is_train)

    print('query_dict', len(query_dict))
    print('product_dict', len(product_dict))
    print('word_dict', len(word_dict))

    # Only the word dictionary is persisted; the query and product
    # dictionaries are built for the size report above.
    # NOTE(review): word_dict_path presumably comes from `constants` - confirm.
    with open(word_dict_path, 'wb') as fout:
        pickle.dump(word_dict, fout)
def main(_):
    """Load the model configuration and dump prediction vectors for the eval file.

    Reads the JSON file named by ``--model_conf`` and hands it to
    ``predict`` together with the ``--eval_file`` input path and the
    ``--output_file`` path that receives the pickled outputs.

    Args:
        _: positional argument supplied by ``tf.app.run`` (unused).
    """
    # Close the config file as soon as it is parsed instead of leaking the
    # handle created by ``json.load(open(...))``.
    with open(flag_setup.FLAGS.model_conf, 'r') as conf_file:
        model_conf = json.load(conf_file)
    predict(model_conf, flag_setup.FLAGS.eval_file, flag_setup.FLAGS.output_file)
def glove_vec():
    """Build the GloVe embedding matrix for the project vocabulary and pickle it.

    Every line of ``glove_src`` is parsed as ``<word> <v1> <v2> ...``. For
    each word of the pickled word dictionary that has a GloVe vector, the
    vector is copied into the row given by the word's index. Rows of words
    without a GloVe vector stay all-zero.

    Returns:
        The ``float32`` matrix of shape ``(22001, 300)`` that was also
        written to ``glove_path``.
    """
    word2vec = {}
    with open(glove_src) as fin:
        for line in fin:
            line = line.strip('\n')
            # The first space separates the token from its vector.
            space = line.index(' ')
            word2vec[line[:space]] = np.array(
                line[space + 1:].split(' '), dtype='float32')

    with open(word_dict_path, 'rb') as fin:
        word_dict = pickle.load(fin)

    # NOTE(review): 22001 x 300 is hard-coded; presumably vocabulary size
    # (22000) plus one spare row, and the GloVe dimension - keep in sync
    # with the model configuration.
    vecs = np.zeros([22001, 300], dtype='float32')
    missing = 0
    for word, index in word_dict.items():
        if word in word2vec:
            vecs[index] = word2vec[word]
        else:
            missing += 1

    print('{} words not in dict'.format(missing))

    with open(glove_path, 'wb') as fout:
        pickle.dump(vecs, fout)

    return vecs
def blend():
    """Average the word2vec and GloVe embedding matrices and pickle the result.

    Loads the objects pickled at ``word2vec_path`` and ``glove_path``,
    computes their element-wise mean ``(word2vec + glove) / 2`` and writes
    it to ``blend_word2vec_path``.
    """
    # Context managers close each file right after use. The local names
    # also avoid shadowing the module-level ``word2vec`` function.
    with open(word2vec_path, 'rb') as fin:
        w2v_vecs = pickle.load(fin)
    with open(glove_path, 'rb') as fin:
        glove_vecs = pickle.load(fin)
    with open(blend_word2vec_path, 'wb') as fout:
        pickle.dump((w2v_vecs + glove_vecs) / 2, fout)
np.random.seed(seed) 38 | 39 | lgb_model_dir = '../../user_data/lgb_model/' 40 | output_dir = '../../user_data/output/testA_lgb/' 41 | 42 | if not os.path.exists(lgb_model_dir): 43 | os.makedirs(lgb_model_dir) 44 | 45 | if not os.path.exists(output_dir): 46 | os.makedirs(output_dir) 47 | 48 | word_dict_file = word_dict_path 49 | query_dict_file = interid_query_dict_path 50 | product_dict_file = interid_product_dict_path 51 | 52 | word_dict = pickle.load(open(word_dict_file, 'rb')) 53 | product_dict = pickle.load(open(product_dict_file, 'rb')) 54 | query_dict = pickle.load(open(query_dict_file, 'rb')) 55 | cv_fold = 5 56 | 57 | wv = pickle.load(open(blend_word2vec_path,'rb')) 58 | 59 | all_test_example = [] 60 | 61 | test_qidpid_vecs = pickle.load(open("../../user_data/output/testA_vec/submit_{}".format(vec_input_date),'rb')) 62 | 63 | with open(testA_tsv, 'r') as fin: 64 | header = fin.readline() 65 | for line in fin: 66 | 67 | features = line.strip('\n').split('\t') 68 | 69 | features = generate_tf_record.parseTrainFileLine(features, product_dict, query_dict, word_dict, ) 70 | 71 | example = create_sample(features,test_qidpid_vecs,wv) 72 | all_test_example.append(example) 73 | 74 | df_test = create_dataframe(all_test_example) 75 | 76 | if with_productid==1: 77 | drop_cols = ["origin_query_id","origin_product_id","query_id"] 78 | categories = ["product_id"] 79 | else: 80 | drop_cols = ["origin_query_id","origin_product_id","query_id","product_id"] 81 | categories = [] 82 | 83 | if use_category != 1: 84 | categories = [] 85 | 86 | X_test = df_test.drop(drop_cols,axis=1) 87 | all_preds = [] 88 | for i in range(cv_fold): 89 | 90 | lgb_model_path = '../../user_data/lgb_model/{}.model_cv{}'.format(lgb_model,i) 91 | model = lgb.Booster(model_file=lgb_model_path) 92 | 93 | pred_test = model.predict(X_test) 94 | all_preds.append(pred_test) 95 | 96 | 97 | test_pred = np.array(all_preds).mean(axis=0) 98 | qid = df_test['origin_query_id'].values 99 | pid = 
df_test['origin_product_id'].values 100 | 101 | out_file_path = output_dir+"lgb{}".format(lgb_model) 102 | 103 | with open(out_file_path,'w') as out_file: 104 | for idx in range(len(test_pred)): 105 | ctr = test_pred[idx] 106 | out_file.write( 107 | str(qid[idx]) + ',' + str(pid[idx]) + ',' + str(ctr) + "\n") -------------------------------------------------------------------------------- /code/ensemble_merge_score.sh: -------------------------------------------------------------------------------- 1 | 2 | models=("20200605102104" "20200605102155" "20200605102210" "20200605103300" "20200605103317" "20200605103338" "20200605103358" "20200605104920" "20200605104935" "20200605104949" "20200605105004" "20200605110558" "20200605110612" "20200605110629") 3 | 4 | allmodel="" 5 | for model in ${models[@]} 6 | do 7 | for dataset in {1..1} 8 | do 9 | 10 | if [ -f "../user_data/output/${model}_${dataset}_cv0" ];then 11 | echo ${model}_${dataset} 12 | if [ -n "$allmodel" ]; then 13 | allmodel=${allmodel}, 14 | fi 15 | 16 | rm ../user_data/output/${model}_${dataset}_cvmerged 17 | python cv_merge_score.py ../user_data/output/${model}_${dataset}_cv 5 18 | allmodel=${allmodel}../user_data/output/${model}_${dataset}_cvmerged 19 | fi 20 | 21 | done 22 | done 23 | 24 | echo $allmodel 25 | python file_merge_score.py $allmodel ../user_data/output/interid_nn_merged 26 | 27 | 28 | 29 | models=("0605115205" "0605115428" "0605115503" "0605115521" "0605115547" "0605122723" "0605122757" "0605122825" "0605122841" "0605122902" "0605130536" "0605130558" "0605130617" "0605130634") 30 | 31 | allmodel="" 32 | for model in ${models[@]} 33 | do 34 | 35 | if [ -f "../user_data/output/testA_lgb/lgb${model}" ];then 36 | echo ${model} 37 | if [ -n "$allmodel" ]; then 38 | allmodel=${allmodel}, 39 | fi 40 | 41 | allmodel=${allmodel}../user_data/output/testA_lgb/lgb${model} 42 | fi 43 | 44 | done 45 | 46 | echo $allmodel 47 | python file_merge_score.py $allmodel ../user_data/output/interid_lgb_merged 48 | 
49 | 50 | 51 | models=("20200605135828_20200528202620" "20200605135844_20200527214553" "20200605135855_20200528023614" "20200605135905_20200527235736" "20200605141133_20200529114748" "20200605141145_20200528154733" "20200605141200_20200528125945" "20200605141213_20200529120120" "20200605142455_20200529120105" "20200605142514_20200529115623" "20200605142526_20200530144344" "20200605142545_20200529014325" "20200605150057_20200530085657" "20200605150632_20200530085707") 52 | 53 | allmodel="" 54 | for model in ${models[@]} 55 | do 56 | for dataset in {1..1} 57 | do 58 | 59 | if [ -f "../user_data/output/${model}_${dataset}_cv0" ];then 60 | echo ${model}_${dataset} 61 | if [ -n "$allmodel" ]; then 62 | allmodel=${allmodel}, 63 | fi 64 | 65 | rm ../user_data/output/${model}_${dataset}_cvmerged 66 | python cv_merge_score.py ../user_data/output/${model}_${dataset}_cv 5 67 | allmodel=${allmodel}../user_data/output/${model}_${dataset}_cvmerged 68 | fi 69 | 70 | done 71 | done 72 | 73 | echo $allmodel 74 | python file_merge_score.py $allmodel ../user_data/output/kn_nn_merged 75 | 76 | 77 | models=("0605151550" "0605151621" "0605151638" "0605151654" "0605154950" "0605155010" "0605155026" "0605155044" "0605165244" "0605165302" "0605165318" "0605165336" "0605175627" "0605175650") 78 | 79 | allmodel="" 80 | for model in ${models[@]} 81 | do 82 | 83 | if [ -f "../user_data/output/testA_lgb/lgb${model}" ];then 84 | echo ${model} 85 | if [ -n "$allmodel" ]; then 86 | allmodel=${allmodel}, 87 | fi 88 | 89 | allmodel=${allmodel}../user_data/output/testA_lgb/lgb${model} 90 | fi 91 | 92 | done 93 | 94 | echo $allmodel 95 | python file_merge_score.py $allmodel ../user_data/output/kn_lgb_merged 96 | 97 | 98 | 99 | 100 | python file_merge_score_wei.py ../user_data/output/interid_nn_merged,../user_data/output/interid_lgb_merged 5.3,4.7 ../user_data/output/merge1 101 | python file_merge_score_wei.py ../user_data/output/kn_nn_merged,../user_data/output/kn_lgb_merged 5.3,4.7 
def local_run(model_json):
    """Train and evaluate on the local machine, then export a SavedModel.

    Args:
        model_json: parsed model configuration; the ``"model"`` and
            ``"export"`` sections are read here and the whole dict is
            forwarded to the estimator builder.
    """
    flags = flag_setup.FLAGS

    # Only reported, not used for checkpointing in local mode.
    batch_size = model_json['model']['batch_size']
    epoch_steps = int(flags.epoch_samples / batch_size) + 1
    print('epoch_steps', epoch_steps)

    export_conf = model_json["export"]
    run_config = tf.estimator.RunConfig(
        model_dir=os.path.join(export_conf["model_dir"], flags.run_id),
        save_checkpoints_secs=export_conf["checkpoint_secs"],
        save_summary_steps=export_conf["summary_steps"],
        keep_checkpoint_max=export_conf["max_checkpoints"],
    )

    built = estimator_builder.create_estimator_and_specs(run_config, model_json)
    estimator, train_spec, eval_spec, data_loader = built
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # Save a SavedModel so the trained model can be used for serving.
    savedmodel_base = os.path.join(export_conf["savedmodel_dir"], flags.run_id)
    estimator.export_savedmodel(
        export_dir_base=savedmodel_base,
        serving_input_receiver_fn=data_loader.serving_example_input_receiver_fn,
    )
def afo_run(model_json):
    """Train and evaluate on the AFO cluster; the chief exports a SavedModel.

    Args:
        model_json: parsed model configuration; the ``"model"`` and
            ``"export"`` sections are read here and the whole dict is
            forwarded to the estimator builder.
    """
    flags = flag_setup.FLAGS
    batch_size = model_json['model']['batch_size']
    # One checkpoint per epoch worth of steps.
    epoch_steps = int(flags.epoch_samples / batch_size) + 1

    # NOTE(review): disable_chief_training is not part of stock TensorFlow;
    # presumably provided by the cluster's TensorFlow build - confirm.
    tf.disable_chief_training(shut_ratio=0.8, slow_worker_delay_ratio=1.2)

    export_conf = model_json["export"]
    run_config = tf.estimator.RunConfig(
        model_dir=os.path.join(export_conf["model_dir"], flags.run_id),
        save_checkpoints_steps=epoch_steps,
        save_summary_steps=export_conf["summary_steps"],
        keep_checkpoint_max=export_conf["max_checkpoints"])

    built = estimator_builder.create_estimator_and_specs(run_config, model_json)
    estimator, train_spec, eval_spec, data_loader = built

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    finished_at = str(datetime.datetime.now().time())
    tf.logging.warn(flags.job_name + " finished training at " + finished_at)

    if flags.job_name != "chief":
        return

    # Save a SavedModel so the trained model can be used for serving.
    estimator.export_savedmodel(
        export_dir_base=os.path.join(export_conf["savedmodel_dir"], flags.run_id, 'epoch'),
        serving_input_receiver_fn=data_loader.serving_example_input_receiver_fn
    )
def glob_matching(fn, fmt):
    """Locate the single file named *fn* anywhere below the ``submit`` directory.

    Args:
        fn: base name of the file to look for, e.g. ``'submission.csv'``.
        fmt: name of the submitted archive format (e.g. ``'zip'``); used
            only in the error messages.

    Returns:
        The path of the unique match, relative to the current directory.

    Raises:
        Exception: if no file, or more than one file, named *fn* exists
            below ``submit``.
    """
    # The pattern used to hard-code 'submission.csv' and ignore *fn*.
    # glob.escape keeps '*', '?' and '[' in *fn* from acting as wildcards.
    pattern = 'submit/**/' + glob.escape(fn)
    matched_fns = list(glob.iglob(pattern, recursive=True))
    if not matched_fns:
        raise Exception(
            "You submitted a {} file, but we didn't find {} in it. "
            "Please check your submission.".format(fmt, fn))
    if len(matched_fns) > 1:
        raise Exception(
            "You submitted a {} file, but there are more than one files named {} in it. "
            "Please check your submission.".format(fmt, fn))
    return matched_fns[0]
Please check your submission.') 37 | real_submit_path = glob_matching('submission.csv', 'zip') 38 | # evaluate a csv file 39 | else: 40 | real_submit_path = submit_path 41 | 42 | submission_dict = {} 43 | ref_qids = set(reference.keys()) 44 | 45 | with open(real_submit_path) as fin: 46 | for line in fin: 47 | line = line.strip() 48 | records = [elem.strip() for elem in line.split(',')] 49 | if records[0] not in ref_qids: 50 | continue 51 | qid = records[0] 52 | # check whether there are K products for each query 53 | if len(records[1:]) != k: 54 | raise Exception('Query-id {} has wrong number of predicted product-ids! Require {}, but {} founded.'.format(qid, k, len(records[1:]))) 55 | # check whether there exists an empty prediction for any query 56 | if any([len(r) == 0 for r in records[1:]]): 57 | raise Exception('Query-id {} has an empty prediction at rank {}! Pleace check again!'.format(qid, records[1:].index("") + 1)) 58 | # check whether there exist an invalid prediction for any query 59 | for rank, r in enumerate(records[1:]): 60 | if not all([char in string.digits for char in r]): 61 | raise Exception('Query-id {} has an invalid prediction product-id \"{}\" at rank {}'.format(qid, r, rank + 1)) 62 | # check whether there are duplicate predicted products for a single query 63 | if len(set(records[1:])) != k: 64 | raise Exception('Query-id {} has duplicate products in your prediction. 
# compute dcg@k for a single sample
def dcg_at_k(r, k):
    """Return the discounted cumulative gain of the first *k* relevance scores.

    The gain is ``r[0] + sum(r[i] / log2(i + 2) for i >= 1)``. This matches
    the usual ``r[i] / log2(i + 2)`` discount because ``log2(2) == 1``.

    Args:
        r: relevance scores in ranked order (any sequence of numbers).
        k: number of leading positions to take into account.

    Returns:
        The DCG as a float, or ``0.`` when no scores are left after
        truncating to *k*.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same float64 conversion on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        # Positions 2..size are discounted by log2(3)..log2(size + 1).
        return r[0] + np.sum(r[1:] / np.log2(np.arange(3, r.size + 2)))
    return 0.
def cal_IDCG(n):
    """Return the ideal DCG for *n* relevant items.

    Computes ``sum(1 / ln(i + 1) for i in 1..n)`` using the natural
    logarithm.

    Args:
        n: number of relevant items; must be at least 1.

    Returns:
        The ideal DCG as a float.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    # An explicit check survives ``python -O``, which strips ``assert``.
    if n < 1:
        raise ValueError("n must be >= 1, got {}".format(n))
    return sum(1 / math.log(i + 1) for i in range(1, n + 1))
score_file, pos_file, submit_file, query_ids_file = sys.argv[1:] 30 | if query_ids_file != 'all': 31 | query_ids = pickle.load(open(query_ids_file, 'rb')) 32 | query_ids = set([q.split(',')[0] for q in query_ids]) 33 | else: 34 | query_ids = None 35 | submit_fout = open(submit_file, 'w') 36 | submit_fout.write('query-id,product1,product2,product3,product4,product5\n') 37 | import json 38 | pos = json.load(open(pos_file, 'r')) 39 | for i in pos: 40 | pos[i] = set([str(j) for j in pos[i]]) 41 | 42 | #score = [l.strip().split(',') for l in open(score_file, 'r')] 43 | from collections import defaultdict 44 | predict = defaultdict(list) 45 | for l in open(score_file, 'r'): 46 | qid, pid, s = l.strip().split(",") 47 | predict[qid].append([pid, float(s)]) 48 | 49 | print('[INFO] length of predition: ', len(predict)) 50 | print('[INFO] length of groundtruth: ', len(pos)) 51 | 52 | valid_query_product = None 53 | if os.path.exists('../../../user_data/valid_query_product.pkl'): 54 | valid_query_product = pickle.load(open('../../../user_data/valid_query_product.pkl','rb')) 55 | 56 | ndcg = 0.0 57 | cnt = 0.0 58 | for qid in predict: 59 | if query_ids_file != 'all': 60 | if qid not in query_ids: continue 61 | cnt += 1 62 | p = sorted(predict[qid], key=lambda x:x [1], reverse=True)[:EVAL_NUM] 63 | hit = [1 if x[0] in pos[qid] else 0 for x in p] 64 | if len(hit) < EVAL_NUM: hit = hit + [0 for _ in range(EVAL_NUM - len(hit))] 65 | dcg = cal_DCG(hit) 66 | pos_num = len(pos[qid]) 67 | idcg = IDCG[5] if pos_num >= 5 else IDCG[pos_num] 68 | ndcg += dcg / idcg 69 | print('[INSPECTION] query is {}, predict products are {}, ndcg is {}'.format(qid, p, dcg / idcg)) 70 | submit_fout.write(qid + ',' + ','.join([x[0] for x in p]) + '\n') 71 | 72 | ndcg = ndcg / cnt 73 | print('length of predict is {}'.format(cnt)) 74 | print("[INFO] ndcg for file {} is {}".format(score_file, ndcg)) 75 | 76 | if valid_query_product is not None: 77 | valid_qids = set(predict.keys()) 78 | train_pids = {pid for 
qid,pid in valid_query_product if qid not in valid_qids} 79 | 80 | 81 | for kkk in [1]: 82 | print('current threshold',kkk) 83 | predict1 = {} 84 | predict2 = {} 85 | 86 | 87 | for qid,items in predict.items(): 88 | pids = set(item[0] for item in items) 89 | if len((pids & train_pids))<=kkk: 90 | predict1[qid] = items 91 | else: 92 | predict2[qid] = items 93 | 94 | 95 | ndcg = 0.0 96 | cnt = 0.0 97 | for qid in predict1: 98 | cnt += 1 99 | p = sorted(predict1[qid], key=lambda x:x [1], reverse=True)[:EVAL_NUM] 100 | hit = [1 if x[0] in pos[qid] else 0 for x in p] 101 | if len(hit) < EVAL_NUM: hit = hit + [0 for _ in range(EVAL_NUM - len(hit))] 102 | dcg = cal_DCG(hit) 103 | pos_num = len(pos[qid]) 104 | idcg = IDCG[5] if pos_num >= 5 else IDCG[pos_num] 105 | ndcg += dcg / idcg 106 | 107 | ndcg = ndcg / (cnt+1e-9) 108 | print('length of predict1 is {}'.format(cnt)) 109 | print("[INFO] n_predict1_dcg for file {} is {}".format(score_file, ndcg)) 110 | 111 | 112 | ndcg = 0.0 113 | cnt = 0.0 114 | for qid in predict2: 115 | cnt += 1 116 | p = sorted(predict2[qid], key=lambda x:x [1], reverse=True)[:EVAL_NUM] 117 | hit = [1 if x[0] in pos[qid] else 0 for x in p] 118 | if len(hit) < EVAL_NUM: hit = hit + [0 for _ in range(EVAL_NUM - len(hit))] 119 | dcg = cal_DCG(hit) 120 | pos_num = len(pos[qid]) 121 | idcg = IDCG[5] if pos_num >= 5 else IDCG[pos_num] 122 | ndcg += dcg / idcg 123 | 124 | ndcg = ndcg / (cnt+1e-9) 125 | print('length of predict2 is {}'.format(cnt)) 126 | print("[INFO] n_predict2_dcg for file {} is {}".format(score_file, ndcg)) 127 | 128 | 129 | 130 | predict1 = {} 131 | predict2 = {} 132 | predict3 = {} 133 | predict4 = {} 134 | for qid,items in predict.items(): 135 | pids = set(item[0] for item in items) 136 | length = len((pids & train_pids)) 137 | if length<=1: 138 | predict1[qid] = items 139 | elif length<= 9: 140 | predict2[qid] = items 141 | elif length <= 17: 142 | predict3[qid] = items 143 | else: 144 | predict4[qid] = items 145 | 146 | 147 | preds 
def read_large_file(file_handler, block_size=100000):
    """Iterate over ``file_handler`` in chunks of lines.

    Lines are accumulated into a list; each time the list reaches
    ``block_size`` entries it is yielded and a fresh list is started.
    Any lines left over when the input is exhausted are yielded as a
    final, shorter list.

    Args:
        file_handler: any iterable of lines (typically an open file).
        block_size: number of lines per yielded list.

    Yields:
        Lists of lines, in input order.
    """
    pending = []
    for record in file_handler:
        pending.append(record)
        if len(pending) != block_size:
            continue
        yield pending
        pending = []
    # Flush the trailing partial chunk, if there is one.
    if len(pending) > 0:
        yield pending
def clean_text(x):
    """Return ``x`` with every entry of the module-level ``puncts`` replaced by a space."""
    cleaned = x
    for mark in puncts:
        cleaned = cleaned.replace(mark, ' ')
    return cleaned


def split_text(text):
    """Tokenize ``text`` into lower-cased words with stopwords removed.

    The sentinel string ``'__EMPTY__'`` is passed through untouched.
    Otherwise punctuation is blanked out, the text is lower-cased and split
    on single spaces, and tokens found in the module-level ``stopwords`` set
    are dropped.  If nothing survives, ``['__EMPTY__']`` is returned and the
    offending input is printed.
    """
    if text == '__EMPTY__':
        return text.split(' ')

    lowered = clean_text(text).lower()
    tokens = [token for token in lowered.split(' ') if token not in stopwords]
    if not tokens or tokens == ['']:
        tokens = ['__EMPTY__']
        print('null',text)
    return tokens


def get_split_text(x):
    """Return the tokens of ``x`` (see ``split_text``) joined by single spaces."""
    tokens = split_text(x)
    return ' '.join(tokens)


def get_sorted_split_text(x):
    """Return the tokens of ``x`` sorted and joined by single spaces."""
    tokens = split_text(x)
    tokens.sort()
    return ' '.join(tokens)


def get_trichar(words):
    """Return all character trigrams of the given words.

    Each word is wrapped in ``#`` markers first, so ``'ab'`` contributes
    ``'#ab'`` and ``'ab#'``.  Trigrams are returned in word order, then
    left-to-right within a word.
    """
    trichars = []
    for word in words:
        padded = "#" + word + "#"
        trichars.extend(padded[start:start + 3] for start in range(len(padded) - 2))
    return trichars
'ark', 74 | 'ashtray', 75 | 'backpack', 76 | 'bag', 77 | 'bags', 78 | 'balloon', 79 | 'balm', 80 | 'basin', 81 | 'basket', 82 | 'beans', 83 | 'bed', 84 | 'belt', 85 | 'bibs', 86 | 'bikini', 87 | 'biscuits', 88 | 'blanket', 89 | 'blocks', 90 | 'blouse', 91 | 'boots', 92 | 'bottle', 93 | 'box', 94 | 'bracelet', 95 | 'brooch', 96 | 'brush', 97 | 'brushes', 98 | 'cabinet', 99 | 'caddy', 100 | 'cap', 101 | 'car', 102 | 'carpet', 103 | 'case', 104 | 'cases', 105 | 'ceiling', 106 | 'censer', 107 | 'chair', 108 | 'chandelier', 109 | 'charger', 110 | 'cheongsam', 111 | 'child', 112 | 'chocolate', 113 | 'chopsticks', 114 | 'clip', 115 | 'clock', 116 | 'clothes', 117 | 'clothesline', 118 | 'clothing', 119 | 'coat', 120 | 'collar', 121 | 'comb', 122 | 'conditioning', 123 | 'control', 124 | 'cord', 125 | 'corsage', 126 | 'costume', 127 | 'cotton', 128 | 'covers', 129 | 'crib', 130 | 'cube', 131 | 'cuff', 132 | 'cufflinks', 133 | 'cup', 134 | 'cups', 135 | 'curtain', 136 | 'dance', 137 | 'desk', 138 | 'device', 139 | 'dispenser', 140 | 'drawers', 141 | 'dress', 142 | 'earrings', 143 | 'enclosures', 144 | 'eyelashes', 145 | 'fan', 146 | 'filter', 147 | 'frame', 148 | 'furniture', 149 | 'gear', 150 | 'glass', 151 | 'gloves', 152 | 'goggles', 153 | 'gyro', 154 | 'hammock', 155 | 'handbag', 156 | 'hat', 157 | 'headband', 158 | 'headdress', 159 | 'headrest', 160 | 'holder', 161 | 'hood', 162 | 'hoodie', 163 | 'humidifier', 164 | 'ink', 165 | 'insole', 166 | 'instruments', 167 | 'inverter', 168 | 'jacket', 169 | 'jeans', 170 | 'johns', 171 | 'kettle', 172 | 'keychain', 173 | 'knit', 174 | 'lantern', 175 | 'leather', 176 | 'leggings', 177 | 'lens', 178 | 'lid', 179 | 'light', 180 | 'lights', 181 | 'linen', 182 | 'lock', 183 | 'low', 184 | 'lumbar', 185 | 'machine', 186 | 'marker', 187 | 'mask', 188 | 'masks', 189 | 'mats', 190 | 'mirror', 191 | 'mold', 192 | 'mug', 193 | 'necklace', 194 | 'nets', 195 | 'nightgown', 196 | 'opener', 197 | 'ornaments', 198 | 'pack', 199 | 'package', 200 | 
def lastword_filter(text):
    """Return True when the last token of ``text`` is in ``valid_test_lastword``.

    The text is tokenized with ``split_text``; only its final token is
    checked against the module-level whitelist.
    """
    final_token = split_text(text)[-1]
    return final_token in valid_test_lastword
def create_estimator_and_specs(run_config, model_json):
    """Build the estimator, train/eval specs and data loader for a run.

    Args:
        run_config: the ``tf.estimator.RunConfig`` handed to the estimator.
        model_json: parsed model configuration.  Reads
            ``model_json['model']`` (``model_name``, ``batch_size``,
            ``epoch`` and, outside local mode, ``max_step``),
            ``model_json['mode']`` and
            ``model_json['export']['checkpoint_interval_secs']``.

    Returns:
        A tuple ``(estimator, train_spec, eval_spec, data_loader)``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported models.
            (Previously this surfaced later as an unbound-variable error.)
    """
    script_mode = flag_setup.FLAGS.script_mode
    model_name = model_json['model']['model_name']
    if model_name == 'att':
        model = att.ATT(model_json, script_mode)
    elif model_name == 'att_list':
        model = att_list.ATTList(model_json, script_mode)
    elif model_name == 'image_sim':
        model = image_sim.ImageSim(model_json, script_mode)
    elif model_name == 'att_kn':
        model = att_kn.ATTKN(model_json, script_mode)
    elif model_name == 'att_interid':
        model = att_interid.ATTInterID(model_json, script_mode)
    else:
        raise ValueError("unsupported model_name: {!r}".format(model_name))

    epoch_steps = int(flag_setup.FLAGS.epoch_samples / model_json['model']['batch_size']) + 1
    epochs = model_json['model']['epoch']
    max_steps = epoch_steps * epochs

    data_loader = data_set.Data_Loader(model_json, script_mode)

    is_local = model_json['mode'] == 'local'

    # Warm starting is optional: only requested when a run id was supplied.
    # The checkpoint root differs between local and cluster runs.
    warm_start_id = flag_setup.FLAGS.warm_start_id
    warm_start = None
    if warm_start_id is not None and warm_start_id != "":
        if is_local:
            ckpt_template = '../../../user_data/export/checkpoint/{}'
        else:
            ckpt_template = "viewfs://hadoop-meituan/user/hadoop-mining/huangjianqiang/data/kdd/model/{}"
        warm_start = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=ckpt_template.format(warm_start_id),
            # Restore everything except the step counter, the Adam power
            # accumulators and variables whose name starts with "extra".
            vars_to_warm_start=['(?!global_step:0)(?!beta1_power:0)(?!beta2_power:0)(?!extra).*']
        )

    model_params = tf.contrib.training.HParams()
    estimator = tf.estimator.Estimator(
        model_fn=model.model_fn,
        config=run_config,
        params=model_params,
        warm_start_from=warm_start  # None is the Estimator default: no warm start
    )

    # Local runs derive the step budget from epochs; cluster runs read it
    # directly from the config.
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: data_loader.generate_tfrecord_dataset(stage=tf.estimator.ModeKeys.TRAIN),
        max_steps=max_steps if is_local else model_json['model']['max_step']
    )

    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: data_loader.generate_tfrecord_dataset(stage=tf.estimator.ModeKeys.EVAL),
        throttle_secs=model_json["export"]["checkpoint_interval_secs"],
        start_delay_secs=30,
        steps=500,
        exporters=EvalResultsExporter('eval-saved-model', model_json, flag_setup.FLAGS.run_id, data_loader.serving_example_input_receiver_fn)
    )

    return estimator, train_spec, eval_spec, data_loader
class Data_Loader(object):
    """Builds TFRecord input pipelines and parsing schemas from the model config.

    The feature lists are derived from ``model_conf["data_schema"]["features"]``;
    file locations and sample counts come from ``flag_setup.FLAGS``.
    """

    def __init__(self, model_conf, mode):
        """Read feature definitions and run settings.

        Args:
            model_conf: parsed model configuration dict.
            mode: script mode; ``"local"`` and ``"afo"`` are the values this
                class branches on.
        """
        self.model_conf = model_conf
        self.mode = mode
        self.neg_num = model_conf['model']['neg_num']
        self.eval_neg_num = model_conf['model']['eval_neg_num']

        # Feature names grouped by the "type" declared in the config.
        self.features = []        # "embedding"
        self.features_sp = []     # "embedding_sp"
        self.features_seq = []    # "embedding_seq"
        self.cont_fea = []        # "countinous" (spelling as used in the config)
        self.cont_fea_last_dim = []
        self.cont_fix_fea = []    # "countinous_fix"
        self.cont_fix_fea_last_dim = []
        for field in model_conf["data_schema"]["features"]:
            field_type = field["type"]
            if field_type == "embedding":
                self.features.append(field["name"])
            elif field_type == "embedding_sp":
                self.features_sp.append(field["name"])
            elif field_type == "embedding_seq":
                self.features_seq.append(field["name"])
            elif field_type == 'countinous':
                self.cont_fea.append(field['name'])
                self.cont_fea_last_dim.append(field['last_dim'])
            elif field_type == 'countinous_fix':
                self.cont_fix_fea.append(field['name'])
                self.cont_fix_fea_last_dim.append(field['last_dim'])

        # Optional extra columns, enabled by the presence of a schema key.
        self.extra_preds = "extra_preds" in model_conf["data_schema"]
        if self.extra_preds:
            self.extra_pred_num = model_conf["model"]["extra_pred_num"]

        self.sample_type = "sample_type" in model_conf["data_schema"]

        self.batch_size = model_conf["model"]["batch_size"]
        self.epoch = model_conf["model"]["epoch"]
        self.epoch_samples = flag_setup.FLAGS.epoch_samples
        if mode == "local":
            self.train_data = flag_setup.FLAGS.train_train_data
            self.eval_data = flag_setup.FLAGS.train_valid_data

        self.job_name = "worker"
        if mode == "afo":
            self.job_name = flag_setup.FLAGS.job_name

        if self.job_name == "chief":
            self.epoch = self.epoch * 2

    @staticmethod
    def int64_list_feature(list_value):
        """Wrap each int sequence in ``list_value`` as a Feature and return a FeatureList."""
        values = []
        for each in list_value:
            values.append(tf.train.Feature(int64_list=tf.train.Int64List(value=each)))
        return tf.train.FeatureList(feature=values)

    @staticmethod
    def _expand_data_path(path, description, allow_comma_list=False):
        """Resolve a data location into the file name(s) to read.

        Args:
            path: a file path, a directory path or (when allowed) a
                comma-separated list of paths.
            description: label used in the error message, e.g. ``"train"``.
            allow_comma_list: when True, a ``path`` containing a comma is
                split on commas and returned without further checks.

        Returns:
            ``path`` itself for a regular file, a list of the directory's
            entries (joined with the directory) for a directory, or the
            split list for a comma-separated value.

        Raises:
            Exception: if ``path`` is neither a file nor a directory.
        """
        if allow_comma_list and ',' in path:
            return path.split(',')
        if os.path.isfile(path):
            return path
        if os.path.isdir(path):
            return [os.path.join(path, name) for name in os.listdir(path)]
        raise Exception("%s data %s not exist." % (description, path))

    def generate_tfrecord_dataset(self, stage):
        """Build the batched, parsed dataset for ``stage``.

        Args:
            stage: a ``tf.estimator.ModeKeys`` value; TRAIN enables shuffling
                and repetition and selects ``neg_num``, anything else uses
                ``eval_neg_num``.

        Returns:
            A dataset yielding ``(features, label_tensor)`` batches.
        """
        def _get_input_files():
            if self.mode == "local":
                if stage == tf.estimator.ModeKeys.TRAIN:
                    return self._expand_data_path(self.train_data, "train", allow_comma_list=True)
                if stage == tf.estimator.ModeKeys.EVAL:
                    # The message used to say "train data" here, which pointed
                    # at the wrong flag when the eval path was missing.
                    return self._expand_data_path(self.eval_data, "eval")
            if self.mode == "afo":
                # In the distributed environment the input data directory
                # defaults to the local "inputs" folder, so collect the files
                # found there.
                file_names = []
                if flag_setup.FLAGS.job_name == 'worker' or flag_setup.FLAGS.job_name == 'evaluator':
                    for current_file_name in os.listdir("inputs"):
                        file_names.append(os.path.join("inputs", current_file_name))
                return file_names

        def _parse_tfrecord_example(raw_serialized, schema):
            features = tf.parse_example(raw_serialized, schema)
            # The label is stored sparse; split it off and densify it.
            label_tensor = features.pop("peudo-label")
            label_tensor = tf.sparse_tensor_to_dense(label_tensor, default_value=0)
            return features, label_tensor

        def _input_fn():
            if self.mode == "afo":
                dataset = tf.data.AfoDataset()  # build the centrally dispatched AFO dataset
            else:
                filenames = _get_input_files()
                print('input filenames',filenames)
                dataset = tf.data.TFRecordDataset(filenames)
                if stage == tf.estimator.ModeKeys.TRAIN:
                    dataset = dataset.shuffle(self.epoch_samples)
                    dataset = dataset.repeat(self.epoch)
                else:
                    dataset = dataset.repeat(1)

            if stage == tf.estimator.ModeKeys.TRAIN:
                neg_num = self.neg_num
            else:
                neg_num = self.eval_neg_num
            example_schema = self.get_schema(neg_num)
            if self.extra_preds:
                example_schema["extra_preds"] = tf.FixedLenFeature(shape=[self.extra_pred_num],dtype=tf.float32)

            if self.sample_type:
                example_schema["sample_type"] = tf.FixedLenFeature(shape=[],dtype=tf.int64)

            example_schema = dict(sorted(example_schema.items(), key=lambda k: k[0]))
            # Batch first, then parse whole batches of serialized examples.
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.map(lambda value: _parse_tfrecord_example(value, example_schema))
            dataset = dataset.prefetch(10)

            return dataset

        return _input_fn()

    def get_schema(self, neg_num):
        """Return the ``tf.parse_example`` schema for one positive plus ``neg_num`` negatives.

        Query features appear once under their own name; item features are
        repeated with suffixes ``_0`` .. ``_<neg_num>``.

        Raises:
            Exception: if a configured feature is listed in neither
                ``query_features`` nor ``item_features``.
        """
        def _build_example_schema(example_schema, feature_name, feature_schema):
            if feature_name in self.model_conf['data_schema']['query_features']:
                example_schema[feature_name] = feature_schema
            elif feature_name in self.model_conf['data_schema']['item_features']:
                for i in range(1 + neg_num):
                    example_schema[feature_name + '_' + str(i)] = feature_schema
            else:
                raise Exception("feature name {} has no owner.".format(feature_name))
            return example_schema

        example_schema = {}
        example_schema["peudo-label"] = INT64_VARLEN
        for k in self.features_sp:
            example_schema = _build_example_schema(example_schema, k, tf.VarLenFeature(dtype=tf.int64))
        for k in self.features:
            example_schema = _build_example_schema(example_schema, k, tf.FixedLenFeature(shape=[],dtype=tf.int64))
        for k in self.features_seq:
            example_schema = _build_example_schema(example_schema, k, tf.FixedLenSequenceFeature([], tf.int64, True))
        for k, dim in zip(self.cont_fea, self.cont_fea_last_dim):
            example_schema = _build_example_schema(example_schema, k, tf.FixedLenSequenceFeature([dim], tf.float32, True))
        for k, dim in zip(self.cont_fix_fea, self.cont_fix_fea_last_dim):
            example_schema = _build_example_schema(example_schema, k, tf.FixedLenFeature([dim], tf.float32))
        example_schema = dict(sorted(example_schema.items(), key=lambda k: k[0]))
        return example_schema

    def serving_example_input_receiver_fn(self):
        """Parse serialized tf.Example input for serving.

        The parsed feature format matches the training input, using a schema
        with no negatives (``neg_num = 0``).
        """
        serialized_tf_example = tf.placeholder(dtype=tf.string, name='input_example_tensor')
        # The key 'examples' defines the name of the serving input argument.
        receiver_tensors = {'examples': serialized_tf_example}
        neg_num = 0
        example_schema = self.get_schema(neg_num)
        features = tf.parse_example(serialized_tf_example, example_schema)
        return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` bytes list."""
    tensor_type = type(tf.constant(0))
    if isinstance(value, tensor_type):
        # BytesList cannot unpack a string held in an EagerTensor.
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def append_p_idx(s, i):
    """Return ``s`` suffixed with ``_<i>`` (the per-product feature name)."""
    suffix = '_' + str(i)
    return s + suffix


def cover(range1,range2):
    """Return the length of the overlap of two ``(start, end)`` intervals.

    Returns 0 when the intervals do not overlap or merely touch.
    """
    lower = max(range1[0], range2[0])
    upper = min(range1[1], range2[1])
    return upper - lower if upper > lower else 0
def create_tf_example(features, product_dict, query_dict, word_dict, mode=1, label=None, extra_preds=None, sample_type=None):
    """Convert one raw input row into a ``tf.train.Example``.

    The row is first expanded by ``parseTrainFileLine``; its positional
    output is then mapped onto named features.  ``label``, ``extra_preds``
    and ``sample_type`` are only written when they are not None.  Product
    features are written ``mode`` times with suffixes ``_0`` .. ``_<mode-1>``,
    every time from the same positions of the parsed row.
    """
    def _int64(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    def _float(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    row = parseTrainFileLine(features, product_dict, query_dict, word_dict)
    feature_map = {}

    if label is not None:
        feature_map['peudo-label'] = _int64([label])

    if extra_preds is not None:
        feature_map['extra_preds'] = _float(extra_preds)

    if sample_type is not None:
        feature_map['sample_type'] = _int64([sample_type])

    # Query-level features live at the front of the parsed row.
    feature_map['ori_query_id'] = _int64([int(row[0])])
    feature_map['query_id'] = _int64([int(row[1])])
    feature_map['query'] = _int64(row[2])
    feature_map['query_words_num'] = _int64([row[3]])
    feature_map['last_word'] = _int64([row[4]])

    # Scalar product fields that are cast to int, with their position
    # relative to the product base index.
    int_scalar_fields = (
        ('ori_product_id', -1),
        ('product_id', 0),
        ('height', 1),
        ('width', 2),
        ('num_boxes', 3),
    )
    # Integer list fields describing the boxes, relative to the same base.
    box_list_fields = (
        ('boxes_position', 8),
        ('boxes_height', 9),
        ('boxes_width', 10),
        ('boxes_area', 11),
    )
    for i in range(mode):
        base = i * 0 + 6  # the base does not advance with i
        for field_name, delta in int_scalar_fields:
            feature_map[append_p_idx(field_name, i)] = _int64([int(row[base + delta])])
        feature_map[append_p_idx('boxes', i)] = _float(row[base + 4])
        feature_map[append_p_idx('boxes_features', i)] = _float(row[base + 5])
        feature_map[append_p_idx('boxes_labels', i)] = _int64(row[base + 6])
        feature_map[append_p_idx('image_area', i)] = _int64([row[base + 7]])
        for field_name, delta in box_list_fields:
            feature_map[append_p_idx(field_name, i)] = _int64(row[base + delta])

    return tf.train.Example(features=tf.train.Features(feature=feature_map))
| all_features = [parseTrainFileLine(feature, product_dict, query_dict, word_dict) for feature in features] 135 | fd = {} 136 | 137 | 138 | features = reduce(lambda x,y: x+y,all_features) 139 | fd['peudo-label'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[1])) 140 | fd['ori_query_id'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[0])])) 141 | fd['query_id'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[1])])) 142 | fd['query'] = tf.train.Feature(int64_list=tf.train.Int64List(value=features[2])) 143 | fd['query_words_num'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features[3]])) 144 | fd['last_word'] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features[4]])) 145 | for i in range(mode): 146 | offset = i * len(all_features[0]) + 6 147 | fd[append_p_idx('ori_product_id', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[offset - 1])])) 148 | fd[append_p_idx('product_id', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[offset + 0])])) 149 | fd[append_p_idx('height', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[offset + 1])])) 150 | fd[append_p_idx('width', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[offset + 2])])) 151 | fd[append_p_idx('num_boxes', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[offset + 3])])) 152 | fd[append_p_idx('boxes', i)] = tf.train.Feature(float_list=tf.train.FloatList(value=features[offset + 4])) 153 | fd[append_p_idx('boxes_features', i)] = tf.train.Feature(float_list=tf.train.FloatList(value=features[offset + 5])) 154 | fd[append_p_idx('boxes_labels', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=features[offset + 6])) 155 | fd[append_p_idx('image_area', i)] = tf.train.Feature(int64_list=tf.train.Int64List(value=[features[offset + 7]])) 156 | fd[append_p_idx('boxes_position', i)] = 
def main(train_label_path, output_tfrecord_path, product_dict_file, query_dict_file, word_dict_file):
    """Convert a tab-separated input file into a TFRecord file.

    Args:
        train_label_path: UTF-8 TSV input; its first line is discarded
            (presumably a header row - confirm against the data files).
        output_tfrecord_path: destination TFRecord file.
        product_dict_file: pickle file holding the product dictionary.
        query_dict_file: pickle file holding the query dictionary.
        word_dict_file: pickle file holding the word dictionary.

    Prints a progress line per block of input lines and the total number of
    rows written.
    """
    def _load_pickle(path):
        # NOTE: pickle can execute arbitrary code; only load dictionaries
        # produced by this project's own scripts.
        # Use a context manager so the handle is closed promptly instead of
        # being left to the garbage collector.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    word_dict = _load_pickle(word_dict_file)
    product_dict = _load_pickle(product_dict_file)
    query_dict = _load_pickle(query_dict_file)

    idx = 0
    with codecs.open(train_label_path, 'r', encoding='utf-8') as file_handler:
        file_handler.readline()  # drop the first line before reading rows
        with tf.python_io.TFRecordWriter(output_tfrecord_path) as writer:
            # Read in blocks so progress can be reported on large inputs.
            for cnt, block in enumerate(read_large_file(file_handler)):
                print("block: {}".format(cnt))
                for b in block:
                    features = b.strip('\n').split('\t')
                    idx += 1
                    example = create_tf_example(features, product_dict, query_dict, word_dict)
                    writer.write(example.SerializeToString())

    print('total num',idx)
/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 |
--------------------------------------------------------------------------------
/code/v1/cv_lgb_train.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Cross-validated LightGBM training on top of per-(query, product) vectors
# loaded from pickles.  Run as a script with five positional arguments:
#   vec_input_date lambdarank with_productid use_category seed
#
# NOTE(review): the indentation / block nesting in this listing was
# reconstructed from statement syntax; confirm it against the repository copy.


import numpy as np
import pickle
from constants import *
import time
import json
import sys
import os
import utils
import base64
import generate_tf_record
import pandas as pd
import lightgbm as lgb
from datetime import datetime
import math
from collections import defaultdict

# Number of top-ranked products scored per query.
EVAL_NUM = 5
def cal_IDCG(n):
    """Return the ideal DCG for ``n`` hits: sum of 1 / ln(i + 1) for i = 1..n."""
    assert(n >= 1)
    res = 0
    for i in range(1, n + 1):
        # math.log is the natural log; cal_DCG uses the same base, so the
        # DCG / IDCG ratio does not depend on it.
        res += 1 / math.log(i+1)
    return res

def cal_DCG(hit, k=EVAL_NUM):
    """Return the DCG of ``hit``, a length-``k`` list of 0/1 relevance flags in rank order."""
    assert(len(hit) == k)
    res = 0
    for idx, h in enumerate(hit):
        # idx is 0-based, so rank r = idx + 1 is discounted by ln(r + 1).
        res += h / math.log(idx + 2)
    return res



# Lookup table: IDCG[i] is the ideal DCG when i relevant items exist (1..EVAL_NUM).
IDCG = {}
for i in range(1, EVAL_NUM + 1):
    IDCG[i] = cal_IDCG(i)


def create_sample(features,qidpid_vecs,wv):
    """Turn one parsed record into ``(id_features, dense_feature_vector)``.

    ``features`` is the sequence returned by
    ``generate_tf_record.parseTrainFileLine`` (see the callers below).  The
    id part is ordered to match ``columns1`` in ``create_dataframe``.
    ``qidpid_vecs`` is indexed by ``(features[0], features[5])`` and its
    entries 0..4 are appended to the dense vector; ``wv`` is indexed by each
    element of ``features[2]``.
    """
    features_id = [features[0], features[5], features[1],features[6], features[7], features[8], features[9], features[3], features[4]]

    qid_pid_vec = qidpid_vecs[(features[0], features[5])]
    # Mean of the wv rows for features[2], mean over rows of features[10]
    # viewed as (-1, 4) and of features[11] viewed as (-1, 2048), followed by
    # the five precomputed entries for this (query, product) pair.
    features_vec = np.concatenate([np.array([wv[i] for i in features[2]]).mean(axis=0), \
                    np.array(features[10]).reshape(-1,4).mean(axis=0), np.array(features[11]).reshape(-1,2048).mean(axis=0),\
                    np.array(qid_pid_vec[0]),
                    qid_pid_vec[1],qid_pid_vec[2],
                    qid_pid_vec[3],qid_pid_vec[4]


                    ])

    return features_id,features_vec


def create_dataframe(example):
    """Build a DataFrame from a list of ``create_sample`` results.

    The id features become the nine named columns of ``columns1``; the dense
    vector becomes columns ``vec0`` .. ``vec{d-1}`` where ``d`` is the length
    of the first sample's vector.
    """
    columns1 = ["origin_query_id","origin_product_id","query_id","product_id","image_h","image_w","num_boxes","words_len","lastword"]

    feas = [i[0] for i in example]
    fea_vecs = [i[1] for i in example]

    columns2 = ["vec{}".format(i) for i in range(fea_vecs[0].shape[0])]

    df1 = pd.DataFrame(np.stack(feas),columns=columns1)
    df2 = pd.DataFrame(np.stack(fea_vecs),columns=columns2)

    df = pd.concat([df1,df2],axis=1)
    return df



if __name__ == "__main__":
    # NOTE(review): when no arguments are given none of these names are
    # bound and the first use below raises NameError.
    if len(sys.argv[1:])>0:
        vec_input_date, lambdarank,with_productid,use_category, seed = sys.argv[1:]


    print('vec_input_date',vec_input_date)
    print('lambdarank',lambdarank)
    print('with_productid',with_productid)
    print('use_category',use_category)
    print('seed',seed)

    lambdarank = int(lambdarank)
    with_productid = int(with_productid)
    use_category = int(use_category)
    seed = int(seed)

    # Seeds the key shuffle and the per-fold example shuffle below.
    np.random.seed(seed)


    feat_imp_dir = '../user_data/feat_imp/'
    lgb_model_dir = '../user_data/lgb_model/'
    output_dir = 'training/output/testA_lgb/'
    valid_output_dir = 'training/output/lgb_prediction/'


    if not os.path.exists(feat_imp_dir):
        os.makedirs(feat_imp_dir)

    if not os.path.exists(lgb_model_dir):
        os.makedirs(lgb_model_dir)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not os.path.exists(valid_output_dir):
        os.makedirs(valid_output_dir)

    # Timestamp used in the names of every artifact written by this run.
    version = datetime.now().strftime("%m%d%H%M%S")


    # These *_path names are not defined in this file; presumably they come
    # from `from constants import *` -- verify.
    word_dict_file = word_dict_path
    query_dict_file = interid_query_dict_path
    product_dict_file = interid_product_dict_path


    # answer_dict maps a query id to its positive product ids; answer_set
    # holds the same information as (query_id, product_id) pairs.
    # NOTE(review): the file objects opened inline here and below are never
    # closed explicitly.
    answer_dict = json.load(open(valid_answer, 'r'))
    answer_set = set()
    for key, value in answer_dict.items():
        for pid in value:
            answer_set.add((key, pid))


    qidpid_vecs = pickle.load(open("training/output/prediction_vec_{}/prediction_{}".format(vec_input_date,vec_input_date),'rb'))
    from collections import defaultdict
    # Baseline: rank each query's products by element 0 of the stored vector.
    predict = defaultdict(list)
    for qid,pid in qidpid_vecs:

        predict[qid].append([pid, qidpid_vecs[(qid,pid)][0]])

    ndcgs = []
    ndcg = 0.0
    cnt = 0.0
    for qid in predict:
        cnt += 1
        p = sorted(predict[qid], key=lambda x:x [1], reverse=True)[:EVAL_NUM]
        hit = [1 if x[0] in answer_dict[str(qid)] else 0 for x in p]
        # Pad with misses when a query has fewer than EVAL_NUM candidates.
        if len(hit) < EVAL_NUM: hit = hit + [0 for _ in range(EVAL_NUM - len(hit))]
        dcg = cal_DCG(hit)
        pos_num = len(answer_dict[str(qid)] )
        # NOTE(review): IDCG has no key 0, so a query with an empty answer
        # list would raise KeyError here.
        idcg = IDCG[5] if pos_num >= 5 else IDCG[pos_num]
        ndcg += dcg / idcg

    ndcg = ndcg / cnt

    print('length of predict is {}'.format(cnt))
    print("[INFO] ndcg is {}".format(ndcg))
    ndcgs.append(ndcg)
    print('ndcg mean',np.mean(ndcgs))

    word_dict = pickle.load(open(word_dict_file, 'rb'))
    product_dict = pickle.load(open(product_dict_file, 'rb'))
    query_dict = pickle.load(open(query_dict_file, 'rb'))
    cv_fold = 5

    # Folds are formed over query ids: shuffle once, then slice.
    keys = list(answer_dict.keys())
    np.random.shuffle(keys)
    split_num = int(len(keys)/cv_fold)





    wv = pickle.load(open(blend_word2vec_path,'rb'))



    all_test_example = []

    test_qidpid_vecs = pickle.load(open("training/output/testA_vec/submit_{}".format(vec_input_date),'rb'))

    # Build the test feature frame once; every fold's model predicts on it.
    with open(testA_tsv, 'r') as fin:
        header = fin.readline()
        for line in fin:

            features = line.strip('\n').split('\t')

            features = generate_tf_record.parseTrainFileLine(features, product_dict, query_dict, word_dict, )

            example = create_sample(features,test_qidpid_vecs,wv)
            all_test_example.append(example)

    df_test = create_dataframe(all_test_example)

    # Id columns are never model inputs; product_id is kept (as a
    # categorical) only when with_productid == 1.
    if with_productid==1:
        drop_cols = ["origin_query_id","origin_product_id","query_id"]
        categories = ["product_id"]
    else:
        drop_cols = ["origin_query_id","origin_product_id","query_id","product_id"]
        categories = []

    if use_category != 1:
        categories = []


    X_test = df_test.drop(drop_cols,axis=1)
    all_preds = []




    def modeling(train_X, train_Y,train_group, test_X, test_Y,test_gorup, categoricals, mode, OPT_ROUNDS=600):
        """Train and return a LightGBM booster.

        With ``mode == 'valid'`` training runs for up to ``MAX_ROUNDS`` rounds
        with early stopping evaluated on ``(test_X, test_Y, test_gorup)``.
        Any other ``mode`` trains for ``OPT_ROUNDS`` rounds and reports
        metrics on the training set only.  Both branches write a
        feature-importance CSV under ``feat_imp_dir``.

        Reads ``seed``, ``lambdarank``, ``version`` and ``feat_imp_dir`` from
        the enclosing script scope.
        """

        EARLY_STOP = 100
        OPT_ROUNDS = OPT_ROUNDS
        MAX_ROUNDS = 1000
        params = {
            'boosting': 'gbdt',
            # 'metric' : 'binary_logloss',
            'metric' : ['ndcg'],
            'objective': 'binary',
            'learning_rate': 0.02,
            'max_depth': -1,
            'min_child_samples': 20,
            'max_bin': 255,
            'subsample': 0.85,
            'subsample_freq': 10,
            'colsample_bytree': 0.8,
            'min_child_weight': 0.001,
            'subsample_for_bin': 200000,
            'min_split_gain': 0,
            'reg_alpha': 0,
            'reg_lambda': 0,
            'num_leaves':63,
            'seed': seed,
            'nthread': 16,
            'scale_pos_weight': 1,
            'eval_at': 5
            #'is_unbalance': True,
        }

        # The command-line flag switches the objective from binary to ranking.
        if lambdarank == 1:
            params['objective'] = "lambdarank"

        print(params)
        print('Now Version {}'.format(version))
        if mode == 'valid':
            print('Start train and validate...')
            print('feature number:', len(train_X.columns))
            feat_cols = list(train_X.columns)

            dtrain = lgb.Dataset(data=train_X, label=train_Y, feature_name=feat_cols, group=train_group)
            dvalid = lgb.Dataset(data=test_X, label=test_Y, feature_name=feat_cols,group=test_gorup )
            model = lgb.train(params,
                              dtrain,
                              categorical_feature=categoricals,
                              num_boost_round=MAX_ROUNDS,
                              early_stopping_rounds=EARLY_STOP,
                              verbose_eval=50,
                              valid_sets=[dvalid],
                              valid_names=['valid']
                              )
            importances = pd.DataFrame({'features':model.feature_name(),
                                        'importances':model.feature_importance()})
            importances.sort_values('importances',ascending=False,inplace=True)
            print(importances)
            importances.to_csv( (feat_imp_dir+'{}_imp.csv').format(version), index=False )
            return model
        else:
            print('Start training... Please set OPT-ROUNDS.')
            feat_cols = list(train_X.columns)
            dtrain = lgb.Dataset(data=train_X, label=train_Y, feature_name=feat_cols, group=train_group)
            print('feature number:', len(train_X.columns))
            print('feature :', train_X.columns)
            model = lgb.train(params,
                              dtrain,
                              categorical_feature=categoricals,
                              num_boost_round=OPT_ROUNDS,
                              verbose_eval=50,
                              valid_sets=[dtrain],
                              valid_names='train'
                              )

            importances = pd.DataFrame({'features':model.feature_name(),
                                        'importances':model.feature_importance()})
            importances.sort_values('importances',ascending=False,inplace=True)
            importances.to_csv( (feat_imp_dir+'{}_imp.csv').format(version), index=False )


            return model

    # Out-of-fold predictions as [qid, pid, score] rows, across all folds.
    all_valid_preds = []

    ndcgs = []

    for i in range(cv_fold):

        # The last fold takes the remainder so every query id is held out once.
        if i == cv_fold-1:
            valid_idx = keys[i*split_num:]
        else:
            valid_idx = keys[i * split_num:(i+1) * split_num]

        train_idx = set(keys) - set(valid_idx)
        train_keys = train_idx
        print('train keys len',len(train_keys))

        all_example = []
        pos = 0
        neg = 0
        train_num = 0
        valid_num = 0
        idx = 0

        # The labelled TSV is re-read for every fold; each row gets a 0/1
        # label appended depending on whether (last column, int(first
        # column)) is in answer_set.
        with open(valid_tsv, 'r') as fin:
            header = fin.readline()
            for line in fin:
                features = line.strip('\n').split('\t')
                if (features[-1], int(features[0])) in answer_set:
                    features = features + [1]
                    pos += 1
                else:
                    features = features + [0]
                    neg += 1
                all_example.append(features)
                idx += 1

        np.random.shuffle(all_example)
        train_example = []
        train_label = []
        valid_example = []
        valid_label = []

        for features in all_example:
            # features[-2] is the row's original last column (the value
            # matched against answer_dict keys above); features[-1] is the
            # label appended above and is stripped before parsing.
            if features[-2] in train_keys:
                train_label.append(features[-1])
                features = generate_tf_record.parseTrainFileLine(features[:-1], product_dict, query_dict, word_dict, )

                example = create_sample(features, qidpid_vecs,wv)
                train_example.append(example)

                train_num += 1
            else:
                valid_label.append(features[-1])
                features = generate_tf_record.parseTrainFileLine(features[:-1], product_dict, query_dict, word_dict, )

                example = create_sample(features, qidpid_vecs,wv)
                valid_example.append(example)

                valid_num += 1


        print("pos num:{},neg num:{}, train num:{},valid num:{}".format(pos, neg,train_num,valid_num))

        df_train = create_dataframe(train_example)
        df_train['label'] = train_label
        df_valid = create_dataframe(valid_example)
        df_valid['label'] = valid_label

        # Rows are sorted by query id so each query's rows are contiguous,
        # and the group sizes are taken in sorted-id order to match.
        df_train = df_train.sort_values("origin_query_id")
        df_valid = df_valid.sort_values("origin_query_id")

        train_label = df_train.pop('label')
        valid_label = df_valid.pop('label')

        train_group = df_train["origin_query_id"].value_counts().sort_index().values
        valid_group = df_valid["origin_query_id"].value_counts().sort_index().values


        train_X = df_train.drop(drop_cols,axis=1)
        valid_X = df_valid.drop(drop_cols,axis=1)
        model = modeling(train_X, train_label,train_group , valid_X, valid_label,valid_group, categories, "valid")

        pred = model.predict(valid_X)
        model.save_model( lgb_model_dir+'{}.model_cv{}'.format(version,i) )

        pred_test = model.predict(X_test)
        all_preds.append(pred_test)



        # Score this fold's held-out queries with the same NDCG@EVAL_NUM
        # computation as the baseline above.
        predict = defaultdict(list)
        for qid, pid, s in zip(df_valid['origin_query_id'].values,df_valid['origin_product_id'],pred):
            predict[qid].append([pid, float(s)])
            all_valid_preds.append([qid,pid,s])

        ndcg = 0.0
        cnt = 0.0
        for qid in predict:
            cnt += 1
            p = sorted(predict[qid], key=lambda x:x [1], reverse=True)[:EVAL_NUM]
            hit = [1 if x[0] in answer_dict[str(qid)] else 0 for x in p]
            if len(hit) < EVAL_NUM: hit = hit + [0 for _ in range(EVAL_NUM - len(hit))]
            dcg = cal_DCG(hit)
            pos_num = len(answer_dict[str(qid)] )
            idcg = IDCG[5] if pos_num >= 5 else IDCG[pos_num]
            ndcg += dcg / idcg

        ndcg = ndcg / cnt

        print('length of predict is {}'.format(cnt))
        print("[INFO] ndcg is {}".format(ndcg))
        ndcgs.append(ndcg)

    print('ndcgs',ndcgs)
    print('ndcg mean',np.mean(ndcgs))

    # Final test score = mean of the per-fold test predictions.
    test_pred = np.array(all_preds).mean(axis=0)
    qid = df_test['origin_query_id'].values
    pid = df_test['origin_product_id'].values


    # Out-of-fold predictions, one "qid,pid,score" line per row.
    valid_out_file_path = valid_output_dir+"prediction_{}".format(version)
    with open(valid_out_file_path,'w') as out_file:
        for idx in range(len(all_valid_preds)):
            cur = all_valid_preds[idx]
            out_file.write(','.join([str(j) for j in cur]) + '\n')


    out_file_path = output_dir+"lgb{}".format(version)

    # Averaged test predictions in the same "qid,pid,score" format.
    with open(out_file_path,'w') as out_file:
        for idx in range(len(test_pred)):
            ctr = test_pred[idx]
            out_file.write(
                str(qid[idx]) + ',' + str(pid[idx]) + ',' + str(ctr) + "\n")
-------------------------------------------------------------------------------- /code/v1/src/model/helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import math 4 | 5 | def multiplication_attention(query, doc, mask, name=None): 6 | query = tf.expand_dims(query, axis=1) 7 | query = tf.tile(query, [1, tf.shape(doc)[1], 1]) 8 | enc = tf.concat([doc, query], axis=2) 9 | e = tf.layers.dense(enc, 1, kernel_initializer=tf.initializers.truncated_normal(0, 0.05), name='query_doc_' + name, reuse=True) 10 | mask = tf.expand_dim(mask, axis=1) 11 | e = tf.exp(e) * mask 12 | e = tf.reduce_sum(e, axis=1) 13 | 14 | weights = tf.nn.softmax(e, axis=1) 15 | 16 | def attention_net_v2(enc, dec, bias=None, name="", scope="intra_session_attention_"): 17 | 18 | with tf.variable_scope(scope + name, reuse=tf.AUTO_REUSE): 19 | dec = tf.tile(tf.expand_dims(dec, axis=1), [1, tf.shape(enc)[1], 1]) 20 | if bias is None: 21 | concate = tf.concat([enc, dec], axis=2) 22 | else: 23 | concate = tf.concat([enc, dec, bias], axis=2) 24 | e = tf.layers.dense(concate, 1, kernel_initializer=tf.initializers.truncated_normal(0, 0.05)) 25 | weights = tf.nn.softmax(e, axis=1) 26 | return tf.squeeze(tf.matmul(tf.transpose(weights, [0, 2, 1]), enc), axis=1), weights 27 | 28 | 29 | 30 | def tanh_sigmoid(inputs,units,activation=tf.nn.tanh): 31 | outputs = tf.layers.dense(inputs,units=units, activation=activation) 32 | gate = tf.layers.dense(inputs, units=units, activation=tf.nn.sigmoid) 33 | outputs = outputs * gate 34 | return outputs 35 | 36 | 37 | def dot_attention_with_query(att_query,att_key,att_value,mask,activation=None,add_bias=True,scale=None,scale_dot=False): 38 | 39 | weights = tf.matmul(tf.expand_dims(att_query,axis=1),tf.transpose(att_key,[0,2,1])) 40 | if scale_dot: 41 | weights = weights/(att_query.get_shape().as_list()[-1]**0.5) 42 | 43 | if add_bias: 44 | 45 | bias = 
tf.get_variable('att_bias',[1],initializer=tf.zeros_initializer()) 46 | weights = weights + bias 47 | 48 | if activation: 49 | weights = activation(weights) 50 | 51 | if scale is not None: 52 | weights = scale * weights 53 | 54 | score = tf.exp(weights) 55 | 56 | 57 | if mask is not None: 58 | mask = tf.expand_dims(mask,axis=1) 59 | score = score * mask 60 | 61 | score_sum = tf.reduce_sum(score,axis=2,keepdims=True) + 1e-10 62 | score = score/score_sum 63 | 64 | value = tf.squeeze(tf.matmul(score,att_value),axis=1) 65 | return value,score 66 | 67 | def attention_with_query(att_query,att_key,att_value,mask,activation=None,add_bias=True,scale=None): 68 | 69 | 70 | att_query_tile = tf.expand_dims(att_query,axis=1) 71 | att_query_tile = tf.tile(att_query_tile,[1,tf.shape(att_key)[1],1]) 72 | 73 | weights = tanh_sigmoid(tf.concat([att_query_tile,att_key],axis=-1),units=512) 74 | weights = tf.layers.dense(weights,units=1,activation=activation,use_bias=add_bias) 75 | weights = tf.transpose(weights,[0,2,1]) 76 | 77 | # weights = tf.matmul(tf.expand_dims(att_query,axis=1),tf.transpose(att_key,[0,2,1]))#/(att_query.get_shape().as_list()[-1]**0.5) 78 | 79 | # if add_bias: 80 | 81 | # bias = tf.get_variable('att_bias',[1],initializer=tf.zeros_initializer()) 82 | # weights = weights + bias 83 | 84 | # if activation: 85 | # weights = activation(weights) 86 | 87 | # if scale is not None: 88 | # weights = scale * weights 89 | 90 | score = tf.exp(weights) 91 | 92 | 93 | if mask is not None: 94 | mask = tf.expand_dims(mask,axis=1) 95 | score = score * mask 96 | 97 | score_sum = tf.reduce_sum(score,axis=2,keepdims=True) + 1e-10 98 | score = score/score_sum 99 | 100 | value = tf.squeeze(tf.matmul(score,att_value),axis=1) 101 | return value,score 102 | 103 | 104 | def layer_norm_vars(units): 105 | """Create Variables for layer norm.""" 106 | scale = tf.get_variable( 107 | "layer_norm_scale", [units], initializer=tf.ones_initializer()) 108 | bias = tf.get_variable( 109 | 
"layer_norm_bias", [units], initializer=tf.zeros_initializer()) 110 | return scale, bias 111 | 112 | 113 | def conditional_layer_norm_vars(units, conditional_input): 114 | """Conditional Layer Norm""" 115 | scale = tf.layers.dense(conditional_input, units, use_bias=True, name='layer_norm_scale_dense') 116 | bias = tf.layers.dense(conditional_input, units, use_bias=True, name='layer_norm_scale_bias') 117 | return scale, bias 118 | 119 | 120 | def layer_norm_compute(x, epsilon, scale, bias): 121 | """Layer norm raw computation.""" 122 | mean = tf.reduce_mean(x, axis=[-1], keepdims=True) 123 | variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) 124 | norm_x = (x - mean) * tf.rsqrt(variance + epsilon) 125 | return norm_x * scale + bias 126 | 127 | 128 | def conditional_layer_norm(x, units=None, epsilon=1e-6, name=None, reuse=None, conditional_input=None): 129 | """Layer normalize the tensor x, averaging over the last dimension.""" 130 | with tf.variable_scope(name, default_name="layer_norm", reuse=reuse): 131 | 132 | scale, bias = conditional_layer_norm_vars(units, conditional_input=conditional_input) 133 | 134 | return layer_norm_compute(x, epsilon, scale, bias) 135 | 136 | 137 | def conditional_layer_norm_with_query(x, units=None, epsilon=1e-6, name=None, reuse=None, conditional_input=None): 138 | """Layer normalize the tensor x, averaging over the last dimension.""" 139 | with tf.variable_scope(name, default_name="layer_norm", reuse=reuse): 140 | 141 | scale, bias = conditional_layer_norm_vars(units, conditional_input=conditional_input) 142 | scale = tf.expand_dims(scale, axis=1) 143 | bias = tf.expand_dims(bias, axis=1) 144 | 145 | return layer_norm_compute(x, epsilon, scale, bias) 146 | 147 | 148 | def layer_norm(x, units=None, epsilon=1e-6, name=None, reuse=None): 149 | """Layer normalize the tensor x, averaging over the last dimension.""" 150 | with tf.variable_scope(name, default_name="layer_norm", reuse=reuse): 151 | scale, bias = 
layer_norm_vars(units) 152 | return layer_norm_compute(x, epsilon, scale, bias) 153 | 154 | 155 | 156 | def attention(seq,mask,activation=None): 157 | weights = tf.layers.dense(seq,units=1,activation=activation) 158 | 159 | 160 | score = tf.exp(weights) 161 | score = tf.transpose(score,[0,2,1]) 162 | 163 | if mask is not None: 164 | mask = tf.expand_dims(mask,axis=1) 165 | score = score * mask 166 | 167 | score_sum = tf.reduce_sum(score,axis=2,keepdims=True) + 1e-10 168 | score = score/score_sum 169 | 170 | value = tf.squeeze(tf.matmul(score,seq),axis=1) 171 | return value 172 | 173 | 174 | 175 | 176 | def get_shape_list(tensor, expected_rank=None, name=None): 177 | if name is None: 178 | name = tensor.name 179 | 180 | shape = tensor.shape.as_list() 181 | 182 | non_static_indexes = [] 183 | for (index, dim) in enumerate(shape): 184 | if dim is None: 185 | non_static_indexes.append(index) 186 | 187 | if not non_static_indexes: 188 | return shape 189 | 190 | dyn_shape = tf.shape(tensor) 191 | for index in non_static_indexes: 192 | shape[index] = dyn_shape[index] 193 | return shape 194 | 195 | 196 | def reshape_to_matrix(input_tensor): 197 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 198 | ndims = input_tensor.shape.ndims 199 | if ndims < 2: 200 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 201 | (input_tensor.shape)) 202 | if ndims == 2: 203 | return input_tensor 204 | 205 | width = input_tensor.shape[-1] 206 | output_tensor = tf.reshape(input_tensor, [-1, width]) 207 | return output_tensor 208 | 209 | 210 | def create_initializer(initializer_range=0.02): 211 | """Creates a `truncated_normal_initializer` with the given range.""" 212 | return tf.truncated_normal_initializer(stddev=initializer_range) 213 | 214 | def dropout(input_tensor, dropout_prob): 215 | """Perform dropout. 216 | Args: 217 | input_tensor: float Tensor. 218 | dropout_prob: Python float. 
The probability of dropping out a value (NOT of 219 | *keeping* a dimension as in `tf.nn.dropout`). 220 | Returns: 221 | A version of `input_tensor` with dropout applied. 222 | """ 223 | if dropout_prob is None or dropout_prob == 0.0: 224 | return input_tensor 225 | 226 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 227 | return output 228 | 229 | 230 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 231 | """Create 3D attention mask from a 2D tensor mask. 232 | Args: 233 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 234 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 235 | Returns: 236 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 237 | """ 238 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 239 | batch_size = from_shape[0] 240 | from_seq_length = from_shape[1] 241 | 242 | to_shape = get_shape_list(to_mask, expected_rank=2) 243 | to_seq_length = to_shape[1] 244 | 245 | to_mask = tf.cast( 246 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 247 | 248 | # We don't assume that `from_tensor` is a mask (although it could be). We 249 | # don't actually care if we attend *from* padding tokens (only *to* padding) 250 | # tokens so we create a tensor of all ones. 251 | # 252 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 253 | broadcast_ones = tf.ones( 254 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 255 | 256 | # Here we broadcast along two dimensions to create the mask. 
257 | mask = broadcast_ones * to_mask 258 | 259 | return mask 260 | 261 | 262 | 263 | def self_attention_layer(from_tensor, 264 | to_tensor, 265 | attention_mask=None, 266 | num_attention_heads=1, 267 | size_per_head=512, 268 | query_act=None, 269 | key_act=None, 270 | value_act=None, 271 | attention_probs_dropout_prob=0.0, 272 | initializer_range=0.02, 273 | do_return_2d_tensor=False, 274 | batch_size=None, 275 | from_seq_length=None, 276 | to_seq_length=None): 277 | 278 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 279 | seq_length, width): 280 | output_tensor = tf.reshape( 281 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 282 | 283 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 284 | return output_tensor 285 | 286 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 287 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 288 | 289 | if len(from_shape) != len(to_shape): 290 | raise ValueError( 291 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 292 | 293 | if len(from_shape) == 3: 294 | batch_size = from_shape[0] 295 | from_seq_length = from_shape[1] 296 | to_seq_length = to_shape[1] 297 | elif len(from_shape) == 2: 298 | if (batch_size is None or from_seq_length is None or to_seq_length is None): 299 | raise ValueError( 300 | "When passing in rank 2 tensors to attention_layer, the values " 301 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 302 | "must all be specified.") 303 | 304 | # Scalar dimensions referenced here: 305 | # B = batch size (number of sequences) 306 | # F = `from_tensor` sequence length 307 | # T = `to_tensor` sequence length 308 | # N = `num_attention_heads` 309 | # H = `size_per_head` 310 | 311 | from_tensor_2d = reshape_to_matrix(from_tensor) 312 | to_tensor_2d = reshape_to_matrix(to_tensor) 313 | 314 | # `query_layer` = [B*F, N*H] 315 | query_layer = tf.layers.dense( 316 | from_tensor_2d, 317 | num_attention_heads * 
size_per_head, 318 | activation=query_act, 319 | name="query", 320 | kernel_initializer=create_initializer(initializer_range)) 321 | 322 | # `key_layer` = [B*T, N*H] 323 | key_layer = tf.layers.dense( 324 | to_tensor_2d, 325 | num_attention_heads * size_per_head, 326 | activation=key_act, 327 | name="key", 328 | kernel_initializer=create_initializer(initializer_range)) 329 | 330 | # `value_layer` = [B*T, N*H] 331 | value_layer = tf.layers.dense( 332 | to_tensor_2d, 333 | num_attention_heads * size_per_head, 334 | activation=value_act, 335 | name="value", 336 | kernel_initializer=create_initializer(initializer_range)) 337 | 338 | # `query_layer` = [B, N, F, H] 339 | query_layer = transpose_for_scores(query_layer, batch_size, 340 | num_attention_heads, from_seq_length, 341 | size_per_head) 342 | 343 | # `key_layer` = [B, N, T, H] 344 | key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, 345 | to_seq_length, size_per_head) 346 | 347 | # Take the dot product between "query" and "key" to get the raw 348 | # attention scores. 349 | # `attention_scores` = [B, N, F, T] 350 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 351 | attention_scores = tf.multiply(attention_scores,1.0 / math.sqrt(float(size_per_head))) 352 | 353 | if attention_mask is not None: 354 | # `attention_mask` = [B, 1, F, T] 355 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 356 | 357 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 358 | # masked positions, this operation will create a tensor which is 0.0 for 359 | # positions we want to attend and -10000.0 for masked positions. 360 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 361 | 362 | # Since we are adding it to the raw scores before the softmax, this is 363 | # effectively the same as removing these entirely. 364 | attention_scores += adder 365 | 366 | # Normalize the attention scores to probabilities. 
367 | # `attention_probs` = [B, N, F, T] 368 | attention_probs = tf.nn.softmax(attention_scores) 369 | 370 | # This is actually dropping out entire tokens to attend to, which might 371 | # seem a bit unusual, but is taken from the original Transformer paper. 372 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 373 | 374 | # `value_layer` = [B, T, N, H] 375 | value_layer = tf.reshape( 376 | value_layer, 377 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 378 | 379 | # `value_layer` = [B, N, T, H] 380 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 381 | 382 | # `context_layer` = [B, N, F, H] 383 | context_layer = tf.matmul(attention_probs, value_layer) 384 | 385 | # `context_layer` = [B, F, N, H] 386 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 387 | 388 | if do_return_2d_tensor: 389 | # `context_layer` = [B*F, N*H] 390 | context_layer = tf.reshape( 391 | context_layer, 392 | [batch_size * from_seq_length, num_attention_heads * size_per_head]) 393 | else: 394 | # `context_layer` = [B, F, N*H] 395 | context_layer = tf.reshape( 396 | context_layer, 397 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 398 | 399 | return context_layer 400 | 401 | def append_idx(name, i): 402 | return name + '_' + str(i) 403 | 404 | image_feature_mean=[0] 405 | image_feature_std=[0] 406 | -------------------------------------------------------------------------------- /code/v1/src/model/att_kn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import utils.flag_setup as flag_setup 4 | import numpy as np 5 | from helper import append_idx 6 | import helper 7 | import pickle 8 | class ATTKN(object): 9 | def __init__(self, model_json, mode): 10 | blend_word2vec_path = '../../../user_data/blend_word2vec.pkl' 11 | if mode == 'afo': 12 | blend_word2vec_path = 'training.tar.gz/src/blend_word2vec.pkl' 13 | 14 | 
self.cur_run_mode = mode 15 | self.model_json = model_json 16 | self.neg_num = model_json["model"]["neg_num"] 17 | self.eval_neg_num = model_json["model"]["eval_neg_num"] 18 | self.NUM_NAME = 'num_boxes' 19 | self.run_id = flag_setup.FLAGS.run_id 20 | self.model_name = model_json["model"]["model_name"] 21 | self.hidden_layer = model_json["model"]["hidden_layers"] 22 | self.embedding_size = {} 23 | for field in model_json["data_schema"]["features"]: 24 | if field['type'] == 'embedding' or field['type'] == 'embedding_sp' or field['type'] == 'embedding_seq': 25 | self.embedding_size[field["name"]] = field["max"] + 1 26 | 27 | self.learning_rate = model_json["model"]["learning_rate"] 28 | self.epoch = model_json["model"]["epoch"] 29 | self.batch_size = model_json["model"]["batch_size"] 30 | self.job_name = "worker" 31 | if mode == "afo": 32 | self.job_name = flag_setup.FLAGS.job_name 33 | self.word_vecs = pickle.load(open(blend_word2vec_path,'rb')) 34 | print('word2vec path',blend_word2vec_path) 35 | 36 | 37 | def cal_logit(self, query_seq,query_num,query_mask, query_emb, query_lastword, boxes_embedding, label_embedding,\ 38 | boxes_num,\ 39 | height, width, area, item_mask, mode): 40 | 41 | 42 | training = (mode == tf.estimator.ModeKeys.TRAIN) 43 | 44 | 45 | # with tf.variable_scope("item_semantic", reuse=tf.AUTO_REUSE): 46 | 47 | 48 | 49 | with tf.variable_scope("cross", reuse=tf.AUTO_REUSE): 50 | # label_features, boxes_area_ratio_embedding, left_id_ratio_embedding, width_ratio_embedding, top_id_ratio_embedding,\ 51 | # heigth_ratio_embedding,boxes_position,boxes_height,boxes_width,boxes_area = label_embedding 52 | 53 | # boxes_concat = tf.concat([boxes_embedding]+label_embedding,axis=-1) 54 | 55 | # boxes_concat_shape1 = tf.shape(boxes_concat)[1] 56 | 57 | 58 | with tf.variable_scope('query_semantic'): 59 | # query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 60 | # query_emb = helper.tanh_sigmoid(query_emb,300) 61 | # query_emb = 
tf.layers.batch_normalization(query_emb, training=training) 62 | # query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 63 | query_emb = tf.layers.dense(query_emb,300,activation=tf.nn.relu) 64 | query_emb = helper.layer_norm(query_emb,300) 65 | query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 66 | 67 | 68 | with tf.variable_scope('item_semantic',reuse=tf.AUTO_REUSE): 69 | boxes_embedding = tf.layers.dense(boxes_embedding,units=1024,activation=tf.nn.relu) 70 | boxes_embedding = helper.layer_norm(boxes_embedding,1024) 71 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 72 | 73 | # boxes_embedding = helper.tanh_sigmoid(boxes_embedding,384) 74 | boxes_embedding = tf.layers.dense(boxes_embedding,units=512,activation=tf.nn.relu) 75 | boxes_embedding = helper.layer_norm(boxes_embedding,512) 76 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 77 | 78 | 79 | # boxes_embedding = helper.tanh_sigmoid(boxes_embedding,300) 80 | boxes_embedding = tf.layers.dense(boxes_embedding,units=300,activation=tf.nn.relu) 81 | boxes_embedding = helper.layer_norm(boxes_embedding,300) 82 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 83 | 84 | # label_embedding = helper.tanh_sigmoid(tf.concat(label_embedding,axis=-1),300) 85 | label_embedding = tf.layers.dense(tf.concat(label_embedding,axis=-1),units=300,activation=tf.nn.relu) 86 | label_embedding = helper.layer_norm(label_embedding,300) 87 | label_embedding = tf.layers.dropout(label_embedding, rate=0.1, training=training) 88 | 89 | 90 | boxes_concat = tf.concat([boxes_embedding,label_embedding],axis=-1) 91 | boxes_value = tf.layers.dense(boxes_concat,300,activation=tf.nn.relu) 92 | # boxes_value = tf.layers.tanh_sigmoid(boxes_concat,300) 93 | boxes_value = helper.layer_norm(boxes_value,300) 94 | boxes_value = tf.layers.dropout(boxes_value, rate=0.1, training=training) 95 | 96 | # boxes_concat = 
helper.conditional_layer_norm_with_query(boxes_concat,300,conditional_input=query_emb) 97 | # boxes_concat = tf.layers.dropout(boxes_concat, rate=0.1, training=training) 98 | 99 | 100 | # query_query = query_emb 101 | # # query_query = tf.concat([query_emb,query_lastword],axis=-1) 102 | # boxes_key = boxes_concat 103 | # boxes_value = boxes_concat 104 | 105 | 106 | 107 | 108 | 109 | with tf.variable_scope("query_image_image_attention", reuse=tf.AUTO_REUSE): 110 | # query_query = tf.layers.dense(query_emb,100) 111 | # boxes_key = tf.layers.dense(boxes_concat,100) 112 | 113 | # query_query = helper.tanh_sigmoid(query_emb,100) 114 | # boxes_key = helper.tanh_sigmoid(boxes_concat,100) 115 | 116 | query_query = query_emb 117 | boxes_key = boxes_value 118 | 119 | att_query_image_image,softmax_score = helper.dot_attention_with_query(query_query, boxes_key, boxes_value, mask=item_mask,scale_dot=True) 120 | 121 | # att_query_image_image,softmax_score = helper.attention_with_query(query_query, boxes_key, boxes_value, mask=item_mask) 122 | 123 | # att_query_image_image = tf.reduce_sum(boxes_value*tf.expand_dims(item_mask,axis=2),axis=1)/tf.expand_dims(tf.cast(boxes_num,dtype=tf.float32),axis=1) 124 | 125 | query_out = query_emb 126 | # query_out = helper.tanh_sigmoid(query_emb, 300) 127 | # query_out = tf.layers.batch_normalization(query_out, training=training) 128 | # query_out = tf.layers.dropout(query_out, rate=0.1, training=training) 129 | 130 | image_out = att_query_image_image 131 | # image_out = helper.tanh_sigmoid(att_query_image_image, 300) 132 | # image_out = tf.layers.batch_normalization(image_out, training=training) 133 | # image_out = tf.layers.dropout(image_out, rate=0.1, training=training) 134 | 135 | # image_out = helper.conditional_layer_norm(image_out,units=300,conditional_input=query_out) 136 | 137 | # image_out = tf.layers.batch_normalization(image_out, training=training) 138 | # image_out = tf.layers.dropout(image_out, rate=0.1, training=training) 139 | # 
att_query_image_image,softmax_score = helper.attention_with_query(query_query, boxes_value, boxes_value, mask=item_mask, activation=None, scale=None) 140 | 141 | 142 | # att_query_image_image_ex = tf.expand_dims(att_query_image_image, axis=1) 143 | # image_tile = tf.tile(att_query_image_image_ex,[1,tf.shape(query_seq)[1],1]) 144 | # seq = tf.concat([query_seq,image_tile],axis=-1) 145 | 146 | # seq_dense = self.add_layer(seq, 700, 300, activation_function=tf.nn.tanh, name='seq_dense') 147 | 148 | # seq_dense = seq_dense * query_mask 149 | # # query_object_match = tf.reduce_max(seq_dense,axis=1) 150 | # query_object_match = tf.reduce_sum(seq_dense,axis=1)#/tf.expand_dims(tf.cast(query_num,dtype=tf.float32),axis=1) 151 | 152 | # image_mean = tf.reduce_sum(boxes_value*tf.expand_dims(item_mask,axis=2),axis=1) /tf.expand_dims(tf.cast(boxes_num,dtype=tf.float32),axis=1) 153 | 154 | 155 | concat_out = tf.concat([query_out*image_out,height,width,area], axis=1) 156 | # concat_out = tf.concat([query_emb, att_query_image_image, height,width,area], axis=1) 157 | concat_out = tf.layers.batch_normalization(concat_out, training=training) 158 | concat_out = tf.layers.dropout(concat_out, rate=0.1, training=training) 159 | 160 | 161 | 162 | logit = self.add_fc_layers(concat_out, name='dense', mode=mode) 163 | return logit,softmax_score 164 | 165 | 166 | def model_fn(self, features, labels, mode, params): 167 | neg_num = self.neg_num 168 | if mode == tf.estimator.ModeKeys.PREDICT: 169 | neg_num = 0 170 | tf.logging.info("neg_num:") 171 | tf.logging.info(neg_num) 172 | if mode == tf.estimator.ModeKeys.EVAL: 173 | neg_num = self.eval_neg_num 174 | def _embedding_simple(name, embedding_ids, embedding_size, embedding_dim): 175 | X = tf.get_variable(name, [embedding_size, embedding_dim], 176 | initializer=tf.truncated_normal_initializer(0.0, 1e-5), trainable=True) 177 | out_tensor = tf.gather(X, embedding_ids) 178 | return out_tensor 179 | def _embedding(f, embedding_dim, is_sp=False, 
idx=None, init_vec=None, fea_name=None): 180 | with tf.variable_scope("input_embedding", reuse=tf.AUTO_REUSE): 181 | if idx is not None: 182 | feature_name = append_idx(f, idx) 183 | else: 184 | feature_name = f 185 | 186 | if fea_name is not None: 187 | feature_name = fea_name 188 | 189 | if init_vec is None: 190 | emb_var = tf.get_variable("emb_" + str(f), [self.embedding_size[f], embedding_dim], 191 | initializer=tf.truncated_normal_initializer(0.0, 1e-5), trainable=True) 192 | else: 193 | emb_var = tf.get_variable("emb_" + str(f), [self.embedding_size[f], embedding_dim], 194 | initializer=tf.constant_initializer(init_vec), 195 | trainable=True 196 | ) 197 | if is_sp: 198 | out_tensor = tf.nn.embedding_lookup_sparse(emb_var, features[feature_name], None, combiner="mean") 199 | else: 200 | out_tensor = tf.gather(emb_var, features[feature_name]) 201 | return out_tensor 202 | 203 | training = (mode == tf.estimator.ModeKeys.TRAIN) 204 | 205 | pos_emb = tf.get_variable("pos_embedding", [100, 100], initializer=tf.truncated_normal_initializer(0.0, 1e-5), trainable=True) 206 | with tf.variable_scope("query_semantic"): 207 | query_emb_size = self.model_json['model']['query_embedding_size'] 208 | query_emb = _embedding('query', query_emb_size, init_vec=self.word_vecs) 209 | cur_pos_emb = tf.expand_dims(pos_emb[0:tf.shape(query_emb)[1]], axis=0) 210 | cur_pos_emb = tf.tile(cur_pos_emb, [tf.shape(query_emb)[0], 1, 1]) 211 | query_seq = tf.concat([query_emb, cur_pos_emb], axis=-1) 212 | 213 | # query_seq = query_emb 214 | query_num = features['query_words_num'] 215 | query_mask = tf.expand_dims(tf.sequence_mask(query_num, dtype=tf.float32), axis=2) 216 | # /tf.expand_dims(tf.cast(query_num,dtype=tf.float32),axis=1) 217 | # query_emb = self.query_semantic_layer(tf.reduce_sum(query_seq * query_mask, axis=1), query_emb_size + 100, mode=mode) 218 | query_emb = tf.reduce_sum(query_seq * query_mask, axis=1) 219 | query_lastword = None#_embedding('last_word', query_emb_size, 
is_sp=False, init_vec=self.word_vecs) 220 | # query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 221 | 222 | image_feature_mean = tf.constant([[helper.image_feature_mean]]) 223 | image_feature_std = tf.constant([[helper.image_feature_std]]) 224 | 225 | item_masks = [] 226 | boxes_embeddings = [] 227 | label_embeddings = [] 228 | boxes_num = [] 229 | 230 | height_embs = [] 231 | width_embs = [] 232 | area_embs = [] 233 | for i in range(neg_num + 1): 234 | with tf.variable_scope("boxes", reuse=tf.AUTO_REUSE): 235 | 236 | boxes_features = features[append_idx('boxes_features', i)] 237 | # boxes_features = (boxes_features-image_feature_mean)/image_feature_std 238 | label_feature_embedding = _embedding('boxes_labels', 300, idx=i) 239 | # label_features = tf.layers.dropout(label_features, rate=0.1, training=training) 240 | boxes_position_embedding = _embedding('boxes_position', 20, idx=i) 241 | boxes_height_embedding = _embedding('boxes_height', 20, idx=i) 242 | boxes_width_embedding = _embedding('boxes_width', 20, idx=i) 243 | boxes_area_embedding = _embedding('boxes_area', 20, idx=i) 244 | 245 | num = features[append_idx('num_boxes', i)] 246 | 247 | boxes_masks = tf.sequence_mask(num, dtype=tf.float32) 248 | 249 | 250 | boxes_coordinate = tf.clip_by_value(tf.cast(features[append_idx('boxes', i)], tf.float32), 0, 1000) 251 | tf.logging.info("boxes_coordinate shape:") 252 | tf.logging.info(boxes_coordinate.get_shape().as_list()) 253 | img_height = tf.expand_dims(tf.cast(features[append_idx('height', i)], tf.float32), axis=1) 254 | img_width = tf.expand_dims(tf.cast(features[append_idx('width', i)], tf.float32), axis=1) 255 | tf.logging.info("img_width shape:") 256 | tf.logging.info(img_width.get_shape().as_list()) 257 | boxes_width = tf.cast(features[append_idx('boxes_width',i)],dtype=tf.float32) 258 | boxes_height = tf.cast(features[append_idx('boxes_height',i)],dtype=tf.float32) 259 | boxes_area_ratio = 
tf.cast(features[append_idx('boxes_area',i)],dtype=tf.float32)/tf.expand_dims(tf.cast(features[append_idx("image_area",i)], tf.float32), axis=1) 260 | boxes_area_ratio_ids = tf.clip_by_value(tf.cast(boxes_area_ratio / 0.1, tf.int64), 0, 10) 261 | left_id_ratio_ids = tf.clip_by_value(tf.cast( boxes_coordinate[:, :, 1] / (img_width*10) / 0.1, tf.int64), 0, 10) 262 | width_ratio_ids = tf.clip_by_value(tf.cast((boxes_width*5) / (img_width*10) / 0.1, tf.int64), 0, 10) 263 | top_id_ratio_ids = tf.clip_by_value(tf.cast( boxes_coordinate[:, :, 0] / (img_height*10) / 0.1, tf.int64), 0, 10) 264 | heigth_ratio_ids = tf.clip_by_value(tf.cast((boxes_height*5) / (img_height*10) / 0.1, tf.int64), 0, 10) 265 | 266 | boxes_area_ratio_embedding = _embedding_simple('boxes_area_ratio', boxes_area_ratio_ids, 11, 20) 267 | left_id_ratio_embedding = _embedding_simple('boxes_left_ratio', left_id_ratio_ids, 11, 20) 268 | width_ratio_embedding = _embedding_simple('boxes_width_ratio', width_ratio_ids, 11, 20) 269 | top_id_ratio_embedding = _embedding_simple('boxes_top_ratio', top_id_ratio_ids, 11, 20) 270 | heigth_ratio_embedding = _embedding_simple('boxes_height_ratio', heigth_ratio_ids, 11, 20) 271 | label_embeddings.append([label_feature_embedding, boxes_area_ratio_embedding, left_id_ratio_embedding, width_ratio_embedding, top_id_ratio_embedding, 272 | heigth_ratio_embedding,boxes_position_embedding,boxes_height_embedding,boxes_width_embedding,boxes_area_embedding]) 273 | 274 | boxes_embeddings.append(boxes_features) 275 | boxes_num.append(num) 276 | 277 | height = _embedding('height', 20, idx=i) 278 | width = _embedding('width', 20, idx=i) 279 | image_area = _embedding('image_area', 20, idx=i) 280 | height_embs.append(height) 281 | width_embs.append(width) 282 | area_embs.append(image_area) 283 | 284 | item_masks.append(boxes_masks) 285 | 286 | tf.logging.info("query_in:") 287 | tf.logging.info(tf.shape(query_emb)) 288 | 289 | logit,sfotmax_score = 
self.cal_logit(query_seq,query_num,query_mask, query_emb, query_lastword, boxes_embeddings[0], label_embeddings[0],boxes_num[0], 290 | height_embs[0],width_embs[0],area_embs[0], item_masks[0], mode=mode) 291 | 292 | if self.cur_run_mode=='afo': 293 | every_n_iter = 5000 294 | else: 295 | every_n_iter = 200 296 | logging_hook = tf.train.LoggingTensorHook(every_n_iter=every_n_iter,tensors={'softmax_score': sfotmax_score}) 297 | 298 | logit = tf.reshape(logit, [-1, 1]) 299 | predict = tf.sigmoid(logit) 300 | 301 | if mode == tf.estimator.ModeKeys.PREDICT: 302 | predict_dict = {"prediction": predict} 303 | export_output = {'serving': tf.estimator.export.PredictOutput(predict_dict)} 304 | return tf.estimator.EstimatorSpec(mode, predictions=predict_dict, export_outputs=export_output) 305 | 306 | global_step = tf.train.get_global_step() 307 | if neg_num > 0: 308 | score = [tf.reshape(logit, [-1, 1])] 309 | for i in range(1, neg_num + 1): 310 | logit,sfotmax_score = self.cal_logit(query_seq,query_num,query_mask, query_emb, query_lastword, boxes_embeddings[i], label_embeddings[i],boxes_num[i], 311 | height_embs[i],width_embs[i],area_embs[i], item_masks[i], mode=mode) 312 | score.append(tf.reshape(logit, [-1, 1])) 313 | score = tf.concat(score, axis=1) 314 | prob = tf.nn.softmax(score, axis=1) 315 | predict = prob[:, 0] 316 | loss = -tf.reduce_mean(tf.log(predict)) 317 | else: 318 | stepsize = 400 319 | iteration = tf.cast(global_step,tf.float32) 320 | 321 | 322 | 323 | beta = 0.5**(1+iteration/stepsize) 324 | 325 | # beta = tf.reduce_max([beta,0.1]) 326 | 327 | extra_preds = tf.reshape(tf.cast(features['extra_preds'], tf.float32), [-1, 1]) 328 | soft_loss = tf.reduce_mean( 329 | tf.nn.sigmoid_cross_entropy_with_logits(labels=extra_preds, logits=logit)) 330 | 331 | label = tf.reshape(tf.cast(labels, tf.float32), [-1, 1]) 332 | hard_loss = tf.reduce_mean( 333 | tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logit)) 334 | 335 | # beta = 0.1 336 | loss = 
(1-beta)*hard_loss + beta * soft_loss 337 | # auc = tf.metrics.auc(labels, predict) 338 | 339 | # logging_hook = tf.train.LoggingTensorHook(every_n_iter=100, 340 | # tensors={'auc': auc[0]}) 341 | 342 | # With loss and auc available, the return value for EVAL can be defined 343 | if mode == tf.estimator.ModeKeys.EVAL: 344 | auc = tf.metrics.auc(labels, predict) 345 | return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={"eval-auc": auc}) 346 | # Only TRAIN mode is expected past this point; the assert below enforces it. 347 | assert mode == tf.estimator.ModeKeys.TRAIN 348 | decay_steps = self.model_json['model']['decay_steps'] 349 | decay_rate = self.model_json['model']['decay_rate'] 350 | 351 | tf.summary.scalar('train-loss', loss) 352 | # Decay the configured learning rate exponentially with the global step. 353 | lr = tf.train.exponential_decay(learning_rate=self.learning_rate, global_step=global_step, decay_steps=decay_steps, decay_rate=decay_rate) 354 | 355 | 356 | optimizer = tf.train.AdamOptimizer(learning_rate=lr) 357 | # The UPDATE_OPS collection is grouped with the optimizer step into one train_op. 358 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 359 | train_op = optimizer.minimize(loss, global_step=global_step) 360 | tf.logging.info('all_variable:{}'.format(tf.all_variables())) # NOTE(review): tf.all_variables is presumably the deprecated alias of tf.global_variables -- confirm 361 | train_op = tf.group([train_op, update_ops]) 362 | return tf.estimator.EstimatorSpec( 363 | mode=mode, 364 | loss=loss, 365 | train_op=train_op, 366 | ) 367 | # training_hooks=[logging_hook]) 368 | 369 | 370 | def add_fc_layers(self, deep_in, mode, name): 371 | training = (mode == tf.estimator.ModeKeys.TRAIN) 372 | """各层的定义""" # Chinese for "definition of each layer"; it follows a statement, so it is a no-op string, not the docstring 373 | with tf.variable_scope("dense_layers", reuse=tf.AUTO_REUSE): 374 | deep_out = deep_in 375 | for idx, unit in enumerate([300,150]): 376 | 377 | deep_out = tf.layers.dense(deep_out, units=unit, activation=tf.nn.tanh, name=name + "_" + str(idx)) 378 | gate = tf.layers.dense(deep_out, units=unit, activation=tf.sigmoid, name=name + "_gate" + str(idx)) 379 | deep_out = deep_out * gate 380 | 381 | deep_out = tf.layers.batch_normalization(deep_out, training=training) 382 | deep_out = tf.layers.dropout(deep_out, rate=0.1, training=training) 383 | deep_predict = tf.layers.dense(deep_out, units=1,
name=name + "_" + "final") 384 | return deep_predict 385 | -------------------------------------------------------------------------------- /code/v1/src/model/att.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import utils.flag_setup as flag_setup 4 | import numpy as np 5 | from helper import append_idx 6 | import helper 7 | import pickle 8 | class ATT(object): 9 | def __init__(self, model_json, mode): 10 | blend_word2vec_path = '../../../user_data/blend_word2vec.pkl' 11 | if mode == 'afo': 12 | blend_word2vec_path = 'training.tar.gz/src/blend_word2vec.pkl' 13 | 14 | self.cur_run_mode = mode 15 | self.model_json = model_json 16 | self.neg_num = model_json["model"]["neg_num"] 17 | self.eval_neg_num = model_json["model"]["eval_neg_num"] 18 | self.NUM_NAME = 'num_boxes' 19 | self.run_id = flag_setup.FLAGS.run_id 20 | self.model_name = model_json["model"]["model_name"] 21 | self.hidden_layer = model_json["model"]["hidden_layers"] 22 | self.embedding_size = {} 23 | for field in model_json["data_schema"]["features"]: 24 | if field['type'] == 'embedding' or field['type'] == 'embedding_sp' or field['type'] == 'embedding_seq': 25 | self.embedding_size[field["name"]] = field["max"] + 1 26 | 27 | self.learning_rate = model_json["model"]["learning_rate"] 28 | self.epoch = model_json["model"]["epoch"] 29 | self.batch_size = model_json["model"]["batch_size"] 30 | self.job_name = "worker" 31 | if mode == "afo": 32 | self.job_name = flag_setup.FLAGS.job_name 33 | self.word_vecs = pickle.load(open(blend_word2vec_path,'rb')) 34 | print('word2vec path',blend_word2vec_path) 35 | 36 | self.use_sample_type = model_json["model"]["use_sample_type"] 37 | 38 | def cal_logit(self, query_seq,query_num,query_mask, query_emb, query_lastword, boxes_embedding, label_embedding,\ 39 | boxes_num,\ 40 | height, width, area, item_mask, mode): 41 | 42 | 43 | training = (mode == tf.estimator.ModeKeys.TRAIN) 
44 | 45 | 46 | # with tf.variable_scope("item_semantic", reuse=tf.AUTO_REUSE): 47 | 48 | 49 | 50 | with tf.variable_scope("cross", reuse=tf.AUTO_REUSE): 51 | # label_features, boxes_area_ratio_embedding, left_id_ratio_embedding, width_ratio_embedding, top_id_ratio_embedding,\ 52 | # heigth_ratio_embedding,boxes_position,boxes_height,boxes_width,boxes_area = label_embedding 53 | 54 | # boxes_concat = tf.concat([boxes_embedding]+label_embedding,axis=-1) 55 | 56 | # boxes_concat_shape1 = tf.shape(boxes_concat)[1] 57 | 58 | 59 | with tf.variable_scope('query_semantic'): 60 | # query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 61 | # query_emb = helper.tanh_sigmoid(query_emb,300) 62 | # query_emb = tf.layers.batch_normalization(query_emb, training=training) 63 | # query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 64 | query_emb = tf.layers.dense(query_emb,300,activation=tf.nn.relu) 65 | query_emb = helper.layer_norm(query_emb,300) 66 | query_emb = tf.layers.dropout(query_emb, rate=0.1, training=training) 67 | 68 | 69 | with tf.variable_scope('item_semantic',reuse=tf.AUTO_REUSE): 70 | boxes_embedding = tf.layers.dense(boxes_embedding,units=1024,activation=tf.nn.relu) 71 | boxes_embedding = helper.layer_norm(boxes_embedding,1024) 72 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 73 | 74 | # boxes_embedding = helper.tanh_sigmoid(boxes_embedding,384) 75 | boxes_embedding = tf.layers.dense(boxes_embedding,units=512,activation=tf.nn.relu) 76 | boxes_embedding = helper.layer_norm(boxes_embedding,512) 77 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 78 | 79 | 80 | # boxes_embedding = helper.tanh_sigmoid(boxes_embedding,300) 81 | boxes_embedding = tf.layers.dense(boxes_embedding,units=300,activation=tf.nn.relu) 82 | boxes_embedding = helper.layer_norm(boxes_embedding,300) 83 | boxes_embedding = tf.layers.dropout(boxes_embedding, rate=0.1, training=training) 84 | 85 
| # label_embedding = helper.tanh_sigmoid(tf.concat(label_embedding,axis=-1),300) 86 | label_embedding = tf.layers.dense(tf.concat(label_embedding,axis=-1),units=300,activation=tf.nn.relu) 87 | label_embedding = helper.layer_norm(label_embedding,300) 88 | label_embedding = tf.layers.dropout(label_embedding, rate=0.1, training=training) 89 | 90 | 91 | boxes_concat = tf.concat([boxes_embedding,label_embedding],axis=-1) 92 | boxes_value = tf.layers.dense(boxes_concat,300,activation=tf.nn.relu) 93 | # boxes_value = tf.layers.tanh_sigmoid(boxes_concat,300) 94 | boxes_value = helper.layer_norm(boxes_value,300) 95 | boxes_value = tf.layers.dropout(boxes_value, rate=0.1, training=training) 96 | 97 | # boxes_concat = helper.conditional_layer_norm_with_query(boxes_concat,300,conditional_input=query_emb) 98 | # boxes_concat = tf.layers.dropout(boxes_concat, rate=0.1, training=training) 99 | 100 | 101 | # query_query = query_emb 102 | # # query_query = tf.concat([query_emb,query_lastword],axis=-1) 103 | # boxes_key = boxes_concat 104 | # boxes_value = boxes_concat 105 | 106 | 107 | 108 | 109 | 110 | with tf.variable_scope("query_image_image_attention", reuse=tf.AUTO_REUSE): 111 | # query_query = tf.layers.dense(query_emb,100) 112 | # boxes_key = tf.layers.dense(boxes_concat,100) 113 | 114 | # query_query = helper.tanh_sigmoid(query_emb,100) 115 | # boxes_key = helper.tanh_sigmoid(boxes_concat,100) 116 | 117 | query_query = query_emb 118 | boxes_key = boxes_value 119 | 120 | att_query_image_image,softmax_score = helper.dot_attention_with_query(query_query, boxes_key, boxes_value, mask=item_mask,scale_dot=True) 121 | 122 | # att_query_image_image,softmax_score = helper.attention_with_query(query_query, boxes_key, boxes_value, mask=item_mask) 123 | 124 | # att_query_image_image = tf.reduce_sum(boxes_value*tf.expand_dims(item_mask,axis=2),axis=1)/tf.expand_dims(tf.cast(boxes_num,dtype=tf.float32),axis=1) 125 | 126 | query_out = query_emb 127 | # query_out = 
helper.tanh_sigmoid(query_emb, 300) 128 | # query_out = tf.layers.batch_normalization(query_out, training=training) 129 | # query_out = tf.layers.dropout(query_out, rate=0.1, training=training) 130 | 131 | image_out = att_query_image_image 132 | # image_out = helper.tanh_sigmoid(att_query_image_image, 300) 133 | # image_out = tf.layers.batch_normalization(image_out, training=training) 134 | # image_out = tf.layers.dropout(image_out, rate=0.1, training=training) 135 | 136 | # image_out = helper.conditional_layer_norm(image_out,units=300,conditional_input=query_out) 137 | 138 | # image_out = tf.layers.batch_normalization(image_out, training=training) 139 | # image_out = tf.layers.dropout(image_out, rate=0.1, training=training) 140 | # att_query_image_image,softmax_score = helper.attention_with_query(query_query, boxes_value, boxes_value, mask=item_mask, activation=None, scale=None) 141 | 142 | 143 | # att_query_image_image_ex = tf.expand_dims(att_query_image_image, axis=1) 144 | # image_tile = tf.tile(att_query_image_image_ex,[1,tf.shape(query_seq)[1],1]) 145 | # seq = tf.concat([query_seq,image_tile],axis=-1) 146 | 147 | # seq_dense = self.add_layer(seq, 700, 300, activation_function=tf.nn.tanh, name='seq_dense') 148 | 149 | # seq_dense = seq_dense * query_mask 150 | # # query_object_match = tf.reduce_max(seq_dense,axis=1) 151 | # query_object_match = tf.reduce_sum(seq_dense,axis=1)#/tf.expand_dims(tf.cast(query_num,dtype=tf.float32),axis=1) 152 | 153 | # image_mean = tf.reduce_sum(boxes_value*tf.expand_dims(item_mask,axis=2),axis=1) /tf.expand_dims(tf.cast(boxes_num,dtype=tf.float32),axis=1) 154 | 155 | 156 | concat_out = tf.concat([query_out*image_out,height,width,area], axis=1) 157 | # concat_out = tf.concat([query_emb, att_query_image_image, height,width,area], axis=1) 158 | concat_out = tf.layers.batch_normalization(concat_out, training=training) 159 | concat_out = tf.layers.dropout(concat_out, rate=0.1, training=training) 160 | 161 | 162 | 163 | logit = 
def model_fn(self, features, labels, mode, params):
    """Build the ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL or TRAIN.

    The query is embedded once.  Every candidate item (index 0 plus
    ``neg_num`` further items, read from ``features`` through
    ``append_idx(name, i)``) is embedded and scored against the query with
    ``self.cal_logit``.

    * ``neg_num > 0``: the loss is the negative log of the softmax
      probability of item 0 among all scored items.
    * ``neg_num == 0``: the loss is a sigmoid cross-entropy against
      ``labels``; when ``self.use_sample_type == 1`` it is blended with a
      sigmoid cross-entropy against ``features['extra_preds']`` using a
      weight that decays with the global step.

    Args:
        features: dict of input tensors keyed by feature name.
        labels: label tensor; only read when ``neg_num == 0`` and for the
            EVAL AUC metric.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: unused; present because the Estimator API passes it.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.

    NOTE(review): this method was reconstructed from a dump that had lost
    its indentation.  The extent of the ``query_semantic`` and ``boxes``
    variable scopes was inferred; it only influences variable/op names, but
    those names matter for checkpoint compatibility -- confirm against the
    original file before restoring old checkpoints.
    """
    mode_keys = tf.estimator.ModeKeys
    if mode == mode_keys.PREDICT:
        neg_num = 0
    elif mode == mode_keys.EVAL:
        neg_num = self.eval_neg_num
    else:
        neg_num = self.neg_num
    tf.logging.info("neg_num:")
    tf.logging.info(neg_num)

    small_init = tf.truncated_normal_initializer(0.0, 1e-5)

    def _embedding_simple(name, embedding_ids, embedding_size, embedding_dim):
        # Plain lookup table created in the *current* variable scope.
        table = tf.get_variable(name, [embedding_size, embedding_dim],
                                initializer=small_init, trainable=True)
        return tf.gather(table, embedding_ids)

    def _embedding(f, embedding_dim, is_sp=False, idx=None, init_vec=None, fea_name=None):
        # Lookup table "emb_<f>" sized from self.embedding_size[f]; the ids
        # come from features[fea_name], else features[append_idx(f, idx)],
        # else features[f].
        with tf.variable_scope("input_embedding", reuse=tf.AUTO_REUSE):
            if idx is not None:
                feature_name = append_idx(f, idx)
            else:
                feature_name = f
            if fea_name is not None:
                feature_name = fea_name

            if init_vec is None:
                initializer = small_init
            else:
                initializer = tf.constant_initializer(init_vec)
            emb_var = tf.get_variable("emb_" + str(f),
                                      [self.embedding_size[f], embedding_dim],
                                      initializer=initializer, trainable=True)
            if is_sp:
                return tf.nn.embedding_lookup_sparse(
                    emb_var, features[feature_name], None, combiner="mean")
            return tf.gather(emb_var, features[feature_name])

    def _item_float(name, idx):
        return tf.cast(features[append_idx(name, idx)], tf.float32)

    def _ratio_bucket(ratio):
        # Map a ratio to one of 11 bucket ids (0..10), each 0.1 wide.
        return tf.clip_by_value(tf.cast(ratio / 0.1, tf.int64), 0, 10)

    # ---- query side ------------------------------------------------------
    pos_emb = tf.get_variable("pos_embedding", [100, 100],
                              initializer=small_init, trainable=True)
    with tf.variable_scope("query_semantic"):
        query_emb_size = self.model_json['model']['query_embedding_size']
        query_emb = _embedding('query', query_emb_size, init_vec=self.word_vecs)
        # Slice the position table to the query length and repeat per example.
        cur_pos_emb = tf.expand_dims(pos_emb[0:tf.shape(query_emb)[1]], axis=0)
        cur_pos_emb = tf.tile(cur_pos_emb, [tf.shape(query_emb)[0], 1, 1])
        query_seq = tf.concat([query_emb, cur_pos_emb], axis=-1)

        query_num = features['query_words_num']
        query_mask = tf.expand_dims(
            tf.sequence_mask(query_num, dtype=tf.float32), axis=2)
        # Masked sum over the word axis.
        query_emb = tf.reduce_sum(query_seq * query_mask, axis=1)
        query_lastword = None

    # ---- item side: one entry per scored item ----------------------------
    item_masks = []
    boxes_embeddings = []
    label_embeddings = []
    boxes_num = []
    height_embs = []
    width_embs = []
    area_embs = []
    for i in range(neg_num + 1):
        with tf.variable_scope("boxes", reuse=tf.AUTO_REUSE):
            boxes_features = features[append_idx('boxes_features', i)]
            label_feature_embedding = _embedding('boxes_labels', 300, idx=i)
            boxes_position_embedding = _embedding('boxes_position', 20, idx=i)
            boxes_height_embedding = _embedding('boxes_height', 20, idx=i)
            boxes_width_embedding = _embedding('boxes_width', 20, idx=i)
            boxes_area_embedding = _embedding('boxes_area', 20, idx=i)

            num = features[append_idx('num_boxes', i)]
            boxes_masks = tf.sequence_mask(num, dtype=tf.float32)

            boxes_coordinate = tf.clip_by_value(_item_float('boxes', i), 0, 1000)
            tf.logging.info("boxes_coordinate shape:")
            tf.logging.info(boxes_coordinate.get_shape().as_list())
            img_height = tf.expand_dims(_item_float('height', i), axis=1)
            img_width = tf.expand_dims(_item_float('width', i), axis=1)
            tf.logging.info("img_width shape:")
            tf.logging.info(img_width.get_shape().as_list())
            boxes_width = _item_float('boxes_width', i)
            boxes_height = _item_float('boxes_height', i)
            boxes_area_ratio = _item_float('boxes_area', i) / tf.expand_dims(
                _item_float("image_area", i), axis=1)

            boxes_area_ratio_ids = _ratio_bucket(boxes_area_ratio)
            left_id_ratio_ids = _ratio_bucket(
                boxes_coordinate[:, :, 1] / (img_width * 10))
            width_ratio_ids = _ratio_bucket(
                (boxes_width * 5) / (img_width * 10))
            top_id_ratio_ids = _ratio_bucket(
                boxes_coordinate[:, :, 0] / (img_height * 10))
            heigth_ratio_ids = _ratio_bucket(
                (boxes_height * 5) / (img_height * 10))

            # These tables use bare tf.get_variable, so they rely on the
            # AUTO_REUSE of the enclosing "boxes" scope for i >= 1.
            boxes_area_ratio_embedding = _embedding_simple(
                'boxes_area_ratio', boxes_area_ratio_ids, 11, 20)
            left_id_ratio_embedding = _embedding_simple(
                'boxes_left_ratio', left_id_ratio_ids, 11, 20)
            width_ratio_embedding = _embedding_simple(
                'boxes_width_ratio', width_ratio_ids, 11, 20)
            top_id_ratio_embedding = _embedding_simple(
                'boxes_top_ratio', top_id_ratio_ids, 11, 20)
            heigth_ratio_embedding = _embedding_simple(
                'boxes_height_ratio', heigth_ratio_ids, 11, 20)

            # The order of this list fixes the concat order downstream.
            label_embeddings.append([
                label_feature_embedding,
                boxes_area_ratio_embedding,
                left_id_ratio_embedding,
                width_ratio_embedding,
                top_id_ratio_embedding,
                heigth_ratio_embedding,
                boxes_position_embedding,
                boxes_height_embedding,
                boxes_width_embedding,
                boxes_area_embedding,
            ])
            boxes_embeddings.append(boxes_features)
            boxes_num.append(num)

            height_embs.append(_embedding('height', 20, idx=i))
            width_embs.append(_embedding('width', 20, idx=i))
            area_embs.append(_embedding('image_area', 20, idx=i))

            item_masks.append(boxes_masks)

    tf.logging.info("query_in:")
    # Log the static shape; logging tf.shape(...) only prints a tensor repr.
    tf.logging.info(query_emb.get_shape().as_list())

    def _score_item(idx):
        # Logit of item `idx` as a column; the attention weights returned
        # alongside it are not needed here.
        item_logit, _ = self.cal_logit(
            query_seq, query_num, query_mask, query_emb, query_lastword,
            boxes_embeddings[idx], label_embeddings[idx], boxes_num[idx],
            height_embs[idx], width_embs[idx], area_embs[idx],
            item_masks[idx], mode=mode)
        return tf.reshape(item_logit, [-1, 1])

    logit = _score_item(0)
    predict = tf.sigmoid(logit)

    if mode == mode_keys.PREDICT:
        predict_dict = {"prediction": predict}
        export_output = {'serving': tf.estimator.export.PredictOutput(predict_dict)}
        return tf.estimator.EstimatorSpec(
            mode, predictions=predict_dict, export_outputs=export_output)

    global_step = tf.train.get_global_step()
    if neg_num > 0:
        score = [logit]
        for i in range(1, neg_num + 1):
            score.append(_score_item(i))
        score = tf.concat(score, axis=1)
        predict = tf.nn.softmax(score, axis=1)[:, 0]
        # log_softmax avoids log(0) = -inf when the probability of item 0
        # underflows; mathematically equal to log(softmax(score))[:, 0].
        loss = -tf.reduce_mean(tf.nn.log_softmax(score, axis=1)[:, 0])
    else:
        label = tf.reshape(tf.cast(labels, tf.float32), [-1, 1])
        hard_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logit))
        if self.use_sample_type == 1:
            stepsize = 300
            iteration = tf.cast(global_step, tf.float32)
            # Weight of the hard-label loss; shrinks as training proceeds.
            beta = 0.7 ** (1 + iteration / stepsize)

            extra_preds = tf.reshape(
                tf.cast(features['extra_preds'], tf.float32), [-1, 1])
            soft_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=extra_preds, logits=logit))
            loss = beta * hard_loss + (1 - beta) * soft_loss
        else:
            loss = hard_loss

    # Loss and predictions are available, so the EVAL spec can be returned.
    if mode == mode_keys.EVAL:
        auc = tf.metrics.auc(labels, predict)
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops={"eval-auc": auc})

    assert mode == mode_keys.TRAIN
    decay_steps = self.model_json['model']['decay_steps']
    decay_rate = self.model_json['model']['decay_rate']

    tf.summary.scalar('train-loss', loss)
    lr = tf.train.exponential_decay(
        learning_rate=self.learning_rate, global_step=global_step,
        decay_steps=decay_steps, decay_rate=decay_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    # Batch-norm moving-average updates live in UPDATE_OPS and must be run
    # together with the optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = optimizer.minimize(loss, global_step=global_step)
    tf.logging.info('all_variable:{}'.format(tf.global_variables()))
    train_op = tf.group([train_op, update_ops])
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
    )
def add_fc_layers(self, deep_in, mode, name, hidden_units=(300, 150), dropout_rate=0.1):
    """Apply the gated fully connected tower and return the final logit.

    For every entry of ``hidden_units`` a tanh dense layer is multiplied
    element-wise by a sigmoid gate computed from its own output, followed by
    batch normalization and dropout.  A final dense layer with one unit
    produces the result.

    Args:
        deep_in: input tensor of the tower.
        mode: a ``tf.estimator.ModeKeys`` value; batch normalization and
            dropout run in training mode only for ``TRAIN``.
        name: prefix for the names of the dense layers.
        hidden_units: width of each hidden layer, in order.
        dropout_rate: dropout rate applied after each hidden layer.

    Returns:
        The output tensor of the final one-unit dense layer.

    NOTE(review): reconstructed from a dump without indentation.  Batch
    normalization and dropout are assumed to sit inside the per-layer loop
    (not once after it) -- confirm against the original file.
    """
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    with tf.variable_scope("dense_layers", reuse=tf.AUTO_REUSE):
        deep_out = deep_in
        for idx, unit in enumerate(hidden_units):
            deep_out = tf.layers.dense(
                deep_out, units=unit, activation=tf.nn.tanh,
                name=name + "_" + str(idx))
            gate = tf.layers.dense(
                deep_out, units=unit, activation=tf.sigmoid,
                name=name + "_gate" + str(idx))
            deep_out = deep_out * gate

            deep_out = tf.layers.batch_normalization(deep_out, training=training)
            deep_out = tf.layers.dropout(
                deep_out, rate=dropout_rate, training=training)
        deep_predict = tf.layers.dense(
            deep_out, units=1, name=name + "_" + "final")
    return deep_predict
class ATTPredict(object):
    """Query/image attention matching model that also exports intermediates.

    In PREDICT mode ``model_fn`` returns, next to the sigmoid score, the
    ``query_out``, ``image_out`` and ``deep_out`` tensors of ``cal_logit``
    and the pooled query embedding.

    NOTE(review): this class was reconstructed from a dump that had lost its
    indentation.  Where nesting of variable scopes had to be inferred it is
    flagged below; the nesting decides variable names and therefore
    checkpoint compatibility -- confirm against the original file.
    """

    def __init__(self, model_json, mode):
        """Read hyper-parameters from ``model_json`` and load word vectors.

        Args:
            model_json: parsed model configuration; the ``"model"`` and
                ``"data_schema"`` sections are read.
            mode: run mode string; ``'afo'`` switches the word-vector path
                and takes the job name from the command-line flags.
        """
        blend_word2vec_path = '../../../user_data/blend_word2vec.pkl'
        if mode == 'afo':
            blend_word2vec_path = 'training.tar.gz/src/blend_word2vec.pkl'

        self.cur_run_mode = mode
        self.model_json = model_json
        model_conf = model_json["model"]
        self.neg_num = model_conf["neg_num"]
        self.eval_neg_num = model_conf["eval_neg_num"]
        self.NUM_NAME = 'num_boxes'
        self.run_id = flag_setup.FLAGS.run_id
        self.model_name = model_conf["model_name"]
        self.hidden_layer = model_conf["hidden_layers"]

        # Vocabulary size per embedding feature: largest id + 1.
        self.embedding_size = {}
        for field in model_json["data_schema"]["features"]:
            if field['type'] in ('embedding', 'embedding_sp', 'embedding_seq'):
                self.embedding_size[field["name"]] = field["max"] + 1

        self.learning_rate = model_conf["learning_rate"]
        self.epoch = model_conf["epoch"]
        self.batch_size = model_conf["batch_size"]
        self.job_name = "worker"
        if mode == "afo":
            self.job_name = flag_setup.FLAGS.job_name

        # Close the file deterministically instead of leaking the handle.
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at trusted files.
        with open(blend_word2vec_path, 'rb') as vec_file:
            self.word_vecs = pickle.load(vec_file)
        print('word2vec path', blend_word2vec_path)

        self.use_sample_type = model_conf["use_sample_type"]

    @staticmethod
    def _dense_norm_dropout(inputs, units, training):
        """ReLU dense layer, then ``helper.layer_norm``, then dropout (0.1)."""
        out = tf.layers.dense(inputs, units=units, activation=tf.nn.relu)
        out = helper.layer_norm(out, units)
        return tf.layers.dropout(out, rate=0.1, training=training)

    def cal_logit(self, query_seq, query_num, query_mask, query_emb, query_lastword,
                  boxes_embedding, label_embedding, boxes_num,
                  height, width, area, item_mask, mode):
        """Score one item against the query.

        The query vector and the per-box features are projected to 300
        units, the boxes are pooled with query-conditioned dot attention,
        and the element-wise product of query and pooled image vector is
        concatenated with the ``height``/``width``/``area`` embeddings and
        fed to ``add_fc_layers``.

        Args:
            query_seq, query_num, query_mask, query_lastword, boxes_num:
                accepted for interface compatibility; not read here.
            query_emb: pooled query embedding.
            boxes_embedding: per-box image features.
            label_embedding: list of per-box embeddings, concatenated on the
                last axis.
            height, width, area: image-level embeddings.
            item_mask: mask over boxes passed to the attention helper.
            mode: a ``tf.estimator.ModeKeys`` value.

        Returns:
            Tuple ``(logit, softmax_score, query_out, image_out, deep_out)``.

        NOTE(review): all sub-scopes are assumed to be nested inside the
        ``"cross"`` scope -- confirm against the original file.
        """
        training = (mode == tf.estimator.ModeKeys.TRAIN)

        with tf.variable_scope("cross", reuse=tf.AUTO_REUSE):
            with tf.variable_scope('query_semantic'):
                query_emb = self._dense_norm_dropout(query_emb, 300, training)

            with tf.variable_scope('item_semantic', reuse=tf.AUTO_REUSE):
                # Unnamed layers are numbered in creation order, so the
                # order of these calls must not change.
                for units in (1024, 512, 300):
                    boxes_embedding = self._dense_norm_dropout(
                        boxes_embedding, units, training)

                label_embedding = self._dense_norm_dropout(
                    tf.concat(label_embedding, axis=-1), 300, training)

                boxes_concat = tf.concat([boxes_embedding, label_embedding], axis=-1)
                boxes_value = self._dense_norm_dropout(boxes_concat, 300, training)

            with tf.variable_scope("query_image_image_attention", reuse=tf.AUTO_REUSE):
                query_query = query_emb
                boxes_key = boxes_value
                att_query_image_image, softmax_score = helper.dot_attention_with_query(
                    query_query, boxes_key, boxes_value,
                    mask=item_mask, scale_dot=True)

            query_out = query_emb
            image_out = att_query_image_image

            concat_out = tf.concat(
                [query_out * image_out, height, width, area], axis=1)
            concat_out = tf.layers.batch_normalization(concat_out, training=training)
            concat_out = tf.layers.dropout(concat_out, rate=0.1, training=training)

            deep_out, logit = self.add_fc_layers(concat_out, name='dense', mode=mode)
        return logit, softmax_score, query_out, image_out, deep_out

    def model_fn(self, features, labels, mode, params):
        """Build the ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL or TRAIN.

        The query is embedded once.  Every candidate item (index 0 plus
        ``neg_num`` further items, read from ``features`` through
        ``append_idx(name, i)``) is embedded and scored with ``cal_logit``.
        With ``neg_num > 0`` the loss is the negative log softmax
        probability of item 0; otherwise it is a sigmoid cross-entropy
        against ``labels``, blended with a loss on
        ``features['extra_preds']`` when ``self.use_sample_type == 1``.

        Args:
            features: dict of input tensors keyed by feature name.
            labels: label tensor; only read when ``neg_num == 0`` and for
                the EVAL AUC metric.
            mode: a ``tf.estimator.ModeKeys`` value.
            params: unused; present because the Estimator API passes it.

        Returns:
            A ``tf.estimator.EstimatorSpec`` for the requested mode.

        NOTE(review): the extent of the ``query_semantic`` and ``boxes``
        scopes was inferred -- confirm against the original file.
        """
        mode_keys = tf.estimator.ModeKeys
        if mode == mode_keys.PREDICT:
            neg_num = 0
        elif mode == mode_keys.EVAL:
            neg_num = self.eval_neg_num
        else:
            neg_num = self.neg_num
        tf.logging.info("neg_num:")
        tf.logging.info(neg_num)

        small_init = tf.truncated_normal_initializer(0.0, 1e-5)

        def _embedding_simple(name, embedding_ids, embedding_size, embedding_dim):
            # Plain lookup table created in the *current* variable scope.
            table = tf.get_variable(name, [embedding_size, embedding_dim],
                                    initializer=small_init, trainable=True)
            return tf.gather(table, embedding_ids)

        def _embedding(f, embedding_dim, is_sp=False, idx=None, init_vec=None, fea_name=None):
            # Lookup table "emb_<f>" sized from self.embedding_size[f]; ids
            # come from features[fea_name], else features[append_idx(f, idx)],
            # else features[f].
            with tf.variable_scope("input_embedding", reuse=tf.AUTO_REUSE):
                if idx is not None:
                    feature_name = append_idx(f, idx)
                else:
                    feature_name = f
                if fea_name is not None:
                    feature_name = fea_name

                if init_vec is None:
                    initializer = small_init
                else:
                    initializer = tf.constant_initializer(init_vec)
                emb_var = tf.get_variable("emb_" + str(f),
                                          [self.embedding_size[f], embedding_dim],
                                          initializer=initializer, trainable=True)
                if is_sp:
                    return tf.nn.embedding_lookup_sparse(
                        emb_var, features[feature_name], None, combiner="mean")
                return tf.gather(emb_var, features[feature_name])

        def _item_float(name, idx):
            return tf.cast(features[append_idx(name, idx)], tf.float32)

        def _ratio_bucket(ratio):
            # Map a ratio to one of 11 bucket ids (0..10), each 0.1 wide.
            return tf.clip_by_value(tf.cast(ratio / 0.1, tf.int64), 0, 10)

        # ---- query side --------------------------------------------------
        pos_emb = tf.get_variable("pos_embedding", [100, 100],
                                  initializer=small_init, trainable=True)
        with tf.variable_scope("query_semantic"):
            query_emb_size = self.model_json['model']['query_embedding_size']
            query_emb = _embedding('query', query_emb_size, init_vec=self.word_vecs)
            # Slice the position table to the query length, repeat per example.
            cur_pos_emb = tf.expand_dims(pos_emb[0:tf.shape(query_emb)[1]], axis=0)
            cur_pos_emb = tf.tile(cur_pos_emb, [tf.shape(query_emb)[0], 1, 1])
            query_seq = tf.concat([query_emb, cur_pos_emb], axis=-1)

            query_num = features['query_words_num']
            query_mask = tf.expand_dims(
                tf.sequence_mask(query_num, dtype=tf.float32), axis=2)
            # Masked sum over the word axis.
            query_emb = tf.reduce_sum(query_seq * query_mask, axis=1)
            query_lastword = None

        # ---- item side: one entry per scored item ------------------------
        item_masks = []
        boxes_embeddings = []
        label_embeddings = []
        boxes_num = []
        height_embs = []
        width_embs = []
        area_embs = []
        for i in range(neg_num + 1):
            with tf.variable_scope("boxes", reuse=tf.AUTO_REUSE):
                boxes_features = features[append_idx('boxes_features', i)]
                label_feature_embedding = _embedding('boxes_labels', 300, idx=i)
                boxes_position_embedding = _embedding('boxes_position', 20, idx=i)
                boxes_height_embedding = _embedding('boxes_height', 20, idx=i)
                boxes_width_embedding = _embedding('boxes_width', 20, idx=i)
                boxes_area_embedding = _embedding('boxes_area', 20, idx=i)

                num = features[append_idx('num_boxes', i)]
                boxes_masks = tf.sequence_mask(num, dtype=tf.float32)

                boxes_coordinate = tf.clip_by_value(_item_float('boxes', i), 0, 1000)
                tf.logging.info("boxes_coordinate shape:")
                tf.logging.info(boxes_coordinate.get_shape().as_list())
                img_height = tf.expand_dims(_item_float('height', i), axis=1)
                img_width = tf.expand_dims(_item_float('width', i), axis=1)
                tf.logging.info("img_width shape:")
                tf.logging.info(img_width.get_shape().as_list())
                boxes_width = _item_float('boxes_width', i)
                boxes_height = _item_float('boxes_height', i)
                boxes_area_ratio = _item_float('boxes_area', i) / tf.expand_dims(
                    _item_float("image_area", i), axis=1)

                boxes_area_ratio_ids = _ratio_bucket(boxes_area_ratio)
                left_id_ratio_ids = _ratio_bucket(
                    boxes_coordinate[:, :, 1] / (img_width * 10))
                width_ratio_ids = _ratio_bucket(
                    (boxes_width * 5) / (img_width * 10))
                top_id_ratio_ids = _ratio_bucket(
                    boxes_coordinate[:, :, 0] / (img_height * 10))
                heigth_ratio_ids = _ratio_bucket(
                    (boxes_height * 5) / (img_height * 10))

                # These tables use bare tf.get_variable, so they rely on the
                # AUTO_REUSE of the enclosing "boxes" scope for i >= 1.
                boxes_area_ratio_embedding = _embedding_simple(
                    'boxes_area_ratio', boxes_area_ratio_ids, 11, 20)
                left_id_ratio_embedding = _embedding_simple(
                    'boxes_left_ratio', left_id_ratio_ids, 11, 20)
                width_ratio_embedding = _embedding_simple(
                    'boxes_width_ratio', width_ratio_ids, 11, 20)
                top_id_ratio_embedding = _embedding_simple(
                    'boxes_top_ratio', top_id_ratio_ids, 11, 20)
                heigth_ratio_embedding = _embedding_simple(
                    'boxes_height_ratio', heigth_ratio_ids, 11, 20)

                # The order of this list fixes the concat order downstream.
                label_embeddings.append([
                    label_feature_embedding,
                    boxes_area_ratio_embedding,
                    left_id_ratio_embedding,
                    width_ratio_embedding,
                    top_id_ratio_embedding,
                    heigth_ratio_embedding,
                    boxes_position_embedding,
                    boxes_height_embedding,
                    boxes_width_embedding,
                    boxes_area_embedding,
                ])
                boxes_embeddings.append(boxes_features)
                boxes_num.append(num)

                height_embs.append(_embedding('height', 20, idx=i))
                width_embs.append(_embedding('width', 20, idx=i))
                area_embs.append(_embedding('image_area', 20, idx=i))

                item_masks.append(boxes_masks)

        tf.logging.info("query_in:")
        # Log the static shape; logging tf.shape(...) only prints a tensor repr.
        tf.logging.info(query_emb.get_shape().as_list())

        def _run_cal_logit(idx):
            return self.cal_logit(
                query_seq, query_num, query_mask, query_emb, query_lastword,
                boxes_embeddings[idx], label_embeddings[idx], boxes_num[idx],
                height_embs[idx], width_embs[idx], area_embs[idx],
                item_masks[idx], mode=mode)

        logit, _, query_out, image_out, deep_out = _run_cal_logit(0)
        logit = tf.reshape(logit, [-1, 1])
        predict = tf.sigmoid(logit)

        if mode == mode_keys.PREDICT:
            predict_dict = {
                "prediction": predict,
                "query_out": query_out,
                "image_out": image_out,
                "deep_out": deep_out,
                "query_emb": query_emb,
            }
            export_output = {'serving': tf.estimator.export.PredictOutput(predict_dict)}
            return tf.estimator.EstimatorSpec(
                mode, predictions=predict_dict, export_outputs=export_output)

        global_step = tf.train.get_global_step()
        if neg_num > 0:
            score = [logit]
            for i in range(1, neg_num + 1):
                score.append(tf.reshape(_run_cal_logit(i)[0], [-1, 1]))
            score = tf.concat(score, axis=1)
            predict = tf.nn.softmax(score, axis=1)[:, 0]
            # log_softmax avoids log(0) = -inf when the probability of item 0
            # underflows; mathematically equal to log(softmax(score))[:, 0].
            loss = -tf.reduce_mean(tf.nn.log_softmax(score, axis=1)[:, 0])
        else:
            label = tf.reshape(tf.cast(labels, tf.float32), [-1, 1])
            hard_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logit))
            if self.use_sample_type == 1:
                stepsize = 300
                iteration = tf.cast(global_step, tf.float32)
                # Weight of the hard-label loss; shrinks as training proceeds.
                beta = 0.7 ** (1 + iteration / stepsize)

                extra_preds = tf.reshape(
                    tf.cast(features['extra_preds'], tf.float32), [-1, 1])
                soft_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=extra_preds, logits=logit))
                loss = beta * hard_loss + (1 - beta) * soft_loss
            else:
                loss = hard_loss

        # Loss and predictions are available, so the EVAL spec can be returned.
        if mode == mode_keys.EVAL:
            auc = tf.metrics.auc(labels, predict)
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops={"eval-auc": auc})

        assert mode == mode_keys.TRAIN
        decay_steps = self.model_json['model']['decay_steps']
        decay_rate = self.model_json['model']['decay_rate']

        tf.summary.scalar('train-loss', loss)
        lr = tf.train.exponential_decay(
            learning_rate=self.learning_rate, global_step=global_step,
            decay_steps=decay_steps, decay_rate=decay_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Batch-norm moving-average updates live in UPDATE_OPS and must be
        # run together with the optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = optimizer.minimize(loss, global_step=global_step)
        tf.logging.info('all_variable:{}'.format(tf.global_variables()))
        train_op = tf.group([train_op, update_ops])
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
        )

    def add_fc_layers(self, deep_in, mode, name, hidden_units=(300, 150), dropout_rate=0.1):
        """Apply the gated fully connected tower.

        For every entry of ``hidden_units`` a tanh dense layer is multiplied
        element-wise by a sigmoid gate computed from its own output,
        followed by batch normalization and dropout.  A final dense layer
        with one unit produces the logit.

        Args:
            deep_in: input tensor of the tower.
            mode: a ``tf.estimator.ModeKeys`` value; batch normalization and
                dropout run in training mode only for ``TRAIN``.
            name: prefix for the names of the dense layers.
            hidden_units: width of each hidden layer, in order.
            dropout_rate: dropout rate applied after each hidden layer.

        Returns:
            Tuple ``(deep_out, deep_predict)``: the last hidden activation
            and the output of the final one-unit dense layer.

        NOTE(review): batch normalization and dropout are assumed to sit
        inside the per-layer loop (not once after it) -- confirm against the
        original file.
        """
        training = (mode == tf.estimator.ModeKeys.TRAIN)
        with tf.variable_scope("dense_layers", reuse=tf.AUTO_REUSE):
            deep_out = deep_in
            for idx, unit in enumerate(hidden_units):
                deep_out = tf.layers.dense(
                    deep_out, units=unit, activation=tf.nn.tanh,
                    name=name + "_" + str(idx))
                gate = tf.layers.dense(
                    deep_out, units=unit, activation=tf.sigmoid,
                    name=name + "_gate" + str(idx))
                deep_out = deep_out * gate

                deep_out = tf.layers.batch_normalization(deep_out, training=training)
                deep_out = tf.layers.dropout(
                    deep_out, rate=dropout_rate, training=training)
            deep_predict = tf.layers.dense(
                deep_out, units=1, name=name + "_" + "final")
        return deep_out, deep_predict