├── rl4rs
│   ├── nets
│   │   ├── __init__.py
│   │   ├── cql
│   │   │   ├── __init__.py
│   │   │   ├── q_function.py
│   │   │   └── encoder.py
│   │   ├── exact_k
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   └── model.py
│   │   ├── rllib
│   │   │   ├── __init__.py
│   │   │   ├── rllib_rawstate_model.py
│   │   │   └── rllib_mask_model.py
│   │   ├── dnn.py
│   │   ├── widedeep.py
│   │   ├── lstm_slate.py
│   │   ├── lstm.py
│   │   ├── dien.py
│   │   ├── dnn_slate.py
│   │   ├── widedeep_slate.py
│   │   ├── dien_slate.py
│   │   ├── lstm_slate_multiclass.py
│   │   ├── dnn_slate_multiclass.py
│   │   ├── widedeep_slate_multiclass.py
│   │   ├── dien_slate_multiclass.py
│   │   ├── adversarial_slate.py
│   │   └── utils.py
│   ├── policy
│   │   ├── __init__.py
│   │   ├── behavior_model.py
│   │   └── policy_model.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── httpEnv.py
│   │   └── gymHttpClient.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── fileutil.py
│   │   ├── rllib_print.py
│   │   ├── rllib_vector_env.py
│   │   ├── d3rlpy_scorer.py
│   │   └── offline_policy_metrics.py
│   ├── mdpchecker
│   │   ├── __init__.py
│   │   └── decoder.py
│   ├── env
│   │   ├── __init__.py
│   │   ├── seqslate.py
│   │   └── base.py
│   └── __init__.py
├── assets
│   ├── fuxi.jpg
│   └── new.gif
├── RL4RS_appendix.pdf
├── reproductions
│   ├── run_mdp_checker.sh
│   ├── run_supervised_item.sh
│   ├── run_supervised_slate.sh
│   ├── file_split.sh
│   ├── run_simulator_env_test.sh
│   ├── run_simulator_train.sh
│   ├── run_simulator_eval.sh
│   ├── run_exact_k.sh
│   ├── run_modelfree_rl.sh
│   └── run_split.sh
├── script
│   ├── modelfree_trainer.py
│   ├── supervised_train.py
│   ├── simulator_eval.py
│   ├── test_exact_k.py
│   ├── simulator_env_test.py
│   ├── offline_evaluation.py
│   ├── exact_k_train.py
│   ├── batchrl_train.py
│   ├── data_preprocess.py
│   └── mdpchecker
│       ├── mdp_checker.py
│       └── preprocess.py
├── index.html
└── environment.yml

/rl4rs/nets/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/cql/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/policy/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/server/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/utils/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/mdpchecker/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/rllib/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/assets/fuxi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/fuxi.jpg
--------------------------------------------------------------------------------
/assets/new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/new.gif
--------------------------------------------------------------------------------
/RL4RS_appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/RL4RS_appendix.pdf
--------------------------------------------------------------------------------
/rl4rs/env/__init__.py:
--------------------------------------------------------------------------------
from .base import RecDataBase, RecSimBase, RecEnvBase, RecState

__all__ = [
    "RecDataBase",
    "RecSimBase",
    "RecEnvBase",
    "RecState",
]
--------------------------------------------------------------------------------
/rl4rs/__init__.py:
--------------------------------------------------------------------------------
from gym.envs.registration import register

register(
    id='HttpEnv-v0',
    entry_point='rl4rs.server.httpEnv:HttpEnv',
)

register(
    id='SlateRecEnv-v0',
    entry_point='rl4rs.env:RecEnvBase',
)

register(
    id='SeqSlateRecEnv-v0',
    entry_point='rl4rs.env:RecEnvBase',
)
--------------------------------------------------------------------------------
/reproductions/run_mdp_checker.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

dataset=$1

cd ${script_dir}/mdpchecker
python -u preprocess.py $dataset ${rl4rs_dataset_dir} &&
python -u mdp_checker.py $dataset ${rl4rs_dataset_dir} >> ${rl4rs_output_dir}/data_understanding_tool_${dataset}.log &&
echo "1"
--------------------------------------------------------------------------------
/rl4rs/utils/fileutil.py:
--------------------------------------------------------------------------------
import os
import sys
import glob
import numpy as np


def find_match_files(pattern, search_path, pathsep=os.pathsep):
    for path in search_path.split(pathsep):
        for match in glob.glob(os.path.join(path, pattern)):
            yield match


def find_newest_files(pattern, search_path, pathsep=os.pathsep):
    files = []
    timestamps = []
    for path in search_path.split(pathsep):
        for match in glob.glob(os.path.join(path, pattern)):
            files.append(match)
            timestamps.append(float(os.path.getctime(match)))
    if len(files) > 0:
        return files[np.argmax(timestamps)]
    else:
        return ''
--------------------------------------------------------------------------------
/reproductions/run_supervised_item.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
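# NOTE: `conda activate` in a non-interactive script assumes conda's shell hook
# has already been sourced (e.g. `source <conda_root>/etc/profile.d/conda.sh`,
# path illustrative); otherwise run these reproduction scripts from an already
# activated shell.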
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# supervised learning evaluation

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord" "${rl4rs_output_dir}/supervised_a_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_item.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_item.log &&

echo "1"

--------------------------------------------------------------------------------
/reproductions/run_supervised_slate.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# supervised learning evaluation

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_a_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_slate.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_slate.log &&

echo "1"

--------------------------------------------------------------------------------
/rl4rs/utils/rllib_print.py:
--------------------------------------------------------------------------------
import json
import yaml
from ray.tune.utils.util import SafeFallbackEncoder


def pretty_print(result):
    result = result.copy()
    result.update(config=None)  # drop config from pretty print
    result.update(hist_stats=None)  # drop hist_stats from pretty print
    out = {}
    print_keys = ('episode_reward_mean',
                  'episode_reward_min',
                  'timesteps_total',
                  'training_iteration')
    for k, v in result.items():
        if v is not None:
            if k in print_keys:
                out[k] = v
            elif k == 'evaluation':
                out[k] = {
                    'episode_reward_mean': v['episode_reward_mean'],
                    'episode_reward_min': v['episode_reward_min'],
                }
    cleaned = json.dumps(out, cls=SafeFallbackEncoder)
    return yaml.safe_dump(json.loads(cleaned), default_flow_style=False)
--------------------------------------------------------------------------------
/reproductions/file_split.sh:
--------------------------------------------------------------------------------
#!/bin/bash

script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
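# Splits ${file} into five shards, bucketing rows by the second '@'-separated
# field modulo 11 (buckets 0-1, 2-3, 4-5, 6-7 and 8-10, so the last shard is
# roughly half again as large as the others); see the awk commands below.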
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

file=$1

cd ${rl4rs_dataset_dir} &&

awk -F "@" '$2%11<2 {print}' ${file} > ${rl4rs_output_dir}/${file}_0000.csv &&
awk -F "@" '$2%11>=2 && $2%11<4 {print}' ${file} > ${rl4rs_output_dir}/${file}_0001.csv &&
awk -F "@" '$2%11>=4 && $2%11<6 {print}' ${file} > ${rl4rs_output_dir}/${file}_0002.csv &&
awk -F "@" '$2%11>=6 && $2%11<8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0003.csv &&
awk -F "@" '$2%11>=8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0004.csv

#file_rows=`wc -l ${file}|awk '{print $1}'`
#file_num=5
#file_num_row=$((${file_rows} + 4))
#every_file_row=$((${file_num_row}/${file_num}))
#split -d -a 4 -l ${every_file_row} ${file} --additional-suffix=.csv ${rl4rs_output_dir}/${file}_

--------------------------------------------------------------------------------
/reproductions/run_simulator_env_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

head -1 ${rl4rs_dataset_dir}/rl4rs_dataset_a_train.csv > ${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv
python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_tiny.tfrecord" "tfrecord_item"
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':False}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':True}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':False,'action_emb_size':32}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':True,'action_emb_size':32}" &&
echo '1'
--------------------------------------------------------------------------------
/script/modelfree_trainer.py:
--------------------------------------------------------------------------------
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.a3c as a3c
import ray.rllib.agents.pg as pg
import ray.rllib.agents.ddpg.td3 as td3
import ray.rllib.agents.impala as impala
import ray.rllib.agents.ddpg as ddpg
import ray.rllib.agents.slateq as slateq


def get_rl_model(algo, rllib_config):
    trainer = None
    if algo == "PPO":
        trainer = ppo.PPOTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "DQN":
        trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "RAINBOW":
        trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "A2C":
        trainer = a3c.A2CTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "A3C":
        trainer = a3c.A3CTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "PG":
        trainer = pg.PGTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "DDPG":
        trainer = ddpg.DDPGTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "TD3":
        trainer = td3.TD3Trainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "IMPALA":
        trainer = impala.ImpalaTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "SLATEQ":
        trainer = slateq.SlateQTrainer(config=rllib_config, env="rllibEnv-v0")
    else:
        assert algo in ("PPO", "DQN", "A2C", "A3C", "PG", "DDPG", "IMPALA", "TD3", "RAINBOW", "SLATEQ")
    print('trainer_default_config', trainer._default_config)
    return trainer
--------------------------------------------------------------------------------
/reproductions/run_simulator_train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# RL Env Construction

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/simulator_a_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_sl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/simulator_a_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_rl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/simulator_a_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/simulator_b2_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_sl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/simulator_b2_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_rl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/simulator_b2_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_${algo}.log &&

echo "1"
--------------------------------------------------------------------------------
/script/supervised_train.py:
--------------------------------------------------------------------------------
import os
import sys
import glob
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
if tf.test.is_gpu_available():
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
from tensorflow import keras
from rl4rs.utils.datautil import FeatureUtil
from rl4rs.utils.fileutil import find_match_files

config = {
    "epoch": 20,
    "maxlen": 64,
    "batch_size": 256,
    "class_num": 2,
    "dense_feature_num": 432,
    "category_feature_num": 21,
    "category_hash_size": 100000,
    "seq_num": 2,
    "emb_size": 128,
    "hidden_units": 128,
    "action_size": 284
}
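# Example invocation (illustrative paths; mirrors reproductions/run_supervised_item.sh):
#   python supervised_train.py \
#       ../output/rl4rs_dataset_a_train.tfrecord \
#       ../output/rl4rs_dataset_a_test.tfrecord \
#       ../output/supervised_a_train_dnn/model dnn 0
# argv: train tfrecord, test tfrecord, checkpoint prefix, model type
# (resolved as rl4rs.nets.<type>), and whether labels are per-slate (1) or per-item (0).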
train_file = sys.argv[1]
test_file = sys.argv[2]
model_file = sys.argv[3]
model_type = sys.argv[4]
is_slate_label = bool(int(sys.argv[5]))
featureutil = FeatureUtil(config)

train_files = [match for match in find_match_files(train_file + '*', train_file)]
test_files = [match for match in find_match_files(test_file + '*', test_file)]
print('train on ', train_files, ' test on ', test_files)
iter_train = featureutil.read_tfrecord(train_files, is_slate_label=is_slate_label)
iter_test = featureutil.read_tfrecord(test_files, is_slate_label=is_slate_label)
model = __import__("rl4rs.nets." + model_type, fromlist=['get_model']).get_model(config)
steps_per_epoch = 600000 // config["batch_size"]
steps_per_epoch_val = 400000 // config["batch_size"]
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=2, mode='min')
model.fit(iter_train, steps_per_epoch=steps_per_epoch, epochs=int(config["epoch"]),
          validation_data=iter_test, validation_steps=steps_per_epoch_val, verbose=2, callbacks=[earlyStopping])

saver = tf.train.Saver()
sess = tf.keras.backend.get_session()
saver.save(sess, model_file)
--------------------------------------------------------------------------------
/rl4rs/nets/dnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    # note: sequence_feature is computed but not concatenated into all_feature
    # in this plain-DNN baseline
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')

    dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')

    category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/lstm.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    # category_feature_concat = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/utils.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np


def index_matrix_to_pairs_fn(batch_size, seq_length):
    replicated_first_indices = tf.range(batch_size)  # range(128)
    # replicated_first_indices =
    # [[ 0,  0,  0,...],
    #  [ 1,  1,  1,...],
    #  ......
    #  [127,127,127,...]]
    replicated_first_indices2 = tf.tile(
        tf.expand_dims(replicated_first_indices, axis=1),  # [128,1]
        [1, seq_length])

    def index_matrix_to_pairs(index_matrix):
        """
        :param index_matrix: [batch_size, data_len] or [batch_size]
        :return: [batch_size, data_len, 2] or [batch_size, 2]
        ie:
        a: [128, 10] -> c[i,j,:] = [i,a[i,j]], shape(c) = [128,10,2]
        a: [128] -> c[i,:] = [i,a[i]], shape(c) = [128,2]
        """
        rank = len(index_matrix.get_shape())
        if rank == 1:
            return tf.stack([replicated_first_indices, index_matrix], axis=rank)
        elif rank == 2:
            return tf.stack([replicated_first_indices2, index_matrix], axis=rank)
        else:
            raise NotImplementedError("index_matrix rank should be 1 or 2, but %d found" % rank)

    return index_matrix_to_pairs


def batch_gather(data, indices):
    batch_size = data.get_shape()[0].merge_with(indices.get_shape()[0]).value
    if batch_size is None:
        batch_size = tf.shape(indices)[0]
    gather_data_size = indices.get_shape()[1].value
    if gather_data_size is None:
        gather_data_size = tf.shape(indices)[1]
    flat_indices = tf.reshape(tf.transpose(indices), (-1,))  # [batch*4,1]
    input_index_pairs = tf.stop_gradient(tf.stack(
        [tf.range(batch_size*gather_data_size, dtype=tf.int32), flat_indices], axis=1))
    flat_data = tf.tile(data, [gather_data_size, 1])
    return tf.transpose(tf.reshape(tf.gather_nd(flat_data, input_index_pairs), (gather_data_size, batch_size)))
--------------------------------------------------------------------------------
/rl4rs/nets/dien.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])
    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
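    # sequence_input_attn / id_input_processing_attn create TF variables outside
    # Keras' own build step, which is presumably why all globals are initialised
    # explicitly here before compile (the *_slate attention models do the same).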
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])

    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[
        sequence_feature_input,
        dense_feature_input,
        category_feature_input,
        slate_label_input],
        outputs=[output])
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/server/httpEnv.py:
--------------------------------------------------------------------------------
import gym
import numpy as np
from rl4rs.server.gymHttpClient import Client


class HttpEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, env_id, config={}):
        remote_base = config["remote_base"]
        self.client = Client(remote_base)
        self.instance_id = self.client.env_create(env_id, config)
        action_info = self.client.env_action_space_info(self.instance_id)
        obs_info = self.client.env_observation_space_info(self.instance_id)
        if action_info['name'] == 'Box':
            self.action_space = gym.spaces.Box(np.array(action_info['low']), np.array(action_info['high']), shape=action_info['shape'])
        else:
            self.action_space = gym.spaces.Discrete(action_info['n'])
        if obs_info['name'] == 'Box':
            self.observation_space = gym.spaces.Box(np.array(obs_info['low']), np.array(obs_info['high']), shape=obs_info['shape'])
        elif obs_info['name'] == 'Dict':
            keys = obs_info['keys']
            space_D = {}
            for key in keys:
                shape = obs_info[key]['shape']
                space_D[key] = gym.spaces.Box(np.array(obs_info[key]['low']).reshape(shape), np.array(obs_info[key]['high']).reshape(shape), shape=shape)
            self.observation_space = gym.spaces.Dict(space_D)
        else:
            assert obs_info['name'] in ('Box', 'Dict')

    def seed(self, sd=0):
        pass

    def step(self, action):
        if isinstance(action, np.ndarray):
            action = action.tolist()
        # np.int is just an alias of int and misses numpy integer scalars,
        # so check np.integer to convert e.g. np.int64 actions for JSON transport
        if isinstance(action, (int, np.integer)):
            action = int(action)
        observation, reward, done, info = self.client.env_step(self.instance_id, action, False)
        return self.observation_space.from_jsonable(observation), reward, done, info

    def reset(self):
        observation = self.client.env_reset(self.instance_id)
        return self.observation_space.from_jsonable(observation)

    def render(self, mode='human', close=False):
        return ''

    def close(self):
        return self.client.env_close(self.instance_id)
--------------------------------------------------------------------------------
/script/simulator_eval.py:
--------------------------------------------------------------------------------
import os, sys
import gym
import numpy as np
from rl4rs.env.slate import SlateRecEnv, SlateState
from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState

extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {}

config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
          "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9,
          "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../dataset/item_info.csv',
          "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SeqSlateRecEnv-v0"}

config = dict(config, **extra_config)

if config.get('gpu', 0) < 1:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if config['env'] == 'SeqSlateRecEnv-v0':
    config['max_steps'] = 36
    sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
    env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
else:
    sim = SlateRecEnv(config, state_cls=SlateState)
    env = gym.make('SlateRecEnv-v0', recsim=sim)

batch_size = config["batch_size"]
epoch = config["epoch"]
max_steps = config["max_steps"]
rewards = np.zeros((epoch, batch_size))
offline_rewards = np.zeros((epoch, batch_size))
offline_actions = np.zeros((epoch, batch_size, max_steps))

for i in range(epoch):
    env.reset()
    for j in range(config["max_steps"]):
        action = env.offline_action
        offline_actions[i, :, j] = env.offline_action
        next_obs, reward, done, info = env.step(action)
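        # accumulate the simulator's predicted reward alongside the logged
        # (offline) reward so the two can be compared per episode below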
        rewards[i] = rewards[i] + np.array(reward)
        offline_rewards[i] = offline_rewards[i] + np.array(env.offline_reward)
        if done[0]:
            print(
                i,
                np.sum(rewards) / config["batch_size"] / (i + 1),
                np.sum(offline_rewards) / config["batch_size"] / (i + 1)
            )
            break
print('the mean of offline reward', np.mean(offline_rewards))
print('the mean of reward prediction error', np.mean(rewards - offline_rewards))
print('the absolute mean of reward prediction error', np.mean(np.abs(rewards - offline_rewards)))
print('the std of reward prediction error', np.std(np.reshape(rewards - offline_rewards, -1)))
print('success')

--------------------------------------------------------------------------------
/script/test_exact_k.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from rl4rs.nets.exact_k.model import Generator, Discriminator

batch_size = 2
l1_mask = np.zeros(284)
l1_mask[:40] = 1
l2_mask = np.zeros(284)
l2_mask[40:150] = 1
l3_mask = np.zeros(284)
l3_mask[150:] = 1
l0_ssr_mask = np.zeros(284)
l0_ssr_mask[:30] = 1
l0_ssr_mask[40:140] = 1
l0_ssr_mask[160:] = 1

with tf.name_scope('Generator'):
    g = Generator(l1_mask,
                  l2_mask,
                  l3_mask,
                  l0_ssr_mask,
                  is_training=True,
                  seq_length=284)

with tf.name_scope('Discriminator'):
    d = Discriminator(seq_length=284)

print("Graph loaded")

gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95,
    allow_growth=True)
sess_config = tf.ConfigProto(allow_soft_placement=True,
                             gpu_options=gpu_options)

with tf.Session(config=sess_config) as sess:
    sess.run(tf.global_variables_initializer())
    print('Generator training start!')

    reward_total = 0.0
    observation = np.random.random((batch_size, 256))
    item_cand = np.array([list(range(0, 284))] * batch_size)
    for _ in range(9):
        sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
                                                  feed_dict={g.user: observation, g.item_cand: item_cand})
        reward = np.ones((batch_size,))

        reward_ = sess.run(d.reward, feed_dict={d.user: observation})
        sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})

        reward_total += np.mean(reward)

        reward = (reward - reward_)

        sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
                                        g.reward: reward,
                                        g.item_cand: item_cand,
                                        g.user: observation,
                                        })
        gs_gen = sess.run(g.global_step)

        # beamsearch
        # beam_card = sess.run(g.infer_result,
        #                      feed_dict={g.item_cand: item_cand,
        #                                 g.enc_user: observation})

        print(sampled_card_idx, sampled_card, reward_)

print("Done")
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22),
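                                                    # slate2label packs the 9 binary labels into an int in [0, 21]:
                                                    # each 3-item page contributes a value 0-7 (bits weighted 1/2/4)
                                                    # and the three pages are summed, hence the 22-way one-hot target.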
                                                    y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')

    dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')

    category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )
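    # slate_label is declared as a model input only so all model variants share
    # the same four-tuple feature format; it does not feed the forward pass
    # (the labels reach the loss as y_true instead).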
    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
[RL4RS Dataset landing page. The HTML markup was lost in extraction; only the
page title "RL4RS Dataset" is recoverable.]
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])

    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/policy/behavior_model.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
# tf.compat.v1.enable_eager_execution()
from tensorflow import keras
from rl4rs.utils.datautil import FeatureUtil
from copy import deepcopy


class behavior_model(object):
    def __init__(self, config, modelfile):
        behavior_config = deepcopy(config)
        behavior_config['category_feature_num'] = 21
        behavior_config['dense_feature_num'] = 50
        self.featureutil = FeatureUtil(behavior_config)
        self.item_feature_size = config.get('item_feature_size', 40)
        self.page_items = config.get("page_items", 9)
        self.sess = tf.Session()
        with self.sess.as_default():
            self.model = keras.models.load_model(modelfile)

    def record2input(self, records, page=0):
        inputs = []
        for record in records:
            role_id, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \
                user_protrait, item_feature, _ = self.featureutil.record_split(record)
            category_feature = user_protrait[:10] + \
                [sequence_id] + \
                exposed_items[self.page_items*page:self.page_items*(page+1)]
            sequence_feature = [user_seqfeature, [0]]
            label = 0
            dense_feature_size = self.item_feature_size*self.page_items
            item_feature = item_feature[dense_feature_size*page:dense_feature_size*(page+1)]
            item_feature = np.array(item_feature).reshape((self.page_items, self.item_feature_size))
            item_feature = item_feature[:, :5].reshape(-1)
            inputs.append((
                role_id,
                sequence_feature,
                item_feature,
                category_feature,
                user_feedback[self.page_items*page:self.page_items*(page+1)],
                label))
        return inputs

    def action_probs(self, record, action, layer, page=0):
        batch_size = len(action)
        seq, dense, category, slate = self.featureutil.feature_extraction(self.record2input(record, page))[0]
        with self.sess.as_default():
            y = self.model.predict([seq, dense, category, slate])
        # items 1-39 belong to layer 1, 40-147 to layer 2 and 148-283 to layer 3;
        # probabilities are renormalised within the selected layer
        if layer == 1:
            action = np.clip(np.array(action) - 1, 0, 38)
            action_probs = y[:, 1:40] / np.sum(y[:, 1:40], axis=1, keepdims=True)
        elif layer == 2:
            action = np.clip(np.array(action) - 40, 0, 107)
            action_probs = y[:, 40:148] / np.sum(y[:, 40:148], axis=1, keepdims=True)
        else:
            # y[:, 148:] has 284 - 148 = 136 columns, so offsets are clipped to 0..135
            action = np.clip(np.array(action) - 148, 0, 135)
            action_probs = y[:, 148:] / np.sum(y[:, 148:], axis=1, keepdims=True)
        return action_probs[range(batch_size), action]
--------------------------------------------------------------------------------
/rl4rs/utils/rllib_vector_env.py:
--------------------------------------------------------------------------------
import numpy as np
from typing import Callable, List, Optional, Tuple
from ray.rllib.utils.typing import EnvActionType, EnvConfigDict, EnvInfoDict, \
    EnvObsType, EnvType, PartialTrainerConfigDict
from ray.rllib.env.vector_env import VectorEnv
from rl4rs.env import RecEnvBase


class MyVectorEnvWrapper(VectorEnv):
    """An environment that supports batch evaluation using clones of sub-envs.
    """

    def __init__(self, env: RecEnvBase, batch_size: int):
        """Initializes a VectorEnv object.

        Args:
            observation_space (Space): The observation Space of a single
                sub-env.
            action_space (Space): The action Space of a single sub-env.
            num_envs (int): The number of clones to make of the given sub-env.
        """
        self.env = env
        self.reset_cache = []
        super().__init__(self.env.observation_space, self.env.action_space, num_envs=batch_size)

    def vector_reset(self) -> List[EnvObsType]:
        """Resets all sub-environments.

        Returns:
            obs (List[any]): List of observations from each environment.
        """
        return self.env.reset()

    def reset_at(self, index: Optional[int] = None) -> EnvObsType:
        """Resets a single environment.

        Args:
            index (Optional[int]): An optional sub-env index to reset.

        Returns:
            obs (obj): Observations from the reset sub environment.
42 | """ 43 | if index == 0: 44 | self.reset_cache = self.env.reset() 45 | return self.reset_cache[index] 46 | 47 | def vector_step( 48 | self, actions: List[EnvActionType] 49 | ) -> Tuple[List[EnvObsType], List[float], List[bool], List[EnvInfoDict]]: 50 | """Performs a vectorized step on all sub environments using `actions`. 51 | 52 | Args: 53 | actions (List[any]): List of actions (one for each sub-env). 54 | 55 | Returns: 56 | obs (List[any]): New observations for each sub-env. 57 | rewards (List[any]): Reward values for each sub-env. 58 | dones (List[any]): Done values for each sub-env. 59 | infos (List[any]): Info values for each sub-env. 60 | """ 61 | return self.env.step(np.array(actions)) 62 | 63 | def get_unwrapped(self) -> List[EnvType]: 64 | """Returns the underlying sub environments. 65 | 66 | Returns: 67 | List[Env]: List of all underlying sub environments. 68 | """ 69 | return [self.env, ] * self.num_envs 70 | 71 | # Experimental method. 72 | def try_render_at(self, index: Optional[int] = None) -> None: 73 | """Renders a single environment. 74 | 75 | Args: 76 | index (Optional[int]): An optional sub-env index to render. 77 | """ 78 | return self.env.render() 79 | -------------------------------------------------------------------------------- /script/simulator_env_test.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | tf.compat.v1.enable_eager_execution() 6 | from rl4rs.utils.datautil import FeatureUtil 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | 10 | extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {} 11 | 12 | config = {"epoch": 1, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 13 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9, 14 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_train.csv', 15 | "iteminfo_file": '../dataset/item_info.csv', "tfrecord_file":'../output/rl4rs_dataset_a_train_tiny.tfrecord', 16 | "model_file": "../output/supervised_a_train_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SlateRecEnv-v0", 17 | "support_conti_env":True, "rawstate_as_obs":False} 18 | 19 | config = dict(config, **extra_config) 20 | 21 | if config.get('gpu', 0) < 1: 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 23 | 24 | if config['env'] == 'SeqSlateRecEnv-v0': 25 | config['max_steps'] = 36 26 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState) 27 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 28 | else: 29 | sim = SlateRecEnv(config, state_cls=SlateState) 30 | env = gym.make('SlateRecEnv-v0', recsim=sim) 31 | 32 | batch_size = config["batch_size"] 33 | epoch = config["epoch"] 34 | max_steps = config["max_steps"] 35 | rewards = np.zeros((epoch, batch_size)) 36 | offline_rewards = np.zeros((epoch, batch_size)) 37 | offline_actions = np.zeros((epoch, batch_size, max_steps)) 38 | for i in range(epoch): 39 | env.reset(reset_file=True) 40 | for j in range(config["max_steps"]): 41 | if not config.get("support_conti_env"): 42 | action = env.offline_action 43 | else: 44 | action = np.full((batch_size, 32), 1) 45 | offline_actions[i, :, j] = env.offline_action 46 | next_obs, reward, done, info = env.step(action) 47 | rewards[i] = rewards[i] + np.array(reward) 48 | offline_rewards[i] = offline_rewards[i] + 
np.array(env.offline_reward) 49 | if done[0]: 50 | print(next_obs[0], reward[0], action[0], done[0], info[0]) 51 | break 52 | 53 | if config['rawstate_as_obs']: 54 | config['batch_size'] = 1 55 | featureutil = FeatureUtil(config) 56 | iter_train = featureutil.read_tfrecord(config['tfrecord_file'], is_slate_label=False) 57 | feature = iter_train.make_one_shot_iterator().get_next() 58 | seq_feature = feature[0][0].numpy()[0] 59 | dense_feature = feature[0][1].numpy()[0] 60 | category_feature = feature[0][2].numpy()[0] 61 | assert np.min(np.equal(next_obs[0]['category_feature'][:-1], category_feature[:-1])) 62 | assert np.min(np.equal(next_obs[0]['dense_feature'][:-40], dense_feature[:-40])) 63 | assert np.min(np.equal(next_obs[0]['sequence_feature'], seq_feature)) 64 | -------------------------------------------------------------------------------- /script/offline_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import numpy as np 4 | from rl4rs.policy.behavior_model import behavior_model 5 | from rl4rs.policy.policy_model import policy_model 6 | import rl4rs.utils.offline_policy_metrics as OPE 7 | 8 | 9 | def ope_eval(config, eval_env, algo, sample_model: behavior_model = None): 10 | policy = policy_model(algo, config) 11 | metrics = [] 12 | epoch = config["epoch"] 13 | batch_size = config["batch_size"] 14 | max_steps = config["max_steps"] 15 | page_items = config.get("page_items", 9) 16 | for i in range(epoch): 17 | obs = eval_env.reset() 18 | episode_rewards, q_values, off_rewards = [], [], [] 19 | prev_actions = [] 20 | action_probs, behavior_probs, rewards = [], [], [] 21 | print('test batch at ', i) 22 | for j in range(max_steps): 23 | # obs = dict(enumerate(obs)) 24 | action = policy.predict_with_mask(obs) 25 | off_action = eval_env.offline_action 26 | if sample_model is not None: 27 | action_prob = policy.action_probs(obs) 28 | action_prob = action_prob[range(batch_size), off_action] 29 | q_values.append(policy.predict_q(obs, action)) 30 | action_probs.append(action_prob) 31 | behavior_prob = sample_model.action_probs(eval_env.samples.records, off_action, j // 3 + 1, page=j//page_items) 32 | behavior_probs.append(behavior_prob) 33 | obs, reward, done, info = eval_env.step(action) 34 | off_rewards.append(eval_env.offline_reward) 35 | rewards.append(reward) 36 | prev_actions.append(action) 37 | 38 | episode_reward = np.sum(np.array(rewards), axis=0) 39 | episode_rewards.append(episode_reward) 40 | if sample_model is not None: 41 | action_probs = np.array(action_probs).swapaxes(0, 1) 42 | behavior_probs = np.array(behavior_probs).swapaxes(0, 1) 43 | off_rewards = np.array(off_rewards).swapaxes(0, 1) 44 | off_rewards_sum = np.sum(off_rewards, axis=1) 45 | rewards_hat = np.array(rewards).swapaxes(0, 1) 46 | q_values = np.array(q_values).swapaxes(0, 1) 47 | # multiply probs 48 | action_probs_mul = np.multiply.reduce(action_probs*100, axis=1) 49 | behavior_probs_mul = np.multiply.reduce(behavior_probs*100, axis=1) 50 | cips = OPE.eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul) 51 | # snips = OPE.eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul) 52 | dr = OPE.eval_doubly_robust( 53 | episode_reward, 54 | np.average(q_values, 1), 55 | off_rewards_sum, 56 | action_probs_mul, 57 | behavior_probs_mul 58 | ) 59 | # step-wise 60 | wips = OPE.eval_WIPS(off_rewards, action_probs, behavior_probs) 61 | sdr = OPE.eval_seq_doubly_robust( 62 | rewards_hat, 63 | q_values, 64 | off_rewards, 65 | 
action_probs, 66 | behavior_probs 67 | ) 68 | 69 | metrics.append((cips, dr, wips, sdr)) 70 | 71 | print('IS', 'DR', 'WIPS', 'SeqDR', sep=' ') 72 | print(np.average(np.array(metrics), axis=0)) 73 | print(np.std(np.array(metrics), axis=0)) 74 | -------------------------------------------------------------------------------- /rl4rs/nets/cql/q_function.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, cast 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | from typing import Any, ClassVar, Dict, Type 7 | from d3rlpy.models.torch.encoders import Encoder 8 | from d3rlpy.models.torch.q_functions.base import DiscreteQFunction 9 | from d3rlpy.models.torch.q_functions.utility import compute_huber_loss, compute_reduce, pick_value_by_action 10 | from d3rlpy.models.q_functions import QFunctionFactory 11 | from d3rlpy.models.torch import EncoderWithAction, ContinuousMeanQFunction 12 | 13 | 14 | class CustomDiscreteMeanQFunction(DiscreteQFunction, nn.Module): # type: ignore 15 | _action_size: int 16 | _encoder: Encoder 17 | _fc: nn.Linear 18 | 19 | def __init__(self, encoder: Encoder, action_size: int): 20 | super().__init__() 21 | self._action_size = action_size 22 | self._encoder = encoder 23 | # self._fc = nn.Linear(encoder.get_feature_size(), action_size) 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return cast(torch.Tensor, self._encoder(x)) 27 | 28 | def compute_error( 29 | self, 30 | obs_t: torch.Tensor, 31 | act_t: torch.Tensor, 32 | rew_tp1: torch.Tensor, 33 | q_tp1: torch.Tensor, 34 | ter_tp1: torch.Tensor, 35 | gamma: float = 0.99, 36 | reduction: str = "mean", 37 | ) -> torch.Tensor: 38 | one_hot = F.one_hot(act_t.view(-1), num_classes=self.action_size) 39 | q_t = (self.forward(obs_t) * one_hot.float()).sum(dim=1, keepdim=True) 40 | y = rew_tp1 + gamma * q_tp1 * (1 - ter_tp1) 41 | loss = compute_huber_loss(q_t, y) 42 | return compute_reduce(loss, reduction) 43 | 44 | def compute_target( 45 | self, x: torch.Tensor, action: Optional[torch.Tensor] = None 46 | ) -> torch.Tensor: 47 | if action is None: 48 | return self.forward(x) 49 | # q=pick_value_by_action(self.forward(x), action, keepdim=True) 50 | values = self.forward(x) 51 | action_size = values.shape[1] 52 | one_hot = F.one_hot(action.view(-1), num_classes=action_size) 53 | masked_values = values * cast(torch.Tensor, one_hot.float()) 54 | q = masked_values.sum(dim=1, keepdim=True) 55 | # assert torch.min(q)>-100 56 | return q 57 | 58 | @property 59 | def action_size(self) -> int: 60 | return self._action_size 61 | 62 | @property 63 | def encoder(self) -> Encoder: 64 | return self._encoder 65 | 66 | 67 | class CustomMeanQFunctionFactory(QFunctionFactory): 68 | TYPE: ClassVar[str] = "mean" 69 | 70 | def __init__(self, bootstrap: bool = False, share_encoder: bool = False): 71 | super().__init__(bootstrap, share_encoder) 72 | 73 | def create_discrete( 74 | self, 75 | encoder: Encoder, 76 | action_size: int, 77 | ) -> CustomDiscreteMeanQFunction: 78 | return CustomDiscreteMeanQFunction(encoder, action_size) 79 | 80 | def create_continuous( 81 | self, 82 | encoder: EncoderWithAction, 83 | ) -> ContinuousMeanQFunction: 84 | return ContinuousMeanQFunction(encoder) 85 | 86 | def get_params(self, deep: bool = False) -> Dict[str, Any]: 87 | return { 88 | "bootstrap": self._bootstrap, 89 | "share_encoder": self._share_encoder, 90 | } 91 | -------------------------------------------------------------------------------- 
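A minimal usage sketch for the factory above, assuming the d3rlpy 0.x-style keyword arguments (``encoder_factory``, ``q_func_factory``) implied by this module's imports; the sizes (``action_size=284``, ``mask_size=10``, i.e. page_items + 1) and the item-info path are illustrative values taken from configs elsewhere in this repo, not fixed by this file. Because ``CustomDiscreteMeanQFunction`` returns the encoder output directly as Q-values, the factory must be paired with an encoder whose final layer already emits ``action_size`` units, e.g. ``CustomVectorEncoder`` with ``with_q=True`` (see rl4rs/nets/cql/encoder.py):

import d3rlpy
from rl4rs.env.slate import SlateState
from rl4rs.nets.cql.q_function import CustomMeanQFunctionFactory
from rl4rs.nets.cql.encoder import CustomVectorEncoderFactory

# Build the location/special-item masks the custom encoder expects
# (same helper call as in script/exact_k_train.py; the path is illustrative).
location_mask, special_items = SlateState.get_mask_from_file(
    'dataset/item_info.csv', 284)
config = {'location_mask': location_mask, 'special_items': special_items}

# with_q=True makes the encoder's final layer emit one unit per action,
# which CustomDiscreteMeanQFunction then returns directly as Q-values.
encoder_factory = CustomVectorEncoderFactory(
    config, action_size=284, mask_size=10, with_q=True, hidden_units=[256])

cql = d3rlpy.algos.DiscreteCQL(
    encoder_factory=encoder_factory,
    q_func_factory=CustomMeanQFunctionFactory(share_encoder=True),
    batch_size=256)
# cql.fit(...) then proceeds on an MDPDataset as with any d3rlpy algorithm.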
/rl4rs/nets/adversarial_slate.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | from tensorflow.keras.models import Model 5 | from rl4rs.nets import utils 6 | 7 | 8 | def custom_loss(external_loss): 9 | def loss(y_true, y_pred): 10 | return 0.1 * tf.keras.losses.binary_crossentropy(y_true, y_pred) + external_loss 11 | 12 | return loss 13 | 14 | 15 | def my_loss_fn(y_true, y_pred): 16 | item_scores_exp = tf.exp(y_pred) 17 | item_scores_click = tf.einsum('ij,ij->ij', y_pred, tf.cast(y_true, tf.float32)) 18 | return -tf.log(tf.reduce_sum(tf.exp(item_scores_click), axis=1) + 1) \ 19 | + tf.log(tf.reduce_sum(item_scores_exp, axis=1) + 1) 20 | 21 | 22 | def my_metrics(y_true, y_pred): 23 | score = tf.einsum('ij,ij->ij', y_pred, 1 - tf.cast(y_true, tf.float32)) 24 | return tf.reduce_sum(score, 1) 25 | 26 | 27 | def my_mean_metrics(y_true, y_pred): 28 | return tf.reduce_mean(y_pred, 1) 29 | 30 | 31 | def my_max_metrics(y_true, y_pred): 32 | return tf.reduce_max(y_pred, 1) 33 | 34 | 35 | def my_min_metrics(y_true, y_pred): 36 | return tf.reduce_min(y_pred, 1) 37 | 38 | 39 | def get_model(config): 40 | maxlen = config['maxlen'] 41 | dense_feature_num = config['dense_feature_num'] 42 | category_feature_num = config['category_feature_num'] 43 | class_num = config['class_num'] 44 | seq_num = config['seq_num'] 45 | 46 | sequence_feature_input = layers.Input( 47 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input' 48 | ) 49 | dense_feature_input = layers.Input( 50 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input' 51 | ) 52 | category_feature_input = layers.Input( 53 | shape=(category_feature_num,), dtype='int64', name='category_feature_input' 54 | ) 55 | slate_label_input = layers.Input( 56 | shape=(9,), dtype='int64', name='slate_label' 57 | ) 58 | 59 | feature_omit = layers.Lambda(lambda x: x[:, :-1]) 60 | category_feature_input_slate = feature_omit(category_feature_input) 61 | config['category_feature_num'] = config['category_feature_num'] - 1 62 | 63 | category_feature = utils.id_input_processing(category_feature_input_slate, config) 64 | dense_feature = utils.dense_input_processing(dense_feature_input, config) 65 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config) 66 | 67 | all_feature = layers.Concatenate(axis=-1)( 68 | [sequence_feature, dense_feature, category_feature] 69 | ) 70 | item_scores = layers.Dense(9, activation='sigmoid')(all_feature) 71 | item_scores_norm = layers.Softmax()(item_scores) 72 | item_scores_no_click = tf.einsum('ij,ij->ij', 73 | item_scores_norm, 74 | 1 - tf.cast(slate_label_input, tf.float32)) 75 | loss3 = tf.reduce_sum(item_scores_no_click, axis=1) 76 | 77 | model = Model(inputs=[sequence_feature_input, 78 | dense_feature_input, 79 | category_feature_input, 80 | slate_label_input], 81 | outputs=[item_scores]) 82 | model.compile(optimizer='adam', 83 | loss=custom_loss(loss3), 84 | metrics=[ 85 | tf.keras.metrics.AUC(), 86 | tf.keras.metrics.Precision(), 87 | tf.keras.metrics.Recall()]) 88 | return model 89 | -------------------------------------------------------------------------------- /reproductions/run_simulator_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 
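# derive output/dataset/script paths from the repo root resolved above; the
# exported *_dir variables are read by the python scripts via os.environ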
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | algo=$1 12 | 13 | cd ${script_dir} 14 | 15 | # train in train set and test in all sample 16 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_a_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_${algo}.log && 17 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_b2_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_${algo}.log 18 | 19 | # train in all set and test in sl/rl as a baseline 20 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_sl_${algo}.log && 21 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_rl_${algo}.log && 22 | 23 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_sl_${algo}.log && 24 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_rl_${algo}.log && 25 | 26 | # train in sl/rl and test in rl/sl 27 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_sl_rl_${algo}.log && 28 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_rl_sl_${algo}.log && 29 | 30 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_sl_rl_${algo}.log && 31 | python -u simulator_eval.py 
"{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_rl_sl_${algo}.log 32 | 33 | echo '1' -------------------------------------------------------------------------------- /rl4rs/mdpchecker/decoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import bottleneck 4 | 5 | def token_probs(model, 6 | batch_inputs, 7 | batch_outputs): 8 | return np.array(model.predict([np.array(batch_inputs), np.array(batch_outputs)]))[:, -1] 9 | 10 | 11 | def decode_step(model, 12 | batch_inputs, 13 | batch_outputs, 14 | candidates=None, 15 | beam_size=1): 16 | a = time.time() 17 | # predicts (batch_size, token_size) 18 | predicts = model.predict([np.array(batch_inputs), np.array(batch_outputs)])[:,-1] 19 | batch_size, token_size = predicts.shape 20 | print('decode_step', time.time()-a) 21 | # print('decode_step', time.time()-a) 22 | # tmp = [] 23 | # for i in range(len(predicts)): 24 | # probs = [(prob, j) for j, prob in enumerate(predicts[i])] 25 | # if candidates is not None: 26 | # probs = [x if x[1] in candidates[i] else (0, x[1]) for x in probs] 27 | # probs.sort(reverse=True) 28 | # probs = probs[:beam_size] 29 | # tmp.append(probs) 30 | if candidates is not None: 31 | mask = np.zeros(predicts.shape) 32 | inds = np.array([[i,]*len(candidates[i]) for i in range(len(candidates))]).flatten() 33 | mask[inds, candidates.flatten().astype(int)] = 1 34 | predicts = predicts * mask 35 | # index = np.argpartition(-predicts, beam_size, axis=1)[:, :beam_size] 36 | # probs = -np.partition(-predicts, beam_size, axis=1)[:, :beam_size] 37 | index = bottleneck.argpartition(-predicts, beam_size, axis=1)[:, :beam_size] 38 | probs = -bottleneck.partition(-predicts, beam_size, axis=1)[:, :beam_size] 39 | inds = np.array([[i,]*len(probs[i]) for i in range(len(probs))]) 40 | inds_sorted = np.argsort(-probs, axis=1)[:,:beam_size] 41 | index = index[inds, inds_sorted] 42 | probs = probs[inds, inds_sorted] 43 | # print('decode_step', time.time()-a) 44 | tmp2 = np.array(list(zip(probs.flatten(),index.flatten()))).reshape((batch_size, beam_size, 2)) 45 | # tmp (batch_size, beam_size, 2) 46 | # print(np.min(tmp==tmp2)) 47 | return tmp2 48 | 49 | 50 | def beam_search(model, encode_input, beam_size, target_len, use_candidates=False, candidates_size = None): 51 | batch_size = len(encode_input) 52 | output_topk = np.zeros((batch_size, beam_size, target_len + 1), dtype=np.int) 53 | beam_score = np.ones((batch_size, beam_size)) 54 | output_topk[:, :, 0] = 1 55 | # probs = [] 56 | candidates = None 57 | prob = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=beam_size) 58 | if use_candidates: 59 | probs_first_step = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=candidates_size) 60 | candidates = probs_first_step[:, :, 1] 61 | output_topk[:, :, 1] = prob[:, :, 1] 62 | beam_score[:, :] = prob[:, :, 0] 63 | for i in range(1, target_len): 64 | a = time.time() 65 | print('beam_search at target_len_', i) 66 | probs = [] 67 | for j in range(beam_size): 68 | # batch_size,k,2 69 | prob = decode_step(model, encode_input, output_topk[:, j, :i + 1], candidates=candidates, beam_size=beam_size) 70 | probs.append(prob) 71 | # batch_size,k,k,2 72 | probs = 
np.array(probs).swapaxes(0, 1) 73 | # batch_size,k,k 74 | beam_scores = np.einsum('abc,ab->abc', probs[:, :, :, 0], beam_score) 75 | # batch_size,k,2 76 | top_k_fn = lambda x: np.dstack(np.unravel_index(np.argsort(-x.ravel()), (beam_size, beam_size))) 77 | top_k_index = np.array(list(map(top_k_fn, beam_scores)))[:, 0][:, :beam_size, :] 78 | for ii in range(batch_size): 79 | output_topk[ii, :, :] = output_topk[ii, top_k_index[ii, :, 0], :] 80 | output_topk[ii, :, i + 1] = probs[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1], 1] 81 | beam_score[ii, :] = beam_scores[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1]] 82 | return output_topk, beam_score 83 | -------------------------------------------------------------------------------- /rl4rs/nets/rllib/rllib_rawstate_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Dict 3 | from rl4rs.nets import utils 4 | from ray.rllib.models.utils import get_activation_fn 5 | from ray.rllib.models.tf.misc import normc_initializer 6 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 7 | from ray.rllib.utils.framework import try_import_tf, try_import_torch 8 | 9 | tf1, tf, tfv = try_import_tf() 10 | torch, nn = try_import_torch() 11 | 12 | 13 | def getTFModelWithRawState(config): 14 | config = config 15 | 16 | class MyTFModelWithRawState(TFModelWithRawState): 17 | def __init__(self, obs_space, action_space, num_outputs, model_config, 18 | name): 19 | super(MyTFModelWithRawState, self).__init__( 20 | obs_space, action_space, num_outputs, model_config, name, config=config) 21 | 22 | return MyTFModelWithRawState 23 | 24 | 25 | class TFModelWithRawState(TFModelV2): 26 | """Implements the `.action_model` branch required above.""" 27 | 28 | def __init__(self, obs_space, action_space, num_outputs, model_config, 29 | name, config): 30 | obs_space = obs_space.original_space 31 | super(TFModelWithRawState, self).__init__( 32 | obs_space, action_space, num_outputs, model_config, name) 33 | if not (isinstance(obs_space, Dict) and obs_space['category_feature'] \ 34 | and obs_space['dense_feature'] and obs_space['sequence_feature']): 35 | raise ValueError("""This model only supports the Dict{'category_feature':[], 36 | 'dense_feature':[], 'sequence_feature':[]} obs space""") 37 | activation = model_config.get("fcnet_activation", "linear") 38 | activation = get_activation_fn(activation) 39 | no_final_linear = model_config.get("no_final_linear", False) 40 | # Inputs 41 | category_feature_input = tf.keras.layers.Input( 42 | shape=obs_space['category_feature'].shape, name="obs_category_input") 43 | dense_feature_input = tf.keras.layers.Input( 44 | shape=obs_space['dense_feature'].shape, name="obs_dense_input") 45 | sequence_feature_input = tf.keras.layers.Input( 46 | shape=obs_space['sequence_feature'].shape, name="obs_sequence_input") 47 | 48 | slice_layer = tf.keras.layers.Lambda(lambda x: x[0][:, x[1]:]) 49 | category_feature = utils.id_input_processing(category_feature_input, config) 50 | dense_feature = utils.dense_input_processing(dense_feature_input, config) 51 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config) 52 | all_feature = tf.keras.layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature]) 53 | context = tf.keras.layers.Dense(256, activation=tf.keras.layers.ELU())(all_feature) 54 | model_out = None 55 | if no_final_linear and num_outputs: 56 | model_out = tf.keras.layers.Dense( 57 | num_outputs, 58 | name="fc_out", 59 | 
activation=activation, 60 | kernel_initializer=normc_initializer(1.0))(context) 61 | else: 62 | model_out = tf.keras.layers.Dense( 63 | num_outputs, 64 | name="fc_out", 65 | activation=None, 66 | kernel_initializer=normc_initializer(0.01))(context) 67 | 68 | # V(s) 69 | value_out = tf.keras.layers.Dense( 70 | 1, 71 | name="value_out", 72 | activation=None, 73 | kernel_initializer=normc_initializer(0.01))(context) 74 | 75 | # Base layers 76 | self.base_model = tf.keras.Model([category_feature_input, dense_feature_input, sequence_feature_input], [model_out, value_out]) 77 | self.base_model.summary() 78 | 79 | def forward(self, input_dict, state, seq_lens): 80 | model_out, self._value_out = self.base_model([input_dict["obs"]["category_feature"], 81 | input_dict["obs"]["dense_feature"], 82 | input_dict["obs"]["sequence_feature"]]) 83 | return model_out, state 84 | 85 | def value_function(self): 86 | return tf.reshape(self._value_out, [-1]) 87 | -------------------------------------------------------------------------------- /rl4rs/policy/policy_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import d3rlpy 3 | import numpy as np 4 | from ray.rllib.agents.trainer import Trainer as rllib_trainer 5 | from scipy.special import softmax 6 | 7 | 8 | class policy_model(object): 9 | def __init__(self, model, config = {}): 10 | self.policy = model 11 | self.config = config 12 | self.page_items = int(config.get('page_items', 9)) 13 | self.mask_size = self.page_items+1 14 | self.location_mask = config.get('location_mask', None) 15 | self.special_items = config.get('special_items', None) 16 | 17 | def predict_with_mask(self, obs): 18 | if self.config.get("support_conti_env",False): 19 | return self.predict(obs) 20 | elif isinstance(self.policy, d3rlpy.algos.AlgoBase): 21 | obs = np.array(obs) 22 | action_probs = np.array(self.action_probs(obs)) 23 | batch_size = len(obs) 24 | # mask 25 | prev_actions = obs[:, -self.mask_size:-1].astype(int) 26 | cur_step = obs[:, -1].astype(int) 27 | x_mask_layer = cur_step % self.page_items // 3 28 | mask = self.location_mask[x_mask_layer.astype(int)] 29 | for i in range(self.mask_size-1): 30 | mask[range(batch_size), prev_actions[:, i]] = 0 31 | action_mask = mask < 0.01 32 | action_probs[action_mask] = -2 ** 15 33 | for i in range(batch_size): 34 | if len(np.intersect1d(prev_actions[i], self.special_items)) > 0: 35 | action_probs[i][self.special_items] = -2 ** 15 36 | return action_probs.argmax(axis=1) 37 | elif isinstance(self.policy, rllib_trainer): 38 | return self.predict(obs) 39 | else: 40 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 41 | or isinstance(self.policy, rllib_trainer) 42 | 43 | def predict(self, obs): 44 | if isinstance(self.policy, d3rlpy.algos.AlgoBase): 45 | return self.policy.predict(obs) 46 | elif isinstance(self.policy, rllib_trainer): 47 | obs = dict(enumerate(obs)) 48 | action = self.policy.compute_actions(obs, explore=False) 49 | action = np.array(list(action.values())) 50 | return action 51 | else: 52 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 53 | or isinstance(self.policy, rllib_trainer) 54 | 55 | def predict_q(self, obs, action): 56 | if isinstance(self.policy, d3rlpy.algos.AlgoBase): 57 | q = self.policy.predict_value(obs, action) 58 | if self.policy.reward_scaler is not None: 59 | return self.policy.reward_scaler.reverse_transform(q) 60 | else: 61 | return q 62 | elif isinstance(self.policy, rllib_trainer): 63 | obs = dict(enumerate(obs)) 64 | _, _, 
infos = self.policy. \ 65 | compute_actions(obs, explore=False, full_fetch=True) 66 | batch_size = len(action) 67 | return infos['q_values'][range(batch_size), action] \ 68 | if 'q_values' in infos \ 69 | else infos['vf_preds'] 70 | else: 71 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 72 | or isinstance(self.policy, rllib_trainer) 73 | 74 | def action_probs(self, obs): 75 | if isinstance(self.policy, d3rlpy.algos.DiscreteBC): 76 | obs = torch.tensor(obs, dtype=torch.float32) 77 | return self.policy._impl._imitator(obs).detach().numpy() 78 | elif isinstance(self.policy, d3rlpy.algos.DiscreteBCQ) \ 79 | or isinstance(self.policy, d3rlpy.algos.DiscreteCQL): 80 | obs = torch.tensor(obs, dtype=torch.float32) 81 | action_q = self.policy._impl._q_func(obs).detach().numpy() 82 | return softmax(action_q, axis=1) 83 | elif isinstance(self.policy, rllib_trainer): 84 | obs = dict(enumerate(obs)) 85 | actions, _, infos = self.policy. \ 86 | compute_actions(obs, explore=False, full_fetch=True) 87 | return softmax(infos['action_dist_inputs'], axis=1) 88 | else: 89 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 90 | or isinstance(self.policy, rllib_trainer) 91 | -------------------------------------------------------------------------------- /reproductions/run_exact_k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | 12 | cd ${script_dir} 13 | 14 | # experiment in a_all env, train in a_all sample and test in a_all sample 15 | python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log 16 | python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log 17 | 18 | 19 | # experiment in a_all env, train in a_train sample and test in a_test sample 20 | #python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log 21 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log 22 | 23 | 24 | # experiment train in a_sl env and test in a_rl env 25 | #python -u exact_k_train.py "train" 
"{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log 26 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log 27 | 28 | 29 | # experiment in b_all env, train in b_all sample and test in b_all sample 30 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log 31 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log 32 | 33 | 34 | # experiment in b_all env, train in b_train sample and test in b_test sample 35 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log 36 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log 37 | 38 | 39 | # experiment train in b_sl env and test in b_rl env 40 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log 41 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log 42 | -------------------------------------------------------------------------------- /rl4rs/nets/rllib/rllib_mask_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from rl4rs.nets.rllib.rllib_rawstate_model import TFModelWithRawState 3 | from ray.rllib.examples.models.parametric_actions_model import \ 4 | ParametricActionsModel 5 | 6 | 7 | def getMaskActionsModel(true_obs_shape, action_size): 8 | class MyMaskActionsModel(ParametricActionsModel): 9 | """Parametric action model that handles the dot product and masking. 10 | 11 | This assumes the outputs are logits for a single Categorical action dist. 
12 | Getting this to work with a more complex output (e.g., if the action space 13 | is a tuple of several distributions) is also possible but left as an 14 | exercise to the reader. 15 | """ 16 | 17 | def __init__(self, 18 | obs_space, 19 | action_space, 20 | num_outputs, 21 | model_config, 22 | name, 23 | **kw): 24 | config = { 25 | # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py 26 | # These are used if no custom model is specified and the input space is 1D. 27 | # Number of hidden layers to be used. 28 | "fcnet_hiddens": [64], 29 | # Activation function descriptor. 30 | # Supported values are: "tanh", "relu", "swish" (or "silu"), 31 | # "linear" (or None). 32 | # "fcnet_activation": "linear", 33 | # "no_final_linear": True, 34 | "vf_share_layers": True, 35 | } 36 | model_config = dict(model_config, **config) 37 | super(MyMaskActionsModel, self).__init__( 38 | obs_space, action_space, num_outputs, model_config, name, true_obs_shape, action_embed_size=action_size, **kw) 39 | print('MyMaskActionsModel', self.action_embed_model.model_config) 40 | 41 | def forward(self, input_dict, state, seq_lens): 42 | # Extract the available actions tensor from the observation. 43 | # avail_actions = input_dict["obs"]["avail_actions"] 44 | action_mask = input_dict["obs"]["action_mask"] 45 | 46 | # Compute the predicted action embedding 47 | action_embed, _ = self.action_embed_model({ 48 | "obs": input_dict["obs"]["obs"] 49 | }) 50 | # action_values = self.action_embed_model.value_function() 51 | # print(tf.shape(action_embed), action_embed) 52 | 53 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the 54 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. 55 | # intent_vector = tf.expand_dims(action_embed, 1) 56 | 57 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS]. 58 | # action_prob = tf.nn.softmax(action_embed) 59 | 60 | # Mask out invalid actions (use tf.float32.min for stability) 61 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) 62 | return action_embed + inf_mask, state 63 | 64 | return MyMaskActionsModel 65 | 66 | 67 | def getMaskActionsModelWithRawState(config, action_size): 68 | config = config 69 | 70 | class MyMaskActionsModelWithRawState(ParametricActionsModel): 71 | """Parametric action model that handles the dot product and masking. 72 | 73 | This assumes the outputs are logits for a single Categorical action dist. 74 | Getting this to work with a more complex output (e.g., if the action space 75 | is a tuple of several distributions) is also possible but left as an 76 | exercise to the reader. 77 | """ 78 | 79 | def __init__(self, 80 | obs_space, 81 | action_space, 82 | num_outputs, 83 | model_config, 84 | name, 85 | **kw): 86 | # model_config = dict(model_config, **config) 87 | super(MyMaskActionsModelWithRawState, self).__init__( 88 | obs_space, action_space, num_outputs, model_config, name, action_embed_size=action_size, **kw) 89 | print('MyMaskActionsModelWithRawStateModel', self.action_embed_model.model_config) 90 | self.action_embed_model = TFModelWithRawState( 91 | obs_space, action_space, action_size, 92 | model_config, name + "_action_embed", config = config) 93 | 94 | def forward(self, input_dict, state, seq_lens): 95 | # Extract the available actions tensor from the observation. 
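        # (The Dict observation carries the raw state plus a 0/1 "action_mask".
        # The raw-state branch is encoded by TFModelWithRawState; the mask is
        # converted below into additive logits via log(mask), clamped to
        # tf.float32.min, so masked actions receive effectively -inf scores.)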
96 | # avail_actions = input_dict["obs"]["avail_actions"] 97 | action_mask = input_dict["obs"]["action_mask"] 98 | 99 | # Compute the predicted action embedding 100 | action_embed, _ = self.action_embed_model(input_dict) 101 | # action_values = self.action_embed_model.value_function() 102 | # print(tf.shape(action_embed), action_embed) 103 | 104 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the 105 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. 106 | # intent_vector = tf.expand_dims(action_embed, 1) 107 | 108 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS]. 109 | # action_prob = tf.nn.softmax(action_embed) 110 | 111 | # Mask out invalid actions (use tf.float32.min for stability) 112 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) 113 | return action_embed + inf_mask, state 114 | 115 | return MyMaskActionsModelWithRawState 116 | -------------------------------------------------------------------------------- /rl4rs/nets/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from deepctr.layers.sequence import AttentionSequencePoolingLayer, DynamicGRU 4 | from tensorflow.keras import layers, regularizers 5 | 6 | 7 | def id_input_processing(category_feature_input, config): 8 | emb_size = config['emb_size'] 9 | category_hash_size = config['category_hash_size'] 10 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 11 | category_emb = emb_layer(category_feature_input) 12 | category_feature = layers.GlobalAveragePooling1D()(category_emb) 13 | return category_feature 14 | 15 | 16 | def id_input_processing_attn(category_feature_input, config): 17 | emb_size = config['emb_size'] 18 | hidden_unit = config['hidden_units'] 19 | category_hash_size = config['category_hash_size'] 20 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 21 | category_emb = emb_layer(category_feature_input) 22 | category_feature = tf.keras.layers.Attention()([category_emb, category_emb]) 23 | category_feature = tf.keras.layers.GlobalAveragePooling1D()(category_feature) 24 | category_feature_2 = layers.Flatten()(category_emb) 25 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2]) 26 | 27 | 28 | def id_input_processing_lstm(category_feature_input, config): 29 | emb_size = config['emb_size'] 30 | hidden_unit = config['hidden_units'] 31 | category_hash_size = config['category_hash_size'] 32 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 33 | category_emb = emb_layer(category_feature_input) 34 | category_feature = layers.GRU(units=hidden_unit)(category_emb) 35 | category_feature_2 = layers.Flatten()(category_emb) 36 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2]) 37 | 38 | 39 | def id_input_processing_concat(category_feature_input, config): 40 | emb_size = config['emb_size'] 41 | category_hash_size = config['category_hash_size'] 42 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 43 | category_emb = emb_layer(category_feature_input) 44 | category_feature = layers.Flatten()(category_emb) 45 | return category_feature 46 | 47 | 48 | def dense_input_processing(cross_feature_input, config): 49 | hidden_unit = config['hidden_units'] 50 | cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature_input) 51 | cross_feature = layers.Dropout(0.2)(cross_feature) 52 | 
cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature) 53 | cross_feature = layers.Dropout(0.2)(cross_feature) 54 | return cross_feature 55 | 56 | 57 | def sequence_input_concat(sequence_feature_input, config): 58 | category_hash_size = config['category_hash_size'] 59 | hidden_unit = config['hidden_units'] 60 | emb_size = config['emb_size'] 61 | seq_num = config['seq_num'] 62 | 63 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 64 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 65 | 66 | seqs_lstm = [] 67 | for i in range(seq_num): 68 | seq_i = seq_index_layer([sequence_feature_input, i]) 69 | seq_i_embeddings = emb_layer(seq_i) 70 | seq_i_lstm = layers.GlobalAveragePooling1D()(seq_i_embeddings) 71 | seqs_lstm.append(seq_i_lstm) 72 | 73 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0] 74 | 75 | return seqs_embeddings 76 | 77 | 78 | def sequence_input_LSTM(sequence_feature_input, config): 79 | category_hash_size = config['category_hash_size'] 80 | hidden_unit = config['hidden_units'] 81 | emb_size = config['emb_size'] 82 | seq_num = config['seq_num'] 83 | 84 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 85 | 86 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 87 | 88 | seqs_lstm = [] 89 | for i in range(seq_num): 90 | seq_i = seq_index_layer([sequence_feature_input, i]) 91 | seq_i_embeddings = emb_layer(seq_i) 92 | seq_i_lstm = layers.GRU(units=hidden_unit)(seq_i_embeddings) 93 | seqs_lstm.append(seq_i_lstm) 94 | 95 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0] 96 | 97 | return seqs_embeddings 98 | 99 | 100 | def sequence_input_attn(input, config): 101 | category_hash_size = config['category_hash_size'] 102 | hidden_unit = config['hidden_units'] 103 | emb_size = config['emb_size'] 104 | maxlen = config['maxlen'] 105 | batch_size = config['batch_size'] 106 | seq_num = config['seq_num'] 107 | 108 | sequence_feature_input = input[0] 109 | id_slate_input = input[1] 110 | 111 | sequence_length = tf.fill((tf.shape(sequence_feature_input)[0], 1), maxlen) 112 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 113 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 114 | id_slate_embeddings = emb_layer(id_slate_input) 115 | id_slate_pooling = tf.math.reduce_mean(id_slate_embeddings, axis=1, keepdims=True) 116 | seqs_attn = [] 117 | for i in range(seq_num): 118 | seq_i = seq_index_layer([sequence_feature_input, i]) 119 | seq_i_embeddings = emb_layer(seq_i) 120 | rnn_outputs = DynamicGRU(emb_size, return_sequence=True)([seq_i_embeddings, sequence_length]) 121 | scores = AttentionSequencePoolingLayer(att_hidden_units=(64, 16), return_score=True)([ 122 | id_slate_pooling, rnn_outputs, sequence_length]) 123 | final_state2 = DynamicGRU(emb_size * 2, gru_type='AUGRU', return_sequence=False 124 | )([rnn_outputs, sequence_length, tf.keras.layers.Permute([2, 1])(scores)]) 125 | seqs_attn.append(final_state2) 126 | 127 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_attn) if len(seqs_attn) > 1 else seqs_attn[0] 128 | 129 | return tf.squeeze(seqs_embeddings, axis=1) 130 | -------------------------------------------------------------------------------- /rl4rs/nets/cql/encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import copy 5 | from 
typing import Any, ClassVar, Dict, List, Optional, Sequence, Type, Union 6 | from d3rlpy.models.encoders import EncoderFactory, Encoder, VectorEncoderWithAction, _create_activation, VectorEncoder 7 | 8 | 9 | class CustomVectorEncoder(VectorEncoder): 10 | 11 | def __init__( 12 | self, 13 | config, 14 | action_size, 15 | mask_size, 16 | with_q, 17 | observation_shape: Sequence[int], 18 | hidden_units: Optional[Sequence[int]] = None, 19 | use_batch_norm: bool = False, 20 | dropout_rate: Optional[float] = None, 21 | use_dense: bool = False, 22 | activation: nn.Module = nn.ReLU(), 23 | ): 24 | super().__init__(observation_shape, hidden_units, use_batch_norm, dropout_rate, use_dense, activation) 25 | self.action_size = action_size 26 | self.mask_size = mask_size 27 | self.with_q = with_q 28 | self.emb_size = 32 29 | self.emb_layer = nn.Embedding(action_size, self.emb_size) 30 | self.fc2 = nn.Linear(self._feature_size + self.emb_size * mask_size, action_size) 31 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | location_mask = config['location_mask'] 33 | self.special_items = config['special_items'] 34 | self.location_mask = torch.tensor(location_mask, device=self.device) 35 | 36 | def get_feature_size(self) -> int: 37 | if not self.with_q: 38 | return self._feature_size + self.emb_size * self.mask_size 39 | else: 40 | return self.action_size 41 | 42 | def forward(self, x: torch.Tensor) -> torch.Tensor: 43 | batch_size = x.shape[0] 44 | # mask 45 | prev_actions = x[:, -self.mask_size:-1].to(torch.long) 46 | cur_step = x[:, -1].to(torch.long) 47 | x_mask_layer = cur_step % 9 // 3 48 | mask = self.location_mask[x_mask_layer] 49 | for i in range(self.mask_size-1): 50 | mask[range(batch_size), prev_actions[:, i]] = 0 51 | h = self._fc_encode(x) 52 | if self._use_batch_norm: 53 | h = self._bns[-1](h) 54 | if self._dropout_rate is not None: 55 | h = self._dropouts[-1](h) 56 | prev_action_emb = nn.Flatten()(self.emb_layer(x[:, -self.mask_size:].to(torch.long))) 57 | h = torch.cat([h, prev_action_emb], dim=-1) 58 | if self.with_q: 59 | h = self.fc2(h) 60 | action_mask = mask < 0.01 61 | # h[action_mask] = -2 ** 15 62 | h[action_mask] = 0 63 | for i in range(batch_size): 64 | if len(np.intersect1d(prev_actions[i].cpu().numpy(), self.special_items)) > 0: 65 | h[i][self.special_items] = 0 66 | # h[i][self.special_items] = -2 ** 15 67 | return h 68 | 69 | 70 | class CustomVectorEncoderFactory(EncoderFactory): 71 | TYPE: ClassVar[str] = "vector" 72 | _hidden_units: Sequence[int] 73 | _activation: str 74 | _use_batch_norm: bool 75 | _dropout_rate: Optional[float] 76 | _use_dense: bool 77 | 78 | def __init__( 79 | self, 80 | config, 81 | action_size, 82 | mask_size, 83 | with_q=False, 84 | hidden_units: Optional[Sequence[int]] = None, 85 | activation: str = "relu", 86 | use_batch_norm: bool = False, 87 | dropout_rate: Optional[float] = None, 88 | use_dense: bool = False, 89 | ): 90 | self.config = config 91 | self.action_size = action_size 92 | self.mask_size = mask_size 93 | self.with_q = with_q 94 | if hidden_units is None: 95 | self._hidden_units = [256] 96 | else: 97 | self._hidden_units = hidden_units 98 | self._activation = activation 99 | self._use_batch_norm = use_batch_norm 100 | self._dropout_rate = dropout_rate 101 | self._use_dense = use_dense 102 | 103 | def create(self, observation_shape: Sequence[int]) -> CustomVectorEncoder: 104 | assert len(observation_shape) == 1 105 | return CustomVectorEncoder( 106 | config=self.config, 107 | action_size=self.action_size, 
108 |             mask_size=self.mask_size,
109 |             with_q=self.with_q,
110 |             observation_shape=observation_shape,
111 |             hidden_units=self._hidden_units,
112 |             use_batch_norm=self._use_batch_norm,
113 |             dropout_rate=self._dropout_rate,
114 |             use_dense=self._use_dense,
115 |             activation=_create_activation(self._activation),
116 |         )
117 | 
118 |     def create_with_action(
119 |         self,
120 |         observation_shape: Sequence[int],
121 |         action_size: int,
122 |         discrete_action: bool = False,
123 |     ) -> VectorEncoderWithAction:
124 |         assert len(observation_shape) == 1
125 |         return VectorEncoderWithAction(
126 |             observation_shape=observation_shape,
127 |             action_size=action_size,
128 |             hidden_units=self._hidden_units,
129 |             use_batch_norm=self._use_batch_norm,
130 |             dropout_rate=self._dropout_rate,
131 |             use_dense=self._use_dense,
132 |             discrete_action=discrete_action,
133 |             activation=_create_activation(self._activation),
134 |         )
135 | 
136 |     def get_params(self, deep: bool = False) -> Dict[str, Any]:
137 |         if deep:
138 |             hidden_units = copy.deepcopy(self._hidden_units)
139 |         else:
140 |             hidden_units = self._hidden_units
141 |         params = {
142 |             "hidden_units": hidden_units,
143 |             "activation": self._activation,
144 |             "use_batch_norm": self._use_batch_norm,
145 |             "dropout_rate": self._dropout_rate,
146 |             "use_dense": self._use_dense,
147 |         }
148 |         return params
149 | -------------------------------------------------------------------------------- /rl4rs/utils/d3rlpy_scorer.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast
3 | from d3rlpy.metrics.scorer import AlgoProtocol, _make_batches
4 | from d3rlpy.dataset import Episode
5 | from rl4rs.policy.policy_model import policy_model
6 | 
7 | WINDOW_SIZE = 1024
8 | 
9 | 
10 | # modified from https://github.com/takuseno/d3rlpy/blob/master/d3rlpy/metrics/scorer.py
11 | def soft_opc_scorer(
12 |     return_threshold: float,
13 | ) -> Callable[[policy_model, List[Episode]], float]:
14 |     r"""Returns Soft Off-Policy Classification metrics.
15 | 
16 |     This function returns a scorer function, which is suitable for the
17 |     standard scikit-learn scorer style.
18 |     The metric of the scorer function evaluates the gap in action-value
19 |     estimation between the success episodes and all episodes.
20 |     If the learned Q-function is optimal, action-values in success episodes
21 |     are expected to be higher than the others.
22 |     A success episode is defined as an episode with a return above the given
23 |     threshold.
24 | 
25 |     .. math::
26 | 
27 |         \mathbb{E}_{s, a \sim D_{success}} [Q(s, a)]
28 |         - \mathbb{E}_{s, a \sim D} [Q(s, a)]
29 | 
30 |     .. code-block:: python
31 | 
32 |         from d3rlpy.datasets import get_cartpole
33 |         from d3rlpy.algos import DQN
34 |         from d3rlpy.metrics.scorer import soft_opc_scorer
35 |         from sklearn.model_selection import train_test_split
36 | 
37 |         dataset, _ = get_cartpole()
38 |         train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
39 | 
40 |         scorer = soft_opc_scorer(return_threshold=180)
41 | 
42 |         dqn = DQN()
43 |         dqn.fit(train_episodes,
44 |                 eval_episodes=test_episodes,
45 |                 scorers={'soft_opc': scorer})
46 | 
47 |     References:
48 |         * Irpan et al., Off-Policy Evaluation via Off-Policy Classification.
49 | 
50 | 
51 |     Args:
52 |         return_threshold: threshold of success episodes.
53 | 
54 |     Returns:
55 |         scorer function.
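    Note:
        in this repo the scorer receives a rl4rs ``policy_model`` wrapper
        rather than a raw d3rlpy algorithm, so Q-values come from
        ``policy_model.predict_q`` (which reverses any reward scaling) and
        frame stacking is read from ``algo.policy.n_frames``.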
56 | 
57 |     """
58 | 
59 |     def scorer(algo: policy_model, episodes: List[Episode]) -> float:
60 |         success_values = []
61 |         all_values = []
62 |         for episode in episodes:
63 |             is_success = episode.compute_return() >= return_threshold
64 |             for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames):
65 |                 values = algo.predict_q(batch.observations, batch.actions)
66 |                 values = cast(np.ndarray, values)
67 |                 all_values += values.reshape(-1).tolist()
68 |                 if is_success:
69 |                     success_values += values.reshape(-1).tolist()
70 |         return float(np.mean(success_values) - np.mean(all_values))
71 | 
72 |     return scorer
73 | 
74 | 
75 | def dynamics_reward_prediction_mean_error_scorer(
76 |     dynamics: policy_model, episodes: List[Episode]
77 | ) -> float:
78 |     r"""Returns the mean error of reward prediction.
79 | 
80 |     This metric suggests how well the dynamics model generalizes to test sets.
81 |     If the error is large, the dynamics model is overfitting.
82 | 
83 |     .. math::
84 | 
85 |         \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [(r_{t+1} - r')]
86 | 
87 |     where :math:`r' \sim T(s_t, a_t)`.
88 | 
89 |     Args:
90 |         dynamics: dynamics model.
91 |         episodes: list of episodes.
92 | 
93 |     Returns:
94 |         mean reward-prediction error (smaller is better).
95 | 
96 |     """
97 |     total_errors = []
98 |     for episode in episodes:
99 |         for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
100 |             pred = dynamics.predict_q(batch.observations, batch.actions)
101 |             rewards = batch.next_rewards
102 |             errors = (rewards - pred[1]).reshape(-1)
103 |             total_errors += errors.tolist()
104 |     # smaller is better
105 |     return float(np.mean(total_errors))
106 | 
107 | 
108 | def dynamics_reward_prediction_abs_mean_error_scorer(
109 |     dynamics: policy_model, episodes: List[Episode]
110 | ) -> float:
111 |     r"""Returns the mean absolute error of reward prediction.
112 | 
113 |     This metric suggests how well the dynamics model generalizes to test sets.
114 |     If the error is large, the dynamics model is overfitting.
115 | 
116 |     .. math::
117 | 
118 |         \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [abs(r_{t+1} - r')]
119 | 
120 |     where :math:`r' \sim T(s_t, a_t)`.
121 | 
122 |     Args:
123 |         dynamics: dynamics model.
124 |         episodes: list of episodes.
125 | 
126 |     Returns:
127 |         mean absolute reward-prediction error (smaller is better).
128 | 
129 |     """
130 |     total_errors = []
131 |     for episode in episodes:
132 |         for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
133 |             pred = dynamics.predict_q(batch.observations, batch.actions)
134 |             rewards = batch.next_rewards
135 |             errors = np.abs(rewards - pred[1]).reshape(-1)
136 |             total_errors += errors.tolist()
137 |     # smaller is better
138 |     return float(np.mean(total_errors))
139 | 
140 | def discrete_action_match_scorer(
141 |     algo: policy_model, episodes: List[Episode]
142 | ) -> float:
143 |     r"""Returns percentage of identical actions between algorithm and dataset.
144 | 
145 |     This metric suggests how different the greedy policy is from the given
146 |     episodes in a discrete action space.
147 |     If the given episodes are near-optimal, a larger percentage is
148 |     better.
149 | 
150 |     .. math::
151 | 
152 |         \frac{1}{N} \sum^N \mathbb{1}
153 |         \{a_t = \text{argmax}_a Q_\theta (s_t, a)\}
154 | 
155 |     Args:
156 |         algo: algorithm.
157 |         episodes: list of episodes.
158 | 
159 |     Returns:
160 |         percentage of identical actions.
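    Note:
        this variant calls ``predict_with_mask`` rather than a bare
        ``predict``, so the greedy action respects the location mask and
        special-item constraints before being compared with the logged
        action.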
161 | 162 | """ 163 | total_matches = [] 164 | for episode in episodes: 165 | for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames): 166 | actions = algo.predict_with_mask(batch.observations) 167 | match = (batch.actions.reshape(-1) == actions).tolist() 168 | total_matches += match 169 | return float(np.mean(total_matches)) -------------------------------------------------------------------------------- /script/exact_k_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys 3 | import gym 4 | import numpy as np 5 | import tensorflow as tf 6 | from rl4rs.nets.exact_k.model import Generator, Discriminator 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | from rl4rs.utils.fileutil import find_newest_files 10 | 11 | stage = sys.argv[1] 12 | extra_config = eval(sys.argv[2]) 13 | 14 | config = {"epoch": 10000, "maxlen": 64, "batch_size": 256, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 15 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9, 16 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../item_info.csv', 17 | "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": False, 'env': "SlateRecEnv-v0"} 18 | 19 | config = dict(config, **extra_config) 20 | 21 | if config['env'] == 'SeqSlateRecEnv-v0': 22 | config['max_steps'] = 36 23 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState) 24 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 25 | else: 26 | sim = SlateRecEnv(config, state_cls=SlateState) 27 | env = gym.make('SlateRecEnv-v0', recsim=sim) 28 | 29 | batch_size = config["batch_size"] 30 | action_size = config["action_size"] 31 | epoch = config["epoch"] 32 | max_steps = config["max_steps"] 33 | output_dir = os.environ['rl4rs_output_dir'] 34 | model_dir = '%s/%s/' % (output_dir, 'exactk_' + config['env'] + '_' + config['trial_name']) 35 | model_save_path = model_dir + 'exact_k.ckpt' 36 | restore_file = find_newest_files('exact_k.ckpt*', model_dir) 37 | restore_file = restore_file[:restore_file.rfind('.')] 38 | 39 | l0_ssr_mask = np.zeros(action_size) 40 | location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], action_size) 41 | l1_mask, l2_mask, l3_mask = location_mask[0], location_mask[1], location_mask[2] 42 | l0_ssr_mask[special_items] = 1 43 | 44 | with tf.name_scope('Generator'): 45 | g = Generator(l1_mask, 46 | l2_mask, 47 | l3_mask, 48 | l0_ssr_mask, 49 | is_training=True, 50 | seq_length=action_size) 51 | 52 | with tf.name_scope('Discriminator'): 53 | d = Discriminator(seq_length=action_size) 54 | 55 | print("Graph loaded") 56 | 57 | if config.get('gpu', True): 58 | gpu_options = tf.GPUOptions( 59 | per_process_gpu_memory_fraction=0.5, 60 | allow_growth=True) # seems to be not working 61 | sess_config = tf.ConfigProto(allow_soft_placement=True, 62 | gpu_options=gpu_options) 63 | else: 64 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 65 | sess_config = tf.ConfigProto() 66 | 67 | if stage == 'train': 68 | with tf.Session(config=sess_config) as sess: 69 | sess.run(tf.initialize_all_variables()) 70 | print('Generator training start!') 71 | reward_total = 0.0 72 | for episode in range(epoch): 73 | print('Generator episode: ', episode) 74 | 75 | observation = np.array(env.reset()) 76 | item_cand = np.array([list(range(0, 
44 | with tf.name_scope('Generator'):
45 |     g = Generator(l1_mask,
46 |                   l2_mask,
47 |                   l3_mask,
48 |                   l0_ssr_mask,
49 |                   is_training=True,
50 |                   seq_length=action_size)
51 | 
52 | with tf.name_scope('Discriminator'):
53 |     d = Discriminator(seq_length=action_size)
54 | 
55 | print("Graph loaded")
56 | 
57 | if config.get('gpu', True):
58 |     gpu_options = tf.GPUOptions(
59 |         per_process_gpu_memory_fraction=0.5,
60 |         allow_growth=True)  # seems to be not working
61 |     sess_config = tf.ConfigProto(allow_soft_placement=True,
62 |                                  gpu_options=gpu_options)
63 | else:
64 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
65 |     sess_config = tf.ConfigProto()
66 | 
67 | if stage == 'train':
68 |     with tf.Session(config=sess_config) as sess:
69 |         sess.run(tf.global_variables_initializer())
70 |         print('Generator training start!')
71 |         reward_total = 0.0
72 |         for episode in range(epoch):
73 |             print('Generator episode: ', episode)
74 | 
75 |             observation = np.array(env.reset())
76 |             item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
77 |             hill_b_f = []
78 |             for i in range(2):
79 |                 # get action
80 |                 sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
81 |                                                           feed_dict={g.user: observation, g.item_cand: item_cand})
82 |                 for step in range(config['max_steps']):
83 |                     observation_, reward, done, info = env.step(sampled_card[:, step])
84 | 
85 |                 env.reset()
86 |                 # collect (card, card_idx, reward) candidates for hill-climbing selection
87 |                 hill_b_f.append(list(zip(sampled_card, sampled_card_idx, reward)))
88 | 
89 |             b_hill_f = np.transpose(hill_b_f, [1, 0, 2])
90 |             samples = []
91 |             for hill_f in b_hill_f:
92 |                 sorted_list = sorted(hill_f, key=lambda x: x[2], reverse=True)
93 |                 samples.append(sorted_list[0])  # np.random.choice(1) always returned 0; take the best-reward sample explicitly
94 | 
95 |             (sampled_card, sampled_card_idx, reward) = zip(*samples)
96 |             reward = np.array(reward)
97 | 
98 |             reward_ = sess.run(d.reward, feed_dict={d.user: observation})
99 |             sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})
100 | 
101 |             if episode % 50 == 0:
102 |                 print('episode:', episode)
103 |                 print('reward_target', np.mean(reward_))
104 |                 print('reward', np.mean(reward))
105 |                 print('actions', sampled_card[:10])
106 |             reward = (reward - reward_)
107 | 
108 |             reward = reward / np.std(reward)
109 | 
110 |             sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
111 |                                             g.reward: reward,
112 |                                             g.item_cand: item_cand,
113 |                                             g.user: observation,
114 |                                             })
115 |             gs_gen = sess.run(g.global_step)
116 | 
117 |             if episode % 500 == 0:
118 |                 saver = tf.train.Saver()
119 |                 saver.save(sess, model_save_path + '.' + str(episode))
120 |                 print('save model:' + model_save_path + '.' + str(episode))
121 |         print('Generator training done!')
122 |         saver = tf.train.Saver()
123 |         saver.save(sess, model_save_path + '.' + str(episode))
124 |         print('save model:' + model_save_path + '.' + str(episode))
125 |         print("Done")
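# Checkpoint note: models are saved as model_save_path + '.' + episode, so
# find_newest_files('exact_k.ckpt*', ...) matches a concrete file such as
# exact_k.ckpt.500.index; the rfind('.') slice near the top of this script
# strips that last suffix to recover the prefix tf.train.Saver.restore() expects.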
126 | 
127 | if stage == 'eval':
128 |     with tf.Session(config=sess_config) as sess:
129 |         sess.run(tf.global_variables_initializer())
130 |         saver = tf.train.Saver()
131 |         saver.restore(sess, restore_file)
132 |         print('restore exact-k model from %s' % restore_file)
133 |         episode_reward = 0
134 |         done = False
135 |         epoch = 4
136 |         for i in range(epoch):
137 |             observation = np.array(env.reset())
138 |             item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
139 |             sampled_card_idx, sampled_card = sess.run([g.greedy_path, g.greedy_result],
140 |                                                       feed_dict={g.user: observation, g.item_cand: item_cand})
141 |             for step in range(config['max_steps']):
142 |                 observation_, reward, done, info = env.step(sampled_card[:, step])
143 |                 episode_reward += sum(reward)
144 |             print('actions', sampled_card[:10])
145 |             print('avg reward', episode_reward / config['batch_size'] / (i + 1))
146 | 
--------------------------------------------------------------------------------
/rl4rs/server/gymHttpClient.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import six.moves.urllib.parse as urlparse
3 | import json
4 | import numpy as np
5 | import os
6 | import gym
7 | 
8 | import logging
9 | 
10 | logger = logging.getLogger(__name__)
11 | logger.setLevel(logging.INFO)
12 | 
13 | 
14 | # modified from https://github.com/openai/gym-http-api
15 | class Client(object):
16 |     """
17 |     Gym client to interface with gym_http_server
18 |     """
19 | 
20 |     def __init__(self, remote_base):
21 |         self.remote_base = remote_base
22 |         self.session = requests.Session()
23 |         self.session.headers.update({'Content-type': 'application/json'})
24 | 
25 |     def _parse_server_error_or_raise_for_status(self, resp):
26 |         j = {}
27 |         try:
28 |             j = resp.json()
29 |         except Exception:
30 |             # Most likely json parse failed because of network error, not server error (server
31 |             # sends its errors in json). Don't let parse exception go up, but rather raise default
32 |             # error.
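            # If parsing failed, j stays {}. The raise_for_status() below raises
            # for HTTP error statuses; a 200 response with an unparseable body
            # falls through and this method ultimately returns the empty dict.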
33 | resp.raise_for_status() 34 | if resp.status_code != 200 and "message" in j: # descriptive message from server side 35 | raise ServerError(message=j["message"], status_code=resp.status_code) 36 | resp.raise_for_status() 37 | return j 38 | 39 | def _post_request(self, route, data): 40 | url = urlparse.urljoin(self.remote_base, route) 41 | # logger.info("POST {}\n{}".format(url, json.dumps(data))) 42 | resp = self.session.post(urlparse.urljoin(self.remote_base, route), 43 | data=json.dumps(data)) 44 | return self._parse_server_error_or_raise_for_status(resp) 45 | 46 | def _get_request(self, route): 47 | url = urlparse.urljoin(self.remote_base, route) 48 | # logger.info("GET {}".format(url)) 49 | resp = self.session.get(url) 50 | return self._parse_server_error_or_raise_for_status(resp) 51 | 52 | def env_create(self, env_id, config={}): 53 | route = '/v1/envs/' 54 | data = {'env_id': env_id, 'config': config} 55 | resp = self._post_request(route, data) 56 | instance_id = resp['instance_id'] 57 | return instance_id 58 | 59 | def env_list_all(self): 60 | route = '/v1/envs/' 61 | resp = self._get_request(route) 62 | all_envs = resp['all_envs'] 63 | return all_envs 64 | 65 | def env_reset(self, instance_id): 66 | route = '/v1/envs/{}/reset/'.format(instance_id) 67 | resp = self._post_request(route, None) 68 | if 'observation' in resp: 69 | observation = resp['observation'] 70 | else: 71 | resp = self._post_request(route, None) 72 | observation = resp['observation'] 73 | return observation 74 | 75 | def env_step(self, instance_id, action, render=False): 76 | route = '/v1/envs/{}/step/'.format(instance_id) 77 | data = {'action': action, 'render': render} 78 | resp = self._post_request(route, data) 79 | observation = resp['observation'] 80 | reward = resp['reward'] 81 | done = resp['done'] 82 | info = resp['info'] 83 | return [observation, reward, done, info] 84 | 85 | def env_action_space_info(self, instance_id): 86 | route = '/v1/envs/{}/action_space/'.format(instance_id) 87 | resp = self._get_request(route) 88 | info = resp['info'] 89 | return info 90 | 91 | def env_action_space_sample(self, instance_id): 92 | route = '/v1/envs/{}/action_space/sample'.format(instance_id) 93 | resp = self._get_request(route) 94 | action = resp['action'] 95 | return action 96 | 97 | def env_action_space_contains(self, instance_id, x): 98 | route = '/v1/envs/{}/action_space/contains/{}'.format(instance_id, x) 99 | resp = self._get_request(route) 100 | member = resp['member'] 101 | return member 102 | 103 | def env_observation_space_info(self, instance_id): 104 | route = '/v1/envs/{}/observation_space/'.format(instance_id) 105 | resp = self._get_request(route) 106 | info = resp['info'] 107 | return info 108 | 109 | def env_observation_space_contains(self, instance_id, params): 110 | route = '/v1/envs/{}/observation_space/contains'.format(instance_id) 111 | resp = self._post_request(route, params) 112 | member = resp['member'] 113 | return member 114 | 115 | def env_monitor_start(self, instance_id, directory, 116 | force=False, resume=False, video_callable=False): 117 | route = '/v1/envs/{}/monitor/start/'.format(instance_id) 118 | data = {'directory': directory, 119 | 'force': force, 120 | 'resume': resume, 121 | 'video_callable': video_callable} 122 | self._post_request(route, data) 123 | 124 | def env_monitor_close(self, instance_id): 125 | route = '/v1/envs/{}/monitor/close/'.format(instance_id) 126 | self._post_request(route, None) 127 | 128 | def env_close(self, instance_id): 129 | route = 
'/v1/envs/{}/close/'.format(instance_id) 130 | self._post_request(route, None) 131 | 132 | def upload(self, training_dir, algorithm_id=None, api_key=None): 133 | if not api_key: 134 | api_key = os.environ.get('OPENAI_GYM_API_KEY') 135 | 136 | route = '/v1/upload/' 137 | data = {'training_dir': training_dir, 138 | 'algorithm_id': algorithm_id, 139 | 'api_key': api_key} 140 | self._post_request(route, data) 141 | 142 | def shutdown_server(self): 143 | route = '/v1/shutdown/' 144 | self._post_request(route, None) 145 | 146 | 147 | class ServerError(Exception): 148 | def __init__(self, message, status_code=None): 149 | Exception.__init__(self) 150 | self.message = message 151 | if status_code is not None: 152 | self.status_code = status_code 153 | 154 | 155 | if __name__ == '__main__': 156 | remote_base = 'http://127.0.0.1:5000' 157 | client = Client(remote_base) 158 | 159 | # Create environment 160 | env_id = 'CartPole-v0' 161 | instance_id = client.env_create(env_id) 162 | print(instance_id) 163 | # Check properties 164 | all_envs = client.env_list_all() 165 | action_info = client.env_action_space_info(instance_id) 166 | obs_info = client.env_observation_space_info(instance_id) 167 | print(obs_info) 168 | # Run a single step 169 | client.env_monitor_start(instance_id, directory='tmp', force=True) 170 | init_obs = client.env_reset(instance_id) 171 | [observation, reward, done, info] = client.env_step(instance_id, 1, False) 172 | client.env_monitor_close(instance_id) 173 | print(observation, reward, done, info) 174 | # client.upload(training_dir='tmp') 175 | -------------------------------------------------------------------------------- /script/batchrl_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import d3rlpy 5 | import sys 6 | import torch 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | from script import batchrl_trainer 10 | from d3rlpy.dataset import MDPDataset 11 | from script.offline_evaluation import ope_eval 12 | from rl4rs.policy.behavior_model import behavior_model 13 | from rl4rs.policy.policy_model import policy_model 14 | from rl4rs.nets.cql.encoder import CustomVectorEncoderFactory 15 | from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer 16 | from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer 17 | from d3rlpy.metrics.scorer import dynamics_prediction_variance_scorer 18 | 19 | algo = sys.argv[1] 20 | stage = sys.argv[2] 21 | extra_config = eval(sys.argv[3]) if len(sys.argv) >= 4 else {} 22 | 23 | config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 24 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, 25 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_shuf.csv', 26 | "model_file": "../output/rl4rs_dataset_a_dnn/model", 'gpu': True, "page_items": 9, 'action_emb_size':32, 27 | "iteminfo_file": '../dataset/item_info.csv', "support_d3rl_mask": True, "is_eval": True, 28 | "CQL_alpha": 1, 'env': 'SlateRecEnv-v0', 'trial_name': 'a_all'} 29 | 30 | config = dict(config, **extra_config) 31 | 32 | if config['env'] == 'SeqSlateRecEnv-v0': 33 | config['max_steps'] = 36 34 | location_mask, special_items = SeqSlateState.get_mask_from_file(config['iteminfo_file'], config['action_size']) 35 | config['location_mask'] = location_mask 36 | 
config['special_items'] = special_items
37 | elif config['env'] == 'SlateRecEnv-v0':
38 |     location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], config['action_size'])
39 |     config['location_mask'] = location_mask
40 |     config['special_items'] = special_items
41 | else:
42 |     assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
43 | 
44 | if algo in ('MOPO', 'COMBO') or 'conti' in algo:
45 |     config["support_conti_env"] = True
46 | 
47 | if not config.get('gpu', True):
48 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
49 |     torch.cuda.is_available = lambda: False
50 |     print('CUDA disabled, torch.cuda.is_available() ->', torch.cuda.is_available())
51 | 
52 | if not config.get("support_conti_env", False):
53 |     trial_name = config['env'] + '_' + config['trial_name'] + '.h5'
54 | elif config.get("support_onehot_action", False):
55 |     config['action_emb_size'] = config["action_size"]
56 |     trial_name = config['env'] + '_' + config['trial_name'] + '_onehot.h5'
57 | else:
58 |     trial_name = config['env'] + '_' + config['trial_name'] + '_conti.h5'
59 | dataset_dir = os.environ['rl4rs_dataset_dir']
60 | output_dir = os.environ['rl4rs_output_dir']
61 | dataset_save_path = dataset_dir + '/' + trial_name
62 | dynamics_save_path = output_dir + '/' + 'dynamics' + '_' + trial_name
63 | model_save_path = output_dir + '/' + algo + '_' + trial_name
64 | scaler = None
65 | print(trial_name, config)
66 | 
67 | try:
68 |     dataset = MDPDataset.load(dataset_save_path)
69 | except Exception:
70 |     dataset = None
71 | 
72 | try:
73 |     dynamics = batchrl_trainer.get_model(config, 'dynamics')
74 |     dynamics = batchrl_trainer.build_with_dataset(dynamics, dataset)
75 |     dynamics.load_model(dynamics_save_path)
76 | except Exception:
77 |     dynamics = None
78 | 
79 | if stage == 'dataset_generate':
80 |     if config['env'] == 'SlateRecEnv-v0':
81 |         if not config.get("support_conti_env", False):
82 |             batchrl_trainer.data_generate_rl4rs_a(config, dataset_save_path)
83 |         else:
84 |             batchrl_trainer.data_generate_rl4rs_a_conti(config, dataset_save_path)
85 |     elif config['env'] == 'SeqSlateRecEnv-v0':
86 |         if not config.get("support_conti_env", False):
87 |             batchrl_trainer.data_generate_rl4rs_b(config, dataset_save_path)
88 |         else:
89 |             batchrl_trainer.data_generate_rl4rs_b_conti(config, dataset_save_path)
90 |     else:
91 |         # unsupported env: fail fast instead of silently generating the dataset-A variant
92 |         assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
93 | 
94 | if stage == 'train_dynamics' or (stage == 'train' and algo == 'dynamics'):
95 |     dynamics = batchrl_trainer.get_model(config, 'dynamics')
96 |     print('get_action_size', dataset.episodes[0].get_action_size())
97 |     dynamics.fit(dataset,
98 |                  eval_episodes=dataset.episodes[-3000:],
99 |                  n_epochs=10,
100 |                  show_progress=False,
101 |                  scorers={
102 |                      'observation_error': dynamics_observation_prediction_error_scorer,
103 |                      'reward_error': dynamics_reward_prediction_error_scorer,
104 |                      'variance': dynamics_prediction_variance_scorer,
105 |                  }
106 |                  )
107 |     dynamics.save_model(dynamics_save_path)
108 | 
109 | if stage == 'train':
110 |     model = batchrl_trainer.get_model(config, algo, dynamics)
111 |     model.fit(dataset,
112 |               eval_episodes=dataset.episodes[-3000:],
113 |               n_epochs=config['epoch'],
114 |               show_progress=False)
115 |     model.save_model(model_save_path)
116 | 
117 | if stage == 'eval':
118 |     default_soft_opc_score = 90 \
119 |         if config['env'] == 'SlateRecEnv-v0' \
120 |         else 90 * 2
121 |     soft_opc_score = config.get('soft_opc_score', default_soft_opc_score)
122 |     model = 
batchrl_trainer.get_model(config, algo, dynamics) 123 | model = batchrl_trainer.build_with_dataset(model, dataset) 124 | model.load_model(model_save_path) 125 | eval_episodes = random.sample(dataset.episodes, 2048 * 4) 126 | policy = policy_model(model, config=config) 127 | # batchrl_trainer.d3rlpy_eval(eval_episodes, policy, soft_opc_score) 128 | batchrl_trainer.evaluate(config, policy) 129 | 130 | if stage == 'ope': 131 | dataset_dir = os.environ['rl4rs_dataset_dir'] 132 | sample_model = behavior_model(config, modelfile=dataset_dir + '/logged_policy.h5') 133 | model = batchrl_trainer.get_model(config, algo, dynamics) 134 | model = batchrl_trainer.build_with_dataset(model, dataset) 135 | model.load_model(model_save_path) 136 | eval_config = config.copy() 137 | eval_config["is_eval"] = True 138 | eval_config["batch_size"] = 2048 139 | eval_config["epoch"] = 1 140 | if config['env'] == 'SeqSlateRecEnv-v0': 141 | config['max_steps'] = 36 142 | sim = SeqSlateRecEnv(eval_config, state_cls=SeqSlateState) 143 | eval_env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 144 | else: 145 | sim = SlateRecEnv(eval_config, state_cls=SlateState) 146 | eval_env = gym.make('SlateRecEnv-v0', recsim=sim) 147 | ope_eval(eval_config, eval_env, model, sample_model=sample_model) 148 | -------------------------------------------------------------------------------- /reproductions/run_modelfree_rl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | algo=$1 12 | 13 | cd ${script_dir} 14 | 15 | # experiment in a_all env, train in a_all sample and test in a_all sample 16 | python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 17 | python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 18 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 19 | 20 | 21 | # experiment in a_all env, train in a_train sample and test in a_test sample 22 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 23 | #python -u modelfree_train.py 
$algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 24 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 25 | 26 | 27 | # experiment train in a_sl env and test in a_rl env 28 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 29 | #python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 30 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 31 | 32 | 33 | # experiment in b_all env, train in b_all sample and test in b_all sample 34 | python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 35 | python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 36 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 37 | 38 | 39 | # experiment in b_all env, train in b_train sample and test in b_test sample 40 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 41 | #python -u modelfree_train.py 
$algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 42 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 43 | 44 | 45 | # experiment train in b_sl env and test in b_rl env 46 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 47 | #python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 48 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 49 | 50 | echo "1" 51 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rl4rs 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=4.5=1_gnu 7 | - _tflow_select=2.3.0=mkl 8 | - absl-py=0.15.0=pyhd3eb1b0_0 9 | - argon2-cffi=20.1.0=py36h27cfd23_1 10 | - astor=0.8.1=py36h06a4308_0 11 | - async_generator=1.10=py36h28b3542_0 12 | - attrs=21.2.0=pyhd3eb1b0_0 13 | - backcall=0.2.0=pyhd3eb1b0_0 14 | - blas=1.0=mkl 15 | - bleach=4.0.0=pyhd3eb1b0_0 16 | - c-ares=1.17.1=h27cfd23_0 17 | - ca-certificates=2021.10.26=h06a4308_2 18 | - certifi=2021.5.30=py36h06a4308_0 19 | - cffi=1.14.6=py36h400218f_0 20 | - coverage=5.5=py36h27cfd23_2 21 | - cython=0.29.24=py36h295c915_0 22 | - dataclasses=0.8=pyh4f3eec9_6 23 | - dbus=1.13.18=hb2f20db_0 24 | - decorator=5.1.0=pyhd3eb1b0_0 25 | - defusedxml=0.7.1=pyhd3eb1b0_0 26 | - entrypoints=0.3=py36_0 27 | - expat=2.4.1=h2531618_2 28 | - fontconfig=2.13.1=h6c09931_0 29 | - freetype=2.11.0=h70c0345_0 30 | - glib=2.69.1=h5202010_0 31 | - google-pasta=0.2.0=pyhd3eb1b0_0 32 | - grpcio=1.36.1=py36h2157cd5_1 33 | - gst-plugins-base=1.14.0=h8213a91_2 34 | - gstreamer=1.14.0=h28cd5cc_2 35 | - h5py=2.10.0=py36hd6299e0_1 36 | - hdf5=1.10.6=hb1b8bf9_0 37 | - icu=58.2=he6710b0_3 38 | - importlib-metadata=4.8.1=py36h06a4308_0 39 | - importlib_metadata=4.8.1=hd3eb1b0_0 40 | - intel-openmp=2021.4.0=h06a4308_3561 41 | - ipykernel=5.3.4=py36h5ca1d4c_0 42 | - ipython=7.16.1=py36h5ca1d4c_0 43 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 44 | - 
ipywidgets=7.6.5=pyhd3eb1b0_1 45 | - jedi=0.17.0=py36_0 46 | - jinja2=3.0.2=pyhd3eb1b0_0 47 | - jpeg=9d=h7f8727e_0 48 | - jsonschema=3.2.0=pyhd3eb1b0_2 49 | - jupyter=1.0.0=py36_7 50 | - jupyter_client=7.1.0=pyhd3eb1b0_0 51 | - jupyter_console=6.4.0=pyhd3eb1b0_0 52 | - jupyter_core=4.8.1=py36h06a4308_0 53 | - jupyterlab_pygments=0.1.2=py_0 54 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 55 | - keras-applications=1.0.8=py_1 56 | - keras-preprocessing=1.1.2=pyhd3eb1b0_0 57 | - ld_impl_linux-64=2.35.1=h7274673_9 58 | - libffi=3.3=he6710b0_2 59 | - libgcc-ng=9.3.0=h5101ec6_17 60 | - libgfortran-ng=7.5.0=ha8ba4b0_17 61 | - libgfortran4=7.5.0=ha8ba4b0_17 62 | - libgomp=9.3.0=h5101ec6_17 63 | - libpng=1.6.37=hbc83047_0 64 | - libprotobuf=3.17.2=h4ff587b_1 65 | - libsodium=1.0.18=h7b6447c_0 66 | - libstdcxx-ng=9.3.0=hd4cf53a_17 67 | - libuuid=1.0.3=h7f8727e_2 68 | - libxcb=1.14=h7b6447c_0 69 | - libxml2=2.9.12=h03d6c58_0 70 | - markdown=3.3.4=py36h06a4308_0 71 | - markupsafe=2.0.1=py36h27cfd23_0 72 | - mistune=0.8.4=py36h7b6447c_0 73 | - mkl=2020.2=256 74 | - mkl-service=2.3.0=py36he8ac12f_0 75 | - mkl_fft=1.3.0=py36h54f3939_0 76 | - mkl_random=1.1.1=py36h0573a6f_0 77 | - nbclient=0.5.3=pyhd3eb1b0_0 78 | - nbconvert=6.0.7=py36_0 79 | - nbformat=5.1.3=pyhd3eb1b0_0 80 | - ncurses=6.3=h7f8727e_2 81 | - nest-asyncio=1.5.1=pyhd3eb1b0_0 82 | - notebook=6.4.3=py36h06a4308_0 83 | - numpy=1.19.2=py36h54aff64_0 84 | - numpy-base=1.19.2=py36hfa32c7d_0 85 | - openssl=1.1.1m=h7f8727e_0 86 | - packaging=21.3=pyhd3eb1b0_0 87 | - pandoc=2.12=h06a4308_0 88 | - pandocfilters=1.4.3=py36h06a4308_1 89 | - parso=0.8.2=pyhd3eb1b0_0 90 | - pcre=8.45=h295c915_0 91 | - pexpect=4.8.0=pyhd3eb1b0_3 92 | - pickleshare=0.7.5=pyhd3eb1b0_1003 93 | - pip=21.2.2=py36h06a4308_0 94 | - prometheus_client=0.12.0=pyhd3eb1b0_0 95 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 96 | - prompt_toolkit=3.0.20=hd3eb1b0_0 97 | - ptyprocess=0.7.0=pyhd3eb1b0_2 98 | - pycparser=2.21=pyhd3eb1b0_0 99 | - pygments=2.10.0=pyhd3eb1b0_0 100 | - pyparsing=3.0.4=pyhd3eb1b0_0 101 | - pyqt=5.9.2=py36h05f1152_2 102 | - python=3.6.13=h12debd9_1 103 | - python-dateutil=2.8.2=pyhd3eb1b0_0 104 | - pyzmq=22.2.1=py36h295c915_1 105 | - qt=5.9.7=h5867ecd_1 106 | - qtconsole=5.1.1=pyhd3eb1b0_0 107 | - qtpy=1.10.0=pyhd3eb1b0_0 108 | - readline=8.1=h27cfd23_0 109 | - scipy=1.3.1=py36h7c811a0_0 110 | - send2trash=1.8.0=pyhd3eb1b0_1 111 | - setuptools=58.0.4=py36h06a4308_0 112 | - sip=4.19.8=py36hf484d3e_0 113 | - six=1.16.0=pyhd3eb1b0_0 114 | - sqlite=3.37.0=hc218d9a_0 115 | - termcolor=1.1.0=py36h06a4308_1 116 | - terminado=0.9.4=py36h06a4308_0 117 | - testpath=0.5.0=pyhd3eb1b0_0 118 | - tk=8.6.11=h1ccaba5_0 119 | - tornado=6.1=py36h27cfd23_0 120 | - traitlets=4.3.3=py36h06a4308_0 121 | - typing_extensions=3.10.0.2=pyh06a4308_0 122 | - wcwidth=0.2.5=pyhd3eb1b0_0 123 | - webencodings=0.5.1=py36_1 124 | - werkzeug=2.0.2=pyhd3eb1b0_0 125 | - wheel=0.37.0=pyhd3eb1b0_1 126 | - widgetsnbextension=3.5.1=py36_0 127 | - wrapt=1.12.1=py36h7b6447c_1 128 | - xz=5.2.5=h7b6447c_0 129 | - zeromq=4.3.4=h2531618_0 130 | - zipp=3.6.0=pyhd3eb1b0_0 131 | - zlib=1.2.11=h7f8727e_4 132 | - pip: 133 | - aiohttp==3.7.4.post0 134 | - aiohttp-cors==0.7.0 135 | - aioredis==1.3.1 136 | - antlr4-python3-runtime==4.8 137 | - async-timeout==3.0.1 138 | - blessings==1.7 139 | - cachetools==4.2.2 140 | - chardet==4.0.0 141 | - charset-normalizer==2.0.6 142 | - click==8.0.1 143 | - cloudpickle==1.6.0 144 | - colorama==0.4.4 145 | - contextvars==2.4 146 | - d3rlpy==0.91 147 | - deepctr==0.9.0 148 | - dm-tree==0.1.6 
149 | - fairseq==0.10.2 150 | - filelock==3.0.12 151 | - flask==1.1.2 152 | - gast==0.2.2 153 | - google-api-core==1.31.2 154 | - google-auth==1.35.0 155 | - googleapis-common-protos==1.53.0 156 | - gpustat==0.6.0 157 | - gputil==1.4.0 158 | - greenlet==1.1.2 159 | - gym==0.19.0 160 | - hiredis==2.0.0 161 | - hydra-core==1.1.1 162 | - idna==3.2 163 | - idna-ssl==1.1.0 164 | - immutables==0.16 165 | - importlib-resources==5.2.2 166 | - itsdangerous==2.0.1 167 | - joblib==1.0.1 168 | - keras==2.2.5 169 | - keras-embed-sim==0.10.0 170 | - keras-layer-normalization==0.16.0 171 | - keras-multi-head==0.29.0 172 | - keras-pos-embd==0.13.0 173 | - keras-position-wise-feed-forward==0.8.0 174 | - keras-self-attention==0.51.0 175 | - keras-transformer==0.40.0 176 | - lightseq==2.1.4 177 | - lz4==3.1.3 178 | - msgpack==1.0.2 179 | - multidict==5.1.0 180 | - ninja==1.10.2 181 | - nvidia-ml-py3==7.352.0 182 | - omegaconf==2.1.1 183 | - opencensus==0.7.13 184 | - opencensus-context==0.1.2 185 | - opencv-python-headless==4.3.0.36 186 | - opt-einsum==3.3.0 187 | - pandas==1.1.5 188 | - pandasql==0.7.3 189 | - portalocker==2.3.2 190 | - protobuf==3.19.3 191 | - psutil==5.8.0 192 | - py-spy==0.3.9 193 | - pyasn1==0.4.8 194 | - pyasn1-modules==0.2.8 195 | - pydantic==1.8.2 196 | - pyrsistent==0.18.0 197 | - pytz==2021.1 198 | - pyyaml==5.4.1 199 | - ray==1.5.1 200 | - redis==3.5.3 201 | - regex==2021.8.28 202 | - requests==2.27.1 203 | - rsa==4.7.2 204 | - sacrebleu==2.0.0 205 | - sacremoses==0.0.45 206 | - scikit-learn==0.24.2 207 | - sklearn==0.0 208 | - sqlalchemy==1.4.29 209 | - structlog==21.3.0 210 | - tabulate==0.8.9 211 | - tensorboard==1.15.0 212 | - tensorboardx==2.4.1 213 | - tensorflow-estimator==1.15.1 214 | - tensorflow-gpu==1.15.0 215 | - threadpoolctl==2.2.0 216 | - torch==1.9.0 217 | - tqdm==4.62.2 218 | - urllib3==1.26.6 219 | - yarl==1.6.3 220 | prefix: /project/miniconda3/envs/rl4rs -------------------------------------------------------------------------------- /reproductions/run_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 
6 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 7 | script_dir=${rl4rs_benchmark_dir}/script 8 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 9 | mkdir $rl4rs_output_dir 10 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 11 | 12 | cd $rl4rs_dataset_dir 13 | 14 | #raw dataset 15 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a.csv && 16 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv >> rl4rs_dataset_a.csv && 17 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b.csv && 18 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_rl.csv >> rl4rs_dataset_b.csv && 19 | 20 | #train/test split 21 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_train.csv && 22 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_test.csv && 23 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_train.csv && 24 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_test.csv && 25 | 26 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_train.csv && 27 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_test.csv && 28 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_train.csv && 29 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_test.csv && 30 | 31 | cat rl4rs_dataset_a_sl_train.csv > rl4rs_dataset_a_train.csv && 32 | cat rl4rs_dataset_a_rl_train.csv >> rl4rs_dataset_a_train.csv && 33 | cat rl4rs_dataset_b_sl_train.csv > rl4rs_dataset_b_train.csv && 34 | cat rl4rs_dataset_b_rl_train.csv >> rl4rs_dataset_b_train.csv && 35 | 36 | cat rl4rs_dataset_a_sl_test.csv > rl4rs_dataset_a_test.csv && 37 | cat rl4rs_dataset_a_rl_test.csv >> rl4rs_dataset_a_test.csv && 38 | cat rl4rs_dataset_b_sl_test.csv > rl4rs_dataset_b_test.csv && 39 | cat rl4rs_dataset_b_rl_test.csv >> rl4rs_dataset_b_test.csv && 40 | 41 | cat rl4rs_dataset_a_train.csv > rl4rs_dataset_a.csv && 42 | cat rl4rs_dataset_a_test.csv >> rl4rs_dataset_a.csv && 43 | cat rl4rs_dataset_b_train.csv > rl4rs_dataset_b.csv && 44 | cat rl4rs_dataset_b_test.csv >> rl4rs_dataset_b.csv && 45 | 46 | #dataset_b 47 | cd ${script_dir} && 48 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "data_augment" && 49 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "data_augment" && 50 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "data_augment" && 51 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "data_augment" && 52 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "data_augment" && 53 | 54 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl.csv" "slate2trajectory" && 55 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl.csv" "slate2trajectory" && 56 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_train.csv" "slate2trajectory" && 57 | python data_preprocess.py 
"${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_test.csv" "slate2trajectory" && 58 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3.csv" "slate2trajectory" && 59 | 60 | 61 | #shuffle for RL Env. 62 | cd $rl4rs_dataset_dir && 63 | cat rl4rs_dataset_a.csv|shuf > rl4rs_dataset_a_shuf.csv && 64 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv|shuf > rl4rs_dataset_a_sl_shuf.csv && 65 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv|shuf > rl4rs_dataset_a_rl_shuf.csv && 66 | cat rl4rs_dataset_a_train.csv|shuf > rl4rs_dataset_a_train_shuf.csv && 67 | cat rl4rs_dataset_a_test.csv|shuf > rl4rs_dataset_a_test_shuf.csv && 68 | cat rl4rs_dataset_b3.csv|shuf > rl4rs_dataset_b3_shuf.csv && 69 | cat rl4rs_dataset_b3_sl.csv|shuf > rl4rs_dataset_b3_sl_shuf.csv && 70 | cat rl4rs_dataset_b3_rl.csv|shuf > rl4rs_dataset_b3_rl_shuf.csv && 71 | cat rl4rs_dataset_b3_train.csv|shuf > rl4rs_dataset_b3_train_shuf.csv && 72 | cat rl4rs_dataset_b3_test.csv|shuf > rl4rs_dataset_b3_test_shuf.csv && 73 | 74 | 75 | cd $(dirname $script_abs) && 76 | bash file_split.sh "rl4rs_dataset_a_sl_shuf.csv" && 77 | bash file_split.sh "rl4rs_dataset_a_rl_shuf.csv" && 78 | bash file_split.sh "rl4rs_dataset_a_train_shuf.csv" && 79 | bash file_split.sh "rl4rs_dataset_a_test_shuf.csv" && 80 | bash file_split.sh "rl4rs_dataset_a_shuf.csv" && 81 | bash file_split.sh "rl4rs_dataset_b2_sl.csv" && 82 | bash file_split.sh "rl4rs_dataset_b2_rl.csv" && 83 | bash file_split.sh "rl4rs_dataset_b2_train.csv" && 84 | bash file_split.sh "rl4rs_dataset_b2_test.csv" && 85 | bash file_split.sh "rl4rs_dataset_b2.csv" 86 | 87 | 88 | #tfrecord for supervised learning 89 | cd ${script_dir} 90 | 91 | for ((i=0;i<5;i=i+1)) 92 | do 93 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord.${i}" "tfrecord_item" && 94 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord.${i}" "tfrecord_item" && 95 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord.${i}" "tfrecord_item" && 96 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_test_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord.${i}" "tfrecord_item" && 97 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord.${i}" "tfrecord_item" && 98 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord.${i}" "tfrecord_item" && 99 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord.${i}" "tfrecord_item" && 100 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord.${i}" "tfrecord_item" && 101 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_test.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord.${i}" "tfrecord_item" && 102 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord.${i}" "tfrecord_item" && 103 | echo "1" 104 | done 105 | 106 | cd ${script_dir} && 107 | python data_preprocess.py 
"${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "tfrecord_slate" && 108 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "tfrecord_slate" && 109 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "tfrecord_slate" && 110 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "tfrecord_slate" && 111 | 112 | echo "1" 113 | -------------------------------------------------------------------------------- /script/data_preprocess.py: -------------------------------------------------------------------------------- 1 | from rl4rs.utils.datautil import FeatureUtil 2 | import numpy as np 3 | import sys, os, random 4 | 5 | 6 | def data_augment(file, out_file): 7 | f = open(out_file, 'w') 8 | data = open(file, 'r').read().split('\n') 9 | data_size = len(data) 10 | print('data length', data_size) 11 | tmp = [] 12 | role_id_prev = None 13 | for record in data: 14 | if len(record) < 1 or 'timestamp' in record: 15 | continue 16 | role_id = record.split('@')[1] 17 | if role_id == role_id_prev or role_id_prev is None: 18 | tmp.append(record) 19 | role_id_prev = role_id 20 | else: 21 | assert len(tmp) <= 4 22 | for i in range(len(tmp), 4): 23 | timestamp, session_id, sequence_id, exposed_items, user_feedback, \ 24 | user_seqfeature, user_protrait, item_feature, behavior_policy_id = tmp[-1].split('@') 25 | timestamp_new = str(int(timestamp) + 1) 26 | sequence_id_new = str(int(sequence_id) + 1) 27 | random_i = np.random.randint(1, data_size - 1) 28 | exposed_items_new = data[random_i].split('@')[3] 29 | item_feature_new = data[random_i].split('@')[7] 30 | user_feedback_new = '0,0,0,0,0,0,0,0,0' 31 | tmp.append('@'.join([ 32 | timestamp_new, 33 | session_id, 34 | sequence_id_new, 35 | exposed_items_new, 36 | user_feedback_new, 37 | user_seqfeature, 38 | user_protrait, 39 | item_feature_new, 40 | behavior_policy_id 41 | ])) 42 | print(*tmp, sep='\n', end='\n', file=f) 43 | tmp = [record] 44 | role_id_prev = role_id 45 | f.close() 46 | 47 | 48 | def slate2trajectory(file, out_file): 49 | f = open(out_file, 'w') 50 | data = open(file, 'r').read().split('\n') 51 | data_size = len(data) 52 | print('data length', data_size) 53 | tmp = [] 54 | role_id_prev = None 55 | for record in data: 56 | if len(record) < 1 or 'timestamp' in record: 57 | continue 58 | role_id = record.split('@')[1] 59 | if role_id == role_id_prev or role_id_prev is None: 60 | tmp.append(record) 61 | role_id_prev = role_id 62 | else: 63 | assert len(tmp) == 4 64 | # timestamp, session_id, sequence_id, exposed_items, user_feedback, user_seqfeature, user_protrait, item_feature, behavior_policy_id 65 | timestamp = tmp[0].split('@')[0] 66 | session_id = tmp[0].split('@')[1] 67 | sequence_id = '1' 68 | exposed_items = ','.join([x.split('@')[3] for x in tmp]) 69 | user_feedback = ','.join([x.split('@')[4] for x in tmp]) 70 | user_seqfeature = tmp[0].split('@')[5] 71 | user_protrait = tmp[0].split('@')[6] 72 | item_feature = ';'.join([x.split('@')[7] for x in tmp]) 73 | behavior_policy_id = tmp[0].split('@')[8] 74 | traj = [ 75 | timestamp, 76 | session_id, 77 | sequence_id, 78 | exposed_items, 79 | user_feedback, 80 | user_seqfeature, 81 | user_protrait, 82 | item_feature, 83 | behavior_policy_id 84 | ] 85 | print(*traj, sep='@', 
end='\n', file=f) 86 | tmp = [record] 87 | role_id_prev = role_id 88 | f.close() 89 | 90 | 91 | def dataset2tfrecord(config, file, tfrecord_file, is_slate): 92 | def feature_construct(session, is_slate): 93 | samples = [] 94 | for i in range(len(session)): 95 | _, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \ 96 | user_protrait, item_feature, _ = FeatureUtil.record_split(session[i]) 97 | assert sequence_id - 1 == i 98 | user_protrait_category = user_protrait[:10] 99 | user_protrait_dense = user_protrait[10:] 100 | category_feature = user_protrait_category + [sequence_id] + exposed_items 101 | prev_items = [session[ii].split('@')[3].split(',')[jj] for ii in range(i) for jj in range(9)] 102 | prev_items = list(map(int, prev_items)) 103 | sequence_feature_clicked = prev_items if i > 0 else [0] 104 | sequence_feature = [user_seqfeature, sequence_feature_clicked] 105 | if is_slate: 106 | # label = '0' 107 | label = 0 108 | samples.append(( 109 | role_id_prev, 110 | sequence_feature, 111 | user_protrait_dense + item_feature, 112 | category_feature, 113 | user_feedback, 114 | label 115 | )) 116 | else: 117 | for j in range(9): 118 | item_id = exposed_items[j] 119 | label = user_feedback[j] 120 | item_feature_size = len(item_feature) // 9 121 | item_feature_j = item_feature[item_feature_size * j:item_feature_size * (j + 1)] 122 | category_feature_j = category_feature + [item_id] 123 | dense_feature_j = item_feature + item_feature_j 124 | samples.append(( 125 | role_id_prev, 126 | sequence_feature, 127 | user_protrait_dense + dense_feature_j, 128 | category_feature_j, 129 | user_feedback, 130 | label 131 | )) 132 | return samples 133 | featureutil = FeatureUtil(config) 134 | data = open(file, 'r').read().split('\n') 135 | print('data length', len(data)) 136 | # role_id, sequence_feature, dense_feature, category_feature, label 137 | # timestamp@session_id@sequence_id@exposed_items@user_feedback@user_seqfeature@user_protrait@item_feature@behavior_policy_id 138 | tmp = [] 139 | records = [] 140 | role_id_prev = None 141 | for record in data: 142 | if len(record) < 1 or 'timestamp' in record: 143 | continue 144 | role_id = record.split('@')[1] 145 | if role_id == role_id_prev or role_id_prev is None: 146 | tmp.append(record) 147 | role_id_prev = role_id 148 | else: 149 | samples = feature_construct(tmp, is_slate) 150 | records = records + samples 151 | tmp = [record] 152 | role_id_prev = role_id 153 | if len(tmp) > 0: 154 | samples = feature_construct(tmp, is_slate) 155 | records = records + samples 156 | print('tfrecord length', len(records), records[0]) 157 | random.shuffle(records) 158 | featureutil.to_tfrecord(records, tfrecord_file) 159 | 160 | 161 | config = { 162 | "maxlen": 64, 163 | "batch_size": 32, 164 | "class_num": 2, 165 | "dense_feature_num": 432, 166 | "category_feature_num": 21, 167 | "category_hash_size": 100000, 168 | "seq_num": 2 169 | } 170 | file = sys.argv[1] 171 | out_file = sys.argv[2] 172 | stage = sys.argv[3] 173 | assert stage in ('data_augment', 'slate2trajectory', 'tfrecord_item', 'tfrecord_slate') 174 | if stage == 'data_augment': 175 | data_augment(file, out_file) 176 | if stage == 'slate2trajectory': 177 | slate2trajectory(file, out_file) 178 | if stage == 'tfrecord_item': 179 | dataset2tfrecord(config, file, out_file, is_slate=False) 180 | if stage == 'tfrecord_slate': 181 | dataset2tfrecord(config, file, out_file, is_slate=True) 182 | -------------------------------------------------------------------------------- 
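A caveat in the two rewriters above: data_augment and slate2trajectory only emit a session when the first record of the next session arrives, so the final session accumulated in tmp is never written out (dataset2tfrecord flushes this case explicitly with its trailing "if len(tmp) > 0" block). A minimal sketch of a session-grouping helper that also yields that trailing session follows; the helper name and the usage shown are illustrative, not part of the repository:

def iter_sessions(lines):
    """Group '@'-separated records by session_id, yielding every session."""
    session, prev_id = [], None
    for record in lines:
        if len(record) < 1 or 'timestamp' in record:
            continue
        session_id = record.split('@')[1]
        if prev_id is not None and session_id != prev_id:
            yield session
            session = []
        session.append(record)
        prev_id = session_id
    if session:
        # flush the trailing session that a "write on session change" loop drops
        yield session

# illustrative usage inside data_augment/slate2trajectory-style code:
# for tmp in iter_sessions(open(file, 'r').read().split('\n')):
#     ...emit the augmented or flattened session...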
/rl4rs/utils/offline_policy_metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # import scipy
3 | import scipy.stats
4 | 
5 | 
6 | # adapted from https://mars-gym.readthedocs.io/en/latest/quick_start.html#off-policy-metrics
7 | 
8 | def _calc_sequential_weights(policy_prob, behavior_prob, weighted=False, a_min=None, a_max=None):
9 |     # behavior_prob: probabilities under the logging (behavior) policy
10 |     # policy_prob: probabilities under the target (evaluation) policy
11 |     #
12 |     # Compute the sample weights - propensity ratios
13 |     probs = np.array(policy_prob) / np.array(behavior_prob)
14 |     rho = np.clip(probs, a_min=a_min, a_max=a_max).cumprod(1)
15 |     if weighted:
16 |         weight = np.sum(rho, axis=0)
17 |     else:
18 |         weight = len(policy_prob)
19 |     ws = rho / weight
20 |     return np.clip(ws, a_min=a_min, a_max=a_max)
21 | 
22 | 
23 | def _calc_sample_weights(policy_prob, behavior_prob, a_min=None, a_max=None):
24 |     # behavior_prob: probabilities under the logging (behavior) policy
25 |     # policy_prob: probabilities under the target (evaluation) policy
26 |     #
27 |     # Compute the sample weights - propensity ratios
28 |     p_ratio = np.array(policy_prob) / np.array(behavior_prob)
29 | 
30 |     if a_min is not None:
31 |         p_ratio = np.clip(p_ratio, a_min=a_min, a_max=a_max)
32 | 
33 |     # Effective sample size for E_t estimate (from A. Owen)
34 |     n_e = len(policy_prob) * (np.mean(p_ratio) ** 2) / (p_ratio ** 2).mean()
35 | 
36 |     # Critical value from t-distribution as we have unknown variance
37 |     alpha = 0.00125
38 |     cv = scipy.stats.t.ppf(1 - alpha, df=int(n_e) - 1)
39 | 
40 |     return p_ratio, n_e, cv
41 | 
42 | 
43 | def eval_DM(policy, obs):
44 |     return policy(obs)
45 | 
46 | 
47 | def eval_IPS(rewards, policy_prob, behavior_prob):
48 |     # Calculate sample weights
49 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob)
50 |     ###############
51 |     # VANILLA IPS #
52 |     ###############
53 |     # Expected reward for pi_t
54 |     E_t = np.mean(rewards * p_ratio)
55 | 
56 |     # Variance of the estimate
57 |     var = ((rewards * p_ratio - E_t) ** 2).mean()
58 |     stddev = np.sqrt(var)
59 | 
60 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
61 |     c = cv * stddev / np.sqrt(int(n_e))
62 |     min_bound = E_t - c
63 |     max_bound = E_t + c
64 | 
65 |     result = (E_t, c)  # 0.025, 0.500, 0.975
66 |     return result
67 | 
68 | 
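# Worked example (illustrative values, not repository data): with
# rewards = [1, 0, 1], policy_prob = [0.5, 0.2, 0.4] and
# behavior_prob = [0.25, 0.4, 0.8], the propensity ratios are
# [2.0, 0.5, 0.5] and the IPS estimate is
# E_t = mean([1 * 2.0, 0 * 0.5, 1 * 0.5]) = 2.5 / 3 ≈ 0.833:
# rewards the target policy was relatively more likely to produce
# are up-weighted, the others down-weighted.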
69 | def eval_CIPS(rewards, policy_prob, behavior_prob):
70 |     # Calculate sample weights
71 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
72 | 
73 |     ##############
74 |     # CAPPED IPS #
75 |     ##############
76 |     # Cap ratios
77 |     p_ratio_capped = np.clip(p_ratio, a_min=0.1, a_max=10)
78 | 
79 |     # Expected reward for pi_t
80 |     E_t_capped = np.mean(rewards * p_ratio_capped)
81 | 
82 |     # Variance of the estimate
83 |     var_capped = ((rewards * p_ratio_capped - E_t_capped) ** 2).mean()
84 |     stddev_capped = np.sqrt(var_capped)
85 | 
86 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
87 |     c = cv * stddev_capped / np.sqrt(int(n_e))
88 | 
89 |     min_bound_capped = E_t_capped - c
90 |     max_bound_capped = E_t_capped + c
91 | 
92 |     result = (E_t_capped, c)  # 0.025, 0.500, 0.975
93 | 
94 |     return result
95 | 
96 | 
97 | def eval_SNIPS(rewards, policy_prob, behavior_prob):
98 |     # Calculate sample weights
99 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
100 | 
101 |     ##############
102 |     # NORMED IPS #
103 |     ##############
104 |     # Expected reward for pi_t
105 |     E_t_normed = np.sum(rewards * p_ratio) / np.sum(p_ratio)
106 | 
107 |     # Variance of the estimate
108 |     var_normed = np.sum(((rewards - E_t_normed) ** 2) * (p_ratio ** 2)) / (
109 |         p_ratio.sum() ** 2
110 |     )
111 |     stddev_normed = np.sqrt(var_normed)
112 | 
113 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
114 |     c = cv * stddev_normed / np.sqrt(int(n_e))
115 | 
116 |     min_bound_normed = E_t_normed - c
117 |     max_bound_normed = E_t_normed + c
118 | 
119 |     # Store result
120 |     result = (E_t_normed, c)  # 0.025, 0.500, 0.975
121 | 
122 |     return result
123 | 
124 | 
125 | def eval_WIPS(step_rewards, policy_prob, behavior_prob, gamma=1.0):
126 |     batch_size = len(step_rewards)
127 |     steps = len(step_rewards[0])
128 |     w_t = []
129 | 
130 |     # calculate importance ratios
131 |     p = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
132 | 
133 |     for i in range(steps):
134 |         w_t.append(np.average(p[:, :i + 1], axis=1))
135 |     w_t = np.array(w_t).swapaxes(0, 1)
136 |     # calculate stepwise weighted IS estimate
137 |     V_prev, V_step_WIS = 0.0, 0.0
138 |     for t in range(steps):
139 |         V_prev += np.sum(step_rewards[:, t] * gamma ** t)
140 |         V_step_WIS += np.sum(p[:, t] / w_t[:, t] * step_rewards[:, t] * gamma ** t)
141 |     # print('WIPS', p[:, -1], w_t[:, -1], np.max(p[:, -1] / w_t[:, -1]), step_rewards[:, -1])
142 |     return V_step_WIS / np.clip(V_prev, a_min=1e-8, a_max=None), 0
143 | 
144 | 
145 | def eval_doubly_robust(
146 |         action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
147 | ):
148 |     # Calculate sample weights
149 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
150 | 
151 |     #################
152 |     # Doubly Robust #
153 |     #################
154 | 
155 |     dr = state_rewards + (p_ratio * (rewards - action_rhat_rewards))
156 | 
157 |     confidence = 0.95
158 |     n = len(dr)
159 |     m, se = np.mean(dr), scipy.stats.sem(dr)
160 |     # h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
161 |     # print('dr', action_rhat_rewards[:2], p_ratio[:2], rewards[:2], m)
162 |     return m / np.average(rewards), se
163 | 
164 | 
165 | def eval_seq_doubly_robust(
166 |         action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
167 | ):
168 |     # Calculate sequential sample weights
169 |     ws = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
170 | 
171 |     dr = np.zeros((len(action_rhat_rewards)))
172 |     steps = len(action_rhat_rewards[0])
173 |     for i in range(steps):
174 |         t = steps - i - 1
175 |         dr = state_rewards[:, t] + ws[:, t] * (rewards[:, t] + dr - action_rhat_rewards[:, t])
176 | 
177 |     #################
178 |     # Doubly Robust #
179 |     #################
180 |     # dr = action_rhat_rewards + (p_ratio * (rewards - action_rhat_rewards))
181 |     # estimate = ws * (rewards - action_rhat_rewards) + state_rewards
182 |     # print('sdr', dr, np.average(dr), np.average(rewards))
183 | 
184 |     return np.average(dr) / np.mean(np.sum(rewards, axis=1)), 0
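# The backward loop above is the stepwise doubly robust recursion: with
# dr = 0 at the horizon,
#     dr_t = state_rewards[:, t] + ws[:, t] * (rewards[:, t] + dr_{t+1} - action_rhat_rewards[:, t])
# where ws are the clipped sequential importance weights. The returned value
# is normalized by the mean empirical return, so 1.0 means the estimated
# policy value matches the behavior data's average total reward.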
185 | 
186 | 
187 | if __name__ == '__main__':
188 |     batch_size = 10
189 |     max_steps = 9
190 |     off_rewards_sum = np.ones(batch_size, )
191 |     action_probs_mul = np.random.random((batch_size,))
192 |     behavior_probs_mul = np.random.random((batch_size,))
193 |     episode_reward = np.random.random((batch_size,)) * 2
194 |     off_rewards = np.ones((batch_size, max_steps))
195 |     action_probs = np.random.random((batch_size, max_steps))
196 |     behavior_probs = np.random.random((batch_size, max_steps))
197 |     rewards_hat = np.random.random((batch_size, max_steps))
198 |     state_reward = np.ones((batch_size, max_steps))
199 | 
200 |     ips = eval_IPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
201 |     cips = eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
202 |     snips = eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
203 |     dr = eval_doubly_robust(episode_reward, off_rewards_sum, off_rewards_sum, action_probs_mul, behavior_probs_mul)  # the original call was missing the observed-rewards argument; off_rewards_sum stands in for it in this smoke test
204 |     # step-wise
205 |     sips = eval_WIPS(off_rewards, action_probs, behavior_probs)
206 |     sdr = eval_seq_doubly_robust(rewards_hat, state_reward, off_rewards, action_probs, behavior_probs)
207 |     print(ips, cips, snips, dr, sips, sdr, sep='\n')
--------------------------------------------------------------------------------
/script/mdpchecker/mdp_checker.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import random
4 | from scipy.stats import spearmanr
5 | from keras_transformer import get_model, decode
6 | from rl4rs.mdpchecker.decoder import beam_search, token_probs
7 | 
8 | # dataset_file = 'recsys15.csv'
9 | # dataset_file = 'movielens.csv'
10 | # dataset_file = 'rl4rs.csv'
11 | # dataset_file = 'lastfm.csv'
12 | # dataset_file = 'cikm2016.csv'
13 | dataset_file = sys.argv[1] + '.csv'
14 | dataset_dir = sys.argv[2]
15 | 
16 | # the recsys15 data is sparse relative to its
17 | # number of items, so a shorter source window
18 | # is used to increase the sample size
19 | if 'recsys15' in dataset_file:
20 |     source_len = 8
21 | elif 'cikm2016' in dataset_file:
22 |     source_len = 5
23 | else:
24 |     source_len = 16
25 | target_len = 5
26 | np.random.seed(1)
27 | 
28 | data = open(dataset_dir + '/' + dataset_file).read().split('\n')[1:-1]
29 | 
30 | source_tokens = []
31 | target_tokens = []
32 | for sample in data:
33 |     user_id, items = sample.split(' ')
34 |     item_list = items.split(',')
35 |     if len(item_list) >= source_len + target_len:
36 |         # assert len(item_list) >= source_len + target_len
37 |         i = 0
38 |         if 'rl4rs' in dataset_file or 'cikm2016' in dataset_file:
39 |             source_tokens.append(item_list[:source_len])
40 |             target_tokens.append(item_list[source_len:source_len + target_len])
41 |         else:
42 |             while i + source_len + target_len < len(item_list):
43 |                 source_tokens.append(item_list[i: i + source_len])
44 |                 target_tokens.append(item_list[i + source_len: i + source_len + target_len])
45 |                 i = i + np.random.randint(source_len, source_len + target_len) // 6
46 |     else:
47 |         print('len(item_list) < source_len + target_len in', '\t', sample)
48 | 
49 | # Generate dictionaries
50 | token_dict = {
51 |     '<PAD>': 0,
52 |     '<START>': 1,
53 |     '<END>': 2,
54 | }
55 | 
56 | 
57 | def build_token_dict(token_list):
58 |     for tokens in token_list:
59 |         for token in tokens:
60 |             if token not in token_dict:
61 |                 token_dict[token] = len(token_dict)
62 |     return token_dict
63 | 
64 | 
65 | source_token_dict = build_token_dict(source_tokens)
66 | target_token_dict = build_token_dict(target_tokens)  # both names alias the same shared vocabulary dict
67 | target_token_dict_inv = {v: k for k, v in target_token_dict.items()}
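# Example invocation, following the argv handling above (the path is
# illustrative): python mdp_checker.py rl4rs /path/to/dataset
# The script fits a small Transformer to predict the next target_len items
# from the preceding source_len items, then compares greedy, top-k beam,
# and hot-item decodes in experiments I and II below.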
68 | 
69 | # Add special tokens
70 | encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
71 | decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
72 | output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
73 | 
74 | # Padding
75 | source_max_len = max(map(len, encode_tokens))
76 | target_max_len = max(map(len, decode_tokens))
77 | 
78 | encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
79 | decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
80 | output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]
81 | 
82 | encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
83 | decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
84 | decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
85 | 
86 | print('sample lens:', len(encode_input))
87 | print('source_token_dict lens:', len(source_token_dict))
88 | print('target_token_dict lens:', len(target_token_dict))
89 | # [1, 3, 4, 5, 6, 2] [1, 3, 4, 5, 6, 7, 8, 9, 2] [[3], [4], [5], [6], [7], [8], [9], [2], [0]]
90 | # print(encode_input[0], decode_input[0], decode_output[0])
91 | 
92 | # Build & fit model
93 | model = get_model(
94 |     token_num=max(len(source_token_dict), len(target_token_dict)),
95 |     embed_dim=256,
96 |     encoder_num=1,
97 |     decoder_num=1,
98 |     head_num=1,
99 |     hidden_dim=128,
100 |     dropout_rate=0.05,
101 |     use_same_embed=False,  # use separate embeddings for the source and target vocabularies
102 | )
103 | 
104 | model.compile('adam', 'sparse_categorical_crossentropy')
105 | model.summary()
106 | 
107 | model.fit(
108 |     x=[np.array(encode_input)[:-10000], np.array(decode_input)[:-10000]],
109 |     y=np.array(decode_output)[:-10000],
110 |     epochs=20,
111 |     batch_size=256,
112 |     shuffle=True,
113 |     verbose=2
114 | )
115 | 
116 | model.save_weights(dataset_file.split('.')[0] + '.h5')
117 | 
118 | # Load
119 | model.load_weights(dataset_file.split('.')[0] + '.h5')
120 | 
121 | # greedy result print & input output comparison
122 | # decoded = decode(
123 | #     model,
124 | #     encode_input[:1024],
125 | #     start_token=target_token_dict['<START>'],
126 | #     end_token=target_token_dict['<END>'],
127 | #     pad_token=target_token_dict['<PAD>'],
128 | #     top_k=1
129 | # )
130 | # print([target_token_dict_inv[x] for x in decode_input[0]], [target_token_dict_inv[x] for x in decoded[0]])
131 | # print([target_token_dict_inv[x] for x in decode_input[1]], [target_token_dict_inv[x] for x in decoded[1]])
132 | 
133 | # beam search
134 | batch_size = 2048
135 | beam_size = 100
136 | # use 20 hot items since rl4rs has only 200+ items
137 | hot_beam_size = 20 if 'rl4rs' in dataset_file else beam_size
138 | # cikm2016 has only 60853 items
139 | candidates_size = 6000 if 'cikm2016' in dataset_file else hot_beam_size
140 | random.seed(1)
141 | encode_input = random.sample(encode_input[-10000:], batch_size)
142 | output_greedy, greedy_score = beam_search(model, encode_input, beam_size=1, target_len=target_len)
143 | output_topk, beam_score = beam_search(model, encode_input, beam_size=beam_size, target_len=target_len)
144 | # np.savez(dataset_file.split('.')[0]+'.npz', output_topk=output_topk, beam_score=beam_score)
145 | # npzdata = np.load(dataset_file.split('.')[0] + '.npz')
146 | # output_topk = npzdata['output_topk']
147 | # beam_score = npzdata['beam_score']
148 | 
149 | output_topk_5, beam_score_5 = 
output_topk[:, :int(beam_size * 0.05)], beam_score[:, :int(beam_size * 0.05)] 150 | output_topk_20, beam_score_20 = output_topk[:, :int(beam_size * 0.2)], beam_score[:, :int(beam_size * 0.2)] 151 | output_topk_hot, beam_score_hot = beam_search(model, encode_input, beam_size=hot_beam_size, target_len=target_len, use_candidates=True, candidates_size=candidates_size) 152 | output_topk_hot5, beam_score_hot5 = output_topk_hot[:, :int(beam_size * 0.05)], beam_score_hot[:, :int(beam_size * 0.05)] 153 | output_topk_hot20, beam_score_hot20 = output_topk_hot[:, :int(beam_size * 0.2)], beam_score_hot[:, :int(beam_size * 0.2)] 154 | 155 | greedy_score = np.nanmean(greedy_score, axis=1) 156 | top_5_percent_score = np.nanmean(beam_score_5, axis=1) 157 | top_20_percent_score = np.nanmean(beam_score_20, axis=1) 158 | hot_5_percent_score = np.nanmean(beam_score_hot5, axis=1) 159 | hot_20_percent_score = np.nanmean(beam_score_hot20, axis=1) 160 | 161 | print('experiment II results') 162 | print('top_5_percent_score top_20_percent_score greedy_score hot_5_percent_score hot_20_percent_score') 163 | print(1, 164 | np.nanmean(top_20_percent_score / top_5_percent_score), 165 | np.nanmean(greedy_score / top_5_percent_score), 166 | np.nanmean(hot_5_percent_score / top_5_percent_score), 167 | np.nanmean(hot_20_percent_score / top_5_percent_score)) 168 | 169 | print('experiment I start') 170 | tmp = [] 171 | for j in range(int(beam_size)): 172 | batch_outputs = output_topk[:, j] 173 | probs = [] 174 | for i in range(5): 175 | prob = token_probs(model, encode_input, batch_outputs[:, :i + 1])[list(range(batch_size)), output_topk[:, j, i + 1]] 176 | probs.append(prob) 177 | tmp.append(probs) 178 | probs = np.array(tmp).swapaxes(0, 2).swapaxes(1, 2) 179 | metrics = [] 180 | for j in range(batch_size): 181 | prob = probs[j] 182 | prob_sum = np.sum(prob, axis=1) 183 | seq_score = np.multiply.reduce(np.array(prob), axis=1) 184 | for i in range(5): 185 | metrics.append((np.corrcoef(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0][1], 186 | spearmanr(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0])) 187 | metrics = np.array(metrics).reshape((batch_size, 5, 2)) 188 | metrics = np.nan_to_num(metrics, nan=1.0) 189 | print('experiment I results') 190 | print('corrcoef', ' ', 'spearman') 191 | print(np.nanmean(metrics, axis=0)) -------------------------------------------------------------------------------- /rl4rs/env/seqslate.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | from operator import add 3 | from copy import deepcopy as copy 4 | import numpy as np 5 | from rl4rs.env.slate import SlateState, SlateRecEnv 6 | 7 | 8 | class SeqSlateState(SlateState): 9 | def __init__(self, config, records): 10 | super().__init__(config, records) 11 | self.page_items = config.get("page_items", 9) 12 | 13 | @property 14 | def state(self): 15 | if self.config.get("support_rllib_mask", False): 16 | location_mask = self.get_location_mask(self.location_mask, self.cur_steps % self.page_items // 3) 17 | return {"state": self._state, "action_mask": self.action_mask & location_mask & self.special_mask} 18 | elif self.config.get("support_d3rl_mask", False): 19 | cur_steps = np.full((self.batch_size, 1), self.cur_steps) 20 | page_init = self.cur_steps // self.page_items * self.page_items 21 | page_end = min(page_init + self.page_items - 1, self.max_steps - 1) 22 | masked_actions = self.prev_actions[:, page_end + 1 - 
self.page_items:page_end + 1]
23 | return {"state": self._state, "masked_actions": masked_actions, "cur_steps": cur_steps}
24 | else:
25 | return self._state
26 | 
27 | def get_complete_states(self):
28 | states = []
29 | for j in range(self.cur_steps):
30 | tmp = copy(self._init_state)
31 | for state, action, i in zip(self._init_state, self.prev_actions[:, j], range(len(self._init_state))):
32 | page_init = j // self.page_items * self.page_items
33 | page_end = page_init + self.page_items - 1
34 | sequence_id = j // self.page_items + 1
35 | # seq
36 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
37 | tmp[i][1] = [tmp[i][1][0], prev_expose]
38 | # dense
39 | prev_item_feat = [
40 | self.item_info_d[str(x)]['item_vec']
41 | for x in self.prev_actions[i, page_init:page_end + 1]
42 | ]
43 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
44 | prev_item_feat = np.array(prev_item_feat).flatten()
45 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
46 | # category
47 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
48 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
49 | states.append(tmp)
50 | return states
51 | 
52 | def get_violation(self):
53 | tmp = np.ones((self.batch_size,), dtype=int)
54 | for step in range(self.cur_steps):
55 | location_mask = self.location_mask[step % self.page_items // 3]
56 | tmp = tmp & location_mask[self.prev_actions[:, step]]
57 | for step in range(max(self.cur_steps - 1, 1)):
58 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 1])
59 | tmp = tmp & duplicate_mask
60 | for step in range(max(self.cur_steps - 2, 1)):
61 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 2])
62 | tmp = tmp & duplicate_mask
63 | for i in range(self.batch_size):
64 | cur_page = self.cur_steps // self.page_items
65 | for j in range(cur_page+1):
66 | actions = self.prev_actions[i][self.page_items*j:self.page_items*(j+1)]
67 | if len(np.intersect1d(actions, self.special_items)) > 1:
68 | tmp[i] = 0
69 | return tmp
70 | 
71 | @property
72 | def offline_reward(self):
73 | cur_step = self.cur_steps
74 | if cur_step % self.page_items != 0:
75 | reward = [0, ] * self.batch_size
76 | else:
77 | action = np.array([list(map(int, x.split('@')[3].split(',')[:cur_step]))
78 | for x in self.records])
79 | price = self.get_price(action)[:, -self.page_items:]
80 | slate_label = np.array([
81 | list(map(int, x.split('@')[4].split(',')))
82 | for x in self.records
83 | ])
84 | slate_label = slate_label[:, cur_step - self.page_items:cur_step]
85 | reward = np.sum(price * slate_label, axis=1)
86 | return reward
87 | 
88 | # @property
89 | # def info(self):
90 | # return [{}]*self.batch_size
91 | 
92 | def act(self, actions):
93 | if self.config.get("support_conti_env", False):
94 | location_mask = self.get_location_mask(self.location_mask,
95 | self.cur_steps % self.page_items // 3)
96 | action_mask = self.action_mask & location_mask & self.special_mask
97 | actions = self.get_nearest_neighbor_with_mask(actions, self.action_emb, action_mask)
98 | self.prev_actions[:, self.cur_steps] = actions
99 | self.action_mask[list(range(self.batch_size)), actions] = 0
100 | for i in range(self.batch_size):
101 | if len(np.intersect1d(self.prev_actions[i], self.special_items)) > 0:
102 | self.special_mask[i][self.special_items] = 0
103 | tmp = copy(self._init_state)
104 | for state, action, i in zip(self._state, actions, range(self.batch_size)):
105 | page_init = self.cur_steps 
// self.page_items * self.page_items
106 | page_end = page_init + self.page_items - 1
107 | sequence_id = self.cur_steps // self.page_items + 1
108 | # seq
109 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
110 | tmp[i][1] = [tmp[i][1][0], prev_expose]
111 | # dense
112 | prev_item_feat = [
113 | self.item_info_d[str(x)]['item_vec']
114 | for x in self.prev_actions[i, page_init:page_end + 1]
115 | ]
116 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
117 | prev_item_feat = np.array(prev_item_feat).flatten()
118 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
119 | # category
120 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
121 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
122 | self._state = tmp
123 | self.cur_steps += 1
124 | if self.cur_steps % self.page_items == 0:
125 | self.action_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
126 | self.special_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
127 | 
128 | 
129 | class SeqSlateRecEnv(SlateRecEnv):
130 | """ Implements the sequential (multi-page) slate recommendation simulator"""
131 | 
132 | def __init__(self, config, state_cls):
133 | super().__init__(config, state_cls)
134 | self.page_items = config.get("page_items", 9)
135 | 
136 | def forward(self, model, samples):
137 | step = samples.cur_steps
138 | if step % self.page_items == 0:
139 | # state = samples.state
140 | prev_actions = samples.prev_actions[:, :step]
141 | # shapes = prev_actions.shape
142 | complete_states = np.array(samples.get_complete_states())
143 | complete_states = complete_states[-self.page_items:]
144 | complete_states = complete_states \
145 | .swapaxes(0, 1) \
146 | .reshape((self.batch_size * self.page_items, 6))
147 | price = samples.get_price(prev_actions)[:, -self.page_items:]
148 | feat, _ = self.FeatureUtil.feature_extraction(complete_states)
149 | with self.sess.as_default():
150 | with self.graph.as_default():
151 | res = self.reward_layer(feat)
152 | probs = np.array(res)[:, 1].reshape((self.batch_size, self.page_items))
153 | reward = np.sum(price * probs, axis=1)
154 | if self.config.get("support_rllib_mask", False) or \
155 | self.config.get("support_d3rl_mask", False):
156 | violation = samples.get_violation()
157 | reward[violation < 0.5] = 0
158 | else:
159 | reward = np.array([0, ] * self.batch_size)
160 | return reward.tolist()
161 | 
--------------------------------------------------------------------------------
/script/mdpchecker/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import pandasql as ps
4 | 
5 | dataset_file = sys.argv[1]
6 | dataset_dir = sys.argv[2]
7 | 
8 | pysqldf = lambda q: ps.sqldf(q, globals())
9 | 
10 | # lastfm-1K
11 | if 'lastfm' in dataset_file:
12 | df = pd.read_csv(dataset_dir + '/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv'
13 | ,names=["userid", "timestamp", "artid", "artname", "traid", "traname"]
14 | ,sep='\t')
15 | 
16 | sql0 = """
17 | select userid as sessionid, min(timestamp) as timestamp, artid
18 | from
19 | df a
20 | group by userid, artid, substr(timestamp,1,12)
21 | """
22 | 
23 | sql1 = """
24 | select a.sessionid, a.timestamp, b.item
25 | from
26 | df a
27 | join
28 | (select artid, ROW_NUMBER() OVER(ORDER BY artid) AS item
29 | from (
30 | select artid
31 | from
32 | df a
33 | group by artid
34 | having count(*)>=30
35 | )aa
36 | )b
37 | on a.artid=b.artid
38 | 
""" 39 | 40 | sql2 = """ 41 | select sessionid, group_concat(item) as items 42 | from( 43 | select * 44 | from 45 | df2 46 | order by timestamp asc 47 | )a 48 | group by sessionid 49 | 50 | """ 51 | 52 | df = pysqldf(sql0) 53 | 54 | df2 = pysqldf(sql1) 55 | 56 | df3 = pysqldf(sql2) 57 | 58 | print('items num.', df2['item'].value_counts().count()) 59 | print('max item id', df2['item'].max()) 60 | print('sessionid num.', df2['sessionid'].value_counts().count()) 61 | 62 | df3.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8') 63 | 64 | if 'cikm2016' in dataset_file: 65 | # queryId;sessionId;userId;timeframe;duration;eventdate;searchstring.tokens;categoryId;items;is.test 66 | queries_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-queries.csv',sep=';') 67 | # queryId;timeframe;itemId 68 | click_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-clicks.csv',sep=';') 69 | # sessionId;userId;itemId;timeframe;eventdate 70 | pv_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-item-views.csv',sep=';') 71 | 72 | # sql0 = """ 73 | # select a.sessionId as sessionid, min(b.timeframe) as timestamp, b.itemId, a.items as pv_items 74 | # from 75 | # queries_df a 76 | # join click_df b 77 | # on a.queryId = b.queryId 78 | # group by b.queryId, b.itemId, cast(b.timeframe/1000 as int) 79 | # """ 80 | 81 | df_click_sql = """ 82 | select a.sessionId as sessionid, min(cast(b.timeframe as int)) as timestamp, b.itemId as item 83 | from 84 | queries_df a 85 | join click_df b 86 | on a.queryId = b.queryId 87 | join (select sessionId from pv_df group by sessionId)c 88 | on a.sessionId = c.sessionId 89 | group by a.sessionId, b.itemId, cast(b.timeframe/1000 as int) 90 | """ 91 | 92 | df_pv_sql = """ 93 | select a.sessionId as sessionid, min(cast(c.timeframe as int)) as timestamp, c.itemId as item 94 | from 95 | queries_df a 96 | join (select queryId from click_df group by queryId) b 97 | on a.queryId = b.queryId 98 | join pv_df c 99 | on a.sessionId = c.sessionId 100 | group by a.sessionId, c.itemId, cast(c.timeframe/1000 as int) 101 | """ 102 | 103 | df_sql = """ 104 | select aa.sessionid, group_concat(c.item|| ':' ||c.timestamp) as pv_items, aa.click_items 105 | from 106 | ( 107 | select a.sessionid,a.timestamp,a.item,group_concat(b.item|| ':' ||b.timestamp) as click_items from 108 | df_click a 109 | join df_click b 110 | on a.sessionid=b.sessionid and a.timestamp<=b.timestamp 111 | group by a.sessionid,a.item 112 | )aa 113 | 114 | join df_pv c 115 | on aa.sessionid=c.sessionid and aa.timestamp>c.timestamp 116 | group by aa.sessionid,aa.click_items 117 | """ 118 | 119 | df_click = pysqldf(df_click_sql) 120 | df_pv = pysqldf(df_pv_sql) 121 | df = pysqldf(df_sql) 122 | 123 | tmp = [] 124 | items = set() 125 | for x in df.values: 126 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5: 127 | [items.add(x.split(':')[0]) for x in x[1].split(',')] 128 | [items.add(x.split(':')[0]) for x in x[2].split(',')] 129 | 130 | # item2id=dict([(x,str(i)) for i,x in enumerate(items)]) 131 | # item2id_fn = lambda x:item2id[x] 132 | 133 | for x in df.values: 134 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5: 135 | pv_items = x[1].split(',') 136 | sorted_pv_items = sorted(pv_items, key=lambda x:int(x.split(':')[1]))[-5:] 137 | sorted_pv_items = [x.split(':')[0] for x in sorted_pv_items] 138 | click_items = x[2].split(',') 139 | sorted_click_items = sorted(click_items, key=lambda x:int(x.split(':')[1]))[:5] 140 | sorted_click_items = 
[x.split(':')[0] for x in sorted_click_items] 141 | tmp.append([x[0], ','.join(sorted_pv_items), ','.join(sorted_click_items)]) 142 | 143 | print('items num.', len(items)) 144 | print('max item id', len(items)-1) 145 | print('sessionid num.', len(tmp)) 146 | 147 | with open(dataset_dir + '/' + dataset_file + '.csv', 'w') as f: 148 | f.write('sessionid items'+'\n') 149 | f.write('\n'.join([str(x[0])+' '+x[1]+','+x[2] for x in tmp])) 150 | 151 | 152 | # recsys15-click 153 | if 'recsys15' in dataset_file: 154 | df = pd.read_csv(dataset_dir + '/yoochoose-clicks.dat', names=["sessionid", "timestamp", "item", "Category"]) 155 | 156 | sql0 = """ 157 | select sessionid, min(timestamp) as timestamp, item, Category 158 | from 159 | df a 160 | group by sessionid, item, substr(timestamp,1,12) 161 | """ 162 | 163 | sql1 = """ 164 | select a.* 165 | from 166 | df a 167 | join 168 | (SELECT item FROM df group by item having count(*)>=1000)b 169 | on a.item=b.item 170 | """ 171 | 172 | sql2 = """ 173 | select a.* 174 | from 175 | df2 a 176 | join 177 | (SELECT sessionid FROM df2 group by sessionid having count(*)>=13)b 178 | on a.sessionid=b.sessionid 179 | """ 180 | 181 | sql3 = """ 182 | select sessionid, group_concat(item) as items 183 | from 184 | df3 185 | group by sessionid 186 | order by timestamp asc 187 | """ 188 | 189 | df = pysqldf(sql0) 190 | 191 | df2 = pysqldf(sql1) 192 | 193 | df3 = pysqldf(sql2) 194 | 195 | df4 = pysqldf(sql3) 196 | 197 | print('items num.', df3['item'].value_counts().count()) 198 | print('max item id', df3['item'].max()) 199 | print('sessionid num.', df3['sessionid'].value_counts().count()) 200 | 201 | df4.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8') 202 | 203 | if 'movielens' in dataset_file: 204 | # movielens-25m 205 | df = pd.read_csv(dataset_dir + '/ml-25m/ratings.csv') 206 | # userId,movieId,rating,timestamp 207 | sql0 = """ 208 | select * 209 | from 210 | df a 211 | where rating>=3 212 | """ 213 | 214 | sql1 = """ 215 | select a.* 216 | from 217 | df a 218 | join 219 | (SELECT movieId FROM df group by movieId having count(*)>=1000)b 220 | on a.movieId=b.movieId 221 | """ 222 | 223 | sql2 = """ 224 | select a.* 225 | from 226 | df2 a 227 | join 228 | (SELECT userId FROM df2 group by userId having count(*)>=30 and count(*)<=100)b 229 | on a.userId=b.userId 230 | """ 231 | 232 | sql3 = """ 233 | select userId as sessionid, group_concat(movieId) as items 234 | from 235 | df3 236 | group by userId 237 | order by timestamp asc 238 | """ 239 | df = pysqldf(sql0) 240 | 241 | df2 = pysqldf(sql1) 242 | 243 | df3 = pysqldf(sql2) 244 | 245 | df4 = pysqldf(sql3) 246 | 247 | print('items num.', df3['movieId'].value_counts().count()) 248 | print('max item id', df3['movieId'].max()) 249 | print('sessionid num.', df3['userId'].value_counts().count()) 250 | 251 | df4.to_csv(dataset_dir + '/movielens.csv', sep=' ', header=True, index=False, encoding='utf-8') 252 | 253 | if 'rl4rs' in dataset_file: 254 | # RL4RS 255 | data = open(dataset_dir + '/rl4rs_dataset_a.csv', 'r').read().split('\n')[:-1] 256 | tmp = ['sessionid items'] 257 | for x in data: 258 | session_id = x.split('@')[1] 259 | sequence_id = list(map(int, x.split('@')[5].split(','))) 260 | items = list(map(int, x.split('@')[3].split(','))) 261 | if len(sequence_id) >= 16: 262 | tmp.append(session_id + ' ' + ','.join(list(map(str, sequence_id[-16:] + items[:5])))) 263 | 264 | print('items num.', 283) 265 | print('max item id', 283) 266 | print('sessionid num.', 
len(tmp))
267 | 
268 | with open(dataset_dir + '/rl4rs.csv', 'w') as f:
269 | f.write('\n'.join(tmp))
270 | 
--------------------------------------------------------------------------------
/rl4rs/env/base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from copy import deepcopy as copy
4 | import tensorflow as tf
5 | from abc import ABC, abstractmethod
6 | import os
7 | 
8 | 
9 | def single_elem_support(func):
10 | """Decorator that unwraps batched results when the batch has a single element."""
11 | type_list = (type([]), type(()), type(np.array(1)))
12 | 
13 | def wrapper(*args, **kwargs):
14 | """wrapper func"""
15 | res = func(*args, **kwargs)
16 | if type(res) in type_list and len(res) == 1:
17 | return res[0]
18 | elif type(res[0]) in type_list and len(res[0]) == 1:
19 | return [x[0] for x in res]
20 | else:
21 | return res
22 | 
23 | return wrapper
24 | 
25 | 
26 | class RecState(ABC):
27 | def __init__(self, config, records):
28 | self.config = config
29 | self.records = records
30 | self._init_state = self.records_to_state(records)
31 | self._state = copy(self._init_state)
32 | 
33 | @staticmethod
34 | def records_to_state(records):
35 | pass
36 | 
37 | @property
38 | def state(self):
39 | return self._state
40 | 
41 | @property
42 | @abstractmethod
43 | def user(self):
44 | pass
45 | 
46 | @property
47 | @abstractmethod
48 | def info(self):
49 | pass
50 | 
51 | @abstractmethod
52 | def act(self, actions):
53 | pass
54 | 
55 | @abstractmethod
56 | def to_string(self):
57 | pass
58 | 
59 | 
60 | class RecDataBase(object):
61 | '''
62 | file-based implementation of a RecommendEnv's data source.
63 | 
64 | Pulls data from file, preps for use by RecommendEnv and then
65 | acts as data provider for each new episode.
66 | '''
67 | 
68 | def __init__(self, config, state_cls):
69 | self.config = config
70 | self.sample_list = []
71 | self.state_cls = state_cls
72 | self.is_eval = config.get('is_eval', False)
73 | self.cache_size = config.get('cache_size', 2048)
74 | # sample file cache
75 | self.fp = open(config['sample_file'], 'r')
76 | # self.fp.readline()
77 | 
78 | @staticmethod
79 | def seed(seed):
80 | np.random.seed(seed)
81 | 
82 | def sample_cache(self, f, num):
83 | for i in range(num):
84 | tmp = f.readline().rstrip()
85 | if len(tmp) < 1:
86 | f.seek(0, 0)
87 | f.readline()
88 | self.sample_list.append(f.readline().rstrip())
89 | else:
90 | self.sample_list.append(tmp)
91 | 
92 | def sample(self, batch_size):
93 | if self.is_eval:
94 | assert self.cache_size == batch_size
95 | assert len(self.sample_list) == batch_size
96 | records = self.sample_list[:batch_size]
97 | else:
98 | records = np.random.choice(self.sample_list, batch_size)
99 | samples = self.state_cls(self.config, records)
100 | return samples
101 | 
102 | def reset(self, reset_file=False):
103 | # self.state_list = []
104 | self.sample_list = []
105 | # self.rawstate_cache(self.fs, 10000)
106 | if reset_file:
107 | self.fp.seek(0, 0)
108 | self.sample_cache(self.fp, self.cache_size)
109 | 
110 | 
111 | class RecSimBase(ABC):
112 | """ Implementation of the core recommendation simulator"""
113 | 
114 | def __init__(self, config, state_cls):
115 | self.config = config
116 | self.max_steps = config['max_steps']
117 | self.batch_size = config['batch_size']
118 | model_file = config['model_file']
119 | self.graph = tf.Graph()
120 | with self.graph.as_default():
121 | self.model = self.get_model(config)
122 | if not self.config.get('gpu', False):
123 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
124 | self.sess = tf.Session(graph=self.graph, 
125 | config=tf.ConfigProto(device_count={"CPU": 4})) 126 | else: 127 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 128 | self.sess = tf.Session(graph=self.graph) 129 | self.saver = tf.train.Saver() 130 | self.reload_model(model_file) 131 | self._recData = RecDataBase(config, state_cls) 132 | 133 | def reset(self, reset_file=False): 134 | self._recData.reset(reset_file) 135 | 136 | @abstractmethod 137 | def get_model(self, config): 138 | pass 139 | 140 | @abstractmethod 141 | def obs_fn(self, state): 142 | pass 143 | 144 | @abstractmethod 145 | def forward(self, model, samples): 146 | pass 147 | 148 | def reload_model(self, model_file): 149 | with self.sess.as_default(): 150 | with self.graph.as_default(): 151 | self.saver.restore(self.sess, model_file) 152 | 153 | def seed(self, sd=0): 154 | self._recData.seed(sd) 155 | np.random.seed(sd) 156 | 157 | def _step(self, samples, action, **kwargs): 158 | step = kwargs['step'] 159 | samples.act(action) 160 | next_state = samples.state 161 | next_obs = self.obs_fn(next_state) 162 | reward = self.forward(self.model, samples) 163 | next_info = samples.info 164 | 165 | if step < self.max_steps - 1: 166 | done = [0] * self.batch_size 167 | else: 168 | done = [1] * self.batch_size 169 | 170 | return next_obs, reward, done, next_info 171 | 172 | def sample(self, batch_size): 173 | samples = self._recData.sample(batch_size) 174 | obs = self.obs_fn(samples.state) 175 | return samples, obs 176 | 177 | 178 | class RecEnvBase(gym.Env): 179 | metadata = {'render.modes': ['human']} 180 | 181 | def __init__(self, recsim: RecSimBase): 182 | self.config = recsim.config 183 | self.batch_size = self.config['batch_size'] 184 | self.cur_step = 0 185 | self.sim = recsim 186 | self.sim.reset() 187 | self.samples, self.obs = self.sim.sample(self.batch_size) 188 | if self.config.get("rawstate_as_obs", False): 189 | category_size = len(self.obs[0]['category_feature']) 190 | dense_size = len(self.obs[0]['dense_feature']) 191 | sequence_size = np.array(self.obs[0]['sequence_feature']).shape 192 | features = { 193 | "category_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(category_size,)), 194 | "dense_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(dense_size,)), 195 | "sequence_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=sequence_size), 196 | } 197 | if self.config.get("support_rllib_mask", False): 198 | action_feature_size = len(self.obs[0]['action_mask']) 199 | self.observation_space = gym.spaces.Dict({ 200 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 201 | **features 202 | }) 203 | else: 204 | self.observation_space = gym.spaces.Dict(features) 205 | else: 206 | if self.config.get("support_rllib_mask", False): 207 | action_feature_size = len(self.obs[0]['action_mask']) 208 | self.observation_space = gym.spaces.Dict({ 209 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 210 | "obs": gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]["obs"]),)) 211 | }) 212 | else: 213 | self.observation_space = gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]),)) 214 | if self.config.get("support_conti_env", False): 215 | self.action_space = gym.spaces.Box(-1, 1, shape=(self.config['action_emb_size'],)) 216 | else: 217 | self.action_space = gym.spaces.Discrete(self.config['action_size']) 218 | # if self.config.get("support_rllib_mask", False): 219 | # action_feature_size = len(self.obs[0]['action_mask']) 220 | # # avail_actions_size = len(self.obs[0]['avail_actions'][0]) 221 | # # 
self.action_space = gym.spaces.Discrete(self.config['action_size']) 222 | # self.observation_space = gym.spaces.Dict({ 223 | # "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 224 | # "obs": self.observation_space, 225 | # }) 226 | # elif self.config.get("support_d3rl_mask", False): 227 | # self.action_space = gym.spaces.Discrete(self.config['action_size']) 228 | # else: 229 | # self.action_space = gym.spaces.Discrete(self.config['action_size']) 230 | self.reset() 231 | 232 | def seed(self, sd=0): 233 | self.sim.seed(sd) 234 | np.random.seed(sd) 235 | 236 | @property 237 | @single_elem_support 238 | def state(self): 239 | return self.obs 240 | 241 | @property 242 | @single_elem_support 243 | def user_id(self): 244 | return self.samples.user 245 | 246 | @property 247 | @single_elem_support 248 | def offline_action(self): 249 | return self.samples.offline_action 250 | 251 | @property 252 | @single_elem_support 253 | def offline_reward(self): 254 | return self.samples.offline_reward 255 | 256 | @single_elem_support 257 | def step(self, action): 258 | if not isinstance(action, (list, np.ndarray)): 259 | action = [action] 260 | obs, reward, done, info = \ 261 | self.sim._step(self.samples, action, step=self.cur_step) 262 | self.cur_step += 1 263 | return obs, reward, done, info 264 | 265 | def reset(self, reset_file=False): 266 | self.cur_step = 0 267 | self.sim.reset(reset_file) 268 | self.samples, self.obs = self.sim.sample(self.batch_size) 269 | return self.state 270 | 271 | def render(self, mode='human', close=False): 272 | print('Current State:', '\n') 273 | print(self.samples.to_string()) 274 | -------------------------------------------------------------------------------- /rl4rs/nets/exact_k/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | 4 | from .layers import * 5 | from .modules import * 6 | from .utils import * 7 | 8 | 9 | class Generator: 10 | def __init__(self, 11 | l1_mask, 12 | l2_mask, 13 | l3_mask, 14 | l0_ssr_mask, 15 | is_training=True, 16 | lr=0.001, 17 | temperature=1, 18 | train_sample='random', 19 | predict_sample='random', 20 | seq_length=500, 21 | res_length=9, 22 | hidden_units=64, 23 | dropout_rate=0.1, 24 | num_heads=4, 25 | num_layers=1, 26 | num_glimpse=1, 27 | num_blocks=2, 28 | use_mha=True, 29 | beam_size=3 30 | ): 31 | 32 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user') # 779 33 | 34 | self.batch_size = tf.shape(self.user)[0] 35 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand') 36 | 37 | self.decode_target_ids = tf.placeholder(dtype=tf.int32, shape=[None, res_length], name="decoder_target_ids") # [batch_size, res_length] 38 | self.reward = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size] 39 | 40 | # Encoder 41 | with tf.variable_scope("encoder"): 42 | # region emb 43 | self.enc_user = tf.layers.dense(self.user, hidden_units, activation=tf.nn.relu) # (N, T_q, C) 44 | # enc_item = [batch_size, seq_len, hidden_units] 45 | self.enc_item = embedding(self.item_cand, 46 | vocab_size=500, 47 | num_units=hidden_units, 48 | zero_pad=False, 49 | scale=True, 50 | scope='enc_item_embed', 51 | # reuse=not is_training, 52 | reuse=False 53 | ) 54 | self.enc = tf.concat([tf.stack(seq_length * [self.enc_user], axis=1), self.enc_item], axis=2) 55 | # endregion 56 | # region Dropout 57 | self.enc = tf.layers.dropout(self.enc, 58 | rate=dropout_rate, 59 | 
training=tf.convert_to_tensor(is_training)) 60 | # endregion 61 | # region squence 62 | if use_mha: 63 | ## Blocks 64 | for i in range(num_blocks): 65 | with tf.variable_scope("num_blocks_{}".format(i)): 66 | ### Multihead Attention 67 | self.enc = multihead_attention(queries=self.enc, 68 | keys=self.enc, 69 | num_units=hidden_units * 2, 70 | num_heads=num_heads, 71 | dropout_rate=dropout_rate, 72 | is_training=is_training, 73 | causality=False) 74 | 75 | ### Feed Forward 76 | self.enc = feedforward(self.enc, num_units=[4 * hidden_units, hidden_units * 2]) 77 | else: 78 | cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_units * 2) 79 | outputs, _ = tf.nn.dynamic_rnn(cell=cell, inputs=self.enc, dtype=tf.float32) 80 | self.enc = outputs 81 | # endregion 82 | 83 | # Decoder 84 | with tf.variable_scope("decoder"): 85 | dec_cell = LSTMCell(hidden_units * 2) 86 | 87 | if num_layers > 1: 88 | cells = [dec_cell] * num_layers 89 | dec_cell = MultiRNNCell(cells) 90 | # ptr sampling 91 | enc_init_state = trainable_initial_state(self.batch_size, dec_cell.state_size) 92 | 93 | custom_logits, custom_path, _ = ptn_rnn_decoder( 94 | dec_cell, None, 95 | self.enc, enc_init_state, 96 | seq_length, res_length, hidden_units * 2, 97 | num_glimpse, self.batch_size, 98 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 99 | mode="CUSTOM", reuse=False, beam_size=None, 100 | temperature=temperature, 101 | train_sample=train_sample, predict_sample=predict_sample 102 | ) 103 | # logits: [batch_size, res_length, seq_length] 104 | self.custom_logits = tf.identity(custom_logits, name="custom_logits") 105 | # sample_path: [batch_size, res_length] 106 | self.custom_path = tf.identity(custom_path, name="custom_path") 107 | self.custom_result = batch_gather(self.item_cand, self.custom_path) 108 | sampled_logits, sampled_path, _ = ptn_rnn_decoder( 109 | dec_cell, None, 110 | self.enc, enc_init_state, 111 | seq_length, res_length, hidden_units * 2, 112 | num_glimpse, self.batch_size, 113 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 114 | mode="SAMPLE", reuse=True, beam_size=None, 115 | temperature=temperature, 116 | train_sample=train_sample, predict_sample=predict_sample 117 | ) 118 | # logits: [batch_size, res_length, seq_length] 119 | self.sampled_logits = tf.identity(sampled_logits, name="sampled_logits") 120 | # sample_path: [batch_size, res_length] 121 | self.sampled_path = tf.identity(sampled_path, name="sampled_path") 122 | self.sampled_result = batch_gather(self.item_cand, self.sampled_path) 123 | 124 | # self.decode_target_ids is placeholder 125 | decoder_logits, _ = ptn_rnn_decoder( 126 | dec_cell, self.decode_target_ids, 127 | self.enc, enc_init_state, 128 | seq_length, res_length, hidden_units * 2, 129 | num_glimpse, self.batch_size, 130 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 131 | mode="TRAIN", reuse=True, beam_size=None, 132 | temperature=temperature, 133 | train_sample=train_sample, predict_sample=predict_sample 134 | ) 135 | self.dec_logits = tf.identity(decoder_logits, name="dec_logits") 136 | 137 | _, beam_path, _ = ptn_rnn_decoder( 138 | dec_cell, None, 139 | self.enc, enc_init_state, 140 | seq_length, res_length, hidden_units * 2, 141 | num_glimpse, self.batch_size, 142 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 143 | mode="BEAMSEARCH", reuse=True, beam_size=beam_size, 144 | temperature=temperature, 145 | train_sample=train_sample, predict_sample=predict_sample 146 | ) 147 | self.beam_path = tf.identity(beam_path, name="beam_path") 148 | self.beam_result = batch_gather(self.item_cand, self.beam_path) 149 | 150 | _, 
greedy_path, _ = ptn_rnn_decoder( 151 | dec_cell, None, 152 | self.enc, enc_init_state, 153 | seq_length, res_length, hidden_units * 2, 154 | num_glimpse, self.batch_size, 155 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 156 | mode="GREEDY", reuse=True, beam_size=None, 157 | temperature=temperature, 158 | train_sample=train_sample, predict_sample=predict_sample 159 | ) 160 | self.greedy_path = tf.identity(greedy_path, name="greedy_path") 161 | self.greedy_result = batch_gather(self.item_cand, self.greedy_path) 162 | 163 | if is_training: 164 | # Loss 165 | # self.y_smoothed = label_smoothing(tf.one_hot(self.decode_target_ids, depth=hp.data_length)) 166 | self.r_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.dec_logits, 167 | labels=self.decode_target_ids) 168 | # reinforcement 169 | self.policy_loss = tf.reduce_mean(tf.reduce_sum(self.r_loss, axis=1) * self.reward) 170 | # supervised loss 171 | self.loss = self.policy_loss 172 | 173 | # Training Scheme 174 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 175 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 176 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 177 | 178 | self.variables = tf.global_variables() 179 | 180 | 181 | class Discriminator: 182 | def __init__(self, lr=0.005, seq_length=500): 183 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user') 184 | self.batch_size = tf.shape(self.user)[0] 185 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand') 186 | 187 | self.reward_target = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size] 188 | 189 | dense0 = self.user 190 | dense1 = tf.layers.dense(dense0, 128, activation=tf.nn.relu) 191 | dense2 = tf.layers.dense(dense1, 128, activation=tf.nn.relu) 192 | dense3 = tf.layers.dense(dense2, 128, activation=tf.nn.relu) 193 | 194 | self.reward = tf.layers.dense(dense3, 1)[:, 0] 195 | 196 | self.td_error = tf.abs(self.reward_target - self.reward) 197 | self.loss = tf.square(self.td_error) 198 | 199 | # Training Scheme 200 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 201 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 202 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 203 | --------------------------------------------------------------------------------
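
The Generator above is a pointer-network policy and the Discriminator a value-style critic: the policy loss multiplies the per-step cross-entropy of a sampled slate by a scalar reward, so feeding an advantage (reward minus the critic's baseline) gives a REINFORCE-style update. Below is a minimal smoke-test sketch of that coupling. It is not part of the repository: it assumes TF 1.x, the four mask arrays are stand-ins whose real shapes are dictated by ptn_rnn_decoder in the exact_k layers module, and the environment reward is faked with random numbers.

import numpy as np
import tensorflow as tf
from rl4rs.nets.exact_k.model import Generator, Discriminator

seq_length, res_length, batch_size = 500, 9, 4
dummy_mask = np.ones(seq_length, dtype=np.float32)  # assumed mask shape

g = Generator(dummy_mask, dummy_mask, dummy_mask, dummy_mask,
              seq_length=seq_length, res_length=res_length)
d = Discriminator(seq_length=seq_length)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    user = np.random.rand(batch_size, 256).astype(np.float32)
    item_cand = np.random.randint(0, seq_length, size=(batch_size, seq_length))

    # 1. sample a slate (positions into item_cand) from the generator policy
    path = sess.run(g.sampled_path, {g.user: user, g.item_cand: item_cand})
    # 2. the critic predicts a baseline reward from the user vector alone
    baseline = sess.run(d.reward, {d.user: user})
    env_reward = np.random.rand(batch_size)  # stand-in for the simulator reward
    # 3. REINFORCE step on the advantage; squared-error step for the critic
    sess.run(g.train_op, {g.user: user, g.item_cand: item_cand,
                          g.decode_target_ids: path,
                          g.reward: env_reward - baseline})
    sess.run(d.train_op, {d.user: user, d.reward_target: env_reward})

In the repository's actual training script the same ops are driven by simulator rewards rather than random numbers; this sketch only shows how the placeholders and train ops fit together.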
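
For completeness, here is a hedged sketch of how the environment classes earlier in this section compose: SeqSlateRecEnv is the file-backed simulator (a RecSimBase subclass) and RecEnvBase wraps it as a batched gym.Env. Every value in the config below — the two file paths, max_steps, and action_size — is an illustrative assumption, not a documented default; the real values come from the benchmark configuration.

import numpy as np
from rl4rs.env.base import RecEnvBase
from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState

config = {
    'sample_file': '/path/to/rl4rs_dataset_b.csv',  # '@'-separated records (assumed path)
    'model_file': '/path/to/simulator_checkpoint',  # TF checkpoint of the reward model (assumed path)
    'batch_size': 64,
    'max_steps': 18,     # two 9-item pages (assumed)
    'action_size': 284,  # assumed item-vocabulary size
    'page_items': 9,
    'cache_size': 2048,
    'gpu': False,
}

sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
env = RecEnvBase(sim)
env.seed(0)

obs = env.reset()
for _ in range(config['max_steps']):
    # one item id per user in the batch; rewards are non-zero only at
    # page boundaries (see SeqSlateRecEnv.forward above)
    actions = np.random.randint(0, config['action_size'], size=config['batch_size'])
    obs, reward, done, info = env.step(actions)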