├── rl4rs
│   ├── nets
│   │   ├── __init__.py
│   │   ├── cql
│   │   │   ├── __init__.py
│   │   │   ├── q_function.py
│   │   │   └── encoder.py
│   │   ├── exact_k
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   └── model.py
│   │   ├── rllib
│   │   │   ├── __init__.py
│   │   │   ├── rllib_rawstate_model.py
│   │   │   └── rllib_mask_model.py
│   │   ├── dnn.py
│   │   ├── widedeep.py
│   │   ├── lstm_slate.py
│   │   ├── lstm.py
│   │   ├── dien.py
│   │   ├── dnn_slate.py
│   │   ├── widedeep_slate.py
│   │   ├── dien_slate.py
│   │   ├── lstm_slate_multiclass.py
│   │   ├── dnn_slate_multiclass.py
│   │   ├── widedeep_slate_multiclass.py
│   │   ├── dien_slate_multiclass.py
│   │   ├── adversarial_slate.py
│   │   └── utils.py
│   ├── policy
│   │   ├── __init__.py
│   │   ├── behavior_model.py
│   │   └── policy_model.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── httpEnv.py
│   │   └── gymHttpClient.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── fileutil.py
│   │   ├── rllib_print.py
│   │   ├── rllib_vector_env.py
│   │   ├── d3rlpy_scorer.py
│   │   └── offline_policy_metrics.py
│   ├── mdpchecker
│   │   ├── __init__.py
│   │   └── decoder.py
│   ├── env
│   │   ├── __init__.py
│   │   ├── seqslate.py
│   │   └── base.py
│   └── __init__.py
├── assets
│   ├── fuxi.jpg
│   └── new.gif
├── RL4RS_appendix.pdf
├── reproductions
│   ├── run_mdp_checker.sh
│   ├── run_supervised_item.sh
│   ├── run_supervised_slate.sh
│   ├── file_split.sh
│   ├── run_simulator_env_test.sh
│   ├── run_simulator_train.sh
│   ├── run_simulator_eval.sh
│   ├── run_exact_k.sh
│   ├── run_modelfree_rl.sh
│   └── run_split.sh
├── script
│   ├── modelfree_trainer.py
│   ├── supervised_train.py
│   ├── simulator_eval.py
│   ├── test_exact_k.py
│   ├── simulator_env_test.py
│   ├── offline_evaluation.py
│   ├── exact_k_train.py
│   ├── batchrl_train.py
│   ├── data_preprocess.py
│   └── mdpchecker
│       ├── mdp_checker.py
│       └── preprocess.py
├── index.html
└── environment.yml

/rl4rs/nets/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/cql/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/policy/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/server/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/utils/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/mdpchecker/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/rl4rs/nets/rllib/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/assets/fuxi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/fuxi.jpg
--------------------------------------------------------------------------------
/assets/new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/new.gif
--------------------------------------------------------------------------------
/RL4RS_appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/RL4RS_appendix.pdf
--------------------------------------------------------------------------------
/rl4rs/env/__init__.py:
--------------------------------------------------------------------------------
from .base import RecDataBase, RecSimBase, RecEnvBase, RecState

__all__ = [
    "RecDataBase",
    "RecSimBase",
    "RecEnvBase",
    "RecState",
]
--------------------------------------------------------------------------------
/rl4rs/__init__.py:
--------------------------------------------------------------------------------
from gym.envs.registration import register

register(
    id='HttpEnv-v0',
    entry_point='rl4rs.server.httpEnv:HttpEnv',
)

register(
    id='SlateRecEnv-v0',
    entry_point='rl4rs.env:RecEnvBase',
)

register(
    id='SeqSlateRecEnv-v0',
    entry_point='rl4rs.env:RecEnvBase',
)
--------------------------------------------------------------------------------
/reproductions/run_mdp_checker.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

dataset=$1

cd ${script_dir}/mdpchecker
python -u preprocess.py $dataset ${rl4rs_dataset_dir} &&
python -u mdp_checker.py $dataset ${rl4rs_dataset_dir} >> ${rl4rs_output_dir}/data_understanding_tool_${dataset}.log &&
echo "1"
--------------------------------------------------------------------------------
/rl4rs/utils/fileutil.py:
--------------------------------------------------------------------------------
import os
import sys
import glob
import numpy as np


def find_match_files(pattern, search_path, pathsep=os.pathsep):
    for path in search_path.split(pathsep):
        for match in glob.glob(os.path.join(path, pattern)):
            yield match


def find_newest_files(pattern, search_path, pathsep=os.pathsep):
    files = []
    timestamps = []
    for path in search_path.split(pathsep):
        for match in glob.glob(os.path.join(path, pattern)):
            files.append(match)
            timestamps.append(float(os.path.getctime(match)))
    if len(files) > 0:
        return files[np.argmax(timestamps)]
    else:
        return ''
--------------------------------------------------------------------------------
/reproductions/run_supervised_item.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
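# NOTE: `conda activate` in a non-interactive script assumes conda's shell hook
# has already been sourced (e.g. `source <conda_root>/etc/profile.d/conda.sh`,
# path illustrative); otherwise run these reproduction scripts from an already
# activated shell.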
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# supervised learning evaluation

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord" "${rl4rs_output_dir}/supervised_a_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_item.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_item.log &&

echo "1"

--------------------------------------------------------------------------------
/reproductions/run_supervised_slate.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# supervised learning evaluation

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_a_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_slate.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_slate.log &&

echo "1"

--------------------------------------------------------------------------------
/rl4rs/utils/rllib_print.py:
--------------------------------------------------------------------------------
import json
import yaml
from ray.tune.utils.util import SafeFallbackEncoder


def pretty_print(result):
    result = result.copy()
    result.update(config=None)  # drop config from pretty print
    result.update(hist_stats=None)  # drop hist_stats from pretty print
    out = {}
    print_keys = ('episode_reward_mean',
                  'episode_reward_min',
                  'timesteps_total',
                  'training_iteration')
    for k, v in result.items():
        if v is not None:
            if k in print_keys:
                out[k] = v
            elif k == 'evaluation':
                out[k] = {
                    'episode_reward_mean': v['episode_reward_mean'],
                    'episode_reward_min': v['episode_reward_min'],
                }
    cleaned = json.dumps(out, cls=SafeFallbackEncoder)
    return yaml.safe_dump(json.loads(cleaned), default_flow_style=False)
--------------------------------------------------------------------------------
/reproductions/file_split.sh:
--------------------------------------------------------------------------------
#!/bin/bash

script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
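# Splits ${file} into five shards, bucketing rows by the second '@'-separated
# field modulo 11 (buckets 0-1, 2-3, 4-5, 6-7 and 8-10, so the last shard is
# roughly half again as large as the others); see the awk commands below.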
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

file=$1

cd ${rl4rs_dataset_dir} &&

awk -F "@" '$2%11<2 {print}' ${file} > ${rl4rs_output_dir}/${file}_0000.csv &&
awk -F "@" '$2%11>=2 && $2%11<4 {print}' ${file} > ${rl4rs_output_dir}/${file}_0001.csv &&
awk -F "@" '$2%11>=4 && $2%11<6 {print}' ${file} > ${rl4rs_output_dir}/${file}_0002.csv &&
awk -F "@" '$2%11>=6 && $2%11<8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0003.csv &&
awk -F "@" '$2%11>=8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0004.csv

#file_rows=`wc -l ${file}|awk '{print $1}'`
#file_num=5
#file_num_row=$((${file_rows} + 4))
#every_file_row=$((${file_num_row}/${file_num}))
#split -d -a 4 -l ${every_file_row} ${file} --additional-suffix=.csv ${rl4rs_output_dir}/${file}_

--------------------------------------------------------------------------------
/reproductions/run_simulator_env_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

head -1 ${rl4rs_dataset_dir}/rl4rs_dataset_a_train.csv > ${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv
python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_tiny.tfrecord" "tfrecord_item"
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':False}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':True}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':False,'action_emb_size':32}" &&
python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':True,'action_emb_size':32}" &&
echo '1'
--------------------------------------------------------------------------------
/script/modelfree_trainer.py:
--------------------------------------------------------------------------------
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.a3c as a3c
import ray.rllib.agents.pg as pg
import ray.rllib.agents.ddpg.td3 as td3
import ray.rllib.agents.impala as impala
import ray.rllib.agents.ddpg as ddpg
import ray.rllib.agents.slateq as slateq


def get_rl_model(algo, rllib_config):
    trainer = None
    if algo == "PPO":
        trainer = ppo.PPOTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "DQN":
        trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "RAINBOW":
        trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "A2C":
        trainer = a3c.A2CTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "A3C":
        trainer = a3c.A3CTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "PG":
        trainer = pg.PGTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "DDPG":
        trainer = ddpg.DDPGTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "TD3":
        trainer = td3.TD3Trainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "IMPALA":
        trainer = impala.ImpalaTrainer(config=rllib_config, env="rllibEnv-v0")
    elif algo == "SLATEQ":
        trainer = slateq.SlateQTrainer(config=rllib_config, env="rllibEnv-v0")
    else:
        assert algo in ("PPO", "DQN", "A2C", "A3C", "PG", "DDPG", "IMPALA", "TD3", "RAINBOW", "SLATEQ")
    print('trainer_default_config', trainer._default_config)
    return trainer
--------------------------------------------------------------------------------
/reproductions/run_simulator_train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

conda activate rl4rs
script_abs=$(readlink -f "$0")
rl4rs_benchmark_dir=$(dirname $script_abs)/..
rl4rs_output_dir=${rl4rs_benchmark_dir}/output
rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
script_dir=${rl4rs_benchmark_dir}/script
export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir

algo=$1

cd ${script_dir}

# RL Env Construction

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/simulator_a_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_sl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/simulator_a_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_rl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/simulator_a_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/simulator_b2_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_sl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/simulator_b2_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_rl_${algo}.log &&

python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/simulator_b2_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_${algo}.log &&

echo "1"
--------------------------------------------------------------------------------
/script/supervised_train.py:
--------------------------------------------------------------------------------
import os
import sys
import glob
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
if tf.test.is_gpu_available():
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
from tensorflow import keras
from rl4rs.utils.datautil import FeatureUtil
from rl4rs.utils.fileutil import find_match_files

config = {
    "epoch": 20,
    "maxlen": 64,
    "batch_size": 256,
    "class_num": 2,
    "dense_feature_num": 432,
    "category_feature_num": 21,
    "category_hash_size": 100000,
    "seq_num": 2,
    "emb_size": 128,
    "hidden_units": 128,
    "action_size": 284
}
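# Example invocation (illustrative paths; mirrors reproductions/run_supervised_item.sh):
#   python supervised_train.py \
#       ../output/rl4rs_dataset_a_train.tfrecord \
#       ../output/rl4rs_dataset_a_test.tfrecord \
#       ../output/supervised_a_train_dnn/model dnn 0
# argv: train tfrecord, test tfrecord, checkpoint prefix, model type
# (resolved as rl4rs.nets.<type>), and whether labels are per-slate (1) or per-item (0).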
train_file = sys.argv[1]
test_file = sys.argv[2]
model_file = sys.argv[3]
model_type = sys.argv[4]
is_slate_label = bool(int(sys.argv[5]))
featureutil = FeatureUtil(config)

train_files = [match for match in find_match_files(train_file + '*', train_file)]
test_files = [match for match in find_match_files(test_file + '*', test_file)]
print('train on ', train_files, ' test on ', test_files)
iter_train = featureutil.read_tfrecord(train_files, is_slate_label=is_slate_label)
iter_test = featureutil.read_tfrecord(test_files, is_slate_label=is_slate_label)
model = __import__("rl4rs.nets." + model_type, fromlist=['get_model']).get_model(config)
steps_per_epoch = 600000 // config["batch_size"]
steps_per_epoch_val = 400000 // config["batch_size"]
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=2, mode='min')
model.fit(iter_train, steps_per_epoch=steps_per_epoch, epochs=int(config["epoch"]),
          validation_data=iter_test, validation_steps=steps_per_epoch_val, verbose=2, callbacks=[earlyStopping])

saver = tf.train.Saver()
sess = tf.keras.backend.get_session()
saver.save(sess, model_file)
--------------------------------------------------------------------------------
/rl4rs/nets/dnn.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    # note: sequence_feature is computed but not concatenated into all_feature
    # in this plain-DNN baseline
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')

    dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')

    category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/lstm.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    # category_feature_concat = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/utils.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np


def index_matrix_to_pairs_fn(batch_size, seq_length):
    replicated_first_indices = tf.range(batch_size)  # range(128)
    # replicated_first_indices =
    # [[ 0,  0,  0,...],
    #  [ 1,  1,  1,...],
    #  ......
    #  [127,127,127,...]]
    replicated_first_indices2 = tf.tile(
        tf.expand_dims(replicated_first_indices, axis=1),  # [128,1]
        [1, seq_length])

    def index_matrix_to_pairs(index_matrix):
        """
        :param index_matrix: [batch_size, data_len] or [batch_size]
        :return: [batch_size, data_len, 2] or [batch_size, 2]
        ie:
        a: [128, 10] -> c[i,j,:] = [i,a[i,j]], shape(c) = [128,10,2]
        a: [128] -> c[i,:] = [i,a[i]], shape(c) = [128,2]
        """
        rank = len(index_matrix.get_shape())
        if rank == 1:
            return tf.stack([replicated_first_indices, index_matrix], axis=rank)
        elif rank == 2:
            return tf.stack([replicated_first_indices2, index_matrix], axis=rank)
        else:
            raise NotImplementedError("index_matrix rank should be 1 or 2, but %d found" % rank)

    return index_matrix_to_pairs


def batch_gather(data, indices):
    batch_size = data.get_shape()[0].merge_with(indices.get_shape()[0]).value
    if batch_size is None:
        batch_size = tf.shape(indices)[0]
    gather_data_size = indices.get_shape()[1].value
    if gather_data_size is None:
        gather_data_size = tf.shape(indices)[1]
    flat_indices = tf.reshape(tf.transpose(indices), (-1,))  # [batch*4,1]
    input_index_pairs = tf.stop_gradient(tf.stack(
        [tf.range(batch_size*gather_data_size, dtype=tf.int32), flat_indices], axis=1))
    flat_data = tf.tile(data, [gather_data_size, 1])
    return tf.transpose(tf.reshape(tf.gather_nd(flat_data, input_index_pairs), (gather_data_size, batch_size)))
--------------------------------------------------------------------------------
/rl4rs/nets/dien.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])
    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
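    # sequence_input_attn / id_input_processing_attn create TF variables outside
    # Keras' own build step, which is presumably why all globals are initialised
    # explicitly here before compile (the *_slate attention models do the same).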
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])

    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)

    model = Model(inputs=[
        sequence_feature_input,
        dense_feature_input,
        category_feature_input,
        slate_label_input],
        outputs=[output])
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model
--------------------------------------------------------------------------------
/rl4rs/server/httpEnv.py:
--------------------------------------------------------------------------------
import gym
import numpy as np
from rl4rs.server.gymHttpClient import Client


class HttpEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, env_id, config={}):
        remote_base = config["remote_base"]
        self.client = Client(remote_base)
        self.instance_id = self.client.env_create(env_id, config)
        action_info = self.client.env_action_space_info(self.instance_id)
        obs_info = self.client.env_observation_space_info(self.instance_id)
        if action_info['name'] == 'Box':
            self.action_space = gym.spaces.Box(np.array(action_info['low']), np.array(action_info['high']), shape=action_info['shape'])
        else:
            self.action_space = gym.spaces.Discrete(action_info['n'])
        if obs_info['name'] == 'Box':
            self.observation_space = gym.spaces.Box(np.array(obs_info['low']), np.array(obs_info['high']), shape=obs_info['shape'])
        elif obs_info['name'] == 'Dict':
            keys = obs_info['keys']
            space_D = {}
            for key in keys:
                shape = obs_info[key]['shape']
                space_D[key] = gym.spaces.Box(np.array(obs_info[key]['low']).reshape(shape), np.array(obs_info[key]['high']).reshape(shape), shape=shape)
            self.observation_space = gym.spaces.Dict(space_D)
        else:
            assert obs_info['name'] in ('Box', 'Dict')

    def seed(self, sd=0):
        pass

    def step(self, action):
        if isinstance(action, np.ndarray):
            action = action.tolist()
        # np.int is just an alias of int and misses numpy integer scalars,
        # so check np.integer to convert e.g. np.int64 actions for JSON transport
        if isinstance(action, (int, np.integer)):
            action = int(action)
        observation, reward, done, info = self.client.env_step(self.instance_id, action, False)
        return self.observation_space.from_jsonable(observation), reward, done, info

    def reset(self):
        observation = self.client.env_reset(self.instance_id)
        return self.observation_space.from_jsonable(observation)

    def render(self, mode='human', close=False):
        return ''

    def close(self):
        return self.client.env_close(self.instance_id)
--------------------------------------------------------------------------------
/script/simulator_eval.py:
--------------------------------------------------------------------------------
import os, sys
import gym
import numpy as np
from rl4rs.env.slate import SlateRecEnv, SlateState
from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState

extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {}

config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
          "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9,
          "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../dataset/item_info.csv',
          "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SeqSlateRecEnv-v0"}

config = dict(config, **extra_config)

if config.get('gpu', 0) < 1:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if config['env'] == 'SeqSlateRecEnv-v0':
    config['max_steps'] = 36
    sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
    env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
else:
    sim = SlateRecEnv(config, state_cls=SlateState)
    env = gym.make('SlateRecEnv-v0', recsim=sim)

batch_size = config["batch_size"]
epoch = config["epoch"]
max_steps = config["max_steps"]
rewards = np.zeros((epoch, batch_size))
offline_rewards = np.zeros((epoch, batch_size))
offline_actions = np.zeros((epoch, batch_size, max_steps))

for i in range(epoch):
    env.reset()
    for j in range(config["max_steps"]):
        action = env.offline_action
        offline_actions[i, :, j] = env.offline_action
        next_obs, reward, done, info = env.step(action)
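        # accumulate the simulator's predicted reward alongside the logged
        # (offline) reward so the two can be compared per episode below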
        rewards[i] = rewards[i] + np.array(reward)
        offline_rewards[i] = offline_rewards[i] + np.array(env.offline_reward)
        if done[0]:
            print(
                i,
                np.sum(rewards) / config["batch_size"] / (i + 1),
                np.sum(offline_rewards) / config["batch_size"] / (i + 1)
            )
            break
print('the mean of offline reward', np.mean(offline_rewards))
print('the mean of reward prediction error', np.mean(rewards - offline_rewards))
print('the absolute mean of reward prediction error', np.mean(np.abs(rewards - offline_rewards)))
print('the std of reward prediction error', np.std(np.reshape(rewards - offline_rewards, -1)))
print('success')

--------------------------------------------------------------------------------
/script/test_exact_k.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from rl4rs.nets.exact_k.model import Generator, Discriminator

batch_size = 2
l1_mask = np.zeros(284)
l1_mask[:40] = 1
l2_mask = np.zeros(284)
l2_mask[40:150] = 1
l3_mask = np.zeros(284)
l3_mask[150:] = 1
l0_ssr_mask = np.zeros(284)
l0_ssr_mask[:30] = 1
l0_ssr_mask[40:140] = 1
l0_ssr_mask[160:] = 1

with tf.name_scope('Generator'):
    g = Generator(l1_mask,
                  l2_mask,
                  l3_mask,
                  l0_ssr_mask,
                  is_training=True,
                  seq_length=284)

with tf.name_scope('Discriminator'):
    d = Discriminator(seq_length=284)

print("Graph loaded")

gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.95,
    allow_growth=True)
sess_config = tf.ConfigProto(allow_soft_placement=True,
                             gpu_options=gpu_options)

with tf.Session(config=sess_config) as sess:
    sess.run(tf.global_variables_initializer())
    print('Generator training start!')

    reward_total = 0.0
    observation = np.random.random((batch_size, 256))
    item_cand = np.array([list(range(0, 284))] * batch_size)
    for _ in range(9):
        sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
                                                  feed_dict={g.user: observation, g.item_cand: item_cand})
        reward = np.ones((batch_size,))

        reward_ = sess.run(d.reward, feed_dict={d.user: observation})
        sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})

        reward_total += np.mean(reward)

        reward = (reward - reward_)

        sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
                                        g.reward: reward,
                                        g.item_cand: item_cand,
                                        g.user: observation,
                                        })
        gs_gen = sess.run(g.global_step)

        # beamsearch
        # beam_card = sess.run(g.infer_result,
        #                      feed_dict={g.item_cand: item_cand,
        #                                 g.enc_user: observation})

        print(sampled_card_idx, sampled_card, reward_)

print("Done")
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22),
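                                                    # slate2label packs the 9 binary labels into an int in [0, 21]:
                                                    # each 3-item page contributes a value 0-7 (bits weighted 1/2/4)
                                                    # and the three pages are summed, hence the 22-way one-hot target.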
                                                    y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')

    dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')

    category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    category_feature = utils.id_input_processing_lstm(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )
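    # slate_label is declared as a model input only so all model variants share
    # the same four-tuple feature format; it does not feed the forward pass
    # (the labels reach the loss as y_true instead).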
    category_feature = utils.id_input_processing(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
    all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
[RL4RS Dataset landing page. The HTML markup was lost in extraction; only the
page title "RL4RS Dataset" is recoverable.]
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(
        shape=(9,), dtype='int64', name='slate_label'
    )

    category_feature = utils.id_input_processing_concat(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
    # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
    sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
    all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
        [sequence_feature_dnn, dense_feature, category_feature]
    )
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(all_feature)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate_multiclass.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from rl4rs.nets import utils


def my_loss_fn(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)


def my_acc_metrics(y_true, y_pred):
    slate2label = tf.einsum('ij,j->i',
                            tf.cast(y_true, tf.int64),
                            tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
    return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)


def get_model(config):
    maxlen = config['maxlen']
    dense_feature_num = config['dense_feature_num']
    category_feature_num = config['category_feature_num']
    class_num = config['class_num']
    seq_num = config['seq_num']

    sequence_feature_input = layers.Input(
        shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
    )

    dense_feature_input = layers.Input(
        shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
    )

    category_feature_input = layers.Input(
        shape=(category_feature_num,), dtype='int64', name='category_feature_input'
    )

    slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')

    slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
    id_slate_input = slice_layer([category_feature_input, -10])

    category_feature = utils.id_input_processing_attn(category_feature_input, config)
    dense_feature = utils.dense_input_processing(dense_feature_input, config)
    sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
    all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
    obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
    output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)

    model = Model(inputs=[sequence_feature_input,
                          dense_feature_input,
                          category_feature_input,
                          slate_label_input],
                  outputs=[output])
    tf.keras.backend.get_session().run(tf.global_variables_initializer())
    model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
    return model
--------------------------------------------------------------------------------
/rl4rs/policy/behavior_model.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
# tf.compat.v1.enable_eager_execution()
from tensorflow import keras
from rl4rs.utils.datautil import FeatureUtil
from copy import deepcopy


class behavior_model(object):
    def __init__(self, config, modelfile):
        behavior_config = deepcopy(config)
        behavior_config['category_feature_num'] = 21
        behavior_config['dense_feature_num'] = 50
        self.featureutil = FeatureUtil(behavior_config)
        self.item_feature_size = config.get('item_feature_size', 40)
        self.page_items = config.get("page_items", 9)
        self.sess = tf.Session()
        with self.sess.as_default():
            self.model = keras.models.load_model(modelfile)

    def record2input(self, records, page=0):
        inputs = []
        for record in records:
            role_id, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \
                user_protrait, item_feature, _ = self.featureutil.record_split(record)
            category_feature = user_protrait[:10] + \
                [sequence_id] + \
                exposed_items[self.page_items*page:self.page_items*(page+1)]
            sequence_feature = [user_seqfeature, [0]]
            label = 0
            dense_feature_size = self.item_feature_size*self.page_items
            item_feature = item_feature[dense_feature_size*page:dense_feature_size*(page+1)]
            item_feature = np.array(item_feature).reshape((self.page_items, self.item_feature_size))
            item_feature = item_feature[:, :5].reshape(-1)
            inputs.append((
                role_id,
                sequence_feature,
                item_feature,
                category_feature,
                user_feedback[self.page_items*page:self.page_items*(page+1)],
                label))
        return inputs

    def action_probs(self, record, action, layer, page=0):
        batch_size = len(action)
        seq, dense, category, slate = self.featureutil.feature_extraction(self.record2input(record, page))[0]
        with self.sess.as_default():
            y = self.model.predict([seq, dense, category, slate])
        # items 1-39 belong to layer 1, 40-147 to layer 2 and 148-283 to layer 3;
        # probabilities are renormalised within the selected layer
        if layer == 1:
            action = np.clip(np.array(action) - 1, 0, 38)
            action_probs = y[:, 1:40] / np.sum(y[:, 1:40], axis=1, keepdims=True)
        elif layer == 2:
            action = np.clip(np.array(action) - 40, 0, 107)
            action_probs = y[:, 40:148] / np.sum(y[:, 40:148], axis=1, keepdims=True)
        else:
            # y[:, 148:] has 284 - 148 = 136 columns, so offsets are clipped to 0..135
            action = np.clip(np.array(action) - 148, 0, 135)
            action_probs = y[:, 148:] / np.sum(y[:, 148:], axis=1, keepdims=True)
        return action_probs[range(batch_size), action]
--------------------------------------------------------------------------------
/rl4rs/utils/rllib_vector_env.py:
--------------------------------------------------------------------------------
import numpy as np
from typing import Callable, List, Optional, Tuple
from ray.rllib.utils.typing import EnvActionType, EnvConfigDict, EnvInfoDict, \
    EnvObsType, EnvType, PartialTrainerConfigDict
from ray.rllib.env.vector_env import VectorEnv
from rl4rs.env import RecEnvBase


class MyVectorEnvWrapper(VectorEnv):
    """An environment that supports batch evaluation using clones of sub-envs.
    """

    def __init__(self, env: RecEnvBase, batch_size: int):
        """Initializes a VectorEnv object.

        Args:
            observation_space (Space): The observation Space of a single
                sub-env.
            action_space (Space): The action Space of a single sub-env.
            num_envs (int): The number of clones to make of the given sub-env.
        """
        self.env = env
        self.reset_cache = []
        super().__init__(self.env.observation_space, self.env.action_space, num_envs=batch_size)

    def vector_reset(self) -> List[EnvObsType]:
        """Resets all sub-environments.

        Returns:
            obs (List[any]): List of observations from each environment.
        """
        return self.env.reset()

    def reset_at(self, index: Optional[int] = None) -> EnvObsType:
        """Resets a single environment.

        Args:
            index (Optional[int]): An optional sub-env index to reset.

        Returns:
            obs (obj): Observations from the reset sub environment.
42 | """ 43 | if index == 0: 44 | self.reset_cache = self.env.reset() 45 | return self.reset_cache[index] 46 | 47 | def vector_step( 48 | self, actions: List[EnvActionType] 49 | ) -> Tuple[List[EnvObsType], List[float], List[bool], List[EnvInfoDict]]: 50 | """Performs a vectorized step on all sub environments using `actions`. 51 | 52 | Args: 53 | actions (List[any]): List of actions (one for each sub-env). 54 | 55 | Returns: 56 | obs (List[any]): New observations for each sub-env. 57 | rewards (List[any]): Reward values for each sub-env. 58 | dones (List[any]): Done values for each sub-env. 59 | infos (List[any]): Info values for each sub-env. 60 | """ 61 | return self.env.step(np.array(actions)) 62 | 63 | def get_unwrapped(self) -> List[EnvType]: 64 | """Returns the underlying sub environments. 65 | 66 | Returns: 67 | List[Env]: List of all underlying sub environments. 68 | """ 69 | return [self.env, ] * self.num_envs 70 | 71 | # Experimental method. 72 | def try_render_at(self, index: Optional[int] = None) -> None: 73 | """Renders a single environment. 74 | 75 | Args: 76 | index (Optional[int]): An optional sub-env index to render. 77 | """ 78 | return self.env.render() 79 | -------------------------------------------------------------------------------- /script/simulator_env_test.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | tf.compat.v1.enable_eager_execution() 6 | from rl4rs.utils.datautil import FeatureUtil 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | 10 | extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {} 11 | 12 | config = {"epoch": 1, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 13 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9, 14 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_train.csv', 15 | "iteminfo_file": '../dataset/item_info.csv', "tfrecord_file":'../output/rl4rs_dataset_a_train_tiny.tfrecord', 16 | "model_file": "../output/supervised_a_train_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SlateRecEnv-v0", 17 | "support_conti_env":True, "rawstate_as_obs":False} 18 | 19 | config = dict(config, **extra_config) 20 | 21 | if config.get('gpu', 0) < 1: 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 23 | 24 | if config['env'] == 'SeqSlateRecEnv-v0': 25 | config['max_steps'] = 36 26 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState) 27 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 28 | else: 29 | sim = SlateRecEnv(config, state_cls=SlateState) 30 | env = gym.make('SlateRecEnv-v0', recsim=sim) 31 | 32 | batch_size = config["batch_size"] 33 | epoch = config["epoch"] 34 | max_steps = config["max_steps"] 35 | rewards = np.zeros((epoch, batch_size)) 36 | offline_rewards = np.zeros((epoch, batch_size)) 37 | offline_actions = np.zeros((epoch, batch_size, max_steps)) 38 | for i in range(epoch): 39 | env.reset(reset_file=True) 40 | for j in range(config["max_steps"]): 41 | if not config.get("support_conti_env"): 42 | action = env.offline_action 43 | else: 44 | action = np.full((batch_size, 32), 1) 45 | offline_actions[i, :, j] = env.offline_action 46 | next_obs, reward, done, info = env.step(action) 47 | rewards[i] = rewards[i] + np.array(reward) 48 | offline_rewards[i] = offline_rewards[i] + 
np.array(env.offline_reward) 49 | if done[0]: 50 | print(next_obs[0], reward[0], action[0], done[0], info[0]) 51 | break 52 | 53 | if config['rawstate_as_obs']: 54 | config['batch_size'] = 1 55 | featureutil = FeatureUtil(config) 56 | iter_train = featureutil.read_tfrecord(config['tfrecord_file'], is_slate_label=False) 57 | feature = iter_train.make_one_shot_iterator().get_next() 58 | seq_feature = feature[0][0].numpy()[0] 59 | dense_feature = feature[0][1].numpy()[0] 60 | category_feature = feature[0][2].numpy()[0] 61 | assert np.min(np.equal(next_obs[0]['category_feature'][:-1], category_feature[:-1])) 62 | assert np.min(np.equal(next_obs[0]['dense_feature'][:-40], dense_feature[:-40])) 63 | assert np.min(np.equal(next_obs[0]['sequence_feature'], seq_feature)) 64 | -------------------------------------------------------------------------------- /script/offline_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import numpy as np 4 | from rl4rs.policy.behavior_model import behavior_model 5 | from rl4rs.policy.policy_model import policy_model 6 | import rl4rs.utils.offline_policy_metrics as OPE 7 | 8 | 9 | def ope_eval(config, eval_env, algo, sample_model: behavior_model = None): 10 | policy = policy_model(algo, config) 11 | metrics = [] 12 | epoch = config["epoch"] 13 | batch_size = config["batch_size"] 14 | max_steps = config["max_steps"] 15 | page_items = config.get("page_items", 9) 16 | for i in range(epoch): 17 | obs = eval_env.reset() 18 | episode_rewards, q_values, off_rewards = [], [], [] 19 | prev_actions = [] 20 | action_probs, behavior_probs, rewards = [], [], [] 21 | print('test batch at ', i) 22 | for j in range(max_steps): 23 | # obs = dict(enumerate(obs)) 24 | action = policy.predict_with_mask(obs) 25 | off_action = eval_env.offline_action 26 | if sample_model is not None: 27 | action_prob = policy.action_probs(obs) 28 | action_prob = action_prob[range(batch_size), off_action] 29 | q_values.append(policy.predict_q(obs, action)) 30 | action_probs.append(action_prob) 31 | behavior_prob = sample_model.action_probs(eval_env.samples.records, off_action, j // 3 + 1, page=j//page_items) 32 | behavior_probs.append(behavior_prob) 33 | obs, reward, done, info = eval_env.step(action) 34 | off_rewards.append(eval_env.offline_reward) 35 | rewards.append(reward) 36 | prev_actions.append(action) 37 | 38 | episode_reward = np.sum(np.array(rewards), axis=0) 39 | episode_rewards.append(episode_reward) 40 | if sample_model is not None: 41 | action_probs = np.array(action_probs).swapaxes(0, 1) 42 | behavior_probs = np.array(behavior_probs).swapaxes(0, 1) 43 | off_rewards = np.array(off_rewards).swapaxes(0, 1) 44 | off_rewards_sum = np.sum(off_rewards, axis=1) 45 | rewards_hat = np.array(rewards).swapaxes(0, 1) 46 | q_values = np.array(q_values).swapaxes(0, 1) 47 | # multiply probs 48 | action_probs_mul = np.multiply.reduce(action_probs*100, axis=1) 49 | behavior_probs_mul = np.multiply.reduce(behavior_probs*100, axis=1) 50 | cips = OPE.eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul) 51 | # snips = OPE.eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul) 52 | dr = OPE.eval_doubly_robust( 53 | episode_reward, 54 | np.average(q_values, 1), 55 | off_rewards_sum, 56 | action_probs_mul, 57 | behavior_probs_mul 58 | ) 59 | # step-wise 60 | wips = OPE.eval_WIPS(off_rewards, action_probs, behavior_probs) 61 | sdr = OPE.eval_seq_doubly_robust( 62 | rewards_hat, 63 | q_values, 64 | off_rewards, 65 | 
action_probs, 66 | behavior_probs 67 | ) 68 | 69 | metrics.append((cips, dr, wips, sdr)) 70 | 71 | print('IS', 'DR', 'WIPS', 'SeqDR', sep=' ') 72 | print(np.average(np.array(metrics), axis=0)) 73 | print(np.std(np.array(metrics), axis=0)) 74 | -------------------------------------------------------------------------------- /rl4rs/nets/cql/q_function.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, cast 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | from typing import Any, ClassVar, Dict, Type 7 | from d3rlpy.models.torch.encoders import Encoder 8 | from d3rlpy.models.torch.q_functions.base import DiscreteQFunction 9 | from d3rlpy.models.torch.q_functions.utility import compute_huber_loss, compute_reduce, pick_value_by_action 10 | from d3rlpy.models.q_functions import QFunctionFactory 11 | from d3rlpy.models.torch import EncoderWithAction, ContinuousMeanQFunction 12 | 13 | 14 | class CustomDiscreteMeanQFunction(DiscreteQFunction, nn.Module): # type: ignore 15 | _action_size: int 16 | _encoder: Encoder 17 | _fc: nn.Linear 18 | 19 | def __init__(self, encoder: Encoder, action_size: int): 20 | super().__init__() 21 | self._action_size = action_size 22 | self._encoder = encoder 23 | # self._fc = nn.Linear(encoder.get_feature_size(), action_size) 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return cast(torch.Tensor, self._encoder(x)) 27 | 28 | def compute_error( 29 | self, 30 | obs_t: torch.Tensor, 31 | act_t: torch.Tensor, 32 | rew_tp1: torch.Tensor, 33 | q_tp1: torch.Tensor, 34 | ter_tp1: torch.Tensor, 35 | gamma: float = 0.99, 36 | reduction: str = "mean", 37 | ) -> torch.Tensor: 38 | one_hot = F.one_hot(act_t.view(-1), num_classes=self.action_size) 39 | q_t = (self.forward(obs_t) * one_hot.float()).sum(dim=1, keepdim=True) 40 | y = rew_tp1 + gamma * q_tp1 * (1 - ter_tp1) 41 | loss = compute_huber_loss(q_t, y) 42 | return compute_reduce(loss, reduction) 43 | 44 | def compute_target( 45 | self, x: torch.Tensor, action: Optional[torch.Tensor] = None 46 | ) -> torch.Tensor: 47 | if action is None: 48 | return self.forward(x) 49 | # q=pick_value_by_action(self.forward(x), action, keepdim=True) 50 | values = self.forward(x) 51 | action_size = values.shape[1] 52 | one_hot = F.one_hot(action.view(-1), num_classes=action_size) 53 | masked_values = values * cast(torch.Tensor, one_hot.float()) 54 | q = masked_values.sum(dim=1, keepdim=True) 55 | # assert torch.min(q)>-100 56 | return q 57 | 58 | @property 59 | def action_size(self) -> int: 60 | return self._action_size 61 | 62 | @property 63 | def encoder(self) -> Encoder: 64 | return self._encoder 65 | 66 | 67 | class CustomMeanQFunctionFactory(QFunctionFactory): 68 | TYPE: ClassVar[str] = "mean" 69 | 70 | def __init__(self, bootstrap: bool = False, share_encoder: bool = False): 71 | super().__init__(bootstrap, share_encoder) 72 | 73 | def create_discrete( 74 | self, 75 | encoder: Encoder, 76 | action_size: int, 77 | ) -> CustomDiscreteMeanQFunction: 78 | return CustomDiscreteMeanQFunction(encoder, action_size) 79 | 80 | def create_continuous( 81 | self, 82 | encoder: EncoderWithAction, 83 | ) -> ContinuousMeanQFunction: 84 | return ContinuousMeanQFunction(encoder) 85 | 86 | def get_params(self, deep: bool = False) -> Dict[str, Any]: 87 | return { 88 | "bootstrap": self._bootstrap, 89 | "share_encoder": self._share_encoder, 90 | } 91 | -------------------------------------------------------------------------------- 
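A minimal usage sketch for the factory above, assuming the d3rlpy 0.x-style keyword arguments (``encoder_factory``, ``q_func_factory``) implied by this module's imports; the sizes (``action_size=284``, ``mask_size=10``, i.e. page_items + 1) and the item-info path are illustrative values taken from configs elsewhere in this repo, not fixed by this file. Because ``CustomDiscreteMeanQFunction`` returns the encoder output directly as Q-values, the factory must be paired with an encoder whose final layer already emits ``action_size`` units, e.g. ``CustomVectorEncoder`` with ``with_q=True`` (see rl4rs/nets/cql/encoder.py):

import d3rlpy
from rl4rs.env.slate import SlateState
from rl4rs.nets.cql.q_function import CustomMeanQFunctionFactory
from rl4rs.nets.cql.encoder import CustomVectorEncoderFactory

# Build the location/special-item masks the custom encoder expects
# (same helper call as in script/exact_k_train.py; the path is illustrative).
location_mask, special_items = SlateState.get_mask_from_file(
    'dataset/item_info.csv', 284)
config = {'location_mask': location_mask, 'special_items': special_items}

# with_q=True makes the encoder's final layer emit one unit per action,
# which CustomDiscreteMeanQFunction then returns directly as Q-values.
encoder_factory = CustomVectorEncoderFactory(
    config, action_size=284, mask_size=10, with_q=True, hidden_units=[256])

cql = d3rlpy.algos.DiscreteCQL(
    encoder_factory=encoder_factory,
    q_func_factory=CustomMeanQFunctionFactory(share_encoder=True),
    batch_size=256)
# cql.fit(...) then proceeds on an MDPDataset as with any d3rlpy algorithm.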
/rl4rs/nets/adversarial_slate.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | from tensorflow.keras.models import Model 5 | from rl4rs.nets import utils 6 | 7 | 8 | def custom_loss(external_loss): 9 | def loss(y_true, y_pred): 10 | return 0.1 * tf.keras.losses.binary_crossentropy(y_true, y_pred) + external_loss 11 | 12 | return loss 13 | 14 | 15 | def my_loss_fn(y_true, y_pred): 16 | item_scores_exp = tf.exp(y_pred) 17 | item_scores_click = tf.einsum('ij,ij->ij', y_pred, tf.cast(y_true, tf.float32)) 18 | return -tf.log(tf.reduce_sum(tf.exp(item_scores_click), axis=1) + 1) \ 19 | + tf.log(tf.reduce_sum(item_scores_exp, axis=1) + 1) 20 | 21 | 22 | def my_metrics(y_true, y_pred): 23 | score = tf.einsum('ij,ij->ij', y_pred, 1 - tf.cast(y_true, tf.float32)) 24 | return tf.reduce_sum(score, 1) 25 | 26 | 27 | def my_mean_metrics(y_true, y_pred): 28 | return tf.reduce_mean(y_pred, 1) 29 | 30 | 31 | def my_max_metrics(y_true, y_pred): 32 | return tf.reduce_max(y_pred, 1) 33 | 34 | 35 | def my_min_metrics(y_true, y_pred): 36 | return tf.reduce_min(y_pred, 1) 37 | 38 | 39 | def get_model(config): 40 | maxlen = config['maxlen'] 41 | dense_feature_num = config['dense_feature_num'] 42 | category_feature_num = config['category_feature_num'] 43 | class_num = config['class_num'] 44 | seq_num = config['seq_num'] 45 | 46 | sequence_feature_input = layers.Input( 47 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input' 48 | ) 49 | dense_feature_input = layers.Input( 50 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input' 51 | ) 52 | category_feature_input = layers.Input( 53 | shape=(category_feature_num,), dtype='int64', name='category_feature_input' 54 | ) 55 | slate_label_input = layers.Input( 56 | shape=(9,), dtype='int64', name='slate_label' 57 | ) 58 | 59 | feature_omit = layers.Lambda(lambda x: x[:, :-1]) 60 | category_feature_input_slate = feature_omit(category_feature_input) 61 | config['category_feature_num'] = config['category_feature_num'] - 1 62 | 63 | category_feature = utils.id_input_processing(category_feature_input_slate, config) 64 | dense_feature = utils.dense_input_processing(dense_feature_input, config) 65 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config) 66 | 67 | all_feature = layers.Concatenate(axis=-1)( 68 | [sequence_feature, dense_feature, category_feature] 69 | ) 70 | item_scores = layers.Dense(9, activation='sigmoid')(all_feature) 71 | item_scores_norm = layers.Softmax()(item_scores) 72 | item_scores_no_click = tf.einsum('ij,ij->ij', 73 | item_scores_norm, 74 | 1 - tf.cast(slate_label_input, tf.float32)) 75 | loss3 = tf.reduce_sum(item_scores_no_click, axis=1) 76 | 77 | model = Model(inputs=[sequence_feature_input, 78 | dense_feature_input, 79 | category_feature_input, 80 | slate_label_input], 81 | outputs=[item_scores]) 82 | model.compile(optimizer='adam', 83 | loss=custom_loss(loss3), 84 | metrics=[ 85 | tf.keras.metrics.AUC(), 86 | tf.keras.metrics.Precision(), 87 | tf.keras.metrics.Recall()]) 88 | return model 89 | -------------------------------------------------------------------------------- /reproductions/run_simulator_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 
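# derive output/dataset/script paths from the repo root resolved above; the
# exported *_dir variables are read by the python scripts via os.environ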
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | algo=$1 12 | 13 | cd ${script_dir} 14 | 15 | # train in train set and test in all sample 16 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_a_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_${algo}.log && 17 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_b2_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_${algo}.log 18 | 19 | # train in all set and test in sl/rl as a baseline 20 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_sl_${algo}.log && 21 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_rl_${algo}.log && 22 | 23 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_sl_${algo}.log && 24 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_rl_${algo}.log && 25 | 26 | # train in sl/rl and test in rl/sl 27 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_sl_rl_${algo}.log && 28 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_rl_sl_${algo}.log && 29 | 30 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_sl_rl_${algo}.log && 31 | python -u simulator_eval.py 
"{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_rl_sl_${algo}.log 32 | 33 | echo '1' -------------------------------------------------------------------------------- /rl4rs/mdpchecker/decoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import bottleneck 4 | 5 | def token_probs(model, 6 | batch_inputs, 7 | batch_outputs): 8 | return np.array(model.predict([np.array(batch_inputs), np.array(batch_outputs)]))[:, -1] 9 | 10 | 11 | def decode_step(model, 12 | batch_inputs, 13 | batch_outputs, 14 | candidates=None, 15 | beam_size=1): 16 | a = time.time() 17 | # predicts (batch_size, token_size) 18 | predicts = model.predict([np.array(batch_inputs), np.array(batch_outputs)])[:,-1] 19 | batch_size, token_size = predicts.shape 20 | print('decode_step', time.time()-a) 21 | # print('decode_step', time.time()-a) 22 | # tmp = [] 23 | # for i in range(len(predicts)): 24 | # probs = [(prob, j) for j, prob in enumerate(predicts[i])] 25 | # if candidates is not None: 26 | # probs = [x if x[1] in candidates[i] else (0, x[1]) for x in probs] 27 | # probs.sort(reverse=True) 28 | # probs = probs[:beam_size] 29 | # tmp.append(probs) 30 | if candidates is not None: 31 | mask = np.zeros(predicts.shape) 32 | inds = np.array([[i,]*len(candidates[i]) for i in range(len(candidates))]).flatten() 33 | mask[inds, candidates.flatten().astype(int)] = 1 34 | predicts = predicts * mask 35 | # index = np.argpartition(-predicts, beam_size, axis=1)[:, :beam_size] 36 | # probs = -np.partition(-predicts, beam_size, axis=1)[:, :beam_size] 37 | index = bottleneck.argpartition(-predicts, beam_size, axis=1)[:, :beam_size] 38 | probs = -bottleneck.partition(-predicts, beam_size, axis=1)[:, :beam_size] 39 | inds = np.array([[i,]*len(probs[i]) for i in range(len(probs))]) 40 | inds_sorted = np.argsort(-probs, axis=1)[:,:beam_size] 41 | index = index[inds, inds_sorted] 42 | probs = probs[inds, inds_sorted] 43 | # print('decode_step', time.time()-a) 44 | tmp2 = np.array(list(zip(probs.flatten(),index.flatten()))).reshape((batch_size, beam_size, 2)) 45 | # tmp (batch_size, beam_size, 2) 46 | # print(np.min(tmp==tmp2)) 47 | return tmp2 48 | 49 | 50 | def beam_search(model, encode_input, beam_size, target_len, use_candidates=False, candidates_size = None): 51 | batch_size = len(encode_input) 52 | output_topk = np.zeros((batch_size, beam_size, target_len + 1), dtype=np.int) 53 | beam_score = np.ones((batch_size, beam_size)) 54 | output_topk[:, :, 0] = 1 55 | # probs = [] 56 | candidates = None 57 | prob = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=beam_size) 58 | if use_candidates: 59 | probs_first_step = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=candidates_size) 60 | candidates = probs_first_step[:, :, 1] 61 | output_topk[:, :, 1] = prob[:, :, 1] 62 | beam_score[:, :] = prob[:, :, 0] 63 | for i in range(1, target_len): 64 | a = time.time() 65 | print('beam_search at target_len_', i) 66 | probs = [] 67 | for j in range(beam_size): 68 | # batch_size,k,2 69 | prob = decode_step(model, encode_input, output_topk[:, j, :i + 1], candidates=candidates, beam_size=beam_size) 70 | probs.append(prob) 71 | # batch_size,k,k,2 72 | probs = 
np.array(probs).swapaxes(0, 1) 73 | # batch_size,k,k 74 | beam_scores = np.einsum('abc,ab->abc', probs[:, :, :, 0], beam_score) 75 | # batch_size,k,2 76 | top_k_fn = lambda x: np.dstack(np.unravel_index(np.argsort(-x.ravel()), (beam_size, beam_size))) 77 | top_k_index = np.array(list(map(top_k_fn, beam_scores)))[:, 0][:, :beam_size, :] 78 | for ii in range(batch_size): 79 | output_topk[ii, :, :] = output_topk[ii, top_k_index[ii, :, 0], :] 80 | output_topk[ii, :, i + 1] = probs[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1], 1] 81 | beam_score[ii, :] = beam_scores[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1]] 82 | return output_topk, beam_score 83 | -------------------------------------------------------------------------------- /rl4rs/nets/rllib/rllib_rawstate_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Dict 3 | from rl4rs.nets import utils 4 | from ray.rllib.models.utils import get_activation_fn 5 | from ray.rllib.models.tf.misc import normc_initializer 6 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2 7 | from ray.rllib.utils.framework import try_import_tf, try_import_torch 8 | 9 | tf1, tf, tfv = try_import_tf() 10 | torch, nn = try_import_torch() 11 | 12 | 13 | def getTFModelWithRawState(config): 14 | config = config 15 | 16 | class MyTFModelWithRawState(TFModelWithRawState): 17 | def __init__(self, obs_space, action_space, num_outputs, model_config, 18 | name): 19 | super(MyTFModelWithRawState, self).__init__( 20 | obs_space, action_space, num_outputs, model_config, name, config=config) 21 | 22 | return MyTFModelWithRawState 23 | 24 | 25 | class TFModelWithRawState(TFModelV2): 26 | """Implements the `.action_model` branch required above.""" 27 | 28 | def __init__(self, obs_space, action_space, num_outputs, model_config, 29 | name, config): 30 | obs_space = obs_space.original_space 31 | super(TFModelWithRawState, self).__init__( 32 | obs_space, action_space, num_outputs, model_config, name) 33 | if not (isinstance(obs_space, Dict) and obs_space['category_feature'] \ 34 | and obs_space['dense_feature'] and obs_space['sequence_feature']): 35 | raise ValueError("""This model only supports the Dict{'category_feature':[], 36 | 'dense_feature':[], 'sequence_feature':[]} obs space""") 37 | activation = model_config.get("fcnet_activation", "linear") 38 | activation = get_activation_fn(activation) 39 | no_final_linear = model_config.get("no_final_linear", False) 40 | # Inputs 41 | category_feature_input = tf.keras.layers.Input( 42 | shape=obs_space['category_feature'].shape, name="obs_category_input") 43 | dense_feature_input = tf.keras.layers.Input( 44 | shape=obs_space['dense_feature'].shape, name="obs_dense_input") 45 | sequence_feature_input = tf.keras.layers.Input( 46 | shape=obs_space['sequence_feature'].shape, name="obs_sequence_input") 47 | 48 | slice_layer = tf.keras.layers.Lambda(lambda x: x[0][:, x[1]:]) 49 | category_feature = utils.id_input_processing(category_feature_input, config) 50 | dense_feature = utils.dense_input_processing(dense_feature_input, config) 51 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config) 52 | all_feature = tf.keras.layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature]) 53 | context = tf.keras.layers.Dense(256, activation=tf.keras.layers.ELU())(all_feature) 54 | model_out = None 55 | if no_final_linear and num_outputs: 56 | model_out = tf.keras.layers.Dense( 57 | num_outputs, 58 | name="fc_out", 59 | 
activation=activation, 60 | kernel_initializer=normc_initializer(1.0))(context) 61 | else: 62 | model_out = tf.keras.layers.Dense( 63 | num_outputs, 64 | name="fc_out", 65 | activation=None, 66 | kernel_initializer=normc_initializer(0.01))(context) 67 | 68 | # V(s) 69 | value_out = tf.keras.layers.Dense( 70 | 1, 71 | name="value_out", 72 | activation=None, 73 | kernel_initializer=normc_initializer(0.01))(context) 74 | 75 | # Base layers 76 | self.base_model = tf.keras.Model([category_feature_input, dense_feature_input, sequence_feature_input], [model_out, value_out]) 77 | self.base_model.summary() 78 | 79 | def forward(self, input_dict, state, seq_lens): 80 | model_out, self._value_out = self.base_model([input_dict["obs"]["category_feature"], 81 | input_dict["obs"]["dense_feature"], 82 | input_dict["obs"]["sequence_feature"]]) 83 | return model_out, state 84 | 85 | def value_function(self): 86 | return tf.reshape(self._value_out, [-1]) 87 | -------------------------------------------------------------------------------- /rl4rs/policy/policy_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import d3rlpy 3 | import numpy as np 4 | from ray.rllib.agents.trainer import Trainer as rllib_trainer 5 | from scipy.special import softmax 6 | 7 | 8 | class policy_model(object): 9 | def __init__(self, model, config = {}): 10 | self.policy = model 11 | self.config = config 12 | self.page_items = int(config.get('page_items', 9)) 13 | self.mask_size = self.page_items+1 14 | self.location_mask = config.get('location_mask', None) 15 | self.special_items = config.get('special_items', None) 16 | 17 | def predict_with_mask(self, obs): 18 | if self.config.get("support_conti_env",False): 19 | return self.predict(obs) 20 | elif isinstance(self.policy, d3rlpy.algos.AlgoBase): 21 | obs = np.array(obs) 22 | action_probs = np.array(self.action_probs(obs)) 23 | batch_size = len(obs) 24 | # mask 25 | prev_actions = obs[:, -self.mask_size:-1].astype(int) 26 | cur_step = obs[:, -1].astype(int) 27 | x_mask_layer = cur_step % self.page_items // 3 28 | mask = self.location_mask[x_mask_layer.astype(int)] 29 | for i in range(self.mask_size-1): 30 | mask[range(batch_size), prev_actions[:, i]] = 0 31 | action_mask = mask < 0.01 32 | action_probs[action_mask] = -2 ** 15 33 | for i in range(batch_size): 34 | if len(np.intersect1d(prev_actions[i], self.special_items)) > 0: 35 | action_probs[i][self.special_items] = -2 ** 15 36 | return action_probs.argmax(axis=1) 37 | elif isinstance(self.policy, rllib_trainer): 38 | return self.predict(obs) 39 | else: 40 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 41 | or isinstance(self.policy, rllib_trainer) 42 | 43 | def predict(self, obs): 44 | if isinstance(self.policy, d3rlpy.algos.AlgoBase): 45 | return self.policy.predict(obs) 46 | elif isinstance(self.policy, rllib_trainer): 47 | obs = dict(enumerate(obs)) 48 | action = self.policy.compute_actions(obs, explore=False) 49 | action = np.array(list(action.values())) 50 | return action 51 | else: 52 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 53 | or isinstance(self.policy, rllib_trainer) 54 | 55 | def predict_q(self, obs, action): 56 | if isinstance(self.policy, d3rlpy.algos.AlgoBase): 57 | q = self.policy.predict_value(obs, action) 58 | if self.policy.reward_scaler is not None: 59 | return self.policy.reward_scaler.reverse_transform(q) 60 | else: 61 | return q 62 | elif isinstance(self.policy, rllib_trainer): 63 | obs = dict(enumerate(obs)) 64 | _, _, 
infos = self.policy. \ 65 | compute_actions(obs, explore=False, full_fetch=True) 66 | batch_size = len(action) 67 | return infos['q_values'][range(batch_size), action] \ 68 | if 'q_values' in infos \ 69 | else infos['vf_preds'] 70 | else: 71 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 72 | or isinstance(self.policy, rllib_trainer) 73 | 74 | def action_probs(self, obs): 75 | if isinstance(self.policy, d3rlpy.algos.DiscreteBC): 76 | obs = torch.tensor(obs, dtype=torch.float32) 77 | return self.policy._impl._imitator(obs).detach().numpy() 78 | elif isinstance(self.policy, d3rlpy.algos.DiscreteBCQ) \ 79 | or isinstance(self.policy, d3rlpy.algos.DiscreteCQL): 80 | obs = torch.tensor(obs, dtype=torch.float32) 81 | action_q = self.policy._impl._q_func(obs).detach().numpy() 82 | return softmax(action_q, axis=1) 83 | elif isinstance(self.policy, rllib_trainer): 84 | obs = dict(enumerate(obs)) 85 | actions, _, infos = self.policy. \ 86 | compute_actions(obs, explore=False, full_fetch=True) 87 | return softmax(infos['action_dist_inputs'], axis=1) 88 | else: 89 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \ 90 | or isinstance(self.policy, rllib_trainer) 91 | -------------------------------------------------------------------------------- /reproductions/run_exact_k.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | 12 | cd ${script_dir} 13 | 14 | # experiment in a_all env, train in a_all sample and test in a_all sample 15 | python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log 16 | python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log 17 | 18 | 19 | # experiment in a_all env, train in a_train sample and test in a_test sample 20 | #python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log 21 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log 22 | 23 | 24 | # experiment train in a_sl env and test in a_rl env 25 | #python -u exact_k_train.py "train" 
"{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log 26 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log 27 | 28 | 29 | # experiment in b_all env, train in b_all sample and test in b_all sample 30 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log 31 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log 32 | 33 | 34 | # experiment in b_all env, train in b_train sample and test in b_test sample 35 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log 36 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log 37 | 38 | 39 | # experiment train in b_sl env and test in b_rl env 40 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log 41 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log 42 | -------------------------------------------------------------------------------- /rl4rs/nets/rllib/rllib_mask_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from rl4rs.nets.rllib.rllib_rawstate_model import TFModelWithRawState 3 | from ray.rllib.examples.models.parametric_actions_model import \ 4 | ParametricActionsModel 5 | 6 | 7 | def getMaskActionsModel(true_obs_shape, action_size): 8 | class MyMaskActionsModel(ParametricActionsModel): 9 | """Parametric action model that handles the dot product and masking. 10 | 11 | This assumes the outputs are logits for a single Categorical action dist. 
12 | Getting this to work with a more complex output (e.g., if the action space 13 | is a tuple of several distributions) is also possible but left as an 14 | exercise to the reader. 15 | """ 16 | 17 | def __init__(self, 18 | obs_space, 19 | action_space, 20 | num_outputs, 21 | model_config, 22 | name, 23 | **kw): 24 | config = { 25 | # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py 26 | # These are used if no custom model is specified and the input space is 1D. 27 | # Number of hidden layers to be used. 28 | "fcnet_hiddens": [64], 29 | # Activation function descriptor. 30 | # Supported values are: "tanh", "relu", "swish" (or "silu"), 31 | # "linear" (or None). 32 | # "fcnet_activation": "linear", 33 | # "no_final_linear": True, 34 | "vf_share_layers": True, 35 | } 36 | model_config = dict(model_config, **config) 37 | super(MyMaskActionsModel, self).__init__( 38 | obs_space, action_space, num_outputs, model_config, name, true_obs_shape, action_embed_size=action_size, **kw) 39 | print('MyMaskActionsModel', self.action_embed_model.model_config) 40 | 41 | def forward(self, input_dict, state, seq_lens): 42 | # Extract the available actions tensor from the observation. 43 | # avail_actions = input_dict["obs"]["avail_actions"] 44 | action_mask = input_dict["obs"]["action_mask"] 45 | 46 | # Compute the predicted action embedding 47 | action_embed, _ = self.action_embed_model({ 48 | "obs": input_dict["obs"]["obs"] 49 | }) 50 | # action_values = self.action_embed_model.value_function() 51 | # print(tf.shape(action_embed), action_embed) 52 | 53 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the 54 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. 55 | # intent_vector = tf.expand_dims(action_embed, 1) 56 | 57 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS]. 58 | # action_prob = tf.nn.softmax(action_embed) 59 | 60 | # Mask out invalid actions (use tf.float32.min for stability) 61 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) 62 | return action_embed + inf_mask, state 63 | 64 | return MyMaskActionsModel 65 | 66 | 67 | def getMaskActionsModelWithRawState(config, action_size): 68 | config = config 69 | 70 | class MyMaskActionsModelWithRawState(ParametricActionsModel): 71 | """Parametric action model that handles the dot product and masking. 72 | 73 | This assumes the outputs are logits for a single Categorical action dist. 74 | Getting this to work with a more complex output (e.g., if the action space 75 | is a tuple of several distributions) is also possible but left as an 76 | exercise to the reader. 77 | """ 78 | 79 | def __init__(self, 80 | obs_space, 81 | action_space, 82 | num_outputs, 83 | model_config, 84 | name, 85 | **kw): 86 | # model_config = dict(model_config, **config) 87 | super(MyMaskActionsModelWithRawState, self).__init__( 88 | obs_space, action_space, num_outputs, model_config, name, action_embed_size=action_size, **kw) 89 | print('MyMaskActionsModelWithRawStateModel', self.action_embed_model.model_config) 90 | self.action_embed_model = TFModelWithRawState( 91 | obs_space, action_space, action_size, 92 | model_config, name + "_action_embed", config = config) 93 | 94 | def forward(self, input_dict, state, seq_lens): 95 | # Extract the available actions tensor from the observation. 
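        # (The Dict observation carries the raw state plus a 0/1 "action_mask".
        # The raw-state branch is encoded by TFModelWithRawState; the mask is
        # converted below into additive logits via log(mask), clamped to
        # tf.float32.min, so masked actions receive effectively -inf scores.)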
96 | # avail_actions = input_dict["obs"]["avail_actions"] 97 | action_mask = input_dict["obs"]["action_mask"] 98 | 99 | # Compute the predicted action embedding 100 | action_embed, _ = self.action_embed_model(input_dict) 101 | # action_values = self.action_embed_model.value_function() 102 | # print(tf.shape(action_embed), action_embed) 103 | 104 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the 105 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. 106 | # intent_vector = tf.expand_dims(action_embed, 1) 107 | 108 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS]. 109 | # action_prob = tf.nn.softmax(action_embed) 110 | 111 | # Mask out invalid actions (use tf.float32.min for stability) 112 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min) 113 | return action_embed + inf_mask, state 114 | 115 | return MyMaskActionsModelWithRawState 116 | -------------------------------------------------------------------------------- /rl4rs/nets/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from deepctr.layers.sequence import AttentionSequencePoolingLayer, DynamicGRU 4 | from tensorflow.keras import layers, regularizers 5 | 6 | 7 | def id_input_processing(category_feature_input, config): 8 | emb_size = config['emb_size'] 9 | category_hash_size = config['category_hash_size'] 10 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 11 | category_emb = emb_layer(category_feature_input) 12 | category_feature = layers.GlobalAveragePooling1D()(category_emb) 13 | return category_feature 14 | 15 | 16 | def id_input_processing_attn(category_feature_input, config): 17 | emb_size = config['emb_size'] 18 | hidden_unit = config['hidden_units'] 19 | category_hash_size = config['category_hash_size'] 20 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 21 | category_emb = emb_layer(category_feature_input) 22 | category_feature = tf.keras.layers.Attention()([category_emb, category_emb]) 23 | category_feature = tf.keras.layers.GlobalAveragePooling1D()(category_feature) 24 | category_feature_2 = layers.Flatten()(category_emb) 25 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2]) 26 | 27 | 28 | def id_input_processing_lstm(category_feature_input, config): 29 | emb_size = config['emb_size'] 30 | hidden_unit = config['hidden_units'] 31 | category_hash_size = config['category_hash_size'] 32 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 33 | category_emb = emb_layer(category_feature_input) 34 | category_feature = layers.GRU(units=hidden_unit)(category_emb) 35 | category_feature_2 = layers.Flatten()(category_emb) 36 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2]) 37 | 38 | 39 | def id_input_processing_concat(category_feature_input, config): 40 | emb_size = config['emb_size'] 41 | category_hash_size = config['category_hash_size'] 42 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 43 | category_emb = emb_layer(category_feature_input) 44 | category_feature = layers.Flatten()(category_emb) 45 | return category_feature 46 | 47 | 48 | def dense_input_processing(cross_feature_input, config): 49 | hidden_unit = config['hidden_units'] 50 | cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature_input) 51 | cross_feature = layers.Dropout(0.2)(cross_feature) 52 | 
cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature) 53 | cross_feature = layers.Dropout(0.2)(cross_feature) 54 | return cross_feature 55 | 56 | 57 | def sequence_input_concat(sequence_feature_input, config): 58 | category_hash_size = config['category_hash_size'] 59 | hidden_unit = config['hidden_units'] 60 | emb_size = config['emb_size'] 61 | seq_num = config['seq_num'] 62 | 63 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 64 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 65 | 66 | seqs_lstm = [] 67 | for i in range(seq_num): 68 | seq_i = seq_index_layer([sequence_feature_input, i]) 69 | seq_i_embeddings = emb_layer(seq_i) 70 | seq_i_lstm = layers.GlobalAveragePooling1D()(seq_i_embeddings) 71 | seqs_lstm.append(seq_i_lstm) 72 | 73 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0] 74 | 75 | return seqs_embeddings 76 | 77 | 78 | def sequence_input_LSTM(sequence_feature_input, config): 79 | category_hash_size = config['category_hash_size'] 80 | hidden_unit = config['hidden_units'] 81 | emb_size = config['emb_size'] 82 | seq_num = config['seq_num'] 83 | 84 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 85 | 86 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 87 | 88 | seqs_lstm = [] 89 | for i in range(seq_num): 90 | seq_i = seq_index_layer([sequence_feature_input, i]) 91 | seq_i_embeddings = emb_layer(seq_i) 92 | seq_i_lstm = layers.GRU(units=hidden_unit)(seq_i_embeddings) 93 | seqs_lstm.append(seq_i_lstm) 94 | 95 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0] 96 | 97 | return seqs_embeddings 98 | 99 | 100 | def sequence_input_attn(input, config): 101 | category_hash_size = config['category_hash_size'] 102 | hidden_unit = config['hidden_units'] 103 | emb_size = config['emb_size'] 104 | maxlen = config['maxlen'] 105 | batch_size = config['batch_size'] 106 | seq_num = config['seq_num'] 107 | 108 | sequence_feature_input = input[0] 109 | id_slate_input = input[1] 110 | 111 | sequence_length = tf.fill((tf.shape(sequence_feature_input)[0], 1), maxlen) 112 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]]) 113 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size) 114 | id_slate_embeddings = emb_layer(id_slate_input) 115 | id_slate_pooling = tf.math.reduce_mean(id_slate_embeddings, axis=1, keepdims=True) 116 | seqs_attn = [] 117 | for i in range(seq_num): 118 | seq_i = seq_index_layer([sequence_feature_input, i]) 119 | seq_i_embeddings = emb_layer(seq_i) 120 | rnn_outputs = DynamicGRU(emb_size, return_sequence=True)([seq_i_embeddings, sequence_length]) 121 | scores = AttentionSequencePoolingLayer(att_hidden_units=(64, 16), return_score=True)([ 122 | id_slate_pooling, rnn_outputs, sequence_length]) 123 | final_state2 = DynamicGRU(emb_size * 2, gru_type='AUGRU', return_sequence=False 124 | )([rnn_outputs, sequence_length, tf.keras.layers.Permute([2, 1])(scores)]) 125 | seqs_attn.append(final_state2) 126 | 127 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_attn) if len(seqs_attn) > 1 else seqs_attn[0] 128 | 129 | return tf.squeeze(seqs_embeddings, axis=1) 130 | -------------------------------------------------------------------------------- /rl4rs/nets/cql/encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import copy 5 | from 
typing import Any, ClassVar, Dict, List, Optional, Sequence, Type, Union 6 | from d3rlpy.models.encoders import EncoderFactory, Encoder, VectorEncoderWithAction, _create_activation, VectorEncoder 7 | 8 | 9 | class CustomVectorEncoder(VectorEncoder): 10 | 11 | def __init__( 12 | self, 13 | config, 14 | action_size, 15 | mask_size, 16 | with_q, 17 | observation_shape: Sequence[int], 18 | hidden_units: Optional[Sequence[int]] = None, 19 | use_batch_norm: bool = False, 20 | dropout_rate: Optional[float] = None, 21 | use_dense: bool = False, 22 | activation: nn.Module = nn.ReLU(), 23 | ): 24 | super().__init__(observation_shape, hidden_units, use_batch_norm, dropout_rate, use_dense, activation) 25 | self.action_size = action_size 26 | self.mask_size = mask_size 27 | self.with_q = with_q 28 | self.emb_size = 32 29 | self.emb_layer = nn.Embedding(action_size, self.emb_size) 30 | self.fc2 = nn.Linear(self._feature_size + self.emb_size * mask_size, action_size) 31 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | location_mask = config['location_mask'] 33 | self.special_items = config['special_items'] 34 | self.location_mask = torch.tensor(location_mask, device=self.device) 35 | 36 | def get_feature_size(self) -> int: 37 | if not self.with_q: 38 | return self._feature_size + self.emb_size * self.mask_size 39 | else: 40 | return self.action_size 41 | 42 | def forward(self, x: torch.Tensor) -> torch.Tensor: 43 | batch_size = x.shape[0] 44 | # mask 45 | prev_actions = x[:, -self.mask_size:-1].to(torch.long) 46 | cur_step = x[:, -1].to(torch.long) 47 | x_mask_layer = cur_step % 9 // 3 48 | mask = self.location_mask[x_mask_layer] 49 | for i in range(self.mask_size-1): 50 | mask[range(batch_size), prev_actions[:, i]] = 0 51 | h = self._fc_encode(x) 52 | if self._use_batch_norm: 53 | h = self._bns[-1](h) 54 | if self._dropout_rate is not None: 55 | h = self._dropouts[-1](h) 56 | prev_action_emb = nn.Flatten()(self.emb_layer(x[:, -self.mask_size:].to(torch.long))) 57 | h = torch.cat([h, prev_action_emb], dim=-1) 58 | if self.with_q: 59 | h = self.fc2(h) 60 | action_mask = mask < 0.01 61 | # h[action_mask] = -2 ** 15 62 | h[action_mask] = 0 63 | for i in range(batch_size): 64 | if len(np.intersect1d(prev_actions[i].cpu().numpy(), self.special_items)) > 0: 65 | h[i][self.special_items] = 0 66 | # h[i][self.special_items] = -2 ** 15 67 | return h 68 | 69 | 70 | class CustomVectorEncoderFactory(EncoderFactory): 71 | TYPE: ClassVar[str] = "vector" 72 | _hidden_units: Sequence[int] 73 | _activation: str 74 | _use_batch_norm: bool 75 | _dropout_rate: Optional[float] 76 | _use_dense: bool 77 | 78 | def __init__( 79 | self, 80 | config, 81 | action_size, 82 | mask_size, 83 | with_q=False, 84 | hidden_units: Optional[Sequence[int]] = None, 85 | activation: str = "relu", 86 | use_batch_norm: bool = False, 87 | dropout_rate: Optional[float] = None, 88 | use_dense: bool = False, 89 | ): 90 | self.config = config 91 | self.action_size = action_size 92 | self.mask_size = mask_size 93 | self.with_q = with_q 94 | if hidden_units is None: 95 | self._hidden_units = [256] 96 | else: 97 | self._hidden_units = hidden_units 98 | self._activation = activation 99 | self._use_batch_norm = use_batch_norm 100 | self._dropout_rate = dropout_rate 101 | self._use_dense = use_dense 102 | 103 | def create(self, observation_shape: Sequence[int]) -> CustomVectorEncoder: 104 | assert len(observation_shape) == 1 105 | return CustomVectorEncoder( 106 | config=self.config, 107 | action_size=self.action_size, 
108 |             mask_size=self.mask_size,
109 |             with_q=self.with_q,
110 |             observation_shape=observation_shape,
111 |             hidden_units=self._hidden_units,
112 |             use_batch_norm=self._use_batch_norm,
113 |             dropout_rate=self._dropout_rate,
114 |             use_dense=self._use_dense,
115 |             activation=_create_activation(self._activation),
116 |         )
117 | 
118 |     def create_with_action(
119 |         self,
120 |         observation_shape: Sequence[int],
121 |         action_size: int,
122 |         discrete_action: bool = False,
123 |     ) -> VectorEncoderWithAction:
124 |         assert len(observation_shape) == 1
125 |         return VectorEncoderWithAction(
126 |             observation_shape=observation_shape,
127 |             action_size=action_size,
128 |             hidden_units=self._hidden_units,
129 |             use_batch_norm=self._use_batch_norm,
130 |             dropout_rate=self._dropout_rate,
131 |             use_dense=self._use_dense,
132 |             discrete_action=discrete_action,
133 |             activation=_create_activation(self._activation),
134 |         )
135 | 
136 |     def get_params(self, deep: bool = False) -> Dict[str, Any]:
137 |         if deep:
138 |             hidden_units = copy.deepcopy(self._hidden_units)
139 |         else:
140 |             hidden_units = self._hidden_units
141 |         params = {
142 |             "hidden_units": hidden_units,
143 |             "activation": self._activation,
144 |             "use_batch_norm": self._use_batch_norm,
145 |             "dropout_rate": self._dropout_rate,
146 |             "use_dense": self._use_dense,
147 |         }
148 |         return params
149 | -------------------------------------------------------------------------------- /rl4rs/utils/d3rlpy_scorer.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast
3 | from d3rlpy.metrics.scorer import AlgoProtocol, _make_batches
4 | from d3rlpy.dataset import Episode
5 | from rl4rs.policy.policy_model import policy_model
6 | 
7 | WINDOW_SIZE = 1024
8 | 
9 | 
10 | # modified from https://github.com/takuseno/d3rlpy/blob/master/d3rlpy/metrics/scorer.py
11 | def soft_opc_scorer(
12 |     return_threshold: float,
13 | ) -> Callable[[policy_model, List[Episode]], float]:
14 |     r"""Returns Soft Off-Policy Classification metrics.
15 | 
16 |     This function returns a scorer function, which is suitable for the
17 |     standard scikit-learn scorer style.
18 |     The metric of the scorer function evaluates the gap in action-value
19 |     estimation between the success episodes and all episodes.
20 |     If the learned Q-function is optimal, action-values in success episodes
21 |     are expected to be higher than the others.
22 |     A success episode is defined as an episode with a return above the given
23 |     threshold.
24 | 
25 |     .. math::
26 | 
27 |         \mathbb{E}_{s, a \sim D_{success}} [Q(s, a)]
28 |         - \mathbb{E}_{s, a \sim D} [Q(s, a)]
29 | 
30 |     .. code-block:: python
31 | 
32 |         from d3rlpy.datasets import get_cartpole
33 |         from d3rlpy.algos import DQN
34 |         from d3rlpy.metrics.scorer import soft_opc_scorer
35 |         from sklearn.model_selection import train_test_split
36 | 
37 |         dataset, _ = get_cartpole()
38 |         train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
39 | 
40 |         scorer = soft_opc_scorer(return_threshold=180)
41 | 
42 |         dqn = DQN()
43 |         dqn.fit(train_episodes,
44 |                 eval_episodes=test_episodes,
45 |                 scorers={'soft_opc': scorer})
46 | 
47 |     References:
48 |         * Irpan et al., Off-Policy Evaluation via Off-Policy Classification.
49 | 
50 | 
51 |     Args:
52 |         return_threshold: threshold of success episodes.
53 | 
54 |     Returns:
55 |         scorer function.
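    Note:
        in this repo the scorer receives a rl4rs ``policy_model`` wrapper
        rather than a raw d3rlpy algorithm, so Q-values come from
        ``policy_model.predict_q`` (which reverses any reward scaling) and
        frame stacking is read from ``algo.policy.n_frames``.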
56 | 
57 |     """
58 | 
59 |     def scorer(algo: policy_model, episodes: List[Episode]) -> float:
60 |         success_values = []
61 |         all_values = []
62 |         for episode in episodes:
63 |             is_success = episode.compute_return() >= return_threshold
64 |             for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames):
65 |                 values = algo.predict_q(batch.observations, batch.actions)
66 |                 values = cast(np.ndarray, values)
67 |                 all_values += values.reshape(-1).tolist()
68 |                 if is_success:
69 |                     success_values += values.reshape(-1).tolist()
70 |         return float(np.mean(success_values) - np.mean(all_values))
71 | 
72 |     return scorer
73 | 
74 | 
75 | def dynamics_reward_prediction_mean_error_scorer(
76 |     dynamics: policy_model, episodes: List[Episode]
77 | ) -> float:
78 |     r"""Returns the mean error of reward prediction.
79 | 
80 |     This metric suggests how well the dynamics model generalizes to test sets.
81 |     If the error is large, the dynamics model is overfitting.
82 | 
83 |     .. math::
84 | 
85 |         \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [(r_{t+1} - r')]
86 | 
87 |     where :math:`r' \sim T(s_t, a_t)`.
88 | 
89 |     Args:
90 |         dynamics: dynamics model.
91 |         episodes: list of episodes.
92 | 
93 |     Returns:
94 |         mean reward-prediction error (smaller is better).
95 | 
96 |     """
97 |     total_errors = []
98 |     for episode in episodes:
99 |         for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
100 |             pred = dynamics.predict_q(batch.observations, batch.actions)
101 |             rewards = batch.next_rewards
102 |             errors = (rewards - pred[1]).reshape(-1)
103 |             total_errors += errors.tolist()
104 |     # smaller is better
105 |     return float(np.mean(total_errors))
106 | 
107 | 
108 | def dynamics_reward_prediction_abs_mean_error_scorer(
109 |     dynamics: policy_model, episodes: List[Episode]
110 | ) -> float:
111 |     r"""Returns the mean absolute error of reward prediction.
112 | 
113 |     This metric suggests how well the dynamics model generalizes to test sets.
114 |     If the error is large, the dynamics model is overfitting.
115 | 
116 |     .. math::
117 | 
118 |         \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [abs(r_{t+1} - r')]
119 | 
120 |     where :math:`r' \sim T(s_t, a_t)`.
121 | 
122 |     Args:
123 |         dynamics: dynamics model.
124 |         episodes: list of episodes.
125 | 
126 |     Returns:
127 |         mean absolute reward-prediction error (smaller is better).
128 | 
129 |     """
130 |     total_errors = []
131 |     for episode in episodes:
132 |         for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
133 |             pred = dynamics.predict_q(batch.observations, batch.actions)
134 |             rewards = batch.next_rewards
135 |             errors = np.abs(rewards - pred[1]).reshape(-1)
136 |             total_errors += errors.tolist()
137 |     # smaller is better
138 |     return float(np.mean(total_errors))
139 | 
140 | def discrete_action_match_scorer(
141 |     algo: policy_model, episodes: List[Episode]
142 | ) -> float:
143 |     r"""Returns percentage of identical actions between algorithm and dataset.
144 | 
145 |     This metric suggests how different the greedy policy is from the given
146 |     episodes in a discrete action space.
147 |     If the given episodes are near-optimal, a larger percentage is
148 |     better.
149 | 
150 |     .. math::
151 | 
152 |         \frac{1}{N} \sum^N \mathbb{1}
153 |         \{a_t = \text{argmax}_a Q_\theta (s_t, a)\}
154 | 
155 |     Args:
156 |         algo: algorithm.
157 |         episodes: list of episodes.
158 | 
159 |     Returns:
160 |         percentage of identical actions.
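    Note:
        this variant calls ``predict_with_mask`` rather than a bare
        ``predict``, so the greedy action respects the location mask and
        special-item constraints before being compared with the logged
        action.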
161 | 162 | """ 163 | total_matches = [] 164 | for episode in episodes: 165 | for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames): 166 | actions = algo.predict_with_mask(batch.observations) 167 | match = (batch.actions.reshape(-1) == actions).tolist() 168 | total_matches += match 169 | return float(np.mean(total_matches)) -------------------------------------------------------------------------------- /script/exact_k_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys 3 | import gym 4 | import numpy as np 5 | import tensorflow as tf 6 | from rl4rs.nets.exact_k.model import Generator, Discriminator 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | from rl4rs.utils.fileutil import find_newest_files 10 | 11 | stage = sys.argv[1] 12 | extra_config = eval(sys.argv[2]) 13 | 14 | config = {"epoch": 10000, "maxlen": 64, "batch_size": 256, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 15 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9, 16 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../item_info.csv', 17 | "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": False, 'env': "SlateRecEnv-v0"} 18 | 19 | config = dict(config, **extra_config) 20 | 21 | if config['env'] == 'SeqSlateRecEnv-v0': 22 | config['max_steps'] = 36 23 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState) 24 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 25 | else: 26 | sim = SlateRecEnv(config, state_cls=SlateState) 27 | env = gym.make('SlateRecEnv-v0', recsim=sim) 28 | 29 | batch_size = config["batch_size"] 30 | action_size = config["action_size"] 31 | epoch = config["epoch"] 32 | max_steps = config["max_steps"] 33 | output_dir = os.environ['rl4rs_output_dir'] 34 | model_dir = '%s/%s/' % (output_dir, 'exactk_' + config['env'] + '_' + config['trial_name']) 35 | model_save_path = model_dir + 'exact_k.ckpt' 36 | restore_file = find_newest_files('exact_k.ckpt*', model_dir) 37 | restore_file = restore_file[:restore_file.rfind('.')] 38 | 39 | l0_ssr_mask = np.zeros(action_size) 40 | location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], action_size) 41 | l1_mask, l2_mask, l3_mask = location_mask[0], location_mask[1], location_mask[2] 42 | l0_ssr_mask[special_items] = 1 43 | 44 | with tf.name_scope('Generator'): 45 | g = Generator(l1_mask, 46 | l2_mask, 47 | l3_mask, 48 | l0_ssr_mask, 49 | is_training=True, 50 | seq_length=action_size) 51 | 52 | with tf.name_scope('Discriminator'): 53 | d = Discriminator(seq_length=action_size) 54 | 55 | print("Graph loaded") 56 | 57 | if config.get('gpu', True): 58 | gpu_options = tf.GPUOptions( 59 | per_process_gpu_memory_fraction=0.5, 60 | allow_growth=True) # seems to be not working 61 | sess_config = tf.ConfigProto(allow_soft_placement=True, 62 | gpu_options=gpu_options) 63 | else: 64 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 65 | sess_config = tf.ConfigProto() 66 | 67 | if stage == 'train': 68 | with tf.Session(config=sess_config) as sess: 69 | sess.run(tf.initialize_all_variables()) 70 | print('Generator training start!') 71 | reward_total = 0.0 72 | for episode in range(epoch): 73 | print('Generator episode: ', episode) 74 | 75 | observation = np.array(env.reset()) 76 | item_cand = np.array([list(range(0, 
44 | with tf.name_scope('Generator'):
45 |     g = Generator(l1_mask,
46 |                   l2_mask,
47 |                   l3_mask,
48 |                   l0_ssr_mask,
49 |                   is_training=True,
50 |                   seq_length=action_size)
51 | 
52 | with tf.name_scope('Discriminator'):
53 |     d = Discriminator(seq_length=action_size)
54 | 
55 | print("Graph loaded")
56 | 
57 | if config.get('gpu', True):
58 |     gpu_options = tf.GPUOptions(
59 |         per_process_gpu_memory_fraction=0.5,
60 |         allow_growth=True)  # seems to be not working
61 |     sess_config = tf.ConfigProto(allow_soft_placement=True,
62 |                                  gpu_options=gpu_options)
63 | else:
64 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
65 |     sess_config = tf.ConfigProto()
66 | 
67 | if stage == 'train':
68 |     with tf.Session(config=sess_config) as sess:
69 |         sess.run(tf.global_variables_initializer())
70 |         print('Generator training start!')
71 |         reward_total = 0.0
72 |         for episode in range(epoch):
73 |             print('Generator episode: ', episode)
74 | 
75 |             observation = np.array(env.reset())
76 |             item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
77 |             hill_b_f = []
78 |             for i in range(2):
79 |                 # get action
80 |                 sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
81 |                                                           feed_dict={g.user: observation, g.item_cand: item_cand})
82 |                 for step in range(config['max_steps']):
83 |                     observation_, reward, done, info = env.step(sampled_card[:, step])
84 | 
85 |                 env.reset()
86 |                 # collect (card, card_idx, reward) candidates for hill-climbing selection
87 |                 hill_b_f.append(list(zip(sampled_card, sampled_card_idx, reward)))
88 | 
89 |             b_hill_f = np.transpose(hill_b_f, [1, 0, 2])
90 |             samples = []
91 |             for hill_f in b_hill_f:
92 |                 sorted_list = sorted(hill_f, key=lambda x: x[2], reverse=True)
93 |                 samples.append(sorted_list[0])  # np.random.choice(1) always returned 0; take the best-reward sample explicitly
94 | 
95 |             (sampled_card, sampled_card_idx, reward) = zip(*samples)
96 |             reward = np.array(reward)
97 | 
98 |             reward_ = sess.run(d.reward, feed_dict={d.user: observation})
99 |             sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})
100 | 
101 |             if episode % 50 == 0:
102 |                 print('episode:', episode)
103 |                 print('reward_target', np.mean(reward_))
104 |                 print('reward', np.mean(reward))
105 |                 print('actions', sampled_card[:10])
106 |             reward = (reward - reward_)
107 | 
108 |             reward = reward / np.std(reward)
109 | 
110 |             sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
111 |                                             g.reward: reward,
112 |                                             g.item_cand: item_cand,
113 |                                             g.user: observation,
114 |                                             })
115 |             gs_gen = sess.run(g.global_step)
116 | 
117 |             if episode % 500 == 0:
118 |                 saver = tf.train.Saver()
119 |                 saver.save(sess, model_save_path + '.' + str(episode))
120 |                 print('save model:' + model_save_path + '.' + str(episode))
121 |         print('Generator training done!')
122 |         saver = tf.train.Saver()
123 |         saver.save(sess, model_save_path + '.' + str(episode))
124 |         print('save model:' + model_save_path + '.' + str(episode))
125 |         print("Done")
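# Checkpoint note: models are saved as model_save_path + '.' + episode, so
# find_newest_files('exact_k.ckpt*', ...) matches a concrete file such as
# exact_k.ckpt.500.index; the rfind('.') slice near the top of this script
# strips that last suffix to recover the prefix tf.train.Saver.restore() expects.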
126 | 
127 | if stage == 'eval':
128 |     with tf.Session(config=sess_config) as sess:
129 |         sess.run(tf.global_variables_initializer())
130 |         saver = tf.train.Saver()
131 |         saver.restore(sess, restore_file)
132 |         print('restore exact-k model from %s' % restore_file)
133 |         episode_reward = 0
134 |         done = False
135 |         epoch = 4
136 |         for i in range(epoch):
137 |             observation = np.array(env.reset())
138 |             item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
139 |             sampled_card_idx, sampled_card = sess.run([g.greedy_path, g.greedy_result],
140 |                                                       feed_dict={g.user: observation, g.item_cand: item_cand})
141 |             for step in range(config['max_steps']):
142 |                 observation_, reward, done, info = env.step(sampled_card[:, step])
143 |                 episode_reward += sum(reward)
144 |             print('actions', sampled_card[:10])
145 |             print('avg reward', episode_reward / config['batch_size'] / (i + 1))
146 | 
--------------------------------------------------------------------------------
/rl4rs/server/gymHttpClient.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import six.moves.urllib.parse as urlparse
3 | import json
4 | import numpy as np
5 | import os
6 | import gym
7 | 
8 | import logging
9 | 
10 | logger = logging.getLogger(__name__)
11 | logger.setLevel(logging.INFO)
12 | 
13 | 
14 | # modified from https://github.com/openai/gym-http-api
15 | class Client(object):
16 |     """
17 |     Gym client to interface with gym_http_server
18 |     """
19 | 
20 |     def __init__(self, remote_base):
21 |         self.remote_base = remote_base
22 |         self.session = requests.Session()
23 |         self.session.headers.update({'Content-type': 'application/json'})
24 | 
25 |     def _parse_server_error_or_raise_for_status(self, resp):
26 |         j = {}
27 |         try:
28 |             j = resp.json()
29 |         except Exception:
30 |             # Most likely json parse failed because of network error, not server error (server
31 |             # sends its errors in json). Don't let parse exception go up, but rather raise default
32 |             # error.
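            # If parsing failed, j stays {}. The raise_for_status() below raises
            # for HTTP error statuses; a 200 response with an unparseable body
            # falls through and this method ultimately returns the empty dict.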
33 | resp.raise_for_status() 34 | if resp.status_code != 200 and "message" in j: # descriptive message from server side 35 | raise ServerError(message=j["message"], status_code=resp.status_code) 36 | resp.raise_for_status() 37 | return j 38 | 39 | def _post_request(self, route, data): 40 | url = urlparse.urljoin(self.remote_base, route) 41 | # logger.info("POST {}\n{}".format(url, json.dumps(data))) 42 | resp = self.session.post(urlparse.urljoin(self.remote_base, route), 43 | data=json.dumps(data)) 44 | return self._parse_server_error_or_raise_for_status(resp) 45 | 46 | def _get_request(self, route): 47 | url = urlparse.urljoin(self.remote_base, route) 48 | # logger.info("GET {}".format(url)) 49 | resp = self.session.get(url) 50 | return self._parse_server_error_or_raise_for_status(resp) 51 | 52 | def env_create(self, env_id, config={}): 53 | route = '/v1/envs/' 54 | data = {'env_id': env_id, 'config': config} 55 | resp = self._post_request(route, data) 56 | instance_id = resp['instance_id'] 57 | return instance_id 58 | 59 | def env_list_all(self): 60 | route = '/v1/envs/' 61 | resp = self._get_request(route) 62 | all_envs = resp['all_envs'] 63 | return all_envs 64 | 65 | def env_reset(self, instance_id): 66 | route = '/v1/envs/{}/reset/'.format(instance_id) 67 | resp = self._post_request(route, None) 68 | if 'observation' in resp: 69 | observation = resp['observation'] 70 | else: 71 | resp = self._post_request(route, None) 72 | observation = resp['observation'] 73 | return observation 74 | 75 | def env_step(self, instance_id, action, render=False): 76 | route = '/v1/envs/{}/step/'.format(instance_id) 77 | data = {'action': action, 'render': render} 78 | resp = self._post_request(route, data) 79 | observation = resp['observation'] 80 | reward = resp['reward'] 81 | done = resp['done'] 82 | info = resp['info'] 83 | return [observation, reward, done, info] 84 | 85 | def env_action_space_info(self, instance_id): 86 | route = '/v1/envs/{}/action_space/'.format(instance_id) 87 | resp = self._get_request(route) 88 | info = resp['info'] 89 | return info 90 | 91 | def env_action_space_sample(self, instance_id): 92 | route = '/v1/envs/{}/action_space/sample'.format(instance_id) 93 | resp = self._get_request(route) 94 | action = resp['action'] 95 | return action 96 | 97 | def env_action_space_contains(self, instance_id, x): 98 | route = '/v1/envs/{}/action_space/contains/{}'.format(instance_id, x) 99 | resp = self._get_request(route) 100 | member = resp['member'] 101 | return member 102 | 103 | def env_observation_space_info(self, instance_id): 104 | route = '/v1/envs/{}/observation_space/'.format(instance_id) 105 | resp = self._get_request(route) 106 | info = resp['info'] 107 | return info 108 | 109 | def env_observation_space_contains(self, instance_id, params): 110 | route = '/v1/envs/{}/observation_space/contains'.format(instance_id) 111 | resp = self._post_request(route, params) 112 | member = resp['member'] 113 | return member 114 | 115 | def env_monitor_start(self, instance_id, directory, 116 | force=False, resume=False, video_callable=False): 117 | route = '/v1/envs/{}/monitor/start/'.format(instance_id) 118 | data = {'directory': directory, 119 | 'force': force, 120 | 'resume': resume, 121 | 'video_callable': video_callable} 122 | self._post_request(route, data) 123 | 124 | def env_monitor_close(self, instance_id): 125 | route = '/v1/envs/{}/monitor/close/'.format(instance_id) 126 | self._post_request(route, None) 127 | 128 | def env_close(self, instance_id): 129 | route = 
'/v1/envs/{}/close/'.format(instance_id) 130 | self._post_request(route, None) 131 | 132 | def upload(self, training_dir, algorithm_id=None, api_key=None): 133 | if not api_key: 134 | api_key = os.environ.get('OPENAI_GYM_API_KEY') 135 | 136 | route = '/v1/upload/' 137 | data = {'training_dir': training_dir, 138 | 'algorithm_id': algorithm_id, 139 | 'api_key': api_key} 140 | self._post_request(route, data) 141 | 142 | def shutdown_server(self): 143 | route = '/v1/shutdown/' 144 | self._post_request(route, None) 145 | 146 | 147 | class ServerError(Exception): 148 | def __init__(self, message, status_code=None): 149 | Exception.__init__(self) 150 | self.message = message 151 | if status_code is not None: 152 | self.status_code = status_code 153 | 154 | 155 | if __name__ == '__main__': 156 | remote_base = 'http://127.0.0.1:5000' 157 | client = Client(remote_base) 158 | 159 | # Create environment 160 | env_id = 'CartPole-v0' 161 | instance_id = client.env_create(env_id) 162 | print(instance_id) 163 | # Check properties 164 | all_envs = client.env_list_all() 165 | action_info = client.env_action_space_info(instance_id) 166 | obs_info = client.env_observation_space_info(instance_id) 167 | print(obs_info) 168 | # Run a single step 169 | client.env_monitor_start(instance_id, directory='tmp', force=True) 170 | init_obs = client.env_reset(instance_id) 171 | [observation, reward, done, info] = client.env_step(instance_id, 1, False) 172 | client.env_monitor_close(instance_id) 173 | print(observation, reward, done, info) 174 | # client.upload(training_dir='tmp') 175 | -------------------------------------------------------------------------------- /script/batchrl_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import d3rlpy 5 | import sys 6 | import torch 7 | from rl4rs.env.slate import SlateRecEnv, SlateState 8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState 9 | from script import batchrl_trainer 10 | from d3rlpy.dataset import MDPDataset 11 | from script.offline_evaluation import ope_eval 12 | from rl4rs.policy.behavior_model import behavior_model 13 | from rl4rs.policy.policy_model import policy_model 14 | from rl4rs.nets.cql.encoder import CustomVectorEncoderFactory 15 | from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer 16 | from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer 17 | from d3rlpy.metrics.scorer import dynamics_prediction_variance_scorer 18 | 19 | algo = sys.argv[1] 20 | stage = sys.argv[2] 21 | extra_config = eval(sys.argv[3]) if len(sys.argv) >= 4 else {} 22 | 23 | config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432, 24 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, 25 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_shuf.csv', 26 | "model_file": "../output/rl4rs_dataset_a_dnn/model", 'gpu': True, "page_items": 9, 'action_emb_size':32, 27 | "iteminfo_file": '../dataset/item_info.csv', "support_d3rl_mask": True, "is_eval": True, 28 | "CQL_alpha": 1, 'env': 'SlateRecEnv-v0', 'trial_name': 'a_all'} 29 | 30 | config = dict(config, **extra_config) 31 | 32 | if config['env'] == 'SeqSlateRecEnv-v0': 33 | config['max_steps'] = 36 34 | location_mask, special_items = SeqSlateState.get_mask_from_file(config['iteminfo_file'], config['action_size']) 35 | config['location_mask'] = location_mask 36 | 
config['special_items'] = special_items
37 | elif config['env'] == 'SlateRecEnv-v0':
38 |     location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], config['action_size'])
39 |     config['location_mask'] = location_mask
40 |     config['special_items'] = special_items
41 | else:
42 |     assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
43 | 
44 | if algo in ('MOPO', 'COMBO') or 'conti' in algo:
45 |     config["support_conti_env"] = True
46 | 
47 | if not config.get('gpu', True):
48 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
49 |     torch.cuda.is_available = lambda: False
50 |     print('CUDA disabled, torch.cuda.is_available() ->', torch.cuda.is_available())
51 | 
52 | if not config.get("support_conti_env", False):
53 |     trial_name = config['env'] + '_' + config['trial_name'] + '.h5'
54 | elif config.get("support_onehot_action", False):
55 |     config['action_emb_size'] = config["action_size"]
56 |     trial_name = config['env'] + '_' + config['trial_name'] + '_onehot.h5'
57 | else:
58 |     trial_name = config['env'] + '_' + config['trial_name'] + '_conti.h5'
59 | dataset_dir = os.environ['rl4rs_dataset_dir']
60 | output_dir = os.environ['rl4rs_output_dir']
61 | dataset_save_path = dataset_dir + '/' + trial_name
62 | dynamics_save_path = output_dir + '/' + 'dynamics' + '_' + trial_name
63 | model_save_path = output_dir + '/' + algo + '_' + trial_name
64 | scaler = None
65 | print(trial_name, config)
66 | 
67 | try:
68 |     dataset = MDPDataset.load(dataset_save_path)
69 | except Exception:
70 |     dataset = None
71 | 
72 | try:
73 |     dynamics = batchrl_trainer.get_model(config, 'dynamics')
74 |     dynamics = batchrl_trainer.build_with_dataset(dynamics, dataset)
75 |     dynamics.load_model(dynamics_save_path)
76 | except Exception:
77 |     dynamics = None
78 | 
79 | if stage == 'dataset_generate':
80 |     if config['env'] == 'SlateRecEnv-v0':
81 |         if not config.get("support_conti_env", False):
82 |             batchrl_trainer.data_generate_rl4rs_a(config, dataset_save_path)
83 |         else:
84 |             batchrl_trainer.data_generate_rl4rs_a_conti(config, dataset_save_path)
85 |     elif config['env'] == 'SeqSlateRecEnv-v0':
86 |         if not config.get("support_conti_env", False):
87 |             batchrl_trainer.data_generate_rl4rs_b(config, dataset_save_path)
88 |         else:
89 |             batchrl_trainer.data_generate_rl4rs_b_conti(config, dataset_save_path)
90 |     else:
91 |         # unsupported env: fail fast instead of silently generating the dataset-A variant
92 |         assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
93 | 
94 | if stage == 'train_dynamics' or (stage == 'train' and algo == 'dynamics'):
95 |     dynamics = batchrl_trainer.get_model(config, 'dynamics')
96 |     print('get_action_size', dataset.episodes[0].get_action_size())
97 |     dynamics.fit(dataset,
98 |                  eval_episodes=dataset.episodes[-3000:],
99 |                  n_epochs=10,
100 |                  show_progress=False,
101 |                  scorers={
102 |                      'observation_error': dynamics_observation_prediction_error_scorer,
103 |                      'reward_error': dynamics_reward_prediction_error_scorer,
104 |                      'variance': dynamics_prediction_variance_scorer,
105 |                  }
106 |                  )
107 |     dynamics.save_model(dynamics_save_path)
108 | 
109 | if stage == 'train':
110 |     model = batchrl_trainer.get_model(config, algo, dynamics)
111 |     model.fit(dataset,
112 |               eval_episodes=dataset.episodes[-3000:],
113 |               n_epochs=config['epoch'],
114 |               show_progress=False)
115 |     model.save_model(model_save_path)
116 | 
117 | if stage == 'eval':
118 |     default_soft_opc_score = 90 \
119 |         if config['env'] == 'SlateRecEnv-v0' \
120 |         else 90 * 2
121 |     soft_opc_score = config.get('soft_opc_score', default_soft_opc_score)
122 |     model = 
batchrl_trainer.get_model(config, algo, dynamics) 123 | model = batchrl_trainer.build_with_dataset(model, dataset) 124 | model.load_model(model_save_path) 125 | eval_episodes = random.sample(dataset.episodes, 2048 * 4) 126 | policy = policy_model(model, config=config) 127 | # batchrl_trainer.d3rlpy_eval(eval_episodes, policy, soft_opc_score) 128 | batchrl_trainer.evaluate(config, policy) 129 | 130 | if stage == 'ope': 131 | dataset_dir = os.environ['rl4rs_dataset_dir'] 132 | sample_model = behavior_model(config, modelfile=dataset_dir + '/logged_policy.h5') 133 | model = batchrl_trainer.get_model(config, algo, dynamics) 134 | model = batchrl_trainer.build_with_dataset(model, dataset) 135 | model.load_model(model_save_path) 136 | eval_config = config.copy() 137 | eval_config["is_eval"] = True 138 | eval_config["batch_size"] = 2048 139 | eval_config["epoch"] = 1 140 | if config['env'] == 'SeqSlateRecEnv-v0': 141 | config['max_steps'] = 36 142 | sim = SeqSlateRecEnv(eval_config, state_cls=SeqSlateState) 143 | eval_env = gym.make('SeqSlateRecEnv-v0', recsim=sim) 144 | else: 145 | sim = SlateRecEnv(eval_config, state_cls=SlateState) 146 | eval_env = gym.make('SlateRecEnv-v0', recsim=sim) 147 | ope_eval(eval_config, eval_env, model, sample_model=sample_model) 148 | -------------------------------------------------------------------------------- /reproductions/run_modelfree_rl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 8 | script_dir=${rl4rs_benchmark_dir}/script 9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 10 | 11 | algo=$1 12 | 13 | cd ${script_dir} 14 | 15 | # experiment in a_all env, train in a_all sample and test in a_all sample 16 | python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 17 | python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 18 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log && 19 | 20 | 21 | # experiment in a_all env, train in a_train sample and test in a_test sample 22 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 23 | #python -u modelfree_train.py 
$algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 24 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log && 25 | 26 | 27 | # experiment train in a_sl env and test in a_rl env 28 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 29 | #python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 30 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log && 31 | 32 | 33 | # experiment in b_all env, train in b_all sample and test in b_all sample 34 | python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 35 | python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 36 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log && 37 | 38 | 39 | # experiment in b_all env, train in b_train sample and test in b_test sample 40 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 41 | #python -u modelfree_train.py 
$algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 42 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log && 43 | 44 | 45 | # experiment train in b_sl env and test in b_rl env 46 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 47 | #python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 48 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log && 49 | 50 | echo "1" 51 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rl4rs 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=4.5=1_gnu 7 | - _tflow_select=2.3.0=mkl 8 | - absl-py=0.15.0=pyhd3eb1b0_0 9 | - argon2-cffi=20.1.0=py36h27cfd23_1 10 | - astor=0.8.1=py36h06a4308_0 11 | - async_generator=1.10=py36h28b3542_0 12 | - attrs=21.2.0=pyhd3eb1b0_0 13 | - backcall=0.2.0=pyhd3eb1b0_0 14 | - blas=1.0=mkl 15 | - bleach=4.0.0=pyhd3eb1b0_0 16 | - c-ares=1.17.1=h27cfd23_0 17 | - ca-certificates=2021.10.26=h06a4308_2 18 | - certifi=2021.5.30=py36h06a4308_0 19 | - cffi=1.14.6=py36h400218f_0 20 | - coverage=5.5=py36h27cfd23_2 21 | - cython=0.29.24=py36h295c915_0 22 | - dataclasses=0.8=pyh4f3eec9_6 23 | - dbus=1.13.18=hb2f20db_0 24 | - decorator=5.1.0=pyhd3eb1b0_0 25 | - defusedxml=0.7.1=pyhd3eb1b0_0 26 | - entrypoints=0.3=py36_0 27 | - expat=2.4.1=h2531618_2 28 | - fontconfig=2.13.1=h6c09931_0 29 | - freetype=2.11.0=h70c0345_0 30 | - glib=2.69.1=h5202010_0 31 | - google-pasta=0.2.0=pyhd3eb1b0_0 32 | - grpcio=1.36.1=py36h2157cd5_1 33 | - gst-plugins-base=1.14.0=h8213a91_2 34 | - gstreamer=1.14.0=h28cd5cc_2 35 | - h5py=2.10.0=py36hd6299e0_1 36 | - hdf5=1.10.6=hb1b8bf9_0 37 | - icu=58.2=he6710b0_3 38 | - importlib-metadata=4.8.1=py36h06a4308_0 39 | - importlib_metadata=4.8.1=hd3eb1b0_0 40 | - intel-openmp=2021.4.0=h06a4308_3561 41 | - ipykernel=5.3.4=py36h5ca1d4c_0 42 | - ipython=7.16.1=py36h5ca1d4c_0 43 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 44 | - 
ipywidgets=7.6.5=pyhd3eb1b0_1 45 | - jedi=0.17.0=py36_0 46 | - jinja2=3.0.2=pyhd3eb1b0_0 47 | - jpeg=9d=h7f8727e_0 48 | - jsonschema=3.2.0=pyhd3eb1b0_2 49 | - jupyter=1.0.0=py36_7 50 | - jupyter_client=7.1.0=pyhd3eb1b0_0 51 | - jupyter_console=6.4.0=pyhd3eb1b0_0 52 | - jupyter_core=4.8.1=py36h06a4308_0 53 | - jupyterlab_pygments=0.1.2=py_0 54 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 55 | - keras-applications=1.0.8=py_1 56 | - keras-preprocessing=1.1.2=pyhd3eb1b0_0 57 | - ld_impl_linux-64=2.35.1=h7274673_9 58 | - libffi=3.3=he6710b0_2 59 | - libgcc-ng=9.3.0=h5101ec6_17 60 | - libgfortran-ng=7.5.0=ha8ba4b0_17 61 | - libgfortran4=7.5.0=ha8ba4b0_17 62 | - libgomp=9.3.0=h5101ec6_17 63 | - libpng=1.6.37=hbc83047_0 64 | - libprotobuf=3.17.2=h4ff587b_1 65 | - libsodium=1.0.18=h7b6447c_0 66 | - libstdcxx-ng=9.3.0=hd4cf53a_17 67 | - libuuid=1.0.3=h7f8727e_2 68 | - libxcb=1.14=h7b6447c_0 69 | - libxml2=2.9.12=h03d6c58_0 70 | - markdown=3.3.4=py36h06a4308_0 71 | - markupsafe=2.0.1=py36h27cfd23_0 72 | - mistune=0.8.4=py36h7b6447c_0 73 | - mkl=2020.2=256 74 | - mkl-service=2.3.0=py36he8ac12f_0 75 | - mkl_fft=1.3.0=py36h54f3939_0 76 | - mkl_random=1.1.1=py36h0573a6f_0 77 | - nbclient=0.5.3=pyhd3eb1b0_0 78 | - nbconvert=6.0.7=py36_0 79 | - nbformat=5.1.3=pyhd3eb1b0_0 80 | - ncurses=6.3=h7f8727e_2 81 | - nest-asyncio=1.5.1=pyhd3eb1b0_0 82 | - notebook=6.4.3=py36h06a4308_0 83 | - numpy=1.19.2=py36h54aff64_0 84 | - numpy-base=1.19.2=py36hfa32c7d_0 85 | - openssl=1.1.1m=h7f8727e_0 86 | - packaging=21.3=pyhd3eb1b0_0 87 | - pandoc=2.12=h06a4308_0 88 | - pandocfilters=1.4.3=py36h06a4308_1 89 | - parso=0.8.2=pyhd3eb1b0_0 90 | - pcre=8.45=h295c915_0 91 | - pexpect=4.8.0=pyhd3eb1b0_3 92 | - pickleshare=0.7.5=pyhd3eb1b0_1003 93 | - pip=21.2.2=py36h06a4308_0 94 | - prometheus_client=0.12.0=pyhd3eb1b0_0 95 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 96 | - prompt_toolkit=3.0.20=hd3eb1b0_0 97 | - ptyprocess=0.7.0=pyhd3eb1b0_2 98 | - pycparser=2.21=pyhd3eb1b0_0 99 | - pygments=2.10.0=pyhd3eb1b0_0 100 | - pyparsing=3.0.4=pyhd3eb1b0_0 101 | - pyqt=5.9.2=py36h05f1152_2 102 | - python=3.6.13=h12debd9_1 103 | - python-dateutil=2.8.2=pyhd3eb1b0_0 104 | - pyzmq=22.2.1=py36h295c915_1 105 | - qt=5.9.7=h5867ecd_1 106 | - qtconsole=5.1.1=pyhd3eb1b0_0 107 | - qtpy=1.10.0=pyhd3eb1b0_0 108 | - readline=8.1=h27cfd23_0 109 | - scipy=1.3.1=py36h7c811a0_0 110 | - send2trash=1.8.0=pyhd3eb1b0_1 111 | - setuptools=58.0.4=py36h06a4308_0 112 | - sip=4.19.8=py36hf484d3e_0 113 | - six=1.16.0=pyhd3eb1b0_0 114 | - sqlite=3.37.0=hc218d9a_0 115 | - termcolor=1.1.0=py36h06a4308_1 116 | - terminado=0.9.4=py36h06a4308_0 117 | - testpath=0.5.0=pyhd3eb1b0_0 118 | - tk=8.6.11=h1ccaba5_0 119 | - tornado=6.1=py36h27cfd23_0 120 | - traitlets=4.3.3=py36h06a4308_0 121 | - typing_extensions=3.10.0.2=pyh06a4308_0 122 | - wcwidth=0.2.5=pyhd3eb1b0_0 123 | - webencodings=0.5.1=py36_1 124 | - werkzeug=2.0.2=pyhd3eb1b0_0 125 | - wheel=0.37.0=pyhd3eb1b0_1 126 | - widgetsnbextension=3.5.1=py36_0 127 | - wrapt=1.12.1=py36h7b6447c_1 128 | - xz=5.2.5=h7b6447c_0 129 | - zeromq=4.3.4=h2531618_0 130 | - zipp=3.6.0=pyhd3eb1b0_0 131 | - zlib=1.2.11=h7f8727e_4 132 | - pip: 133 | - aiohttp==3.7.4.post0 134 | - aiohttp-cors==0.7.0 135 | - aioredis==1.3.1 136 | - antlr4-python3-runtime==4.8 137 | - async-timeout==3.0.1 138 | - blessings==1.7 139 | - cachetools==4.2.2 140 | - chardet==4.0.0 141 | - charset-normalizer==2.0.6 142 | - click==8.0.1 143 | - cloudpickle==1.6.0 144 | - colorama==0.4.4 145 | - contextvars==2.4 146 | - d3rlpy==0.91 147 | - deepctr==0.9.0 148 | - dm-tree==0.1.6 
149 | - fairseq==0.10.2 150 | - filelock==3.0.12 151 | - flask==1.1.2 152 | - gast==0.2.2 153 | - google-api-core==1.31.2 154 | - google-auth==1.35.0 155 | - googleapis-common-protos==1.53.0 156 | - gpustat==0.6.0 157 | - gputil==1.4.0 158 | - greenlet==1.1.2 159 | - gym==0.19.0 160 | - hiredis==2.0.0 161 | - hydra-core==1.1.1 162 | - idna==3.2 163 | - idna-ssl==1.1.0 164 | - immutables==0.16 165 | - importlib-resources==5.2.2 166 | - itsdangerous==2.0.1 167 | - joblib==1.0.1 168 | - keras==2.2.5 169 | - keras-embed-sim==0.10.0 170 | - keras-layer-normalization==0.16.0 171 | - keras-multi-head==0.29.0 172 | - keras-pos-embd==0.13.0 173 | - keras-position-wise-feed-forward==0.8.0 174 | - keras-self-attention==0.51.0 175 | - keras-transformer==0.40.0 176 | - lightseq==2.1.4 177 | - lz4==3.1.3 178 | - msgpack==1.0.2 179 | - multidict==5.1.0 180 | - ninja==1.10.2 181 | - nvidia-ml-py3==7.352.0 182 | - omegaconf==2.1.1 183 | - opencensus==0.7.13 184 | - opencensus-context==0.1.2 185 | - opencv-python-headless==4.3.0.36 186 | - opt-einsum==3.3.0 187 | - pandas==1.1.5 188 | - pandasql==0.7.3 189 | - portalocker==2.3.2 190 | - protobuf==3.19.3 191 | - psutil==5.8.0 192 | - py-spy==0.3.9 193 | - pyasn1==0.4.8 194 | - pyasn1-modules==0.2.8 195 | - pydantic==1.8.2 196 | - pyrsistent==0.18.0 197 | - pytz==2021.1 198 | - pyyaml==5.4.1 199 | - ray==1.5.1 200 | - redis==3.5.3 201 | - regex==2021.8.28 202 | - requests==2.27.1 203 | - rsa==4.7.2 204 | - sacrebleu==2.0.0 205 | - sacremoses==0.0.45 206 | - scikit-learn==0.24.2 207 | - sklearn==0.0 208 | - sqlalchemy==1.4.29 209 | - structlog==21.3.0 210 | - tabulate==0.8.9 211 | - tensorboard==1.15.0 212 | - tensorboardx==2.4.1 213 | - tensorflow-estimator==1.15.1 214 | - tensorflow-gpu==1.15.0 215 | - threadpoolctl==2.2.0 216 | - torch==1.9.0 217 | - tqdm==4.62.2 218 | - urllib3==1.26.6 219 | - yarl==1.6.3 220 | prefix: /project/miniconda3/envs/rl4rs -------------------------------------------------------------------------------- /reproductions/run_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate rl4rs 4 | script_abs=$(readlink -f "$0") 5 | rl4rs_benchmark_dir=$(dirname $script_abs)/.. 
6 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset 7 | script_dir=${rl4rs_benchmark_dir}/script 8 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output 9 | mkdir $rl4rs_output_dir 10 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir 11 | 12 | cd $rl4rs_dataset_dir 13 | 14 | #raw dataset 15 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a.csv && 16 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv >> rl4rs_dataset_a.csv && 17 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b.csv && 18 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_rl.csv >> rl4rs_dataset_b.csv && 19 | 20 | #train/test split 21 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_train.csv && 22 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_test.csv && 23 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_train.csv && 24 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_test.csv && 25 | 26 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_train.csv && 27 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_test.csv && 28 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_train.csv && 29 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_test.csv && 30 | 31 | cat rl4rs_dataset_a_sl_train.csv > rl4rs_dataset_a_train.csv && 32 | cat rl4rs_dataset_a_rl_train.csv >> rl4rs_dataset_a_train.csv && 33 | cat rl4rs_dataset_b_sl_train.csv > rl4rs_dataset_b_train.csv && 34 | cat rl4rs_dataset_b_rl_train.csv >> rl4rs_dataset_b_train.csv && 35 | 36 | cat rl4rs_dataset_a_sl_test.csv > rl4rs_dataset_a_test.csv && 37 | cat rl4rs_dataset_a_rl_test.csv >> rl4rs_dataset_a_test.csv && 38 | cat rl4rs_dataset_b_sl_test.csv > rl4rs_dataset_b_test.csv && 39 | cat rl4rs_dataset_b_rl_test.csv >> rl4rs_dataset_b_test.csv && 40 | 41 | cat rl4rs_dataset_a_train.csv > rl4rs_dataset_a.csv && 42 | cat rl4rs_dataset_a_test.csv >> rl4rs_dataset_a.csv && 43 | cat rl4rs_dataset_b_train.csv > rl4rs_dataset_b.csv && 44 | cat rl4rs_dataset_b_test.csv >> rl4rs_dataset_b.csv && 45 | 46 | #dataset_b 47 | cd ${script_dir} && 48 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "data_augment" && 49 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "data_augment" && 50 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "data_augment" && 51 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "data_augment" && 52 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "data_augment" && 53 | 54 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl.csv" "slate2trajectory" && 55 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl.csv" "slate2trajectory" && 56 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_train.csv" "slate2trajectory" && 57 | python data_preprocess.py 
"${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_test.csv" "slate2trajectory" && 58 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3.csv" "slate2trajectory" && 59 | 60 | 61 | #shuffle for RL Env. 62 | cd $rl4rs_dataset_dir && 63 | cat rl4rs_dataset_a.csv|shuf > rl4rs_dataset_a_shuf.csv && 64 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv|shuf > rl4rs_dataset_a_sl_shuf.csv && 65 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv|shuf > rl4rs_dataset_a_rl_shuf.csv && 66 | cat rl4rs_dataset_a_train.csv|shuf > rl4rs_dataset_a_train_shuf.csv && 67 | cat rl4rs_dataset_a_test.csv|shuf > rl4rs_dataset_a_test_shuf.csv && 68 | cat rl4rs_dataset_b3.csv|shuf > rl4rs_dataset_b3_shuf.csv && 69 | cat rl4rs_dataset_b3_sl.csv|shuf > rl4rs_dataset_b3_sl_shuf.csv && 70 | cat rl4rs_dataset_b3_rl.csv|shuf > rl4rs_dataset_b3_rl_shuf.csv && 71 | cat rl4rs_dataset_b3_train.csv|shuf > rl4rs_dataset_b3_train_shuf.csv && 72 | cat rl4rs_dataset_b3_test.csv|shuf > rl4rs_dataset_b3_test_shuf.csv && 73 | 74 | 75 | cd $(dirname $script_abs) && 76 | bash file_split.sh "rl4rs_dataset_a_sl_shuf.csv" && 77 | bash file_split.sh "rl4rs_dataset_a_rl_shuf.csv" && 78 | bash file_split.sh "rl4rs_dataset_a_train_shuf.csv" && 79 | bash file_split.sh "rl4rs_dataset_a_test_shuf.csv" && 80 | bash file_split.sh "rl4rs_dataset_a_shuf.csv" && 81 | bash file_split.sh "rl4rs_dataset_b2_sl.csv" && 82 | bash file_split.sh "rl4rs_dataset_b2_rl.csv" && 83 | bash file_split.sh "rl4rs_dataset_b2_train.csv" && 84 | bash file_split.sh "rl4rs_dataset_b2_test.csv" && 85 | bash file_split.sh "rl4rs_dataset_b2.csv" 86 | 87 | 88 | #tfrecord for supervised learning 89 | cd ${script_dir} 90 | 91 | for ((i=0;i<5;i=i+1)) 92 | do 93 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord.${i}" "tfrecord_item" && 94 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord.${i}" "tfrecord_item" && 95 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord.${i}" "tfrecord_item" && 96 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_test_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord.${i}" "tfrecord_item" && 97 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord.${i}" "tfrecord_item" && 98 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord.${i}" "tfrecord_item" && 99 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord.${i}" "tfrecord_item" && 100 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord.${i}" "tfrecord_item" && 101 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_test.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord.${i}" "tfrecord_item" && 102 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord.${i}" "tfrecord_item" && 103 | echo "1" 104 | done 105 | 106 | cd ${script_dir} && 107 | python data_preprocess.py 
"${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "tfrecord_slate" && 108 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "tfrecord_slate" && 109 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "tfrecord_slate" && 110 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "tfrecord_slate" && 111 | 112 | echo "1" 113 | -------------------------------------------------------------------------------- /script/data_preprocess.py: -------------------------------------------------------------------------------- 1 | from rl4rs.utils.datautil import FeatureUtil 2 | import numpy as np 3 | import sys, os, random 4 | 5 | 6 | def data_augment(file, out_file): 7 | f = open(out_file, 'w') 8 | data = open(file, 'r').read().split('\n') 9 | data_size = len(data) 10 | print('data length', data_size) 11 | tmp = [] 12 | role_id_prev = None 13 | for record in data: 14 | if len(record) < 1 or 'timestamp' in record: 15 | continue 16 | role_id = record.split('@')[1] 17 | if role_id == role_id_prev or role_id_prev is None: 18 | tmp.append(record) 19 | role_id_prev = role_id 20 | else: 21 | assert len(tmp) <= 4 22 | for i in range(len(tmp), 4): 23 | timestamp, session_id, sequence_id, exposed_items, user_feedback, \ 24 | user_seqfeature, user_protrait, item_feature, behavior_policy_id = tmp[-1].split('@') 25 | timestamp_new = str(int(timestamp) + 1) 26 | sequence_id_new = str(int(sequence_id) + 1) 27 | random_i = np.random.randint(1, data_size - 1) 28 | exposed_items_new = data[random_i].split('@')[3] 29 | item_feature_new = data[random_i].split('@')[7] 30 | user_feedback_new = '0,0,0,0,0,0,0,0,0' 31 | tmp.append('@'.join([ 32 | timestamp_new, 33 | session_id, 34 | sequence_id_new, 35 | exposed_items_new, 36 | user_feedback_new, 37 | user_seqfeature, 38 | user_protrait, 39 | item_feature_new, 40 | behavior_policy_id 41 | ])) 42 | print(*tmp, sep='\n', end='\n', file=f) 43 | tmp = [record] 44 | role_id_prev = role_id 45 | f.close() 46 | 47 | 48 | def slate2trajectory(file, out_file): 49 | f = open(out_file, 'w') 50 | data = open(file, 'r').read().split('\n') 51 | data_size = len(data) 52 | print('data length', data_size) 53 | tmp = [] 54 | role_id_prev = None 55 | for record in data: 56 | if len(record) < 1 or 'timestamp' in record: 57 | continue 58 | role_id = record.split('@')[1] 59 | if role_id == role_id_prev or role_id_prev is None: 60 | tmp.append(record) 61 | role_id_prev = role_id 62 | else: 63 | assert len(tmp) == 4 64 | # timestamp, session_id, sequence_id, exposed_items, user_feedback, user_seqfeature, user_protrait, item_feature, behavior_policy_id 65 | timestamp = tmp[0].split('@')[0] 66 | session_id = tmp[0].split('@')[1] 67 | sequence_id = '1' 68 | exposed_items = ','.join([x.split('@')[3] for x in tmp]) 69 | user_feedback = ','.join([x.split('@')[4] for x in tmp]) 70 | user_seqfeature = tmp[0].split('@')[5] 71 | user_protrait = tmp[0].split('@')[6] 72 | item_feature = ';'.join([x.split('@')[7] for x in tmp]) 73 | behavior_policy_id = tmp[0].split('@')[8] 74 | traj = [ 75 | timestamp, 76 | session_id, 77 | sequence_id, 78 | exposed_items, 79 | user_feedback, 80 | user_seqfeature, 81 | user_protrait, 82 | item_feature, 83 | behavior_policy_id 84 | ] 85 | print(*traj, sep='@', 
end='\n', file=f) 86 | tmp = [record] 87 | role_id_prev = role_id 88 | f.close() 89 | 90 | 91 | def dataset2tfrecord(config, file, tfrecord_file, is_slate): 92 | def feature_construct(session, is_slate): 93 | samples = [] 94 | for i in range(len(session)): 95 | _, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \ 96 | user_protrait, item_feature, _ = FeatureUtil.record_split(session[i]) 97 | assert sequence_id - 1 == i 98 | user_protrait_category = user_protrait[:10] 99 | user_protrait_dense = user_protrait[10:] 100 | category_feature = user_protrait_category + [sequence_id] + exposed_items 101 | prev_items = [session[ii].split('@')[3].split(',')[jj] for ii in range(i) for jj in range(9)] 102 | prev_items = list(map(int, prev_items)) 103 | sequence_feature_clicked = prev_items if i > 0 else [0] 104 | sequence_feature = [user_seqfeature, sequence_feature_clicked] 105 | if is_slate: 106 | # label = '0' 107 | label = 0 108 | samples.append(( 109 | role_id_prev, 110 | sequence_feature, 111 | user_protrait_dense + item_feature, 112 | category_feature, 113 | user_feedback, 114 | label 115 | )) 116 | else: 117 | for j in range(9): 118 | item_id = exposed_items[j] 119 | label = user_feedback[j] 120 | item_feature_size = len(item_feature) // 9 121 | item_feature_j = item_feature[item_feature_size * j:item_feature_size * (j + 1)] 122 | category_feature_j = category_feature + [item_id] 123 | dense_feature_j = item_feature + item_feature_j 124 | samples.append(( 125 | role_id_prev, 126 | sequence_feature, 127 | user_protrait_dense + dense_feature_j, 128 | category_feature_j, 129 | user_feedback, 130 | label 131 | )) 132 | return samples 133 | featureutil = FeatureUtil(config) 134 | data = open(file, 'r').read().split('\n') 135 | print('data length', len(data)) 136 | # role_id, sequence_feature, dense_feature, category_feature, label 137 | # timestamp@session_id@sequence_id@exposed_items@user_feedback@user_seqfeature@user_protrait@item_feature@behavior_policy_id 138 | tmp = [] 139 | records = [] 140 | role_id_prev = None 141 | for record in data: 142 | if len(record) < 1 or 'timestamp' in record: 143 | continue 144 | role_id = record.split('@')[1] 145 | if role_id == role_id_prev or role_id_prev is None: 146 | tmp.append(record) 147 | role_id_prev = role_id 148 | else: 149 | samples = feature_construct(tmp, is_slate) 150 | records = records + samples 151 | tmp = [record] 152 | role_id_prev = role_id 153 | if len(tmp) > 0: 154 | samples = feature_construct(tmp, is_slate) 155 | records = records + samples 156 | print('tfrecord length', len(records), records[0]) 157 | random.shuffle(records) 158 | featureutil.to_tfrecord(records, tfrecord_file) 159 | 160 | 161 | config = { 162 | "maxlen": 64, 163 | "batch_size": 32, 164 | "class_num": 2, 165 | "dense_feature_num": 432, 166 | "category_feature_num": 21, 167 | "category_hash_size": 100000, 168 | "seq_num": 2 169 | } 170 | file = sys.argv[1] 171 | out_file = sys.argv[2] 172 | stage = sys.argv[3] 173 | assert stage in ('data_augment', 'slate2trajectory', 'tfrecord_item', 'tfrecord_slate') 174 | if stage == 'data_augment': 175 | data_augment(file, out_file) 176 | if stage == 'slate2trajectory': 177 | slate2trajectory(file, out_file) 178 | if stage == 'tfrecord_item': 179 | dataset2tfrecord(config, file, out_file, is_slate=False) 180 | if stage == 'tfrecord_slate': 181 | dataset2tfrecord(config, file, out_file, is_slate=True) 182 | -------------------------------------------------------------------------------- 
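A caveat in the two rewriters above: data_augment and slate2trajectory only emit a session when the first record of the next session arrives, so the final session accumulated in tmp is never written out (dataset2tfrecord flushes this case explicitly with its trailing "if len(tmp) > 0" block). A minimal sketch of a session-grouping helper that also yields that trailing session follows; the helper name and the usage shown are illustrative, not part of the repository:

def iter_sessions(lines):
    """Group '@'-separated records by session_id, yielding every session."""
    session, prev_id = [], None
    for record in lines:
        if len(record) < 1 or 'timestamp' in record:
            continue
        session_id = record.split('@')[1]
        if prev_id is not None and session_id != prev_id:
            yield session
            session = []
        session.append(record)
        prev_id = session_id
    if session:
        # flush the trailing session that a "write on session change" loop drops
        yield session

# illustrative usage inside data_augment/slate2trajectory-style code:
# for tmp in iter_sessions(open(file, 'r').read().split('\n')):
#     ...emit the augmented or flattened session...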
/rl4rs/utils/offline_policy_metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # import scipy
3 | import scipy.stats
4 | 
5 | 
6 | # adapted from https://mars-gym.readthedocs.io/en/latest/quick_start.html#off-policy-metrics
7 | 
8 | def _calc_sequential_weights(policy_prob, behavior_prob, weighted=False, a_min=None, a_max=None):
9 |     # behavior_prob: probabilities under the logging (behavior) policy
10 |     # policy_prob: probabilities under the target (evaluation) policy
11 |     #
12 |     # Compute the sample weights - propensity ratios
13 |     probs = np.array(policy_prob) / np.array(behavior_prob)
14 |     rho = np.clip(probs, a_min=a_min, a_max=a_max).cumprod(1)
15 |     if weighted:
16 |         weight = np.sum(rho, axis=0)
17 |     else:
18 |         weight = len(policy_prob)
19 |     ws = rho / weight
20 |     return np.clip(ws, a_min=a_min, a_max=a_max)
21 | 
22 | 
23 | def _calc_sample_weights(policy_prob, behavior_prob, a_min=None, a_max=None):
24 |     # behavior_prob: probabilities under the logging (behavior) policy
25 |     # policy_prob: probabilities under the target (evaluation) policy
26 |     #
27 |     # Compute the sample weights - propensity ratios
28 |     p_ratio = np.array(policy_prob) / np.array(behavior_prob)
29 | 
30 |     if a_min is not None:
31 |         p_ratio = np.clip(p_ratio, a_min=a_min, a_max=a_max)
32 | 
33 |     # Effective sample size for E_t estimate (from A. Owen)
34 |     n_e = len(policy_prob) * (np.mean(p_ratio) ** 2) / (p_ratio ** 2).mean()
35 | 
36 |     # Critical value from t-distribution as we have unknown variance
37 |     alpha = 0.00125
38 |     cv = scipy.stats.t.ppf(1 - alpha, df=int(n_e) - 1)
39 | 
40 |     return p_ratio, n_e, cv
41 | 
42 | 
43 | def eval_DM(policy, obs):
44 |     return policy(obs)
45 | 
46 | 
47 | def eval_IPS(rewards, policy_prob, behavior_prob):
48 |     # Calculate sample weights
49 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob)
50 |     ###############
51 |     # VANILLA IPS #
52 |     ###############
53 |     # Expected reward for pi_t
54 |     E_t = np.mean(rewards * p_ratio)
55 | 
56 |     # Variance of the estimate
57 |     var = ((rewards * p_ratio - E_t) ** 2).mean()
58 |     stddev = np.sqrt(var)
59 | 
60 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
61 |     c = cv * stddev / np.sqrt(int(n_e))
62 |     min_bound = E_t - c
63 |     max_bound = E_t + c
64 | 
65 |     result = (E_t, c)  # 0.025, 0.500, 0.975
66 |     return result
67 | 
68 | 
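# Worked example (illustrative values, not repository data): with
# rewards = [1, 0, 1], policy_prob = [0.5, 0.2, 0.4] and
# behavior_prob = [0.25, 0.4, 0.8], the propensity ratios are
# [2.0, 0.5, 0.5] and the IPS estimate is
# E_t = mean([1 * 2.0, 0 * 0.5, 1 * 0.5]) = 2.5 / 3 ≈ 0.833:
# rewards the target policy was relatively more likely to produce
# are up-weighted, the others down-weighted.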
69 | def eval_CIPS(rewards, policy_prob, behavior_prob):
70 |     # Calculate sample weights
71 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
72 | 
73 |     ##############
74 |     # CAPPED IPS #
75 |     ##############
76 |     # Cap ratios
77 |     p_ratio_capped = np.clip(p_ratio, a_min=0.1, a_max=10)
78 | 
79 |     # Expected reward for pi_t
80 |     E_t_capped = np.mean(rewards * p_ratio_capped)
81 | 
82 |     # Variance of the estimate
83 |     var_capped = ((rewards * p_ratio_capped - E_t_capped) ** 2).mean()
84 |     stddev_capped = np.sqrt(var_capped)
85 | 
86 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
87 |     c = cv * stddev_capped / np.sqrt(int(n_e))
88 | 
89 |     min_bound_capped = E_t_capped - c
90 |     max_bound_capped = E_t_capped + c
91 | 
92 |     result = (E_t_capped, c)  # 0.025, 0.500, 0.975
93 | 
94 |     return result
95 | 
96 | 
97 | def eval_SNIPS(rewards, policy_prob, behavior_prob):
98 |     # Calculate sample weights
99 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
100 | 
101 |     ##############
102 |     # NORMED IPS #
103 |     ##############
104 |     # Expected reward for pi_t
105 |     E_t_normed = np.sum(rewards * p_ratio) / np.sum(p_ratio)
106 | 
107 |     # Variance of the estimate
108 |     var_normed = np.sum(((rewards - E_t_normed) ** 2) * (p_ratio ** 2)) / (
109 |         p_ratio.sum() ** 2
110 |     )
111 |     stddev_normed = np.sqrt(var_normed)
112 | 
113 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
114 |     c = cv * stddev_normed / np.sqrt(int(n_e))
115 | 
116 |     min_bound_normed = E_t_normed - c
117 |     max_bound_normed = E_t_normed + c
118 | 
119 |     # Store result
120 |     result = (E_t_normed, c)  # 0.025, 0.500, 0.975
121 | 
122 |     return result
123 | 
124 | 
125 | def eval_WIPS(step_rewards, policy_prob, behavior_prob, gamma=1.0):
126 |     batch_size = len(step_rewards)
127 |     steps = len(step_rewards[0])
128 |     w_t = []
129 | 
130 |     # calculate importance ratios
131 |     p = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
132 | 
133 |     for i in range(steps):
134 |         w_t.append(np.average(p[:, :i + 1], axis=1))
135 |     w_t = np.array(w_t).swapaxes(0, 1)
136 |     # calculate stepwise weighted IS estimate
137 |     V_prev, V_step_WIS = 0.0, 0.0
138 |     for t in range(steps):
139 |         V_prev += np.sum(step_rewards[:, t] * gamma ** t)
140 |         V_step_WIS += np.sum(p[:, t] / w_t[:, t] * step_rewards[:, t] * gamma ** t)
141 |     # print('WIPS', p[:, -1], w_t[:, -1], np.max(p[:, -1] / w_t[:, -1]), step_rewards[:, -1])
142 |     return V_step_WIS / np.clip(V_prev, a_min=1e-8, a_max=None), 0
143 | 
144 | 
145 | def eval_doubly_robust(
146 |         action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
147 | ):
148 |     # Calculate sample weights
149 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
150 | 
151 |     #################
152 |     # Doubly Robust #
153 |     #################
154 | 
155 |     dr = state_rewards + (p_ratio * (rewards - action_rhat_rewards))
156 | 
157 |     confidence = 0.95
158 |     n = len(dr)
159 |     m, se = np.mean(dr), scipy.stats.sem(dr)
160 |     # h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
161 |     # print('dr', action_rhat_rewards[:2], p_ratio[:2], rewards[:2], m)
162 |     return m / np.average(rewards), se
163 | 
164 | 
165 | def eval_seq_doubly_robust(
166 |         action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
167 | ):
168 |     # Calculate sequential sample weights
169 |     ws = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
170 | 
171 |     dr = np.zeros((len(action_rhat_rewards)))
172 |     steps = len(action_rhat_rewards[0])
173 |     for i in range(steps):
174 |         t = steps - i - 1
175 |         dr = state_rewards[:, t] + ws[:, t] * (rewards[:, t] + dr - action_rhat_rewards[:, t])
176 | 
177 |     #################
178 |     # Doubly Robust #
179 |     #################
180 |     # dr = action_rhat_rewards + (p_ratio * (rewards - action_rhat_rewards))
181 |     # estimate = ws * (rewards - action_rhat_rewards) + state_rewards
182 |     # print('sdr', dr, np.average(dr), np.average(rewards))
183 | 
184 |     return np.average(dr) / np.mean(np.sum(rewards, axis=1)), 0
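# The backward loop above is the stepwise doubly robust recursion: with
# dr = 0 at the horizon,
#     dr_t = state_rewards[:, t] + ws[:, t] * (rewards[:, t] + dr_{t+1} - action_rhat_rewards[:, t])
# where ws are the clipped sequential importance weights. The returned value
# is normalized by the mean empirical return, so 1.0 means the estimated
# policy value matches the behavior data's average total reward.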
185 | 
186 | 
187 | if __name__ == '__main__':
188 |     batch_size = 10
189 |     max_steps = 9
190 |     off_rewards_sum = np.ones(batch_size, )
191 |     action_probs_mul = np.random.random((batch_size,))
192 |     behavior_probs_mul = np.random.random((batch_size,))
193 |     episode_reward = np.random.random((batch_size,)) * 2
194 |     off_rewards = np.ones((batch_size, max_steps))
195 |     action_probs = np.random.random((batch_size, max_steps))
196 |     behavior_probs = np.random.random((batch_size, max_steps))
197 |     rewards_hat = np.random.random((batch_size, max_steps))
198 |     state_reward = np.ones((batch_size, max_steps))
199 | 
200 |     ips = eval_IPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
201 |     cips = eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
202 |     snips = eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
203 |     dr = eval_doubly_robust(episode_reward, off_rewards_sum, off_rewards_sum, action_probs_mul, behavior_probs_mul)  # the original call was missing the observed-rewards argument; off_rewards_sum stands in for it in this smoke test
204 |     # step-wise
205 |     sips = eval_WIPS(off_rewards, action_probs, behavior_probs)
206 |     sdr = eval_seq_doubly_robust(rewards_hat, state_reward, off_rewards, action_probs, behavior_probs)
207 |     print(ips, cips, snips, dr, sips, sdr, sep='\n')
--------------------------------------------------------------------------------
/script/mdpchecker/mdp_checker.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import random
4 | from scipy.stats import spearmanr
5 | from keras_transformer import get_model, decode
6 | from rl4rs.mdpchecker.decoder import beam_search, token_probs
7 | 
8 | # dataset_file = 'recsys15.csv'
9 | # dataset_file = 'movielens.csv'
10 | # dataset_file = 'rl4rs.csv'
11 | # dataset_file = 'lastfm.csv'
12 | # dataset_file = 'cikm2016.csv'
13 | dataset_file = sys.argv[1] + '.csv'
14 | dataset_dir = sys.argv[2]
15 | 
16 | # the recsys15 data is sparse relative to its
17 | # number of items, so a shorter source window
18 | # is used to increase the sample size
19 | if 'recsys15' in dataset_file:
20 |     source_len = 8
21 | elif 'cikm2016' in dataset_file:
22 |     source_len = 5
23 | else:
24 |     source_len = 16
25 | target_len = 5
26 | np.random.seed(1)
27 | 
28 | data = open(dataset_dir + '/' + dataset_file).read().split('\n')[1:-1]
29 | 
30 | source_tokens = []
31 | target_tokens = []
32 | for sample in data:
33 |     user_id, items = sample.split(' ')
34 |     item_list = items.split(',')
35 |     if len(item_list) >= source_len + target_len:
36 |         # assert len(item_list) >= source_len + target_len
37 |         i = 0
38 |         if 'rl4rs' in dataset_file or 'cikm2016' in dataset_file:
39 |             source_tokens.append(item_list[:source_len])
40 |             target_tokens.append(item_list[source_len:source_len + target_len])
41 |         else:
42 |             while i + source_len + target_len < len(item_list):
43 |                 source_tokens.append(item_list[i: i + source_len])
44 |                 target_tokens.append(item_list[i + source_len: i + source_len + target_len])
45 |                 i = i + np.random.randint(source_len, source_len + target_len) // 6
46 |     else:
47 |         print('len(item_list) < source_len + target_len in', '\t', sample)
48 | 
49 | # Generate dictionaries
50 | token_dict = {
51 |     '<PAD>': 0,
52 |     '<START>': 1,
53 |     '<END>': 2,
54 | }
55 | 
56 | 
57 | def build_token_dict(token_list):
58 |     for tokens in token_list:
59 |         for token in tokens:
60 |             if token not in token_dict:
61 |                 token_dict[token] = len(token_dict)
62 |     return token_dict
63 | 
64 | 
65 | source_token_dict = build_token_dict(source_tokens)
66 | target_token_dict = build_token_dict(target_tokens)  # both names alias the same shared vocabulary dict
67 | target_token_dict_inv = {v: k for k, v in target_token_dict.items()}
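# Example invocation, following the argv handling above (the path is
# illustrative): python mdp_checker.py rl4rs /path/to/dataset
# The script fits a small Transformer to predict the next target_len items
# from the preceding source_len items, then compares greedy, top-k beam,
# and hot-item decodes in experiments I and II below.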
68 | 
69 | # Add special tokens
70 | encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
71 | decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
72 | output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
73 | 
74 | # Padding
75 | source_max_len = max(map(len, encode_tokens))
76 | target_max_len = max(map(len, decode_tokens))
77 | 
78 | encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
79 | decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
80 | output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]
81 | 
82 | encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
83 | decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
84 | decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
85 | 
86 | print('sample lens:', len(encode_input))
87 | print('source_token_dict lens:', len(source_token_dict))
88 | print('target_token_dict lens:', len(target_token_dict))
89 | # [1, 3, 4, 5, 6, 2] [1, 3, 4, 5, 6, 7, 8, 9, 2] [[3], [4], [5], [6], [7], [8], [9], [2], [0]]
90 | # print(encode_input[0], decode_input[0], decode_output[0])
91 | 
92 | # Build & fit model
93 | model = get_model(
94 |     token_num=max(len(source_token_dict), len(target_token_dict)),
95 |     embed_dim=256,
96 |     encoder_num=1,
97 |     decoder_num=1,
98 |     head_num=1,
99 |     hidden_dim=128,
100 |     dropout_rate=0.05,
101 |     use_same_embed=False,  # use separate embeddings for the source and target vocabularies
102 | )
103 | 
104 | model.compile('adam', 'sparse_categorical_crossentropy')
105 | model.summary()
106 | 
107 | model.fit(
108 |     x=[np.array(encode_input)[:-10000], np.array(decode_input)[:-10000]],
109 |     y=np.array(decode_output)[:-10000],
110 |     epochs=20,
111 |     batch_size=256,
112 |     shuffle=True,
113 |     verbose=2
114 | )
115 | 
116 | model.save_weights(dataset_file.split('.')[0] + '.h5')
117 | 
118 | # Load
119 | model.load_weights(dataset_file.split('.')[0] + '.h5')
120 | 
121 | # greedy result print & input output comparison
122 | # decoded = decode(
123 | #     model,
124 | #     encode_input[:1024],
125 | #     start_token=target_token_dict['<START>'],
126 | #     end_token=target_token_dict['<END>'],
127 | #     pad_token=target_token_dict['<PAD>'],
128 | #     top_k=1
129 | # )
130 | # print([target_token_dict_inv[x] for x in decode_input[0]], [target_token_dict_inv[x] for x in decoded[0]])
131 | # print([target_token_dict_inv[x] for x in decode_input[1]], [target_token_dict_inv[x] for x in decoded[1]])
132 | 
133 | # beam search
134 | batch_size = 2048
135 | beam_size = 100
136 | # use 20 hot items since rl4rs has only 200+ items
137 | hot_beam_size = 20 if 'rl4rs' in dataset_file else beam_size
138 | # cikm2016 has only 60853 items
139 | candidates_size = 6000 if 'cikm2016' in dataset_file else hot_beam_size
140 | random.seed(1)
141 | encode_input = random.sample(encode_input[-10000:], batch_size)
142 | output_greedy, greedy_score = beam_search(model, encode_input, beam_size=1, target_len=target_len)
143 | output_topk, beam_score = beam_search(model, encode_input, beam_size=beam_size, target_len=target_len)
144 | # np.savez(dataset_file.split('.')[0]+'.npz', output_topk=output_topk, beam_score=beam_score)
145 | # npzdata = np.load(dataset_file.split('.')[0] + '.npz')
146 | # output_topk = npzdata['output_topk']
147 | # beam_score = npzdata['beam_score']
148 | 
149 | output_topk_5, beam_score_5 = 
output_topk[:, :int(beam_size * 0.05)], beam_score[:, :int(beam_size * 0.05)] 150 | output_topk_20, beam_score_20 = output_topk[:, :int(beam_size * 0.2)], beam_score[:, :int(beam_size * 0.2)] 151 | output_topk_hot, beam_score_hot = beam_search(model, encode_input, beam_size=hot_beam_size, target_len=target_len, use_candidates=True, candidates_size=candidates_size) 152 | output_topk_hot5, beam_score_hot5 = output_topk_hot[:, :int(beam_size * 0.05)], beam_score_hot[:, :int(beam_size * 0.05)] 153 | output_topk_hot20, beam_score_hot20 = output_topk_hot[:, :int(beam_size * 0.2)], beam_score_hot[:, :int(beam_size * 0.2)] 154 | 155 | greedy_score = np.nanmean(greedy_score, axis=1) 156 | top_5_percent_score = np.nanmean(beam_score_5, axis=1) 157 | top_20_percent_score = np.nanmean(beam_score_20, axis=1) 158 | hot_5_percent_score = np.nanmean(beam_score_hot5, axis=1) 159 | hot_20_percent_score = np.nanmean(beam_score_hot20, axis=1) 160 | 161 | print('experiment II results') 162 | print('top_5_percent_score top_20_percent_score greedy_score hot_5_percent_score hot_20_percent_score') 163 | print(1, 164 | np.nanmean(top_20_percent_score / top_5_percent_score), 165 | np.nanmean(greedy_score / top_5_percent_score), 166 | np.nanmean(hot_5_percent_score / top_5_percent_score), 167 | np.nanmean(hot_20_percent_score / top_5_percent_score)) 168 | 169 | print('experiment I start') 170 | tmp = [] 171 | for j in range(int(beam_size)): 172 | batch_outputs = output_topk[:, j] 173 | probs = [] 174 | for i in range(5): 175 | prob = token_probs(model, encode_input, batch_outputs[:, :i + 1])[list(range(batch_size)), output_topk[:, j, i + 1]] 176 | probs.append(prob) 177 | tmp.append(probs) 178 | probs = np.array(tmp).swapaxes(0, 2).swapaxes(1, 2) 179 | metrics = [] 180 | for j in range(batch_size): 181 | prob = probs[j] 182 | prob_sum = np.sum(prob, axis=1) 183 | seq_score = np.multiply.reduce(np.array(prob), axis=1) 184 | for i in range(5): 185 | metrics.append((np.corrcoef(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0][1], 186 | spearmanr(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0])) 187 | metrics = np.array(metrics).reshape((batch_size, 5, 2)) 188 | metrics = np.nan_to_num(metrics, nan=1.0) 189 | print('experiment I results') 190 | print('corrcoef', ' ', 'spearman') 191 | print(np.nanmean(metrics, axis=0)) -------------------------------------------------------------------------------- /rl4rs/env/seqslate.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | from operator import add 3 | from copy import deepcopy as copy 4 | import numpy as np 5 | from rl4rs.env.slate import SlateState, SlateRecEnv 6 | 7 | 8 | class SeqSlateState(SlateState): 9 | def __init__(self, config, records): 10 | super().__init__(config, records) 11 | self.page_items = config.get("page_items", 9) 12 | 13 | @property 14 | def state(self): 15 | if self.config.get("support_rllib_mask", False): 16 | location_mask = self.get_location_mask(self.location_mask, self.cur_steps % self.page_items // 3) 17 | return {"state": self._state, "action_mask": self.action_mask & location_mask & self.special_mask} 18 | elif self.config.get("support_d3rl_mask", False): 19 | cur_steps = np.full((self.batch_size, 1), self.cur_steps) 20 | page_init = self.cur_steps // self.page_items * self.page_items 21 | page_end = min(page_init + self.page_items - 1, self.max_steps - 1) 22 | masked_actions = self.prev_actions[:, page_end + 1 - 
self.page_items:page_end + 1]
23 | return {"state": self._state, "masked_actions": masked_actions, "cur_steps": cur_steps}
24 | else:
25 | return self._state
26 | 
27 | def get_complete_states(self):
28 | states = []
29 | for j in range(self.cur_steps):
30 | tmp = copy(self._init_state)
31 | for state, action, i in zip(self._init_state, self.prev_actions[:, j], range(len(self._init_state))):
32 | page_init = j // self.page_items * self.page_items
33 | page_end = page_init + self.page_items - 1
34 | sequence_id = j // self.page_items + 1
35 | # seq
36 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
37 | tmp[i][1] = [tmp[i][1][0], prev_expose]
38 | # dense
39 | prev_item_feat = [
40 | self.item_info_d[str(x)]['item_vec']
41 | for x in self.prev_actions[i, page_init:page_end + 1]
42 | ]
43 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
44 | prev_item_feat = np.array(prev_item_feat).flatten()
45 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
46 | # category
47 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
48 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
49 | states.append(tmp)
50 | return states
51 | 
52 | def get_violation(self):
53 | tmp = np.ones((self.batch_size,), dtype=int)
54 | for step in range(self.cur_steps):
55 | location_mask = self.location_mask[step % self.page_items // 3]
56 | tmp = tmp & location_mask[self.prev_actions[:, step]]
57 | for step in range(max(self.cur_steps - 1, 1)):
58 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 1])
59 | tmp = tmp & duplicate_mask
60 | for step in range(max(self.cur_steps - 2, 1)):
61 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 2])
62 | tmp = tmp & duplicate_mask
63 | for i in range(self.batch_size):
64 | cur_page = self.cur_steps // self.page_items
65 | for j in range(cur_page+1):
66 | actions = self.prev_actions[i][self.page_items*j:self.page_items*(j+1)]
67 | if len(np.intersect1d(actions, self.special_items)) > 1:
68 | tmp[i] = 0
69 | return tmp
70 | 
71 | @property
72 | def offline_reward(self):
73 | cur_step = self.cur_steps
74 | if cur_step % self.page_items != 0:
75 | reward = [0, ] * self.batch_size
76 | else:
77 | action = np.array([list(map(int, x.split('@')[3].split(',')[:cur_step]))
78 | for x in self.records])
79 | price = self.get_price(action)[:, -self.page_items:]
80 | slate_label = np.array([
81 | list(map(int, x.split('@')[4].split(',')))
82 | for x in self.records
83 | ])
84 | slate_label = slate_label[:, cur_step - self.page_items:cur_step]
85 | reward = np.sum(price * slate_label, axis=1)
86 | return reward
87 | 
88 | # @property
89 | # def info(self):
90 | # return [{}]*self.batch_size
91 | 
92 | def act(self, actions):
93 | if self.config.get("support_conti_env", False):
94 | location_mask = self.get_location_mask(self.location_mask,
95 | self.cur_steps % self.page_items // 3)
96 | action_mask = self.action_mask & location_mask & self.special_mask
97 | actions = self.get_nearest_neighbor_with_mask(actions, self.action_emb, action_mask)
98 | self.prev_actions[:, self.cur_steps] = actions
99 | self.action_mask[list(range(self.batch_size)), actions] = 0
100 | for i in range(self.batch_size):
101 | if len(np.intersect1d(self.prev_actions[i], self.special_items)) > 0:
102 | self.special_mask[i][self.special_items] = 0
103 | tmp = copy(self._init_state)
104 | for state, action, i in zip(self._state, actions, range(self.batch_size)):
105 | page_init = self.cur_steps 
// self.page_items * self.page_items
106 | page_end = page_init + self.page_items - 1
107 | sequence_id = self.cur_steps // self.page_items + 1
108 | # seq
109 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
110 | tmp[i][1] = [tmp[i][1][0], prev_expose]
111 | # dense
112 | prev_item_feat = [
113 | self.item_info_d[str(x)]['item_vec']
114 | for x in self.prev_actions[i, page_init:page_end + 1]
115 | ]
116 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
117 | prev_item_feat = np.array(prev_item_feat).flatten()
118 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
119 | # category
120 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
121 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
122 | self._state = tmp
123 | self.cur_steps += 1
124 | if self.cur_steps % self.page_items == 0:
125 | self.action_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
126 | self.special_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
127 | 
128 | 
129 | class SeqSlateRecEnv(SlateRecEnv):
130 | """ Implements the sequential (multi-page) slate recommendation simulator"""
131 | 
132 | def __init__(self, config, state_cls):
133 | super().__init__(config, state_cls)
134 | self.page_items = config.get("page_items", 9)
135 | 
136 | def forward(self, model, samples):
137 | step = samples.cur_steps
138 | if step % self.page_items == 0:
139 | # state = samples.state
140 | prev_actions = samples.prev_actions[:, :step]
141 | # shapes = prev_actions.shape
142 | complete_states = np.array(samples.get_complete_states())
143 | complete_states = complete_states[-self.page_items:]
144 | complete_states = complete_states \
145 | .swapaxes(0, 1) \
146 | .reshape((self.batch_size * self.page_items, 6))
147 | price = samples.get_price(prev_actions)[:, -self.page_items:]
148 | feat, _ = self.FeatureUtil.feature_extraction(complete_states)
149 | with self.sess.as_default():
150 | with self.graph.as_default():
151 | res = self.reward_layer(feat)
152 | probs = np.array(res)[:, 1].reshape((self.batch_size, self.page_items))
153 | reward = np.sum(price * probs, axis=1)
154 | if self.config.get("support_rllib_mask", False) or \
155 | self.config.get("support_d3rl_mask", False):
156 | violation = samples.get_violation()
157 | reward[violation < 0.5] = 0
158 | else:
159 | reward = np.array([0, ] * self.batch_size)
160 | return reward.tolist()
161 | 
--------------------------------------------------------------------------------
/script/mdpchecker/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import pandasql as ps
4 | 
5 | dataset_file = sys.argv[1]
6 | dataset_dir = sys.argv[2]
7 | 
8 | pysqldf = lambda q: ps.sqldf(q, globals())
9 | 
10 | # lastfm-1K
11 | if 'lastfm' in dataset_file:
12 | df = pd.read_csv(dataset_dir + '/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv'
13 | ,names=["userid", "timestamp", "artid", "artname", "traid", "traname"]
14 | ,sep='\t')
15 | 
16 | sql0 = """
17 | select userid as sessionid, min(timestamp) as timestamp, artid
18 | from
19 | df a
20 | group by userid, artid, substr(timestamp,1,12)
21 | """
22 | 
23 | sql1 = """
24 | select a.sessionid, a.timestamp, b.item
25 | from
26 | df a
27 | join
28 | (select artid, ROW_NUMBER() OVER(ORDER BY artid) AS item
29 | from (
30 | select artid
31 | from
32 | df a
33 | group by artid
34 | having count(*)>=30
35 | )aa
36 | )b
37 | on a.artid=b.artid
38 | 
""" 39 | 40 | sql2 = """ 41 | select sessionid, group_concat(item) as items 42 | from( 43 | select * 44 | from 45 | df2 46 | order by timestamp asc 47 | )a 48 | group by sessionid 49 | 50 | """ 51 | 52 | df = pysqldf(sql0) 53 | 54 | df2 = pysqldf(sql1) 55 | 56 | df3 = pysqldf(sql2) 57 | 58 | print('items num.', df2['item'].value_counts().count()) 59 | print('max item id', df2['item'].max()) 60 | print('sessionid num.', df2['sessionid'].value_counts().count()) 61 | 62 | df3.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8') 63 | 64 | if 'cikm2016' in dataset_file: 65 | # queryId;sessionId;userId;timeframe;duration;eventdate;searchstring.tokens;categoryId;items;is.test 66 | queries_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-queries.csv',sep=';') 67 | # queryId;timeframe;itemId 68 | click_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-clicks.csv',sep=';') 69 | # sessionId;userId;itemId;timeframe;eventdate 70 | pv_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-item-views.csv',sep=';') 71 | 72 | # sql0 = """ 73 | # select a.sessionId as sessionid, min(b.timeframe) as timestamp, b.itemId, a.items as pv_items 74 | # from 75 | # queries_df a 76 | # join click_df b 77 | # on a.queryId = b.queryId 78 | # group by b.queryId, b.itemId, cast(b.timeframe/1000 as int) 79 | # """ 80 | 81 | df_click_sql = """ 82 | select a.sessionId as sessionid, min(cast(b.timeframe as int)) as timestamp, b.itemId as item 83 | from 84 | queries_df a 85 | join click_df b 86 | on a.queryId = b.queryId 87 | join (select sessionId from pv_df group by sessionId)c 88 | on a.sessionId = c.sessionId 89 | group by a.sessionId, b.itemId, cast(b.timeframe/1000 as int) 90 | """ 91 | 92 | df_pv_sql = """ 93 | select a.sessionId as sessionid, min(cast(c.timeframe as int)) as timestamp, c.itemId as item 94 | from 95 | queries_df a 96 | join (select queryId from click_df group by queryId) b 97 | on a.queryId = b.queryId 98 | join pv_df c 99 | on a.sessionId = c.sessionId 100 | group by a.sessionId, c.itemId, cast(c.timeframe/1000 as int) 101 | """ 102 | 103 | df_sql = """ 104 | select aa.sessionid, group_concat(c.item|| ':' ||c.timestamp) as pv_items, aa.click_items 105 | from 106 | ( 107 | select a.sessionid,a.timestamp,a.item,group_concat(b.item|| ':' ||b.timestamp) as click_items from 108 | df_click a 109 | join df_click b 110 | on a.sessionid=b.sessionid and a.timestamp<=b.timestamp 111 | group by a.sessionid,a.item 112 | )aa 113 | 114 | join df_pv c 115 | on aa.sessionid=c.sessionid and aa.timestamp>c.timestamp 116 | group by aa.sessionid,aa.click_items 117 | """ 118 | 119 | df_click = pysqldf(df_click_sql) 120 | df_pv = pysqldf(df_pv_sql) 121 | df = pysqldf(df_sql) 122 | 123 | tmp = [] 124 | items = set() 125 | for x in df.values: 126 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5: 127 | [items.add(x.split(':')[0]) for x in x[1].split(',')] 128 | [items.add(x.split(':')[0]) for x in x[2].split(',')] 129 | 130 | # item2id=dict([(x,str(i)) for i,x in enumerate(items)]) 131 | # item2id_fn = lambda x:item2id[x] 132 | 133 | for x in df.values: 134 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5: 135 | pv_items = x[1].split(',') 136 | sorted_pv_items = sorted(pv_items, key=lambda x:int(x.split(':')[1]))[-5:] 137 | sorted_pv_items = [x.split(':')[0] for x in sorted_pv_items] 138 | click_items = x[2].split(',') 139 | sorted_click_items = sorted(click_items, key=lambda x:int(x.split(':')[1]))[:5] 140 | sorted_click_items = 
[x.split(':')[0] for x in sorted_click_items] 141 | tmp.append([x[0], ','.join(sorted_pv_items), ','.join(sorted_click_items)]) 142 | 143 | print('items num.', len(items)) 144 | print('max item id', len(items)-1) 145 | print('sessionid num.', len(tmp)) 146 | 147 | with open(dataset_dir + '/' + dataset_file + '.csv', 'w') as f: 148 | f.write('sessionid items'+'\n') 149 | f.write('\n'.join([str(x[0])+' '+x[1]+','+x[2] for x in tmp])) 150 | 151 | 152 | # recsys15-click 153 | if 'recsys15' in dataset_file: 154 | df = pd.read_csv(dataset_dir + '/yoochoose-clicks.dat', names=["sessionid", "timestamp", "item", "Category"]) 155 | 156 | sql0 = """ 157 | select sessionid, min(timestamp) as timestamp, item, Category 158 | from 159 | df a 160 | group by sessionid, item, substr(timestamp,1,12) 161 | """ 162 | 163 | sql1 = """ 164 | select a.* 165 | from 166 | df a 167 | join 168 | (SELECT item FROM df group by item having count(*)>=1000)b 169 | on a.item=b.item 170 | """ 171 | 172 | sql2 = """ 173 | select a.* 174 | from 175 | df2 a 176 | join 177 | (SELECT sessionid FROM df2 group by sessionid having count(*)>=13)b 178 | on a.sessionid=b.sessionid 179 | """ 180 | 181 | sql3 = """ 182 | select sessionid, group_concat(item) as items 183 | from 184 | df3 185 | group by sessionid 186 | order by timestamp asc 187 | """ 188 | 189 | df = pysqldf(sql0) 190 | 191 | df2 = pysqldf(sql1) 192 | 193 | df3 = pysqldf(sql2) 194 | 195 | df4 = pysqldf(sql3) 196 | 197 | print('items num.', df3['item'].value_counts().count()) 198 | print('max item id', df3['item'].max()) 199 | print('sessionid num.', df3['sessionid'].value_counts().count()) 200 | 201 | df4.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8') 202 | 203 | if 'movielens' in dataset_file: 204 | # movielens-25m 205 | df = pd.read_csv(dataset_dir + '/ml-25m/ratings.csv') 206 | # userId,movieId,rating,timestamp 207 | sql0 = """ 208 | select * 209 | from 210 | df a 211 | where rating>=3 212 | """ 213 | 214 | sql1 = """ 215 | select a.* 216 | from 217 | df a 218 | join 219 | (SELECT movieId FROM df group by movieId having count(*)>=1000)b 220 | on a.movieId=b.movieId 221 | """ 222 | 223 | sql2 = """ 224 | select a.* 225 | from 226 | df2 a 227 | join 228 | (SELECT userId FROM df2 group by userId having count(*)>=30 and count(*)<=100)b 229 | on a.userId=b.userId 230 | """ 231 | 232 | sql3 = """ 233 | select userId as sessionid, group_concat(movieId) as items 234 | from 235 | df3 236 | group by userId 237 | order by timestamp asc 238 | """ 239 | df = pysqldf(sql0) 240 | 241 | df2 = pysqldf(sql1) 242 | 243 | df3 = pysqldf(sql2) 244 | 245 | df4 = pysqldf(sql3) 246 | 247 | print('items num.', df3['movieId'].value_counts().count()) 248 | print('max item id', df3['movieId'].max()) 249 | print('sessionid num.', df3['userId'].value_counts().count()) 250 | 251 | df4.to_csv(dataset_dir + '/movielens.csv', sep=' ', header=True, index=False, encoding='utf-8') 252 | 253 | if 'rl4rs' in dataset_file: 254 | # RL4RS 255 | data = open(dataset_dir + '/rl4rs_dataset_a.csv', 'r').read().split('\n')[:-1] 256 | tmp = ['sessionid items'] 257 | for x in data: 258 | session_id = x.split('@')[1] 259 | sequence_id = list(map(int, x.split('@')[5].split(','))) 260 | items = list(map(int, x.split('@')[3].split(','))) 261 | if len(sequence_id) >= 16: 262 | tmp.append(session_id + ' ' + ','.join(list(map(str, sequence_id[-16:] + items[:5])))) 263 | 264 | print('items num.', 283) 265 | print('max item id', 283) 266 | print('sessionid num.', 
len(tmp))
267 | 
268 | with open(dataset_dir + '/rl4rs.csv', 'w') as f:
269 | f.write('\n'.join(tmp))
270 | 
--------------------------------------------------------------------------------
/rl4rs/env/base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from copy import deepcopy as copy
4 | import tensorflow as tf
5 | from abc import ABC, abstractmethod
6 | import os
7 | 
8 | 
9 | def single_elem_support(func):
10 | """Decorator that unwraps batched results when the batch has a single element."""
11 | type_list = (type([]), type(()), type(np.array(1)))
12 | 
13 | def wrapper(*args, **kwargs):
14 | """wrapper func"""
15 | res = func(*args, **kwargs)
16 | if type(res) in type_list and len(res) == 1:
17 | return res[0]
18 | elif type(res[0]) in type_list and len(res[0]) == 1:
19 | return [x[0] for x in res]
20 | else:
21 | return res
22 | 
23 | return wrapper
24 | 
25 | 
26 | class RecState(ABC):
27 | def __init__(self, config, records):
28 | self.config = config
29 | self.records = records
30 | self._init_state = self.records_to_state(records)
31 | self._state = copy(self._init_state)
32 | 
33 | @staticmethod
34 | def records_to_state(records):
35 | pass
36 | 
37 | @property
38 | def state(self):
39 | return self._state
40 | 
41 | @property
42 | @abstractmethod
43 | def user(self):
44 | pass
45 | 
46 | @property
47 | @abstractmethod
48 | def info(self):
49 | pass
50 | 
51 | @abstractmethod
52 | def act(self, actions):
53 | pass
54 | 
55 | @abstractmethod
56 | def to_string(self):
57 | pass
58 | 
59 | 
60 | class RecDataBase(object):
61 | '''
62 | file-based implementation of a RecommendEnv's data source.
63 | 
64 | Pulls data from file, preps for use by RecommendEnv and then
65 | acts as data provider for each new episode.
66 | '''
67 | 
68 | def __init__(self, config, state_cls):
69 | self.config = config
70 | self.sample_list = []
71 | self.state_cls = state_cls
72 | self.is_eval = config.get('is_eval', False)
73 | self.cache_size = config.get('cache_size', 2048)
74 | # sample file cache
75 | self.fp = open(config['sample_file'], 'r')
76 | # self.fp.readline()
77 | 
78 | @staticmethod
79 | def seed(seed):
80 | np.random.seed(seed)
81 | 
82 | def sample_cache(self, f, num):
83 | for i in range(num):
84 | tmp = f.readline().rstrip()
85 | if len(tmp) < 1:
86 | f.seek(0, 0)
87 | f.readline()
88 | self.sample_list.append(f.readline().rstrip())
89 | else:
90 | self.sample_list.append(tmp)
91 | 
92 | def sample(self, batch_size):
93 | if self.is_eval:
94 | assert self.cache_size == batch_size
95 | assert len(self.sample_list) == batch_size
96 | records = self.sample_list[:batch_size]
97 | else:
98 | records = np.random.choice(self.sample_list, batch_size)
99 | samples = self.state_cls(self.config, records)
100 | return samples
101 | 
102 | def reset(self, reset_file=False):
103 | # self.state_list = []
104 | self.sample_list = []
105 | # self.rawstate_cache(self.fs, 10000)
106 | if reset_file:
107 | self.fp.seek(0, 0)
108 | self.sample_cache(self.fp, self.cache_size)
109 | 
110 | 
111 | class RecSimBase(ABC):
112 | """ Implementation of the core recommendation simulator"""
113 | 
114 | def __init__(self, config, state_cls):
115 | self.config = config
116 | self.max_steps = config['max_steps']
117 | self.batch_size = config['batch_size']
118 | model_file = config['model_file']
119 | self.graph = tf.Graph()
120 | with self.graph.as_default():
121 | self.model = self.get_model(config)
122 | if not self.config.get('gpu', False):
123 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
124 | self.sess = tf.Session(graph=self.graph, 
125 | config=tf.ConfigProto(device_count={"CPU": 4})) 126 | else: 127 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 128 | self.sess = tf.Session(graph=self.graph) 129 | self.saver = tf.train.Saver() 130 | self.reload_model(model_file) 131 | self._recData = RecDataBase(config, state_cls) 132 | 133 | def reset(self, reset_file=False): 134 | self._recData.reset(reset_file) 135 | 136 | @abstractmethod 137 | def get_model(self, config): 138 | pass 139 | 140 | @abstractmethod 141 | def obs_fn(self, state): 142 | pass 143 | 144 | @abstractmethod 145 | def forward(self, model, samples): 146 | pass 147 | 148 | def reload_model(self, model_file): 149 | with self.sess.as_default(): 150 | with self.graph.as_default(): 151 | self.saver.restore(self.sess, model_file) 152 | 153 | def seed(self, sd=0): 154 | self._recData.seed(sd) 155 | np.random.seed(sd) 156 | 157 | def _step(self, samples, action, **kwargs): 158 | step = kwargs['step'] 159 | samples.act(action) 160 | next_state = samples.state 161 | next_obs = self.obs_fn(next_state) 162 | reward = self.forward(self.model, samples) 163 | next_info = samples.info 164 | 165 | if step < self.max_steps - 1: 166 | done = [0] * self.batch_size 167 | else: 168 | done = [1] * self.batch_size 169 | 170 | return next_obs, reward, done, next_info 171 | 172 | def sample(self, batch_size): 173 | samples = self._recData.sample(batch_size) 174 | obs = self.obs_fn(samples.state) 175 | return samples, obs 176 | 177 | 178 | class RecEnvBase(gym.Env): 179 | metadata = {'render.modes': ['human']} 180 | 181 | def __init__(self, recsim: RecSimBase): 182 | self.config = recsim.config 183 | self.batch_size = self.config['batch_size'] 184 | self.cur_step = 0 185 | self.sim = recsim 186 | self.sim.reset() 187 | self.samples, self.obs = self.sim.sample(self.batch_size) 188 | if self.config.get("rawstate_as_obs", False): 189 | category_size = len(self.obs[0]['category_feature']) 190 | dense_size = len(self.obs[0]['dense_feature']) 191 | sequence_size = np.array(self.obs[0]['sequence_feature']).shape 192 | features = { 193 | "category_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(category_size,)), 194 | "dense_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(dense_size,)), 195 | "sequence_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=sequence_size), 196 | } 197 | if self.config.get("support_rllib_mask", False): 198 | action_feature_size = len(self.obs[0]['action_mask']) 199 | self.observation_space = gym.spaces.Dict({ 200 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 201 | **features 202 | }) 203 | else: 204 | self.observation_space = gym.spaces.Dict(features) 205 | else: 206 | if self.config.get("support_rllib_mask", False): 207 | action_feature_size = len(self.obs[0]['action_mask']) 208 | self.observation_space = gym.spaces.Dict({ 209 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 210 | "obs": gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]["obs"]),)) 211 | }) 212 | else: 213 | self.observation_space = gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]),)) 214 | if self.config.get("support_conti_env", False): 215 | self.action_space = gym.spaces.Box(-1, 1, shape=(self.config['action_emb_size'],)) 216 | else: 217 | self.action_space = gym.spaces.Discrete(self.config['action_size']) 218 | # if self.config.get("support_rllib_mask", False): 219 | # action_feature_size = len(self.obs[0]['action_mask']) 220 | # # avail_actions_size = len(self.obs[0]['avail_actions'][0]) 221 | # # 
self.action_space = gym.spaces.Discrete(self.config['action_size']) 222 | # self.observation_space = gym.spaces.Dict({ 223 | # "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)), 224 | # "obs": self.observation_space, 225 | # }) 226 | # elif self.config.get("support_d3rl_mask", False): 227 | # self.action_space = gym.spaces.Discrete(self.config['action_size']) 228 | # else: 229 | # self.action_space = gym.spaces.Discrete(self.config['action_size']) 230 | self.reset() 231 | 232 | def seed(self, sd=0): 233 | self.sim.seed(sd) 234 | np.random.seed(sd) 235 | 236 | @property 237 | @single_elem_support 238 | def state(self): 239 | return self.obs 240 | 241 | @property 242 | @single_elem_support 243 | def user_id(self): 244 | return self.samples.user 245 | 246 | @property 247 | @single_elem_support 248 | def offline_action(self): 249 | return self.samples.offline_action 250 | 251 | @property 252 | @single_elem_support 253 | def offline_reward(self): 254 | return self.samples.offline_reward 255 | 256 | @single_elem_support 257 | def step(self, action): 258 | if not isinstance(action, (list, np.ndarray)): 259 | action = [action] 260 | obs, reward, done, info = \ 261 | self.sim._step(self.samples, action, step=self.cur_step) 262 | self.cur_step += 1 263 | return obs, reward, done, info 264 | 265 | def reset(self, reset_file=False): 266 | self.cur_step = 0 267 | self.sim.reset(reset_file) 268 | self.samples, self.obs = self.sim.sample(self.batch_size) 269 | return self.state 270 | 271 | def render(self, mode='human', close=False): 272 | print('Current State:', '\n') 273 | print(self.samples.to_string()) 274 | -------------------------------------------------------------------------------- /rl4rs/nets/exact_k/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | 4 | from .layers import * 5 | from .modules import * 6 | from .utils import * 7 | 8 | 9 | class Generator: 10 | def __init__(self, 11 | l1_mask, 12 | l2_mask, 13 | l3_mask, 14 | l0_ssr_mask, 15 | is_training=True, 16 | lr=0.001, 17 | temperature=1, 18 | train_sample='random', 19 | predict_sample='random', 20 | seq_length=500, 21 | res_length=9, 22 | hidden_units=64, 23 | dropout_rate=0.1, 24 | num_heads=4, 25 | num_layers=1, 26 | num_glimpse=1, 27 | num_blocks=2, 28 | use_mha=True, 29 | beam_size=3 30 | ): 31 | 32 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user') # 779 33 | 34 | self.batch_size = tf.shape(self.user)[0] 35 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand') 36 | 37 | self.decode_target_ids = tf.placeholder(dtype=tf.int32, shape=[None, res_length], name="decoder_target_ids") # [batch_size, res_length] 38 | self.reward = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size] 39 | 40 | # Encoder 41 | with tf.variable_scope("encoder"): 42 | # region emb 43 | self.enc_user = tf.layers.dense(self.user, hidden_units, activation=tf.nn.relu) # (N, T_q, C) 44 | # enc_item = [batch_size, seq_len, hidden_units] 45 | self.enc_item = embedding(self.item_cand, 46 | vocab_size=500, 47 | num_units=hidden_units, 48 | zero_pad=False, 49 | scale=True, 50 | scope='enc_item_embed', 51 | # reuse=not is_training, 52 | reuse=False 53 | ) 54 | self.enc = tf.concat([tf.stack(seq_length * [self.enc_user], axis=1), self.enc_item], axis=2) 55 | # endregion 56 | # region Dropout 57 | self.enc = tf.layers.dropout(self.enc, 58 | rate=dropout_rate, 59 | 
training=tf.convert_to_tensor(is_training)) 60 | # endregion 61 | # region squence 62 | if use_mha: 63 | ## Blocks 64 | for i in range(num_blocks): 65 | with tf.variable_scope("num_blocks_{}".format(i)): 66 | ### Multihead Attention 67 | self.enc = multihead_attention(queries=self.enc, 68 | keys=self.enc, 69 | num_units=hidden_units * 2, 70 | num_heads=num_heads, 71 | dropout_rate=dropout_rate, 72 | is_training=is_training, 73 | causality=False) 74 | 75 | ### Feed Forward 76 | self.enc = feedforward(self.enc, num_units=[4 * hidden_units, hidden_units * 2]) 77 | else: 78 | cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_units * 2) 79 | outputs, _ = tf.nn.dynamic_rnn(cell=cell, inputs=self.enc, dtype=tf.float32) 80 | self.enc = outputs 81 | # endregion 82 | 83 | # Decoder 84 | with tf.variable_scope("decoder"): 85 | dec_cell = LSTMCell(hidden_units * 2) 86 | 87 | if num_layers > 1: 88 | cells = [dec_cell] * num_layers 89 | dec_cell = MultiRNNCell(cells) 90 | # ptr sampling 91 | enc_init_state = trainable_initial_state(self.batch_size, dec_cell.state_size) 92 | 93 | custom_logits, custom_path, _ = ptn_rnn_decoder( 94 | dec_cell, None, 95 | self.enc, enc_init_state, 96 | seq_length, res_length, hidden_units * 2, 97 | num_glimpse, self.batch_size, 98 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 99 | mode="CUSTOM", reuse=False, beam_size=None, 100 | temperature=temperature, 101 | train_sample=train_sample, predict_sample=predict_sample 102 | ) 103 | # logits: [batch_size, res_length, seq_length] 104 | self.custom_logits = tf.identity(custom_logits, name="custom_logits") 105 | # sample_path: [batch_size, res_length] 106 | self.custom_path = tf.identity(custom_path, name="custom_path") 107 | self.custom_result = batch_gather(self.item_cand, self.custom_path) 108 | sampled_logits, sampled_path, _ = ptn_rnn_decoder( 109 | dec_cell, None, 110 | self.enc, enc_init_state, 111 | seq_length, res_length, hidden_units * 2, 112 | num_glimpse, self.batch_size, 113 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 114 | mode="SAMPLE", reuse=True, beam_size=None, 115 | temperature=temperature, 116 | train_sample=train_sample, predict_sample=predict_sample 117 | ) 118 | # logits: [batch_size, res_length, seq_length] 119 | self.sampled_logits = tf.identity(sampled_logits, name="sampled_logits") 120 | # sample_path: [batch_size, res_length] 121 | self.sampled_path = tf.identity(sampled_path, name="sampled_path") 122 | self.sampled_result = batch_gather(self.item_cand, self.sampled_path) 123 | 124 | # self.decode_target_ids is placeholder 125 | decoder_logits, _ = ptn_rnn_decoder( 126 | dec_cell, self.decode_target_ids, 127 | self.enc, enc_init_state, 128 | seq_length, res_length, hidden_units * 2, 129 | num_glimpse, self.batch_size, 130 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 131 | mode="TRAIN", reuse=True, beam_size=None, 132 | temperature=temperature, 133 | train_sample=train_sample, predict_sample=predict_sample 134 | ) 135 | self.dec_logits = tf.identity(decoder_logits, name="dec_logits") 136 | 137 | _, beam_path, _ = ptn_rnn_decoder( 138 | dec_cell, None, 139 | self.enc, enc_init_state, 140 | seq_length, res_length, hidden_units * 2, 141 | num_glimpse, self.batch_size, 142 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 143 | mode="BEAMSEARCH", reuse=True, beam_size=beam_size, 144 | temperature=temperature, 145 | train_sample=train_sample, predict_sample=predict_sample 146 | ) 147 | self.beam_path = tf.identity(beam_path, name="beam_path") 148 | self.beam_result = batch_gather(self.item_cand, self.beam_path) 149 | 150 | _, 
greedy_path, _ = ptn_rnn_decoder( 151 | dec_cell, None, 152 | self.enc, enc_init_state, 153 | seq_length, res_length, hidden_units * 2, 154 | num_glimpse, self.batch_size, 155 | l1_mask, l2_mask, l3_mask, l0_ssr_mask, 156 | mode="GREEDY", reuse=True, beam_size=None, 157 | temperature=temperature, 158 | train_sample=train_sample, predict_sample=predict_sample 159 | ) 160 | self.greedy_path = tf.identity(greedy_path, name="greedy_path") 161 | self.greedy_result = batch_gather(self.item_cand, self.greedy_path) 162 | 163 | if is_training: 164 | # Loss 165 | # self.y_smoothed = label_smoothing(tf.one_hot(self.decode_target_ids, depth=hp.data_length)) 166 | self.r_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.dec_logits, 167 | labels=self.decode_target_ids) 168 | # reinforcement 169 | self.policy_loss = tf.reduce_mean(tf.reduce_sum(self.r_loss, axis=1) * self.reward) 170 | # supervised loss 171 | self.loss = self.policy_loss 172 | 173 | # Training Scheme 174 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 175 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 176 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 177 | 178 | self.variables = tf.global_variables() 179 | 180 | 181 | class Discriminator: 182 | def __init__(self, lr=0.005, seq_length=500): 183 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user') 184 | self.batch_size = tf.shape(self.user)[0] 185 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand') 186 | 187 | self.reward_target = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size] 188 | 189 | dense0 = self.user 190 | dense1 = tf.layers.dense(dense0, 128, activation=tf.nn.relu) 191 | dense2 = tf.layers.dense(dense1, 128, activation=tf.nn.relu) 192 | dense3 = tf.layers.dense(dense2, 128, activation=tf.nn.relu) 193 | 194 | self.reward = tf.layers.dense(dense3, 1)[:, 0] 195 | 196 | self.td_error = tf.abs(self.reward_target - self.reward) 197 | self.loss = tf.square(self.td_error) 198 | 199 | # Training Scheme 200 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 201 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8) 202 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step) 203 | --------------------------------------------------------------------------------
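
The Generator above is a pointer-network policy and the Discriminator a value-style critic: the policy loss multiplies the per-step cross-entropy of a sampled slate by a scalar reward, so feeding an advantage (reward minus the critic's baseline) gives a REINFORCE-style update. Below is a minimal smoke-test sketch of that coupling. It is not part of the repository: it assumes TF 1.x, the four mask arrays are stand-ins whose real shapes are dictated by ptn_rnn_decoder in the exact_k layers module, and the environment reward is faked with random numbers.

import numpy as np
import tensorflow as tf
from rl4rs.nets.exact_k.model import Generator, Discriminator

seq_length, res_length, batch_size = 500, 9, 4
dummy_mask = np.ones(seq_length, dtype=np.float32)  # assumed mask shape

g = Generator(dummy_mask, dummy_mask, dummy_mask, dummy_mask,
              seq_length=seq_length, res_length=res_length)
d = Discriminator(seq_length=seq_length)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    user = np.random.rand(batch_size, 256).astype(np.float32)
    item_cand = np.random.randint(0, seq_length, size=(batch_size, seq_length))

    # 1. sample a slate (positions into item_cand) from the generator policy
    path = sess.run(g.sampled_path, {g.user: user, g.item_cand: item_cand})
    # 2. the critic predicts a baseline reward from the user vector alone
    baseline = sess.run(d.reward, {d.user: user})
    env_reward = np.random.rand(batch_size)  # stand-in for the simulator reward
    # 3. REINFORCE step on the advantage; squared-error step for the critic
    sess.run(g.train_op, {g.user: user, g.item_cand: item_cand,
                          g.decode_target_ids: path,
                          g.reward: env_reward - baseline})
    sess.run(d.train_op, {d.user: user, d.reward_target: env_reward})

In the repository's actual training script the same ops are driven by simulator rewards rather than random numbers; this sketch only shows how the placeholders and train ops fit together.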
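
For completeness, here is a hedged sketch of how the environment classes earlier in this section compose: SeqSlateRecEnv is the file-backed simulator (a RecSimBase subclass) and RecEnvBase wraps it as a batched gym.Env. Every value in the config below — the two file paths, max_steps, and action_size — is an illustrative assumption, not a documented default; the real values come from the benchmark configuration.

import numpy as np
from rl4rs.env.base import RecEnvBase
from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState

config = {
    'sample_file': '/path/to/rl4rs_dataset_b.csv',  # '@'-separated records (assumed path)
    'model_file': '/path/to/simulator_checkpoint',  # TF checkpoint of the reward model (assumed path)
    'batch_size': 64,
    'max_steps': 18,     # two 9-item pages (assumed)
    'action_size': 284,  # assumed item-vocabulary size
    'page_items': 9,
    'cache_size': 2048,
    'gpu': False,
}

sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
env = RecEnvBase(sim)
env.seed(0)

obs = env.reset()
for _ in range(config['max_steps']):
    # one item id per user in the batch; rewards are non-zero only at
    # page boundaries (see SeqSlateRecEnv.forward above)
    actions = np.random.randint(0, config['action_size'], size=config['batch_size'])
    obs, reward, done, info = env.step(actions)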