├── rl4rs
│   ├── nets
│   │   ├── __init__.py
│   │   ├── cql
│   │   │   ├── __init__.py
│   │   │   ├── q_function.py
│   │   │   └── encoder.py
│   │   ├── exact_k
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   └── model.py
│   │   ├── rllib
│   │   │   ├── __init__.py
│   │   │   ├── rllib_rawstate_model.py
│   │   │   └── rllib_mask_model.py
│   │   ├── dnn.py
│   │   ├── widedeep.py
│   │   ├── lstm_slate.py
│   │   ├── lstm.py
│   │   ├── dien.py
│   │   ├── dnn_slate.py
│   │   ├── widedeep_slate.py
│   │   ├── dien_slate.py
│   │   ├── lstm_slate_multiclass.py
│   │   ├── dnn_slate_multiclass.py
│   │   ├── widedeep_slate_multiclass.py
│   │   ├── dien_slate_multiclass.py
│   │   ├── adversarial_slate.py
│   │   └── utils.py
│   ├── policy
│   │   ├── __init__.py
│   │   ├── behavior_model.py
│   │   └── policy_model.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── httpEnv.py
│   │   └── gymHttpClient.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── fileutil.py
│   │   ├── rllib_print.py
│   │   ├── rllib_vector_env.py
│   │   ├── d3rlpy_scorer.py
│   │   └── offline_policy_metrics.py
│   ├── mdpchecker
│   │   ├── __init__.py
│   │   └── decoder.py
│   ├── env
│   │   ├── __init__.py
│   │   ├── seqslate.py
│   │   └── base.py
│   └── __init__.py
├── assets
│   ├── fuxi.jpg
│   └── new.gif
├── RL4RS_appendix.pdf
├── reproductions
│   ├── run_mdp_checker.sh
│   ├── run_supervised_item.sh
│   ├── run_supervised_slate.sh
│   ├── file_split.sh
│   ├── run_simulator_env_test.sh
│   ├── run_simulator_train.sh
│   ├── run_simulator_eval.sh
│   ├── run_exact_k.sh
│   ├── run_modelfree_rl.sh
│   └── run_split.sh
├── script
│   ├── modelfree_trainer.py
│   ├── supervised_train.py
│   ├── simulator_eval.py
│   ├── test_exact_k.py
│   ├── simulator_env_test.py
│   ├── offline_evaluation.py
│   ├── exact_k_train.py
│   ├── batchrl_train.py
│   ├── data_preprocess.py
│   └── mdpchecker
│       ├── mdp_checker.py
│       └── preprocess.py
├── index.html
└── environment.yml
/rl4rs/nets/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/nets/cql/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/policy/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/server/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/mdpchecker/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/rl4rs/nets/rllib/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/assets/fuxi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/fuxi.jpg
--------------------------------------------------------------------------------
/assets/new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/assets/new.gif
--------------------------------------------------------------------------------
/RL4RS_appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxiAIlab/RL4RS/HEAD/RL4RS_appendix.pdf
--------------------------------------------------------------------------------
/rl4rs/env/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import RecDataBase, RecSimBase, RecEnvBase, RecState
2 |
3 | __all__ = [
4 | "RecDataBase",
5 | "RecSimBase",
6 | "RecEnvBase",
7 | "RecState",
8 | ]
9 |
--------------------------------------------------------------------------------
/rl4rs/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='HttpEnv-v0',
5 | entry_point='rl4rs.server.httpEnv:HttpEnv',
6 | )
7 |
8 | register(
9 | id='SlateRecEnv-v0',
10 | entry_point='rl4rs.env:RecEnvBase',
11 | )
12 |
13 | register(
14 | id='SeqSlateRecEnv-v0',
15 | entry_point='rl4rs.env:RecEnvBase',
16 | )
17 |
--------------------------------------------------------------------------------
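The IDs registered above are consumed via gym.make throughout the scripts. A minimal sketch mirroring script/simulator_eval.py (config abbreviated; the full key set appears in that script):

import gym
from rl4rs.env.slate import SlateRecEnv, SlateState

config = {"batch_size": 2048, "max_steps": 9, "page_items": 9,
          "sample_file": "../dataset/rl4rs_dataset_b3_shuf.csv",
          "model_file": "../output/simulator_b2_dien/model"}
# ... plus the feature-size keys listed in script/simulator_eval.py

sim = SlateRecEnv(config, state_cls=SlateState)
env = gym.make('SlateRecEnv-v0', recsim=sim)  # entry_point rl4rs.env:RecEnvBase
obs = env.reset()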
/reproductions/run_mdp_checker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
8 | script_dir=${rl4rs_benchmark_dir}/script
9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
10 |
11 | dataset=$1
12 |
13 | cd ${script_dir}/mdpchecker
14 | python -u preprocess.py $dataset ${rl4rs_dataset_dir} &&
15 | python -u mdp_checker.py $dataset ${rl4rs_dataset_dir} >> ${rl4rs_output_dir}/data_understanding_tool_${dataset}.log &&
16 | echo "1"
--------------------------------------------------------------------------------
/rl4rs/utils/fileutil.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import numpy as np
5 |
6 |
7 | def find_match_files(pattern, search_path, pathsep=os.pathsep):
8 | for path in search_path.split(pathsep):
9 | for match in glob.glob(os.path.join(path, pattern)):
10 | yield match
11 |
12 |
13 | def find_newest_files(pattern, search_path, pathsep=os.pathsep):
14 | files = []
15 | timestamps = []
16 | for path in search_path.split(pathsep):
17 | for match in glob.glob(os.path.join(path, pattern)):
18 | files.append(match)
19 | timestamps.append(float(os.path.getctime(match)))
20 | if len(files) > 0:
21 | return files[np.argmax(timestamps)]
22 | else:
23 | return ''
24 |
--------------------------------------------------------------------------------
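Usage sketch for the helpers above (the directories are placeholders; note that search_path is an os.pathsep-separated list, matching both signatures):

import os
from rl4rs.utils.fileutil import find_match_files, find_newest_files

search_path = os.pathsep.join(['./output', './dataset'])
matches = list(find_match_files('*.tfrecord', search_path))
newest = find_newest_files('*.tfrecord', search_path)  # returns '' when nothing matches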
/reproductions/run_supervised_item.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | script_dir=${rl4rs_benchmark_dir}/script
8 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset && export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
9 |
10 | algo=$1
11 |
12 | cd ${script_dir}
13 |
14 | # supervised learning evaluation
15 |
16 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord" "${rl4rs_output_dir}/supervised_a_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_item.log &&
17 |
18 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_$algo/model" $algo 0 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_item.log &&
19 |
20 | echo "1"
21 |
22 |
--------------------------------------------------------------------------------
/reproductions/run_supervised_slate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | script_dir=${rl4rs_benchmark_dir}/script
8 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset && export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
9 |
10 | algo=$1
11 |
12 | cd ${script_dir}
13 |
14 | # supervised learning evaluation
15 |
16 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_a_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_a_train_${algo}_slate.log &&
17 |
18 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "${rl4rs_output_dir}/supervised_b2_train_slate_$algo/model" $algo 1 >> ${rl4rs_output_dir}/supervised_b2_train_${algo}_slate.log &&
19 |
20 | echo "1"
21 |
22 |
--------------------------------------------------------------------------------
/rl4rs/utils/rllib_print.py:
--------------------------------------------------------------------------------
1 | import json
2 | import yaml
3 | from ray.tune.utils.util import SafeFallbackEncoder
4 |
5 |
6 | def pretty_print(result):
7 | result = result.copy()
8 | result.update(config=None) # drop config from pretty print
9 | result.update(hist_stats=None) # drop hist_stats from pretty print
10 | out = {}
11 | print_keys = ('episode_reward_mean',
12 | 'episode_reward_min',
13 | 'timesteps_total',
14 | 'training_iteration')
15 | for k, v in result.items():
16 | if v is not None:
17 | if k in print_keys:
18 | out[k] = v
19 | elif k == 'evaluation':
20 | out[k] = {
21 | 'episode_reward_mean': v['episode_reward_mean'],
22 | 'episode_reward_min': v['episode_reward_min'],
23 | }
24 | cleaned = json.dumps(out, cls=SafeFallbackEncoder)
25 | return yaml.safe_dump(json.loads(cleaned), default_flow_style=False)
26 |
--------------------------------------------------------------------------------
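A toy call, assuming a result dict shaped like RLlib's Trainer.train() output; 'config' and 'hist_stats' are nulled out and skipped, and only the whitelisted keys survive:

fake_result = {
    'episode_reward_mean': 10.5,
    'episode_reward_min': 2.0,
    'timesteps_total': 10000,
    'training_iteration': 3,
    'config': {'lr': 1e-3},
    'hist_stats': {'episode_lengths': [9, 9]},
}
print(pretty_print(fake_result))  # YAML summary of the four whitelisted keys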
/reproductions/file_split.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_abs=$(readlink -f "$0")
4 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
5 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
6 | script_dir=${rl4rs_benchmark_dir}/script
7 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
8 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
9 |
10 | file=$1
11 |
12 | cd ${rl4rs_dataset_dir} &&
13 |
14 | awk -F "@" '$2%11<2 {print}' ${file} > ${rl4rs_output_dir}/${file}_0000.csv &&
15 | awk -F "@" '$2%11>=2 && $2%11<4 {print}' ${file} > ${rl4rs_output_dir}/${file}_0001.csv &&
16 | awk -F "@" '$2%11>=4 && $2%11<6 {print}' ${file} > ${rl4rs_output_dir}/${file}_0002.csv &&
17 | awk -F "@" '$2%11>=6 && $2%11<8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0003.csv &&
18 | awk -F "@" '$2%11>=8 {print}' ${file} > ${rl4rs_output_dir}/${file}_0004.csv
19 |
20 | #file_rows=`wc -l ${file}|awk '{print $1}'`
21 | #file_num=5
22 | #file_num_row=$((${file_rows} + 4))
23 | #every_file_row=$((${file_num_row}/${file_num}))
24 | #split -d -a 4 -l ${every_file_row} ${file} --additional-suffix=.csv ${rl4rs_output_dir}/${file}_
25 |
26 |
27 |
--------------------------------------------------------------------------------
/reproductions/run_simulator_env_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
8 | script_dir=${rl4rs_benchmark_dir}/script
9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
10 |
11 | algo=$1
12 |
13 | cd ${script_dir}
14 |
15 | head -1 ${rl4rs_dataset_dir}/rl4rs_dataset_a_train.csv > ${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv
16 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_train_tiny.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_tiny.tfrecord" "tfrecord_item"
17 | python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':False}" &&
18 | python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':False,'rawstate_as_obs':True}" &&
19 | python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':False,'action_emb_size':32}" &&
20 | python -u simulator_env_test.py "{'env':'SlateRecEnv-v0','support_conti_env':True,'rawstate_as_obs':True,'action_emb_size':32}" &&
21 | echo '1'
--------------------------------------------------------------------------------
/script/modelfree_trainer.py:
--------------------------------------------------------------------------------
1 | import ray.rllib.agents.ppo as ppo
2 | import ray.rllib.agents.dqn as dqn
3 | import ray.rllib.agents.a3c as a3c
4 | import ray.rllib.agents.pg as pg
5 | import ray.rllib.agents.ddpg.td3 as td3
6 | import ray.rllib.agents.impala as impala
7 | import ray.rllib.agents.ddpg as ddpg
8 | import ray.rllib.agents.slateq as slateq
9 |
10 |
11 | def get_rl_model(algo, rllib_config):
12 | trainer = None
13 | if algo == "PPO":
14 | trainer = ppo.PPOTrainer(config=rllib_config, env="rllibEnv-v0")
15 | elif algo == "DQN":
16 | trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
17 | elif algo == "RAINBOW":
18 | trainer = dqn.DQNTrainer(config=rllib_config, env="rllibEnv-v0")
19 | elif algo == "A2C":
20 | trainer = a3c.A2CTrainer(config=rllib_config, env="rllibEnv-v0")
21 | elif algo == "A3C":
22 | trainer = a3c.A3CTrainer(config=rllib_config, env="rllibEnv-v0")
23 | elif algo == "PG":
24 | trainer = pg.PGTrainer(config=rllib_config, env="rllibEnv-v0")
25 | elif algo == "DDPG":
26 | trainer = ddpg.DDPGTrainer(config=rllib_config, env="rllibEnv-v0")
27 | elif algo == "TD3":
28 | trainer = td3.TD3Trainer(config=rllib_config, env="rllibEnv-v0")
29 | elif algo == "IMPALA":
30 | trainer = impala.ImpalaTrainer(config=rllib_config, env="rllibEnv-v0")
31 | elif algo == "SLATEQ":
32 | trainer = slateq.SlateQTrainer(config=rllib_config, env="rllibEnv-v0")
33 | else:
34 | assert algo in ("PPO", "DQN", "RAINBOW", "A2C", "A3C", "PG", "DDPG", "TD3", "IMPALA", "SLATEQ")
35 | print('trainer_default_config', trainer._default_config)
36 | return trainer
37 |
--------------------------------------------------------------------------------
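Hypothetical use of the factory above; it assumes ray has been initialized and that "rllibEnv-v0" has been registered with RLlib (the training scripts do both):

import ray

ray.init()
rllib_config = {'framework': 'tf', 'num_workers': 0}  # minimal assumed config
trainer = get_rl_model('PPO', rllib_config)
result = trainer.train()
print(result['episode_reward_mean'])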
/reproductions/run_simulator_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | script_dir=${rl4rs_benchmark_dir}/script
8 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset && export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
9 |
10 | algo=$1
11 |
12 | cd ${script_dir}
13 |
14 | # RL Env Construction
15 |
16 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/simulator_a_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_sl_${algo}.log &&
17 |
18 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord" "${rl4rs_output_dir}/simulator_a_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_rl_${algo}.log &&
19 |
20 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord" "${rl4rs_output_dir}/simulator_a_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_a_${algo}.log &&
21 |
22 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/simulator_b2_sl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_sl_${algo}.log &&
23 |
24 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord" "${rl4rs_output_dir}/simulator_b2_rl_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_rl_${algo}.log &&
25 |
26 | python supervised_train.py "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord" "${rl4rs_output_dir}/simulator_b2_$algo/model" $algo 0 >> ${rl4rs_output_dir}/simulator_b2_${algo}.log &&
27 |
28 | echo "1"
--------------------------------------------------------------------------------
/script/supervised_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import tensorflow as tf
5 | tf.compat.v1.disable_eager_execution()
6 | # must be set before TF touches the GPU (tf.test.is_gpu_available would initialize it first)
7 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
8 | from tensorflow import keras
9 | from rl4rs.utils.datautil import FeatureUtil
10 | from rl4rs.utils.fileutil import find_match_files
11 |
12 | config = {
13 | "epoch": 20,
14 | "maxlen": 64,
15 | "batch_size": 256,
16 | "class_num": 2,
17 | "dense_feature_num": 432,
18 | "category_feature_num": 21,
19 | "category_hash_size": 100000,
20 | "seq_num": 2,
21 | "emb_size": 128,
22 | "hidden_units": 128,
23 | "action_size": 284
24 | }
25 | train_file = sys.argv[1]
26 | test_file = sys.argv[2]
27 | model_file = sys.argv[3]
28 | model_type = sys.argv[4]
29 | is_slate_label = bool(int(sys.argv[5]))
30 | featureutil = FeatureUtil(config)
31 |
32 | train_files = [match for match in find_match_files(train_file + '*', train_file)]
33 | test_files = [match for match in find_match_files(test_file + '*', test_file)]
34 | print('train on ', train_files, ' test on ', test_files)
35 | iter_train = featureutil.read_tfrecord(train_files, is_slate_label=is_slate_label)
36 | iter_test = featureutil.read_tfrecord(test_files, is_slate_label=is_slate_label)
37 | model = __import__("rl4rs.nets." + model_type, fromlist=['get_model']).get_model(config)
38 | steps_per_epoch = 600000 // config["batch_size"]
39 | steps_per_epoch_val = 400000 // config["batch_size"]
40 | earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=2, mode='min')
41 | model.fit(iter_train, steps_per_epoch=steps_per_epoch, epochs=int(config["epoch"]),
42 | validation_data=iter_test, validation_steps=steps_per_epoch_val, verbose=2, callbacks=[earlyStopping])
43 |
44 | saver = tf.train.Saver()
45 | sess = tf.keras.backend.get_session()
46 | saver.save(sess, model_file)
47 |
--------------------------------------------------------------------------------
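For readability, the dynamic import on line 37 is equivalent to this importlib form (a sketch, not part of the script):

import importlib

model_module = importlib.import_module('rl4rs.nets.' + model_type)
model = model_module.get_model(config)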
/rl4rs/nets/dnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(
28 | shape=(9,), dtype='int64', name='slate_label'
29 | )
30 |
31 | category_feature = utils.id_input_processing(category_feature_input, config)
32 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
33 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
34 | all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
35 | all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
36 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
37 | output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)
38 |
39 | model = Model(inputs=[sequence_feature_input,
40 | dense_feature_input,
41 | category_feature_input,
42 | slate_label_input],
43 | outputs=[output])
44 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
45 | return model
46 |
--------------------------------------------------------------------------------
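A minimal smoke test for the factory above, borrowing the feature sizes from script/supervised_train.py's config:

from rl4rs.nets.dnn import get_model

config = {'maxlen': 64, 'dense_feature_num': 432, 'category_feature_num': 21,
          'class_num': 2, 'seq_num': 2, 'emb_size': 128, 'hidden_units': 128,
          'category_hash_size': 100000, 'action_size': 284}
model = get_model(config)
model.summary()  # four inputs, softmax 'simulator_reward' head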
/rl4rs/nets/widedeep.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(
28 | shape=(9,), dtype='int64', name='slate_label'
29 | )
30 |
31 | category_feature = utils.id_input_processing_concat(category_feature_input, config)
32 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
33 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
34 | sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
35 | all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
36 | [sequence_feature_dnn, dense_feature, category_feature]
37 | )
38 | output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(all_feature)
39 |
40 | model = Model(inputs=[sequence_feature_input,
41 | dense_feature_input,
42 | category_feature_input,
43 | slate_label_input],
44 | outputs=[output])
45 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
46 | return model
47 |
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')
16 |
17 | dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')
18 |
19 | category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')
20 |
21 | slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')
22 |
23 | category_feature = utils.id_input_processing_lstm(category_feature_input, config)
24 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
25 | sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
26 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
27 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
28 | output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)
29 |
30 | model = Model(inputs=[sequence_feature_input,
31 | dense_feature_input,
32 | category_feature_input,
33 | slate_label_input],
34 | outputs=[output])
35 | model.compile(loss='binary_crossentropy',
36 | optimizer='adam',
37 | metrics=[tf.keras.metrics.AUC(),
38 | tf.keras.metrics.Precision(),
39 | tf.keras.metrics.Recall()])
40 | return model
41 |
--------------------------------------------------------------------------------
/rl4rs/nets/lstm.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(
28 | shape=(9,), dtype='int64', name='slate_label'
29 | )
30 |
31 | category_feature = utils.id_input_processing_lstm(category_feature_input, config)
32 | # category_feature_concat = utils.id_input_processing_concat(category_feature_input, config)
33 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
34 | sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
35 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
36 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
37 | output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)
38 |
39 | model = Model(inputs=[sequence_feature_input,
40 | dense_feature_input,
41 | category_feature_input,
42 | slate_label_input],
43 | outputs=[output])
44 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
45 | return model
46 |
--------------------------------------------------------------------------------
/rl4rs/nets/exact_k/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 |
5 | def index_matrix_to_pairs_fn(batch_size, seq_length):
6 | replicated_first_indices = tf.range(batch_size) # range(128)
7 | # replicated_first_indices =
8 | # [[ 0, 0, 0,...],
9 | # [ 1, 1, 1,...],
10 | # ......
11 | # [127,127,127,...]]
12 | replicated_first_indices2 = tf.tile(
13 | tf.expand_dims(replicated_first_indices, axis=1), # [128,1]
14 | [1, seq_length])
15 |
16 | def index_matrix_to_pairs(index_matrix):
17 | """
18 | :param index_matrix: [batch_size, data_len] or [batch_size]
19 | :return: [batch_size, data_len, 2] or [batch_size, 2]
20 | ie:
21 | a: [128, 10] -> c[i,j,:] = [i,a[i,j]], shape(c) = [128,10,2]
22 | a: [128] -> c[i,:] = [i,a[i]], shape(c) = [128,2]
23 | """
24 | rank = len(index_matrix.get_shape())
25 | if rank == 1:
26 | return tf.stack([replicated_first_indices, index_matrix], axis=rank)
27 | elif rank == 2:
28 | return tf.stack([replicated_first_indices2, index_matrix], axis=rank)
29 | else:
30 | raise NotImplementedError("index_matrix rank should be 1 or 2, but %d found" % rank)
31 |
32 | return index_matrix_to_pairs
33 |
34 |
35 | def batch_gather(data, indices):
36 | batch_size = data.get_shape()[0].merge_with(indices.get_shape()[0]).value
37 | if batch_size is None:
38 | batch_size = tf.shape(indices)[0]
39 | gather_data_size = indices.get_shape()[1].value
40 | if gather_data_size is None:
41 | gather_data_size = tf.shape(indices)[1]
42 | flat_indices = tf.reshape(tf.transpose(indices), (-1,)) #[batch*4,1]
43 | input_index_pairs = tf.stop_gradient(tf.stack(
44 | [tf.range(batch_size*gather_data_size, dtype=tf.int32), flat_indices], axis=1))
45 | flat_data = tf.tile(data, [gather_data_size, 1])
46 | return tf.transpose(tf.reshape(tf.gather_nd(flat_data, input_index_pairs), (gather_data_size, batch_size)))
47 |
--------------------------------------------------------------------------------
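A tiny graph-mode check of index_matrix_to_pairs_fn's contract, pairs[i, j] == [i, a[i, j]] (TF1 sessions, as used throughout the repo):

import tensorflow as tf
from rl4rs.nets.exact_k.utils import index_matrix_to_pairs_fn

pairs_fn = index_matrix_to_pairs_fn(batch_size=2, seq_length=3)
a = tf.constant([[4, 0, 2], [1, 1, 3]], dtype=tf.int32)
with tf.Session() as sess:
    print(sess.run(pairs_fn(a)))
    # [[[0 4] [0 0] [0 2]]
    #  [[1 1] [1 1] [1 3]]]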
/rl4rs/nets/dien.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')
28 |
29 | slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
30 | id_slate_input = slice_layer([category_feature_input, -10])
31 | category_feature = utils.id_input_processing_attn(category_feature_input, config)
32 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
33 | sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
34 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
35 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
36 | output = layers.Dense(class_num, activation='softmax', name='simulator_reward')(obs)
37 |
38 | model = Model(inputs=[sequence_feature_input,
39 | dense_feature_input,
40 | category_feature_input,
41 | slate_label_input],
42 | outputs=[output])
43 | tf.keras.backend.get_session().run(tf.global_variables_initializer())
44 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC', 'acc'])
45 | return model
46 |
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(
28 | shape=(9,), dtype='int64', name='slate_label'
29 | )
30 |
31 | category_feature = utils.id_input_processing(category_feature_input, config)
32 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
33 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
34 | all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
35 | all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
36 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
37 | output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)
38 |
39 | model = Model(inputs=[sequence_feature_input,
40 | dense_feature_input,
41 | category_feature_input,
42 | slate_label_input],
43 | outputs=[output])
44 | model.compile(loss='binary_crossentropy',
45 | optimizer='adam',
46 | metrics=[tf.keras.metrics.AUC(),
47 | tf.keras.metrics.Precision(),
48 | tf.keras.metrics.Recall()])
49 | return model
50 |
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(
28 | shape=(9,), dtype='int64', name='slate_label'
29 | )
30 |
31 | category_feature = utils.id_input_processing_concat(category_feature_input, config)
32 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
33 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
34 | # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
35 | sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
36 | all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
37 | [sequence_feature_dnn, dense_feature, category_feature]
38 | )
39 | output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(all_feature)
40 |
41 | model = Model(inputs=[sequence_feature_input,
42 | dense_feature_input,
43 | category_feature_input,
44 | slate_label_input],
45 | outputs=[output])
46 | model.compile(loss='binary_crossentropy',
47 | optimizer='adam',
48 | metrics=[tf.keras.metrics.AUC(),
49 | tf.keras.metrics.Precision(),
50 | tf.keras.metrics.Recall()])
51 | return model
52 |
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def get_model(config):
9 | maxlen = config['maxlen']
10 | dense_feature_num = config['dense_feature_num']
11 | category_feature_num = config['category_feature_num']
12 | class_num = config['class_num']
13 | seq_num = config['seq_num']
14 |
15 | sequence_feature_input = layers.Input(
16 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
17 | )
18 |
19 | dense_feature_input = layers.Input(
20 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
21 | )
22 |
23 | category_feature_input = layers.Input(
24 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
25 | )
26 |
27 | slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')
28 |
29 | slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
30 | id_slate_input = slice_layer([category_feature_input, -10])
31 |
32 | category_feature = utils.id_input_processing_attn(category_feature_input, config)
33 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
34 | sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
35 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
36 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
37 | output = layers.Dense(9, activation='sigmoid', name='simulator_reward')(obs)
38 |
39 | model = Model(inputs=[
40 | sequence_feature_input,
41 | dense_feature_input,
42 | category_feature_input,
43 | slate_label_input],
44 | outputs=[output])
45 | tf.keras.backend.get_session().run(tf.global_variables_initializer())
46 | model.compile(loss='binary_crossentropy',
47 | optimizer='adam',
48 | metrics=[tf.keras.metrics.AUC(),
49 | tf.keras.metrics.Precision(),
50 | tf.keras.metrics.Recall()])
51 | return model
52 |
--------------------------------------------------------------------------------
/rl4rs/server/httpEnv.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from rl4rs.server.gymHttpClient import Client
4 |
5 |
6 | class HttpEnv(gym.Env):
7 | metadata = {'render.modes': ['human']}
8 |
9 | def __init__(self, env_id, config={}):
10 | remote_base = config["remote_base"]
11 | self.client = Client(remote_base)
12 | self.instance_id = self.client.env_create(env_id, config)
13 | action_info = self.client.env_action_space_info(self.instance_id)
14 | obs_info = self.client.env_observation_space_info(self.instance_id)
15 | if action_info['name'] == 'Box':
16 | self.action_space = gym.spaces.Box(np.array(action_info['low']), np.array(action_info['high']), shape=action_info['shape'])
17 | else:
18 | self.action_space = gym.spaces.Discrete(action_info['n'])
19 | if obs_info['name'] == 'Box':
20 | self.observation_space = gym.spaces.Box(np.array(obs_info['low']), np.array(obs_info['high']), shape=obs_info['shape'])
21 | elif obs_info['name'] == 'Dict':
22 | keys = obs_info['keys']
23 | space_D = {}
24 | for key in keys:
25 | shape = obs_info[key]['shape']
26 | space_D[key] = gym.spaces.Box(np.array(obs_info[key]['low']).reshape(shape), np.array(obs_info[key]['high']).reshape(shape), shape=shape)
27 | self.observation_space = gym.spaces.Dict(space_D)
28 | else:
29 | assert obs_info['name'] in ('Box', 'Dict')
30 |
31 | def seed(self, sd=0):
32 | pass
33 |
34 | def step(self, action):
35 | if isinstance(action, np.ndarray):
36 | action = action.tolist()
37 | if isinstance(action, np.integer):
38 | action = int(action)
39 | observation, reward, done, info = self.client.env_step(self.instance_id, action, False)
40 | return self.observation_space.from_jsonable(observation), reward, done, info
41 |
42 | def reset(self):
43 | observation = self.client.env_reset(self.instance_id)
44 | return self.observation_space.from_jsonable(observation)
45 |
46 | def render(self, mode='human', close=False):
47 | return ''
48 |
49 | def close(self):
50 | return self.client.env_close(self.instance_id)
51 |
--------------------------------------------------------------------------------
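Hypothetical client-side usage; it assumes the gym HTTP server from rl4rs/server is running at remote_base (address and port are placeholders):

import gym
import rl4rs  # registers 'HttpEnv-v0'

env = gym.make('HttpEnv-v0', env_id='SlateRecEnv-v0',
               config={'remote_base': 'http://127.0.0.1:5000'})
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()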
/script/simulator_eval.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import gym
3 | import numpy as np
4 | from rl4rs.env.slate import SlateRecEnv, SlateState
5 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState
6 |
7 | extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {}
8 |
9 | config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
10 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9,
11 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../item_info.csv',
12 | "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SeqSlateRecEnv-v0"}
13 |
14 | config = dict(config, **extra_config)
15 |
16 | if config.get('gpu', 0) < 1:
17 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
18 |
19 | if config['env'] == 'SeqSlateRecEnv-v0':
20 | config['max_steps'] = 36
21 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
22 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
23 | else:
24 | sim = SlateRecEnv(config, state_cls=SlateState)
25 | env = gym.make('SlateRecEnv-v0', recsim=sim)
26 |
27 | batch_size = config["batch_size"]
28 | epoch = config["epoch"]
29 | max_steps = config["max_steps"]
30 | rewards = np.zeros((epoch, batch_size))
31 | offline_rewards = np.zeros((epoch, batch_size))
32 | offline_actions = np.zeros((epoch, batch_size, max_steps))
33 |
34 | for i in range(epoch):
35 | env.reset()
36 | for j in range(config["max_steps"]):
37 | action = env.offline_action
38 | offline_actions[i, :, j] = env.offline_action
39 | next_obs, reward, done, info = env.step(action)
40 | rewards[i] = rewards[i] + np.array(reward)
41 | offline_rewards[i] = offline_rewards[i] + np.array(env.offline_reward)
42 | if done[0]:
43 | print(
44 | i,
45 | np.sum(rewards) / config["batch_size"] / (i + 1),
46 | np.sum(offline_rewards) / config["batch_size"] / (i + 1)
47 | )
48 | break
49 | print('the mean of offline reward', np.mean(offline_rewards))
50 | print('the mean of reward prediction error', np.mean(rewards - offline_rewards))
51 | print('the absolute mean of reward prediction error', np.mean(np.abs(rewards - offline_rewards)))
52 | print('the std of reward prediction error', np.std(np.reshape(rewards - offline_rewards, -1)))
53 | print('success')
54 |
55 |
--------------------------------------------------------------------------------
/script/test_exact_k.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from rl4rs.nets.exact_k.model import Generator, Discriminator
4 |
5 | batch_size = 2
6 | l1_mask = np.zeros(284)
7 | l1_mask[:40] = 1
8 | l2_mask = np.zeros(284)
9 | l2_mask[40:150] = 1
10 | l3_mask = np.zeros(284)
11 | l3_mask[150:] = 1
12 | l0_ssr_mask = np.zeros(284)
13 | l0_ssr_mask[:30] = 1
14 | l0_ssr_mask[40:140] = 1
15 | l0_ssr_mask[160:] = 1
16 |
17 | with tf.name_scope('Generator'):
18 | g = Generator(l1_mask,
19 | l2_mask,
20 | l3_mask,
21 | l0_ssr_mask,
22 | is_training=True,
23 | seq_length=284)
24 |
25 | with tf.name_scope('Discriminator'):
26 | d = Discriminator(seq_length=284)
27 |
28 | print("Graph loaded")
29 |
30 | gpu_options = tf.GPUOptions(
31 | per_process_gpu_memory_fraction=0.95,
32 | allow_growth=True)
33 | sess_config = tf.ConfigProto(allow_soft_placement=True,
34 | gpu_options=gpu_options)
35 |
36 | with tf.Session(config=sess_config) as sess:
37 | sess.run(tf.global_variables_initializer())
38 | print('Generator training start!')
39 |
40 | reward_total = 0.0
41 | observation = np.random.random((batch_size, 256))
42 | item_cand = np.array([list(range(0, 284))] * batch_size)
43 | for _ in range(9):
44 | sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
45 | feed_dict={g.user: observation, g.item_cand: item_cand})
46 | reward = np.ones((batch_size,))
47 |
48 | reward_ = sess.run(d.reward, feed_dict={d.user: observation})
49 | sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})
50 |
51 | reward_total += np.mean(reward)
52 |
53 | reward = (reward - reward_)
54 |
55 | sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
56 | g.reward: reward,
57 | g.item_cand: item_cand,
58 | g.user: observation,
59 | })
60 | gs_gen = sess.run(g.global_step)
61 |
62 | # beamsearch
63 | # beam_card = sess.run(g.infer_result,
64 | # feed_dict={g.item_cand: item_cand,
65 | # g.enc_user: observation})
66 |
67 | print(sampled_card_idx, sampled_card, reward_)
68 |
69 | print("Done")
70 |
--------------------------------------------------------------------------------
/rl4rs/nets/lstm_slate_multiclass.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def my_loss_fn(y_true, y_pred):
9 | slate2label = tf.einsum('ij,j->i',
10 | tf.cast(y_true, tf.int64),
11 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
12 | return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)
13 |
14 |
15 | def my_acc_metrics(y_true, y_pred):
16 | slate2label = tf.einsum('ij,j->i',
17 | tf.cast(y_true, tf.int64),
18 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
19 | return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)
20 |
21 |
22 | def get_model(config):
23 | maxlen = config['maxlen']
24 | dense_feature_num = config['dense_feature_num']
25 | category_feature_num = config['category_feature_num']
26 | class_num = config['class_num']
27 | seq_num = config['seq_num']
28 |
29 | sequence_feature_input = layers.Input(shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input')
30 |
31 | dense_feature_input = layers.Input(shape=(dense_feature_num,), dtype='float32', name='dense_feature_input')
32 |
33 | category_feature_input = layers.Input(shape=(category_feature_num,), dtype='int64', name='category_feature_input')
34 |
35 | slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')
36 |
37 | category_feature = utils.id_input_processing_lstm(category_feature_input, config)
38 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
39 | sequence_feature = utils.sequence_input_LSTM(sequence_feature_input, config)
40 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
41 | # all_feature = layers.Concatenate(axis=-1)([sequence_feature, category_feature])
42 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
43 | output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)
44 |
45 | model = Model(inputs=[sequence_feature_input,
46 | dense_feature_input,
47 | category_feature_input,
48 | slate_label_input],
49 | outputs=[output])
50 | model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
51 | return model
52 |
--------------------------------------------------------------------------------
/rl4rs/nets/dnn_slate_multiclass.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def my_loss_fn(y_true, y_pred):
9 | slate2label = tf.einsum('ij,j->i',
10 | tf.cast(y_true, tf.int64),
11 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
12 | return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)
13 |
14 |
15 | def my_acc_metrics(y_true, y_pred):
16 | slate2label = tf.einsum('ij,j->i',
17 | tf.cast(y_true, tf.int64),
18 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
19 | return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)
20 |
21 |
22 | def get_model(config):
23 | maxlen = config['maxlen']
24 | dense_feature_num = config['dense_feature_num']
25 | category_feature_num = config['category_feature_num']
26 | class_num = config['class_num']
27 | seq_num = config['seq_num']
28 |
29 | sequence_feature_input = layers.Input(
30 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
31 | )
32 |
33 | dense_feature_input = layers.Input(
34 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
35 | )
36 |
37 | category_feature_input = layers.Input(
38 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
39 | )
40 |
41 | slate_label_input = layers.Input(
42 | shape=(9,), dtype='int64', name='slate_label'
43 | )
44 |
45 | category_feature = utils.id_input_processing(category_feature_input, config)
46 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
47 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
48 | all_feature = layers.Concatenate(axis=-1)([category_feature, dense_feature])
49 | all_feature = layers.Dense(256, activation=layers.ELU())(all_feature)
50 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
51 | output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)
52 |
53 | model = Model(inputs=[sequence_feature_input,
54 | dense_feature_input,
55 | category_feature_input,
56 | slate_label_input],
57 | outputs=[output])
58 | model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
59 | return model
60 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | RL4RS Dataset
4 |
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/rl4rs/nets/widedeep_slate_multiclass.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def my_loss_fn(y_true, y_pred):
9 | slate2label = tf.einsum('ij,j->i',
10 | tf.cast(y_true, tf.int64),
11 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
12 | return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)
13 |
14 |
15 | def my_acc_metrics(y_true, y_pred):
16 | slate2label = tf.einsum('ij,j->i',
17 | tf.cast(y_true, tf.int64),
18 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
19 | return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)
20 |
21 |
22 | def get_model(config):
23 | maxlen = config['maxlen']
24 | dense_feature_num = config['dense_feature_num']
25 | category_feature_num = config['category_feature_num']
26 | class_num = config['class_num']
27 | seq_num = config['seq_num']
28 |
29 | sequence_feature_input = layers.Input(
30 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
31 | )
32 |
33 | dense_feature_input = layers.Input(
34 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
35 | )
36 |
37 | category_feature_input = layers.Input(
38 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
39 | )
40 |
41 | slate_label_input = layers.Input(
42 | shape=(9,), dtype='int64', name='slate_label'
43 | )
44 |
45 | category_feature = utils.id_input_processing_concat(category_feature_input, config)
46 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
47 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
48 | # all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature])
49 | sequence_feature_dnn = layers.Dense(256, activation=layers.ELU())(sequence_feature)
50 | all_feature = layers.Concatenate(axis=-1, name='simulator_obs')(
51 | [sequence_feature_dnn, dense_feature, category_feature]
52 | )
53 | output = layers.Dense(22, activation='softmax', name='simulator_reward')(all_feature)
54 |
55 | model = Model(inputs=[sequence_feature_input,
56 | dense_feature_input,
57 | category_feature_input,
58 | slate_label_input],
59 | outputs=[output])
60 | model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
61 | return model
62 |
--------------------------------------------------------------------------------
/rl4rs/nets/dien_slate_multiclass.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def my_loss_fn(y_true, y_pred):
9 | slate2label = tf.einsum('ij,j->i',
10 | tf.cast(y_true, tf.int64),
11 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
12 | return tf.keras.losses.categorical_crossentropy(tf.one_hot(slate2label, 22), y_pred)
13 |
14 |
15 | def my_acc_metrics(y_true, y_pred):
16 | slate2label = tf.einsum('ij,j->i',
17 | tf.cast(y_true, tf.int64),
18 | tf.constant([1, 2, 4, 1, 2, 4, 1, 2, 4], dtype=tf.int64))
19 | return tf.keras.metrics.categorical_accuracy(tf.one_hot(slate2label, 22), y_pred)
20 |
21 |
22 | def get_model(config):
23 | maxlen = config['maxlen']
24 | dense_feature_num = config['dense_feature_num']
25 | category_feature_num = config['category_feature_num']
26 | class_num = config['class_num']
27 | seq_num = config['seq_num']
28 |
29 | sequence_feature_input = layers.Input(
30 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
31 | )
32 |
33 | dense_feature_input = layers.Input(
34 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
35 | )
36 |
37 | category_feature_input = layers.Input(
38 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
39 | )
40 |
41 | slate_label_input = layers.Input(shape=(9,), dtype='int64', name='slate_label')
42 |
43 | slice_layer = layers.Lambda(lambda x: x[0][:, x[1]:])
44 | id_slate_input = slice_layer([category_feature_input, -10])
45 |
46 | category_feature = utils.id_input_processing_attn(category_feature_input, config)
47 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
48 | sequence_feature = utils.sequence_input_attn([sequence_feature_input, id_slate_input], config)
49 | all_feature = layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
50 | obs = layers.Dense(256, activation=layers.ELU(), name='simulator_obs')(all_feature)
51 | output = layers.Dense(22, activation='softmax', name='simulator_reward')(obs)
52 |
53 | model = Model(inputs=[sequence_feature_input,
54 | dense_feature_input,
55 | category_feature_input,
56 | slate_label_input],
57 | outputs=[output])
58 | tf.keras.backend.get_session().run(tf.global_variables_initializer())
59 | model.compile(loss=my_loss_fn, optimizer='adam', metrics=[my_acc_metrics])
60 | return model
61 |
--------------------------------------------------------------------------------
/rl4rs/policy/behavior_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | # tf.compat.v1.enable_eager_execution()
4 | from tensorflow import keras
5 | from rl4rs.utils.datautil import FeatureUtil
6 | from copy import deepcopy
7 |
8 |
9 | class behavior_model(object):
10 | def __init__(self, config, modelfile):
11 | behavior_config = deepcopy(config)
12 | behavior_config['category_feature_num'] = 21
13 | behavior_config['dense_feature_num'] = 50
14 | self.featureutil = FeatureUtil(behavior_config)
15 | self.item_feature_size = config.get('item_feature_size', 40)
16 | self.page_items = config.get("page_items", 9)
17 | self.sess = tf.Session()
18 | with self.sess.as_default():
19 | self.model = keras.models.load_model(modelfile)
20 |
21 | def record2input(self, records, page=0):
22 | inputs = []
23 | for record in records:
24 | role_id, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \
25 | user_protrait, item_feature, _ = self.featureutil.record_split(record)
26 | category_feature = user_protrait[:10] + \
27 | [sequence_id] + \
28 | exposed_items[self.page_items*page:self.page_items*(page+1)]
29 | sequence_feature = [user_seqfeature, [0]]
30 | label = 0
31 | dense_feature_size = self.item_feature_size*self.page_items
32 | item_feature = item_feature[dense_feature_size*page:dense_feature_size*(page+1)]
33 | item_feature = np.array(item_feature).reshape((self.page_items, self.item_feature_size))
34 | item_feature = item_feature[:, :5].reshape(-1)
35 | inputs.append((
36 | role_id,
37 | sequence_feature,
38 | item_feature,
39 | category_feature,
40 | user_feedback[self.page_items*page:self.page_items*(page+1)],
41 | label))
42 | return inputs
43 |
44 | def action_probs(self, record, action, layer, page=0):
45 | batch_size = len(action)
46 | seq, dense, category, slate = self.featureutil.feature_extraction(self.record2input(record, page))[0]
47 | with self.sess.as_default():
48 | y = self.model.predict([seq, dense, category, slate])
49 | if layer == 1:
50 | action = np.clip(np.array(action) - 1, 0, 38)
51 | action_probs = y[:, 1:40] / np.sum(y[:, 1:40], axis=1, keepdims=True)
52 | elif layer == 2:
53 | action = np.clip(np.array(action) - 40, 0, 107)
54 | action_probs = y[:, 40:148] / np.sum(y[:, 40:148], axis=1, keepdims=True)
55 | else:
56 | action = np.clip(np.array(action) - 148, 0, 233)
57 | action_probs = y[:, 148:] / np.sum(y[:, 148:], axis=1, keepdims=True)
58 | return action_probs[range(batch_size), action]
59 |
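60 | # --- Editor's sketch (not in the original source): the arithmetic behind
61 | # action_probs(), standalone. One item-"layer" slice of the model output is
62 | # renormalized into propensities, then the logged action's probability is
63 | # picked per sample. Numbers are placeholders.
64 | if __name__ == "__main__":
65 |     y = np.random.rand(4, 284)                     # stand-in for model.predict
66 |     layer1 = y[:, 1:40]                            # items 1..39 form layer 1
67 |     probs = layer1 / layer1.sum(axis=1, keepdims=True)
68 |     logged = np.clip(np.array([3, 7, 1, 39]) - 1, 0, 38)
69 |     print(probs[np.arange(4), logged])             # one propensity per sample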
--------------------------------------------------------------------------------
/rl4rs/utils/rllib_vector_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Callable, List, Optional, Tuple
3 | from ray.rllib.utils.typing import EnvActionType, EnvConfigDict, EnvInfoDict, \
4 | EnvObsType, EnvType, PartialTrainerConfigDict
5 | from ray.rllib.env.vector_env import VectorEnv
6 | from rl4rs.env import RecEnvBase
7 |
8 |
9 | class MyVectorEnvWrapper(VectorEnv):
10 | """An environment that supports batch evaluation using clones of sub-envs.
11 | """
12 |
13 | def __init__(self, env: RecEnvBase, batch_size: int):
14 | """Initializes a VectorEnv object.
15 |
16 |         Args:
17 |             env (RecEnvBase): The batch-capable rl4rs environment to wrap;
18 |                 its observation and action spaces are reused for every sub-env.
19 |             batch_size (int): The number of sub-envs reported to RLlib (one
20 |                 slot per sample in the underlying batch).
21 |         """
22 | self.env = env
23 | self.reset_cache = []
24 | super().__init__(self.env.observation_space, self.env.action_space, num_envs=batch_size)
25 |
26 | def vector_reset(self) -> List[EnvObsType]:
27 | """Resets all sub-environments.
28 |
29 | Returns:
30 | obs (List[any]): List of observations from each environment.
31 | """
32 | return self.env.reset()
33 |
34 | def reset_at(self, index: Optional[int] = None) -> EnvObsType:
35 | """Resets a single environment.
36 |
37 | Args:
38 | index (Optional[int]): An optional sub-env index to reset.
39 |
40 | Returns:
41 | obs (obj): Observations from the reset sub environment.
42 | """
43 | if index == 0:
44 | self.reset_cache = self.env.reset()
45 | return self.reset_cache[index]
46 |
47 | def vector_step(
48 | self, actions: List[EnvActionType]
49 | ) -> Tuple[List[EnvObsType], List[float], List[bool], List[EnvInfoDict]]:
50 | """Performs a vectorized step on all sub environments using `actions`.
51 |
52 | Args:
53 | actions (List[any]): List of actions (one for each sub-env).
54 |
55 | Returns:
56 | obs (List[any]): New observations for each sub-env.
57 | rewards (List[any]): Reward values for each sub-env.
58 | dones (List[any]): Done values for each sub-env.
59 | infos (List[any]): Info values for each sub-env.
60 | """
61 | return self.env.step(np.array(actions))
62 |
63 | def get_unwrapped(self) -> List[EnvType]:
64 | """Returns the underlying sub environments.
65 |
66 | Returns:
67 | List[Env]: List of all underlying sub environments.
68 | """
69 | return [self.env, ] * self.num_envs
70 |
71 | # Experimental method.
72 | def try_render_at(self, index: Optional[int] = None) -> None:
73 | """Renders a single environment.
74 |
75 | Args:
76 | index (Optional[int]): An optional sub-env index to render.
77 | """
78 | return self.env.render()
79 |
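80 | # --- Editor's note (not in the original source): the wrapper maps one
81 | # batch-stepping RecEnvBase onto RLlib's VectorEnv API, so reset_at() only
82 | # truly resets on index 0 and then serves the cached batch to the other
83 | # indices. A hedged usage sketch (`make_rec_env` is a placeholder factory):
84 | #
85 | #     from ray.tune.registry import register_env
86 | #     register_env("rl4rs_vec", lambda cfg: MyVectorEnvWrapper(
87 | #         make_rec_env(cfg), cfg["batch_size"]))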
--------------------------------------------------------------------------------
/script/simulator_env_test.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import gym
3 | import numpy as np
4 | import tensorflow as tf
5 | tf.compat.v1.enable_eager_execution()
6 | from rl4rs.utils.datautil import FeatureUtil
7 | from rl4rs.env.slate import SlateRecEnv, SlateState
8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState
9 |
10 | extra_config = eval(sys.argv[1]) if len(sys.argv) >= 2 else {}
11 |
12 | config = {"epoch": 1, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
13 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9,
14 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_train.csv',
15 | "iteminfo_file": '../dataset/item_info.csv', "tfrecord_file":'../output/rl4rs_dataset_a_train_tiny.tfrecord',
16 | "model_file": "../output/supervised_a_train_dien/model", "support_rllib_mask": False, "is_eval": True, 'env': "SlateRecEnv-v0",
17 | "support_conti_env":True, "rawstate_as_obs":False}
18 |
19 | config = dict(config, **extra_config)
20 |
21 | if config.get('gpu', 0) < 1:
22 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
23 |
24 | if config['env'] == 'SeqSlateRecEnv-v0':
25 | config['max_steps'] = 36
26 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
27 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
28 | else:
29 | sim = SlateRecEnv(config, state_cls=SlateState)
30 | env = gym.make('SlateRecEnv-v0', recsim=sim)
31 |
32 | batch_size = config["batch_size"]
33 | epoch = config["epoch"]
34 | max_steps = config["max_steps"]
35 | rewards = np.zeros((epoch, batch_size))
36 | offline_rewards = np.zeros((epoch, batch_size))
37 | offline_actions = np.zeros((epoch, batch_size, max_steps))
38 | for i in range(epoch):
39 | env.reset(reset_file=True)
40 | for j in range(config["max_steps"]):
41 | if not config.get("support_conti_env"):
42 | action = env.offline_action
43 | else:
44 | action = np.full((batch_size, 32), 1)
45 | offline_actions[i, :, j] = env.offline_action
46 | next_obs, reward, done, info = env.step(action)
47 | rewards[i] = rewards[i] + np.array(reward)
48 | offline_rewards[i] = offline_rewards[i] + np.array(env.offline_reward)
49 | if done[0]:
50 | print(next_obs[0], reward[0], action[0], done[0], info[0])
51 | break
52 |
53 | if config['rawstate_as_obs']:
54 | config['batch_size'] = 1
55 | featureutil = FeatureUtil(config)
56 | iter_train = featureutil.read_tfrecord(config['tfrecord_file'], is_slate_label=False)
57 | feature = iter_train.make_one_shot_iterator().get_next()
58 | seq_feature = feature[0][0].numpy()[0]
59 | dense_feature = feature[0][1].numpy()[0]
60 | category_feature = feature[0][2].numpy()[0]
61 | assert np.min(np.equal(next_obs[0]['category_feature'][:-1], category_feature[:-1]))
62 | assert np.min(np.equal(next_obs[0]['dense_feature'][:-40], dense_feature[:-40]))
63 | assert np.min(np.equal(next_obs[0]['sequence_feature'], seq_feature))
64 |
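65 | # --- Editor's addition (not in the original script): summary lines so the
66 | # simulated return can be compared against the logged (offline) return.
67 | print('mean simulated return per user:', rewards.mean())
68 | print('mean logged return per user:', offline_rewards.mean())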
--------------------------------------------------------------------------------
/script/offline_evaluation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import numpy as np
4 | from rl4rs.policy.behavior_model import behavior_model
5 | from rl4rs.policy.policy_model import policy_model
6 | import rl4rs.utils.offline_policy_metrics as OPE
7 |
8 |
9 | def ope_eval(config, eval_env, algo, sample_model: behavior_model = None):
10 | policy = policy_model(algo, config)
11 | metrics = []
12 | epoch = config["epoch"]
13 | batch_size = config["batch_size"]
14 | max_steps = config["max_steps"]
15 | page_items = config.get("page_items", 9)
16 | for i in range(epoch):
17 | obs = eval_env.reset()
18 | episode_rewards, q_values, off_rewards = [], [], []
19 | prev_actions = []
20 | action_probs, behavior_probs, rewards = [], [], []
21 | print('test batch at ', i)
22 | for j in range(max_steps):
23 | # obs = dict(enumerate(obs))
24 | action = policy.predict_with_mask(obs)
25 | off_action = eval_env.offline_action
26 | if sample_model is not None:
27 | action_prob = policy.action_probs(obs)
28 | action_prob = action_prob[range(batch_size), off_action]
29 | q_values.append(policy.predict_q(obs, action))
30 | action_probs.append(action_prob)
31 |                 behavior_prob = sample_model.action_probs(eval_env.samples.records, off_action, j % page_items // 3 + 1, page=j//page_items)
32 | behavior_probs.append(behavior_prob)
33 | obs, reward, done, info = eval_env.step(action)
34 | off_rewards.append(eval_env.offline_reward)
35 | rewards.append(reward)
36 | prev_actions.append(action)
37 |
38 | episode_reward = np.sum(np.array(rewards), axis=0)
39 | episode_rewards.append(episode_reward)
40 | if sample_model is not None:
41 | action_probs = np.array(action_probs).swapaxes(0, 1)
42 | behavior_probs = np.array(behavior_probs).swapaxes(0, 1)
43 | off_rewards = np.array(off_rewards).swapaxes(0, 1)
44 | off_rewards_sum = np.sum(off_rewards, axis=1)
45 | rewards_hat = np.array(rewards).swapaxes(0, 1)
46 | q_values = np.array(q_values).swapaxes(0, 1)
47 | # multiply probs
48 | action_probs_mul = np.multiply.reduce(action_probs*100, axis=1)
49 | behavior_probs_mul = np.multiply.reduce(behavior_probs*100, axis=1)
50 | cips = OPE.eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
51 | # snips = OPE.eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
52 | dr = OPE.eval_doubly_robust(
53 | episode_reward,
54 | np.average(q_values, 1),
55 | off_rewards_sum,
56 | action_probs_mul,
57 | behavior_probs_mul
58 | )
59 | # step-wise
60 | wips = OPE.eval_WIPS(off_rewards, action_probs, behavior_probs)
61 | sdr = OPE.eval_seq_doubly_robust(
62 | rewards_hat,
63 | q_values,
64 | off_rewards,
65 | action_probs,
66 | behavior_probs
67 | )
68 |
69 | metrics.append((cips, dr, wips, sdr))
70 |
71 |     print('CIPS', 'DR', 'WIPS', 'SeqDR', sep=' ')
72 | print(np.average(np.array(metrics), axis=0))
73 | print(np.std(np.array(metrics), axis=0))
74 |
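75 | # --- Editor's sketch (not in the original source): the clipped-IPS idea behind
76 | # OPE.eval_CIPS above, in plain numpy. The real estimator lives in
77 | # rl4rs.utils.offline_policy_metrics; the clip bound 10 here is illustrative.
78 | if __name__ == "__main__":
79 |     rng = np.random.default_rng(0)
80 |     r = rng.random(1000)                 # logged per-episode returns
81 |     pi = rng.random(1000)                # target-policy trajectory probabilities
82 |     mu = rng.random(1000) + 1e-2         # behavior-policy trajectory probabilities
83 |     w = np.clip(pi / mu, 0.0, 10.0)      # clipped importance weights
84 |     print('clipped IPS estimate:', np.mean(w * r))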
--------------------------------------------------------------------------------
/rl4rs/nets/cql/q_function.py:
--------------------------------------------------------------------------------
1 | from typing import Any, ClassVar, Dict, Optional, cast
2 | 
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import nn
6 | 
7 | from d3rlpy.models.torch.encoders import Encoder
8 | from d3rlpy.models.torch.q_functions.base import DiscreteQFunction
9 | from d3rlpy.models.torch.q_functions.utility import compute_huber_loss, compute_reduce, pick_value_by_action
10 | from d3rlpy.models.q_functions import QFunctionFactory
11 | from d3rlpy.models.torch import EncoderWithAction, ContinuousMeanQFunction
12 |
13 |
14 | class CustomDiscreteMeanQFunction(DiscreteQFunction, nn.Module): # type: ignore
15 | _action_size: int
16 | _encoder: Encoder
17 | _fc: nn.Linear
18 |
19 | def __init__(self, encoder: Encoder, action_size: int):
20 | super().__init__()
21 | self._action_size = action_size
22 | self._encoder = encoder
23 | # self._fc = nn.Linear(encoder.get_feature_size(), action_size)
24 |
25 | def forward(self, x: torch.Tensor) -> torch.Tensor:
26 | return cast(torch.Tensor, self._encoder(x))
27 |
28 | def compute_error(
29 | self,
30 | obs_t: torch.Tensor,
31 | act_t: torch.Tensor,
32 | rew_tp1: torch.Tensor,
33 | q_tp1: torch.Tensor,
34 | ter_tp1: torch.Tensor,
35 | gamma: float = 0.99,
36 | reduction: str = "mean",
37 | ) -> torch.Tensor:
38 | one_hot = F.one_hot(act_t.view(-1), num_classes=self.action_size)
39 | q_t = (self.forward(obs_t) * one_hot.float()).sum(dim=1, keepdim=True)
40 | y = rew_tp1 + gamma * q_tp1 * (1 - ter_tp1)
41 | loss = compute_huber_loss(q_t, y)
42 | return compute_reduce(loss, reduction)
43 |
44 | def compute_target(
45 | self, x: torch.Tensor, action: Optional[torch.Tensor] = None
46 | ) -> torch.Tensor:
47 | if action is None:
48 | return self.forward(x)
49 | # q=pick_value_by_action(self.forward(x), action, keepdim=True)
50 | values = self.forward(x)
51 | action_size = values.shape[1]
52 | one_hot = F.one_hot(action.view(-1), num_classes=action_size)
53 | masked_values = values * cast(torch.Tensor, one_hot.float())
54 | q = masked_values.sum(dim=1, keepdim=True)
55 | # assert torch.min(q)>-100
56 | return q
57 |
58 | @property
59 | def action_size(self) -> int:
60 | return self._action_size
61 |
62 | @property
63 | def encoder(self) -> Encoder:
64 | return self._encoder
65 |
66 |
67 | class CustomMeanQFunctionFactory(QFunctionFactory):
68 | TYPE: ClassVar[str] = "mean"
69 |
70 | def __init__(self, bootstrap: bool = False, share_encoder: bool = False):
71 | super().__init__(bootstrap, share_encoder)
72 |
73 | def create_discrete(
74 | self,
75 | encoder: Encoder,
76 | action_size: int,
77 | ) -> CustomDiscreteMeanQFunction:
78 | return CustomDiscreteMeanQFunction(encoder, action_size)
79 |
80 | def create_continuous(
81 | self,
82 | encoder: EncoderWithAction,
83 | ) -> ContinuousMeanQFunction:
84 | return ContinuousMeanQFunction(encoder)
85 |
86 | def get_params(self, deep: bool = False) -> Dict[str, Any]:
87 | return {
88 | "bootstrap": self._bootstrap,
89 | "share_encoder": self._share_encoder,
90 | }
91 |
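92 | # --- Editor's sketch (not in the original source): the TD-target arithmetic in
93 | # compute_error(), reproduced with toy tensors. q_t selects the taken action's
94 | # value via a one-hot mask; y is the one-step Bellman target.
95 | if __name__ == "__main__":
96 |     q_all = torch.tensor([[1.0, 2.0, 3.0], [0.5, 0.1, 0.2]])  # Q(s, .)
97 |     act = torch.tensor([2, 0])
98 |     q_t = (q_all * F.one_hot(act, num_classes=3).float()).sum(dim=1, keepdim=True)
99 |     q_tp1 = torch.tensor([[2.5], [0.3]])
100 |     y = 1.0 + 0.99 * q_tp1 * (1 - 0.0)                        # rew + gamma*q*(1-ter)
101 |     print(compute_huber_loss(q_t, y).mean())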
--------------------------------------------------------------------------------
/rl4rs/nets/adversarial_slate.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.keras.models import Model
5 | from rl4rs.nets import utils
6 |
7 |
8 | def custom_loss(external_loss):
9 | def loss(y_true, y_pred):
10 | return 0.1 * tf.keras.losses.binary_crossentropy(y_true, y_pred) + external_loss
11 |
12 | return loss
13 |
14 |
15 | def my_loss_fn(y_true, y_pred):
16 | item_scores_exp = tf.exp(y_pred)
17 | item_scores_click = tf.einsum('ij,ij->ij', y_pred, tf.cast(y_true, tf.float32))
18 |     return -tf.math.log(tf.reduce_sum(tf.exp(item_scores_click), axis=1) + 1) \
19 |            + tf.math.log(tf.reduce_sum(item_scores_exp, axis=1) + 1)
20 |
21 |
22 | def my_metrics(y_true, y_pred):
23 | score = tf.einsum('ij,ij->ij', y_pred, 1 - tf.cast(y_true, tf.float32))
24 | return tf.reduce_sum(score, 1)
25 |
26 |
27 | def my_mean_metrics(y_true, y_pred):
28 | return tf.reduce_mean(y_pred, 1)
29 |
30 |
31 | def my_max_metrics(y_true, y_pred):
32 | return tf.reduce_max(y_pred, 1)
33 |
34 |
35 | def my_min_metrics(y_true, y_pred):
36 | return tf.reduce_min(y_pred, 1)
37 |
38 |
39 | def get_model(config):
40 | maxlen = config['maxlen']
41 | dense_feature_num = config['dense_feature_num']
42 | category_feature_num = config['category_feature_num']
43 | class_num = config['class_num']
44 | seq_num = config['seq_num']
45 |
46 | sequence_feature_input = layers.Input(
47 | shape=(seq_num, maxlen,), dtype='float32', name='sequence_feature_input'
48 | )
49 | dense_feature_input = layers.Input(
50 | shape=(dense_feature_num,), dtype='float32', name='dense_feature_input'
51 | )
52 | category_feature_input = layers.Input(
53 | shape=(category_feature_num,), dtype='int64', name='category_feature_input'
54 | )
55 | slate_label_input = layers.Input(
56 | shape=(9,), dtype='int64', name='slate_label'
57 | )
58 |
59 | feature_omit = layers.Lambda(lambda x: x[:, :-1])
60 | category_feature_input_slate = feature_omit(category_feature_input)
61 | config['category_feature_num'] = config['category_feature_num'] - 1
62 |
63 | category_feature = utils.id_input_processing(category_feature_input_slate, config)
64 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
65 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
66 |
67 | all_feature = layers.Concatenate(axis=-1)(
68 | [sequence_feature, dense_feature, category_feature]
69 | )
70 | item_scores = layers.Dense(9, activation='sigmoid')(all_feature)
71 | item_scores_norm = layers.Softmax()(item_scores)
72 | item_scores_no_click = tf.einsum('ij,ij->ij',
73 | item_scores_norm,
74 | 1 - tf.cast(slate_label_input, tf.float32))
75 | loss3 = tf.reduce_sum(item_scores_no_click, axis=1)
76 |
77 | model = Model(inputs=[sequence_feature_input,
78 | dense_feature_input,
79 | category_feature_input,
80 | slate_label_input],
81 | outputs=[item_scores])
82 | model.compile(optimizer='adam',
83 | loss=custom_loss(loss3),
84 | metrics=[
85 | tf.keras.metrics.AUC(),
86 | tf.keras.metrics.Precision(),
87 | tf.keras.metrics.Recall()])
88 | return model
89 |
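90 | # --- Editor's sketch (not in the original source): my_loss_fn in plain numpy.
91 | # Unclicked scores are zeroed *before* exp, so each contributes exp(0)=1 to the
92 | # first sum; minimizing the loss raises clicked scores relative to the slate.
93 | if __name__ == "__main__":
94 |     import numpy as np
95 |     s = np.array([[2.0, -1.0, 0.5]])           # slate scores
96 |     y = np.array([[1, 0, 1]])                  # click labels
97 |     clicked = np.exp(s * y).sum(axis=1) + 1    # unclicked items contribute exp(0)=1
98 |     print(-np.log(clicked) + np.log(np.exp(s).sum(axis=1) + 1))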
--------------------------------------------------------------------------------
/reproductions/run_simulator_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
8 | script_dir=${rl4rs_benchmark_dir}/script
9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
10 |
11 | algo=$1
12 |
13 | cd ${script_dir}
14 |
15 | # train in train set and test in all sample
16 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_a_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_${algo}.log &&
17 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/supervised_b2_train_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_${algo}.log
18 |
19 | # train in all set and test in sl/rl as a baseline
20 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_sl_${algo}.log &&
21 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_all_rl_${algo}.log &&
22 |
23 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_sl_${algo}.log &&
24 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_all_rl_${algo}.log &&
25 |
26 | # train in sl/rl and test in rl/sl
27 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_sl_rl_${algo}.log &&
28 | python -u simulator_eval.py "{'env':'SlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_a_rl_sl_${algo}.log &&
29 |
30 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_sl_rl_${algo}.log &&
31 | python -u simulator_eval.py "{'gpu':False,'env':'SeqSlateRecEnv-v0','algo':'${algo}','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_${algo}/model'}" >> ${rl4rs_output_dir}/eval_simulator_b2_rl_sl_${algo}.log
32 |
33 | echo '1'
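34 | 
35 | # Editor's note (not in the original script): run as `bash run_simulator_eval.sh <algo>`,
36 | # e.g. `bash run_simulator_eval.sh dien`; <algo> selects the supervised simulator
37 | # backbone whose checkpoints live under ${rl4rs_output_dir}.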
--------------------------------------------------------------------------------
/rl4rs/mdpchecker/decoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import bottleneck
4 |
5 | def token_probs(model,
6 | batch_inputs,
7 | batch_outputs):
8 | return np.array(model.predict([np.array(batch_inputs), np.array(batch_outputs)]))[:, -1]
9 |
10 |
11 | def decode_step(model,
12 | batch_inputs,
13 | batch_outputs,
14 | candidates=None,
15 | beam_size=1):
16 | a = time.time()
17 | # predicts (batch_size, token_size)
18 | predicts = model.predict([np.array(batch_inputs), np.array(batch_outputs)])[:,-1]
19 | batch_size, token_size = predicts.shape
20 | print('decode_step', time.time()-a)
21 | # print('decode_step', time.time()-a)
22 | # tmp = []
23 | # for i in range(len(predicts)):
24 | # probs = [(prob, j) for j, prob in enumerate(predicts[i])]
25 | # if candidates is not None:
26 | # probs = [x if x[1] in candidates[i] else (0, x[1]) for x in probs]
27 | # probs.sort(reverse=True)
28 | # probs = probs[:beam_size]
29 | # tmp.append(probs)
30 | if candidates is not None:
31 | mask = np.zeros(predicts.shape)
32 | inds = np.array([[i,]*len(candidates[i]) for i in range(len(candidates))]).flatten()
33 | mask[inds, candidates.flatten().astype(int)] = 1
34 | predicts = predicts * mask
35 | # index = np.argpartition(-predicts, beam_size, axis=1)[:, :beam_size]
36 | # probs = -np.partition(-predicts, beam_size, axis=1)[:, :beam_size]
37 | index = bottleneck.argpartition(-predicts, beam_size, axis=1)[:, :beam_size]
38 | probs = -bottleneck.partition(-predicts, beam_size, axis=1)[:, :beam_size]
39 | inds = np.array([[i,]*len(probs[i]) for i in range(len(probs))])
40 | inds_sorted = np.argsort(-probs, axis=1)[:,:beam_size]
41 | index = index[inds, inds_sorted]
42 | probs = probs[inds, inds_sorted]
43 | # print('decode_step', time.time()-a)
44 | tmp2 = np.array(list(zip(probs.flatten(),index.flatten()))).reshape((batch_size, beam_size, 2))
45 | # tmp (batch_size, beam_size, 2)
46 | # print(np.min(tmp==tmp2))
47 | return tmp2
48 |
49 |
50 | def beam_search(model, encode_input, beam_size, target_len, use_candidates=False, candidates_size=None):
51 |     batch_size = len(encode_input)
52 |     output_topk = np.zeros((batch_size, beam_size, target_len + 1), dtype=int)
53 | beam_score = np.ones((batch_size, beam_size))
54 | output_topk[:, :, 0] = 1
55 | # probs = []
56 | candidates = None
57 | prob = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=beam_size)
58 | if use_candidates:
59 | probs_first_step = decode_step(model, encode_input, output_topk[:, 0, :1], candidates=candidates, beam_size=candidates_size)
60 | candidates = probs_first_step[:, :, 1]
61 | output_topk[:, :, 1] = prob[:, :, 1]
62 | beam_score[:, :] = prob[:, :, 0]
63 | for i in range(1, target_len):
64 | a = time.time()
65 | print('beam_search at target_len_', i)
66 | probs = []
67 | for j in range(beam_size):
68 | # batch_size,k,2
69 | prob = decode_step(model, encode_input, output_topk[:, j, :i + 1], candidates=candidates, beam_size=beam_size)
70 | probs.append(prob)
71 | # batch_size,k,k,2
72 | probs = np.array(probs).swapaxes(0, 1)
73 | # batch_size,k,k
74 | beam_scores = np.einsum('abc,ab->abc', probs[:, :, :, 0], beam_score)
75 | # batch_size,k,2
76 | top_k_fn = lambda x: np.dstack(np.unravel_index(np.argsort(-x.ravel()), (beam_size, beam_size)))
77 | top_k_index = np.array(list(map(top_k_fn, beam_scores)))[:, 0][:, :beam_size, :]
78 | for ii in range(batch_size):
79 | output_topk[ii, :, :] = output_topk[ii, top_k_index[ii, :, 0], :]
80 | output_topk[ii, :, i + 1] = probs[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1], 1]
81 | beam_score[ii, :] = beam_scores[ii, top_k_index[ii, :, 0], top_k_index[ii, :, 1]]
82 | return output_topk, beam_score
83 |
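84 | # --- Editor's sketch (not in the original source): driving beam_search with a
85 | # stand-in model. Any object whose .predict([inputs, outputs]) yields per-step
86 | # token distributions of shape (batch, steps, token_size) works; values below
87 | # are placeholders.
88 | if __name__ == "__main__":
89 |     class MockModel:
90 |         def predict(self, xs):
91 |             batch, steps = len(xs[0]), xs[1].shape[1]
92 |             p = np.random.rand(batch, steps, 20)         # fake token scores
93 |             return p / p.sum(axis=-1, keepdims=True)
94 |     topk, scores = beam_search(MockModel(), np.zeros((4, 8)), beam_size=3, target_len=5)
95 |     print(topk.shape, scores.shape)                      # (4, 3, 6) (4, 3)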
--------------------------------------------------------------------------------
/rl4rs/nets/rllib/rllib_rawstate_model.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from gym.spaces import Dict
3 | from rl4rs.nets import utils
4 | from ray.rllib.models.utils import get_activation_fn
5 | from ray.rllib.models.tf.misc import normc_initializer
6 | from ray.rllib.models.tf.tf_modelv2 import TFModelV2
7 | from ray.rllib.utils.framework import try_import_tf, try_import_torch
8 |
9 | tf1, tf, tfv = try_import_tf()
10 | torch, nn = try_import_torch()
11 |
12 |
13 | def getTFModelWithRawState(config):
14 | config = config
15 |
16 | class MyTFModelWithRawState(TFModelWithRawState):
17 | def __init__(self, obs_space, action_space, num_outputs, model_config,
18 | name):
19 | super(MyTFModelWithRawState, self).__init__(
20 | obs_space, action_space, num_outputs, model_config, name, config=config)
21 |
22 | return MyTFModelWithRawState
23 |
24 |
25 | class TFModelWithRawState(TFModelV2):
26 |     """TFModelV2 that consumes the raw dict observation (category/dense/sequence features)."""
27 |
28 | def __init__(self, obs_space, action_space, num_outputs, model_config,
29 | name, config):
30 | obs_space = obs_space.original_space
31 | super(TFModelWithRawState, self).__init__(
32 | obs_space, action_space, num_outputs, model_config, name)
33 | if not (isinstance(obs_space, Dict) and obs_space['category_feature'] \
34 | and obs_space['dense_feature'] and obs_space['sequence_feature']):
35 | raise ValueError("""This model only supports the Dict{'category_feature':[],
36 | 'dense_feature':[], 'sequence_feature':[]} obs space""")
37 | activation = model_config.get("fcnet_activation", "linear")
38 | activation = get_activation_fn(activation)
39 | no_final_linear = model_config.get("no_final_linear", False)
40 | # Inputs
41 | category_feature_input = tf.keras.layers.Input(
42 | shape=obs_space['category_feature'].shape, name="obs_category_input")
43 | dense_feature_input = tf.keras.layers.Input(
44 | shape=obs_space['dense_feature'].shape, name="obs_dense_input")
45 | sequence_feature_input = tf.keras.layers.Input(
46 | shape=obs_space['sequence_feature'].shape, name="obs_sequence_input")
47 |
48 | slice_layer = tf.keras.layers.Lambda(lambda x: x[0][:, x[1]:])
49 | category_feature = utils.id_input_processing(category_feature_input, config)
50 | dense_feature = utils.dense_input_processing(dense_feature_input, config)
51 | sequence_feature = utils.sequence_input_concat(sequence_feature_input, config)
52 | all_feature = tf.keras.layers.Concatenate(axis=-1)([sequence_feature, dense_feature, category_feature])
53 | context = tf.keras.layers.Dense(256, activation=tf.keras.layers.ELU())(all_feature)
54 | model_out = None
55 | if no_final_linear and num_outputs:
56 | model_out = tf.keras.layers.Dense(
57 | num_outputs,
58 | name="fc_out",
59 | activation=activation,
60 | kernel_initializer=normc_initializer(1.0))(context)
61 | else:
62 | model_out = tf.keras.layers.Dense(
63 | num_outputs,
64 | name="fc_out",
65 | activation=None,
66 | kernel_initializer=normc_initializer(0.01))(context)
67 |
68 | # V(s)
69 | value_out = tf.keras.layers.Dense(
70 | 1,
71 | name="value_out",
72 | activation=None,
73 | kernel_initializer=normc_initializer(0.01))(context)
74 |
75 | # Base layers
76 | self.base_model = tf.keras.Model([category_feature_input, dense_feature_input, sequence_feature_input], [model_out, value_out])
77 | self.base_model.summary()
78 |
79 | def forward(self, input_dict, state, seq_lens):
80 | model_out, self._value_out = self.base_model([input_dict["obs"]["category_feature"],
81 | input_dict["obs"]["dense_feature"],
82 | input_dict["obs"]["sequence_feature"]])
83 | return model_out, state
84 |
85 | def value_function(self):
86 | return tf.reshape(self._value_out, [-1])
87 |
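88 | # --- Editor's sketch (not in the original source): registering the model with
89 | # RLlib. The model name and trainer config are placeholders; `config` is the
90 | # rl4rs feature config this module's factory expects.
91 | # from ray.rllib.models import ModelCatalog
92 | # ModelCatalog.register_custom_model("rawstate_model", getTFModelWithRawState(config))
93 | # trainer_config = {"model": {"custom_model": "rawstate_model"}}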
--------------------------------------------------------------------------------
/rl4rs/policy/policy_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import d3rlpy
3 | import numpy as np
4 | from ray.rllib.agents.trainer import Trainer as rllib_trainer
5 | from scipy.special import softmax
6 |
7 |
8 | class policy_model(object):
9 | def __init__(self, model, config = {}):
10 | self.policy = model
11 | self.config = config
12 | self.page_items = int(config.get('page_items', 9))
13 | self.mask_size = self.page_items+1
14 | self.location_mask = config.get('location_mask', None)
15 | self.special_items = config.get('special_items', None)
16 |
17 | def predict_with_mask(self, obs):
18 | if self.config.get("support_conti_env",False):
19 | return self.predict(obs)
20 | elif isinstance(self.policy, d3rlpy.algos.AlgoBase):
21 | obs = np.array(obs)
22 | action_probs = np.array(self.action_probs(obs))
23 | batch_size = len(obs)
24 | # mask
25 | prev_actions = obs[:, -self.mask_size:-1].astype(int)
26 | cur_step = obs[:, -1].astype(int)
27 | x_mask_layer = cur_step % self.page_items // 3
28 | mask = self.location_mask[x_mask_layer.astype(int)]
29 | for i in range(self.mask_size-1):
30 | mask[range(batch_size), prev_actions[:, i]] = 0
31 | action_mask = mask < 0.01
32 | action_probs[action_mask] = -2 ** 15
33 | for i in range(batch_size):
34 | if len(np.intersect1d(prev_actions[i], self.special_items)) > 0:
35 | action_probs[i][self.special_items] = -2 ** 15
36 | return action_probs.argmax(axis=1)
37 | elif isinstance(self.policy, rllib_trainer):
38 | return self.predict(obs)
39 | else:
40 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \
41 | or isinstance(self.policy, rllib_trainer)
42 |
43 | def predict(self, obs):
44 | if isinstance(self.policy, d3rlpy.algos.AlgoBase):
45 | return self.policy.predict(obs)
46 | elif isinstance(self.policy, rllib_trainer):
47 | obs = dict(enumerate(obs))
48 | action = self.policy.compute_actions(obs, explore=False)
49 | action = np.array(list(action.values()))
50 | return action
51 | else:
52 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \
53 | or isinstance(self.policy, rllib_trainer)
54 |
55 | def predict_q(self, obs, action):
56 | if isinstance(self.policy, d3rlpy.algos.AlgoBase):
57 | q = self.policy.predict_value(obs, action)
58 | if self.policy.reward_scaler is not None:
59 | return self.policy.reward_scaler.reverse_transform(q)
60 | else:
61 | return q
62 | elif isinstance(self.policy, rllib_trainer):
63 | obs = dict(enumerate(obs))
64 | _, _, infos = self.policy. \
65 | compute_actions(obs, explore=False, full_fetch=True)
66 | batch_size = len(action)
67 | return infos['q_values'][range(batch_size), action] \
68 | if 'q_values' in infos \
69 | else infos['vf_preds']
70 | else:
71 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \
72 | or isinstance(self.policy, rllib_trainer)
73 |
74 | def action_probs(self, obs):
75 | if isinstance(self.policy, d3rlpy.algos.DiscreteBC):
76 | obs = torch.tensor(obs, dtype=torch.float32)
77 | return self.policy._impl._imitator(obs).detach().numpy()
78 | elif isinstance(self.policy, d3rlpy.algos.DiscreteBCQ) \
79 | or isinstance(self.policy, d3rlpy.algos.DiscreteCQL):
80 | obs = torch.tensor(obs, dtype=torch.float32)
81 | action_q = self.policy._impl._q_func(obs).detach().numpy()
82 | return softmax(action_q, axis=1)
83 | elif isinstance(self.policy, rllib_trainer):
84 | obs = dict(enumerate(obs))
85 | actions, _, infos = self.policy. \
86 | compute_actions(obs, explore=False, full_fetch=True)
87 | return softmax(infos['action_dist_inputs'], axis=1)
88 | else:
89 | assert isinstance(self.policy, d3rlpy.algos.AlgoBase) \
90 | or isinstance(self.policy, rllib_trainer)
91 |
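92 | # --- Editor's sketch (not in the original source): the masking trick inside
93 | # predict_with_mask(), standalone. Invalid actions receive a large negative
94 | # score so argmax can only pick valid ones. Values are placeholders.
95 | if __name__ == "__main__":
96 |     scores = np.array([[0.2, 0.5, 0.3], [0.6, 0.1, 0.3]])
97 |     valid = np.array([[1, 0, 1], [1, 1, 0]])    # 1 = action allowed
98 |     scores[valid < 0.01] = -2 ** 15             # mirrors the mask above
99 |     print(scores.argmax(axis=1))                # -> [2 0]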
--------------------------------------------------------------------------------
/reproductions/run_exact_k.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
8 | script_dir=${rl4rs_benchmark_dir}/script
9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
10 |
11 |
12 | cd ${script_dir}
13 |
14 | # experiment in a_all env, train in a_all sample and test in a_all sample
15 | python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log
16 | python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all'}" >> ${rl4rs_output_dir}/exactk_a_all.log
17 |
18 |
19 | # experiment in a_all env, train in a_train sample and test in a_test sample
20 | #python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log
21 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train'}" >> ${rl4rs_output_dir}/exactk_a_train.log
22 |
23 |
24 | # experiment train in a_sl env and test in a_rl env
25 | #python -u exact_k_train.py "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log
26 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl'}" >> ${rl4rs_output_dir}/exactk_a_sl.log
27 |
28 |
29 | # experiment in b_all env, train in b_all sample and test in b_all sample
30 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log
31 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all'}" >> ${rl4rs_output_dir}/exactk_b_all.log
32 |
33 |
34 | # experiment in b_all env, train in b_train sample and test in b_test sample
35 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log
36 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train'}" >> ${rl4rs_output_dir}/exactk_b_train.log
37 |
38 |
39 | # experiment train in b_sl env and test in b_rl env
40 | #python -u exact_k_train.py "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log
41 | #python -u exact_k_train.py "eval" "{'gpu':False,'batch_size':2048,'is_eval':True,'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl'}" >> ${rl4rs_output_dir}/exactk_b_sl.log
42 |
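43 | # Editor's note (not in the original script): only the a_all experiment is
44 | # enabled; uncomment the blocks above for the other settings. The "train" stage
45 | # fits Exact-K against the chosen simulator, and the "eval" stage reloads the
46 | # newest checkpoint with a 2048-sized evaluation batch.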
--------------------------------------------------------------------------------
/rl4rs/nets/rllib/rllib_mask_model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from rl4rs.nets.rllib.rllib_rawstate_model import TFModelWithRawState
3 | from ray.rllib.examples.models.parametric_actions_model import \
4 | ParametricActionsModel
5 |
6 |
7 | def getMaskActionsModel(true_obs_shape, action_size):
8 | class MyMaskActionsModel(ParametricActionsModel):
9 | """Parametric action model that handles the dot product and masking.
10 |
11 | This assumes the outputs are logits for a single Categorical action dist.
12 | Getting this to work with a more complex output (e.g., if the action space
13 | is a tuple of several distributions) is also possible but left as an
14 | exercise to the reader.
15 | """
16 |
17 | def __init__(self,
18 | obs_space,
19 | action_space,
20 | num_outputs,
21 | model_config,
22 | name,
23 | **kw):
24 | config = {
25 | # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py
26 | # These are used if no custom model is specified and the input space is 1D.
27 |                 # Sizes of the fully connected hidden layers.
28 | "fcnet_hiddens": [64],
29 | # Activation function descriptor.
30 | # Supported values are: "tanh", "relu", "swish" (or "silu"),
31 | # "linear" (or None).
32 | # "fcnet_activation": "linear",
33 | # "no_final_linear": True,
34 | "vf_share_layers": True,
35 | }
36 | model_config = dict(model_config, **config)
37 | super(MyMaskActionsModel, self).__init__(
38 | obs_space, action_space, num_outputs, model_config, name, true_obs_shape, action_embed_size=action_size, **kw)
39 | print('MyMaskActionsModel', self.action_embed_model.model_config)
40 |
41 | def forward(self, input_dict, state, seq_lens):
42 | # Extract the available actions tensor from the observation.
43 | # avail_actions = input_dict["obs"]["avail_actions"]
44 | action_mask = input_dict["obs"]["action_mask"]
45 |
46 | # Compute the predicted action embedding
47 | action_embed, _ = self.action_embed_model({
48 | "obs": input_dict["obs"]["obs"]
49 | })
50 | # action_values = self.action_embed_model.value_function()
51 | # print(tf.shape(action_embed), action_embed)
52 |
53 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
54 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
55 | # intent_vector = tf.expand_dims(action_embed, 1)
56 |
57 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
58 | # action_prob = tf.nn.softmax(action_embed)
59 |
60 | # Mask out invalid actions (use tf.float32.min for stability)
61 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
62 | return action_embed + inf_mask, state
63 |
64 | return MyMaskActionsModel
65 |
66 |
67 | def getMaskActionsModelWithRawState(config, action_size):
68 | config = config
69 |
70 | class MyMaskActionsModelWithRawState(ParametricActionsModel):
71 | """Parametric action model that handles the dot product and masking.
72 |
73 | This assumes the outputs are logits for a single Categorical action dist.
74 | Getting this to work with a more complex output (e.g., if the action space
75 | is a tuple of several distributions) is also possible but left as an
76 | exercise to the reader.
77 | """
78 |
79 | def __init__(self,
80 | obs_space,
81 | action_space,
82 | num_outputs,
83 | model_config,
84 | name,
85 | **kw):
86 | # model_config = dict(model_config, **config)
87 | super(MyMaskActionsModelWithRawState, self).__init__(
88 | obs_space, action_space, num_outputs, model_config, name, action_embed_size=action_size, **kw)
89 | print('MyMaskActionsModelWithRawStateModel', self.action_embed_model.model_config)
90 | self.action_embed_model = TFModelWithRawState(
91 | obs_space, action_space, action_size,
92 | model_config, name + "_action_embed", config = config)
93 |
94 | def forward(self, input_dict, state, seq_lens):
95 | # Extract the available actions tensor from the observation.
96 | # avail_actions = input_dict["obs"]["avail_actions"]
97 | action_mask = input_dict["obs"]["action_mask"]
98 |
99 | # Compute the predicted action embedding
100 | action_embed, _ = self.action_embed_model(input_dict)
101 | # action_values = self.action_embed_model.value_function()
102 | # print(tf.shape(action_embed), action_embed)
103 |
104 | # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
105 | # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
106 | # intent_vector = tf.expand_dims(action_embed, 1)
107 |
108 | # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
109 | # action_prob = tf.nn.softmax(action_embed)
110 |
111 | # Mask out invalid actions (use tf.float32.min for stability)
112 | inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
113 | return action_embed + inf_mask, state
114 |
115 | return MyMaskActionsModelWithRawState
116 |
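117 | # --- Editor's sketch (not in the original source): the dict observation these
118 | # models expect, plus registration. Shapes are placeholders for the env's true
119 | # sizes.
120 | # from gym import spaces
121 | # from ray.rllib.models import ModelCatalog
122 | # obs_space = spaces.Dict({
123 | #     "action_mask": spaces.Box(0.0, 1.0, shape=(284,)),  # 1 = action allowed
124 | #     "obs": spaces.Box(-1e6, 1e6, shape=(256,)),         # true observation
125 | # })
126 | # ModelCatalog.register_custom_model("mask_model", getMaskActionsModel((256,), 284))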
--------------------------------------------------------------------------------
/rl4rs/nets/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from deepctr.layers.sequence import AttentionSequencePoolingLayer, DynamicGRU
4 | from tensorflow.keras import layers, regularizers
5 |
6 |
7 | def id_input_processing(category_feature_input, config):
8 | emb_size = config['emb_size']
9 | category_hash_size = config['category_hash_size']
10 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
11 | category_emb = emb_layer(category_feature_input)
12 | category_feature = layers.GlobalAveragePooling1D()(category_emb)
13 | return category_feature
14 |
15 |
16 | def id_input_processing_attn(category_feature_input, config):
17 | emb_size = config['emb_size']
18 | hidden_unit = config['hidden_units']
19 | category_hash_size = config['category_hash_size']
20 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
21 | category_emb = emb_layer(category_feature_input)
22 | category_feature = tf.keras.layers.Attention()([category_emb, category_emb])
23 | category_feature = tf.keras.layers.GlobalAveragePooling1D()(category_feature)
24 | category_feature_2 = layers.Flatten()(category_emb)
25 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2])
26 |
27 |
28 | def id_input_processing_lstm(category_feature_input, config):
29 | emb_size = config['emb_size']
30 | hidden_unit = config['hidden_units']
31 | category_hash_size = config['category_hash_size']
32 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
33 | category_emb = emb_layer(category_feature_input)
34 | category_feature = layers.GRU(units=hidden_unit)(category_emb)
35 | category_feature_2 = layers.Flatten()(category_emb)
36 | return layers.Concatenate(axis=-1)([category_feature, category_feature_2])
37 |
38 |
39 | def id_input_processing_concat(category_feature_input, config):
40 | emb_size = config['emb_size']
41 | category_hash_size = config['category_hash_size']
42 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
43 | category_emb = emb_layer(category_feature_input)
44 | category_feature = layers.Flatten()(category_emb)
45 | return category_feature
46 |
47 |
48 | def dense_input_processing(cross_feature_input, config):
49 | hidden_unit = config['hidden_units']
50 | cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature_input)
51 | cross_feature = layers.Dropout(0.2)(cross_feature)
52 | cross_feature = layers.Dense(hidden_unit, activation=layers.ELU())(cross_feature)
53 | cross_feature = layers.Dropout(0.2)(cross_feature)
54 | return cross_feature
55 |
56 |
57 | def sequence_input_concat(sequence_feature_input, config):
58 | category_hash_size = config['category_hash_size']
59 | hidden_unit = config['hidden_units']
60 | emb_size = config['emb_size']
61 | seq_num = config['seq_num']
62 |
63 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]])
64 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
65 |
66 | seqs_lstm = []
67 | for i in range(seq_num):
68 | seq_i = seq_index_layer([sequence_feature_input, i])
69 | seq_i_embeddings = emb_layer(seq_i)
70 | seq_i_lstm = layers.GlobalAveragePooling1D()(seq_i_embeddings)
71 | seqs_lstm.append(seq_i_lstm)
72 |
73 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0]
74 |
75 | return seqs_embeddings
76 |
77 |
78 | def sequence_input_LSTM(sequence_feature_input, config):
79 | category_hash_size = config['category_hash_size']
80 | hidden_unit = config['hidden_units']
81 | emb_size = config['emb_size']
82 | seq_num = config['seq_num']
83 |
84 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]])
85 |
86 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
87 |
88 | seqs_lstm = []
89 | for i in range(seq_num):
90 | seq_i = seq_index_layer([sequence_feature_input, i])
91 | seq_i_embeddings = emb_layer(seq_i)
92 | seq_i_lstm = layers.GRU(units=hidden_unit)(seq_i_embeddings)
93 | seqs_lstm.append(seq_i_lstm)
94 |
95 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_lstm) if len(seqs_lstm) > 1 else seqs_lstm[0]
96 |
97 | return seqs_embeddings
98 |
99 |
100 | def sequence_input_attn(input, config):
101 | category_hash_size = config['category_hash_size']
102 | hidden_unit = config['hidden_units']
103 | emb_size = config['emb_size']
104 | maxlen = config['maxlen']
105 | batch_size = config['batch_size']
106 | seq_num = config['seq_num']
107 |
108 | sequence_feature_input = input[0]
109 | id_slate_input = input[1]
110 |
111 | sequence_length = tf.fill((tf.shape(sequence_feature_input)[0], 1), maxlen)
112 | seq_index_layer = layers.Lambda(lambda x: x[0][:, x[1]])
113 | emb_layer = layers.Embedding(input_dim=category_hash_size, output_dim=emb_size)
114 | id_slate_embeddings = emb_layer(id_slate_input)
115 | id_slate_pooling = tf.math.reduce_mean(id_slate_embeddings, axis=1, keepdims=True)
116 | seqs_attn = []
117 | for i in range(seq_num):
118 | seq_i = seq_index_layer([sequence_feature_input, i])
119 | seq_i_embeddings = emb_layer(seq_i)
120 | rnn_outputs = DynamicGRU(emb_size, return_sequence=True)([seq_i_embeddings, sequence_length])
121 | scores = AttentionSequencePoolingLayer(att_hidden_units=(64, 16), return_score=True)([
122 | id_slate_pooling, rnn_outputs, sequence_length])
123 | final_state2 = DynamicGRU(emb_size * 2, gru_type='AUGRU', return_sequence=False
124 | )([rnn_outputs, sequence_length, tf.keras.layers.Permute([2, 1])(scores)])
125 | seqs_attn.append(final_state2)
126 |
127 | seqs_embeddings = layers.Concatenate(axis=-1)(seqs_attn) if len(seqs_attn) > 1 else seqs_attn[0]
128 |
129 | return tf.squeeze(seqs_embeddings, axis=1)
130 |
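131 | # --- Editor's sketch (not in the original source): minimal use of
132 | # id_input_processing inside a Keras graph; config values are placeholders for
133 | # the keys the helpers above read.
134 | if __name__ == "__main__":
135 |     cfg = {'emb_size': 8, 'category_hash_size': 1000, 'hidden_units': 16}
136 |     inp = layers.Input(shape=(21,), dtype='int64', name='category_ids')
137 |     out = layers.Dense(1)(id_input_processing(inp, cfg))
138 |     tf.keras.Model(inp, out).summary()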
--------------------------------------------------------------------------------
/rl4rs/nets/cql/encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import copy
5 | from typing import Any, ClassVar, Dict, List, Optional, Sequence, Type, Union
6 | from d3rlpy.models.encoders import EncoderFactory, Encoder, VectorEncoderWithAction, _create_activation, VectorEncoder
7 |
8 |
9 | class CustomVectorEncoder(VectorEncoder):
10 |
11 | def __init__(
12 | self,
13 | config,
14 | action_size,
15 | mask_size,
16 | with_q,
17 | observation_shape: Sequence[int],
18 | hidden_units: Optional[Sequence[int]] = None,
19 | use_batch_norm: bool = False,
20 | dropout_rate: Optional[float] = None,
21 | use_dense: bool = False,
22 | activation: nn.Module = nn.ReLU(),
23 | ):
24 | super().__init__(observation_shape, hidden_units, use_batch_norm, dropout_rate, use_dense, activation)
25 | self.action_size = action_size
26 | self.mask_size = mask_size
27 | self.with_q = with_q
28 | self.emb_size = 32
29 | self.emb_layer = nn.Embedding(action_size, self.emb_size)
30 | self.fc2 = nn.Linear(self._feature_size + self.emb_size * mask_size, action_size)
31 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32 | location_mask = config['location_mask']
33 | self.special_items = config['special_items']
34 | self.location_mask = torch.tensor(location_mask, device=self.device)
35 |
36 | def get_feature_size(self) -> int:
37 | if not self.with_q:
38 | return self._feature_size + self.emb_size * self.mask_size
39 | else:
40 | return self.action_size
41 |
42 | def forward(self, x: torch.Tensor) -> torch.Tensor:
43 | batch_size = x.shape[0]
44 | # mask
45 | prev_actions = x[:, -self.mask_size:-1].to(torch.long)
46 | cur_step = x[:, -1].to(torch.long)
47 | x_mask_layer = cur_step % 9 // 3
48 | mask = self.location_mask[x_mask_layer]
49 | for i in range(self.mask_size-1):
50 | mask[range(batch_size), prev_actions[:, i]] = 0
51 | h = self._fc_encode(x)
52 | if self._use_batch_norm:
53 | h = self._bns[-1](h)
54 | if self._dropout_rate is not None:
55 | h = self._dropouts[-1](h)
56 | prev_action_emb = nn.Flatten()(self.emb_layer(x[:, -self.mask_size:].to(torch.long)))
57 | h = torch.cat([h, prev_action_emb], dim=-1)
58 | if self.with_q:
59 | h = self.fc2(h)
60 | action_mask = mask < 0.01
61 | # h[action_mask] = -2 ** 15
62 | h[action_mask] = 0
63 | for i in range(batch_size):
64 | if len(np.intersect1d(prev_actions[i].cpu().numpy(), self.special_items)) > 0:
65 | h[i][self.special_items] = 0
66 | # h[i][self.special_items] = -2 ** 15
67 | return h
68 |
69 |
70 | class CustomVectorEncoderFactory(EncoderFactory):
71 | TYPE: ClassVar[str] = "vector"
72 | _hidden_units: Sequence[int]
73 | _activation: str
74 | _use_batch_norm: bool
75 | _dropout_rate: Optional[float]
76 | _use_dense: bool
77 |
78 | def __init__(
79 | self,
80 | config,
81 | action_size,
82 | mask_size,
83 | with_q=False,
84 | hidden_units: Optional[Sequence[int]] = None,
85 | activation: str = "relu",
86 | use_batch_norm: bool = False,
87 | dropout_rate: Optional[float] = None,
88 | use_dense: bool = False,
89 | ):
90 | self.config = config
91 | self.action_size = action_size
92 | self.mask_size = mask_size
93 | self.with_q = with_q
94 | if hidden_units is None:
95 | self._hidden_units = [256]
96 | else:
97 | self._hidden_units = hidden_units
98 | self._activation = activation
99 | self._use_batch_norm = use_batch_norm
100 | self._dropout_rate = dropout_rate
101 | self._use_dense = use_dense
102 |
103 | def create(self, observation_shape: Sequence[int]) -> CustomVectorEncoder:
104 | assert len(observation_shape) == 1
105 | return CustomVectorEncoder(
106 | config=self.config,
107 | action_size=self.action_size,
108 | mask_size=self.mask_size,
109 | with_q=self.with_q,
110 | observation_shape=observation_shape,
111 | hidden_units=self._hidden_units,
112 | use_batch_norm=self._use_batch_norm,
113 | dropout_rate=self._dropout_rate,
114 | use_dense=self._use_dense,
115 | activation=_create_activation(self._activation),
116 | )
117 |
118 | def create_with_action(
119 | self,
120 | observation_shape: Sequence[int],
121 | action_size: int,
122 | discrete_action: bool = False,
123 | ) -> VectorEncoderWithAction:
124 | assert len(observation_shape) == 1
125 | return VectorEncoderWithAction(
126 | observation_shape=observation_shape,
127 | action_size=action_size,
128 | hidden_units=self._hidden_units,
129 | use_batch_norm=self._use_batch_norm,
130 | dropout_rate=self._dropout_rate,
131 | use_dense=self._use_dense,
132 | discrete_action=discrete_action,
133 | activation=_create_activation(self._activation),
134 | )
135 |
136 | def get_params(self, deep: bool = False) -> Dict[str, Any]:
137 | if deep:
138 | hidden_units = copy.deepcopy(self._hidden_units)
139 | else:
140 | hidden_units = self._hidden_units
141 | params = {
142 | "hidden_units": hidden_units,
143 | "activation": self._activation,
144 | "use_batch_norm": self._use_batch_norm,
145 | "dropout_rate": self._dropout_rate,
146 | "use_dense": self._use_dense,
147 | }
148 | return params
149 |
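150 | # --- Editor's sketch (not in the original source): plugging the factory into a
151 | # d3rlpy algorithm. `config` must provide 'location_mask' and 'special_items';
152 | # the sizes below are placeholders (mask_size = page_items + 1).
153 | # import d3rlpy
154 | # factory = CustomVectorEncoderFactory(config, action_size=284, mask_size=10,
155 | #                                      with_q=True)
156 | # algo = d3rlpy.algos.DiscreteCQL(encoder_factory=factory)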
--------------------------------------------------------------------------------
/rl4rs/utils/d3rlpy_scorer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast
3 | from d3rlpy.metrics.scorer import AlgoProtocol, _make_batches
4 | from d3rlpy.dataset import Episode
5 | from rl4rs.policy.policy_model import policy_model
6 |
7 | WINDOW_SIZE = 1024
8 |
9 |
10 | # modify from https://github.com/takuseno/d3rlpy/blob/master/d3rlpy/metrics/scorer.py
11 | def soft_opc_scorer(
12 | return_threshold: float,
13 | ) -> Callable[[policy_model, List[Episode]], float]:
14 | r"""Returns Soft Off-Policy Classification metrics.
15 |
16 |     This function returns a scorer function in the standard scikit-learn
17 |     scorer style.
18 |     The metric evaluates the gap in action-value estimation between the
19 |     success episodes and all episodes.
20 | If the learned Q-function is optimal, action-values in success episodes
21 | are expected to be higher than the others.
22 | The success episode is defined as an episode with a return above the given
23 | threshold.
24 |
25 | .. math::
26 |
27 | \mathbb{E}_{s, a \sim D_{success}} [Q(s, a)]
28 | - \mathbb{E}_{s, a \sim D} [Q(s, a)]
29 |
30 | .. code-block:: python
31 |
32 | from d3rlpy.datasets import get_cartpole
33 | from d3rlpy.algos import DQN
34 | from d3rlpy.metrics.scorer import soft_opc_scorer
35 | from sklearn.model_selection import train_test_split
36 |
37 | dataset, _ = get_cartpole()
38 | train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)
39 |
40 | scorer = soft_opc_scorer(return_threshold=180)
41 |
42 | dqn = DQN()
43 | dqn.fit(train_episodes,
44 | eval_episodes=test_episodes,
45 | scorers={'soft_opc': scorer})
46 |
47 | References:
48 | * `Irpan et al., Off-Policy Evaluation via Off-Policy Classification.
49 | `_
50 |
51 | Args:
52 | return_threshold: threshold of success episodes.
53 |
54 | Returns:
55 | scorer function.
56 |
57 | """
58 |
59 | def scorer(algo: policy_model, episodes: List[Episode]) -> float:
60 | success_values = []
61 | all_values = []
62 | for episode in episodes:
63 | is_success = episode.compute_return() >= return_threshold
64 | for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames):
65 | values = algo.predict_q(batch.observations, batch.actions)
66 | values = cast(np.ndarray, values)
67 | all_values += values.reshape(-1).tolist()
68 | if is_success:
69 | success_values += values.reshape(-1).tolist()
70 | return float(np.mean(success_values) - np.mean(all_values))
71 |
72 | return scorer
73 |
74 |
75 | def dynamics_reward_prediction_mean_error_scorer(
76 | dynamics: policy_model, episodes: List[Episode]
77 | ) -> float:
78 |     r"""Returns the mean (signed) error of reward prediction.
79 | 
80 |     This metric suggests how well the dynamics model generalizes to the test
81 |     set. If the error is large, the dynamics model is overfitting.
82 |
83 | .. math::
84 |
85 | \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [(r_{t+1} - r')]
86 |
87 | where :math:`r' \sim T(s_t, a_t)`.
88 |
89 | Args:
90 | dynamics: dynamics model.
91 | episodes: list of episodes.
92 |
93 | Returns:
94 |         mean reward-prediction error (smaller is better).
95 |
96 | """
97 | total_errors = []
98 | for episode in episodes:
99 | for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
100 | pred = dynamics.predict_q(batch.observations, batch.actions)
101 | rewards = batch.next_rewards
102 | errors = (rewards - pred[1]).reshape(-1)
103 | total_errors += errors.tolist()
104 | # smaller is better
105 | return float(np.mean(total_errors))
106 |
107 |
108 | def dynamics_reward_prediction_abs_mean_error_scorer(
109 | dynamics: policy_model, episodes: List[Episode]
110 | ) -> float:
111 |     r"""Returns the mean absolute error of reward prediction.
112 | 
113 |     This metric suggests how well the dynamics model generalizes to the test
114 |     set. If the error is large, the dynamics model is overfitting.
115 |
116 | .. math::
117 |
118 | \mathbb{E}_{s_t, a_t, r_{t+1} \sim D} [abs(r_{t+1} - r')]
119 |
120 | where :math:`r' \sim T(s_t, a_t)`.
121 |
122 | Args:
123 | dynamics: dynamics model.
124 | episodes: list of episodes.
125 |
126 | Returns:
127 |         mean absolute reward-prediction error (smaller is better).
128 |
129 | """
130 | total_errors = []
131 | for episode in episodes:
132 | for batch in _make_batches(episode, WINDOW_SIZE, dynamics.policy.n_frames):
133 | pred = dynamics.predict_q(batch.observations, batch.actions)
134 | rewards = batch.next_rewards
135 | errors = np.abs(rewards - pred[1]).reshape(-1)
136 | total_errors += errors.tolist()
137 | # smaller is better
138 | return float(np.mean(total_errors))
139 |
140 | def discrete_action_match_scorer(
141 |     algo: policy_model, episodes: List[Episode]
142 | ) -> float:
143 |     r"""Returns percentage of identical actions between algorithm and dataset.
144 | 
145 |     This metric suggests how close the greedy policy is to the given
146 |     episodes in a discrete action space.
147 |     If the given episodes are near-optimal, a larger percentage is
148 |     better.
149 | 
150 |     .. math::
151 | 
152 |         \frac{1}{N} \sum_{t=1}^N \mathbb{1}\big[a_t
153 |             = \text{argmax}_a Q_\theta (s_t, a)\big]
154 | 
155 |     Args:
156 |         algo: algorithm.
157 |         episodes: list of episodes.
158 | 
159 |     Returns:
160 |         percentage of identical actions.
161 | 
162 |     """
163 |     total_matches = []
164 |     for episode in episodes:
165 |         for batch in _make_batches(episode, WINDOW_SIZE, algo.policy.n_frames):
166 |             actions = algo.predict_with_mask(batch.observations)
167 |             match = (batch.actions.reshape(-1) == actions).tolist()
168 |             total_matches += match
169 |     return float(np.mean(total_matches))
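170 | 
171 | # Minimal usage sketch (illustrative): these scorers plug into the same
172 | # `scorers` dict as the soft_opc example above; `policy` and `dynamics`
173 | # are assumed to be fitted policy_model wrappers and `episodes` a held-out
174 | # list of Episode objects.
175 | #
176 | #     match_rate = discrete_action_match_scorer(policy, episodes)
177 | #     reward_bias = dynamics_reward_prediction_mean_error_scorer(dynamics, episodes)
178 | #     reward_mae = dynamics_reward_prediction_abs_mean_error_scorer(dynamics, episodes)
179 | #     print(match_rate, reward_bias, reward_mae)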
--------------------------------------------------------------------------------
/script/exact_k_train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os, sys
3 | import gym
4 | import numpy as np
5 | import tensorflow as tf
6 | from rl4rs.nets.exact_k.model import Generator, Discriminator
7 | from rl4rs.env.slate import SlateRecEnv, SlateState
8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState
9 | from rl4rs.utils.fileutil import find_newest_files
10 |
11 | stage = sys.argv[1]
12 | extra_config = eval(sys.argv[2])
13 |
14 | config = {"epoch": 10000, "maxlen": 64, "batch_size": 256, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
15 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128, "page_items": 9,
16 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_b3_shuf.csv', "iteminfo_file": '../item_info.csv',
17 | "model_file": "../output/simulator_b2_dien/model", "support_rllib_mask": False, "is_eval": False, 'env': "SlateRecEnv-v0"}
18 |
19 | config = dict(config, **extra_config)
20 |
21 | if config['env'] == 'SeqSlateRecEnv-v0':
22 | config['max_steps'] = 36
23 | sim = SeqSlateRecEnv(config, state_cls=SeqSlateState)
24 | env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
25 | else:
26 | sim = SlateRecEnv(config, state_cls=SlateState)
27 | env = gym.make('SlateRecEnv-v0', recsim=sim)
28 |
29 | batch_size = config["batch_size"]
30 | action_size = config["action_size"]
31 | epoch = config["epoch"]
32 | max_steps = config["max_steps"]
33 | output_dir = os.environ['rl4rs_output_dir']
34 | model_dir = '%s/%s/' % (output_dir, 'exactk_' + config['env'] + '_' + config['trial_name'])
35 | model_save_path = model_dir + 'exact_k.ckpt'
36 | restore_file = find_newest_files('exact_k.ckpt*', model_dir)
37 | restore_file = restore_file[:restore_file.rfind('.')]
38 |
39 | l0_ssr_mask = np.zeros(action_size)
40 | location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], action_size)
41 | l1_mask, l2_mask, l3_mask = location_mask[0], location_mask[1], location_mask[2]
42 | l0_ssr_mask[special_items] = 1
43 |
44 | with tf.name_scope('Generator'):
45 | g = Generator(l1_mask,
46 | l2_mask,
47 | l3_mask,
48 | l0_ssr_mask,
49 | is_training=True,
50 | seq_length=action_size)
51 |
52 | with tf.name_scope('Discriminator'):
53 | d = Discriminator(seq_length=action_size)
54 |
55 | print("Graph loaded")
56 |
57 | if config.get('gpu', True):
58 | gpu_options = tf.GPUOptions(
59 | per_process_gpu_memory_fraction=0.5,
60 |             allow_growth=True)  # note: the memory fraction seems not to take effect
61 | sess_config = tf.ConfigProto(allow_soft_placement=True,
62 | gpu_options=gpu_options)
63 | else:
64 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
65 | sess_config = tf.ConfigProto()
66 |
67 | if stage == 'train':
68 | with tf.Session(config=sess_config) as sess:
69 |         sess.run(tf.global_variables_initializer())
70 | print('Generator training start!')
71 | reward_total = 0.0
72 | for episode in range(epoch):
73 | print('Generator episode: ', episode)
74 |
75 | observation = np.array(env.reset())
76 | item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
77 | hill_b_f = []
78 | for i in range(2):
79 | # get action
80 | sampled_card_idx, sampled_card = sess.run([g.sampled_path, g.sampled_result],
81 | feed_dict={g.user: observation, g.item_cand: item_cand})
82 | for step in range(config['max_steps']):
83 | observation_, reward, done, info = env.step(sampled_card[:, step])
84 |
85 |             env.reset()
86 |             # buffer this rollout: (card, card_idx, last-step reward) per user
87 |             hill_b_f.append(list(zip(sampled_card, sampled_card_idx, reward)))
88 |
89 | b_hill_f = np.transpose(hill_b_f, [1, 0, 2])
90 | samples = []
91 | for hill_f in b_hill_f:
92 | sorted_list = sorted(hill_f, key=lambda x: x[2], reverse=True)
93 |             samples.append(sorted_list[0])  # keep the best-reward rollout (hill climbing)
94 |
95 | (sampled_card, sampled_card_idx, reward) = zip(*samples)
96 | reward = np.array(reward)
97 |
98 | reward_ = sess.run(d.reward, feed_dict={d.user: observation})
99 | sess.run(d.train_op, feed_dict={d.user: observation, d.reward_target: reward})
100 |
101 | if episode % 50 == 0:
102 | print('episode:', episode)
103 | print('reward_target', np.mean(reward_))
104 | print('reward', np.mean(reward))
105 | print('actions', sampled_card[:10])
106 | reward = (reward - reward_)
107 |
108 | reward = reward / np.std(reward)
109 |
110 | sess.run(g.train_op, feed_dict={g.decode_target_ids: sampled_card_idx,
111 | g.reward: reward,
112 | g.item_cand: item_cand,
113 | g.user: observation,
114 | })
115 | gs_gen = sess.run(g.global_step)
116 |
117 | if episode % 500 == 0:
118 | saver = tf.train.Saver()
119 | saver.save(sess, model_save_path + '.' + str(episode))
120 | print('save model:' + model_save_path + '.' + str(episode))
121 | print('Generator training done!')
122 | saver = tf.train.Saver()
123 | saver.save(sess, model_save_path + '.' + str(episode))
124 | print('save model:' + model_save_path + '.' + str(episode))
125 | print("Done")
126 |
127 | if stage == 'eval':
128 | with tf.Session(config=sess_config) as sess:
129 |         sess.run(tf.global_variables_initializer())
130 | saver = tf.train.Saver()
131 | saver.restore(sess, restore_file)
132 | print('restore exact-k model from %s' % (restore_file))
133 | episode_reward = 0
134 | done = False
135 | epoch = 4
136 | for i in range(epoch):
137 | observation = np.array(env.reset())
138 | item_cand = np.array([list(range(0, config['action_size']))] * batch_size)
139 | sampled_card_idx, sampled_card = sess.run([g.greedy_path, g.greedy_result],
140 | feed_dict={g.user: observation, g.item_cand: item_cand})
141 | for step in range(config['max_steps']):
142 | observation_, reward, done, info = env.step(sampled_card[:, step])
143 | episode_reward += sum(reward)
144 | print('actions', sampled_card[:10])
145 | print('avg reward', episode_reward / config['batch_size'] / (i + 1))
146 |
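147 | # Usage sketch (illustrative; cf. reproductions/run_exact_k.sh). The stage
148 | # is argv[1] and the extra-config dict literal is argv[2], which must at
149 | # least supply 'trial_name':
150 | #
151 | #     python exact_k_train.py train "{'env': 'SlateRecEnv-v0', 'trial_name': 'a_all'}"
152 | #     python exact_k_train.py eval "{'env': 'SlateRecEnv-v0', 'trial_name': 'a_all'}"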
--------------------------------------------------------------------------------
/rl4rs/server/gymHttpClient.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import six.moves.urllib.parse as urlparse
3 | import json
4 | import numpy as np
5 | import os
6 | import gym
7 |
8 | import logging
9 |
10 | logger = logging.getLogger(__name__)
11 | logger.setLevel(logging.INFO)
12 |
13 |
14 | # modified from https://github.com/openai/gym-http-api
15 | class Client(object):
16 | """
17 | Gym client to interface with gym_http_server
18 | """
19 |
20 | def __init__(self, remote_base):
21 | self.remote_base = remote_base
22 | self.session = requests.Session()
23 | self.session.headers.update({'Content-type': 'application/json'})
24 |
25 | def _parse_server_error_or_raise_for_status(self, resp):
26 | j = {}
27 | try:
28 | j = resp.json()
29 |         except ValueError:
30 | # Most likely json parse failed because of network error, not server error (server
31 | # sends its errors in json). Don't let parse exception go up, but rather raise default
32 | # error.
33 | resp.raise_for_status()
34 | if resp.status_code != 200 and "message" in j: # descriptive message from server side
35 | raise ServerError(message=j["message"], status_code=resp.status_code)
36 | resp.raise_for_status()
37 | return j
38 |
39 | def _post_request(self, route, data):
40 | url = urlparse.urljoin(self.remote_base, route)
41 | # logger.info("POST {}\n{}".format(url, json.dumps(data)))
42 |         resp = self.session.post(url,
43 |                                  data=json.dumps(data))
44 | return self._parse_server_error_or_raise_for_status(resp)
45 |
46 | def _get_request(self, route):
47 | url = urlparse.urljoin(self.remote_base, route)
48 | # logger.info("GET {}".format(url))
49 | resp = self.session.get(url)
50 | return self._parse_server_error_or_raise_for_status(resp)
51 |
52 |     def env_create(self, env_id, config=None):
53 |         route = '/v1/envs/'
54 |         data = {'env_id': env_id, 'config': config or {}}
55 | resp = self._post_request(route, data)
56 | instance_id = resp['instance_id']
57 | return instance_id
58 |
59 | def env_list_all(self):
60 | route = '/v1/envs/'
61 | resp = self._get_request(route)
62 | all_envs = resp['all_envs']
63 | return all_envs
64 |
65 | def env_reset(self, instance_id):
66 | route = '/v1/envs/{}/reset/'.format(instance_id)
67 | resp = self._post_request(route, None)
68 | if 'observation' in resp:
69 | observation = resp['observation']
70 | else:
71 |             resp = self._post_request(route, None)  # retry once if no observation came back
72 | observation = resp['observation']
73 | return observation
74 |
75 | def env_step(self, instance_id, action, render=False):
76 | route = '/v1/envs/{}/step/'.format(instance_id)
77 | data = {'action': action, 'render': render}
78 | resp = self._post_request(route, data)
79 | observation = resp['observation']
80 | reward = resp['reward']
81 | done = resp['done']
82 | info = resp['info']
83 | return [observation, reward, done, info]
84 |
85 | def env_action_space_info(self, instance_id):
86 | route = '/v1/envs/{}/action_space/'.format(instance_id)
87 | resp = self._get_request(route)
88 | info = resp['info']
89 | return info
90 |
91 | def env_action_space_sample(self, instance_id):
92 | route = '/v1/envs/{}/action_space/sample'.format(instance_id)
93 | resp = self._get_request(route)
94 | action = resp['action']
95 | return action
96 |
97 | def env_action_space_contains(self, instance_id, x):
98 | route = '/v1/envs/{}/action_space/contains/{}'.format(instance_id, x)
99 | resp = self._get_request(route)
100 | member = resp['member']
101 | return member
102 |
103 | def env_observation_space_info(self, instance_id):
104 | route = '/v1/envs/{}/observation_space/'.format(instance_id)
105 | resp = self._get_request(route)
106 | info = resp['info']
107 | return info
108 |
109 | def env_observation_space_contains(self, instance_id, params):
110 | route = '/v1/envs/{}/observation_space/contains'.format(instance_id)
111 | resp = self._post_request(route, params)
112 | member = resp['member']
113 | return member
114 |
115 | def env_monitor_start(self, instance_id, directory,
116 | force=False, resume=False, video_callable=False):
117 | route = '/v1/envs/{}/monitor/start/'.format(instance_id)
118 | data = {'directory': directory,
119 | 'force': force,
120 | 'resume': resume,
121 | 'video_callable': video_callable}
122 | self._post_request(route, data)
123 |
124 | def env_monitor_close(self, instance_id):
125 | route = '/v1/envs/{}/monitor/close/'.format(instance_id)
126 | self._post_request(route, None)
127 |
128 | def env_close(self, instance_id):
129 | route = '/v1/envs/{}/close/'.format(instance_id)
130 | self._post_request(route, None)
131 |
132 | def upload(self, training_dir, algorithm_id=None, api_key=None):
133 | if not api_key:
134 | api_key = os.environ.get('OPENAI_GYM_API_KEY')
135 |
136 | route = '/v1/upload/'
137 | data = {'training_dir': training_dir,
138 | 'algorithm_id': algorithm_id,
139 | 'api_key': api_key}
140 | self._post_request(route, data)
141 |
142 | def shutdown_server(self):
143 | route = '/v1/shutdown/'
144 | self._post_request(route, None)
145 |
146 |
147 | class ServerError(Exception):
148 | def __init__(self, message, status_code=None):
149 | Exception.__init__(self)
150 | self.message = message
151 | if status_code is not None:
152 | self.status_code = status_code
153 |
154 |
155 | if __name__ == '__main__':
156 | remote_base = 'http://127.0.0.1:5000'
157 | client = Client(remote_base)
158 |
159 | # Create environment
160 | env_id = 'CartPole-v0'
161 | instance_id = client.env_create(env_id)
162 | print(instance_id)
163 | # Check properties
164 | all_envs = client.env_list_all()
165 | action_info = client.env_action_space_info(instance_id)
166 | obs_info = client.env_observation_space_info(instance_id)
167 | print(obs_info)
168 | # Run a single step
169 | client.env_monitor_start(instance_id, directory='tmp', force=True)
170 | init_obs = client.env_reset(instance_id)
171 | [observation, reward, done, info] = client.env_step(instance_id, 1, False)
172 | client.env_monitor_close(instance_id)
173 | print(observation, reward, done, info)
174 | # client.upload(training_dir='tmp')
175 |
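176 | # Hedged sketch: the same client can target the RL4RS env server (assumed
177 | # to listen on http://127.0.0.1:5000, as in reproductions/run_modelfree_rl.sh),
178 | # passing the env config dict through env_create:
179 | #
180 | #     client = Client('http://127.0.0.1:5000')
181 | #     instance_id = client.env_create(
182 | #         'SlateRecEnv-v0',
183 | #         config={'sample_file': '../dataset/rl4rs_dataset_a_shuf.csv'})
184 | #     obs = client.env_reset(instance_id)
185 | #     action = client.env_action_space_sample(instance_id)
186 | #     obs, reward, done, info = client.env_step(instance_id, action)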
--------------------------------------------------------------------------------
/script/batchrl_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import d3rlpy
5 | import sys
6 | import torch
7 | from rl4rs.env.slate import SlateRecEnv, SlateState
8 | from rl4rs.env.seqslate import SeqSlateRecEnv, SeqSlateState
9 | from script import batchrl_trainer
10 | from d3rlpy.dataset import MDPDataset
11 | from script.offline_evaluation import ope_eval
12 | from rl4rs.policy.behavior_model import behavior_model
13 | from rl4rs.policy.policy_model import policy_model
14 | from rl4rs.nets.cql.encoder import CustomVectorEncoderFactory
15 | from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer
16 | from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer
17 | from d3rlpy.metrics.scorer import dynamics_prediction_variance_scorer
18 |
19 | algo = sys.argv[1]
20 | stage = sys.argv[2]
21 | extra_config = eval(sys.argv[3]) if len(sys.argv) >= 4 else {}
22 |
23 | config = {"epoch": 4, "maxlen": 64, "batch_size": 2048, "action_size": 284, "class_num": 2, "dense_feature_num": 432,
24 | "category_feature_num": 21, "category_hash_size": 100000, "seq_num": 2, "emb_size": 128,
25 | "hidden_units": 128, "max_steps": 9, "sample_file": '../dataset/rl4rs_dataset_a_shuf.csv',
26 | "model_file": "../output/rl4rs_dataset_a_dnn/model", 'gpu': True, "page_items": 9, 'action_emb_size':32,
27 | "iteminfo_file": '../dataset/item_info.csv', "support_d3rl_mask": True, "is_eval": True,
28 | "CQL_alpha": 1, 'env': 'SlateRecEnv-v0', 'trial_name': 'a_all'}
29 |
30 | config = dict(config, **extra_config)
31 |
32 | if config['env'] == 'SeqSlateRecEnv-v0':
33 | config['max_steps'] = 36
34 | location_mask, special_items = SeqSlateState.get_mask_from_file(config['iteminfo_file'], config['action_size'])
35 | config['location_mask'] = location_mask
36 | config['special_items'] = special_items
37 | elif config['env'] == 'SlateRecEnv-v0':
38 | location_mask, special_items = SlateState.get_mask_from_file(config['iteminfo_file'], config['action_size'])
39 | config['location_mask'] = location_mask
40 | config['special_items'] = special_items
41 | else:
42 | assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
43 |
44 | if algo in ('MOPO', 'COMBO') or 'conti' in algo:
45 | config["support_conti_env"] = True
46 |
47 | if not config.get('gpu', True):
48 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
49 | torch.cuda.is_available = lambda: False
50 | print('CUDA_VISIBLE_DEVICES', torch.cuda.is_available())
51 |
52 | if not config.get("support_conti_env", False):
53 |     trial_name = config['env'] + '_' + config['trial_name'] + '.h5'
54 | elif config.get("support_onehot_action", False):
55 |     config['action_emb_size'] = config["action_size"]
56 |     trial_name = config['env'] + '_' + config['trial_name'] + '_onehot.h5'
57 | else:
58 |     trial_name = config['env'] + '_' + config['trial_name'] + '_conti.h5'
59 | dataset_dir = os.environ['rl4rs_dataset_dir']
60 | output_dir = os.environ['rl4rs_output_dir']
61 | dataset_save_path = dataset_dir + '/' + trial_name
62 | dynamics_save_path = output_dir + '/' + 'dynamics' + '_' + trial_name
63 | model_save_path = output_dir + '/' + algo + '_' + trial_name
64 | scaler = None
65 | print(trial_name, config)
66 |
67 | try:
68 | dataset = MDPDataset.load(dataset_save_path)
69 | except Exception:
70 | dataset = None
71 |
72 | try:
73 | dynamics = batchrl_trainer.get_model(config, 'dynamics')
74 | dynamics = batchrl_trainer.build_with_dataset(dynamics, dataset)
75 | dynamics.load_model(dynamics_save_path)
76 | except Exception:
77 | dynamics = None
78 |
79 | if stage == 'dataset_generate':
80 | if config['env'] == 'SlateRecEnv-v0':
81 | if not config.get("support_conti_env",False):
82 | batchrl_trainer.data_generate_rl4rs_a(config, dataset_save_path)
83 | else:
84 | batchrl_trainer.data_generate_rl4rs_a_conti(config, dataset_save_path)
85 | elif config['env'] == 'SeqSlateRecEnv-v0':
86 | if not config.get("support_conti_env",False):
87 | batchrl_trainer.data_generate_rl4rs_b(config, dataset_save_path)
88 | else:
89 | batchrl_trainer.data_generate_rl4rs_b_conti(config, dataset_save_path)
90 | else:
91 | batchrl_trainer.data_generate_rl4rs_a(config, dataset_save_path)
92 | assert config['env'] in ('SlateRecEnv-v0', 'SeqSlateRecEnv-v0')
93 |
94 | if stage == 'train_dynamics' or (stage == 'train' and algo == 'dynamics'):
95 | dynamics = batchrl_trainer.get_model(config, 'dynamics')
96 | print('get_action_size', dataset.episodes[0].get_action_size())
97 | dynamics.fit(dataset,
98 | eval_episodes=dataset.episodes[-3000:],
99 | n_epochs=10,
100 | show_progress=False,
101 | scorers={
102 | 'observation_error': dynamics_observation_prediction_error_scorer,
103 | 'reward_error': dynamics_reward_prediction_error_scorer,
104 | 'variance': dynamics_prediction_variance_scorer,
105 | }
106 | )
107 | dynamics.save_model(dynamics_save_path)
108 |
109 | if stage == 'train':
110 | model = batchrl_trainer.get_model(config, algo, dynamics)
111 | model.fit(dataset,
112 | eval_episodes=dataset.episodes[-3000:],
113 | n_epochs=config['epoch'],
114 | show_progress=False)
115 | model.save_model(model_save_path)
116 |
117 | if stage == 'eval':
118 | default_soft_opc_score = 90 \
119 | if config['env'] == 'SlateRecEnv-v0' \
120 | else 90 * 2
121 | soft_opc_score = config.get('soft_opc_score', default_soft_opc_score)
122 | model = batchrl_trainer.get_model(config, algo, dynamics)
123 | model = batchrl_trainer.build_with_dataset(model, dataset)
124 | model.load_model(model_save_path)
125 | eval_episodes = random.sample(dataset.episodes, 2048 * 4)
126 | policy = policy_model(model, config=config)
127 | # batchrl_trainer.d3rlpy_eval(eval_episodes, policy, soft_opc_score)
128 | batchrl_trainer.evaluate(config, policy)
129 |
130 | if stage == 'ope':
131 | dataset_dir = os.environ['rl4rs_dataset_dir']
132 | sample_model = behavior_model(config, modelfile=dataset_dir + '/logged_policy.h5')
133 | model = batchrl_trainer.get_model(config, algo, dynamics)
134 | model = batchrl_trainer.build_with_dataset(model, dataset)
135 | model.load_model(model_save_path)
136 | eval_config = config.copy()
137 | eval_config["is_eval"] = True
138 | eval_config["batch_size"] = 2048
139 | eval_config["epoch"] = 1
140 |     if config['env'] == 'SeqSlateRecEnv-v0':
141 |         eval_config['max_steps'] = 36
142 | sim = SeqSlateRecEnv(eval_config, state_cls=SeqSlateState)
143 | eval_env = gym.make('SeqSlateRecEnv-v0', recsim=sim)
144 | else:
145 | sim = SlateRecEnv(eval_config, state_cls=SlateState)
146 | eval_env = gym.make('SlateRecEnv-v0', recsim=sim)
147 | ope_eval(eval_config, eval_env, model, sample_model=sample_model)
148 |
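149 | # Usage sketch (illustrative; algo and stage come from argv, an optional
150 | # dict literal overrides the config above; CQL is an assumed algo name
151 | # accepted by batchrl_trainer.get_model):
152 | #
153 | #     python batchrl_train.py CQL dataset_generate
154 | #     python batchrl_train.py CQL train "{'env': 'SlateRecEnv-v0'}"
155 | #     python batchrl_train.py CQL eval
156 | #     python batchrl_train.py CQL ope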
--------------------------------------------------------------------------------
/reproductions/run_modelfree_rl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
7 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
8 | script_dir=${rl4rs_benchmark_dir}/script
9 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
10 |
11 | algo=$1
12 |
13 | cd ${script_dir}
14 |
15 | # experiment in a_all env, train in a_all sample and test in a_all sample
16 | python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log &&
17 | python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log &&
18 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_all_${algo}.log &&
19 |
20 |
21 | # experiment in a_all env, train in a_train sample and test in a_test sample
22 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log &&
23 | #python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log &&
24 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_dien/model','trial_name':'a_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_train_${algo}.log &&
25 |
26 |
27 | # experiment train in a_sl env and test in a_rl env
28 | #python -u modelfree_train.py $algo "train" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_sl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log &&
29 | #python -u modelfree_train.py $algo "eval" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log &&
30 | #python -u modelfree_train.py $algo "ope" "{'env':'SlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_a_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_a_rl_dien/model','trial_name':'a_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_a_sl_${algo}.log &&
31 |
32 |
33 | # experiment in b_all env, train in b_all sample and test in b_all sample
34 | python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log &&
35 | python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log &&
36 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_all','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_all_${algo}.log &&
37 |
38 |
39 | # experiment in b_all env, train in b_train sample and test in b_test sample
40 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_train_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log &&
41 | #python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log &&
42 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_test_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_dien/model','trial_name':'b_train','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_train_${algo}.log &&
43 |
44 |
45 | # experiment train in b_sl env and test in b_rl env
46 | #python -u modelfree_train.py $algo "train" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_sl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log &&
47 | #python -u modelfree_train.py $algo "eval" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log &&
48 | #python -u modelfree_train.py $algo "ope" "{'env':'SeqSlateRecEnv-v0','iteminfo_file':'${rl4rs_dataset_dir}/item_info.csv','sample_file':'${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl_shuf.csv','model_file':'${rl4rs_output_dir}/simulator_b2_rl_dien/model','trial_name':'b_sl','remote_base':'http://127.0.0.1:5000'}" >> ${rl4rs_output_dir}/modelfree_b_sl_${algo}.log &&
49 |
50 | echo "1"
51 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: rl4rs
2 | channels:
3 | - defaults
4 | dependencies:
5 | - _libgcc_mutex=0.1=main
6 | - _openmp_mutex=4.5=1_gnu
7 | - _tflow_select=2.3.0=mkl
8 | - absl-py=0.15.0=pyhd3eb1b0_0
9 | - argon2-cffi=20.1.0=py36h27cfd23_1
10 | - astor=0.8.1=py36h06a4308_0
11 | - async_generator=1.10=py36h28b3542_0
12 | - attrs=21.2.0=pyhd3eb1b0_0
13 | - backcall=0.2.0=pyhd3eb1b0_0
14 | - blas=1.0=mkl
15 | - bleach=4.0.0=pyhd3eb1b0_0
16 | - c-ares=1.17.1=h27cfd23_0
17 | - ca-certificates=2021.10.26=h06a4308_2
18 | - certifi=2021.5.30=py36h06a4308_0
19 | - cffi=1.14.6=py36h400218f_0
20 | - coverage=5.5=py36h27cfd23_2
21 | - cython=0.29.24=py36h295c915_0
22 | - dataclasses=0.8=pyh4f3eec9_6
23 | - dbus=1.13.18=hb2f20db_0
24 | - decorator=5.1.0=pyhd3eb1b0_0
25 | - defusedxml=0.7.1=pyhd3eb1b0_0
26 | - entrypoints=0.3=py36_0
27 | - expat=2.4.1=h2531618_2
28 | - fontconfig=2.13.1=h6c09931_0
29 | - freetype=2.11.0=h70c0345_0
30 | - glib=2.69.1=h5202010_0
31 | - google-pasta=0.2.0=pyhd3eb1b0_0
32 | - grpcio=1.36.1=py36h2157cd5_1
33 | - gst-plugins-base=1.14.0=h8213a91_2
34 | - gstreamer=1.14.0=h28cd5cc_2
35 | - h5py=2.10.0=py36hd6299e0_1
36 | - hdf5=1.10.6=hb1b8bf9_0
37 | - icu=58.2=he6710b0_3
38 | - importlib-metadata=4.8.1=py36h06a4308_0
39 | - importlib_metadata=4.8.1=hd3eb1b0_0
40 | - intel-openmp=2021.4.0=h06a4308_3561
41 | - ipykernel=5.3.4=py36h5ca1d4c_0
42 | - ipython=7.16.1=py36h5ca1d4c_0
43 | - ipython_genutils=0.2.0=pyhd3eb1b0_1
44 | - ipywidgets=7.6.5=pyhd3eb1b0_1
45 | - jedi=0.17.0=py36_0
46 | - jinja2=3.0.2=pyhd3eb1b0_0
47 | - jpeg=9d=h7f8727e_0
48 | - jsonschema=3.2.0=pyhd3eb1b0_2
49 | - jupyter=1.0.0=py36_7
50 | - jupyter_client=7.1.0=pyhd3eb1b0_0
51 | - jupyter_console=6.4.0=pyhd3eb1b0_0
52 | - jupyter_core=4.8.1=py36h06a4308_0
53 | - jupyterlab_pygments=0.1.2=py_0
54 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
55 | - keras-applications=1.0.8=py_1
56 | - keras-preprocessing=1.1.2=pyhd3eb1b0_0
57 | - ld_impl_linux-64=2.35.1=h7274673_9
58 | - libffi=3.3=he6710b0_2
59 | - libgcc-ng=9.3.0=h5101ec6_17
60 | - libgfortran-ng=7.5.0=ha8ba4b0_17
61 | - libgfortran4=7.5.0=ha8ba4b0_17
62 | - libgomp=9.3.0=h5101ec6_17
63 | - libpng=1.6.37=hbc83047_0
64 | - libprotobuf=3.17.2=h4ff587b_1
65 | - libsodium=1.0.18=h7b6447c_0
66 | - libstdcxx-ng=9.3.0=hd4cf53a_17
67 | - libuuid=1.0.3=h7f8727e_2
68 | - libxcb=1.14=h7b6447c_0
69 | - libxml2=2.9.12=h03d6c58_0
70 | - markdown=3.3.4=py36h06a4308_0
71 | - markupsafe=2.0.1=py36h27cfd23_0
72 | - mistune=0.8.4=py36h7b6447c_0
73 | - mkl=2020.2=256
74 | - mkl-service=2.3.0=py36he8ac12f_0
75 | - mkl_fft=1.3.0=py36h54f3939_0
76 | - mkl_random=1.1.1=py36h0573a6f_0
77 | - nbclient=0.5.3=pyhd3eb1b0_0
78 | - nbconvert=6.0.7=py36_0
79 | - nbformat=5.1.3=pyhd3eb1b0_0
80 | - ncurses=6.3=h7f8727e_2
81 | - nest-asyncio=1.5.1=pyhd3eb1b0_0
82 | - notebook=6.4.3=py36h06a4308_0
83 | - numpy=1.19.2=py36h54aff64_0
84 | - numpy-base=1.19.2=py36hfa32c7d_0
85 | - openssl=1.1.1m=h7f8727e_0
86 | - packaging=21.3=pyhd3eb1b0_0
87 | - pandoc=2.12=h06a4308_0
88 | - pandocfilters=1.4.3=py36h06a4308_1
89 | - parso=0.8.2=pyhd3eb1b0_0
90 | - pcre=8.45=h295c915_0
91 | - pexpect=4.8.0=pyhd3eb1b0_3
92 | - pickleshare=0.7.5=pyhd3eb1b0_1003
93 | - pip=21.2.2=py36h06a4308_0
94 | - prometheus_client=0.12.0=pyhd3eb1b0_0
95 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0
96 | - prompt_toolkit=3.0.20=hd3eb1b0_0
97 | - ptyprocess=0.7.0=pyhd3eb1b0_2
98 | - pycparser=2.21=pyhd3eb1b0_0
99 | - pygments=2.10.0=pyhd3eb1b0_0
100 | - pyparsing=3.0.4=pyhd3eb1b0_0
101 | - pyqt=5.9.2=py36h05f1152_2
102 | - python=3.6.13=h12debd9_1
103 | - python-dateutil=2.8.2=pyhd3eb1b0_0
104 | - pyzmq=22.2.1=py36h295c915_1
105 | - qt=5.9.7=h5867ecd_1
106 | - qtconsole=5.1.1=pyhd3eb1b0_0
107 | - qtpy=1.10.0=pyhd3eb1b0_0
108 | - readline=8.1=h27cfd23_0
109 | - scipy=1.3.1=py36h7c811a0_0
110 | - send2trash=1.8.0=pyhd3eb1b0_1
111 | - setuptools=58.0.4=py36h06a4308_0
112 | - sip=4.19.8=py36hf484d3e_0
113 | - six=1.16.0=pyhd3eb1b0_0
114 | - sqlite=3.37.0=hc218d9a_0
115 | - termcolor=1.1.0=py36h06a4308_1
116 | - terminado=0.9.4=py36h06a4308_0
117 | - testpath=0.5.0=pyhd3eb1b0_0
118 | - tk=8.6.11=h1ccaba5_0
119 | - tornado=6.1=py36h27cfd23_0
120 | - traitlets=4.3.3=py36h06a4308_0
121 | - typing_extensions=3.10.0.2=pyh06a4308_0
122 | - wcwidth=0.2.5=pyhd3eb1b0_0
123 | - webencodings=0.5.1=py36_1
124 | - werkzeug=2.0.2=pyhd3eb1b0_0
125 | - wheel=0.37.0=pyhd3eb1b0_1
126 | - widgetsnbextension=3.5.1=py36_0
127 | - wrapt=1.12.1=py36h7b6447c_1
128 | - xz=5.2.5=h7b6447c_0
129 | - zeromq=4.3.4=h2531618_0
130 | - zipp=3.6.0=pyhd3eb1b0_0
131 | - zlib=1.2.11=h7f8727e_4
132 | - pip:
133 | - aiohttp==3.7.4.post0
134 | - aiohttp-cors==0.7.0
135 | - aioredis==1.3.1
136 | - antlr4-python3-runtime==4.8
137 | - async-timeout==3.0.1
138 | - blessings==1.7
139 | - cachetools==4.2.2
140 | - chardet==4.0.0
141 | - charset-normalizer==2.0.6
142 | - click==8.0.1
143 | - cloudpickle==1.6.0
144 | - colorama==0.4.4
145 | - contextvars==2.4
146 | - d3rlpy==0.91
147 | - deepctr==0.9.0
148 | - dm-tree==0.1.6
149 | - fairseq==0.10.2
150 | - filelock==3.0.12
151 | - flask==1.1.2
152 | - gast==0.2.2
153 | - google-api-core==1.31.2
154 | - google-auth==1.35.0
155 | - googleapis-common-protos==1.53.0
156 | - gpustat==0.6.0
157 | - gputil==1.4.0
158 | - greenlet==1.1.2
159 | - gym==0.19.0
160 | - hiredis==2.0.0
161 | - hydra-core==1.1.1
162 | - idna==3.2
163 | - idna-ssl==1.1.0
164 | - immutables==0.16
165 | - importlib-resources==5.2.2
166 | - itsdangerous==2.0.1
167 | - joblib==1.0.1
168 | - keras==2.2.5
169 | - keras-embed-sim==0.10.0
170 | - keras-layer-normalization==0.16.0
171 | - keras-multi-head==0.29.0
172 | - keras-pos-embd==0.13.0
173 | - keras-position-wise-feed-forward==0.8.0
174 | - keras-self-attention==0.51.0
175 | - keras-transformer==0.40.0
176 | - lightseq==2.1.4
177 | - lz4==3.1.3
178 | - msgpack==1.0.2
179 | - multidict==5.1.0
180 | - ninja==1.10.2
181 | - nvidia-ml-py3==7.352.0
182 | - omegaconf==2.1.1
183 | - opencensus==0.7.13
184 | - opencensus-context==0.1.2
185 | - opencv-python-headless==4.3.0.36
186 | - opt-einsum==3.3.0
187 | - pandas==1.1.5
188 | - pandasql==0.7.3
189 | - portalocker==2.3.2
190 | - protobuf==3.19.3
191 | - psutil==5.8.0
192 | - py-spy==0.3.9
193 | - pyasn1==0.4.8
194 | - pyasn1-modules==0.2.8
195 | - pydantic==1.8.2
196 | - pyrsistent==0.18.0
197 | - pytz==2021.1
198 | - pyyaml==5.4.1
199 | - ray==1.5.1
200 | - redis==3.5.3
201 | - regex==2021.8.28
202 | - requests==2.27.1
203 | - rsa==4.7.2
204 | - sacrebleu==2.0.0
205 | - sacremoses==0.0.45
206 | - scikit-learn==0.24.2
207 | - sklearn==0.0
208 | - sqlalchemy==1.4.29
209 | - structlog==21.3.0
210 | - tabulate==0.8.9
211 | - tensorboard==1.15.0
212 | - tensorboardx==2.4.1
213 | - tensorflow-estimator==1.15.1
214 | - tensorflow-gpu==1.15.0
215 | - threadpoolctl==2.2.0
216 | - torch==1.9.0
217 | - tqdm==4.62.2
218 | - urllib3==1.26.6
219 | - yarl==1.6.3
220 | prefix: /project/miniconda3/envs/rl4rs
--------------------------------------------------------------------------------
/reproductions/run_split.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | conda activate rl4rs
4 | script_abs=$(readlink -f "$0")
5 | rl4rs_benchmark_dir=$(dirname $script_abs)/..
6 | rl4rs_dataset_dir=${rl4rs_benchmark_dir}/dataset
7 | script_dir=${rl4rs_benchmark_dir}/script
8 | rl4rs_output_dir=${rl4rs_benchmark_dir}/output
9 | mkdir -p $rl4rs_output_dir
10 | export rl4rs_benchmark_dir && export rl4rs_output_dir && export rl4rs_dataset_dir
11 |
12 | cd $rl4rs_dataset_dir
13 |
14 | #raw dataset
15 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a.csv &&
16 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv >> rl4rs_dataset_a.csv &&
17 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b.csv &&
18 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_b_rl.csv >> rl4rs_dataset_b.csv &&
19 |
20 | #train/test split by session id (field 2) modulo 10: 0-5 train, 6-9 test
21 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_train.csv &&
22 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_sl.csv > rl4rs_dataset_a_sl_test.csv &&
23 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_train.csv &&
24 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_a_rl.csv > rl4rs_dataset_a_rl_test.csv &&
25 |
26 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_train.csv &&
27 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_sl.csv > rl4rs_dataset_b_sl_test.csv &&
28 | awk -F "@" 'NR>1 && $2%10<=5 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_train.csv &&
29 | awk -F "@" 'NR>1 && $2%10>=6 {print}' rl4rs_dataset_b_rl.csv > rl4rs_dataset_b_rl_test.csv &&
30 |
31 | cat rl4rs_dataset_a_sl_train.csv > rl4rs_dataset_a_train.csv &&
32 | cat rl4rs_dataset_a_rl_train.csv >> rl4rs_dataset_a_train.csv &&
33 | cat rl4rs_dataset_b_sl_train.csv > rl4rs_dataset_b_train.csv &&
34 | cat rl4rs_dataset_b_rl_train.csv >> rl4rs_dataset_b_train.csv &&
35 |
36 | cat rl4rs_dataset_a_sl_test.csv > rl4rs_dataset_a_test.csv &&
37 | cat rl4rs_dataset_a_rl_test.csv >> rl4rs_dataset_a_test.csv &&
38 | cat rl4rs_dataset_b_sl_test.csv > rl4rs_dataset_b_test.csv &&
39 | cat rl4rs_dataset_b_rl_test.csv >> rl4rs_dataset_b_test.csv &&
40 |
41 | cat rl4rs_dataset_a_train.csv > rl4rs_dataset_a.csv &&
42 | cat rl4rs_dataset_a_test.csv >> rl4rs_dataset_a.csv &&
43 | cat rl4rs_dataset_b_train.csv > rl4rs_dataset_b.csv &&
44 | cat rl4rs_dataset_b_test.csv >> rl4rs_dataset_b.csv &&
45 |
46 | #dataset_b
47 | cd ${script_dir} &&
48 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "data_augment" &&
49 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "data_augment" &&
50 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "data_augment" &&
51 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "data_augment" &&
52 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "data_augment" &&
53 |
54 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_sl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_sl.csv" "slate2trajectory" &&
55 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_rl.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_rl.csv" "slate2trajectory" &&
56 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_train.csv" "slate2trajectory" &&
57 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3_test.csv" "slate2trajectory" &&
58 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2.csv" "${rl4rs_dataset_dir}/rl4rs_dataset_b3.csv" "slate2trajectory" &&
59 |
60 |
61 | #shuffle for RL Env.
62 | cd $rl4rs_dataset_dir &&
63 | cat rl4rs_dataset_a.csv|shuf > rl4rs_dataset_a_shuf.csv &&
64 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_sl.csv|shuf > rl4rs_dataset_a_sl_shuf.csv &&
65 | awk -F "@" 'NR>1 {print}' rl4rs_dataset_a_rl.csv|shuf > rl4rs_dataset_a_rl_shuf.csv &&
66 | cat rl4rs_dataset_a_train.csv|shuf > rl4rs_dataset_a_train_shuf.csv &&
67 | cat rl4rs_dataset_a_test.csv|shuf > rl4rs_dataset_a_test_shuf.csv &&
68 | cat rl4rs_dataset_b3.csv|shuf > rl4rs_dataset_b3_shuf.csv &&
69 | cat rl4rs_dataset_b3_sl.csv|shuf > rl4rs_dataset_b3_sl_shuf.csv &&
70 | cat rl4rs_dataset_b3_rl.csv|shuf > rl4rs_dataset_b3_rl_shuf.csv &&
71 | cat rl4rs_dataset_b3_train.csv|shuf > rl4rs_dataset_b3_train_shuf.csv &&
72 | cat rl4rs_dataset_b3_test.csv|shuf > rl4rs_dataset_b3_test_shuf.csv &&
73 |
74 |
75 | cd $(dirname $script_abs) &&
76 | bash file_split.sh "rl4rs_dataset_a_sl_shuf.csv" &&
77 | bash file_split.sh "rl4rs_dataset_a_rl_shuf.csv" &&
78 | bash file_split.sh "rl4rs_dataset_a_train_shuf.csv" &&
79 | bash file_split.sh "rl4rs_dataset_a_test_shuf.csv" &&
80 | bash file_split.sh "rl4rs_dataset_a_shuf.csv" &&
81 | bash file_split.sh "rl4rs_dataset_b2_sl.csv" &&
82 | bash file_split.sh "rl4rs_dataset_b2_rl.csv" &&
83 | bash file_split.sh "rl4rs_dataset_b2_train.csv" &&
84 | bash file_split.sh "rl4rs_dataset_b2_test.csv" &&
85 | bash file_split.sh "rl4rs_dataset_b2.csv"
86 |
87 |
88 | #tfrecord for supervised learning
89 | cd ${script_dir}
90 |
91 | for ((i=0;i<5;i=i+1))
92 | do
93 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_sl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_sl.tfrecord.${i}" "tfrecord_item" &&
94 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_rl_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_rl.tfrecord.${i}" "tfrecord_item" &&
95 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_train_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train.tfrecord.${i}" "tfrecord_item" &&
96 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_test_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test.tfrecord.${i}" "tfrecord_item" &&
97 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_a_shuf.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_a.tfrecord.${i}" "tfrecord_item" &&
98 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_sl.tfrecord.${i}" "tfrecord_item" &&
99 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_rl.tfrecord.${i}" "tfrecord_item" &&
100 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_train.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train.tfrecord.${i}" "tfrecord_item" &&
101 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2_test.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test.tfrecord.${i}" "tfrecord_item" &&
102 | python data_preprocess.py "${rl4rs_output_dir}/rl4rs_dataset_b2.csv_000${i}.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2.tfrecord.${i}" "tfrecord_item" &&
103 | echo "1"
104 | done
105 |
106 | cd ${script_dir} &&
107 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_train_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_train_slate.tfrecord" "tfrecord_slate" &&
108 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_a_test_shuf.csv" "${rl4rs_output_dir}/rl4rs_dataset_a_test_slate.tfrecord" "tfrecord_slate" &&
109 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_train.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_train_slate.tfrecord" "tfrecord_slate" &&
110 | python data_preprocess.py "${rl4rs_dataset_dir}/rl4rs_dataset_b2_test.csv" "${rl4rs_output_dir}/rl4rs_dataset_b2_test_slate.tfrecord" "tfrecord_slate" &&
111 |
112 | echo "1"
113 |
--------------------------------------------------------------------------------
/script/data_preprocess.py:
--------------------------------------------------------------------------------
1 | from rl4rs.utils.datautil import FeatureUtil
2 | import numpy as np
3 | import sys, os, random
4 | 
5 | 
6 | def data_augment(file, out_file):
7 |     f = open(out_file, 'w')
8 |     data = open(file, 'r').read().split('\n')
9 |     data_size = len(data)
10 |     print('data length', data_size)
11 |     # assumed fix: append a sentinel with a fresh session id so the
12 |     # final session is also flushed by the else-branch below
13 |     data.append('0@__sentinel__@0@0@0@0@0@0@0')
14 |     tmp = []
15 |     role_id_prev = None
16 |     for record in data:
17 |         if len(record) < 1 or 'timestamp' in record:
18 |             continue
19 |         role_id = record.split('@')[1]
20 |         if role_id == role_id_prev or role_id_prev is None:
21 |             tmp.append(record)
22 |             role_id_prev = role_id
23 |         else:
24 |             assert len(tmp) <= 4
25 |             for i in range(len(tmp), 4):
26 |                 timestamp, session_id, sequence_id, exposed_items, user_feedback, \
27 |                     user_seqfeature, user_protrait, item_feature, behavior_policy_id = tmp[-1].split('@')
28 |                 timestamp_new = str(int(timestamp) + 1)
29 |                 sequence_id_new = str(int(sequence_id) + 1)
30 |                 random_i = np.random.randint(1, data_size - 1)
31 |                 exposed_items_new = data[random_i].split('@')[3]
32 |                 item_feature_new = data[random_i].split('@')[7]
33 |                 user_feedback_new = '0,0,0,0,0,0,0,0,0'
34 |                 tmp.append('@'.join([
35 |                     timestamp_new,
36 |                     session_id,
37 |                     sequence_id_new,
38 |                     exposed_items_new,
39 |                     user_feedback_new,
40 |                     user_seqfeature,
41 |                     user_protrait,
42 |                     item_feature_new,
43 |                     behavior_policy_id
44 |                 ]))
45 |             print(*tmp, sep='\n', end='\n', file=f)
46 |             tmp = [record]
47 |             role_id_prev = role_id
48 |     f.close()
49 | 
50 | 
51 | def slate2trajectory(file, out_file):
52 |     f = open(out_file, 'w')
53 |     data = open(file, 'r').read().split('\n')
54 |     data_size = len(data)
55 |     print('data length', data_size)
56 |     # assumed fix, as in data_augment: flush the final session
57 |     data.append('0@__sentinel__@0@0@0@0@0@0@0')
58 |     tmp = []
59 |     role_id_prev = None
60 |     for record in data:
61 |         if len(record) < 1 or 'timestamp' in record:
62 |             continue
63 |         role_id = record.split('@')[1]
64 |         if role_id == role_id_prev or role_id_prev is None:
65 |             tmp.append(record)
66 |             role_id_prev = role_id
67 |         else:
68 |             assert len(tmp) == 4
69 |             # timestamp, session_id, sequence_id, exposed_items, user_feedback, user_seqfeature, user_protrait, item_feature, behavior_policy_id
70 |             timestamp = tmp[0].split('@')[0]
71 |             session_id = tmp[0].split('@')[1]
72 |             sequence_id = '1'
73 |             exposed_items = ','.join([x.split('@')[3] for x in tmp])
74 |             user_feedback = ','.join([x.split('@')[4] for x in tmp])
75 |             user_seqfeature = tmp[0].split('@')[5]
76 |             user_protrait = tmp[0].split('@')[6]
77 |             item_feature = ';'.join([x.split('@')[7] for x in tmp])
78 |             behavior_policy_id = tmp[0].split('@')[8]
79 |             traj = [
80 |                 timestamp,
81 |                 session_id,
82 |                 sequence_id,
83 |                 exposed_items,
84 |                 user_feedback,
85 |                 user_seqfeature,
86 |                 user_protrait,
87 |                 item_feature,
88 |                 behavior_policy_id
89 |             ]
90 |             print(*traj, sep='@', end='\n', file=f)
91 |             tmp = [record]
92 |             role_id_prev = role_id
93 |     f.close()
94 | 
95 | 
96 | def dataset2tfrecord(config, file, tfrecord_file, is_slate):
97 |     def feature_construct(session, is_slate):
98 |         samples = []
99 |         for i in range(len(session)):
100 |             _, _, sequence_id, exposed_items, user_feedback, user_seqfeature, \
101 |                 user_protrait, item_feature, _ = FeatureUtil.record_split(session[i])
102 |             assert sequence_id - 1 == i
103 |             user_protrait_category = user_protrait[:10]
104 |             user_protrait_dense = user_protrait[10:]
105 |             category_feature = user_protrait_category + [sequence_id] + exposed_items
106 |             prev_items = [session[ii].split('@')[3].split(',')[jj] for ii in range(i) for jj in range(9)]
107 |             prev_items = list(map(int, prev_items))
108 |             sequence_feature_clicked = prev_items if i > 0 else [0]
109 |             sequence_feature = [user_seqfeature, sequence_feature_clicked]
110 |             if is_slate:
111 |                 # label = '0'
112 |                 label = 0
113 |                 samples.append((
114 |                     role_id_prev,
115 |                     sequence_feature,
116 |                     user_protrait_dense + item_feature,
117 |                     category_feature,
118 |                     user_feedback,
119 |                     label
120 |                 ))
121 |             else:
122 |                 for j in range(9):
123 |                     item_id = exposed_items[j]
124 |                     label = user_feedback[j]
125 |                     item_feature_size = len(item_feature) // 9
126 |                     item_feature_j = item_feature[item_feature_size * j:item_feature_size * (j + 1)]
127 |                     category_feature_j = category_feature + [item_id]
128 |                     dense_feature_j = item_feature + item_feature_j
129 |                     samples.append((
130 |                         role_id_prev,
131 |                         sequence_feature,
132 |                         user_protrait_dense + dense_feature_j,
133 |                         category_feature_j,
134 |                         user_feedback,
135 |                         label
136 |                     ))
137 |         return samples
138 | 
139 |     featureutil = FeatureUtil(config)
140 |     data = open(file, 'r').read().split('\n')
141 |     print('data length', len(data))
142 |     # role_id, sequence_feature, dense_feature, category_feature, label
143 |     # timestamp@session_id@sequence_id@exposed_items@user_feedback@user_seqfeature@user_protrait@item_feature@behavior_policy_id
144 |     tmp = []
145 |     records = []
146 |     role_id_prev = None
147 |     for record in data:
148 |         if len(record) < 1 or 'timestamp' in record:
149 |             continue
150 |         role_id = record.split('@')[1]
151 |         if role_id == role_id_prev or role_id_prev is None:
152 |             tmp.append(record)
153 |             role_id_prev = role_id
154 |         else:
155 |             samples = feature_construct(tmp, is_slate)
156 |             records = records + samples
157 |             tmp = [record]
158 |             role_id_prev = role_id
159 |     if len(tmp) > 0:
160 |         samples = feature_construct(tmp, is_slate)
161 |         records = records + samples
162 |     print('tfrecord length', len(records), records[0])
163 |     random.shuffle(records)
164 |     featureutil.to_tfrecord(records, tfrecord_file)
165 | 
166 | 
167 | config = {
168 |     "maxlen": 64,
169 |     "batch_size": 32,
170 |     "class_num": 2,
171 |     "dense_feature_num": 432,
172 |     "category_feature_num": 21,
173 |     "category_hash_size": 100000,
174 |     "seq_num": 2
175 | }
176 | file = sys.argv[1]
177 | out_file = sys.argv[2]
178 | stage = sys.argv[3]
179 | assert stage in ('data_augment', 'slate2trajectory', 'tfrecord_item', 'tfrecord_slate')
180 | if stage == 'data_augment':
181 |     data_augment(file, out_file)
182 | if stage == 'slate2trajectory':
183 |     slate2trajectory(file, out_file)
184 | if stage == 'tfrecord_item':
185 |     dataset2tfrecord(config, file, out_file, is_slate=False)
186 | if stage == 'tfrecord_slate':
187 |     dataset2tfrecord(config, file, out_file, is_slate=True)
188 | 
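189 | # Example invocations (as used in reproductions/run_split.sh):
190 | #
191 | #     python data_preprocess.py rl4rs_dataset_b_sl.csv rl4rs_dataset_b2_sl.csv data_augment
192 | #     python data_preprocess.py rl4rs_dataset_b2_sl.csv rl4rs_dataset_b3_sl.csv slate2trajectory
193 | #     python data_preprocess.py rl4rs_dataset_a_shuf.csv_0000.csv rl4rs_dataset_a.tfrecord.0 tfrecord_item
194 | #     python data_preprocess.py rl4rs_dataset_a_train_shuf.csv rl4rs_dataset_a_train_slate.tfrecord tfrecord_slate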
--------------------------------------------------------------------------------
/rl4rs/utils/offline_policy_metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # import scipy
3 | import scipy.stats
4 | 
5 | 
6 | # modified from https://mars-gym.readthedocs.io/en/latest/quick_start.html#off-policy-metrics
7 | 
8 | def _calc_sequential_weights(policy_prob, behavior_prob, weighted=False, a_min=None, a_max=None):
9 |     # behavior_prob: probabilities under the logging (behavior) policy
10 |     # policy_prob: probabilities under the evaluation (target) policy
11 |     #
12 |     # Compute the sample weights - propensity ratios
13 |     probs = np.array(policy_prob) / np.array(behavior_prob)
14 |     rho = np.clip(probs, a_min=a_min, a_max=a_max).cumprod(1)
15 |     if weighted:
16 |         weight = np.sum(rho, axis=0)
17 |     else:
18 |         weight = len(policy_prob)
19 |     ws = rho / weight
20 |     return np.clip(ws, a_min=a_min, a_max=a_max)
21 | 
22 | 
23 | def _calc_sample_weights(policy_prob, behavior_prob, a_min=None, a_max=None):
24 |     # behavior_prob: probabilities under the logging (behavior) policy
25 |     # policy_prob: probabilities under the evaluation (target) policy
26 |     #
27 |     # Compute the sample weights - propensity ratios
28 |     p_ratio = np.array(policy_prob) / np.array(behavior_prob)
29 | 
30 |     if a_min is not None:
31 |         p_ratio = np.clip(p_ratio, a_min=a_min, a_max=a_max)
32 | 
33 |     # Effective sample size for E_t estimate (from A. Owen)
34 |     n_e = len(policy_prob) * (np.mean(p_ratio) ** 2) / (p_ratio ** 2).mean()
35 | 
36 |     # Critical value from t-distribution as we have unknown variance
37 |     alpha = 0.00125
38 |     cv = scipy.stats.t.ppf(1 - alpha, df=int(n_e) - 1)
39 | 
40 |     return p_ratio, n_e, cv
41 | 
42 | 
43 | def eval_DM(policy, obs):
44 |     return policy(obs)
45 | 
46 | 
47 | def eval_IPS(rewards, policy_prob, behavior_prob):
48 |     # Calculate sample weights
49 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob)
50 |     ###############
51 |     # VANILLA IPS #
52 |     ###############
53 |     # Expected reward for pi_t
54 |     E_t = np.mean(rewards * p_ratio)
55 | 
56 |     # Variance of the estimate
57 |     var = ((rewards * p_ratio - E_t) ** 2).mean()
58 |     stddev = np.sqrt(var)
59 | 
60 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
61 |     c = cv * stddev / np.sqrt(int(n_e))
62 |     min_bound = E_t - c
63 |     max_bound = E_t + c
64 | 
65 |     result = (E_t, c)  # 0.025, 0.500, 0.975
66 |     return result
67 | 
68 | 
69 | def eval_CIPS(rewards, policy_prob, behavior_prob):
70 |     # Calculate sample weights
71 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
72 | 
73 |     ##############
74 |     # CAPPED IPS #
75 |     ##############
76 |     # Cap ratios
77 |     p_ratio_capped = np.clip(p_ratio, a_min=0.1, a_max=10)
78 | 
79 |     # Expected reward for pi_t
80 |     E_t_capped = np.mean(rewards * p_ratio_capped)
81 | 
82 |     # Variance of the estimate
83 |     var_capped = ((rewards * p_ratio_capped - E_t_capped) ** 2).mean()
84 |     stddev_capped = np.sqrt(var_capped)
85 | 
86 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
87 |     c = cv * stddev_capped / np.sqrt(int(n_e))
88 | 
89 |     min_bound_capped = E_t_capped - c
90 |     max_bound_capped = E_t_capped + c
91 | 
92 |     result = (E_t_capped, c)  # 0.025, 0.500, 0.975
93 | 
94 |     return result
95 | 
96 | 
97 | def eval_SNIPS(rewards, policy_prob, behavior_prob):
98 |     # Calculate sample weights
99 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
100 | 
101 |     ##############
102 |     # NORMED IPS #
103 |     ##############
104 |     # Expected reward for pi_t
105 |     E_t_normed = np.sum(rewards * p_ratio) / np.sum(p_ratio)
106 | 
107 |     # Variance of the estimate
108 |     var_normed = np.sum(((rewards - E_t_normed) ** 2) * (p_ratio ** 2)) / (
109 |         p_ratio.sum() ** 2
110 |     )
111 |     stddev_normed = np.sqrt(var_normed)
112 | 
113 |     # C.I. assuming unknown variance - use t-distribution and effective sample size
114 |     c = cv * stddev_normed / np.sqrt(int(n_e))
115 | 
116 |     min_bound_normed = E_t_normed - c
117 |     max_bound_normed = E_t_normed + c
118 | 
119 |     # Store result
120 |     result = (E_t_normed, c)  # 0.025, 0.500, 0.975
121 | 
122 |     return result
123 | 
124 | 
125 | def eval_WIPS(step_rewards, policy_prob, behavior_prob, gamma=1.0):
126 |     batch_size = len(step_rewards)
127 |     steps = len(step_rewards[0])
128 |     w_t = []
129 | 
130 |     # calculate importance ratios
131 |     p = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
132 | 
133 |     for i in range(steps):
134 |         w_t.append(np.average(p[:, :i + 1], axis=1))
135 |     w_t = np.array(w_t).swapaxes(0, 1)
136 |     # calculate stepwise weighted IS estimate
137 |     V_prev, V_step_WIS = 0.0, 0.0
138 |     for t in range(steps):
139 |         V_prev += np.sum(step_rewards[:, t] * gamma ** t)
140 |         V_step_WIS += np.sum(p[:, t] / w_t[:, t] * step_rewards[:, t] * gamma ** t)
141 |     # print('WIPS', p[:, -1], w_t[:, -1], np.max(p[:, -1] / w_t[:, -1]), step_rewards[:, -1])
142 |     return V_step_WIS / np.clip(V_prev, a_min=1e-8, a_max=None), 0
143 | 
144 | 
145 | def eval_doubly_robust(
146 |     action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
147 | ):
148 |     # Calculate sample weights
149 |     p_ratio, n_e, cv = _calc_sample_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
150 | 
151 |     #################
152 |     # Doubly Robust #
153 |     #################
154 | 
155 |     dr = state_rewards + (p_ratio * (rewards - action_rhat_rewards))
156 | 
157 |     confidence = 0.95
158 |     n = len(dr)
159 |     m, se = np.mean(dr), scipy.stats.sem(dr)
160 |     # h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
161 |     # print('dr', action_rhat_rewards[:2], p_ratio[:2], rewards[:2], m)
162 |     return m / np.average(rewards), se
163 | 
164 | 
165 | def eval_seq_doubly_robust(
166 |     action_rhat_rewards, state_rewards, rewards, policy_prob, behavior_prob
167 | ):
168 |     # Calculate sequential sample weights
169 |     ws = _calc_sequential_weights(policy_prob, behavior_prob, a_min=0.1, a_max=10)
170 | 
171 |     dr = np.zeros((len(action_rhat_rewards)))
172 |     steps = len(action_rhat_rewards[0])
173 |     for i in range(steps):
174 |         t = steps - i - 1
175 |         dr = state_rewards[:, t] + ws[:, t] * (rewards[:, t] + dr - action_rhat_rewards[:, t])
176 | 
177 |     #################
178 |     # Doubly Robust #
179 |     #################
180 |     # dr = action_rhat_rewards + (p_ratio * (rewards - action_rhat_rewards))
181 |     # estimate = ws * (rewards - action_rhat_rewards) + state_rewards
182 |     # print('sdr', dr, np.average(dr), np.average(rewards))
183 | 
184 |     return np.average(dr) / np.mean(np.sum(rewards, axis=1)), 0
185 | 
186 | 
187 | if __name__ == '__main__':
188 |     batch_size = 10
189 |     max_steps = 9
190 |     off_rewards_sum = np.ones(batch_size)
191 |     action_probs_mul = np.random.random((batch_size,))
192 |     behavior_probs_mul = np.random.random((batch_size,))
193 |     episode_reward = np.random.random((batch_size,)) * 2
194 |     off_rewards = np.ones((batch_size, max_steps))
195 |     action_probs = np.random.random((batch_size, max_steps))
196 |     behavior_probs = np.random.random((batch_size, max_steps))
197 |     rewards_hat = np.random.random((batch_size, max_steps))
198 |     state_reward = np.ones((batch_size, max_steps))
199 | 
200 |     ips = eval_IPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
201 |     cips = eval_CIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
202 |     snips = eval_SNIPS(off_rewards_sum, action_probs_mul, behavior_probs_mul)
203 |     # smoke test for DR; episode_reward stands in for the (assumed)
204 |     # direct-method reward estimate, off_rewards_sum for the state value
205 |     dr = eval_doubly_robust(episode_reward, off_rewards_sum, off_rewards_sum,
206 |                             action_probs_mul, behavior_probs_mul)
207 |     # step-wise
208 |     sips = eval_WIPS(off_rewards, action_probs, behavior_probs)
209 |     sdr = eval_seq_doubly_robust(rewards_hat, state_reward, off_rewards, action_probs, behavior_probs)
210 |     print(ips, cips, snips, dr, sips, sdr, sep='\n')
211 | 
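212 | # Sanity property (illustrative): when the evaluation policy equals the
213 | # behavior policy, every propensity ratio is 1 and IPS reduces to the
214 | # on-policy mean reward.
215 | #
216 | #     same_probs = np.full(10, 0.5)
217 | #     E_t, c = eval_IPS(np.arange(10.0), same_probs, same_probs)
218 | #     assert np.isclose(E_t, np.arange(10.0).mean())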
--------------------------------------------------------------------------------
/script/mdpchecker/mdp_checker.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import random
4 | from scipy.stats import spearmanr
5 | from keras_transformer import get_model, decode
6 | from rl4rs.mdpchecker.decoder import beam_search, token_probs
7 |
8 | # dataset_file = 'recsys15.csv'
9 | # dataset_file = 'movielens.csv'
10 | # dataset_file = 'rl4rs.csv'
11 | # dataset_file = 'lastfm.csv'
12 | # dataset_file = 'cikm2016.csv'
13 | dataset_file = sys.argv[1] + '.csv'
14 | dataset_dir = sys.argv[2]
15 |
16 | # recsys15 interactions are sparse relative to
17 | # the number of items, so shorten the source
18 | # window to increase the number of samples
19 | if 'recsys15' in dataset_file:
20 | source_len = 8
21 | elif 'cikm2016' in dataset_file:
22 | source_len = 5
23 | else:
24 | source_len = 16
25 | target_len = 5
26 | np.random.seed(1)
27 |
28 | data = open(dataset_dir + '/' + dataset_file).read().split('\n')[1:-1]
29 |
30 | source_tokens = []
31 | target_tokens = []
32 | for sample in data:
33 | user_id, items = sample.split(' ')
34 | item_list = items.split(',')
35 | if len(item_list) >= source_len + target_len:
36 | # assert len(item_list) >= source_len + target_len
37 | i = 0
38 | if 'rl4rs' in dataset_file or 'cikm2016' in dataset_file:
39 | source_tokens.append(item_list[:source_len])
40 | target_tokens.append(item_list[source_len:source_len + target_len])
41 | else:
42 | while i + source_len + target_len < len(item_list):
43 | source_tokens.append(item_list[i: i + source_len])
44 | target_tokens.append(item_list[i + source_len: i + source_len + target_len])
45 | i = i + np.random.randint(source_len, source_len + target_len) // 6
46 | else:
47 |         print('len(item_list) < source_len + target_len in', '\t', sample)
48 |
49 | # Generate dictionaries
50 | token_dict = {
51 |     '<PAD>': 0,
52 |     '<START>': 1,
53 |     '<END>': 2,
54 | }
55 |
56 |
57 | def build_token_dict(token_list):
58 | for tokens in token_list:
59 | for token in tokens:
60 | if token not in token_dict:
61 | token_dict[token] = len(token_dict)
62 | return token_dict
63 |
64 |
65 | source_token_dict = build_token_dict(source_tokens)
66 | target_token_dict = build_token_dict(target_tokens)
67 | target_token_dict_inv = {v: k for k, v in target_token_dict.items()}
68 |
69 | # Add special tokens
70 | encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
71 | decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
72 | output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
73 |
74 | # Padding
75 | source_max_len = max(map(len, encode_tokens))
76 | target_max_len = max(map(len, decode_tokens))
77 |
78 | encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
79 | decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
80 | output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]
81 |
82 | encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
83 | decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
84 | decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]
85 |
86 | print('sample lens:', len(encode_input))
87 | print('source_token_dict lens:', len(source_token_dict))
88 | print('target_token_dict lens:', len(target_token_dict))
89 | # [1, 3, 4, 5, 6, 2] [1, 3, 4, 5, 6, 7, 8, 9, 2] [[3], [4], [5], [6], [7], [8], [9], [2], [0]]
90 | # print(encode_input[0], decode_input[0], decode_output[0])
91 |
92 | # Build & fit model
93 | model = get_model(
94 | token_num=max(len(source_token_dict), len(target_token_dict)),
95 | embed_dim=256,
96 | encoder_num=1,
97 | decoder_num=1,
98 | head_num=1,
99 | hidden_dim=128,
100 | dropout_rate=0.05,
101 |     use_same_embed=False,  # separate embeddings for source and target vocabularies
102 | )
103 |
104 | model.compile('adam', 'sparse_categorical_crossentropy')
105 | model.summary()
106 |
107 | model.fit(
108 | x=[np.array(encode_input)[:-10000], np.array(decode_input)[:-10000]],
109 | y=np.array(decode_output)[:-10000],
110 | epochs=20,
111 | batch_size=256,
112 | shuffle=True,
113 | verbose=2
114 | )
115 |
116 | model.save_weights(dataset_file.split('.')[0] + '.h5')
117 |
118 | # Load
119 | model.load_weights(dataset_file.split('.')[0] + '.h5')
120 |
121 | # greedy result print & input output comparison
122 | # decoded = decode(
123 | # model,
124 | # encode_input[:1024],
125 | #     start_token=target_token_dict['<START>'],
126 | #     end_token=target_token_dict['<END>'],
127 | #     pad_token=target_token_dict['<PAD>'],
128 | # top_k=1
129 | # )
130 | # print([target_token_dict_inv[x] for x in decode_input[0]], [target_token_dict_inv[x] for x in decoded[0]])
131 | # print([target_token_dict_inv[x] for x in decode_input[1]], [target_token_dict_inv[x] for x in decoded[1]])
132 |
133 | # beam search
134 | batch_size = 2048
135 | beam_size = 100
136 | # use 20 hot items since rl4rs has only 200+ items
137 | hot_beam_size = 20 if 'rl4rs' in dataset_file else beam_size
138 | # cikm2016 has only 60853 items
139 | candidates_size = 6000 if 'cikm2016' in dataset_file else hot_beam_size
140 | random.seed(1)
141 | encode_input = random.sample(encode_input[-10000:], batch_size)
142 | output_greedy, greedy_score = beam_search(model, encode_input, beam_size=1, target_len=target_len)
143 | output_topk, beam_score = beam_search(model, encode_input, beam_size=beam_size, target_len=target_len)
144 | # np.savez(dataset_file.split('.')[0]+'.npz', output_topk=output_topk, beam_score=beam_score)
145 | # npzdata = np.load(dataset_file.split('.')[0] + '.npz')
146 | # output_topk = npzdata['output_topk']
147 | # beam_score = npzdata['beam_score']
148 |
149 | output_topk_5, beam_score_5 = output_topk[:, :int(beam_size * 0.05)], beam_score[:, :int(beam_size * 0.05)]
150 | output_topk_20, beam_score_20 = output_topk[:, :int(beam_size * 0.2)], beam_score[:, :int(beam_size * 0.2)]
151 | output_topk_hot, beam_score_hot = beam_search(model, encode_input, beam_size=hot_beam_size, target_len=target_len, use_candidates=True, candidates_size=candidates_size)
152 | output_topk_hot5, beam_score_hot5 = output_topk_hot[:, :int(beam_size * 0.05)], beam_score_hot[:, :int(beam_size * 0.05)]
153 | output_topk_hot20, beam_score_hot20 = output_topk_hot[:, :int(beam_size * 0.2)], beam_score_hot[:, :int(beam_size * 0.2)]
154 |
155 | greedy_score = np.nanmean(greedy_score, axis=1)
156 | top_5_percent_score = np.nanmean(beam_score_5, axis=1)
157 | top_20_percent_score = np.nanmean(beam_score_20, axis=1)
158 | hot_5_percent_score = np.nanmean(beam_score_hot5, axis=1)
159 | hot_20_percent_score = np.nanmean(beam_score_hot20, axis=1)
160 |
161 | print('experiment II results')
162 | print('top_5_percent_score top_20_percent_score greedy_score hot_5_percent_score hot_20_percent_score')
163 | print(1,
164 | np.nanmean(top_20_percent_score / top_5_percent_score),
165 | np.nanmean(greedy_score / top_5_percent_score),
166 | np.nanmean(hot_5_percent_score / top_5_percent_score),
167 | np.nanmean(hot_20_percent_score / top_5_percent_score))
168 |
169 | print('experiment I start')
170 | tmp = []
171 | for j in range(int(beam_size)):
172 | batch_outputs = output_topk[:, j]
173 | probs = []
174 | for i in range(5):
175 | prob = token_probs(model, encode_input, batch_outputs[:, :i + 1])[list(range(batch_size)), output_topk[:, j, i + 1]]
176 | probs.append(prob)
177 | tmp.append(probs)
178 | probs = np.array(tmp).swapaxes(0, 2).swapaxes(1, 2)
179 | metrics = []
180 | for j in range(batch_size):
181 | prob = probs[j]
182 | prob_sum = np.sum(prob, axis=1)
183 | seq_score = np.multiply.reduce(np.array(prob), axis=1)
184 | for i in range(5):
185 | metrics.append((np.corrcoef(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0][1],
186 | spearmanr(np.multiply.reduce(np.array(prob[:, :i + 1]), axis=1), seq_score)[0]))
187 | metrics = np.array(metrics).reshape((batch_size, 5, 2))
188 | metrics = np.nan_to_num(metrics, nan=1.0)
189 | print('experiment I results')
190 | print('corrcoef', ' ', 'spearman')
191 | print(np.nanmean(metrics, axis=0))
--------------------------------------------------------------------------------
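Experiment I above measures whether a short prefix of the transformer's sequence score already ranks candidates the way the full 5-step score does; high correlation at short horizons indicates weak sequential dependency in the dataset. A self-contained numpy/scipy sketch of that measurement on toy data (shapes and values fabricated):

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
beam_size, steps = 100, 5
# per-position token probabilities for each beam candidate
prob = rng.uniform(0.01, 1.0, size=(beam_size, steps))
# full-sequence score: product over all positions
seq_score = np.multiply.reduce(prob, axis=1)

for horizon in range(1, steps + 1):
    prefix_score = np.multiply.reduce(prob[:, :horizon], axis=1)
    pearson = np.corrcoef(prefix_score, seq_score)[0][1]
    spearman = spearmanr(prefix_score, seq_score)[0]
    print(horizon, round(pearson, 3), round(spearman, 3))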
/rl4rs/env/seqslate.py:
--------------------------------------------------------------------------------
1 | from functools import reduce
2 | from operator import add
3 | from copy import deepcopy as copy
4 | import numpy as np
5 | from rl4rs.env.slate import SlateState, SlateRecEnv
6 |
7 |
8 | class SeqSlateState(SlateState):
9 | def __init__(self, config, records):
10 | super().__init__(config, records)
11 | self.page_items = config.get("page_items", 9)
12 |
13 | @property
14 | def state(self):
15 | if self.config.get("support_rllib_mask", False):
16 | location_mask = self.get_location_mask(self.location_mask, self.cur_steps % self.page_items // 3)
17 | return {"state": self._state, "action_mask": self.action_mask & location_mask & self.special_mask}
18 | elif self.config.get("support_d3rl_mask", False):
19 | cur_steps = np.full((self.batch_size, 1), self.cur_steps)
20 | page_init = self.cur_steps // self.page_items * self.page_items
21 | page_end = min(page_init + self.page_items - 1, self.max_steps - 1)
22 | masked_actions = self.prev_actions[:, page_end + 1 - self.page_items:page_end + 1]
23 | return {"state": self._state, "masked_actions": masked_actions, "cur_steps": cur_steps}
24 | else:
25 | return self._state
26 |
27 | def get_complete_states(self):
28 | states = []
29 | for j in range(self.cur_steps):
30 | tmp = copy(self._init_state)
31 | for state, action, i in zip(self._init_state, self.prev_actions[:, j], range(len(self._init_state))):
32 | page_init = j // self.page_items * self.page_items
33 | page_end = page_init + self.page_items - 1
34 | sequence_id = j // self.page_items + 1
35 | # seq
36 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
37 | tmp[i][1] = [tmp[i][1][0], prev_expose]
38 | # dense
39 | prev_item_feat = [
40 | self.item_info_d[str(x)]['item_vec']
41 | for x in self.prev_actions[i, page_init:page_end + 1]
42 | ]
43 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
44 | prev_item_feat = np.array(prev_item_feat).flatten()
45 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
46 | # category
47 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
48 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
49 | states.append(tmp)
50 | return states
51 |
52 | def get_violation(self):
53 |         tmp = np.ones((self.batch_size,), dtype=int)
54 | for step in range(self.cur_steps):
55 | location_mask = self.location_mask[step % self.page_items // 3]
56 | tmp = tmp & location_mask[self.prev_actions[:, step]]
57 | for step in range(max(self.cur_steps - 1, 1)):
58 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 1])
59 | tmp = tmp & duplicate_mask
60 | for step in range(max(self.cur_steps - 2, 1)):
61 | duplicate_mask = (self.prev_actions[:, step] != self.prev_actions[:, step + 2])
62 | tmp = tmp & duplicate_mask
63 | for i in range(self.batch_size):
64 | cur_page = self.cur_steps % self.page_items
65 | for j in range(cur_page+1):
66 | actions = self.prev_actions[i][self.page_items*j:self.page_items*(j+1)]
67 | if len(np.intersect1d(actions, self.special_items)) > 1:
68 | tmp[i] = 0
69 | return tmp
70 |
71 | @property
72 | def offline_reward(self):
73 | cur_step = self.cur_steps
74 |         if cur_step % self.page_items != 0:
75 | reward = [0, ] * self.batch_size
76 | else:
77 | action = np.array([list(map(int, x.split('@')[3].split(',')[:cur_step]))
78 | for x in self.records])
79 | price = self.get_price(action)[:, -self.page_items:]
80 | slate_label = np.array([
81 | list(map(int, x.split('@')[4].split(',')))
82 | for x in self.records
83 | ])
84 | slate_label = slate_label[:, cur_step - self.page_items:cur_step]
85 | reward = np.sum(price * slate_label, axis=1)
86 | return reward
87 |
88 | # @property
89 | # def info(self):
90 | # return [{}]*self.batch_size
91 |
92 | def act(self, actions):
93 | if self.config.get("support_conti_env", False):
94 | location_mask = self.get_location_mask(self.location_mask,
95 | self.cur_steps % self.page_items // 3)
96 | action_mask = self.action_mask & location_mask & self.special_mask
97 | actions = self.get_nearest_neighbor_with_mask(actions, self.action_emb, action_mask)
98 | self.prev_actions[:, self.cur_steps] = actions
99 | self.action_mask[list(range(self.batch_size)), actions] = 0
100 | for i in range(self.batch_size):
101 | if len(np.intersect1d(self.prev_actions[i], self.special_items)) > 0:
102 | self.special_mask[i][self.special_items] = 0
103 | tmp = copy(self._init_state)
104 | for state, action, i in zip(self._state, actions, range(self.batch_size)):
105 | page_init = self.cur_steps // self.page_items * self.page_items
106 | page_end = page_init + self.page_items - 1
107 | sequence_id = self.cur_steps // self.page_items + 1
108 | # seq
109 | prev_expose = self.prev_actions[i, :page_init] if page_init > 0 else [0]
110 | tmp[i][1] = [tmp[i][1][0], prev_expose]
111 | # dense
112 | prev_item_feat = [
113 | self.item_info_d[str(x)]['item_vec']
114 | for x in self.prev_actions[i, page_init:page_end + 1]
115 | ]
116 | cur_item_feat = self.item_info_d[str(action)]['item_vec']
117 | prev_item_feat = np.array(prev_item_feat).flatten()
118 | tmp[i][2] = np.concatenate((tmp[i][2], prev_item_feat, cur_item_feat))
119 | # category
120 | cur_exposed = self.prev_actions[i, page_init:page_end + 1]
121 | tmp[i][3] = np.concatenate((tmp[i][3], [sequence_id], cur_exposed, [action]))
122 | self._state = tmp
123 | self.cur_steps += 1
124 | if self.cur_steps % self.page_items == 0:
125 |             self.action_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
126 |             self.special_mask = np.full((self.batch_size, self.action_size), 1, dtype=int)
127 |
128 |
129 | class SeqSlateRecEnv(SlateRecEnv):
130 |     """ Implements the sequential-slate recommendation simulator"""
131 |
132 | def __init__(self, config, state_cls):
133 | super().__init__(config, state_cls)
134 | self.page_items = config.get("page_items", 9)
135 |
136 | def forward(self, model, samples):
137 | step = samples.cur_steps
138 | if step % self.page_items == 0:
139 | # state = samples.state
140 | prev_actions = samples.prev_actions[:, :step]
141 | # shapes = prev_actions.shape
142 | complete_states = np.array(samples.get_complete_states())
143 | complete_states = complete_states[-self.page_items:]
144 | complete_states = complete_states \
145 | .swapaxes(0, 1) \
146 | .reshape((self.batch_size * self.page_items, 6))
147 | price = samples.get_price(prev_actions)[:, -self.page_items:]
148 | feat, _ = self.FeatureUtil.feature_extraction(complete_states)
149 | with self.sess.as_default():
150 | with self.graph.as_default():
151 | res = self.reward_layer(feat)
152 | probs = np.array(res)[:, 1].reshape((self.batch_size, self.page_items))
153 | reward = np.sum(price * probs, axis=1)
154 | if self.config.get("support_rllib_mask", False) or \
155 | self.config.get("support_d3rl_mask", False):
156 | violation = samples.get_violation()
157 | reward[violation < 0.5] = 0
158 | else:
159 | reward = np.array([0, ] * self.batch_size)
160 | return reward.tolist()
161 |
--------------------------------------------------------------------------------
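SeqSlateRecEnv.forward only emits a non-zero reward once a full page of page_items actions has been placed: the reward model scores every item on the page, and the page reward is the price-weighted sum of those purchase probabilities, zeroed out for rule-violating slates. A toy illustration of the final reduction (all values fabricated):

import numpy as np

batch_size, page_items = 4, 9
probs = np.random.random((batch_size, page_items))           # model's per-item purchase probability
price = np.random.randint(1, 100, (batch_size, page_items))  # price of each exposed item
violation = np.array([1, 1, 0, 1])                           # 0 marks a rule-violating slate

reward = np.sum(price * probs, axis=1)                       # expected page revenue
reward[violation < 0.5] = 0                                  # as in forward(): no reward for violations
print(reward)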
/script/mdpchecker/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import pandasql as ps
4 |
5 | dataset_file = sys.argv[1]
6 | dataset_dir = sys.argv[2]
7 |
8 | pysqldf = lambda q: ps.sqldf(q, globals())
9 |
10 | # lastfm-dataset-1K
11 | if 'lastfm' in dataset_file:
12 | df = pd.read_csv(dataset_dir + '/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv'
13 | ,names=["userid", "timestamp", "artid", "artname", "traid", "traname"]
14 | ,sep='\t')
15 |
16 | sql0 = """
17 | select userid as sessionid, min(timestamp) as timestamp, artid
18 | from
19 | df a
20 | group by userid, artid, substr(timestamp,1,12)
21 | """
22 |
23 | sql1 = """
24 | select a.sessionid, a.timestamp, b.item
25 | from
26 | df a
27 | join
28 | (select artid, ROW_NUMBER() OVER(ORDER BY artid) AS item
29 | from (
30 | select artid
31 | from
32 | df a
33 | group by artid
34 | having count(*)>=30
35 | )aa
36 | )b
37 | on a.artid=b.artid
38 | """
39 |
40 | sql2 = """
41 | select sessionid, group_concat(item) as items
42 | from(
43 | select *
44 | from
45 | df2
46 | order by timestamp asc
47 | )a
48 | group by sessionid
49 |
50 | """
51 |
52 | df = pysqldf(sql0)
53 |
54 | df2 = pysqldf(sql1)
55 |
56 | df3 = pysqldf(sql2)
57 |
58 | print('items num.', df2['item'].value_counts().count())
59 | print('max item id', df2['item'].max())
60 | print('sessionid num.', df2['sessionid'].value_counts().count())
61 |
62 | df3.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8')
63 |
64 | if 'cikm2016' in dataset_file:
65 | # queryId;sessionId;userId;timeframe;duration;eventdate;searchstring.tokens;categoryId;items;is.test
66 | queries_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-queries.csv',sep=';')
67 | # queryId;timeframe;itemId
68 | click_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-clicks.csv',sep=';')
69 | # sessionId;userId;itemId;timeframe;eventdate
70 | pv_df = pd.read_csv(dataset_dir + '/CIKMCUP2016_Track2/train-item-views.csv',sep=';')
71 |
72 | # sql0 = """
73 | # select a.sessionId as sessionid, min(b.timeframe) as timestamp, b.itemId, a.items as pv_items
74 | # from
75 | # queries_df a
76 | # join click_df b
77 | # on a.queryId = b.queryId
78 | # group by b.queryId, b.itemId, cast(b.timeframe/1000 as int)
79 | # """
80 |
81 | df_click_sql = """
82 | select a.sessionId as sessionid, min(cast(b.timeframe as int)) as timestamp, b.itemId as item
83 | from
84 | queries_df a
85 | join click_df b
86 | on a.queryId = b.queryId
87 | join (select sessionId from pv_df group by sessionId)c
88 | on a.sessionId = c.sessionId
89 | group by a.sessionId, b.itemId, cast(b.timeframe/1000 as int)
90 | """
91 |
92 | df_pv_sql = """
93 | select a.sessionId as sessionid, min(cast(c.timeframe as int)) as timestamp, c.itemId as item
94 | from
95 | queries_df a
96 | join (select queryId from click_df group by queryId) b
97 | on a.queryId = b.queryId
98 | join pv_df c
99 | on a.sessionId = c.sessionId
100 | group by a.sessionId, c.itemId, cast(c.timeframe/1000 as int)
101 | """
102 |
103 | df_sql = """
104 | select aa.sessionid, group_concat(c.item|| ':' ||c.timestamp) as pv_items, aa.click_items
105 | from
106 | (
107 | select a.sessionid,a.timestamp,a.item,group_concat(b.item|| ':' ||b.timestamp) as click_items from
108 | df_click a
109 | join df_click b
110 | on a.sessionid=b.sessionid and a.timestamp<=b.timestamp
111 | group by a.sessionid,a.item
112 | )aa
113 |
114 | join df_pv c
115 | on aa.sessionid=c.sessionid and aa.timestamp>c.timestamp
116 | group by aa.sessionid,aa.click_items
117 | """
118 |
119 | df_click = pysqldf(df_click_sql)
120 | df_pv = pysqldf(df_pv_sql)
121 | df = pysqldf(df_sql)
122 |
123 | tmp = []
124 | items = set()
125 | for x in df.values:
126 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5:
127 | [items.add(x.split(':')[0]) for x in x[1].split(',')]
128 | [items.add(x.split(':')[0]) for x in x[2].split(',')]
129 |
130 | # item2id=dict([(x,str(i)) for i,x in enumerate(items)])
131 | # item2id_fn = lambda x:item2id[x]
132 |
133 | for x in df.values:
134 | if len(x[1].split(','))>=5 and len(x[2].split(','))>=5:
135 | pv_items = x[1].split(',')
136 | sorted_pv_items = sorted(pv_items, key=lambda x:int(x.split(':')[1]))[-5:]
137 | sorted_pv_items = [x.split(':')[0] for x in sorted_pv_items]
138 | click_items = x[2].split(',')
139 | sorted_click_items = sorted(click_items, key=lambda x:int(x.split(':')[1]))[:5]
140 | sorted_click_items = [x.split(':')[0] for x in sorted_click_items]
141 | tmp.append([x[0], ','.join(sorted_pv_items), ','.join(sorted_click_items)])
142 |
143 | print('items num.', len(items))
144 | print('max item id', len(items)-1)
145 | print('sessionid num.', len(tmp))
146 |
147 | with open(dataset_dir + '/' + dataset_file + '.csv', 'w') as f:
148 | f.write('sessionid items'+'\n')
149 | f.write('\n'.join([str(x[0])+' '+x[1]+','+x[2] for x in tmp]))
150 |
151 |
152 | # recsys15-click
153 | if 'recsys15' in dataset_file:
154 | df = pd.read_csv(dataset_dir + '/yoochoose-clicks.dat', names=["sessionid", "timestamp", "item", "Category"])
155 |
156 | sql0 = """
157 | select sessionid, min(timestamp) as timestamp, item, Category
158 | from
159 | df a
160 | group by sessionid, item, substr(timestamp,1,12)
161 | """
162 |
163 | sql1 = """
164 | select a.*
165 | from
166 | df a
167 | join
168 | (SELECT item FROM df group by item having count(*)>=1000)b
169 | on a.item=b.item
170 | """
171 |
172 | sql2 = """
173 | select a.*
174 | from
175 | df2 a
176 | join
177 | (SELECT sessionid FROM df2 group by sessionid having count(*)>=13)b
178 | on a.sessionid=b.sessionid
179 | """
180 |
181 | sql3 = """
182 | select sessionid, group_concat(item) as items
183 | from
184 | df3
185 | group by sessionid
186 | order by timestamp asc
187 | """
188 |
189 | df = pysqldf(sql0)
190 |
191 | df2 = pysqldf(sql1)
192 |
193 | df3 = pysqldf(sql2)
194 |
195 | df4 = pysqldf(sql3)
196 |
197 | print('items num.', df3['item'].value_counts().count())
198 | print('max item id', df3['item'].max())
199 | print('sessionid num.', df3['sessionid'].value_counts().count())
200 |
201 | df4.to_csv(dataset_dir + '/' + dataset_file + '.csv', sep=' ', header=True, index=False, encoding='utf-8')
202 |
203 | if 'movielens' in dataset_file:
204 | # movielens-25m
205 | df = pd.read_csv(dataset_dir + '/ml-25m/ratings.csv')
206 | # userId,movieId,rating,timestamp
207 | sql0 = """
208 | select *
209 | from
210 | df a
211 | where rating>=3
212 | """
213 |
214 | sql1 = """
215 | select a.*
216 | from
217 | df a
218 | join
219 | (SELECT movieId FROM df group by movieId having count(*)>=1000)b
220 | on a.movieId=b.movieId
221 | """
222 |
223 | sql2 = """
224 | select a.*
225 | from
226 | df2 a
227 | join
228 | (SELECT userId FROM df2 group by userId having count(*)>=30 and count(*)<=100)b
229 | on a.userId=b.userId
230 | """
231 |
232 | sql3 = """
233 | select userId as sessionid, group_concat(movieId) as items
234 | from
235 | df3
236 | group by userId
237 | order by timestamp asc
238 | """
239 | df = pysqldf(sql0)
240 |
241 | df2 = pysqldf(sql1)
242 |
243 | df3 = pysqldf(sql2)
244 |
245 | df4 = pysqldf(sql3)
246 |
247 | print('items num.', df3['movieId'].value_counts().count())
248 | print('max item id', df3['movieId'].max())
249 | print('sessionid num.', df3['userId'].value_counts().count())
250 |
251 | df4.to_csv(dataset_dir + '/movielens.csv', sep=' ', header=True, index=False, encoding='utf-8')
252 |
253 | if 'rl4rs' in dataset_file:
254 | # RL4RS
255 | data = open(dataset_dir + '/rl4rs_dataset_a.csv', 'r').read().split('\n')[:-1]
256 | tmp = ['sessionid items']
257 | for x in data:
258 | session_id = x.split('@')[1]
259 | sequence_id = list(map(int, x.split('@')[5].split(',')))
260 | items = list(map(int, x.split('@')[3].split(',')))
261 | if len(sequence_id) >= 16:
262 | tmp.append(session_id + ' ' + ','.join(list(map(str, sequence_id[-16:] + items[:5]))))
263 |
264 | print('items num.', 283)
265 | print('max item id', 283)
266 | print('sessionid num.', len(tmp))
267 |
268 | with open(dataset_dir + '/rl4rs.csv', 'w') as f:
269 | f.write('\n'.join(tmp))
270 |
--------------------------------------------------------------------------------
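Every branch of preprocess.py writes the same layout that mdp_checker.py reads back: a 'sessionid items' header line, then one space-separated row per session whose second field is a comma-separated item-id sequence. A minimal reader for that format (file path illustrative):

# read back 'sessionid items' rows, exactly as mdp_checker.py does
rows = open('rl4rs.csv').read().split('\n')[1:-1]
for row in rows[:3]:
    session_id, items = row.split(' ')
    print(session_id, items.split(',')[:5])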
/rl4rs/env/base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from copy import deepcopy as copy
4 | import tensorflow as tf
5 | from abc import ABC, abstractmethod
6 | import os
7 |
8 |
9 | def single_elem_support(func):
10 | """aop func"""
11 |     type_list = (list, tuple, np.ndarray)
12 |
13 | def wrapper(*args, **kwargs):
14 | """wrapper func"""
15 | res = func(*args, **kwargs)
16 | if type(res) in type_list and len(res) == 1:
17 | return res[0]
18 | elif type(res[0]) in type_list and len(res[0]) == 1:
19 | return [x[0] for x in res]
20 | else:
21 | return res
22 |
23 | return wrapper
24 |
25 |
26 | class RecState(ABC):
27 | def __init__(self, config, records):
28 | self.config = config
29 | self.records = records
30 | self._init_state = self.records_to_state(records)
31 | self._state = copy(self._init_state)
32 |
33 | @staticmethod
34 | def records_to_state(records):
35 | pass
36 |
37 | @property
38 | def state(self):
39 | return self._state
40 |
41 | @property
42 | @abstractmethod
43 | def user(self):
44 | pass
45 |
46 | @property
47 | @abstractmethod
48 | def info(self):
49 | pass
50 |
51 | @abstractmethod
52 | def act(self, actions):
53 | pass
54 |
55 | @abstractmethod
56 | def to_string(self):
57 | pass
58 |
59 |
60 | class RecDataBase(object):
61 | '''
62 |     File-based implementation of a recommendation environment's data source.
63 | 
64 |     Pulls data from a file, preps it for use by the environment, and then
65 |     acts as the data provider for each new episode.
66 | '''
67 |
68 | def __init__(self, config, state_cls):
69 | self.config = config
70 | self.sample_list = []
71 | self.state_cls = state_cls
72 | self.is_eval = config.get('is_eval', False)
73 | self.cache_size = config.get('cache_size', 2048)
74 | # sample file cache
75 | self.fp = open(config['sample_file'], 'r')
76 | # self.fp.readline()
77 |
78 | @staticmethod
79 | def seed(seed):
80 | np.random.seed(seed)
81 |
82 | def sample_cache(self, f, num):
83 | for i in range(num):
84 | tmp = f.readline().rstrip()
85 | if len(tmp) < 1:
86 | f.seek(0, 0)
87 | f.readline()
88 | self.sample_list.append(f.readline().rstrip())
89 | else:
90 | self.sample_list.append(tmp)
91 |
92 | def sample(self, batch_size):
93 | if self.is_eval:
94 | assert self.cache_size == batch_size
95 | assert len(self.sample_list) == batch_size
96 | records = self.sample_list[:batch_size]
97 | else:
98 | records = np.random.choice(self.sample_list, batch_size)
99 | samples = self.state_cls(self.config, records)
100 | return samples
101 |
102 | def reset(self, reset_file=False):
103 | # self.state_list = []
104 | self.sample_list = []
105 | # self.rawstate_cache(self.fs, 10000)
106 | if reset_file:
107 | self.fp.seek(0, 0)
108 | self.sample_cache(self.fp, self.cache_size)
109 |
110 |
111 | class RecSimBase(ABC):
112 |     """ Implementation of the core recommendation simulator"""
113 |
114 | def __init__(self, config, state_cls):
115 | self.config = config
116 | self.max_steps = config['max_steps']
117 | self.batch_size = config['batch_size']
118 | model_file = config['model_file']
119 | self.graph = tf.Graph()
120 | with self.graph.as_default():
121 | self.model = self.get_model(config)
122 |             if not self.config.get('gpu', False):  # CPU-only path: hide GPUs
123 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
124 | self.sess = tf.Session(graph=self.graph,
125 | config=tf.ConfigProto(device_count={"CPU": 4}))
126 | else:
127 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
128 | self.sess = tf.Session(graph=self.graph)
129 | self.saver = tf.train.Saver()
130 | self.reload_model(model_file)
131 | self._recData = RecDataBase(config, state_cls)
132 |
133 | def reset(self, reset_file=False):
134 | self._recData.reset(reset_file)
135 |
136 | @abstractmethod
137 | def get_model(self, config):
138 | pass
139 |
140 | @abstractmethod
141 | def obs_fn(self, state):
142 | pass
143 |
144 | @abstractmethod
145 | def forward(self, model, samples):
146 | pass
147 |
148 | def reload_model(self, model_file):
149 | with self.sess.as_default():
150 | with self.graph.as_default():
151 | self.saver.restore(self.sess, model_file)
152 |
153 | def seed(self, sd=0):
154 | self._recData.seed(sd)
155 | np.random.seed(sd)
156 |
157 | def _step(self, samples, action, **kwargs):
158 | step = kwargs['step']
159 | samples.act(action)
160 | next_state = samples.state
161 | next_obs = self.obs_fn(next_state)
162 | reward = self.forward(self.model, samples)
163 | next_info = samples.info
164 |
165 | if step < self.max_steps - 1:
166 | done = [0] * self.batch_size
167 | else:
168 | done = [1] * self.batch_size
169 |
170 | return next_obs, reward, done, next_info
171 |
172 | def sample(self, batch_size):
173 | samples = self._recData.sample(batch_size)
174 | obs = self.obs_fn(samples.state)
175 | return samples, obs
176 |
177 |
178 | class RecEnvBase(gym.Env):
179 | metadata = {'render.modes': ['human']}
180 |
181 | def __init__(self, recsim: RecSimBase):
182 | self.config = recsim.config
183 | self.batch_size = self.config['batch_size']
184 | self.cur_step = 0
185 | self.sim = recsim
186 | self.sim.reset()
187 | self.samples, self.obs = self.sim.sample(self.batch_size)
188 | if self.config.get("rawstate_as_obs", False):
189 | category_size = len(self.obs[0]['category_feature'])
190 | dense_size = len(self.obs[0]['dense_feature'])
191 | sequence_size = np.array(self.obs[0]['sequence_feature']).shape
192 | features = {
193 | "category_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(category_size,)),
194 | "dense_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=(dense_size,)),
195 | "sequence_feature": gym.spaces.Box(-1000000.0, 1000000.0, shape=sequence_size),
196 | }
197 | if self.config.get("support_rllib_mask", False):
198 | action_feature_size = len(self.obs[0]['action_mask'])
199 | self.observation_space = gym.spaces.Dict({
200 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)),
201 | **features
202 | })
203 | else:
204 | self.observation_space = gym.spaces.Dict(features)
205 | else:
206 | if self.config.get("support_rllib_mask", False):
207 | action_feature_size = len(self.obs[0]['action_mask'])
208 | self.observation_space = gym.spaces.Dict({
209 | "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)),
210 | "obs": gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]["obs"]),))
211 | })
212 | else:
213 | self.observation_space = gym.spaces.Box(-100000.0, 100000.0, shape=(len(self.obs[0]),))
214 | if self.config.get("support_conti_env", False):
215 | self.action_space = gym.spaces.Box(-1, 1, shape=(self.config['action_emb_size'],))
216 | else:
217 | self.action_space = gym.spaces.Discrete(self.config['action_size'])
218 | # if self.config.get("support_rllib_mask", False):
219 | # action_feature_size = len(self.obs[0]['action_mask'])
220 | # # avail_actions_size = len(self.obs[0]['avail_actions'][0])
221 | # # self.action_space = gym.spaces.Discrete(self.config['action_size'])
222 | # self.observation_space = gym.spaces.Dict({
223 | # "action_mask": gym.spaces.Box(0, 1, shape=(action_feature_size,)),
224 | # "obs": self.observation_space,
225 | # })
226 | # elif self.config.get("support_d3rl_mask", False):
227 | # self.action_space = gym.spaces.Discrete(self.config['action_size'])
228 | # else:
229 | # self.action_space = gym.spaces.Discrete(self.config['action_size'])
230 | self.reset()
231 |
232 | def seed(self, sd=0):
233 | self.sim.seed(sd)
234 | np.random.seed(sd)
235 |
236 | @property
237 | @single_elem_support
238 | def state(self):
239 | return self.obs
240 |
241 | @property
242 | @single_elem_support
243 | def user_id(self):
244 | return self.samples.user
245 |
246 | @property
247 | @single_elem_support
248 | def offline_action(self):
249 | return self.samples.offline_action
250 |
251 | @property
252 | @single_elem_support
253 | def offline_reward(self):
254 | return self.samples.offline_reward
255 |
256 | @single_elem_support
257 | def step(self, action):
258 | if not isinstance(action, (list, np.ndarray)):
259 | action = [action]
260 | obs, reward, done, info = \
261 | self.sim._step(self.samples, action, step=self.cur_step)
262 | self.cur_step += 1
263 | return obs, reward, done, info
264 |
265 | def reset(self, reset_file=False):
266 | self.cur_step = 0
267 | self.sim.reset(reset_file)
268 | self.samples, self.obs = self.sim.sample(self.batch_size)
269 | return self.state
270 |
271 | def render(self, mode='human', close=False):
272 | print('Current State:', '\n')
273 | print(self.samples.to_string())
274 |
--------------------------------------------------------------------------------
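RecSimBase leaves three hooks abstract (get_model, obs_fn, forward), and RecEnvBase turns any such simulator into a batched gym.Env. A schematic subclass showing the expected contract; the class body is illustrative, not a class from the repo:

from rl4rs.env.base import RecSimBase, RecEnvBase

class MySim(RecSimBase):
    def get_model(self, config):
        # build and return the TF model to be restored from config['model_file']
        ...

    def obs_fn(self, state):
        # map the batched raw state to the observation the agent sees
        ...

    def forward(self, model, samples):
        # score the current samples -> list of per-user rewards
        return [0.0] * self.batch_size

# env = RecEnvBase(MySim(config, SlateState)); env.reset(); env.step(action)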
/rl4rs/nets/exact_k/model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import tensorflow as tf
3 |
4 | from .layers import *
5 | from .modules import *
6 | from .utils import *
7 |
8 |
9 | class Generator:
10 | def __init__(self,
11 | l1_mask,
12 | l2_mask,
13 | l3_mask,
14 | l0_ssr_mask,
15 | is_training=True,
16 | lr=0.001,
17 | temperature=1,
18 | train_sample='random',
19 | predict_sample='random',
20 | seq_length=500,
21 | res_length=9,
22 | hidden_units=64,
23 | dropout_rate=0.1,
24 | num_heads=4,
25 | num_layers=1,
26 | num_glimpse=1,
27 | num_blocks=2,
28 | use_mha=True,
29 | beam_size=3
30 | ):
31 |
32 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user') # 779
33 |
34 | self.batch_size = tf.shape(self.user)[0]
35 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand')
36 |
37 | self.decode_target_ids = tf.placeholder(dtype=tf.int32, shape=[None, res_length], name="decoder_target_ids") # [batch_size, res_length]
38 | self.reward = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size]
39 |
40 | # Encoder
41 | with tf.variable_scope("encoder"):
42 | # region emb
43 |             self.enc_user = tf.layers.dense(self.user, hidden_units, activation=tf.nn.relu)  # (N, C)
44 | # enc_item = [batch_size, seq_len, hidden_units]
45 | self.enc_item = embedding(self.item_cand,
46 | vocab_size=500,
47 | num_units=hidden_units,
48 | zero_pad=False,
49 | scale=True,
50 | scope='enc_item_embed',
51 | # reuse=not is_training,
52 | reuse=False
53 | )
54 | self.enc = tf.concat([tf.stack(seq_length * [self.enc_user], axis=1), self.enc_item], axis=2)
55 | # endregion
56 | # region Dropout
57 | self.enc = tf.layers.dropout(self.enc,
58 | rate=dropout_rate,
59 | training=tf.convert_to_tensor(is_training))
60 | # endregion
61 |             # region sequence
62 | if use_mha:
63 | ## Blocks
64 | for i in range(num_blocks):
65 | with tf.variable_scope("num_blocks_{}".format(i)):
66 | ### Multihead Attention
67 | self.enc = multihead_attention(queries=self.enc,
68 | keys=self.enc,
69 | num_units=hidden_units * 2,
70 | num_heads=num_heads,
71 | dropout_rate=dropout_rate,
72 | is_training=is_training,
73 | causality=False)
74 |
75 | ### Feed Forward
76 | self.enc = feedforward(self.enc, num_units=[4 * hidden_units, hidden_units * 2])
77 | else:
78 | cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_units * 2)
79 | outputs, _ = tf.nn.dynamic_rnn(cell=cell, inputs=self.enc, dtype=tf.float32)
80 | self.enc = outputs
81 | # endregion
82 |
83 | # Decoder
84 | with tf.variable_scope("decoder"):
85 | dec_cell = LSTMCell(hidden_units * 2)
86 |
87 | if num_layers > 1:
88 |                 cells = [LSTMCell(hidden_units * 2) for _ in range(num_layers)]  # independent cell per layer
89 | dec_cell = MultiRNNCell(cells)
90 | # ptr sampling
91 | enc_init_state = trainable_initial_state(self.batch_size, dec_cell.state_size)
92 |
93 | custom_logits, custom_path, _ = ptn_rnn_decoder(
94 | dec_cell, None,
95 | self.enc, enc_init_state,
96 | seq_length, res_length, hidden_units * 2,
97 | num_glimpse, self.batch_size,
98 | l1_mask, l2_mask, l3_mask, l0_ssr_mask,
99 | mode="CUSTOM", reuse=False, beam_size=None,
100 | temperature=temperature,
101 | train_sample=train_sample, predict_sample=predict_sample
102 | )
103 | # logits: [batch_size, res_length, seq_length]
104 | self.custom_logits = tf.identity(custom_logits, name="custom_logits")
105 | # sample_path: [batch_size, res_length]
106 | self.custom_path = tf.identity(custom_path, name="custom_path")
107 | self.custom_result = batch_gather(self.item_cand, self.custom_path)
108 | sampled_logits, sampled_path, _ = ptn_rnn_decoder(
109 | dec_cell, None,
110 | self.enc, enc_init_state,
111 | seq_length, res_length, hidden_units * 2,
112 | num_glimpse, self.batch_size,
113 | l1_mask, l2_mask, l3_mask, l0_ssr_mask,
114 | mode="SAMPLE", reuse=True, beam_size=None,
115 | temperature=temperature,
116 | train_sample=train_sample, predict_sample=predict_sample
117 | )
118 | # logits: [batch_size, res_length, seq_length]
119 | self.sampled_logits = tf.identity(sampled_logits, name="sampled_logits")
120 | # sample_path: [batch_size, res_length]
121 | self.sampled_path = tf.identity(sampled_path, name="sampled_path")
122 | self.sampled_result = batch_gather(self.item_cand, self.sampled_path)
123 |
124 | # self.decode_target_ids is placeholder
125 | decoder_logits, _ = ptn_rnn_decoder(
126 | dec_cell, self.decode_target_ids,
127 | self.enc, enc_init_state,
128 | seq_length, res_length, hidden_units * 2,
129 | num_glimpse, self.batch_size,
130 | l1_mask, l2_mask, l3_mask, l0_ssr_mask,
131 | mode="TRAIN", reuse=True, beam_size=None,
132 | temperature=temperature,
133 | train_sample=train_sample, predict_sample=predict_sample
134 | )
135 | self.dec_logits = tf.identity(decoder_logits, name="dec_logits")
136 |
137 | _, beam_path, _ = ptn_rnn_decoder(
138 | dec_cell, None,
139 | self.enc, enc_init_state,
140 | seq_length, res_length, hidden_units * 2,
141 | num_glimpse, self.batch_size,
142 | l1_mask, l2_mask, l3_mask, l0_ssr_mask,
143 | mode="BEAMSEARCH", reuse=True, beam_size=beam_size,
144 | temperature=temperature,
145 | train_sample=train_sample, predict_sample=predict_sample
146 | )
147 | self.beam_path = tf.identity(beam_path, name="beam_path")
148 | self.beam_result = batch_gather(self.item_cand, self.beam_path)
149 |
150 | _, greedy_path, _ = ptn_rnn_decoder(
151 | dec_cell, None,
152 | self.enc, enc_init_state,
153 | seq_length, res_length, hidden_units * 2,
154 | num_glimpse, self.batch_size,
155 | l1_mask, l2_mask, l3_mask, l0_ssr_mask,
156 | mode="GREEDY", reuse=True, beam_size=None,
157 | temperature=temperature,
158 | train_sample=train_sample, predict_sample=predict_sample
159 | )
160 | self.greedy_path = tf.identity(greedy_path, name="greedy_path")
161 | self.greedy_result = batch_gather(self.item_cand, self.greedy_path)
162 |
163 | if is_training:
164 | # Loss
165 | # self.y_smoothed = label_smoothing(tf.one_hot(self.decode_target_ids, depth=hp.data_length))
166 | self.r_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.dec_logits,
167 | labels=self.decode_target_ids)
168 | # reinforcement
169 | self.policy_loss = tf.reduce_mean(tf.reduce_sum(self.r_loss, axis=1) * self.reward)
170 |             # total loss: the REINFORCE policy-gradient term only
171 | self.loss = self.policy_loss
172 |
173 | # Training Scheme
174 | self.global_step = tf.Variable(0, name='global_step', trainable=False)
175 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
176 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
177 |
178 | self.variables = tf.global_variables()
179 |
180 |
181 | class Discriminator:
182 | def __init__(self, lr=0.005, seq_length=500):
183 | self.user = tf.placeholder(tf.float32, shape=(None, 256), name='user')
184 | self.batch_size = tf.shape(self.user)[0]
185 | self.item_cand = tf.placeholder(tf.int32, shape=(None, seq_length), name='item_cand')
186 |
187 | self.reward_target = tf.placeholder(dtype=tf.float32, shape=[None], name="reward") # [batch_size]
188 |
189 | dense0 = self.user
190 | dense1 = tf.layers.dense(dense0, 128, activation=tf.nn.relu)
191 | dense2 = tf.layers.dense(dense1, 128, activation=tf.nn.relu)
192 | dense3 = tf.layers.dense(dense2, 128, activation=tf.nn.relu)
193 |
194 | self.reward = tf.layers.dense(dense3, 1)[:, 0]
195 |
196 | self.td_error = tf.abs(self.reward_target - self.reward)
197 | self.loss = tf.square(self.td_error)
198 |
199 | # Training Scheme
200 | self.global_step = tf.Variable(0, name='global_step', trainable=False)
201 | self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
202 | self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
203 |
--------------------------------------------------------------------------------
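The Generator trains with plain REINFORCE: per-position sparse cross-entropy against the decoded slate, summed over the res_length positions and scaled by the discriminator's reward estimate. The same objective in numpy, for reference (toy shapes, fabricated values):

import numpy as np

batch, res_len, seq_len = 2, 9, 500
logits = np.random.randn(batch, res_len, seq_len)
targets = np.random.randint(seq_len, size=(batch, res_len))
reward = np.random.randn(batch)

# sparse softmax cross-entropy per decoded position
logits = logits - logits.max(axis=-1, keepdims=True)  # numerical stability
log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
nll = -log_probs[np.arange(batch)[:, None], np.arange(res_len), targets]

# mirrors: policy_loss = reduce_mean(reduce_sum(r_loss, axis=1) * reward)
policy_loss = np.mean(nll.sum(axis=1) * reward)
print(policy_loss)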