├── .gitignore
├── README.md
├── dataloader.py
├── dlrm_criteo_gpu.py
├── dlrm_criteo_tpu.py
├── noddlrm
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── utils.py
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── dict_mean.py
│   │   └── ranking_metrics.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── latent_factor.py
│   │   ├── multi_layer_perceptron.py
│   │   ├── pairwise_log_loss.py
│   │   ├── pointwise_mse_loss.py
│   │   └── second_order_feature_interaction.py
│   └── recommenders
│       ├── __init__.py
│       └── dlrm.py
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # C extensions
6 | *.so
7 | 
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tensorflow-dlrm
2 | This is Nod's TensorFlow version of DLRM, based on the [**OpenRec**](http://www.openrec.ai/) DLRM model. We extracted the OpenRec DLRM source code and fixed some bugs in its model definition so that it works with tensorflow-gpu==2.2 and Python 3.7.
3 | 
4 | 
5 | ## Install tensorflow-dlrm from source code ##
6 | 
7 | First, clone noddlrm using `git`:
8 | 
9 | ```sh
10 | git clone https://github.com/NodLabs/tensorflow-dlrm
11 | ```
12 | 
13 | Then, `cd` to the tensorflow-dlrm folder and run the install command (if you want to install
14 | noddlrm into your Python environment):
15 | 
16 | ```sh
17 | cd tensorflow-dlrm
18 | python setup.py install
19 | ```
20 | Now you have installed noddlrm on your system.
21 | 
22 | ## Dataset download
23 | 
24 | All datasets can be downloaded from Google Drive [here](https://drive.google.com/drive/folders/1taJ91txiMAWBMUtezc_N5gaYuTEpvW_e?usp=sharing).
25 | In our example, we use the Criteo dataset.
26 | 
27 | ## Training and saving the model
28 | Edit dlrm_criteo_gpu.py or dlrm_criteo_tpu.py to point to your Criteo dataset path.
29 | Then run the example script we have provided.
30 | ```sh 31 | cd tensorflow-dlrm/ 32 | export PYTHONPATH="$PWD" 33 | python3 dlrm_criteo_gpu.py 34 | # python3 dlrm_criteo_tpu.py 35 | ``` 36 | ## Outputs ## 37 | ### GPU ### 38 | ```sh 39 | python3 dlrm_criteo_gpu.py 40 | 41 | 2020-07-14 08:10:32.701182: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 42 | 2020-07-14 08:10:32.729195: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 43 | 2020-07-14 08:10:32.729514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 44 | pciBusID: 0000:01:00.0 name: GeForce RTX 2080 computeCapability: 7.5 45 | coreClock: 1.59GHz coreCount: 46 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s 46 | 2020-07-14 08:10:32.729734: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 47 | 2020-07-14 08:10:32.730158: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcublas.so.10'; dlerror: libcublas.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 48 | 2020-07-14 08:10:32.730298: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 49 | 2020-07-14 08:10:32.730501: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 50 | 2020-07-14 08:10:32.730632: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcusolver.so.10'; dlerror: libcusolver.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 51 | 2020-07-14 08:10:32.730848: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcusparse.so.10'; dlerror: libcusparse.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/chi/bin/vulkansdk/x86_64/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 52 | 2020-07-14 08:10:32.766959: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7 53 | 2020-07-14 08:10:32.766987: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1598] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. 54 | Skipping registering GPU devices... 
55 | 2020-07-14 08:10:32.768812: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA 56 | 2020-07-14 08:10:32.799122: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2899885000 Hz 57 | 2020-07-14 08:10:32.799874: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f2204000b20 initialized for platform Host (this does not guarantee that XLA will be used). Devices: 58 | 2020-07-14 08:10:32.799887: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 59 | 2020-07-14 08:10:32.801441: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102] Device interconnect StreamExecutor with strength 1 edge matrix: 60 | 2020-07-14 08:10:32.801453: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1108] 61 | WARNING:tensorflow:From /home/chi/nnc_env/lib/python3.7/site-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from tensorflow.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version. 62 | Instructions for updating: 63 | Do not pass `graph_parents`. They will no longer be used. 64 | Iter: 0, Loss: 0.24, AUC: 0.5614 65 | Iter: 100, Loss: 0.19, AUC: 0.6755 66 | Iter: 200, Loss: 0.17, AUC: 0.6976 67 | Iter: 300, Loss: 0.17, AUC: 0.7037 68 | Iter: 400, Loss: 0.17, AUC: 0.7062 69 | Iter: 500, Loss: 0.17, AUC: 0.7079 70 | Iter: 600, Loss: 0.17, AUC: 0.7080 71 | Iter: 700, Loss: 0.17, AUC: 0.7095 72 | Iter: 800, Loss: 0.17, AUC: 0.7099 73 | Iter: 900, Loss: 0.17, AUC: 0.7103 74 | 2020-07-14 08:21:00.611816: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them. 75 | WARNING:tensorflow:From /home/chi/nnc_env/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version. 76 | Instructions for updating: 77 | If using Keras pass *_constraint arguments to layers. 78 | 79 | ``` 80 | ### TPU ### 81 | ```sh 82 | python3 dlrm_criteo_tpu.py 83 | 84 | 2020-07-14 15:16:10.152558: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA 85 | 2020-07-14 15:16:10.179204: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2300000000 Hz 86 | 2020-07-14 15:16:10.183868: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x38c1f20 initialized for platform Host (this does not guarantee that XLA will be used). 
87 | Devices: 88 | 2020-07-14 15:16:10.183924: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version 89 | 2020-07-14 15:16:10.232148: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.240.1.2:8470} 90 | 2020-07-14 15:16:10.232200: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:31017} 91 | 2020-07-14 15:16:10.255055: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.240.1.2:8470} 92 | 2020-07-14 15:16:10.255110: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:31017} 93 | 2020-07-14 15:16:10.259154: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:390] Started server with target: grpc://localhost:31017 94 | All devices: [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TP 95 | U'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), Logic 96 | alDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(n 97 | ame='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU')] 98 | WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from tensorf 99 | low.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version. 100 | Instructions for updating: 101 | Do not pass `graph_parents`. They will no longer be used. 102 | WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from tensorf 103 | low.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version. 104 | Instructions for updating: 105 | Do not pass `graph_parents`. They will no longer be used. 106 | Iter: 0, Loss: 0.22, AUC: 0.4884 107 | Iter: 100, Loss: 0.19, AUC: 0.6258 108 | Iter: 200, Loss: 0.18, AUC: 0.6673 109 | Iter: 300, Loss: 0.18, AUC: 0.6827 110 | Iter: 400, Loss: 0.17, AUC: 0.6939 111 | Iter: 500, Loss: 0.17, AUC: 0.7030 112 | Iter: 600, Loss: 0.17, AUC: 0.7066 113 | Iter: 700, Loss: 0.17, AUC: 0.7079 114 | Iter: 800, Loss: 0.17, AUC: 0.7100 115 | Iter: 900, Loss: 0.17, AUC: 0.7107 116 | 2020-07-14 15:22:10.259060: W tensorflow/python/util/util.cc:329] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them. 117 | WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python. 118 | ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version. 119 | Instructions for updating: 120 | If using Keras pass *_constraint arguments to layers. 
121 | WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.
122 | ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
123 | Instructions for updating:
124 | If using Keras pass *_constraint arguments to layers.
125 | ```
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from io import BytesIO
4 | from tensorflow.python.lib.io import file_io
5 | 
6 | def load_amazon_book(dataset_folder='dataset/'):
7 | 
8 |     raw_data = dict()
9 |     raw_data['total_users'] = 99473
10 |     raw_data['total_items'] = 450166
11 | 
12 |     raw_data['train_data'] = np.load(dataset_folder + 'amazon/user_data_train.npy')
13 |     raw_data['val_data'] = np.load(dataset_folder + 'amazon/user_data_val.npy')
14 |     raw_data['test_data'] = np.load(dataset_folder + 'amazon/user_data_test.npy')
15 | 
16 |     raw_data['item_features'] = np.array(np.memmap(dataset_folder + 'amazon/book_features_update.mem',
17 |                                                    dtype=np.float32, mode='r', shape=(raw_data['total_items'], 4096)))
18 |     raw_data['user_features'] = np.load(dataset_folder + 'amazon/user_features_categories.npy')
19 |     return raw_data
20 | 
21 | def load_citeulike(dataset_folder='dataset/'):
22 | 
23 |     raw_data = dict()
24 |     raw_data['total_users'] = 5551
25 |     raw_data['total_items'] = 16980
26 | 
27 |     raw_data['train_data'] = np.load(dataset_folder + 'citeulike/user_data_train.npy')
28 |     raw_data['val_data'] = np.load(dataset_folder + 'citeulike/user_data_val.npy')
29 |     raw_data['test_data'] = np.load(dataset_folder + 'citeulike/user_data_test.npy')
30 | 
31 |     return raw_data
32 | 
33 | def load_tradesy(dataset_folder='dataset/'):
34 | 
35 |     raw_data = dict()
36 |     raw_data['total_users'] = 19243
37 |     raw_data['total_items'] = 165906
38 | 
39 |     raw_data['train_data'] = np.load(dataset_folder + 'tradesy/user_data_train.npy')
40 |     raw_data['val_data'] = np.load(dataset_folder + 'tradesy/user_data_val.npy')
41 |     raw_data['test_data'] = np.load(dataset_folder + 'tradesy/user_data_test.npy')
42 | 
43 |     raw_data['item_features'] = np.load(dataset_folder + 'tradesy/item_features.npy') / 32.671101
44 |     return raw_data
45 | 
46 | 
47 | def load_criteo_google_cloud(dataset_folder='dataset/'):
48 |     # Data processing code adapted from https://github.com/facebookresearch/dlrm
49 |     # Follow steps in https://github.com/ylongqi/dlrm/blob/master/data_utils.py to generate kaggle_processed.npz
50 |     # Or use the `./download_dataset.sh criteo` command to download the processed data.
51 | 52 | f = BytesIO(file_io.read_file_to_string(dataset_folder + 'criteo/kaggle_processed.npz', binary_mode=True)) 53 | with np.load(f) as data: 54 | X_int = data["X_int"] 55 | X_cat = data["X_cat"] 56 | y = data["y"] 57 | counts = data["counts"] 58 | 59 | indices = np.arange(len(y)) 60 | indices = np.array_split(indices, 7) 61 | for i in range(len(indices)): 62 | indices[i] = np.random.permutation(indices[i]) 63 | 64 | train_indices = np.concatenate(indices[:-1]) 65 | test_indices = indices[-1] 66 | val_indices, test_indices = np.array_split(test_indices, 2) 67 | train_indices = np.random.permutation(train_indices) 68 | 69 | raw_data = dict() 70 | 71 | raw_data['counts'] = counts 72 | 73 | raw_data['X_cat_train'] = X_cat[train_indices].astype(np.int32) 74 | raw_data['X_int_train'] = np.log(X_int[train_indices] + 1).astype(np.float32) 75 | raw_data['y_train'] = y[train_indices].astype(np.float32) 76 | 77 | raw_data['X_cat_val'] = X_cat[val_indices] 78 | raw_data['X_int_val'] = np.log(X_int[val_indices] + 1).astype(np.float32) 79 | raw_data['y_val'] = y[val_indices] 80 | 81 | raw_data['X_cat_test'] = X_cat[test_indices] 82 | raw_data['X_int_test'] = np.log(X_int[test_indices] + 1).astype(np.float32) 83 | raw_data['y_test'] = y[test_indices] 84 | 85 | return raw_data 86 | 87 | def load_criteo(dataset_folder='dataset/'): 88 | 89 | # Data processing code adapted from https://github.com/facebookresearch/dlrm 90 | # Follow steps in https://github.com/ylongqi/dlrm/blob/master/data_utils.py to generate kaggle_processed.npz 91 | # Or using `./download_dataset.sh criteo` command to download the processed data. 92 | 93 | with np.load(dataset_folder + 'criteo/kaggle_processed.npz') as data: 94 | 95 | X_int = data["X_int"] 96 | X_cat = data["X_cat"] 97 | y = data["y"] 98 | counts = data["counts"] 99 | 100 | indices = np.arange(len(y)) 101 | indices = np.array_split(indices, 7) 102 | for i in range(len(indices)): 103 | indices[i] = np.random.permutation(indices[i]) 104 | 105 | train_indices = np.concatenate(indices[:-1]) 106 | test_indices = indices[-1] 107 | val_indices, test_indices = np.array_split(test_indices, 2) 108 | train_indices = np.random.permutation(train_indices) 109 | 110 | raw_data = dict() 111 | 112 | raw_data['counts'] = counts 113 | 114 | raw_data['X_cat_train'] = X_cat[train_indices].astype(np.int32) 115 | raw_data['X_int_train'] = np.log(X_int[train_indices]+1).astype(np.float32) 116 | raw_data['y_train'] = y[train_indices].astype(np.float32) 117 | 118 | raw_data['X_cat_val'] = X_cat[val_indices] 119 | raw_data['X_int_val'] = np.log(X_int[val_indices]+1).astype(np.float32) 120 | raw_data['y_val'] = y[val_indices] 121 | 122 | raw_data['X_cat_test'] = X_cat[test_indices] 123 | raw_data['X_int_test'] = np.log(X_int[test_indices]+1).astype(np.float32) 124 | raw_data['y_test'] = y[test_indices] 125 | 126 | return raw_data 127 | -------------------------------------------------------------------------------- /dlrm_criteo_gpu.py: -------------------------------------------------------------------------------- 1 | from tensorflow.data import Dataset 2 | from noddlrm.recommenders import DLRM 3 | from tensorflow.keras import optimizers 4 | from tqdm import tqdm 5 | import tensorflow as tf 6 | import dataloader 7 | 8 | raw_data = dataloader.load_criteo('../dataset/') 9 | dim_embed = 4 10 | bottom_mlp_size = [8, 4] 11 | top_mlp_size = [128, 64, 1] 12 | total_iter = int(1e5) 13 | batch_size = 1024 14 | eval_interval = 100 15 | save_interval = eval_interval 16 | 17 | # Sample 1000 batches for 
training 18 | train_dataset = Dataset.from_tensor_slices({ 19 | 'dense_features': raw_data['X_int_train'][:batch_size*1000], 20 | 'sparse_features': raw_data['X_cat_train'][:batch_size*1000], 21 | 'label': raw_data['y_train'][:batch_size*1000] 22 | }).batch(batch_size).prefetch(1).shuffle(5*batch_size) 23 | 24 | # Sample 100 batches for validation 25 | val_dataset = Dataset.from_tensor_slices({ 26 | 'dense_features': raw_data['X_int_val'][:batch_size*100], 27 | 'sparse_features': raw_data['X_cat_val'][:batch_size*100], 28 | 'label': raw_data['y_val'][:batch_size*100] 29 | }).batch(batch_size) 30 | 31 | optimizer = optimizers.Adam() 32 | 33 | dlrm_model = DLRM( 34 | m_spa=dim_embed, 35 | ln_emb=raw_data['counts'], 36 | ln_bot=bottom_mlp_size, 37 | ln_top=top_mlp_size 38 | ) 39 | 40 | auc = tf.keras.metrics.AUC() 41 | 42 | @tf.function 43 | def train_step(dense_features, sparse_features, label): 44 | with tf.GradientTape() as tape: 45 | loss_value = dlrm_model.get_myloss(dense_features, sparse_features, label) 46 | gradients = tape.gradient(loss_value, dlrm_model.trainable_variables) 47 | optimizer.apply_gradients(zip(gradients, dlrm_model.trainable_variables)) 48 | return loss_value 49 | 50 | @tf.function 51 | def eval_step(dense_features, sparse_features, label): 52 | pred = dlrm_model.inference(dense_features, sparse_features) 53 | auc.update_state(y_true=label, y_pred=pred) 54 | 55 | average_loss = tf.keras.metrics.Mean() 56 | 57 | for train_iter, batch_data in enumerate(train_dataset): 58 | 59 | loss = train_step(**batch_data) 60 | average_loss.update_state(loss) 61 | print('%d iter training.' % train_iter, end='\r') 62 | 63 | if train_iter % eval_interval == 0: 64 | for eval_batch_data in tqdm(val_dataset, 65 | leave=False, 66 | desc='%d iter evaluation' % train_iter): 67 | eval_step(**eval_batch_data) 68 | print("Iter: %d, Loss: %.2f, AUC: %.4f" % (train_iter, 69 | average_loss.result().numpy(), 70 | auc.result().numpy())) 71 | average_loss.reset_states() 72 | auc.reset_states() 73 | 74 | dlrm_model.save('DLRMModel_tf2_2') 75 | -------------------------------------------------------------------------------- /dlrm_criteo_tpu.py: -------------------------------------------------------------------------------- 1 | from tensorflow.data import Dataset 2 | from noddlrm.recommenders import DLRM 3 | from tensorflow.keras import optimizers 4 | from tqdm import tqdm 5 | import tensorflow as tf 6 | import dataloader 7 | 8 | #setup tpu enviroment 9 | resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://10.240.1.2') 10 | tf.config.experimental_connect_to_cluster(resolver) 11 | # This is the TPU initialization code that has to be at the beginning. 
12 | tf.tpu.experimental.initialize_tpu_system(resolver) 13 | print("All devices: ", tf.config.list_logical_devices('TPU')) 14 | 15 | raw_data = dataloader.load_criteo('../dataset/') 16 | dim_embed = 4 17 | bottom_mlp_size = [8, 4] 18 | top_mlp_size = [128, 64, 1] 19 | total_iter = int(1e5) 20 | batch_size = 1024 21 | eval_interval = 100 22 | save_interval = eval_interval 23 | 24 | # Sample 1000 batches for training 25 | train_dataset = Dataset.from_tensor_slices({ 26 | 'dense_features': raw_data['X_int_train'][:batch_size*1000], 27 | 'sparse_features': raw_data['X_cat_train'][:batch_size*1000], 28 | 'label': raw_data['y_train'][:batch_size*1000] 29 | }).batch(batch_size).prefetch(1).shuffle(5*batch_size) 30 | 31 | # Sample 100 batches for validation 32 | val_dataset = Dataset.from_tensor_slices({ 33 | 'dense_features': raw_data['X_int_val'][:batch_size*100], 34 | 'sparse_features': raw_data['X_cat_val'][:batch_size*100], 35 | 'label': raw_data['y_val'][:batch_size*100] 36 | }).batch(batch_size) 37 | 38 | optimizer = optimizers.Adam() 39 | 40 | dlrm_model = DLRM( 41 | m_spa=dim_embed, 42 | ln_emb=raw_data['counts'], 43 | ln_bot=bottom_mlp_size, 44 | ln_top=top_mlp_size 45 | ) 46 | 47 | auc = tf.keras.metrics.AUC() 48 | 49 | @tf.function 50 | def train_step(dense_features, sparse_features, label): 51 | with tf.GradientTape() as tape: 52 | loss_value = dlrm_model.get_myloss(dense_features, sparse_features, label) 53 | gradients = tape.gradient(loss_value, dlrm_model.trainable_variables) 54 | optimizer.apply_gradients(zip(gradients, dlrm_model.trainable_variables)) 55 | return loss_value 56 | 57 | @tf.function 58 | def eval_step(dense_features, sparse_features, label): 59 | pred = dlrm_model.inference(dense_features, sparse_features) 60 | auc.update_state(y_true=label, y_pred=pred) 61 | 62 | average_loss = tf.keras.metrics.Mean() 63 | 64 | for train_iter, batch_data in enumerate(train_dataset): 65 | 66 | loss = train_step(**batch_data) 67 | average_loss.update_state(loss) 68 | print('%d iter training.' 
% train_iter, end='\r') 69 | 70 | if train_iter % eval_interval == 0: 71 | for eval_batch_data in tqdm(val_dataset, 72 | leave=False, 73 | desc='%d iter evaluation' % train_iter): 74 | eval_step(**eval_batch_data) 75 | print("Iter: %d, Loss: %.2f, AUC: %.4f" % (train_iter, 76 | average_loss.result().numpy(), 77 | auc.result().numpy())) 78 | average_loss.reset_states() 79 | auc.reset_states() 80 | 81 | dlrm_model.save('gs://nodtpu/chi/drlm/models/criteo/') 82 | -------------------------------------------------------------------------------- /noddlrm/__init__.py: -------------------------------------------------------------------------------- 1 | from noddlrm.recommenders.dlrm import DLRM -------------------------------------------------------------------------------- /noddlrm/data/__init__.py: -------------------------------------------------------------------------------- 1 | from noddlrm.data.utils import _DataStore 2 | from noddlrm.data.utils import _ParallelDataset 3 | from noddlrm.data.dataset import Dataset -------------------------------------------------------------------------------- /noddlrm/data/dataset.py: -------------------------------------------------------------------------------- 1 | from noddlrm.data import _ParallelDataset 2 | from noddlrm.data import _DataStore 3 | import tensorflow as tf 4 | import numpy as np 5 | import random 6 | 7 | def _pairwise_generator(datastore): 8 | 9 | while True: 10 | entry = datastore.next_random_record() 11 | user_id = entry['user_id'] 12 | p_item_id = entry['item_id'] 13 | n_item_id = datastore.sample_negative_items(user_id)[0] 14 | yield {'user_id': user_id, 15 | 'p_item_id': p_item_id, 16 | 'n_item_id': n_item_id} 17 | 18 | def _stratified_pointwise_generator(datastore, pos_ratio): 19 | 20 | while True: 21 | if random.random() <= pos_ratio: 22 | entry = datastore.next_random_record() 23 | yield {'user_id': entry['user_id'], 24 | 'item_id': entry['item_id'], 25 | 'label': 1.0} 26 | else: 27 | user_id = random.randint(0, datastore.total_users()-1) 28 | item_id = random.randint(0, datastore.total_items()-1) 29 | while datastore.is_positive(user_id, item_id): 30 | user_id = random.randint(0, datastore.total_users()-1) 31 | item_id = random.randint(0, datastore.total_items()-1) 32 | yield {'user_id': user_id, 33 | 'item_id': item_id, 34 | 'label': 0.0} 35 | 36 | def _per_pos_stratified_pointwise_generator(datastore, pos_ratio): 37 | 38 | num_negative_per_positive = int((1 - pos_ratio) / pos_ratio) 39 | 40 | while True: 41 | 42 | entry = datastore.next_random_record() 43 | user_id = entry['user_id'] 44 | p_item_id = entry['item_id'] 45 | yield {'user_id': user_id, 46 | 'item_id': p_item_id, 47 | 'label': 1.0} 48 | 49 | count = 0 50 | for n_item_id in random.sample(range(datastore.total_items()), k=num_negative_per_positive + 1): 51 | if n_item_id == p_item_id: 52 | continue 53 | yield {'user_id': user_id, 54 | 'item_id': n_item_id, 55 | 'label': 0.0} 56 | count += 1 57 | if count >= num_negative_per_positive: 58 | break 59 | 60 | def _evaluation_generator(datastore, excl_datasets): 61 | 62 | eval_users = datastore.warm_users() 63 | 64 | for user_id in eval_users: 65 | 66 | pos_mask_npy = np.zeros(datastore.total_items(), dtype=np.bool) # Reset pos_mask 67 | positive_items = datastore.get_positive_items(user_id) 68 | pos_mask_npy[positive_items] = True 69 | 70 | if datastore.contain_negatives(): 71 | excl_mask_npy = np.ones(datastore.total_items(), dtype=np.bool) # Reset excl_mask 72 | excl_mask_npy[positive_items] = False 73 | negative_items 
= datastore.get_negative_items(user_id) 74 | excl_mask_npy[negative_items] = False 75 | else: 76 | excl_mask_npy = np.zeros(datastore.total_items(), dtype=np.bool) # Reset excl_mask 77 | 78 | excl_positive_items = [] 79 | for excl_d in excl_datasets: 80 | excl_positive_items += excl_d.datastore.get_positive_items(user_id) 81 | excl_mask_npy[excl_positive_items] = True 82 | 83 | yield {'user_id': user_id, 84 | 'pos_mask': pos_mask_npy, 85 | 'excl_mask': excl_mask_npy} 86 | 87 | class Dataset: 88 | 89 | def __init__(self, raw_data, total_users, total_items, implicit_negative=True, 90 | num_negatives=None, seed=None, sortby=None, asc=True, name=None): 91 | 92 | self.datastore = _DataStore(raw_data=raw_data, 93 | total_users=total_users, 94 | total_items=total_items, 95 | implicit_negative=implicit_negative, 96 | num_negatives=num_negatives, 97 | seed=seed, sortby=sortby, name=name, asc=asc) 98 | 99 | def _build_dataset(self, generator, generator_params, output_types, output_shapes, 100 | batch_size, num_parallel_calls, take=None): 101 | 102 | 103 | return _ParallelDataset(generator=generator, 104 | generator_params=generator_params, 105 | output_types=output_types, 106 | output_shapes=output_shapes, 107 | batch_size=batch_size, 108 | num_parallel_calls=num_parallel_calls, 109 | take=take) 110 | 111 | def pairwise(self, batch_size, num_parallel_calls=1, take=None): 112 | 113 | output_types = {'user_id': tf.int32, 114 | 'p_item_id': tf.int32, 115 | 'n_item_id': tf.int32} 116 | output_shapes = {'user_id':[], 117 | 'p_item_id':[], 118 | 'n_item_id':[]} 119 | 120 | return self._build_dataset(generator=_pairwise_generator, 121 | generator_params=(self.datastore, ), 122 | output_types=output_types, 123 | output_shapes=output_shapes, 124 | batch_size=batch_size, 125 | num_parallel_calls=num_parallel_calls, 126 | take=take) 127 | 128 | def stratified_pointwise(self, batch_size, pos_ratio=0.5, num_parallel_calls=1, take=None): 129 | 130 | output_types = {'user_id': tf.int32, 131 | 'item_id': tf.int32, 132 | 'label': tf.float32} 133 | output_shapes = {'user_id':[], 134 | 'item_id':[], 135 | 'label':[]} 136 | 137 | return self._build_dataset(generator=_stratified_pointwise_generator, 138 | generator_params=(self.datastore, pos_ratio), 139 | output_types=output_types, 140 | output_shapes=output_shapes, 141 | batch_size=batch_size, 142 | num_parallel_calls=num_parallel_calls, 143 | take=take) 144 | 145 | def per_pos_stratified_pointwise(self, batch_size, pos_ratio=0.5, num_parallel_calls=1, take=None): 146 | 147 | output_types = {'user_id': tf.int32, 148 | 'item_id': tf.int32, 149 | 'label': tf.float32} 150 | output_shapes = {'user_id':[], 151 | 'item_id':[], 152 | 'label':[]} 153 | 154 | return self._build_dataset(generator=_per_pos_stratified_pointwise_generator, 155 | generator_params=(self.datastore, pos_ratio), 156 | output_types=output_types, 157 | output_shapes=output_shapes, 158 | batch_size=batch_size, 159 | num_parallel_calls=num_parallel_calls, 160 | take=take) 161 | 162 | def evaluation(self, batch_size, excl_datasets=[]): 163 | 164 | output_types = {'user_id': tf.int32, 165 | 'pos_mask': tf.bool, 166 | 'excl_mask': tf.bool} 167 | output_shapes = {'user_id': [], 168 | 'pos_mask': [self.datastore.total_items()], 169 | 'excl_mask': [self.datastore.total_items()]} 170 | 171 | return self._build_dataset(generator=_evaluation_generator, 172 | generator_params=(self.datastore, excl_datasets), 173 | output_types=output_types, 174 | output_shapes=output_shapes, 175 | batch_size=batch_size, 176 | 
num_parallel_calls=1) 177 | -------------------------------------------------------------------------------- /noddlrm/data/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | 6 | class _DataStore(object): 7 | 8 | def __init__(self, raw_data, total_users, total_items, implicit_negative=True, 9 | num_negatives=None, seed=None, sortby=None, asc=True, name=None): 10 | 11 | self.name = name 12 | random.seed(seed) 13 | if type(raw_data) == np.ndarray: 14 | self._raw_data = raw_data 15 | else: 16 | raise TypeError("Unsupported data input schema. Please use structured numpy array.") 17 | self._rand_ids = [] 18 | 19 | self._total_users = total_users 20 | self._total_items = total_items 21 | 22 | self._sortby = sortby 23 | 24 | self._index_store = dict() 25 | self._implicit_negative = implicit_negative 26 | self._num_negatives = num_negatives 27 | if self._implicit_negative: 28 | self._index_store['positive'] = dict() 29 | for ind, entry in enumerate(self._raw_data): 30 | if entry['user_id'] not in self._index_store['positive']: 31 | self._index_store['positive'][entry['user_id']] = dict() 32 | self._index_store['positive'][entry['user_id']][entry['item_id']] = ind 33 | self._index_store['positive_sets'] = dict() 34 | for user_id in self._index_store['positive']: 35 | self._index_store['positive_sets'][user_id] = set(self._index_store['positive'][user_id]) 36 | if num_negatives is not None: 37 | self._index_store['negative'] = dict() 38 | for user_id in self._index_store['positive']: 39 | self._index_store['negative'][user_id] = dict() 40 | shuffled_items = np.random.permutation(self._total_items) 41 | for item in shuffled_items: 42 | if item not in self._index_store['positive'][user_id]: 43 | self._index_store['negative'][user_id][item] = None 44 | if len(self._index_store['negative'][user_id]) == num_negatives: 45 | break 46 | self._index_store['negative_sets'] = dict() 47 | for user_id in self._index_store['negative']: 48 | self._index_store['negative_sets'][user_id] = set(self._index_store['negative'][user_id]) 49 | else: 50 | self._index_store['positive'] = dict() 51 | self._index_store['negative'] = dict() 52 | for ind, entry in enumerate(self._raw_data): 53 | if entry['label'] > 0: 54 | if entry['user_id'] not in self._index_store['positive']: 55 | self._index_store['positive'][entry['user_id']] = dict() 56 | self._index_store['positive'][entry['user_id']][entry['item_id']] = ind 57 | else: 58 | if entry['user_id'] not in self._index_store['negative']: 59 | self._index_store['negative'][entry['user_id']] = dict() 60 | self._index_store['negative'][entry['user_id']][entry['item_id']] = ind 61 | self._index_store['positive_sets'] = dict() 62 | for user_id in self._index_store['positive']: 63 | self._index_store['positive_sets'][user_id] = set(self._index_store['positive'][user_id]) 64 | self._index_store['negative_sets'] = dict() 65 | for user_id in self._index_store['negative']: 66 | self._index_store['negative_sets'][user_id] = set(self._index_store['negative'][user_id]) 67 | 68 | if self._sortby is not None: 69 | self._index_store['positive_sorts'] = dict() 70 | for user_id in self._index_store['positive_sets']: 71 | self._index_store['positive_sorts'][user_id] = sorted(list(self._index_store['positive_sets'][user_id]), 72 | key=lambda item:\ 73 | self._raw_data[self._index_store['positive'][user_id][item]][self._sortby], 74 | reverse=not asc) 75 | 
def contain_negatives(self): 76 | 77 | if self._implicit_negative and self._num_negatives is None: 78 | return False 79 | else: 80 | return True 81 | 82 | def next_random_record(self): 83 | 84 | if len(self._rand_ids) == 0: 85 | self._rand_ids = list(range(len(self._raw_data))) 86 | random.shuffle(self._rand_ids) 87 | return self._raw_data[self._rand_ids.pop()] 88 | 89 | def is_positive(self, user_id, item_id): 90 | 91 | if user_id in self._index_store['positive'] and item_id in self._index_store['positive'][user_id]: 92 | return True 93 | return False 94 | 95 | def sample_positive_items(self, user_id, num_samples=1): 96 | 97 | if user_id in self._index_store['positive_sets']: 98 | return random.sample(self._index_store['positive_sets'][user_id], num_samples) 99 | else: 100 | return [] 101 | 102 | def sample_negative_items(self, user_id, num_samples=1): 103 | 104 | if 'negative_sets' in self._index_store: 105 | if user_id in self._index_store['negative_sets']: 106 | return random.sample(self._index_store['negative_sets'][user_id], num_samples) 107 | else: 108 | return [] 109 | else: 110 | sample_id = random.randint(0, self._total_items-1) 111 | sample_set = set() 112 | while len(sample_set) < num_samples: 113 | if user_id not in self._index_store['positive_sets'] or sample_id not in self._index_store['positive_sets'][user_id]: 114 | sample_set.add(sample_id) 115 | sample_id = random.randint(0, self._total_items-1) 116 | return list(sample_set) 117 | 118 | def get_positive_items(self, user_id, sort=False): 119 | 120 | if user_id in self._index_store['positive_sets']: 121 | if sort: 122 | assert self._sortby is not None, "sortby key is not specified." 123 | return self._index_store['positive_sorts'][user_id] 124 | else: 125 | return list(self._index_store['positive_sets'][user_id]) 126 | else: 127 | return [] 128 | 129 | def get_negative_items(self, user_id): 130 | 131 | if 'negative_sets' in self._index_store: 132 | if user_id in self._index_store['negative_sets']: 133 | return list(self._index_store['negative_sets'][user_id]) 134 | else: 135 | return [] 136 | else: 137 | negative_items = [] 138 | for item_id in range(self._total_items): 139 | if item_id not in self._index_store['positive_sets'][user_id]: 140 | negative_items.append(item_id) 141 | return negative_items 142 | 143 | def warm_users(self, threshold=1): 144 | 145 | users_list = [] 146 | for user_id in self._index_store['positive']: 147 | if len(self._index_store['positive'][user_id]) >= threshold: 148 | users_list.append(user_id) 149 | return users_list 150 | 151 | def total_users(self): 152 | 153 | return self._total_users 154 | 155 | def total_items(self): 156 | 157 | return self._total_items 158 | 159 | def total_records(self): 160 | 161 | return len(self._raw_data) 162 | 163 | 164 | def _process(q, generator, generator_params, output_shapes, batch_size): 165 | 166 | batch_data = {key:[] for key in output_shapes} 167 | num_data_points = 0 168 | 169 | for single_data in generator(*generator_params): 170 | for key in single_data: 171 | batch_data[key].append(single_data[key]) 172 | num_data_points += 1 173 | if num_data_points == batch_size: 174 | q.put(batch_data) 175 | batch_data = {key:[] for key in output_shapes} 176 | num_data_points = 0 177 | 178 | if num_data_points > 0: 179 | q.put(batch_data) 180 | q.put(None) 181 | 182 | class _ParallelDataset: 183 | 184 | def __init__(self, generator, generator_params, output_types, output_shapes, 185 | num_parallel_calls, batch_size, take): 186 | 187 | ctx = mp.get_context('spawn') 
188 |         self._q = ctx.Queue(maxsize=num_parallel_calls)
189 |         self._output_types = output_types
190 |         self._take = take
191 |         self._count = 0
192 | 
193 |         self._p_list = []
194 | 
195 |         for i in range(num_parallel_calls):
196 |             self._p_list.append(ctx.Process(target=_process, args=(self._q, generator, generator_params, output_shapes, batch_size)))
197 |             self._p_list[i].daemon = True
198 |             self._p_list[i].start()
199 | 
200 |     def __iter__(self):
201 | 
202 |         return self
203 | 
204 |     def __next__(self):
205 | 
206 |         if self._take is None or self._count < self._take:
207 |             batch_data = self._q.get()
208 |             if batch_data is None:
209 |                 raise StopIteration()
210 |             else:
211 |                 self._count += 1
212 |                 return {key:tf.constant(batch_data[key], dtype=self._output_types[key]) for key in batch_data}
213 |         else:
214 |             raise StopIteration()
215 | 
216 | 
--------------------------------------------------------------------------------
/noddlrm/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from noddlrm.metrics.ranking_metrics import *
2 | from noddlrm.metrics.dict_mean import DictMean
--------------------------------------------------------------------------------
/noddlrm/metrics/dict_mean.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | 
4 | class DictMean:
5 | 
6 |     def __init__(self, state_shape):
7 | 
8 |         self._states = {}
9 |         for key in state_shape:
10 |             shape = state_shape[key]
11 |             self._states[key] = {'sum': tf.Variable(tf.zeros(shape, dtype=tf.float32)),
12 |                                  'count': tf.Variable(tf.zeros([], dtype=tf.float32))}
13 | 
14 |     def reset_states(self):
15 | 
16 |         for key in self._states:
17 |             self._states[key]['sum'].assign(tf.zeros(tf.shape(self._states[key]['sum']),
18 |                                                      dtype=tf.float32))
19 |             self._states[key]['count'].assign(0.)
20 | 21 | def update_state(self, state): 22 | 23 | for key in state: 24 | self._states[key]['sum'].assign_add(tf.math.reduce_sum(state[key], axis=0)) 25 | self._states[key]['count'].assign_add(tf.cast(tf.shape(state[key])[0], tf.float32)) 26 | 27 | def result(self): 28 | 29 | result = {} 30 | for key in self._states: 31 | result[key] = self._states[key]['sum'] / self._states[key]['count'] 32 | return result 33 | -------------------------------------------------------------------------------- /noddlrm/metrics/ranking_metrics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def _log2(value): 5 | 6 | return tf.math.log(value) / tf.math.log(2.0) 7 | 8 | def AUC(pos_mask, pred, excl_mask): 9 | 10 | def _map_fn(tups): 11 | 12 | user_pos_mask, user_pred, user_excl_mask = tups 13 | 14 | eval_mask = tf.math.logical_not(tf.math.logical_or(user_pos_mask, user_excl_mask)) 15 | eval_pred = user_pred[eval_mask] 16 | pos_pred = user_pred[user_pos_mask] 17 | eval_num = tf.math.count_nonzero(eval_mask, dtype=tf.int32) 18 | user_auc = tf.math.count_nonzero(eval_pred <= tf.reshape(pos_pred, (-1, 1)), dtype=tf.float32) \ 19 | / tf.cast(tf.size(pos_pred) * eval_num, dtype=tf.float32) 20 | 21 | return user_auc 22 | 23 | auc = tf.map_fn(_map_fn, (pos_mask, pred, excl_mask), parallel_iterations=10, dtype=tf.float32) 24 | 25 | return auc 26 | 27 | 28 | def NDCG(pos_mask, pred, excl_mask, at=[100]): 29 | 30 | def _map_fn(tups): 31 | 32 | user_pos_mask, user_pred, user_excl_mask = tups 33 | user_pred = tf.math.exp(user_pred) * tf.cast(tf.math.logical_not(user_excl_mask), tf.float32) 34 | pos_pred = user_pred[user_pos_mask] 35 | rank_above = tf.math.count_nonzero(user_pred > tf.reshape(pos_pred, (-1, 1)), axis=1, dtype=tf.float32) 36 | rank_above = tf.tile(tf.expand_dims(rank_above, 0), [len(at), 1]) 37 | tf_at = tf.reshape(tf.constant(at, dtype=tf.float32), [-1, 1]) 38 | log_recipr = tf.math.reciprocal(_log2(rank_above+2)) 39 | 40 | user_ndcg = tf.reduce_sum(log_recipr * tf.cast(rank_above < tf_at, tf.float32), 41 | axis=1) 42 | 43 | return user_ndcg 44 | 45 | ndcg = tf.map_fn(_map_fn, (pos_mask, pred, excl_mask), parallel_iterations=10, dtype=tf.float32) 46 | 47 | return ndcg 48 | 49 | 50 | def Recall(pos_mask, pred, excl_mask, at=[100]): 51 | 52 | 53 | def _map_fn(tups): 54 | 55 | user_pos_mask, user_pred, user_excl_mask = tups 56 | user_pred = tf.math.exp(user_pred) * tf.cast(tf.math.logical_not(user_excl_mask), tf.float32) 57 | pos_pred = user_pred[user_pos_mask] 58 | rank_above = tf.math.count_nonzero(user_pred > tf.reshape(pos_pred, (-1, 1)), axis=1, dtype=tf.float32) 59 | rank_above = tf.tile(tf.expand_dims(rank_above, 0), [len(at), 1]) 60 | tf_at = tf.reshape(tf.constant(at, dtype=tf.float32), [-1, 1]) 61 | 62 | user_recall = tf.math.count_nonzero(rank_above < tf_at, axis=1, dtype=tf.float32) / \ 63 | tf.cast(tf.size(pos_pred), tf.float32) 64 | 65 | return user_recall 66 | 67 | recall = tf.map_fn(_map_fn, (pos_mask, pred, excl_mask), parallel_iterations=10, dtype=tf.float32) 68 | 69 | return recall -------------------------------------------------------------------------------- /noddlrm/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from noddlrm.modules.latent_factor import LatentFactor 2 | from noddlrm.modules.pairwise_log_loss import PairwiseLogLoss 3 | from noddlrm.modules.pointwise_mse_loss import PointwiseMSELoss 4 | from 
noddlrm.modules.multi_layer_perceptron import MLP 5 | from noddlrm.modules.second_order_feature_interaction import SecondOrderFeatureInteraction -------------------------------------------------------------------------------- /noddlrm/modules/latent_factor.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Embedding 2 | import tensorflow as tf 3 | 4 | class LatentFactor(Embedding): 5 | 6 | def __init__(self, num_instances, dim, zero_init=False, name=None): 7 | 8 | if zero_init: 9 | initializer = 'zeros' 10 | else: 11 | initializer = 'uniform' 12 | super(LatentFactor, self).__init__(input_dim=num_instances, 13 | output_dim=dim, 14 | embeddings_initializer=initializer, 15 | name=name) 16 | 17 | def censor(self, censor_id): 18 | 19 | unique_censor_id, _ = tf.unique(censor_id) 20 | embedding_gather = tf.gather(self.variables[0], indices=unique_censor_id) 21 | norm = tf.norm(embedding_gather, axis=1, keepdims=True) 22 | return self.variables[0].scatter_nd_update(indices=tf.expand_dims(unique_censor_id, 1), 23 | updates=embedding_gather / tf.math.maximum(norm, 0.1)) -------------------------------------------------------------------------------- /noddlrm/modules/multi_layer_perceptron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import Sequential 3 | from tensorflow.keras.layers import Dense 4 | 5 | def MLP(units_list, use_bias=True, activation='relu', out_activation=None): 6 | 7 | mlp = Sequential() 8 | 9 | for units in units_list[:-1]: 10 | mlp.add(Dense(units, 11 | activation=activation, 12 | use_bias=use_bias)) 13 | 14 | mlp.add(Dense(units_list[-1], 15 | activation=out_activation, 16 | use_bias=use_bias)) 17 | 18 | return mlp -------------------------------------------------------------------------------- /noddlrm/modules/pairwise_log_loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Layer 3 | 4 | class PairwiseLogLoss(Layer): 5 | 6 | def __call__(self, user_vec, p_item_vec, n_item_vec, p_item_bias=None, n_item_bias=None): 7 | 8 | outputs = super(PairwiseLogLoss, self).__call__((user_vec, 9 | p_item_vec, 10 | n_item_vec, 11 | p_item_bias, 12 | n_item_bias)) 13 | return outputs 14 | 15 | def call(self, inputs): 16 | 17 | user_vec, p_item_vec, n_item_vec, p_item_bias, n_item_bias = inputs 18 | 19 | dot_user_pos = tf.math.reduce_sum(user_vec*p_item_vec, 20 | axis=1, 21 | keepdims=True) 22 | dot_user_neg = tf.math.reduce_sum(user_vec*n_item_vec, 23 | axis=1, 24 | keepdims=True) 25 | 26 | if p_item_bias is not None: 27 | dot_user_pos += p_item_bias 28 | 29 | if n_item_bias is not None: 30 | dot_user_neg += n_item_bias 31 | 32 | loss = -tf.math.reduce_mean(tf.math.log_sigmoid(tf.math.maximum(dot_user_pos-dot_user_neg, -30.0))) 33 | 34 | return loss -------------------------------------------------------------------------------- /noddlrm/modules/pointwise_mse_loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Layer 3 | 4 | class PointwiseMSELoss(Layer): 5 | 6 | def __init__(self, a=1.0, b=1.0, sigmoid=False): 7 | 8 | super(PointwiseMSELoss, self).__init__() 9 | self._a = a 10 | self._b = b 11 | self._sigmoid = sigmoid 12 | 13 | def __call__(self, user_vec, item_vec, item_bias, label): 14 | 15 | outputs = 
super(PointwiseMSELoss, self).__call__((user_vec, item_vec, item_bias, label)) 16 | return outputs 17 | 18 | def call(self, inputs): 19 | 20 | user_vec, item_vec, item_bias, label = inputs 21 | 22 | dot_user_item = tf.math.reduce_sum(tf.math.multiply(user_vec, item_vec), 23 | axis=1, keepdims=False, name="dot_user_item") 24 | 25 | if self._sigmoid: 26 | prediction = tf.math.sigmoid(dot_user_item + tf.reshape(item_bias, [-1])) 27 | else: 28 | prediction = dot_user_item + tf.reshape(item_bias, [-1]) 29 | 30 | label_weight = (self._a - self._b) * label + self._b 31 | return tf.math.reduce_sum(label_weight * tf.square(label - prediction)) 32 | -------------------------------------------------------------------------------- /noddlrm/modules/second_order_feature_interaction.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Layer 2 | import tensorflow as tf 3 | 4 | class SecondOrderFeatureInteraction(Layer): 5 | 6 | def __init__(self, self_interaction=False): 7 | 8 | self._self_interaction = self_interaction 9 | 10 | super(SecondOrderFeatureInteraction, self).__init__() 11 | 12 | def call(self, inputs): 13 | 14 | ''' 15 | inputs: list of features with shape [batch_size, feature_dim] 16 | ''' 17 | 18 | batch_size = tf.shape(inputs[0])[0] 19 | 20 | concat_features = tf.stack(inputs, axis=1) 21 | dot_products = tf.linalg.LinearOperatorLowerTriangular(tf.matmul(concat_features, concat_features, transpose_b=True)).to_dense() 22 | 23 | ones = tf.ones_like(dot_products) 24 | mask = tf.linalg.band_part(ones, 0, -1) 25 | 26 | if not self._self_interaction: 27 | mask = mask - tf.linalg.band_part(ones, 0, 0) 28 | out_dim = int(len(inputs) * (len(inputs)-1) / 2) 29 | else: 30 | out_dim = int(len(inputs) * (len(inputs)+1) / 2) 31 | 32 | flat_interactions = tf.reshape(tf.boolean_mask(dot_products, mask), (batch_size, out_dim)) 33 | 34 | return flat_interactions 35 | -------------------------------------------------------------------------------- /noddlrm/recommenders/__init__.py: -------------------------------------------------------------------------------- 1 | from noddlrm.recommenders.dlrm import DLRM -------------------------------------------------------------------------------- /noddlrm/recommenders/dlrm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | from tensorflow.keras import Model 4 | from noddlrm.modules import LatentFactor, SecondOrderFeatureInteraction, MLP 5 | 6 | class DLRM(Model): 7 | 8 | def __init__( 9 | self, 10 | m_spa, 11 | ln_emb, 12 | ln_bot, 13 | ln_top, 14 | arch_interaction_op='dot', 15 | arch_interaction_itself=False, 16 | sigmoid_bot=False, 17 | sigmoid_top=True, 18 | loss_func='mse', 19 | loss_threshold=0.0): 20 | 21 | ''' 22 | m_spa: the dimensionality of sparse feature embeddings 23 | ln_emb: the size of sparse feature embeddings (num_instances) 24 | ln_bot: the size of the bottom MLP 25 | ln_top: the size of the top MLP 26 | ''' 27 | 28 | super(DLRM, self).__init__() 29 | 30 | self._loss_threshold = loss_threshold 31 | self._loss_func = loss_func 32 | self._latent_factors = [LatentFactor(num_instances=num, 33 | dim=m_spa) for num in ln_emb] 34 | self._mlp_bot = MLP(units_list=ln_bot, 35 | out_activation='sigmoid' if sigmoid_bot else 'relu') 36 | self._mlp_top = MLP(units_list=ln_top, 37 | out_activation='sigmoid' if sigmoid_top else 'relu') 38 | 39 | self._dot_interaction = None 40 | if arch_interaction_op == 'dot': 41 | 
self._dot_interaction = SecondOrderFeatureInteraction(
42 |                 self_interaction=arch_interaction_itself
43 |             )
44 | 
45 |         elif arch_interaction_op != 'cat':
46 |             sys.exit(
47 |                 "ERROR: arch_interaction_op="
48 |                 + arch_interaction_op
49 |                 + " is not supported"
50 |             )
51 | 
52 |         if loss_func == 'mse':
53 |             self._loss = tf.keras.losses.MeanSquaredError()
54 |         elif loss_func == 'bce':
55 |             self._loss = tf.keras.losses.BinaryCrossentropy()
56 |         else:
57 |             sys.exit(
58 |                 "ERROR: loss_func="
59 |                 + loss_func
60 |                 + " is not supported"
61 |             )
62 | 
63 |     def get_myloss(self, dense_features, sparse_features, label):
64 | 
65 |         '''
66 |         dense_features shape: [batch_size, num of dense features]
67 |         sparse_features shape: [batch_size, num_of_sparse_features]
68 |         label shape: [batch_size]
69 |         '''
70 | 
71 |         prediction = self.inference(dense_features, sparse_features)
72 |         loss = self._loss(y_true=label,
73 |                           y_pred=prediction)
74 |         return loss
75 | 
76 |     def call(self, inputs, training=None, mask=None):
77 |         dense_features, sparse_features = inputs
78 |         return self.inference(dense_features, sparse_features)
79 | 
80 |     def inference(self, dense_features, sparse_features):
81 | 
82 |         '''
83 |         dense_features shape: [batch_size, num of dense features]
84 |         sparse_features shape: [batch_size, num_of_sparse_features]
85 |         '''
86 |         self._set_inputs([dense_features, sparse_features])
87 |         sparse_emb_vecs = list(map(lambda pair: pair[1](pair[0]),
88 |                                    zip(tf.unstack(sparse_features, axis=1),
89 |                                        self._latent_factors)))
90 | 
91 |         dense_emb_vec = self._mlp_bot(dense_features)
92 | 
93 |         if self._dot_interaction is not None:
94 |             prediction = self._mlp_top(tf.concat([dense_emb_vec,
95 |                                                   self._dot_interaction(sparse_emb_vecs + [dense_emb_vec])],
96 |                                                  axis=1))
97 |         else:
98 |             prediction = self._mlp_top(tf.concat(sparse_emb_vecs + [dense_emb_vec],
99 |                                                  axis=1))
100 | 
101 |         if 0.0 < self._loss_threshold and self._loss_threshold < 1.0:
102 |             prediction = tf.clip_by_value(prediction, self._loss_threshold, 1.0 - self._loss_threshold)
103 | 
104 |         return tf.reshape(prediction, [-1])
105 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | 
3 | setup(
4 |     name='noddlrm',
5 |     version='0.0.1',
6 |     packages=find_packages(exclude=("tutorials",)),
7 |     description="NOD DLRM - Adapted from OpenRec(https://openrec.ai/)",
8 |     url="https://nod.ai/",
9 |     license='Apache 2.0',
10 |     author='Chi Liu',
11 |     author_email='chi@nod-labs.com',
12 |     install_requires=[
13 |         'tqdm>=4.15.0',
14 |         'numpy>=1.13.0',
15 |         'termcolor>=1.1.0'
16 |     ],
17 |     classifiers=['Development Status :: 3 - Alpha',
18 |                  'License :: OSI Approved :: Apache Software License',
19 |                  'Programming Language :: Python :: 3.7',
20 |                  'Topic :: Scientific/Engineering :: Artificial Intelligence'],
21 | )
22 | 
--------------------------------------------------------------------------------
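
If you just want to confirm that the installed `noddlrm` package imports and that the `DLRM` model defined in `noddlrm/recommenders/dlrm.py` builds and computes a loss, the minimal sketch below feeds it random data. The vocabulary sizes, layer sizes, and batch size are made-up illustrative values (not the Criteo configuration), and it assumes the same tensorflow-gpu==2.2 / Python 3.7 environment as the rest of the repo.

```python
# Smoke test for the installed noddlrm package, using synthetic data.
# All sizes below are illustrative placeholders, not the Criteo settings.
import tensorflow as tf

from noddlrm.recommenders import DLRM

counts = [100, 50, 25]   # hypothetical vocabulary sizes for three sparse features
num_dense = 4            # hypothetical number of dense features
batch_size = 8

# Note: the last entry of ln_bot must equal m_spa, because the bottom-MLP output
# is stacked with the sparse embeddings for the dot-product interaction.
model = DLRM(m_spa=4, ln_emb=counts, ln_bot=[8, 4], ln_top=[16, 1])

dense = tf.random.uniform((batch_size, num_dense))                    # [batch, num_dense]
sparse = tf.stack([tf.random.uniform((batch_size,), maxval=c, dtype=tf.int32)
                   for c in counts], axis=1)                          # [batch, num_sparse]
label = tf.cast(tf.random.uniform((batch_size,)) > 0.5, tf.float32)   # [batch]

pred = model.inference(dense, sparse)           # forward pass -> [batch] scores
loss = model.get_myloss(dense, sparse, label)   # MSE loss by default
print(pred.shape, float(loss))
```

These are the same `inference` and `get_myloss` calls that `dlrm_criteo_gpu.py` and `dlrm_criteo_tpu.py` wrap inside their `tf.function` train and eval steps.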