├── .idea ├── .gitignore ├── vcs.xml ├── misc.xml ├── inspectionProfiles │ ├── profiles_settings.xml │ └── Project_Default.xml ├── modules.xml └── dlrm_data_parallel.iml ├── tricks ├── __pycache__ │ ├── md_embedding_bag.cpython-36.pyc │ ├── md_embedding_bag.cpython-38.pyc │ ├── qr_embedding_bag.cpython-36.pyc │ └── qr_embedding_bag.cpython-38.pyc ├── md_embedding_bag.py └── qr_embedding_bag.py ├── README.md ├── cache_manager.py ├── data_loader_terabyte.py ├── model_no_ddp.py ├── main_no_ddp.py ├── dlrm_data_pytorch.py └── data_utils.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /tricks/__pycache__/md_embedding_bag.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkp411/cDLRM/HEAD/tricks/__pycache__/md_embedding_bag.cpython-36.pyc -------------------------------------------------------------------------------- /tricks/__pycache__/md_embedding_bag.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkp411/cDLRM/HEAD/tricks/__pycache__/md_embedding_bag.cpython-38.pyc -------------------------------------------------------------------------------- /tricks/__pycache__/qr_embedding_bag.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkp411/cDLRM/HEAD/tricks/__pycache__/qr_embedding_bag.cpython-36.pyc -------------------------------------------------------------------------------- /tricks/__pycache__/qr_embedding_bag.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkp411/cDLRM/HEAD/tricks/__pycache__/qr_embedding_bag.cpython-38.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/dlrm_data_parallel.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cDLRM 2 | 3 | Enabling pure data parallel training of DLRM via caching and prefetching 4 | 5 | Example launch command: 6 | 7 | python3 main_no_ddp.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="512-512-256-1" --max-ind-range=-1 --data-generation=dataset --data-set=terabyte 
--raw-data-file=../../../../datasets/terabyte/day --processed-data-file=../../../../datasets/terabyte/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.8 --lr-embeds=0.8 --mini-batch-size=8192 --print-freq=8192 --print-time --test-mini-batch-size=4096 --test-num-workers=16 --test-freq=16384 --memory-map --data-sub-sample-rate=0.875 --cache-workers=4 --lookahead=3000 --cache-size=150000 --num-ways=16 --table-agg-freq=100 --batch-fifo-size=8 --large-batch --world-size=8 --master-port=12345 8 | 9 | ** Code for the paper cDLRM: Look Ahead Caching for Scalable Training of Recommendation Models accepted at RecSys 21: https://dl.acm.org/doi/pdf/10.1145/3460231.3474246 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 33 | -------------------------------------------------------------------------------- /tricks/md_embedding_bag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Mixed-Dimensions Trick 7 | # 8 | # Description: Applies mixed dimension trick to embeddings to reduce 9 | # embedding sizes. 10 | # 11 | # References: 12 | # [1] Antonio Ginart, Maxim Naumov, Dheevatsa Mudigere, Jiyan Yang, James Zou, 13 | # "Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation 14 | # Systems", CoRR, arXiv:1909.11810, 2019 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | import torch 17 | import torch.nn as nn 18 | 19 | 20 | def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None): 21 | ''' 22 | An external facing function call for mixed-dimension assignment 23 | with the alpha power temperature heuristic 24 | Inputs: 25 | n -- (torch.LongTensor) ; Vector of num of rows for each embedding matrix 26 | alpha -- (torch.FloatTensor); Scalar, non-negative, controls dim. 
skew 27 | d0 -- (torch.FloatTensor); Scalar, baseline embedding dimension 28 | B -- (torch.FloatTensor); Scalar, parameter budget for embedding layer 29 | round_dim -- (bool); flag for rounding dims to nearest pow of 2 30 | k -- (torch.LongTensor) ; Vector of average number of queries per inference 31 | ''' 32 | n, indices = torch.sort(n) 33 | k = k[indices] if k is not None else torch.ones(len(n)) 34 | d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B) 35 | if round_dim: 36 | d = pow_2_round(d) 37 | return d 38 | 39 | 40 | def alpha_power_rule(n, alpha, d0=None, B=None): 41 | if d0 is not None: 42 | lamb = d0 * (n[0].type(torch.float) ** alpha) 43 | elif B is not None: 44 | lamb = B / torch.sum(n.type(torch.float) ** (1 - alpha)) 45 | else: 46 | raise ValueError("Must specify either d0 or B") 47 | d = torch.ones(len(n)) * lamb * (n.type(torch.float) ** (-alpha)) 48 | for i in range(len(d)): 49 | if i == 0 and d0 is not None: 50 | d[i] = d0 51 | else: 52 | d[i] = 1 if d[i] < 1 else d[i] 53 | return (torch.round(d).type(torch.long)) 54 | 55 | 56 | def pow_2_round(dims): 57 | return 2 ** torch.round(torch.log2(dims.type(torch.float))) 58 | 59 | 60 | class PrEmbeddingBag(nn.Module): 61 | def __init__(self, num_embeddings, embedding_dim, base_dim): 62 | super(PrEmbeddingBag, self).__init__() 63 | self.embs = nn.EmbeddingBag( 64 | num_embeddings, embedding_dim, mode="sum", sparse=True) 65 | torch.nn.init.xavier_uniform_(self.embs.weight) 66 | if embedding_dim < base_dim: 67 | self.proj = nn.Linear(embedding_dim, base_dim, bias=False) 68 | torch.nn.init.xavier_uniform_(self.proj.weight) 69 | elif embedding_dim == base_dim: 70 | self.proj = nn.Identity() 71 | else: 72 | raise ValueError( 73 | "Embedding dim " + str(embedding_dim) + " > base dim " + str(base_dim) 74 | ) 75 | 76 | def forward(self, input, offsets=None, per_sample_weights=None): 77 | return self.proj(self.embs( 78 | input, offsets=offsets, per_sample_weights=per_sample_weights)) 79 | -------------------------------------------------------------------------------- /cache_manager.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | 4 | import torch 5 | import torch.multiprocessing as mp 6 | 7 | 8 | class Prefetcher(mp.Process): 9 | def __init__(self, args, emb_tables_cpu, batch_fifo, eviction_fifo, finish_event, cache_ld): 10 | mp.Process.__init__(self) 11 | 12 | # Shared variables 13 | self.args = args 14 | self.emb_tables_cpu = emb_tables_cpu 15 | self.batch_fifo = batch_fifo 16 | self.eviction_fifo = eviction_fifo 17 | self.finish_event = finish_event 18 | self.cache_ld = cache_ld 19 | 20 | @staticmethod 21 | def pin_pool(p, core): 22 | this_pid = os.getpid() 23 | os.system("taskset -p -c %d %d" % (core + 3 + p, this_pid)) 24 | 25 | return 1 26 | 27 | @staticmethod 28 | def process_batch_slice(slice, emb_tables_cpu): 29 | lists_of_unique_indices = [] 30 | unique_indices_maps = [] 31 | for i in range(len(emb_tables_cpu.emb_l)): 32 | unique_indices_tensor = torch.unique(slice[i]) # .long() 33 | unique_indices_tensor.share_memory_() 34 | lists_of_unique_indices.append(unique_indices_tensor) 35 | 36 | idxs = torch.arange(unique_indices_tensor.shape[0]) 37 | max = torch.max(unique_indices_tensor) 38 | map = -1 * torch.ones(max + 1, 1, dtype=torch.long) 39 | map[unique_indices_tensor] = idxs.view(-1, 1) 40 | map.share_memory_() 41 | 42 | unique_indices_maps.append(map) 43 | 44 | cached_entries_per_table = 
emb_tables_cpu.fetch_unique_idx_slices(lists_of_unique_indices) 45 | 46 | return cached_entries_per_table, lists_of_unique_indices, unique_indices_maps 47 | 48 | @staticmethod 49 | def eviction_manager(emb_tables, eviction_fifo, average_on_writeback, core, timeout): 50 | this_pid = os.getpid() 51 | print('Pinning eviction process...') 52 | os.system("taskset -p -c %d %d" % (core, this_pid)) 53 | print('Done pinning eviction process') 54 | 55 | try: 56 | while (True): 57 | eviction_data = eviction_fifo.get(timeout=timeout) if timeout > 0 else eviction_fifo.get() 58 | for k, table_eviction_data in enumerate(eviction_data): 59 | idxs = table_eviction_data[0] 60 | embeddings = table_eviction_data[1] 61 | emb_tables.emb_l[k].weight.data[idxs] = (emb_tables.emb_l[k].weight.data[ 62 | idxs] + embeddings) / 2 if average_on_writeback else embeddings 63 | except: 64 | print('Eviction queue empty longer than expected. Exiting eviction manager...') 65 | 66 | def run(self): 67 | this_pid = os.getpid() 68 | os.system("taskset -p -c %d %d" % (self.args.main_start_core + 1, this_pid)) 69 | 70 | eviction_process = mp.Process(target=Prefetcher.eviction_manager, 71 | args=(self.emb_tables_cpu, self.eviction_fifo, self.args.average_on_writeback, self.args.main_start_core + 2, 72 | self.args.eviction_fifo_timeout)) 73 | eviction_process.start() 74 | 75 | num_examples_per_process = self.args.lookahead * self.args.mini_batch_size 76 | 77 | pool = mp.Pool(processes=self.args.cache_workers) 78 | 79 | results = [pool.apply_async(Prefetcher.pin_pool, args=(p, self.args.main_start_core)) for p in range(self.args.cache_workers)] 80 | for res in results: 81 | res.get() 82 | print('Done pinning processes. Starting cache manager.') 83 | 84 | 85 | collection_limit = self.args.lookahead * self.args.cache_workers 86 | 87 | for epoch in range(self.args.nepochs): 88 | lS_i = [] 89 | collected = 0 90 | for j, (_, _, sparse_idxs, _) in enumerate(self.cache_ld): 91 | if (j > 0 and collected % collection_limit == 0) or j == len(self.cache_ld) - 1: 92 | if j == len(self.cache_ld) - 1: 93 | lS_i.append(sparse_idxs) 94 | 95 | lS_i = torch.cat(lS_i, dim=1) 96 | num_processes_needed = math.ceil(lS_i.shape[1] / num_examples_per_process) 97 | 98 | processed_slices = [pool.apply_async(Prefetcher.process_batch_slice, args=( 99 | lS_i[:, p * num_examples_per_process: (p + 1) * num_examples_per_process], self.emb_tables_cpu)) for p 100 | in range(num_processes_needed)] 101 | 102 | for res in processed_slices: 103 | a = res.get() 104 | self.batch_fifo.put((a[0], a[1], a[2])) 105 | 106 | lS_i = [sparse_idxs] 107 | collected = 1 108 | else: 109 | lS_i.append(sparse_idxs) 110 | collected += 1 111 | 112 | pool.close() 113 | pool.join() 114 | eviction_process.join() 115 | self.finish_event.wait() 116 | -------------------------------------------------------------------------------- /tricks/qr_embedding_bag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Quotient-Remainder Trick 7 | # 8 | # Description: Applies quotient remainder-trick to embeddings to reduce 9 | # embedding sizes. 
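#
# Example (minimal illustrative sketch; sizes are arbitrary): with
# num_categories=1000 and num_collisions=4, QREmbeddingBag keeps a
# ceil(1000/4)=250-row "quotient" table and a 4-row "remainder" table.
# An index i is looked up as i // 4 in the first and i % 4 in the second,
# and the two embeddings are combined with `operation` (default "mult"):
#
#   qr = QREmbeddingBag(num_categories=1000, embedding_dim=16,
#                       num_collisions=4, operation="mult",
#                       mode="sum", sparse=True)
#   idx = torch.tensor([[1, 5, 9]])   # one bag of three indices
#   out = qr(idx)                     # shape: (1, 16)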
10 | # 11 | # References: 12 | # [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang, 13 | # "Compositional Embeddings Using Complementary Partitions for Memory-Efficient 14 | # Recommendation Systems", CoRR, arXiv:1909.02107, 2019 15 | 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | import torch 19 | import torch.nn as nn 20 | import torch.nn.functional as F 21 | from torch.nn.parameter import Parameter 22 | import numpy as np 23 | 24 | 25 | class QREmbeddingBag(nn.Module): 26 | r"""Computes sums or means over two 'bags' of embeddings, one using the quotient 27 | of the indices and the other using the remainder of the indices, without 28 | instantiating the intermediate embeddings, then performs an operation to combine these. 29 | 30 | For bags of constant length and no :attr:`per_sample_weights`, this class 31 | 32 | * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``, 33 | * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``, 34 | * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``. 35 | 36 | However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these 37 | operations. 38 | 39 | QREmbeddingBag also supports per-sample weights as an argument to the forward 40 | pass. This scales the output of the Embedding before performing a weighted 41 | reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the 42 | only supported ``mode`` is ``"sum"``, which computes a weighted sum according to 43 | :attr:`per_sample_weights`. 44 | 45 | Known Issues: 46 | Autograd breaks with multiple GPUs. It breaks only with multiple embeddings. 47 | 48 | Args: 49 | num_categories (int): total number of unique categories. The input indices must be in 50 | 0, 1, ..., num_categories - 1. 51 | embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"`` 52 | or ``"mult"`` operation are used, these embedding dimensions must be 53 | the same. If a single embedding_dim is used, then it will use this 54 | embedding_dim for both embedding tables. 55 | num_collisions (int): number of collisions to enforce. 56 | operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation 57 | to compose embeddings. ``"concat"`` concatenates the embeddings, 58 | ``"add"`` sums the embeddings, and ``"mult"`` multiplies 59 | (component-wise) the embeddings. 60 | Default: ``"mult"`` 61 | max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm` 62 | is renormalized to have norm :attr:`max_norm`. 63 | norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``. 64 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of 65 | the words in the mini-batch. Default ``False``. 66 | Note: this option is not supported when ``mode="max"``. 67 | mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. 68 | ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights` 69 | into consideration. ``"mean"`` computes the average of the values 70 | in the bag, ``"max"`` computes the max value over each bag. 71 | Default: ``"mean"`` 72 | sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. 
See 73 | Notes for more details regarding sparse gradients. Note: this option is not 74 | supported when ``mode="max"``. 75 | 76 | Attributes: 77 | weight (Tensor): the learnable weights of each embedding table is the module of shape 78 | `(num_embeddings, embedding_dim)` initialized using a uniform distribution 79 | with sqrt(1 / num_categories). 80 | 81 | Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and 82 | :attr:`per_index_weights` (Tensor, optional) 83 | 84 | - If :attr:`input` is 2D of shape `(B, N)`, 85 | 86 | it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and 87 | this will return ``B`` values aggregated in a way depending on the :attr:`mode`. 88 | :attr:`offsets` is ignored and required to be ``None`` in this case. 89 | 90 | - If :attr:`input` is 1D of shape `(N)`, 91 | 92 | it will be treated as a concatenation of multiple bags (sequences). 93 | :attr:`offsets` is required to be a 1D tensor containing the 94 | starting index positions of each bag in :attr:`input`. Therefore, 95 | for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as 96 | having ``B`` bags. Empty bags (i.e., having 0-length) will have 97 | returned vectors filled by zeros. 98 | 99 | per_sample_weights (Tensor, optional): a tensor of float / double weights, or None 100 | to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights` 101 | must have exactly the same shape as input and is treated as having the same 102 | :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``. 103 | 104 | 105 | Output shape: `(B, embedding_dim)` 106 | 107 | """ 108 | __constants__ = ['num_categories', 'embedding_dim', 'num_collisions', 109 | 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq', 110 | 'mode', 'sparse'] 111 | 112 | def __init__(self, num_categories, embedding_dim, num_collisions, 113 | operation='mult', max_norm=None, norm_type=2., 114 | scale_grad_by_freq=False, mode='mean', sparse=False, 115 | _weight=None): 116 | super(QREmbeddingBag, self).__init__() 117 | 118 | assert operation in ['concat', 'mult', 'add'], 'Not valid operation!' 119 | 120 | self.num_categories = num_categories 121 | if isinstance(embedding_dim, int) or len(embedding_dim) == 1: 122 | self.embedding_dim = [embedding_dim, embedding_dim] 123 | else: 124 | self.embedding_dim = embedding_dim 125 | self.num_collisions = num_collisions 126 | self.operation = operation 127 | self.max_norm = max_norm 128 | self.norm_type = norm_type 129 | self.scale_grad_by_freq = scale_grad_by_freq 130 | 131 | if self.operation == 'add' or self.operation == 'mult': 132 | assert self.embedding_dim[0] == self.embedding_dim[1], \ 133 | 'Embedding dimensions do not match!' 
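# Illustrative arithmetic for the table sizes computed just below: with
# num_categories = 10_000_000 and num_collisions = 4, the quotient table
# gets ceil(10_000_000 / 4) = 2_500_000 rows and the remainder table gets
# 4 rows, so the number of embedding rows shrinks by roughly a factor of
# num_collisions relative to a single full-size table.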
134 | 135 | self.num_embeddings = [int(np.ceil(num_categories / num_collisions)), 136 | num_collisions] 137 | 138 | if _weight is None: 139 | self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0])) 140 | self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1])) 141 | self.reset_parameters() 142 | else: 143 | assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \ 144 | 'Shape of weight for quotient table does not match num_embeddings and embedding_dim' 145 | assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \ 146 | 'Shape of weight for remainder table does not match num_embeddings and embedding_dim' 147 | self.weight_q = Parameter(_weight[0]) 148 | self.weight_r = Parameter(_weight[1]) 149 | self.mode = mode 150 | self.sparse = sparse 151 | 152 | def reset_parameters(self): 153 | nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories)) 154 | nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories)) 155 | 156 | def forward(self, input, offsets=None, per_sample_weights=None): 157 | input_q = (input / self.num_collisions).long() 158 | input_r = torch.remainder(input, self.num_collisions).long() 159 | 160 | embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm, 161 | self.norm_type, self.scale_grad_by_freq, self.mode, 162 | self.sparse, per_sample_weights) 163 | embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm, 164 | self.norm_type, self.scale_grad_by_freq, self.mode, 165 | self.sparse, per_sample_weights) 166 | 167 | if self.operation == 'concat': 168 | embed = torch.cat((embed_q, embed_r), dim=1) 169 | elif self.operation == 'add': 170 | embed = embed_q + embed_r 171 | elif self.operation == 'mult': 172 | embed = embed_q * embed_r 173 | 174 | return embed 175 | 176 | def extra_repr(self): 177 | s = '{num_embeddings}, {embedding_dim}' 178 | if self.max_norm is not None: 179 | s += ', max_norm={max_norm}' 180 | if self.norm_type != 2: 181 | s += ', norm_type={norm_type}' 182 | if self.scale_grad_by_freq is not False: 183 | s += ', scale_grad_by_freq={scale_grad_by_freq}' 184 | s += ', mode={mode}' 185 | return s.format(**self.__dict__) 186 | -------------------------------------------------------------------------------- /data_loader_terabyte.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
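# Module overview: this file provides a streaming DataLoader over the per-day
# "*_reordered.npz" Criteo Terabyte shards, a CriteoBinDataset that reads
# fixed-size batches out of a single packed binary file, and the
# numpy_to_binary() / _preprocess() helpers that build that binary file from
# the .npz shards.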
5 | 6 | 7 | from __future__ import absolute_import, division, print_function, unicode_literals 8 | 9 | import os 10 | import numpy as np 11 | from torch.utils.data import Dataset 12 | import torch 13 | import time 14 | import math 15 | from tqdm import tqdm 16 | import argparse 17 | 18 | 19 | class DataLoader: 20 | """ 21 | DataLoader dedicated for the Criteo Terabyte Click Logs dataset 22 | """ 23 | 24 | def __init__( 25 | self, 26 | data_filename, 27 | data_directory, 28 | days, 29 | batch_size, 30 | max_ind_range=-1, 31 | split="train", 32 | drop_last_batch=False 33 | ): 34 | self.data_filename = data_filename 35 | self.data_directory = data_directory 36 | self.days = days 37 | self.batch_size = batch_size 38 | self.max_ind_range = max_ind_range 39 | 40 | total_file = os.path.join( 41 | data_directory, 42 | data_filename + "_day_count.npz" 43 | ) 44 | with np.load(total_file) as data: 45 | total_per_file = data["total_per_file"][np.array(days)] 46 | 47 | self.length = sum(total_per_file) 48 | if split == "test" or split == "val": 49 | self.length = int(np.ceil(self.length / 2.)) 50 | self.split = split 51 | self.drop_last_batch = drop_last_batch 52 | 53 | def __iter__(self): 54 | return iter( 55 | _batch_generator( 56 | self.data_filename, self.data_directory, self.days, 57 | self.batch_size, self.split, self.drop_last_batch, self.max_ind_range 58 | ) 59 | ) 60 | 61 | def __len__(self): 62 | if self.drop_last_batch: 63 | return self.length // self.batch_size 64 | else: 65 | return math.ceil(self.length / self.batch_size) 66 | 67 | 68 | def _transform_features( 69 | x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False 70 | ): 71 | if max_ind_range > 0: 72 | x_cat_batch = x_cat_batch % max_ind_range 73 | 74 | if flag_input_torch_tensor: 75 | x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1) 76 | x_cat_batch = x_cat_batch.clone().detach().type(torch.long) 77 | y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1) 78 | else: 79 | x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1) 80 | x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long) 81 | y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1) 82 | 83 | batch_size = x_cat_batch.shape[0] 84 | feature_count = x_cat_batch.shape[1] 85 | lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1) 86 | 87 | return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1) 88 | 89 | 90 | def _batch_generator( 91 | data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range 92 | ): 93 | previous_file = None 94 | for day in days: 95 | filepath = os.path.join( 96 | data_directory, 97 | data_filename + "_{}_reordered.npz".format(day) 98 | ) 99 | 100 | # print('Loading file: ', filepath) 101 | with np.load(filepath) as data: 102 | x_int = data["X_int"] 103 | x_cat = data["X_cat"] 104 | y = data["y"] 105 | 106 | samples_in_file = y.shape[0] 107 | batch_start_idx = 0 108 | if split == "test" or split == "val": 109 | length = int(np.ceil(samples_in_file / 2.)) 110 | if split == "test": 111 | samples_in_file = length 112 | elif split == "val": 113 | batch_start_idx = samples_in_file - length 114 | 115 | while batch_start_idx < samples_in_file - batch_size: 116 | 117 | missing_samples = batch_size 118 | if previous_file is not None: 119 | missing_samples -= previous_file['y'].shape[0] 120 | 121 | current_slice = slice(batch_start_idx, batch_start_idx + missing_samples) 122 | 123 | x_int_batch = x_int[current_slice] 124 | 
x_cat_batch = x_cat[current_slice] 125 | y_batch = y[current_slice] 126 | 127 | if previous_file is not None: 128 | x_int_batch = np.concatenate( 129 | [previous_file['x_int'], x_int_batch], 130 | axis=0 131 | ) 132 | x_cat_batch = np.concatenate( 133 | [previous_file['x_cat'], x_cat_batch], 134 | axis=0 135 | ) 136 | y_batch = np.concatenate([previous_file['y'], y_batch], axis=0) 137 | previous_file = None 138 | 139 | if x_int_batch.shape[0] != batch_size: 140 | raise ValueError('should not happen') 141 | 142 | yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range) 143 | 144 | batch_start_idx += missing_samples 145 | if batch_start_idx != samples_in_file: 146 | current_slice = slice(batch_start_idx, samples_in_file) 147 | if previous_file is not None: 148 | previous_file = { 149 | 'x_int' : np.concatenate( 150 | [previous_file['x_int'], x_int[current_slice]], 151 | axis=0 152 | ), 153 | 'x_cat' : np.concatenate( 154 | [previous_file['x_cat'], x_cat[current_slice]], 155 | axis=0 156 | ), 157 | 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0) 158 | } 159 | else: 160 | previous_file = { 161 | 'x_int' : x_int[current_slice], 162 | 'x_cat' : x_cat[current_slice], 163 | 'y' : y[current_slice] 164 | } 165 | 166 | if not drop_last: 167 | yield _transform_features( 168 | previous_file['x_int'], 169 | previous_file['x_cat'], 170 | previous_file['y'], 171 | max_ind_range 172 | ) 173 | 174 | 175 | def _test(): 176 | generator = _batch_generator( 177 | data_filename='day', 178 | data_directory='/input', 179 | days=range(23), 180 | split="train", 181 | batch_size=2048 182 | ) 183 | t1 = time.time() 184 | for x_int, lS_o, x_cat, y in generator: 185 | t2 = time.time() 186 | time_diff = t2 - t1 187 | t1 = t2 188 | print( 189 | "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format( 190 | time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape 191 | ) 192 | ) 193 | 194 | 195 | class CriteoBinDataset(Dataset): 196 | """Binary version of criteo dataset.""" 197 | 198 | def __init__(self, data_file, counts_file, 199 | batch_size=1, max_ind_range=-1, bytes_per_feature=4): 200 | # dataset 201 | self.tar_fea = 1 # single target 202 | self.den_fea = 13 # 13 dense features 203 | self.spa_fea = 26 # 26 sparse features 204 | self.tad_fea = self.tar_fea + self.den_fea 205 | self.tot_fea = self.tad_fea + self.spa_fea 206 | 207 | self.batch_size = batch_size 208 | self.max_ind_range = max_ind_range 209 | self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size) 210 | 211 | self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry) 212 | 213 | print('data file:', data_file, 'number of batches:', self.num_entries) 214 | self.file = open(data_file, 'rb') 215 | 216 | with np.load(counts_file) as data: 217 | self.counts = data["counts"] 218 | 219 | # hardcoded for now 220 | self.m_den = 13 221 | 222 | def __len__(self): 223 | return self.num_entries 224 | 225 | def __getitem__(self, idx): 226 | self.file.seek(idx * self.bytes_per_entry, 0) 227 | raw_data = self.file.read(self.bytes_per_entry) 228 | array = np.frombuffer(raw_data, dtype=np.int32) 229 | tensor = torch.from_numpy(array).view((-1, self.tot_fea)) 230 | 231 | return _transform_features(x_int_batch=tensor[:, 1:14], 232 | x_cat_batch=tensor[:, 14:], 233 | y_batch=tensor[:, 0], 234 | max_ind_range=self.max_ind_range, 235 | flag_input_torch_tensor=True) 236 | 237 | 238 | def numpy_to_binary(input_files, output_file_path, split='train'): 239 | """Convert the data to a 
binary format to be read with CriteoBinDataset.""" 240 | 241 | # WARNING - both categorical and numerical data must fit into int32 for 242 | # the following code to work correctly 243 | 244 | with open(output_file_path, 'wb') as output_file: 245 | if split == 'train': 246 | for input_file in input_files: 247 | print('Processing file: ', input_file) 248 | 249 | np_data = np.load(input_file) 250 | np_data = np.concatenate([np_data['y'].reshape(-1, 1), 251 | np_data['X_int'], 252 | np_data['X_cat']], axis=1) 253 | np_data = np_data.astype(np.int32) 254 | 255 | output_file.write(np_data.tobytes()) 256 | else: 257 | assert len(input_files) == 1 258 | np_data = np.load(input_files[0]) 259 | np_data = np.concatenate([np_data['y'].reshape(-1, 1), 260 | np_data['X_int'], 261 | np_data['X_cat']], axis=1) 262 | np_data = np_data.astype(np.int32) 263 | 264 | samples_in_file = np_data.shape[0] 265 | midpoint = int(np.ceil(samples_in_file / 2.)) 266 | if split == "test": 267 | begin = 0 268 | end = midpoint 269 | elif split == "val": 270 | begin = midpoint 271 | end = samples_in_file 272 | else: 273 | raise ValueError('Unknown split value: ', split) 274 | 275 | output_file.write(np_data[begin:end].tobytes()) 276 | 277 | 278 | def _preprocess(args): 279 | train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for 280 | day in range(0, 23)] 281 | 282 | test_valid_file = args.input_data_prefix + '_23_reordered.npz' 283 | 284 | os.makedirs(args.output_directory, exist_ok=True) 285 | for split in ['train', 'val', 'test']: 286 | print('Running preprocessing for split =', split) 287 | 288 | output_file = os.path.join(args.output_directory, 289 | '{}_data.bin'.format(split)) 290 | 291 | input_files = train_files if split == 'train' else [test_valid_file] 292 | numpy_to_binary(input_files=input_files, 293 | output_file_path=output_file, 294 | split=split) 295 | 296 | 297 | def _test_bin(): 298 | parser = argparse.ArgumentParser() 299 | parser.add_argument('--output_directory', required=True) 300 | parser.add_argument('--input_data_prefix', required=True) 301 | parser.add_argument('--split', choices=['train', 'test', 'val'], 302 | required=True) 303 | args = parser.parse_args() 304 | 305 | # _preprocess(args) 306 | 307 | binary_data_file = os.path.join(args.output_directory, 308 | '{}_data.bin'.format(args.split)) 309 | 310 | counts_file = os.path.join(args.output_directory, 'day_fea_count.npz') 311 | dataset_binary = CriteoBinDataset(data_file=binary_data_file, 312 | counts_file=counts_file, 313 | batch_size=2048,) 314 | from dlrm_data_pytorch import CriteoDataset, collate_wrapper_criteo 315 | 316 | binary_loader = torch.utils.data.DataLoader( 317 | dataset_binary, 318 | batch_size=None, 319 | shuffle=False, 320 | num_workers=0, 321 | collate_fn=None, 322 | pin_memory=False, 323 | drop_last=False, 324 | ) 325 | 326 | original_dataset = CriteoDataset( 327 | dataset='terabyte', 328 | max_ind_range=10 * 1000 * 1000, 329 | sub_sample_rate=1, 330 | randomize=True, 331 | split=args.split, 332 | raw_path=args.input_data_prefix, 333 | pro_data='dummy_string', 334 | memory_map=True 335 | ) 336 | 337 | original_loader = torch.utils.data.DataLoader( 338 | original_dataset, 339 | batch_size=2048, 340 | shuffle=False, 341 | num_workers=0, 342 | collate_fn=collate_wrapper_criteo, 343 | pin_memory=False, 344 | drop_last=False, 345 | ) 346 | 347 | assert len(dataset_binary) == len(original_loader) 348 | for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader, 349 | binary_loader)), 350 | 
total=len(dataset_binary)): 351 | 352 | for j in range(len(new_batch)): 353 | if not np.array_equal(old_batch[j], new_batch[j]): 354 | raise ValueError('FAILED: Datasets not equal') 355 | if i > len(dataset_binary): 356 | break 357 | print('PASSED') 358 | 359 | 360 | if __name__ == '__main__': 361 | _test() 362 | _test_bin 363 | -------------------------------------------------------------------------------- /model_no_ddp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import warnings 3 | 4 | import numpy as np 5 | 6 | with warnings.catch_warnings(): 7 | warnings.filterwarnings("ignore", category=DeprecationWarning) 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn.parallel.parallel_apply import parallel_apply 12 | from torch.nn.parallel.replicate import replicate 13 | from torch.nn.parallel.scatter_gather import gather, scatter 14 | 15 | # quotient-remainder trick 16 | from tricks.qr_embedding_bag import QREmbeddingBag 17 | # mixed-dimension trick 18 | from tricks.md_embedding_bag import PrEmbeddingBag 19 | 20 | 21 | class Embedding_Table_Group(nn.Module): 22 | def __init__(self, 23 | m_spa=None, 24 | ln_emb=None, 25 | qr_flag=False, 26 | qr_operation="mult", 27 | qr_collisions=0, 28 | qr_threshold=200, 29 | md_flag=False, 30 | md_threshold=200): 31 | 32 | super(Embedding_Table_Group, self).__init__() 33 | 34 | if (m_spa is not None) and (ln_emb is not None): 35 | self.qr_flag = qr_flag 36 | if self.qr_flag: 37 | self.qr_collisions = qr_collisions 38 | self.qr_operation = qr_operation 39 | self.qr_threshold = qr_threshold 40 | # create variables for MD embedding if applicable 41 | self.md_flag = md_flag 42 | if self.md_flag: 43 | self.md_threshold = md_threshold 44 | 45 | # create embedding tables 46 | self.emb_l = self.create_emb(m_spa, ln_emb) 47 | 48 | def create_emb(self, m, ln): 49 | emb_l = nn.ModuleList() 50 | for i in range(0, ln.size): 51 | n = ln[i] 52 | # construct embedding operator 53 | if self.qr_flag and n > self.qr_threshold: 54 | EE = QREmbeddingBag(n, m, self.qr_collisions, 55 | operation=self.qr_operation, mode="sum", sparse=True) 56 | elif self.md_flag and n > self.md_threshold: 57 | _m = m[i] 58 | base = max(m) 59 | EE = PrEmbeddingBag(n, _m, base) 60 | # use np initialization as below for consistency... 
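# (note) the uniform(-sqrt(1/n), +sqrt(1/n)) draw below overwrites the
# Xavier initialization done inside PrEmbeddingBag, so the mixed-dimension
# branch starts from the same distribution as the plain nn.EmbeddingBag
# branch in the else-case further down.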
61 | W = np.random.uniform( 62 | low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) 63 | ).astype(np.float32) 64 | EE.embs.weight.data = torch.tensor(W, requires_grad=False) 65 | 66 | else: 67 | EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) 68 | 69 | # initialize embeddings 70 | W = np.random.uniform( 71 | low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) 72 | ).astype(np.float32) 73 | EE.weight.data = torch.tensor(W, requires_grad=False) 74 | EE.weight.requires_grad = False 75 | 76 | emb_l.append(EE) 77 | 78 | return emb_l 79 | 80 | def fetch_unique_idx_slices(self, lists_of_unique_indices): 81 | cached_entries_per_table = [] 82 | for k, unique_indices in enumerate(lists_of_unique_indices): 83 | E = self.emb_l[k] 84 | cached_entries = E.weight.data[unique_indices] 85 | cached_entries_per_table.append(cached_entries) 86 | 87 | return cached_entries_per_table 88 | 89 | def forward(self, lS_o, lS_i): 90 | ly = [] 91 | for k, sparse_index_group_batch in enumerate(lS_i): 92 | sparse_offset_group_batch = lS_o[k] 93 | E = self.emb_l[k] 94 | 95 | V = E(sparse_index_group_batch, sparse_offset_group_batch) 96 | ly.append(V) 97 | 98 | return ly 99 | 100 | 101 | class Embedding_Table_Cache_Group(nn.Module): 102 | def __init__(self, 103 | m_spa, 104 | ln_emb, 105 | max_cache_size, 106 | aux_table_size, 107 | num_ways): 108 | 109 | super(Embedding_Table_Cache_Group, self).__init__() 110 | self.ln_emb = ln_emb 111 | self.num_ways = num_ways 112 | 113 | self.max_cache_size = self.find_next_prime(max_cache_size) 114 | 115 | self.emb_l, self.cache_sizes = self.create_emb(m_spa, ln_emb, self.max_cache_size, num_ways, 116 | aux_table_size) # emb_l[i] is a set of num_ways tables, each corresponding to 1 way. The set would just be the row itself. 117 | 118 | self.occupancy_tables = self.create_occupancy_tables(self.cache_sizes, num_ways) 119 | 120 | self.victim_cache_entries = [None] * len(self.emb_l) 121 | 122 | def find_next_prime(self, max_cache_size): 123 | for i in range(max_cache_size, 2 * max_cache_size): 124 | if isPrime(i): 125 | return i 126 | 127 | def compute_set_indices(self, table_idx, lookup_idxs): 128 | return torch.remainder(lookup_idxs, self.cache_sizes[table_idx]) 129 | 130 | def create_emb(self, m, ln, max_cache_size, num_ways, aux_table_size): 131 | emb_l = nn.ModuleList() 132 | cache_sizes = [] 133 | 134 | for i in range(0, ln.size): 135 | n = ln[i] 136 | num_rows = n if n < max_cache_size else max_cache_size 137 | cache_sizes.append(num_rows) 138 | EE = nn.EmbeddingBag(num_ways * num_rows + aux_table_size, m, mode="sum", sparse=True) 139 | 140 | emb_l.append(EE) 141 | 142 | return emb_l, cache_sizes 143 | 144 | def create_occupancy_tables(self, cache_sizes, num_ways): 145 | occupancy_tables = [-1 * torch.ones(cache_sizes[i], num_ways, dtype=torch.int64) for i in 146 | range(len(cache_sizes))] 147 | return occupancy_tables 148 | 149 | def forward(self, lS_o, lS_i, emb_tables, rank): 150 | # WARNING: notice that we are processing the batch at once. We implicitly 151 | # assume that the data is laid out such that: 152 | # 1. each embedding is indexed with a group of sparse indices, 153 | # corresponding to a single lookup 154 | # 2. for each embedding the lookups are further organized into a batch 155 | # 3. 
for a list of embedding tables there is a list of batched lookups 156 | 157 | if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)): 158 | sys.exit("ERROR: corrupted model input detected in parallel_forward call") 159 | 160 | ly = [] 161 | per_table_hit_rates = [] 162 | cache_group_idxs = [] 163 | for k, sparse_index_group_batch in enumerate(lS_i): 164 | occupancy_table = self.occupancy_tables[k] 165 | 166 | set_idxs = self.compute_set_indices(k, 167 | sparse_index_group_batch) # of shape torch.Size([2048]). set_idx[i] is the set_idx that sparse_index_group_batch[i] maps to. 168 | hit_tensor = (occupancy_table[set_idxs] == sparse_index_group_batch.view(-1, 1)).any(dim=1) 169 | hit_positions = hit_tensor.nonzero(as_tuple=False).flatten() 170 | miss_positions = (hit_tensor == False).nonzero(as_tuple=False).flatten() 171 | 172 | hitting_set_idxs = set_idxs[hit_positions] 173 | hitting_ways = (occupancy_table[hitting_set_idxs] == sparse_index_group_batch[hit_positions].view(-1, 1)).nonzero(as_tuple=True)[1] 174 | hitting_cache_lookup_idxs = self.cache_sizes[k] * hitting_ways + hitting_set_idxs 175 | 176 | missing_sparse_idxs = sparse_index_group_batch[miss_positions] # Need to fetch from embedding table 177 | aux_storage_idxs = torch.tensor([self.cache_sizes[k] * self.num_ways + i for i in range(missing_sparse_idxs.shape[0])], dtype=torch.long, 178 | device=rank) 179 | self.emb_l[k].weight.data[aux_storage_idxs] = emb_tables.emb_l[k].weight.data[missing_sparse_idxs].to(rank) 180 | 181 | cache_lookup_idxs = torch.empty(sparse_index_group_batch.shape, dtype=torch.long) 182 | cache_lookup_idxs[hit_positions] = hitting_cache_lookup_idxs 183 | 184 | cache_lookup_idxs = cache_lookup_idxs.to(rank) 185 | cache_lookup_idxs[miss_positions] = aux_storage_idxs 186 | 187 | self.victim_cache_entries[k] = (aux_storage_idxs, missing_sparse_idxs) 188 | 189 | # print(k, hit_positions.shape) 190 | 191 | sparse_offset_group_batch = lS_o[k].to(rank) 192 | 193 | # embedding lookup 194 | # We are using EmbeddingBag, which implicitly uses sum operator. 
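# Cache layout reminder: cache_lookup_idxs address one physical EmbeddingBag
# per sparse feature -- a hit for set s in way w maps to row
# w * cache_size + s, while misses were staged just above into the auxiliary
# region that starts at row num_ways * cache_size.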
195 | # The embeddings are represented as tall matrices, with sum 196 | # happening vertically across 0 axis, resulting in a row vector 197 | 198 | # import pdb; pdb.set_trace() 199 | 200 | E = self.emb_l[k] 201 | 202 | V = E(cache_lookup_idxs, sparse_offset_group_batch) # 2048 x 64 tensor 203 | ly.append(V) 204 | cache_group_idxs.append(cache_lookup_idxs.int()) 205 | 206 | # hit_rate = hit_positions.shape[0] / sparse_index_group_batch.shape[0] 207 | # per_table_hit_rates.append(hit_rate) 208 | 209 | if len(self.emb_l) != len(ly): 210 | sys.exit("ERROR: corrupted intermediate result in parallel_forward call") 211 | 212 | return ly, cache_group_idxs # , sum(per_table_hit_rates) / lS_i.shape[0] 213 | 214 | 215 | class DLRM_Net(nn.Module): 216 | def __init__( 217 | self, 218 | ln_bot=None, 219 | ln_top=None, 220 | arch_interaction_op=None, 221 | arch_interaction_itself=False, 222 | sync_dense_params=True, 223 | sigmoid_bot=-1, 224 | sigmoid_top=-1, 225 | loss_threshold=0.0, 226 | ): 227 | super(DLRM_Net, self).__init__() 228 | 229 | if (ln_bot is not None) and (ln_top is not None) and (arch_interaction_op is not None): 230 | # save arguments 231 | self.output_d = 0 232 | self.parallel_model_batch_size = -1 233 | self.parallel_model_is_not_prepared = True 234 | self.arch_interaction_op = arch_interaction_op 235 | self.arch_interaction_itself = arch_interaction_itself 236 | self.sync_dense_params = sync_dense_params 237 | self.loss_threshold = loss_threshold 238 | self.cpu = torch.device('cpu') 239 | 240 | # Trainable parameters 241 | self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) 242 | self.top_l = self.create_mlp(ln_top, sigmoid_top) 243 | 244 | def create_mlp(self, ln, sigmoid_layer): 245 | # build MLP layer by layer 246 | layers = nn.ModuleList() 247 | for i in range(0, ln.size - 1): 248 | n = ln[i] 249 | m = ln[i + 1] 250 | 251 | # construct fully connected operator 252 | LL = nn.Linear(int(n), int(m), bias=True) 253 | 254 | # custom Xavier input, output or two-sided fill 255 | mean = 0.0 # std_dev = np.sqrt(variance) 256 | std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) 257 | W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) 258 | std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) 259 | bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) 260 | LL.weight.data = torch.tensor(W, requires_grad=True) 261 | LL.bias.data = torch.tensor(bt, requires_grad=True) 262 | layers.append(LL) 263 | 264 | # construct sigmoid or relu operator 265 | if i == sigmoid_layer: 266 | layers.append(nn.Sigmoid()) 267 | else: 268 | layers.append(nn.ReLU()) 269 | 270 | return torch.nn.Sequential(*layers) 271 | 272 | def interact_features(self, x, ly): 273 | if self.arch_interaction_op == "dot": 274 | # concatenate dense and sparse features 275 | (batch_size, d) = x.shape 276 | T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) 277 | # perform a dot product 278 | Z = torch.bmm(T, torch.transpose(T, 1, 2)) 279 | # append dense feature with the interactions (into a row vector) 280 | # approach 1: all 281 | # Zflat = Z.view((batch_size, -1)) 282 | # approach 2: unique 283 | _, ni, nj = Z.shape 284 | # approach 1: tril_indices 285 | # offset = 0 if self.arch_interaction_itself else -1 286 | # li, lj = torch.tril_indices(ni, nj, offset=offset) 287 | # approach 2: custom 288 | offset = 1 if self.arch_interaction_itself else 0 289 | li = torch.tensor([i for i in range(ni) for j in range(i + offset)], dtype=torch.long) 290 | lj = torch.tensor([j for i in range(nj) for j in 
range(i + offset)], dtype=torch.long) 291 | Zflat = Z[:, li, lj] 292 | # concatenate dense features and interactions 293 | R = torch.cat([x] + [Zflat], dim=1) 294 | elif self.arch_interaction_op == "cat": 295 | # concatenation features (into a row vector) 296 | R = torch.cat([x] + ly, dim=1) 297 | else: 298 | sys.exit( 299 | "ERROR: --arch-interaction-op=" 300 | + self.arch_interaction_op 301 | + " is not supported" 302 | ) 303 | 304 | return R 305 | 306 | def forward(self, dense_x, ly): 307 | x = self.bot_l(dense_x) 308 | z = self.interact_features(x, ly) 309 | p = self.top_l(z) 310 | 311 | if 0.0 < self.loss_threshold < 1.0: 312 | z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)) 313 | else: 314 | z = p 315 | 316 | return z 317 | 318 | 319 | def isPrime(n): 320 | if n == 1 or n == 2: 321 | return False 322 | 323 | i = 3 324 | 325 | while i * i < n: 326 | if n % i == 0: 327 | return False 328 | 329 | i += 1 330 | 331 | return True 332 | -------------------------------------------------------------------------------- /main_no_ddp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import builtins 3 | import math 4 | import os 5 | import sys 6 | import time 7 | import warnings 8 | from setproctitle import setproctitle 9 | 10 | import numpy as np 11 | import psutil 12 | 13 | import dlrm_data_pytorch as dp 14 | from cache_manager import Prefetcher 15 | 16 | with warnings.catch_warnings(): 17 | warnings.filterwarnings("ignore", category=DeprecationWarning) 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.multiprocessing as mp 22 | import torch.distributed as dist 23 | 24 | # quotient-remainder trick 25 | # mixed-dimension trick 26 | from tricks.md_embedding_bag import md_solver 27 | from model_no_ddp import Embedding_Table_Group, Embedding_Table_Cache_Group, DLRM_Net 28 | 29 | from timeit import default_timer as timer 30 | 31 | exc = getattr(builtins, "IOError", "FileNotFoundError") 32 | 33 | 34 | def ProcessArgs(): 35 | parser = argparse.ArgumentParser(description="Train Deep Learning Recommendation Model (DLRM)") 36 | 37 | ################################### Model Parameters ################################## 38 | parser.add_argument("--arch-sparse-feature-size", type=int, default=2) 39 | parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") 40 | parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") 41 | parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") 42 | parser.add_argument("--arch-interaction-op", type=str, default="dot") 43 | parser.add_argument("--arch-interaction-itself", action="store_true", default=False) 44 | ####################################################################################### 45 | 46 | ################################### Activation and loss ############################### 47 | parser.add_argument("--activation-function", type=str, default="relu") 48 | parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce 49 | parser.add_argument("--loss-weights", type=str, default="1.0-1.0") # for wbce 50 | parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 51 | parser.add_argument("--round-targets", type=bool, default=False) 52 | ####################################################################################### 53 | 54 | ######################################## Data ######################################### 55 | parser.add_argument("--data-size", type=int, default=1) 56 | 
parser.add_argument("--num-batches", type=int, default=0) 57 | parser.add_argument("--data-generation", type=str, default="random") 58 | parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") 59 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte 60 | parser.add_argument("--raw-data-file", type=str, default="") 61 | parser.add_argument("--processed-data-file", type=str, default="") 62 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none 63 | parser.add_argument("--data-trace-enable-padding", type=bool, default=False) 64 | parser.add_argument("--max-ind-range", type=int, default=-1) 65 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] 66 | parser.add_argument("--num-indices-per-lookup", type=int, default=10) 67 | parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) 68 | parser.add_argument("--num-workers", type=int, default=0) 69 | parser.add_argument("--memory-map", action="store_true", default=False) 70 | ####################################################################################### 71 | 72 | ################################# Embedding Table Args ################################ 73 | parser.add_argument("--md-flag", action="store_true", default=False) 74 | parser.add_argument("--md-threshold", type=int, default=200) 75 | parser.add_argument("--md-temperature", type=float, default=0.3) 76 | parser.add_argument("--md-round-dims", action="store_true", default=False) 77 | parser.add_argument("--qr-flag", action="store_true", default=False) 78 | parser.add_argument("--qr-threshold", type=int, default=200) 79 | parser.add_argument("--qr-operation", type=str, default="mult") 80 | parser.add_argument("--qr-collisions", type=int, default=4) 81 | ####################################################################################### 82 | 83 | ##################################### Training ######################################## 84 | parser.add_argument("--mini-batch-size", type=int, default=1) 85 | parser.add_argument("--nepochs", type=int, default=1) 86 | parser.add_argument("--learning-rate", type=float, default=0.1) 87 | parser.add_argument("--lr-embeds", type=float, default=0.3) 88 | parser.add_argument("--print-precision", type=int, default=5) 89 | parser.add_argument("--numpy-rand-seed", type=int, default=123) 90 | parser.add_argument("--sync-dense-params", type=bool, default=True) 91 | parser.add_argument("--lookahead", type=int, default=2) # Added 92 | parser.add_argument("--cache-workers", type=int, default=2) # Added 93 | parser.add_argument("--cache-size", type=int, default=10240) 94 | parser.add_argument("--num-ways", type=int, default=4) # Added 95 | parser.add_argument("--average-on-writeback", action="store_true", default=False) # Added 96 | parser.add_argument("--evict-victim-cache", action="store_true", default=False) # Added 97 | ####################################################################################### 98 | 99 | ############################### Debugging and profiling ################################ 100 | parser.add_argument("--print-freq", type=int, default=1) 101 | parser.add_argument("--test-freq", type=int, default=-1) 102 | parser.add_argument("--test-mini-batch-size", type=int, default=-1) 103 | parser.add_argument("--test-num-workers", type=int, default=-1) 104 | parser.add_argument("--print-time", action="store_true", default=False) 105 | parser.add_argument("--debug-mode", action="store_true", 
default=False) 106 | parser.add_argument("--enable-profiling", action="store_true", default=False) 107 | parser.add_argument("--plot-compute-graph", action="store_true", default=False) 108 | ######################################################################################## 109 | 110 | ################################## Store/load model #################################### 111 | parser.add_argument("--save-model", type=str, default="") 112 | parser.add_argument("--load-model", type=str, default="") 113 | ######################################################################################## 114 | 115 | ################################## MLPerf Args ######################################### 116 | parser.add_argument("--mlperf-logging", action="store_true", default=False) 117 | # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 118 | parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) 119 | # stop at target AUC Terabyte (no subsampling) 0.8025 120 | parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) 121 | parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) 122 | parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) 123 | parser.add_argument("--large-batch", action="store_true", default=False) 124 | ######################################################################################## 125 | 126 | ################################## Distributed training ################################ 127 | parser.add_argument("--world-size", type=int, default=2) 128 | parser.add_argument("--master-port", type=int, default=12345) 129 | parser.add_argument("--trainer-start-core", type=int, default=7) 130 | parser.add_argument("--main-start-core", type=int, default=0) 131 | parser.add_argument("--dense-threshold", type=int, default=1000) 132 | parser.add_argument("--table-agg-op", type=str, default="mean") 133 | parser.add_argument("--table-agg-freq", type=int, default=1) 134 | parser.add_argument("--batch-fifo-size", type=int, default=8) 135 | parser.add_argument("--eviction-fifo-size", type=int, default=8) 136 | parser.add_argument("--eviction-fifo-timeout", type=int, default=300) 137 | ######################################################################################## 138 | 139 | ######################################## Misc ########################################## 140 | parser.add_argument("--inference-only", action="store_true", default=False) 141 | parser.add_argument("--save-onnx", action="store_true", default=False) 142 | parser.add_argument("--use-gpu", action="store_true", default=False) 143 | ######################################################################################## 144 | 145 | return parser.parse_args() 146 | 147 | 148 | def CacheEmbeddings(cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps, cache_group, eviction_fifo, rank): 149 | cpu = torch.device("cpu") 150 | eviction_data = [] 151 | for k, table_cache in enumerate(cached_entries_per_table): 152 | unique_idxs = lists_of_unique_idxs[k] # One dimensional tensor of unique ids (original ids) 153 | map = unique_indices_maps[k] 154 | 155 | set_idxs = cache_group.compute_set_indices(k, 156 | unique_idxs) # One dimensional tensor of set indices (new ids = row in the cached embedding tables) 157 | occupancy_table = cache_group.occupancy_tables[k] 158 | 159 | # Filter out the hitting indices 160 | hit_tensor = (occupancy_table[set_idxs] == unique_idxs.view(-1, 1)).any(dim=1) 161 | hit_positions = 
hit_tensor.nonzero(as_tuple=False).flatten() 162 | miss_positions = (hit_tensor == False).nonzero(as_tuple=False).flatten() 163 | 164 | hitting_set_idxs = set_idxs[hit_positions] 165 | hitting_ways = (occupancy_table[set_idxs] == unique_idxs.view(-1, 1)).nonzero(as_tuple=True)[1] 166 | 167 | necessary_unique_idxs = unique_idxs[miss_positions] # This is after cache hit evaluation 168 | necessary_set_idxs = set_idxs[miss_positions] # This is after cache hit evaluation 169 | 170 | # Compute availability tensor 171 | avail_tensor_sampler = torch.ones(occupancy_table.shape, dtype=torch.bool) 172 | avail_tensor_sampler[hitting_set_idxs, hitting_ways] = False 173 | occupied_sets = (avail_tensor_sampler.any(dim=1) == 0).nonzero(as_tuple=False).flatten() 174 | 175 | # Filter out unique indices that map to sets whose ways are all occupied 176 | to_be_used_indices = ((necessary_set_idxs.view(-1, 1) == occupied_sets).any(dim=1) == 0).nonzero( 177 | as_tuple=False).flatten() 178 | 179 | necessary_unique_idxs = necessary_unique_idxs[to_be_used_indices] 180 | necessary_set_idxs = necessary_set_idxs[to_be_used_indices] 181 | 182 | # Convert to float and sample way assignments 183 | avail_tensor_sampler = avail_tensor_sampler[necessary_set_idxs].float() 184 | dist = torch.distributions.Categorical(avail_tensor_sampler) 185 | ways_assignments = dist.sample() 186 | 187 | ############################################### EVICTION CODE #################################################### 188 | 189 | # Find unique indices being evicted and fetch their embeddings for writeback 190 | evicting_positions = ((occupancy_table[necessary_set_idxs, ways_assignments] == -1) == False).nonzero( 191 | as_tuple=False).flatten() 192 | evicting_set_idxs = necessary_set_idxs[evicting_positions] 193 | evicting_ways = ways_assignments[evicting_positions] 194 | evicting_table_idxs = cache_group.cache_sizes[k] * evicting_ways + evicting_set_idxs 195 | 196 | evicting_unique_idxs = occupancy_table[evicting_set_idxs, evicting_ways] 197 | evicting_embeddings = cache_group.emb_l[k].weight.data[evicting_table_idxs].to(cpu) 198 | 199 | eviction_data.append((evicting_unique_idxs, evicting_embeddings)) 200 | ################################################################################################################### 201 | 202 | # Finally cache current window embeddings and update occupancy table 203 | table_idxs = cache_group.cache_sizes[k] * ways_assignments + necessary_set_idxs 204 | occupancy_table[necessary_set_idxs, ways_assignments] = necessary_unique_idxs 205 | cached_table_idxs = map[necessary_unique_idxs].flatten() 206 | cache_group.emb_l[k].weight.data[table_idxs] = table_cache[cached_table_idxs].to(rank) 207 | 208 | if rank == 0: 209 | eviction_fifo.put(eviction_data) 210 | 211 | 212 | def loss_fn_wrap(Z, T, loss_fn, args, loss_ws=None): 213 | if args.loss_function == "mse" or args.loss_function == "bce": 214 | return loss_fn(Z, T) 215 | 216 | elif args.loss_function == "wbce": 217 | loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T) 218 | loss_fn_ = loss_fn(Z, T) 219 | 220 | loss_sc_ = loss_ws_ * loss_fn_ 221 | return loss_sc_.mean() 222 | 223 | 224 | def time_wrap(rank): 225 | torch.cuda.synchronize(rank) 226 | return time.time() 227 | 228 | 229 | def wait_wrap(req_objs): 230 | for obj in req_objs: 231 | obj.wait() 232 | 233 | 234 | def aggregate_gradients(dlrm): 235 | # Aggregate MLPs 236 | request_objs_mlp = [] 237 | for layer in dlrm.bot_l: 238 | if isinstance(layer, nn.modules.linear.Linear): 239 | layer.weight.grad 
/= dist.get_world_size() 240 | request_objs_mlp.append(dist.all_reduce_multigpu([layer.weight.grad], async_op=True)) 241 | 242 | for layer in dlrm.top_l: 243 | if isinstance(layer, nn.modules.linear.Linear): 244 | layer.weight.grad /= dist.get_world_size() 245 | request_objs_mlp.append(dist.all_reduce_multigpu([layer.weight.grad], async_op=True)) 246 | 247 | return request_objs_mlp 248 | 249 | 250 | @torch.no_grad() 251 | def broadcast_and_aggregate(cache_group, cache_group_idxs, rank, reduce_op="mean"): 252 | recieve_tensors = [] 253 | dist_request_objs = [] 254 | for i in range(dist.get_world_size()): 255 | if i == rank: 256 | recieve_tensors.append(cache_group_idxs) 257 | dist_request_objs.append(dist.broadcast(cache_group_idxs, src=i, async_op=True)) 258 | else: 259 | tmp = torch.empty_like(cache_group_idxs, device=rank) 260 | dist_request_objs.append(dist.broadcast(tmp, src=i, async_op=True)) 261 | recieve_tensors.append(tmp) 262 | 263 | # Wait for broadcasts to finish 264 | wait_wrap(dist_request_objs) 265 | unique_idxs_list = [] 266 | weight_slice_list = [] 267 | dist_request_objs = [] 268 | cache_lookups = torch.cat(recieve_tensors, dim=1) 269 | for i, table in enumerate(cache_group.emb_l): 270 | unique_idxs = torch.unique(cache_lookups[i], sorted=True).long() 271 | 272 | if reduce_op == "sum": 273 | weight_slice = table.weight[unique_idxs] 274 | op = dist.ReduceOp.SUM 275 | 276 | elif reduce_op == "mean": 277 | weight_slice = table.weight[unique_idxs] / dist.get_world_size() 278 | op = dist.ReduceOp.SUM 279 | 280 | elif reduce_op == "max": 281 | weight_slice = table.weight[unique_idxs] 282 | op = dist.ReduceOp.MAX 283 | 284 | dist_request_objs.append(dist.all_reduce_multigpu([weight_slice], op=op, async_op=True)) 285 | unique_idxs_list.append(unique_idxs) 286 | weight_slice_list.append(weight_slice) 287 | 288 | for i, table in enumerate(cache_group.emb_l): 289 | unique_idxs = unique_idxs_list[i] 290 | weight_slice = weight_slice_list[i] 291 | dist_request_objs[i].wait() 292 | table.weight[unique_idxs] = weight_slice 293 | 294 | 295 | def share_occupancy_tables(cache_group, occupancy_tables_fifos, rank): 296 | if rank == 0: 297 | for table in cache_group.occupancy_tables: 298 | table.share_memory_() 299 | 300 | for fifo in occupancy_tables_fifos: 301 | fifo.put(cache_group.occupancy_tables) 302 | 303 | else: 304 | fifo = occupancy_tables_fifos[rank - 1] 305 | shared_occupancy_tables = fifo.get() 306 | cache_group.occupancy_tables = shared_occupancy_tables 307 | 308 | 309 | @torch.no_grad() 310 | def load_caches_and_broadcast(cache_group, batch_fifo, eviction_fifo, rank): 311 | dist_req_objs = [] 312 | if rank == 0: 313 | # Pull out of batch queue and cache in cache on GPU 0 314 | cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps = batch_fifo.get() 315 | CacheEmbeddings(cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps, cache_group, eviction_fifo, rank) 316 | 317 | # Broadcast to all other GPUs 318 | for embedding_table_cache in cache_group.emb_l: 319 | dist_req_objs.append(dist.broadcast_multigpu([embedding_table_cache.weight], src=0, async_op=True)) 320 | 321 | return dist_req_objs 322 | 323 | 324 | def Run(rank, m_spa, ln_emb, ln_bot, ln_top, train_ld, test_ld, batch_fifo, eviction_fifo, occupancy_tables_fifos, emb_tables, args): 325 | # Set proc title 326 | setproctitle("DlrmTrainer:" + str(rank)) 327 | 328 | # First pin processes to avoid context switching overhead 329 | avail_cores = psutil.cpu_count() - args.trainer_start_core 330 | stride 
= rank if rank < avail_cores else rank % avail_cores 331 | new_core = args.trainer_start_core + stride 332 | this_pid = os.getpid() 333 | os.system("taskset -p -c %d %d" % (new_core, this_pid)) 334 | 335 | np.random.seed(args.numpy_rand_seed) 336 | torch.cuda.manual_seed(args.numpy_rand_seed) 337 | torch.manual_seed(args.numpy_rand_seed) 338 | np.set_printoptions(precision=args.print_precision) 339 | torch.set_printoptions(precision=args.print_precision) 340 | 341 | os.environ['MASTER_ADDR'] = 'localhost' 342 | os.environ['MASTER_PORT'] = str(args.master_port) 343 | dist.init_process_group("nccl", rank=rank, world_size=args.world_size) 344 | local_batch_size = math.ceil(args.mini_batch_size / args.world_size) 345 | 346 | cache_group = Embedding_Table_Cache_Group(m_spa, ln_emb, 347 | max_cache_size=args.cache_size, 348 | aux_table_size=args.mini_batch_size, 349 | num_ways=args.num_ways).to(rank) 350 | 351 | dlrm = DLRM_Net( 352 | ln_bot, 353 | ln_top, 354 | arch_interaction_op=args.arch_interaction_op, 355 | arch_interaction_itself=args.arch_interaction_itself, 356 | sync_dense_params=args.sync_dense_params, 357 | sigmoid_bot=-1, 358 | sigmoid_top=ln_top.size - 2, 359 | loss_threshold=args.loss_threshold, 360 | ).to(rank) 361 | 362 | share_occupancy_tables(cache_group, occupancy_tables_fifos, rank) 363 | 364 | if args.loss_function == "mse": 365 | loss_fn = torch.nn.MSELoss(reduction="mean") 366 | loss_ws = None 367 | elif args.loss_function == "bce": 368 | loss_fn = torch.nn.BCELoss(reduction="mean") 369 | loss_ws = None 370 | elif args.loss_function == "wbce": 371 | loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-")) 372 | loss_fn = torch.nn.BCELoss(reduction="none") 373 | 374 | # Creating optimizer 375 | optimizer_mlps = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) 376 | optimizer_embeds = torch.optim.SGD(cache_group.parameters(), lr=args.lr_embeds) 377 | 378 | total_time = 0 379 | total_iter = 0 380 | total_loss = 0 381 | total_accu = 0 382 | total_samp = 0 383 | 384 | caching_overhead = [] 385 | cache_group_idxs_window = [] 386 | for epoch in range(args.nepochs): 387 | for j, (X, lS_o, lS_i, T) in enumerate(train_ld): 388 | X = X[rank * local_batch_size: (rank + 1) * local_batch_size, :].to(rank) 389 | lS_i = lS_i[:, rank * local_batch_size: (rank + 1) * local_batch_size] 390 | lS_o = lS_o[:, :local_batch_size] 391 | T = T[rank * local_batch_size: (rank + 1) * local_batch_size, :].to(rank) 392 | 393 | if j % args.lookahead == 0: 394 | # Pull from fifo and setup caches 395 | start = timer() 396 | dist_req_objs = load_caches_and_broadcast(cache_group, batch_fifo, eviction_fifo, rank) 397 | wait_wrap(dist_req_objs) 398 | end = timer() 399 | caching_overhead.append(end - start) 400 | 401 | t1 = time_wrap(rank) 402 | 403 | # Forward and Backward 404 | lookups, cache_group_idxs = cache_group(lS_o, lS_i, emb_tables, rank) 405 | Z = dlrm(X, lookups) 406 | E = loss_fn_wrap(Z, T, loss_fn, args, loss_ws) 407 | optimizer_mlps.zero_grad() 408 | optimizer_embeds.zero_grad() 409 | E.backward() 410 | 411 | # Gradient aggregation for MLPs + param update 412 | request_objs_mlps = aggregate_gradients(dlrm) 413 | optimizer_embeds.step() 414 | wait_wrap(request_objs_mlps) 415 | optimizer_mlps.step() 416 | 417 | # Aggregate parameters for cache group 418 | if j > 0 and j % args.table_agg_freq == 0: 419 | cache_group_idxs = torch.cat(cache_group_idxs_window + [torch.stack(cache_group_idxs)], dim=1) 420 | broadcast_and_aggregate(cache_group, cache_group_idxs, rank, 
args.table_agg_op) 421 | cache_group_idxs_window = [] 422 | else: 423 | cache_group_idxs_window.append(torch.stack(cache_group_idxs)) 424 | 425 | t2 = time_wrap(rank) 426 | 427 | L = E.detach().cpu().numpy() # numpy array 428 | S = Z.detach().cpu().numpy() # numpy array 429 | T = T.detach().cpu().numpy() # numpy array 430 | mbs = T.shape[0] # = args.mini_batch_size except maybe for last 431 | A = np.sum((np.round(S, 0) == T).astype(np.uint8)) 432 | 433 | if rank == 0: 434 | total_loss_world = 0 435 | total_acc_world = 0 436 | # Get losses and accuracies from other processes 437 | for process in range(1, dist.get_world_size()): 438 | # Get losses 439 | L_t = torch.tensor(L, device=rank) 440 | L_p = torch.zeros_like(L_t) 441 | dist.broadcast(L_p, src=process) 442 | L_p = L_p.detach().cpu().numpy() 443 | total_loss_world += L_p * mbs 444 | 445 | # Get accuracies 446 | A_t = torch.tensor([A.item()], device=rank) 447 | A_p = torch.zeros_like(A_t) 448 | dist.broadcast(A_p, src=process) 449 | A_p = A_p.detach().cpu().item() 450 | total_acc_world += A_p 451 | 452 | total_time += t2 - t1 453 | total_loss += ((L * mbs + total_loss_world) / dist.get_world_size()) 454 | total_accu += ((A + total_acc_world) / dist.get_world_size()) 455 | total_iter += 1 456 | total_samp += mbs 457 | 458 | if j > 0 and j % args.print_freq == 0: 459 | gT = 1000.0 * total_time / total_iter 460 | total_time = 0 461 | 462 | gA = total_accu / total_samp 463 | total_accu = 0 464 | 465 | gL = total_loss / total_samp 466 | total_loss = 0 467 | 468 | avg_caching_overhead = np.mean(caching_overhead) / args.lookahead 469 | caching_overhead = [] 470 | 471 | print('Epoch {}: Finished {}/{} in {} ms/it. Caching overhead = {}. Loss = {}, Train Acc = {}'.format(epoch, j, len(train_ld), gT, 472 | 1000 * avg_caching_overhead, 473 | gL, gA)) 474 | 475 | total_samp = 0 476 | total_iter = 0 477 | 478 | # region Testing - Only rank 0 tests 479 | if (j > 0 and j % args.test_freq == 0) or j == len(train_ld) - 1: 480 | print('Testing at {}/{}....'.format(j, len(train_ld))) 481 | test_samp = 0 482 | total_test_acc = 0 483 | with torch.no_grad(): 484 | for i, (X, lS_o, lS_i, T) in enumerate(test_ld): 485 | X = X.to(rank) 486 | lookups, _ = cache_group(lS_o, lS_i, emb_tables, rank) 487 | Z = dlrm(X, lookups) 488 | S = Z.cpu().numpy() 489 | T = T.cpu().numpy() 490 | test_acc = np.sum((np.round(S, 0) == T).astype(np.uint32)) 491 | test_samp += T.shape[0] 492 | total_test_acc += test_acc 493 | 494 | print('Test accuracy = {}%'.format(100 * (total_test_acc / test_samp))) 495 | # endregion 496 | 497 | else: 498 | for process in range(1, dist.get_world_size()): 499 | L_t = torch.tensor(L, device=rank) 500 | A_t = torch.tensor([A.item()], device=rank) 501 | dist.broadcast(L_t, src=process) 502 | dist.broadcast(A_t, src=process) 503 | 504 | 505 | if __name__ == '__main__': 506 | mp.set_start_method("spawn") # Cache manager deadlocks with fork as start method. This is paramount. 
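    # Editor's note (not in the original source): "spawn" is forced here because the main
    # process later creates Manager queues and launches both the Prefetcher (cache manager)
    # and the NCCL trainer processes; with the default "fork" start method the children
    # inherit locks/CUDA state from the parent and, per the comment above, the cache
    # manager deadlocks. A defensive variant of this call (hypothetical sketch, same
    # behavior) would be:
    #
    #     if mp.get_start_method(allow_none=True) != "spawn":
    #         mp.set_start_method("spawn", force=True)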
507 | args = ProcessArgs() 508 | 509 | np.random.seed(args.numpy_rand_seed) 510 | np.set_printoptions(precision=args.print_precision) 511 | torch.set_printoptions(precision=args.print_precision) 512 | torch.manual_seed(args.numpy_rand_seed) 513 | 514 | # region Sanity Checks 515 | if args.test_mini_batch_size < 0: 516 | # if the parameter is not set, use the training batch size 517 | args.test_mini_batch_size = args.mini_batch_size 518 | if args.test_num_workers < 0: 519 | # if the parameter is not set, use the same parameter for training 520 | args.test_num_workers = args.num_workers 521 | 522 | ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") 523 | if args.data_generation == "dataset": 524 | 525 | train_data, train_ld, test_data, test_ld, cache_ld = dp.make_criteo_data_and_loaders(args) 526 | 527 | nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) 528 | nbatches_test = len(test_ld) 529 | 530 | ln_emb = train_data.counts 531 | # enforce maximum limit on number of vectors per embedding 532 | if args.max_ind_range > 0: 533 | ln_emb = np.array(list(map( 534 | lambda x: x if x < args.max_ind_range else args.max_ind_range, 535 | ln_emb 536 | ))) 537 | m_den = train_data.m_den 538 | ln_bot[0] = m_den 539 | else: 540 | # input and target at random 541 | if args.cache_workers > psutil.cpu_count(): 542 | args.cache_workers = psutil.cpu_count() 543 | 544 | ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") 545 | m_den = ln_bot[0] 546 | train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den) 547 | nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) 548 | 549 | m_spa = args.arch_sparse_feature_size 550 | num_fea = ln_emb.size + 1 # num sparse + num dense features 551 | m_den_out = ln_bot[ln_bot.size - 1] 552 | if args.arch_interaction_op == "dot": 553 | # approach 1: all 554 | # num_int = num_fea * num_fea + m_den_out 555 | # approach 2: unique 556 | if args.arch_interaction_itself: 557 | num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out 558 | else: 559 | num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out 560 | elif args.arch_interaction_op == "cat": 561 | num_int = num_fea * m_den_out 562 | else: 563 | sys.exit( 564 | "ERROR: --arch-interaction-op=" 565 | + args.arch_interaction_op 566 | + " is not supported" 567 | ) 568 | arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top 569 | ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") 570 | 571 | # sanity check: feature sizes and mlp dimensions must match 572 | if m_den != ln_bot[0]: 573 | sys.exit( 574 | "ERROR: arch-dense-feature-size " 575 | + str(m_den) 576 | + " does not match first dim of bottom mlp " 577 | + str(ln_bot[0]) 578 | ) 579 | if args.qr_flag: 580 | if args.qr_operation == "concat" and 2 * m_spa != m_den_out: 581 | sys.exit( 582 | "ERROR: 2 arch-sparse-feature-size " 583 | + str(2 * m_spa) 584 | + " does not match last dim of bottom mlp " 585 | + str(m_den_out) 586 | + " (note that the last dim of bottom mlp must be 2x the embedding dim)" 587 | ) 588 | if args.qr_operation != "concat" and m_spa != m_den_out: 589 | sys.exit( 590 | "ERROR: arch-sparse-feature-size " 591 | + str(m_spa) 592 | + " does not match last dim of bottom mlp " 593 | + str(m_den_out) 594 | ) 595 | else: 596 | if m_spa != m_den_out: 597 | sys.exit( 598 | "ERROR: arch-sparse-feature-size " 599 | + str(m_spa) 600 | + " does not match last dim of bottom mlp " 601 | + str(m_den_out) 602 | ) 603 | if num_int != ln_top[0]: 604 | sys.exit( 605 | "ERROR: # 
of feature interactions " 606 | + str(num_int) 607 | + " does not match first dimension of top mlp " 608 | + str(ln_top[0]) 609 | ) 610 | 611 | # assign mixed dimensions if applicable 612 | if args.md_flag: 613 | m_spa = md_solver( 614 | torch.tensor(ln_emb), 615 | args.md_temperature, # alpha 616 | d0=m_spa, 617 | round_dim=args.md_round_dims 618 | ).tolist() 619 | # endregion 620 | 621 | emb_tables = Embedding_Table_Group(m_spa, ln_emb) 622 | emb_tables.share_memory() 623 | 624 | batch_fifo = mp.Manager().Queue(maxsize=args.batch_fifo_size) 625 | eviction_fifo = mp.Manager().Queue(maxsize=args.eviction_fifo_size) 626 | occupancy_tables_fifos = [mp.Manager().Queue(maxsize=1)] * (args.world_size - 1) 627 | finish_event = mp.Event() 628 | barrier = mp.Barrier(args.world_size) 629 | 630 | cm = Prefetcher(args, emb_tables, batch_fifo, eviction_fifo, finish_event, cache_ld) 631 | 632 | # Pin main process 633 | this_pid = os.getpid() 634 | os.system("taskset -p -c %d %d" % (args.main_start_core, this_pid)) 635 | args.trainer_start_core = args.main_start_core + args.cache_workers + 3 636 | 637 | cm.start() 638 | spawn_context = mp.spawn(Run, 639 | args=(m_spa, ln_emb, ln_bot, ln_top, train_ld, test_ld, 640 | batch_fifo, eviction_fifo, occupancy_tables_fifos, 641 | emb_tables, args), 642 | nprocs=args.world_size, 643 | join=True) 644 | 645 | finish_event.set() 646 | cm.join() 647 | -------------------------------------------------------------------------------- /dlrm_data_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: generate inputs and targets for the DLRM benchmark 7 | # The inputs and outputs are generated according to one of the following three options 8 | # 1) random distribution 9 | # 2) synthetic distribution, based on unique accesses and distances between them 10 | # i) R. Hassan, A. Harris, N. Topham and A. 
Efthymiou "Synthetic Trace-Driven 11 | # Simulation of Cache Memory", IEEE AINAM'07 12 | # 3) public data set 13 | # i) Criteo Kaggle Display Advertising Challenge Dataset 14 | # https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset 15 | # ii) Criteo Terabyte Dataset 16 | # https://labs.criteo.com/2013/12/download-terabyte-click-logs 17 | 18 | 19 | from __future__ import absolute_import, division, print_function, unicode_literals 20 | 21 | # others 22 | from os import path 23 | import bisect 24 | import collections 25 | 26 | import data_utils 27 | 28 | # numpy 29 | import numpy as np 30 | from numpy import random as ra 31 | 32 | # pytorch 33 | import torch 34 | from torch.utils.data import Dataset, RandomSampler 35 | 36 | import data_loader_terabyte 37 | import os 38 | 39 | 40 | # Kaggle Display Advertising Challenge Dataset 41 | # dataset (str): name of dataset (Kaggle or Terabyte) 42 | # randomize (str): determines randomization scheme 43 | # "none": no randomization 44 | # "day": randomizes each day"s data (only works if split = True) 45 | # "total": randomizes total dataset 46 | # split (bool) : to split into train, test, validation data-sets 47 | class CriteoDataset(Dataset): 48 | 49 | def __init__( 50 | self, 51 | dataset, 52 | max_ind_range, 53 | sub_sample_rate, 54 | randomize, 55 | split="train", 56 | raw_path="", 57 | pro_data="", 58 | memory_map=False 59 | ): 60 | # dataset 61 | # tar_fea = 1 # single target 62 | den_fea = 13 # 13 dense features 63 | # spa_fea = 26 # 26 sparse features 64 | # tad_fea = tar_fea + den_fea 65 | # tot_fea = tad_fea + spa_fea 66 | if dataset == "kaggle": 67 | days = 7 68 | out_file = "kaggleAdDisplayChallenge_processed" 69 | elif dataset == "terabyte": 70 | days = 24 71 | out_file = "terabyte_processed" 72 | else: 73 | raise (ValueError("Data set option is not supported")) 74 | self.max_ind_range = max_ind_range 75 | self.memory_map = memory_map 76 | 77 | # split the datafile into path and filename 78 | lstr = raw_path.split("/") 79 | self.d_path = "/".join(lstr[0:-1]) + "/" 80 | self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] 81 | self.npzfile = self.d_path + ( 82 | (self.d_file + "_day") if dataset == "kaggle" else self.d_file 83 | ) 84 | self.trafile = self.d_path + ( 85 | (self.d_file + "_fea") if dataset == "kaggle" else "fea" 86 | ) 87 | 88 | # check if pre-processed data is available 89 | data_ready = True 90 | if memory_map: 91 | for i in range(days): 92 | reo_data = self.npzfile + "_{0}_reordered.npz".format(i) 93 | if not path.exists(str(reo_data)): 94 | data_ready = False 95 | else: 96 | if not path.exists(str(pro_data)): 97 | data_ready = False 98 | 99 | # pre-process data if needed 100 | # WARNNING: when memory mapping is used we get a collection of files 101 | if data_ready: 102 | print("Reading pre-processed data=%s" % (str(pro_data))) 103 | file = str(pro_data) 104 | else: 105 | print("Reading raw data=%s" % (str(raw_path))) 106 | file = data_utils.getCriteoAdData( 107 | raw_path, 108 | out_file, 109 | max_ind_range, 110 | sub_sample_rate, 111 | days, 112 | split, 113 | randomize, 114 | dataset == "kaggle", 115 | memory_map 116 | ) 117 | 118 | # get a number of samples per day 119 | total_file = self.d_path + self.d_file + "_day_count.npz" 120 | with np.load(total_file) as data: 121 | total_per_file = data["total_per_file"] 122 | # compute offsets per file 123 | self.offset_per_file = np.array([0] + [x for x in total_per_file]) 124 | for i in range(days): 125 | self.offset_per_file[i + 
1] += self.offset_per_file[i] 126 | # print(self.offset_per_file) 127 | 128 | # setup data 129 | if memory_map: 130 | # setup the training/testing split 131 | self.split = split 132 | if split == 'none' or split == 'train': 133 | self.day = 0 134 | self.max_day_range = days if split == 'none' else days - 1 135 | elif split == 'test' or split == 'val': 136 | self.day = days - 1 137 | num_samples = self.offset_per_file[days] - \ 138 | self.offset_per_file[days - 1] 139 | self.test_size = int(np.ceil(num_samples / 2.)) 140 | self.val_size = num_samples - self.test_size 141 | else: 142 | sys.exit("ERROR: dataset split is neither none, nor train or test.") 143 | 144 | ''' 145 | # text 146 | print("text") 147 | for i in range(days): 148 | fi = self.npzfile + "_{0}".format(i) 149 | with open(fi) as data: 150 | ttt = 0; nnn = 0 151 | for _j, line in enumerate(data): 152 | ttt +=1 153 | if np.int32(line[0]) > 0: 154 | nnn +=1 155 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" 156 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") 157 | # processed 158 | print("processed") 159 | for i in range(days): 160 | fi = self.npzfile + "_{0}_processed.npz".format(i) 161 | with np.load(fi) as data: 162 | yyy = data["y"] 163 | ttt = len(yyy) 164 | nnn = np.count_nonzero(yyy) 165 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" 166 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") 167 | # reordered 168 | print("reordered") 169 | for i in range(days): 170 | fi = self.npzfile + "_{0}_reordered.npz".format(i) 171 | with np.load(fi) as data: 172 | yyy = data["y"] 173 | ttt = len(yyy) 174 | nnn = np.count_nonzero(yyy) 175 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros=" 176 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%") 177 | ''' 178 | 179 | # load unique counts 180 | with np.load(self.d_path + self.d_file + "_fea_count.npz") as data: 181 | self.counts = data["counts"] 182 | self.m_den = den_fea # X_int.shape[1] 183 | self.n_emb = len(self.counts) 184 | print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den)) 185 | 186 | # Load the test data 187 | # Only a single day is used for testing 188 | if self.split == 'test' or self.split == 'val': 189 | # only a single day is used for testing 190 | fi = self.npzfile + "_{0}_reordered.npz".format( 191 | self.day 192 | ) 193 | with np.load(fi) as data: 194 | self.X_int = data["X_int"] # continuous feature 195 | self.X_cat = data["X_cat"] # categorical feature 196 | self.y = data["y"] # target 197 | 198 | else: 199 | # load and preprocess data 200 | with np.load(file) as data: 201 | X_int = data["X_int"] # continuous feature 202 | X_cat = data["X_cat"] # categorical feature 203 | y = data["y"] # target 204 | self.counts = data["counts"] 205 | self.m_den = X_int.shape[1] # den_fea 206 | self.n_emb = len(self.counts) 207 | print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den)) 208 | 209 | # create reordering 210 | indices = np.arange(len(y)) 211 | 212 | if split == "none": 213 | # randomize all data 214 | if randomize == "total": 215 | indices = np.random.permutation(indices) 216 | print("Randomized indices...") 217 | 218 | X_int[indices] = X_int 219 | X_cat[indices] = X_cat 220 | y[indices] = y 221 | 222 | else: 223 | indices = np.array_split(indices, self.offset_per_file[1:-1]) 224 | 225 | # randomize train data (per day) 226 | if randomize == "day": # or randomize == "total": 227 | for i in range(len(indices) - 1): 228 | indices[i] = np.random.permutation(indices[i]) 229 | 
print("Randomized indices per day ...") 230 | 231 | train_indices = np.concatenate(indices[:-1]) 232 | test_indices = indices[-1] 233 | test_indices, val_indices = np.array_split(test_indices, 2) 234 | 235 | print("Defined %s indices..." % (split)) 236 | 237 | # randomize train data (across days) 238 | if randomize == "total": 239 | train_indices = np.random.permutation(train_indices) 240 | print("Randomized indices across days ...") 241 | 242 | # create training, validation, and test sets 243 | if split == 'train': 244 | self.X_int = [X_int[i] for i in train_indices] 245 | self.X_cat = [X_cat[i] for i in train_indices] 246 | self.y = [y[i] for i in train_indices] 247 | elif split == 'val': 248 | self.X_int = [X_int[i] for i in val_indices] 249 | self.X_cat = [X_cat[i] for i in val_indices] 250 | self.y = [y[i] for i in val_indices] 251 | elif split == 'test': 252 | self.X_int = [X_int[i] for i in test_indices] 253 | self.X_cat = [X_cat[i] for i in test_indices] 254 | self.y = [y[i] for i in test_indices] 255 | 256 | print("Split data according to indices...") 257 | 258 | def __getitem__(self, index): 259 | 260 | if isinstance(index, slice): 261 | return [ 262 | self[idx] for idx in range( 263 | index.start or 0, index.stop or len(self), index.step or 1 264 | ) 265 | ] 266 | 267 | if self.memory_map: 268 | if self.split == 'none' or self.split == 'train': 269 | # check if need to swicth to next day and load data 270 | if index == self.offset_per_file[self.day]: 271 | # print("day_boundary switch", index) 272 | self.day_boundary = self.offset_per_file[self.day] 273 | fi = self.npzfile + "_{0}_reordered.npz".format( 274 | self.day 275 | ) 276 | # print('Loading file: ', fi) 277 | with np.load(fi) as data: 278 | self.X_int = data["X_int"] # continuous feature 279 | self.X_cat = data["X_cat"] # categorical feature 280 | self.y = data["y"] # target 281 | self.day = (self.day + 1) % self.max_day_range 282 | 283 | i = index - self.day_boundary 284 | elif self.split == 'test' or self.split == 'val': 285 | # only a single day is used for testing 286 | i = index + (0 if self.split == 'test' else self.test_size) 287 | else: 288 | sys.exit("ERROR: dataset split is neither none, nor train or test.") 289 | else: 290 | i = index 291 | 292 | if self.max_ind_range > 0: 293 | return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i] 294 | else: 295 | return self.X_int[i], self.X_cat[i], self.y[i] 296 | 297 | def _default_preprocess(self, X_int, X_cat, y): 298 | X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1) 299 | if self.max_ind_range > 0: 300 | X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long) 301 | else: 302 | X_cat = torch.tensor(X_cat, dtype=torch.long) 303 | y = torch.tensor(y.astype(np.float32)) 304 | 305 | return X_int, X_cat, y 306 | 307 | def __len__(self): 308 | if self.memory_map: 309 | if self.split == 'none': 310 | return self.offset_per_file[-1] 311 | elif self.split == 'train': 312 | return self.offset_per_file[-2] 313 | elif self.split == 'test': 314 | return self.test_size 315 | elif self.split == 'val': 316 | return self.val_size 317 | else: 318 | sys.exit("ERROR: dataset split is neither none, nor train nor test.") 319 | else: 320 | return len(self.y) 321 | 322 | 323 | def collate_wrapper_criteo(list_of_tuples): 324 | # where each tuple is (X_int, X_cat, y) 325 | transposed_data = list(zip(*list_of_tuples)) 326 | X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1) 327 | X_cat = torch.tensor(transposed_data[1], 
dtype=torch.long) 328 | T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1) 329 | 330 | # import pdb; pdb.set_trace() 331 | 332 | batchSize = X_cat.shape[0] 333 | featureCnt = X_cat.shape[1] 334 | 335 | lS_i = [X_cat[:, i] for i in range(featureCnt)] 336 | lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)] 337 | 338 | return X_int, torch.stack(lS_o), torch.stack(lS_i), T 339 | 340 | 341 | def criteo_worker_pin_fn(worker_id): 342 | this_pid = os.getpid() 343 | os.system("taskset -p -c %d %d" % (13 + worker_id, this_pid)) 344 | 345 | 346 | def ensure_dataset_preprocessed(args, d_path): 347 | _ = CriteoDataset( 348 | args.data_set, 349 | args.max_ind_range, 350 | args.data_sub_sample_rate, 351 | args.data_randomize, 352 | "train", 353 | args.raw_data_file, 354 | args.processed_data_file, 355 | args.memory_map 356 | ) 357 | 358 | _ = CriteoDataset( 359 | args.data_set, 360 | args.max_ind_range, 361 | args.data_sub_sample_rate, 362 | args.data_randomize, 363 | "test", 364 | args.raw_data_file, 365 | args.processed_data_file, 366 | args.memory_map 367 | ) 368 | 369 | for split in ['train', 'val', 'test']: 370 | print('Running preprocessing for split =', split) 371 | 372 | train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day) 373 | for 374 | day in range(0, 23)] 375 | 376 | test_valid_file = args.raw_data_file + '_23_reordered.npz' 377 | 378 | output_file = d_path + '_{}.bin'.format(split) 379 | 380 | input_files = train_files if split == 'train' else [test_valid_file] 381 | data_loader_terabyte.numpy_to_binary(input_files=input_files, 382 | output_file_path=output_file, 383 | split=split) 384 | 385 | 386 | def make_criteo_data_and_loaders(args): 387 | if args.large_batch and args.memory_map and args.data_set == "terabyte": 388 | # more efficient for larger batches 389 | data_directory = path.dirname(args.raw_data_file) 390 | 391 | if args.mlperf_bin_loader: 392 | lstr = args.processed_data_file.split("/") 393 | d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0] 394 | train_file = d_path + "_train.bin" 395 | test_file = d_path + "_test.bin" 396 | # val_file = d_path + "_val.bin" 397 | counts_file = args.raw_data_file + '_fea_count.npz' 398 | 399 | if any(not path.exists(p) for p in [train_file, 400 | test_file, 401 | counts_file]): 402 | ensure_dataset_preprocessed(args, d_path) 403 | 404 | train_data = data_loader_terabyte.CriteoBinDataset( 405 | data_file=train_file, 406 | counts_file=counts_file, 407 | batch_size=args.mini_batch_size, 408 | max_ind_range=args.max_ind_range 409 | ) 410 | 411 | train_loader = torch.utils.data.DataLoader( 412 | train_data, 413 | batch_size=None, 414 | batch_sampler=None, 415 | shuffle=False, 416 | num_workers=0, 417 | collate_fn=None, 418 | pin_memory=False, 419 | drop_last=False, 420 | sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None 421 | ) 422 | 423 | test_data = data_loader_terabyte.CriteoBinDataset( 424 | data_file=test_file, 425 | counts_file=counts_file, 426 | batch_size=args.test_mini_batch_size, 427 | max_ind_range=args.max_ind_range 428 | ) 429 | 430 | test_loader = torch.utils.data.DataLoader( 431 | test_data, 432 | batch_size=None, 433 | batch_sampler=None, 434 | shuffle=False, 435 | num_workers=0, 436 | collate_fn=None, 437 | pin_memory=False, 438 | drop_last=False, 439 | ) 440 | else: 441 | data_filename = args.raw_data_file.split("/")[-1] 442 | 443 | train_data = CriteoDataset( 444 | args.data_set, 445 | args.max_ind_range, 446 | args.data_sub_sample_rate, 447 | 
args.data_randomize, 448 | "train", 449 | args.raw_data_file, 450 | args.processed_data_file, 451 | args.memory_map 452 | ) 453 | 454 | test_data = CriteoDataset( 455 | args.data_set, 456 | args.max_ind_range, 457 | args.data_sub_sample_rate, 458 | args.data_randomize, 459 | "test", 460 | args.raw_data_file, 461 | args.processed_data_file, 462 | args.memory_map 463 | ) 464 | 465 | train_loader = data_loader_terabyte.DataLoader( 466 | data_directory=data_directory, 467 | data_filename=data_filename, 468 | days=list(range(23)), 469 | batch_size=args.mini_batch_size, 470 | max_ind_range=args.max_ind_range, 471 | split="train", 472 | drop_last_batch=True 473 | ) 474 | 475 | cache_loader = data_loader_terabyte.DataLoader( 476 | data_directory=data_directory, 477 | data_filename=data_filename, 478 | days=list(range(23)), 479 | batch_size=args.mini_batch_size, 480 | max_ind_range=args.max_ind_range, 481 | split="train", 482 | drop_last_batch=False 483 | ) 484 | 485 | test_loader = data_loader_terabyte.DataLoader( 486 | data_directory=data_directory, 487 | data_filename=data_filename, 488 | days=[23], 489 | batch_size=args.test_mini_batch_size, 490 | max_ind_range=args.max_ind_range, 491 | split="test" 492 | ) 493 | else: 494 | train_data = CriteoDataset( 495 | args.data_set, 496 | args.max_ind_range, 497 | args.data_sub_sample_rate, 498 | args.data_randomize, 499 | "train", 500 | args.raw_data_file, 501 | args.processed_data_file, 502 | args.memory_map 503 | ) 504 | 505 | test_data = CriteoDataset( 506 | args.data_set, 507 | args.max_ind_range, 508 | args.data_sub_sample_rate, 509 | args.data_randomize, 510 | "test", 511 | args.raw_data_file, 512 | args.processed_data_file, 513 | args.memory_map 514 | ) 515 | 516 | train_loader = torch.utils.data.DataLoader( 517 | train_data, 518 | batch_size=args.mini_batch_size, 519 | shuffle=False, 520 | num_workers=args.num_workers, 521 | collate_fn=collate_wrapper_criteo, 522 | pin_memory=False, 523 | drop_last=False, # True 524 | worker_init_fn=criteo_worker_pin_fn 525 | ) 526 | 527 | cache_loader = torch.utils.data.DataLoader( 528 | train_data, 529 | batch_size=args.cache_workers * args.lookahead * args.mini_batch_size, 530 | shuffle=False, 531 | num_workers=1, 532 | collate_fn=collate_wrapper_criteo, 533 | pin_memory=False, 534 | drop_last=False 535 | ) 536 | 537 | test_loader = torch.utils.data.DataLoader( 538 | test_data, 539 | batch_size=args.test_mini_batch_size, 540 | shuffle=False, 541 | num_workers=0, 542 | collate_fn=collate_wrapper_criteo, 543 | pin_memory=False, 544 | drop_last=False, # True 545 | ) 546 | 547 | return train_data, train_loader, test_data, test_loader, cache_loader 548 | 549 | 550 | # uniform ditribution (input data) 551 | class RandomDataset(Dataset): 552 | 553 | def __init__( 554 | self, 555 | m_den, 556 | ln_emb, 557 | data_size, 558 | num_batches, 559 | mini_batch_size, 560 | num_indices_per_lookup, 561 | num_indices_per_lookup_fixed, 562 | num_targets=1, 563 | round_targets=False, 564 | data_generation="random", 565 | trace_file="", 566 | enable_padding=False, 567 | reset_seed_on_access=False, 568 | rand_seed=0 569 | ): 570 | # compute batch size 571 | nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) 572 | if num_batches != 0: 573 | nbatches = num_batches 574 | data_size = nbatches * mini_batch_size 575 | # print("Total number of batches %d" % nbatches) 576 | 577 | # save args (recompute data_size if needed) 578 | self.m_den = m_den 579 | self.ln_emb = ln_emb 580 | self.data_size = data_size 581 | self.num_batches 
= nbatches 582 | self.mini_batch_size = mini_batch_size 583 | self.num_indices_per_lookup = num_indices_per_lookup 584 | self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed 585 | self.num_targets = num_targets 586 | self.round_targets = round_targets 587 | self.data_generation = data_generation 588 | self.trace_file = trace_file 589 | self.enable_padding = enable_padding 590 | self.reset_seed_on_access = reset_seed_on_access 591 | self.rand_seed = rand_seed 592 | 593 | def reset_numpy_seed(self, numpy_rand_seed): 594 | np.random.seed(numpy_rand_seed) 595 | # torch.manual_seed(numpy_rand_seed) 596 | 597 | def __getitem__(self, index): 598 | 599 | if isinstance(index, slice): 600 | return [ 601 | self[idx] for idx in range( 602 | index.start or 0, index.stop or len(self), index.step or 1 603 | ) 604 | ] 605 | 606 | # WARNING: reset seed on access to first element 607 | # (e.g. if same random samples needed across epochs) 608 | if self.reset_seed_on_access and index == 0: 609 | self.reset_numpy_seed(self.rand_seed) 610 | 611 | # number of data points in a batch 612 | n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size)) 613 | 614 | # generate a batch of dense and sparse features 615 | if self.data_generation == "random": 616 | (X, lS_o, lS_i) = generate_uniform_input_batch( 617 | self.m_den, 618 | self.ln_emb, 619 | n, 620 | self.num_indices_per_lookup, 621 | self.num_indices_per_lookup_fixed 622 | ) 623 | elif self.data_generation == "synthetic": 624 | (X, lS_o, lS_i) = generate_synthetic_input_batch( 625 | self.m_den, 626 | self.ln_emb, 627 | n, 628 | self.num_indices_per_lookup, 629 | self.num_indices_per_lookup_fixed, 630 | self.trace_file, 631 | self.enable_padding 632 | ) 633 | else: 634 | sys.exit( 635 | "ERROR: --data-generation=" + self.data_generation + " is not supported" 636 | ) 637 | 638 | # generate a batch of target (probability of a click) 639 | T = generate_random_output_batch(n, self.num_targets, self.round_targets) 640 | 641 | return (X, lS_o, lS_i, T) 642 | 643 | def __len__(self): 644 | # WARNING: note that we produce bacthes of outputs in __getitem__ 645 | # therefore we should use num_batches rather than data_size below 646 | return self.num_batches 647 | 648 | 649 | def collate_wrapper_random(list_of_tuples): 650 | # where each tuple is (X, lS_o, lS_i, T) 651 | (X, lS_o, lS_i, T) = list_of_tuples[0] 652 | return (X, 653 | torch.stack(lS_o), 654 | lS_i, 655 | T) 656 | 657 | 658 | def make_random_data_and_loader(args, ln_emb, m_den): 659 | train_data = RandomDataset( 660 | m_den, 661 | ln_emb, 662 | args.data_size, 663 | args.num_batches, 664 | args.mini_batch_size, 665 | args.num_indices_per_lookup, 666 | args.num_indices_per_lookup_fixed, 667 | 1, # num_targets 668 | args.round_targets, 669 | args.data_generation, 670 | args.data_trace_file, 671 | args.data_trace_enable_padding, 672 | reset_seed_on_access=True, 673 | rand_seed=args.numpy_rand_seed 674 | ) # WARNING: generates a batch of lookups at once 675 | train_loader = torch.utils.data.DataLoader( 676 | train_data, 677 | batch_size=1, 678 | shuffle=False, 679 | num_workers=args.num_workers, 680 | collate_fn=collate_wrapper_random, 681 | pin_memory=False, 682 | drop_last=False, # True 683 | ) 684 | return train_data, train_loader 685 | 686 | 687 | def generate_random_data( 688 | m_den, 689 | ln_emb, 690 | data_size, 691 | num_batches, 692 | mini_batch_size, 693 | num_indices_per_lookup, 694 | num_indices_per_lookup_fixed, 695 | num_targets=1, 696 | round_targets=False, 697 | 
data_generation="random", 698 | trace_file="", 699 | enable_padding=False, 700 | ): 701 | nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size)) 702 | if num_batches != 0: 703 | nbatches = num_batches 704 | data_size = nbatches * mini_batch_size 705 | # print("Total number of batches %d" % nbatches) 706 | 707 | # inputs 708 | lT = [] 709 | lX = [] 710 | lS_offsets = [] 711 | lS_indices = [] 712 | for j in range(0, nbatches): 713 | # number of data points in a batch 714 | n = min(mini_batch_size, data_size - (j * mini_batch_size)) 715 | 716 | # generate a batch of dense and sparse features 717 | if data_generation == "random": 718 | (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch( 719 | m_den, 720 | ln_emb, 721 | n, 722 | num_indices_per_lookup, 723 | num_indices_per_lookup_fixed 724 | ) 725 | elif data_generation == "synthetic": 726 | (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch( 727 | m_den, 728 | ln_emb, 729 | n, 730 | num_indices_per_lookup, 731 | num_indices_per_lookup_fixed, 732 | trace_file, 733 | enable_padding 734 | ) 735 | else: 736 | sys.exit( 737 | "ERROR: --data-generation=" + data_generation + " is not supported" 738 | ) 739 | # dense feature 740 | lX.append(Xt) 741 | # sparse feature (sparse indices) 742 | lS_offsets.append(lS_emb_offsets) 743 | lS_indices.append(lS_emb_indices) 744 | 745 | # generate a batch of target (probability of a click) 746 | P = generate_random_output_batch(n, num_targets, round_targets) 747 | lT.append(P) 748 | 749 | return (nbatches, lX, lS_offsets, lS_indices, lT) 750 | 751 | 752 | def generate_random_output_batch(n, num_targets, round_targets=False): 753 | # target (probability of a click) 754 | if round_targets: 755 | P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.float32) 756 | else: 757 | P = ra.rand(n, num_targets).astype(np.float32) 758 | 759 | return torch.tensor(P) 760 | 761 | 762 | # uniform ditribution (input data) 763 | def generate_uniform_input_batch( 764 | m_den, 765 | ln_emb, 766 | n, 767 | num_indices_per_lookup, 768 | num_indices_per_lookup_fixed, 769 | ): 770 | # dense feature 771 | Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) 772 | 773 | # sparse feature (sparse indices) 774 | lS_emb_offsets = [] 775 | lS_emb_indices = [] 776 | # for each embedding generate a list of n lookups, 777 | # where each lookup is composed of multiple sparse indices 778 | for size in ln_emb: 779 | lS_batch_offsets = [] 780 | lS_batch_indices = [] 781 | offset = 0 782 | for _ in range(n): 783 | # num of sparse indices to be used per embedding (between 784 | if num_indices_per_lookup_fixed: 785 | sparse_group_size = np.int64(num_indices_per_lookup) 786 | else: 787 | # random between [1,num_indices_per_lookup]) 788 | r = ra.random(1) 789 | sparse_group_size = np.int64( 790 | np.round(max([1.0], r * min(size, num_indices_per_lookup))) 791 | ) 792 | # sparse indices to be used per embedding 793 | r = ra.random(sparse_group_size) 794 | sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64)) 795 | # reset sparse_group_size in case some index duplicates were removed 796 | sparse_group_size = np.int64(sparse_group.size) 797 | # store lengths and indices 798 | lS_batch_offsets += [offset] 799 | lS_batch_indices += sparse_group.tolist() 800 | # update offset for next iteration 801 | offset += sparse_group_size 802 | lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) 803 | lS_emb_indices.append(torch.tensor(lS_batch_indices)) 804 | 805 | return (Xt, lS_emb_offsets, 
lS_emb_indices) 806 | 807 | 808 | # synthetic distribution (input data) 809 | def generate_synthetic_input_batch( 810 | m_den, 811 | ln_emb, 812 | n, 813 | num_indices_per_lookup, 814 | num_indices_per_lookup_fixed, 815 | trace_file, 816 | enable_padding=False, 817 | ): 818 | # dense feature 819 | Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32)) 820 | 821 | # sparse feature (sparse indices) 822 | lS_emb_offsets = [] 823 | lS_emb_indices = [] 824 | # for each embedding generate a list of n lookups, 825 | # where each lookup is composed of multiple sparse indices 826 | for i, size in enumerate(ln_emb): 827 | lS_batch_offsets = [] 828 | lS_batch_indices = [] 829 | offset = 0 830 | for _ in range(n): 831 | # num of sparse indices to be used per embedding (between 832 | if num_indices_per_lookup_fixed: 833 | sparse_group_size = np.int64(num_indices_per_lookup) 834 | else: 835 | # random between [1,num_indices_per_lookup]) 836 | r = ra.random(1) 837 | sparse_group_size = np.int64( 838 | max(1, np.round(r * min(size, num_indices_per_lookup))[0]) 839 | ) 840 | # sparse indices to be used per embedding 841 | file_path = trace_file 842 | line_accesses, list_sd, cumm_sd = read_dist_from_file( 843 | file_path.replace("j", str(i)) 844 | ) 845 | # debug prints 846 | # print("input") 847 | # print(line_accesses); print(list_sd); print(cumm_sd); 848 | # print(sparse_group_size) 849 | # approach 1: rand 850 | # r = trace_generate_rand( 851 | # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding 852 | # ) 853 | # approach 2: lru 854 | r = trace_generate_lru( 855 | line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding 856 | ) 857 | # WARNING: if the distribution in the file is not consistent 858 | # with embedding table dimensions, below mod guards against out 859 | # of range access 860 | sparse_group = np.unique(r).astype(np.int64) 861 | minsg = np.min(sparse_group) 862 | maxsg = np.max(sparse_group) 863 | if (minsg < 0) or (size <= maxsg): 864 | print( 865 | "WARNING: distribution is inconsistent with embedding " 866 | + "table size (using mod to recover and continue)" 867 | ) 868 | sparse_group = np.mod(sparse_group, size).astype(np.int64) 869 | # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64)) 870 | # reset sparse_group_size in case some index duplicates were removed 871 | sparse_group_size = np.int64(sparse_group.size) 872 | # store lengths and indices 873 | lS_batch_offsets += [offset] 874 | lS_batch_indices += sparse_group.tolist() 875 | # update offset for next iteration 876 | offset += sparse_group_size 877 | lS_emb_offsets.append(torch.tensor(lS_batch_offsets)) 878 | lS_emb_indices.append(torch.tensor(lS_batch_indices)) 879 | 880 | return (Xt, lS_emb_offsets, lS_emb_indices) 881 | 882 | 883 | def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False): 884 | u = ra.rand(1) 885 | if i < max_i: 886 | # only generate stack distances up to the number of new references seen so far 887 | j = bisect.bisect(cumm_val, i) - 1 888 | fi = cumm_dist[j] 889 | u *= fi # shrink distribution support to exclude last values 890 | elif enable_padding: 891 | # WARNING: disable generation of new references (once all have been seen) 892 | fi = cumm_dist[0] 893 | u = (1.0 - fi) * u + fi # remap distribution support to exclude first value 894 | 895 | for (j, f) in enumerate(cumm_dist): 896 | if u <= f: 897 | return cumm_val[j] 898 | 899 | 900 | # WARNING: global define, must be consistent across all synthetic functions 901 | cache_line_size = 1 
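# Editor's sketch (not part of the original file): how the synthetic-trace helpers below
# fit together, mirroring the __main__ block at the end of this module. Assuming `trace`
# is a 1-D list of integer accesses:
#
#     _, stack_distances, line_accesses = trace_profile(trace)
#     stack_distances.reverse(); line_accesses.reverse()
#     dc = sorted(collections.Counter(stack_distances).items())
#     n = float(len(stack_distances))
#     list_sd = [sd for sd, _ in dc]                      # support of the stack-distance distribution
#     cumm_sd = list(np.cumsum([k / n for _, k in dc]))   # its empirical CDF
#     synthetic = trace_generate_lru(line_accesses, list_sd, cumm_sd, len(trace))
#
# trace_generate_lru() then replays the stack-distance distribution against an LRU stack
# seeded with line_accesses to emit a synthetic trace of the requested length.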
902 | 903 | 904 | def trace_generate_lru( 905 | line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False 906 | ): 907 | max_sd = list_sd[-1] 908 | l = len(line_accesses) 909 | i = 0 910 | ztrace = [] 911 | for _ in range(out_trace_len): 912 | sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) 913 | mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 914 | 915 | # generate memory reference 916 | if sd == 0: # new reference # 917 | line_ref = line_accesses.pop(0) 918 | line_accesses.append(line_ref) 919 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) 920 | i += 1 921 | else: # existing reference # 922 | line_ref = line_accesses[l - sd] 923 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) 924 | line_accesses.pop(l - sd) 925 | line_accesses.append(line_ref) 926 | # save generated memory reference 927 | ztrace.append(mem_ref) 928 | 929 | return ztrace 930 | 931 | 932 | def trace_generate_rand( 933 | line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False 934 | ): 935 | max_sd = list_sd[-1] 936 | l = len(line_accesses) # !!!Unique, 937 | i = 0 938 | ztrace = [] 939 | for _ in range(out_trace_len): 940 | sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding) 941 | mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0 942 | # generate memory reference 943 | if sd == 0: # new reference # 944 | line_ref = line_accesses.pop(0) 945 | line_accesses.append(line_ref) 946 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) 947 | i += 1 948 | else: # existing reference # 949 | line_ref = line_accesses[l - sd] 950 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line) 951 | ztrace.append(mem_ref) 952 | 953 | return ztrace 954 | 955 | 956 | def trace_profile(trace, enable_padding=False): 957 | # number of elements in the array (assuming 1D) 958 | # n = trace.size 959 | 960 | rstack = [] # S 961 | stack_distances = [] # SDS 962 | line_accesses = [] # L 963 | for x in trace: 964 | r = np.uint64(x / cache_line_size) 965 | l = len(rstack) 966 | try: # found # 967 | i = rstack.index(r) 968 | # WARNING: I believe below is the correct depth in terms of meaning of the 969 | # algorithm, but that is not what seems to be in the paper alg. 970 | # -1 can be subtracted if we defined the distance between 971 | # consecutive accesses (e.g. r, r) as 0 rather than 1. 972 | sd = l - i # - 1 973 | # push r to the end of stack_distances 974 | stack_distances.insert(0, sd) 975 | # remove r from its position and insert to the top of stack 976 | rstack.pop(i) # rstack.remove(r) 977 | rstack.insert(l - 1, r) 978 | except ValueError: # not found # 979 | sd = 0 # -1 980 | # push r to the end of stack_distances/line_accesses 981 | stack_distances.insert(0, sd) 982 | line_accesses.insert(0, r) 983 | # push r to the top of stack 984 | rstack.insert(l, r) 985 | 986 | if enable_padding: 987 | # WARNING: notice that as the ratio between the number of samples (l) 988 | # and cardinality (c) of a sample increases the probability of 989 | # generating a sample gets smaller and smaller because there are 990 | # few new samples compared to repeated samples. This means that for a 991 | # long trace with relatively small cardinality it will take longer to 992 | # generate all new samples and therefore obtain full distribution support 993 | # and hence it takes longer for distribution to resemble the original. 
994 | # Therefore, we may pad the number of new samples to be on par with 995 | # average number of samples l/c artificially. 996 | l = len(stack_distances) 997 | c = max(stack_distances) 998 | padding = int(np.ceil(l / c)) 999 | stack_distances = stack_distances + [0] * padding 1000 | 1001 | return (rstack, stack_distances, line_accesses) 1002 | 1003 | 1004 | # auxiliary read/write routines 1005 | def read_trace_from_file(file_path): 1006 | try: 1007 | with open(file_path) as f: 1008 | if args.trace_file_binary_type: 1009 | array = np.fromfile(f, dtype=np.uint64) 1010 | trace = array.astype(np.uint64).tolist() 1011 | else: 1012 | line = f.readline() 1013 | trace = list(map(lambda x: np.uint64(x), line.split(", "))) 1014 | return trace 1015 | except Exception: 1016 | print("ERROR: no input trace file has been provided") 1017 | 1018 | 1019 | def write_trace_to_file(file_path, trace): 1020 | try: 1021 | if args.trace_file_binary_type: 1022 | with open(file_path, "wb+") as f: 1023 | np.array(trace).astype(np.uint64).tofile(f) 1024 | else: 1025 | with open(file_path, "w+") as f: 1026 | s = str(trace) 1027 | f.write(s[1: len(s) - 1]) 1028 | except Exception: 1029 | print("ERROR: no output trace file has been provided") 1030 | 1031 | 1032 | def read_dist_from_file(file_path): 1033 | try: 1034 | with open(file_path, "r") as f: 1035 | lines = f.read().splitlines() 1036 | except Exception: 1037 | print("Wrong file or file path") 1038 | # read unique accesses 1039 | unique_accesses = [int(el) for el in lines[0].split(", ")] 1040 | # read cumulative distribution (elements are passed as two separate lists) 1041 | list_sd = [int(el) for el in lines[1].split(", ")] 1042 | cumm_sd = [float(el) for el in lines[2].split(", ")] 1043 | 1044 | return unique_accesses, list_sd, cumm_sd 1045 | 1046 | 1047 | def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd): 1048 | try: 1049 | with open(file_path, "w") as f: 1050 | # unique_acesses 1051 | s = str(unique_accesses) 1052 | f.write(s[1: len(s) - 1] + "\n") 1053 | # list_sd 1054 | s = str(list_sd) 1055 | f.write(s[1: len(s) - 1] + "\n") 1056 | # cumm_sd 1057 | s = str(cumm_sd) 1058 | f.write(s[1: len(s) - 1] + "\n") 1059 | except Exception: 1060 | print("Wrong file or file path") 1061 | 1062 | 1063 | if __name__ == "__main__": 1064 | import sys 1065 | import operator 1066 | import argparse 1067 | 1068 | ### parse arguments ### 1069 | parser = argparse.ArgumentParser(description="Generate Synthetic Distributions") 1070 | parser.add_argument("--trace-file", type=str, default="./input/trace.log") 1071 | parser.add_argument("--trace-file-binary-type", type=bool, default=False) 1072 | parser.add_argument("--trace-enable-padding", type=bool, default=False) 1073 | parser.add_argument("--dist-file", type=str, default="./input/dist.log") 1074 | parser.add_argument( 1075 | "--synthetic-file", type=str, default="./input/trace_synthetic.log" 1076 | ) 1077 | parser.add_argument("--numpy-rand-seed", type=int, default=123) 1078 | parser.add_argument("--print-precision", type=int, default=5) 1079 | args = parser.parse_args() 1080 | 1081 | ### some basic setup ### 1082 | np.random.seed(args.numpy_rand_seed) 1083 | np.set_printoptions(precision=args.print_precision) 1084 | 1085 | ### read trace ### 1086 | trace = read_trace_from_file(args.trace_file) 1087 | # print(trace) 1088 | 1089 | ### profile trace ### 1090 | (_, stack_distances, line_accesses) = trace_profile( 1091 | trace, args.trace_enable_padding 1092 | ) 1093 | stack_distances.reverse() 1094 | 
line_accesses.reverse() 1095 | # print(line_accesses) 1096 | # print(stack_distances) 1097 | 1098 | ### compute probability distribution ### 1099 | # count items 1100 | l = len(stack_distances) 1101 | dc = sorted( 1102 | collections.Counter(stack_distances).items(), key=operator.itemgetter(0) 1103 | ) 1104 | 1105 | # create a distribution 1106 | list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc)) # x = tuple_x_k[0] 1107 | dist_sd = list( 1108 | map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc) 1109 | ) # k = tuple_x_k[1] 1110 | cumm_sd = [] # np.cumsum(dc).tolist() #prefixsum 1111 | for i, (_, k) in enumerate(dc): 1112 | if i == 0: 1113 | cumm_sd.append(k / float(l)) 1114 | else: 1115 | # add the 2nd element of the i-th tuple in the dist_sd list 1116 | cumm_sd.append(cumm_sd[i - 1] + (k / float(l))) 1117 | 1118 | ### write stack_distance and line_accesses to a file ### 1119 | write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd) 1120 | 1121 | ### generate correspondinf synthetic ### 1122 | # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file) 1123 | synthetic_trace = trace_generate_lru( 1124 | line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding 1125 | ) 1126 | # synthetic_trace = trace_generate_rand( 1127 | # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding 1128 | # ) 1129 | write_trace_to_file(args.synthetic_file, synthetic_trace) 1130 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: generate inputs and targets for the DLRM benchmark 7 | # 8 | # Utility function(s) to download and pre-process public data sets 9 | # - Criteo Kaggle Display Advertising Challenge Dataset 10 | # https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset 11 | # - Criteo Terabyte Dataset 12 | # https://labs.criteo.com/2013/12/download-terabyte-click-logs 13 | # 14 | # After downloading dataset, run: 15 | # getCriteoAdData( 16 | # datafile="", 17 | # o_filename=kaggleAdDisplayChallenge_processed.npz, 18 | # max_ind_range=-1, 19 | # sub_sample_rate=0.0, 20 | # days=7, 21 | # data_split='train', 22 | # randomize='total', 23 | # criteo_kaggle=True, 24 | # memory_map=False 25 | # ) 26 | # getCriteoAdData( 27 | # datafile="", 28 | # o_filename=terabyte_processed.npz, 29 | # max_ind_range=-1, 30 | # sub_sample_rate=0.0, 31 | # days=24, 32 | # data_split='train', 33 | # randomize='total', 34 | # criteo_kaggle=False, 35 | # memory_map=False 36 | # ) 37 | 38 | from __future__ import absolute_import, division, print_function, unicode_literals 39 | 40 | import sys 41 | # import os 42 | from os import path 43 | # import io 44 | # from io import StringIO 45 | # import collections as coll 46 | 47 | import numpy as np 48 | 49 | 50 | def convertUStringToDistinctIntsDict(mat, convertDicts, counts): 51 | # Converts matrix of unicode strings into distinct integers. 
52 | # 53 | # Inputs: 54 | # mat (np.array): array of unicode strings to convert 55 | # convertDicts (list): dictionary for each column 56 | # counts (list): number of different categories in each column 57 | # 58 | # Outputs: 59 | # out (np.array): array of output integers 60 | # convertDicts (list): dictionary for each column 61 | # counts (list): number of different categories in each column 62 | 63 | # check if convertDicts and counts match correct length of mat 64 | if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]: 65 | print("Length of convertDicts or counts does not match input shape") 66 | print("Generating convertDicts and counts...") 67 | 68 | convertDicts = [{} for _ in range(mat.shape[1])] 69 | counts = [0 for _ in range(mat.shape[1])] 70 | 71 | # initialize output 72 | out = np.zeros(mat.shape) 73 | 74 | for j in range(mat.shape[1]): 75 | for i in range(mat.shape[0]): 76 | # add to convertDict and increment count 77 | if mat[i, j] not in convertDicts[j]: 78 | convertDicts[j][mat[i, j]] = counts[j] 79 | counts[j] += 1 80 | out[i, j] = convertDicts[j][mat[i, j]] 81 | 82 | return out, convertDicts, counts 83 | 84 | 85 | def convertUStringToDistinctIntsUnique(mat, mat_uni, counts): 86 | # mat is an array of 0,...,# samples, with each being 26 categorical features 87 | 88 | # check if mat_unique and counts match correct length of mat 89 | if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]: 90 | print("Length of mat_unique or counts does not match input shape") 91 | print("Generating mat_unique and counts...") 92 | 93 | mat_uni = [np.array([]) for _ in range(mat.shape[1])] 94 | counts = [0 for _ in range(mat.shape[1])] 95 | 96 | # initialize output 97 | out = np.zeros(mat.shape) 98 | ind_map = [np.array([]) for _ in range(mat.shape[1])] 99 | 100 | # find out and assign unique ids to features 101 | for j in range(mat.shape[1]): 102 | m = mat_uni[j].size 103 | mat_concat = np.concatenate((mat_uni[j], mat[:, j])) 104 | mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True) 105 | out[:, j] = ind_map[j][m:] 106 | counts[j] = mat_uni[j].size 107 | 108 | return out, mat_uni, counts 109 | 110 | 111 | def processCriteoAdData(d_path, d_file, npzfile, split, convertDicts, pre_comp_counts): 112 | # Process Kaggle Display Advertising Challenge or Terabyte Dataset 113 | # by converting unicode strings in X_cat to integers and 114 | # converting negative integer values in X_int. 115 | # 116 | # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day. 
117 | # 118 | # Inputs: 119 | # d_path (str): path for {kaggle|terabyte}_day_i.npz files 120 | # split (int): total number of splits in the dataset (typically 7 or 24) 121 | 122 | # process data if not all files exist 123 | for i in range(split): 124 | filename_i = npzfile + "_{0}_processed.npz".format(i) 125 | 126 | if path.exists(filename_i): 127 | print("Using existing " + filename_i, end="\r") 128 | else: 129 | with np.load(npzfile + "_{0}.npz".format(i)) as data: 130 | # categorical features 131 | ''' 132 | # Approach 1a: using empty dictionaries 133 | X_cat, convertDicts, counts = convertUStringToDistinctIntsDict( 134 | data["X_cat"], convertDicts, counts 135 | ) 136 | ''' 137 | ''' 138 | # Approach 1b: using empty np.unique 139 | X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique( 140 | data["X_cat"], convertDicts, counts 141 | ) 142 | ''' 143 | # Approach 2a: using pre-computed dictionaries 144 | X_cat_t = np.zeros(data["X_cat_t"].shape) 145 | for j in range(26): 146 | for k, x in enumerate(data["X_cat_t"][j, :]): 147 | X_cat_t[j, k] = convertDicts[j][x] 148 | # continuous features 149 | X_int = data["X_int"] 150 | X_int[X_int < 0] = 0 151 | # targets 152 | y = data["y"] 153 | 154 | np.savez_compressed( 155 | filename_i, 156 | # X_cat = X_cat, 157 | X_cat=np.transpose(X_cat_t), # transpose of the data 158 | X_int=X_int, 159 | y=y, 160 | ) 161 | print("Processed " + filename_i, end="\r") 162 | print("") 163 | # sanity check (applicable only if counts have been pre-computed & are re-computed) 164 | # for j in range(26): 165 | # if pre_comp_counts[j] != counts[j]: 166 | # sys.exit("ERROR: Sanity check on counts has failed") 167 | # print("\nSanity check on counts passed") 168 | 169 | return 170 | 171 | 172 | def concatCriteoAdData( 173 | d_path, 174 | d_file, 175 | npzfile, 176 | trafile, 177 | days, 178 | data_split, 179 | randomize, 180 | total_per_file, 181 | total_count, 182 | memory_map, 183 | o_filename 184 | ): 185 | # Concatenates different days and saves the result. 
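 # The per-file offsets built in the memory-map branch below are a plain prefix
 # sum over total_per_file; e.g. total_per_file = [3, 5, 2] gives
 # offset_per_file = [0, 3, 8, 10], so day i occupies rows
 # offset_per_file[i]:offset_per_file[i + 1]. Equivalently (sketch, not part of
 # the original file):
 #
 #   offset_per_file = np.concatenate(([0], np.cumsum(total_per_file)))
 #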
186 | # 187 | # Inputs: 188 | # days (int): total number of days in the dataset (typically 7 or 24) 189 | # d_path (str): path for {kaggle|terabyte}_day_i.npz files 190 | # o_filename (str): output file name 191 | # 192 | # Output: 193 | # o_file (str): output file path 194 | 195 | if memory_map: 196 | # dataset break up per fea 197 | # tar_fea = 1 # single target 198 | den_fea = 13 # 13 dense features 199 | spa_fea = 26 # 26 sparse features 200 | # tad_fea = tar_fea + den_fea 201 | # tot_fea = tad_fea + spa_fea 202 | # create offset per file 203 | offset_per_file = np.array([0] + [x for x in total_per_file]) 204 | for i in range(days): 205 | offset_per_file[i + 1] += offset_per_file[i] 206 | 207 | ''' 208 | # Approach 1, 2 and 3 use indices, while Approach 4 does not use them 209 | # create indices 210 | indices = np.arange(total_count) 211 | if data_split == "none": 212 | if randomize == "total": 213 | indices = np.random.permutation(indices) 214 | else: 215 | indices = np.array_split(indices, offset_per_file[1:-1]) 216 | 217 | # randomize train data (per day) 218 | if randomize == "day": # or randomize == "total": 219 | for i in range(len(indices) - 1): 220 | indices[i] = np.random.permutation(indices[i]) 221 | print("Randomized indices per day ...") 222 | 223 | train_indices = np.concatenate(indices[:-1]) 224 | test_indices = indices[-1] 225 | 226 | # randomize train data (across days) 227 | if randomize == "total": 228 | train_indices = np.random.permutation(train_indices) 229 | print("Randomized indices across days ...") 230 | 231 | indices = np.concatenate((train_indices, test_indices)) 232 | # no reordering 233 | # indices = np.arange(total_count) 234 | ''' 235 | ''' 236 | # Approach 1: simple and slow (no grouping is used) 237 | # check if data already exists 238 | recreate_flag = False 239 | for j in range(tot_fea): 240 | filename_j = trafile + "_{0}_reordered.npy".format(j) 241 | if path.exists(filename_j): 242 | print("Using existing " + filename_j) 243 | else: 244 | recreate_flag = True 245 | # load, reorder and concatenate data (memmap all reordered files per feature) 246 | if recreate_flag: 247 | # init reordered files (.npy appended automatically) 248 | z = np.zeros((total_count)) 249 | for j in range(tot_fea): 250 | filename_j = trafile + "_{0}_reordered".format(j) 251 | np.save(filename_j, z) 252 | print("Creating " + filename_j) 253 | 254 | for i in range(days): 255 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) 256 | with np.load(filename_i) as data: 257 | X_cat_t = np.transpose(data["X_cat"]) 258 | X_int_t = np.transpose(data["X_int"]) 259 | y = data["y"] 260 | size = len(y) 261 | # sanity check 262 | if total_per_file[i] != size: 263 | sys.exit("ERROR: sanity check on number of samples failed") 264 | # setup start and end ranges 265 | start = offset_per_file[i] 266 | end = offset_per_file[i + 1] 267 | # print(filename_i) 268 | # print("start=" + str(start) + " end=" + str(end) 269 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) 270 | 271 | for j in range(tot_fea): 272 | filename_j = trafile + "_{0}_reordered.npy".format(j) 273 | fj = np.load(filename_j, mmap_mode='r+') 274 | if j < tar_fea: 275 | fj[indices[start:end]] = y 276 | elif tar_fea <= j and j < tad_fea: 277 | fj[indices[start:end]] = X_int_t[j - tar_fea, :] 278 | else: 279 | fj[indices[start:end]] = X_cat_t[j - tad_fea, :] 280 | del fj 281 | else: 282 | print("Reordered fea files already exist, skipping ...") 283 | 284 | # check if data already exists 285 | recreate_flag = 
False 286 | for i in range(days): 287 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 288 | if path.exists(filename_i): 289 | print("Using existing " + filename_i) 290 | else: 291 | recreate_flag = True 292 | # split reordered data by files (memmap all reordered files per feature) 293 | # on the day boundary del the file object and memmap again 294 | if recreate_flag: 295 | for i in range(days): 296 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 297 | size = total_per_file[i] 298 | X_int_t = np.zeros((den_fea, size)) 299 | X_cat_t = np.zeros((spa_fea, size)) 300 | # setup start and end ranges 301 | start = offset_per_file[i] 302 | end = offset_per_file[i + 1] 303 | print("Creating " + filename_i) 304 | # print("start=" + str(start) + " end=" + str(end) 305 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) 306 | 307 | for j in range(tot_fea): 308 | filename_j = trafile + "_{0}_reordered.npy".format(j) 309 | fj = np.load(filename_j, mmap_mode='r') 310 | if j < tar_fea: 311 | y = fj[start:end] 312 | elif tar_fea <= j and j < tad_fea: 313 | X_int_t[j - tar_fea, :] = fj[start:end] 314 | else: 315 | X_cat_t[j - tad_fea, :] = fj[start:end] 316 | del fj 317 | 318 | np.savez_compressed( 319 | filename_i, 320 | X_cat=np.transpose(X_cat_t), # transpose of the data 321 | X_int=np.transpose(X_int_t), # transpose of the data 322 | y=y, 323 | ) 324 | else: 325 | print("Reordered day files already exist, skipping ...") 326 | ''' 327 | ''' 328 | # Approach 2: group days 329 | # check if data already exists 330 | recreate_flag = False 331 | for j in range(tot_fea): 332 | filename_j = trafile + "_{0}_reordered.npy".format(j) 333 | if path.exists(filename_j): 334 | print("Using existing " + filename_j) 335 | else: 336 | recreate_flag = True 337 | # load, reorder and concatenate data (memmap all reordered files per feature) 338 | if recreate_flag: 339 | # init reordered files (.npy appended automatically) 340 | z = np.zeros((total_count)) 341 | for j in range(tot_fea): 342 | filename_j = trafile + "_{0}_reordered".format(j) 343 | np.save(filename_j, z) 344 | print("Creating " + filename_j) 345 | 346 | group_day = 3 # e.g. 
8, 4 or 3 347 | group_num = days // group_day 348 | file_group = [i*group_day for i in range(group_num)] + [days] 349 | for ii in range(group_num): 350 | # for last may be group_size != group_num, therefore reset it below 351 | group_size = file_group[ii + 1] - file_group[ii] 352 | X_cat_t = [0]*group_size 353 | X_int_t = [0]*group_size 354 | y = [0]*group_size 355 | start = [0]*group_size 356 | end = [0]*group_size 357 | for ig in range(group_size): 358 | i = file_group[ii] + ig 359 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) 360 | # setup start and end ranges 361 | start[ig] = offset_per_file[i] 362 | end[ig] = offset_per_file[i + 1] 363 | # print(filename_i) 364 | # load a group of files 365 | with np.load(filename_i) as data: 366 | X_cat_t[ig] = np.transpose(data["X_cat"]) 367 | X_int_t[ig] = np.transpose(data["X_int"]) 368 | y[ig] = data["y"] 369 | # sanity check 370 | if total_per_file[i] != len(y[ig]): 371 | sys.exit("ERROR: sanity check on number of samples failed") 372 | # print("start=" + str(start) + " end=" + str(end) 373 | # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i])) 374 | 375 | for j in range(tot_fea): 376 | filename_j = trafile + "_{0}_reordered.npy".format(j) 377 | fj = np.load(filename_j, mmap_mode='r+') 378 | for ig in range(group_size): 379 | if j < tar_fea: 380 | fj[indices[start[ig]:end[ig]]] = y[ig] 381 | elif tar_fea <= j and j < tad_fea: 382 | fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :] 383 | else: 384 | fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :] 385 | del fj 386 | else: 387 | print("Reordered fea files already exist, skipping ...") 388 | 389 | # check if data already exists 390 | recreate_flag = False 391 | for i in range(days): 392 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 393 | if path.exists(filename_i): 394 | print("Using existing " + filename_i) 395 | else: 396 | recreate_flag = True 397 | # split reordered data by files (memmap all reordered files per feature) 398 | # on the day boundary del the file object and memmap again 399 | if recreate_flag: 400 | for ii in range(group_num): 401 | # for last may be group_size != group_num, therefore reset it below 402 | group_size = file_group[ii + 1] - file_group[ii] 403 | X_cat_t= []; X_int_t = [] 404 | for ig in range(group_size): 405 | i = file_group[ii] + ig 406 | X_int_t.append(np.zeros((den_fea, total_per_file[i]))) 407 | X_cat_t.append(np.zeros((spa_fea, total_per_file[i]))) 408 | y = [0]*group_size 409 | start = [0]*group_size 410 | end = [0]*group_size 411 | 412 | for j in range(tot_fea): 413 | filename_j = trafile + "_{0}_reordered.npy".format(j) 414 | fj = np.load(filename_j, mmap_mode='r') 415 | # load a group of files 416 | for ig in range(group_size): 417 | i = file_group[ii] + ig 418 | # setup start and end ranges 419 | start[ig] = offset_per_file[i] 420 | end[ig] = offset_per_file[i + 1] 421 | # load data for the group of files 422 | if j < tar_fea: 423 | y[ig] = fj[start[ig]:end[ig]] 424 | elif tar_fea <= j and j < tad_fea: 425 | X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]] 426 | else: 427 | X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]] 428 | del fj 429 | 430 | for ig in range(group_size): 431 | i = file_group[ii] + ig 432 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 433 | print("Creating " + filename_i) 434 | np.savez_compressed( 435 | filename_i, 436 | X_cat=np.transpose(X_cat_t[ig]), # transpose of the data 437 | X_int=np.transpose(X_int_t[ig]), # transpose of the 
data 438 | y=y[ig], 439 | ) 440 | else: 441 | print("Reordered day files already exist, skipping ...") 442 | ''' 443 | ''' 444 | # Approach 3: group features 445 | # check if data already exists 446 | group_fea = 5 # e.g. 8, 5 or 4 447 | group_num = tot_fea // group_fea 448 | if tot_fea % group_fea != 0: # sanity check 449 | sys.exit("ERROR: the group_fea must divided tot_fea evenly.") 450 | recreate_flag = False 451 | for jn in range(group_num): 452 | filename_j = trafile + "_{0}_reordered{1}.npy".format( 453 | jn, group_fea 454 | ) 455 | if path.exists(filename_j): 456 | print("Using existing " + filename_j) 457 | else: 458 | recreate_flag = True 459 | # load, reorder and concatenate data (memmap all reordered files per feature) 460 | if recreate_flag: 461 | # init reordered files (.npy appended automatically) 462 | z = np.zeros((group_fea, total_count)) 463 | for jn in range(group_num): 464 | filename_j = trafile + "_{0}_reordered{1}".format( 465 | jn, group_fea 466 | ) 467 | np.save(filename_j, z) 468 | print("Creating " + filename_j) 469 | 470 | for i in range(days): 471 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i) 472 | with np.load(filename_i) as data: 473 | X_cat_t = np.transpose(data["X_cat"]) 474 | X_int_t = np.transpose(data["X_int"]) 475 | y = data["y"] 476 | size = len(y) 477 | # sanity check 478 | if total_per_file[i] != size: 479 | sys.exit("ERROR: sanity check on number of samples failed") 480 | # setup start and end ranges 481 | start = offset_per_file[i] 482 | end = offset_per_file[i + 1] 483 | # print(filename_i) 484 | # print("start=" + str(start) + " end=" + str(end) 485 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) 486 | 487 | for jn in range(group_num): 488 | filename_j = trafile + "_{0}_reordered{1}.npy".format( 489 | jn, group_fea 490 | ) 491 | fj = np.load(filename_j, mmap_mode='r+') 492 | for jg in range(group_fea): 493 | j = jn * group_fea + jg 494 | # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg)) 495 | if j < tar_fea: 496 | fj[jg, indices[start:end]] = y 497 | elif tar_fea <= j and j < tad_fea: 498 | fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :] 499 | else: 500 | fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :] 501 | del fj 502 | else: 503 | print("Reordered fea files already exist, skipping ...") 504 | 505 | # check if data already exists 506 | recreate_flag = False 507 | for i in range(days): 508 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 509 | if path.exists(filename_i): 510 | print("Using existing" + filename_i) 511 | else: 512 | recreate_flag = True 513 | # split reordered data by files (memmap all reordered files per feature) 514 | # on the day boundary del the file object and memmap again 515 | if recreate_flag: 516 | for i in range(days): 517 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i) 518 | size = total_per_file[i] 519 | X_int_t = np.zeros((den_fea, size)) 520 | X_cat_t = np.zeros((spa_fea, size)) 521 | # setup start and end ranges 522 | start = offset_per_file[i] 523 | end = offset_per_file[i + 1] 524 | print("Creating " + filename_i) 525 | # print("start=" + str(start) + " end=" + str(end) 526 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i])) 527 | 528 | for jn in range(group_num): 529 | filename_j = trafile + "_{0}_reordered{1}.npy".format( 530 | jn, group_fea 531 | ) 532 | fj = np.load(filename_j, mmap_mode='r') 533 | for jg in range(group_fea): 534 | j = jn * group_fea + jg 535 | # print("j=" + str(j) + " jn=" + str(jn) + " 
jg=" + str(jg)) 536 | if j < tar_fea: 537 | y = fj[jg, start:end] 538 | elif tar_fea <= j and j < tad_fea: 539 | X_int_t[j - tar_fea, :] = fj[jg, start:end] 540 | else: 541 | X_cat_t[j - tad_fea, :] = fj[jg, start:end] 542 | del fj 543 | 544 | np.savez_compressed( 545 | filename_i, 546 | X_cat=np.transpose(X_cat_t), # transpose of the data 547 | X_int=np.transpose(X_int_t), # transpose of the data 548 | y=y, 549 | ) 550 | 551 | else: 552 | print("Reordered day files already exist, skipping ...") 553 | ''' 554 | 555 | # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm 556 | # 1st pass of FYR shuffle 557 | # check if data already exists 558 | recreate_flag = False 559 | for j in range(days): 560 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) 561 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) 562 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) 563 | if ( 564 | path.exists(filename_j_y) 565 | and path.exists(filename_j_d) 566 | and path.exists(filename_j_s) 567 | ): 568 | print( 569 | "Using existing\n" 570 | + filename_j_y + "\n" 571 | + filename_j_d + "\n" 572 | + filename_j_s 573 | ) 574 | else: 575 | recreate_flag = True 576 | # reorder across buckets using sampling 577 | if recreate_flag: 578 | # init intermediate files (.npy appended automatically) 579 | for j in range(days): 580 | filename_j_y = npzfile + "_{0}_intermediate_y".format(j) 581 | filename_j_d = npzfile + "_{0}_intermediate_d".format(j) 582 | filename_j_s = npzfile + "_{0}_intermediate_s".format(j) 583 | np.save(filename_j_y, np.zeros((total_per_file[j]))) 584 | np.save(filename_j_d, np.zeros((total_per_file[j], den_fea))) 585 | np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea))) 586 | # start processing files 587 | total_counter = [0] * days 588 | for i in range(days): 589 | filename_i = npzfile + "_{0}_processed.npz".format(i) 590 | with np.load(filename_i) as data: 591 | X_cat = data["X_cat"] 592 | X_int = data["X_int"] 593 | y = data["y"] 594 | size = len(y) 595 | # sanity check 596 | if total_per_file[i] != size: 597 | sys.exit("ERROR: sanity check on number of samples failed") 598 | # debug prints 599 | print("Reordering (1st pass) " + filename_i) 600 | 601 | # create buckets using sampling of random ints 602 | # from (discrete) uniform distribution 603 | buckets = [] 604 | for _j in range(days): 605 | buckets.append([]) 606 | counter = [0] * days 607 | days_to_sample = days if data_split == "none" else days - 1 608 | if randomize == "total": 609 | rand_u = np.random.randint(low=0, high=days_to_sample, size=size) 610 | for k in range(size): 611 | # sample and make sure elements per buckets do not overflow 612 | if data_split == "none" or i < days - 1: 613 | # choose bucket 614 | p = rand_u[k] 615 | # retry of the bucket is full 616 | while total_counter[p] + counter[p] >= total_per_file[p]: 617 | p = np.random.randint(low=0, high=days_to_sample) 618 | else: # preserve the last day/bucket if needed 619 | p = i 620 | buckets[p].append(k) 621 | counter[p] += 1 622 | else: # randomize is day or none 623 | for k in range(size): 624 | # do not sample, preserve the data in this bucket 625 | p = i 626 | buckets[p].append(k) 627 | counter[p] += 1 628 | 629 | # sanity check 630 | if np.sum(counter) != size: 631 | sys.exit("ERROR: sanity check on number of samples failed") 632 | # debug prints 633 | # print(counter) 634 | # print(str(np.sum(counter)) + " = " + str(size)) 635 | # print([len(x) for x in buckets]) 636 | # print(total_counter) 637 | 638 | # partially feel 
the buckets 639 | for j in range(days): 640 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) 641 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) 642 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) 643 | start = total_counter[j] 644 | end = total_counter[j] + counter[j] 645 | # target buckets 646 | fj_y = np.load(filename_j_y, mmap_mode='r+') 647 | # print("start=" + str(start) + " end=" + str(end) 648 | # + " end - start=" + str(end - start) + " " 649 | # + str(fj_y[start:end].shape) + " " 650 | # + str(len(buckets[j]))) 651 | fj_y[start:end] = y[buckets[j]] 652 | del fj_y 653 | # dense buckets 654 | fj_d = np.load(filename_j_d, mmap_mode='r+') 655 | # print("start=" + str(start) + " end=" + str(end) 656 | # + " end - start=" + str(end - start) + " " 657 | # + str(fj_d[start:end, :].shape) + " " 658 | # + str(len(buckets[j]))) 659 | fj_d[start:end, :] = X_int[buckets[j], :] 660 | del fj_d 661 | # sparse buckets 662 | fj_s = np.load(filename_j_s, mmap_mode='r+') 663 | # print("start=" + str(start) + " end=" + str(end) 664 | # + " end - start=" + str(end - start) + " " 665 | # + str(fj_s[start:end, :].shape) + " " 666 | # + str(len(buckets[j]))) 667 | fj_s[start:end, :] = X_cat[buckets[j], :] 668 | del fj_s 669 | # update counters for next step 670 | total_counter[j] += counter[j] 671 | 672 | # 2nd pass of FYR shuffle 673 | # check if data already exists 674 | for j in range(days): 675 | filename_j = npzfile + "_{0}_reordered.npz".format(j) 676 | if path.exists(filename_j): 677 | print("Using existing " + filename_j) 678 | else: 679 | recreate_flag = True 680 | # reorder within buckets 681 | if recreate_flag: 682 | for j in range(days): 683 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j) 684 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j) 685 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j) 686 | fj_y = np.load(filename_j_y) 687 | fj_d = np.load(filename_j_d) 688 | fj_s = np.load(filename_j_s) 689 | 690 | indices = range(total_per_file[j]) 691 | if randomize == "day" or randomize == "total": 692 | if data_split == "none" or j < days - 1: 693 | indices = np.random.permutation(range(total_per_file[j])) 694 | 695 | filename_r = npzfile + "_{0}_reordered.npz".format(j) 696 | print("Reordering (2nd pass) " + filename_r) 697 | np.savez_compressed( 698 | filename_r, 699 | X_cat=fj_s[indices, :], 700 | X_int=fj_d[indices, :], 701 | y=fj_y[indices], 702 | ) 703 | 704 | ''' 705 | # sanity check (under no reordering norms should be zero) 706 | for i in range(days): 707 | filename_i_o = npzfile + "_{0}_processed.npz".format(i) 708 | print(filename_i_o) 709 | with np.load(filename_i_o) as data_original: 710 | X_cat_o = data_original["X_cat"] 711 | X_int_o = data_original["X_int"] 712 | y_o = data_original["y"] 713 | filename_i_r = npzfile + "_{0}_reordered.npz".format(i) 714 | print(filename_i_r) 715 | with np.load(filename_i_r) as data_reordered: 716 | X_cat_r = data_reordered["X_cat"] 717 | X_int_r = data_reordered["X_int"] 718 | y_r = data_reordered["y"] 719 | print(np.linalg.norm(y_o - y_r)) 720 | print(np.linalg.norm(X_int_o - X_int_r)) 721 | print(np.linalg.norm(X_cat_o - X_cat_r)) 722 | ''' 723 | 724 | else: 725 | print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename)) 726 | 727 | # load and concatenate data 728 | for i in range(days): 729 | filename_i = npzfile + "_{0}_processed.npz".format(i) 730 | with np.load(filename_i) as data: 731 | if i == 0: 732 | X_cat = data["X_cat"] 733 | 
X_int = data["X_int"] 734 | y = data["y"] 735 | else: 736 | X_cat = np.concatenate((X_cat, data["X_cat"])) 737 | X_int = np.concatenate((X_int, data["X_int"])) 738 | y = np.concatenate((y, data["y"])) 739 | print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0])) 740 | 741 | with np.load(d_path + d_file + "_fea_count.npz") as data: 742 | counts = data["counts"] 743 | print("Loaded counts!") 744 | 745 | np.savez_compressed( 746 | d_path + o_filename + ".npz", 747 | X_cat=X_cat, 748 | X_int=X_int, 749 | y=y, 750 | counts=counts, 751 | ) 752 | 753 | return d_path + o_filename + ".npz" 754 | 755 | 756 | def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file): 757 | # Transforms Criteo Kaggle or terabyte data by applying log transformation 758 | # on dense features and converting everything to appropriate tensors. 759 | # 760 | # Inputs: 761 | # X_cat (ndarray): array of integers corresponding to preprocessed 762 | # categorical features 763 | # X_int (ndarray): array of integers corresponding to dense features 764 | # y (ndarray): array of bool corresponding to labels 765 | # data_split(str): flag for splitting dataset into training/validation/test 766 | # sets 767 | # randomize (str): determines randomization scheme 768 | # "none": no randomization 769 | # "day": randomizes each day"s data (only works if split = True) 770 | # "total": randomizes total dataset 771 | # 772 | # Outputs: 773 | # if split: 774 | # X_cat_train (tensor): sparse features for training set 775 | # X_int_train (tensor): dense features for training set 776 | # y_train (tensor): labels for training set 777 | # X_cat_val (tensor): sparse features for validation set 778 | # X_int_val (tensor): dense features for validation set 779 | # y_val (tensor): labels for validation set 780 | # X_cat_test (tensor): sparse features for test set 781 | # X_int_test (tensor): dense features for test set 782 | # y_test (tensor): labels for test set 783 | # else: 784 | # X_cat (tensor): sparse features 785 | # X_int (tensor): dense features 786 | # y (tensor): label 787 | 788 | # define initial set of indices 789 | indices = np.arange(len(y)) 790 | 791 | # create offset per file 792 | offset_per_file = np.array([0] + [x for x in total_per_file]) 793 | for i in range(days): 794 | offset_per_file[i + 1] += offset_per_file[i] 795 | 796 | # split dataset 797 | if data_split == 'train': 798 | indices = np.array_split(indices, offset_per_file[1:-1]) 799 | 800 | # randomize train data (per day) 801 | if randomize == "day": # or randomize == "total": 802 | for i in range(len(indices) - 1): 803 | indices[i] = np.random.permutation(indices[i]) 804 | print("Randomized indices per day ...") 805 | 806 | train_indices = np.concatenate(indices[:-1]) 807 | test_indices = indices[-1] 808 | test_indices, val_indices = np.array_split(test_indices, 2) 809 | 810 | print("Defined training and testing indices...") 811 | 812 | # randomize train data (across days) 813 | if randomize == "total": 814 | train_indices = np.random.permutation(train_indices) 815 | print("Randomized indices across days ...") 816 | 817 | # indices = np.concatenate((train_indices, test_indices)) 818 | 819 | # create training, validation, and test sets 820 | X_cat_train = X_cat[train_indices] 821 | X_int_train = X_int[train_indices] 822 | y_train = y[train_indices] 823 | 824 | X_cat_val = X_cat[val_indices] 825 | X_int_val = X_int[val_indices] 826 | y_val = y[val_indices] 827 | 828 | X_cat_test = X_cat[test_indices] 829 | X_int_test = 
X_int[test_indices] 830 | y_test = y[test_indices] 831 | 832 | print("Split data according to indices...") 833 | 834 | X_cat_train = X_cat_train.astype(np.long) 835 | X_int_train = np.log(X_int_train.astype(np.float32) + 1) 836 | y_train = y_train.astype(np.float32) 837 | 838 | X_cat_val = X_cat_val.astype(np.long) 839 | X_int_val = np.log(X_int_val.astype(np.float32) + 1) 840 | y_val = y_val.astype(np.float32) 841 | 842 | X_cat_test = X_cat_test.astype(np.long) 843 | X_int_test = np.log(X_int_test.astype(np.float32) + 1) 844 | y_test = y_test.astype(np.float32) 845 | 846 | print("Converted to tensors...done!") 847 | 848 | return ( 849 | X_cat_train, 850 | X_int_train, 851 | y_train, 852 | X_cat_val, 853 | X_int_val, 854 | y_val, 855 | X_cat_test, 856 | X_int_test, 857 | y_test, 858 | ) 859 | 860 | else: 861 | 862 | # randomize data 863 | if randomize == "total": 864 | indices = np.random.permutation(indices) 865 | print("Randomized indices...") 866 | 867 | X_cat = X_cat[indices].astype(np.long) 868 | X_int = np.log(X_int[indices].astype(np.float32) + 1) 869 | y = y[indices].astype(np.float32) 870 | 871 | print("Converted to tensors...done!") 872 | 873 | return (X_cat, X_int, y, [], [], [], [], [], []) 874 | 875 | 876 | def getCriteoAdData( 877 | datafile, 878 | o_filename, 879 | max_ind_range=-1, 880 | sub_sample_rate=0.0, 881 | days=7, 882 | data_split='train', 883 | randomize='total', 884 | criteo_kaggle=True, 885 | memory_map=False 886 | ): 887 | # Passes through entire dataset and defines dictionaries for categorical 888 | # features and determines the number of total categories. 889 | # 890 | # Inputs: 891 | # datafile : path to downloaded raw data file 892 | # o_filename (str): saves results under o_filename if filename is not "" 893 | # 894 | # Output: 895 | # o_file (str): output file path 896 | 897 | #split the datafile into path and filename 898 | lstr = datafile.split("/") 899 | d_path = "/".join(lstr[0:-1]) + "/" 900 | d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1] 901 | npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file) 902 | trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea") 903 | 904 | # count number of datapoints in training set 905 | total_file = d_path + d_file + "_day_count.npz" 906 | if path.exists(total_file): 907 | with np.load(total_file) as data: 908 | total_per_file = list(data["total_per_file"]) 909 | total_count = np.sum(total_per_file) 910 | print("Skipping counts per file (already exist)") 911 | else: 912 | total_count = 0 913 | total_per_file = [] 914 | if criteo_kaggle: 915 | # WARNING: The raw data consists of a single train.txt file 916 | # Each line in the file is a sample, consisting of 13 continuous and 917 | # 26 categorical features (an extra space indicates that feature is 918 | # missing and will be interpreted as 0). 
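 # The split below spreads samples as evenly as possible across days:
 # num_data_per_split, extras = divmod(total_count, days), with the first
 # `extras` days receiving one extra row each. For example (illustrative
 # numbers only), total_count = 1000 and days = 7 gives
 # total_per_file = [143, 143, 143, 143, 143, 143, 142].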
919 | if path.exists(datafile): 920 | print("Reading data from path=%s" % (datafile)) 921 | with open(str(datafile)) as f: 922 | for _ in f: 923 | total_count += 1 924 | total_per_file.append(total_count) 925 | # reset total per file due to split 926 | num_data_per_split, extras = divmod(total_count, days) 927 | total_per_file = [num_data_per_split] * days 928 | for j in range(extras): 929 | total_per_file[j] += 1 930 | # split into days (simplifies code later on) 931 | file_id = 0 932 | boundary = total_per_file[file_id] 933 | nf = open(npzfile + "_" + str(file_id), "w") 934 | with open(str(datafile)) as f: 935 | for j, line in enumerate(f): 936 | if j == boundary: 937 | nf.close() 938 | file_id += 1 939 | nf = open(npzfile + "_" + str(file_id), "w") 940 | boundary += total_per_file[file_id] 941 | nf.write(line) 942 | nf.close() 943 | else: 944 | sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset") 945 | else: 946 | # WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files 947 | # Each line in the file is a sample, consisting of 13 continuous and 948 | # 26 categorical features (an extra space indicates that feature is 949 | # missing and will be interpreted as 0). 950 | for i in range(days): 951 | datafile_i = datafile + "_" + str(i) # + ".gz" 952 | if path.exists(str(datafile_i)): 953 | print("Reading data from path=%s" % (str(datafile_i))) 954 | # file day_ 955 | total_per_file_count = 0 956 | with open(str(datafile_i)) as f: 957 | for _ in f: 958 | total_per_file_count += 1 959 | total_per_file.append(total_per_file_count) 960 | total_count += total_per_file_count 961 | else: 962 | sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs") 963 | 964 | # process a file worth of data and reinitialize data 965 | # note that a file main contain a single or multiple splits 966 | def process_one_file( 967 | datfile, 968 | npzfile, 969 | split, 970 | num_data_in_split, 971 | ): 972 | with open(str(datfile)) as f: 973 | y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int 974 | X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int 975 | X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int 976 | if sub_sample_rate == 0.0: 977 | rand_u = 1.0 978 | else: 979 | rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split) 980 | 981 | i = 0 982 | for k, line in enumerate(f): 983 | # process a line (data point) 984 | line = line.split('\t') 985 | # set missing values to zero 986 | for j in range(len(line)): 987 | if (line[j] == '') or (line[j] == '\n'): 988 | line[j] = '0' 989 | # sub-sample data by dropping zero targets, if needed 990 | target = np.int32(line[0]) 991 | if target == 0 and \ 992 | (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate: 993 | continue 994 | 995 | y[i] = target 996 | X_int[i] = np.array(line[1:14], dtype=np.int32) 997 | if max_ind_range > 0: 998 | X_cat[i] = np.array( 999 | list(map(lambda x: int(x, 16) % max_ind_range, line[14:])), 1000 | dtype=np.int32 1001 | ) 1002 | else: 1003 | X_cat[i] = np.array( 1004 | list(map(lambda x: int(x, 16), line[14:])), 1005 | dtype=np.int32 1006 | ) 1007 | # count uniques 1008 | for j in range(26): 1009 | convertDicts[j][X_cat[i][j]] = 1 1010 | 1011 | # debug prints 1012 | print( 1013 | "Load %d/%d Split: %d Label True: %d Stored: %d" 1014 | % ( 1015 | i, 1016 | 
num_data_in_split, 1017 | split, 1018 | target, 1019 | y[i], 1020 | ), 1021 | end="\r", 1022 | ) 1023 | i += 1 1024 | 1025 | # store num_data_in_split samples or extras at the end of file 1026 | # count uniques 1027 | # X_cat_t = np.transpose(X_cat) 1028 | # for j in range(26): 1029 | # for x in X_cat_t[j,:]: 1030 | # convertDicts[j][x] = 1 1031 | # store parsed 1032 | filename_s = npzfile + "_{0}.npz".format(split) 1033 | if path.exists(filename_s): 1034 | print("\nSkip existing " + filename_s) 1035 | else: 1036 | np.savez_compressed( 1037 | filename_s, 1038 | X_int=X_int[0:i, :], 1039 | # X_cat=X_cat[0:i, :], 1040 | X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data 1041 | y=y[0:i], 1042 | ) 1043 | print("\nSaved " + npzfile + "_{0}.npz!".format(split)) 1044 | return i 1045 | 1046 | # create all splits (reuse existing files if possible) 1047 | recreate_flag = False 1048 | convertDicts = [{} for _ in range(26)] 1049 | # WARNING: to get reproducable sub-sampling results you must reset the seed below 1050 | # np.random.seed(123) 1051 | # in this case there is a single split in each day 1052 | for i in range(days): 1053 | datfile_i = npzfile + "_{0}".format(i) # + ".gz" 1054 | npzfile_i = npzfile + "_{0}.npz".format(i) 1055 | npzfile_p = npzfile + "_{0}_processed.npz".format(i) 1056 | if path.exists(npzfile_i): 1057 | print("Skip existing " + npzfile_i) 1058 | elif path.exists(npzfile_p): 1059 | print("Skip existing " + npzfile_p) 1060 | else: 1061 | recreate_flag = True 1062 | total_per_file[i] = process_one_file( 1063 | datfile_i, 1064 | npzfile, 1065 | i, 1066 | total_per_file[i], 1067 | ) 1068 | 1069 | # report and save total into a file 1070 | total_count = np.sum(total_per_file) 1071 | if not path.exists(total_file): 1072 | np.savez_compressed(total_file, total_per_file=total_per_file) 1073 | print("Total number of samples:", total_count) 1074 | print("Divided into days/splits:\n", total_per_file) 1075 | 1076 | # dictionary files 1077 | counts = np.zeros(26, dtype=np.int32) 1078 | if recreate_flag: 1079 | # create dictionaries 1080 | for j in range(26): 1081 | for i, x in enumerate(convertDicts[j]): 1082 | convertDicts[j][x] = i 1083 | dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j) 1084 | if not path.exists(dict_file_j): 1085 | np.savez_compressed( 1086 | dict_file_j, 1087 | unique=np.array(list(convertDicts[j]), dtype=np.int32) 1088 | ) 1089 | counts[j] = len(convertDicts[j]) 1090 | # store (uniques and) counts 1091 | count_file = d_path + d_file + "_fea_count.npz" 1092 | if not path.exists(count_file): 1093 | np.savez_compressed(count_file, counts=counts) 1094 | else: 1095 | # create dictionaries (from existing files) 1096 | for j in range(26): 1097 | with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data: 1098 | unique = data["unique"] 1099 | for i, x in enumerate(unique): 1100 | convertDicts[j][x] = i 1101 | # load (uniques and) counts 1102 | with np.load(d_path + d_file + "_fea_count.npz") as data: 1103 | counts = data["counts"] 1104 | 1105 | # process all splits 1106 | processCriteoAdData(d_path, d_file, npzfile, days, convertDicts, counts) 1107 | o_file = concatCriteoAdData( 1108 | d_path, 1109 | d_file, 1110 | npzfile, 1111 | trafile, 1112 | days, 1113 | data_split, 1114 | randomize, 1115 | total_per_file, 1116 | total_count, 1117 | memory_map, 1118 | o_filename 1119 | ) 1120 | 1121 | return o_file 1122 | 1123 | 1124 | def loadDataset( 1125 | dataset, 1126 | max_ind_range, 1127 | sub_sample_rate, 1128 | randomize, 1129 | data_split, 1130 
| raw_path="", 1131 | pro_data="", 1132 | memory_map=False 1133 | ): 1134 | # dataset 1135 | if dataset == "kaggle": 1136 | days = 7 1137 | o_filename = "kaggleAdDisplayChallenge_processed" 1138 | elif dataset == "terabyte": 1139 | days = 24 1140 | o_filename = "terabyte_processed" 1141 | else: 1142 | raise(ValueError("Data set option is not supported")) 1143 | 1144 | # split the datafile into path and filename 1145 | lstr = raw_path.split("/") 1146 | d_path = "/".join(lstr[0:-1]) + "/" 1147 | d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1] 1148 | npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file) 1149 | # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea") 1150 | 1151 | # check if pre-processed data is available 1152 | data_ready = True 1153 | if memory_map: 1154 | for i in range(days): 1155 | reo_data = d_path + npzfile + "_{0}_reordered.npz".format(i) 1156 | if not path.exists(str(reo_data)): 1157 | data_ready = False 1158 | else: 1159 | if not path.exists(str(pro_data)): 1160 | data_ready = False 1161 | 1162 | # pre-process data if needed 1163 | # WARNNING: when memory mapping is used we get a collection of files 1164 | if data_ready: 1165 | print("Reading pre-processed data=%s" % (str(pro_data))) 1166 | file = str(pro_data) 1167 | else: 1168 | print("Reading raw data=%s" % (str(raw_path))) 1169 | file = getCriteoAdData( 1170 | raw_path, 1171 | o_filename, 1172 | max_ind_range, 1173 | sub_sample_rate, 1174 | days, 1175 | data_split, 1176 | randomize, 1177 | dataset == "kaggle", 1178 | memory_map 1179 | ) 1180 | 1181 | return file, days 1182 | 1183 | 1184 | if __name__ == "__main__": 1185 | ### import packages ### 1186 | import argparse 1187 | 1188 | ### parse arguments ### 1189 | parser = argparse.ArgumentParser( 1190 | description="Preprocess Criteo dataset" 1191 | ) 1192 | # model related parameters 1193 | parser.add_argument("--max-ind-range", type=int, default=-1) 1194 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] 1195 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none 1196 | parser.add_argument("--memory-map", action="store_true", default=False) 1197 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte 1198 | parser.add_argument("--raw-data-file", type=str, default="") 1199 | parser.add_argument("--processed-data-file", type=str, default="") 1200 | args = parser.parse_args() 1201 | 1202 | loadDataset( 1203 | args.data_set, 1204 | args.max_ind_range, 1205 | args.data_sub_sample_rate, 1206 | args.data_randomize, 1207 | "train", 1208 | args.raw_data_file, 1209 | args.processed_data_file, 1210 | args.memory_map 1211 | ) 1212 | --------------------------------------------------------------------------------
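data_utils.py can also be run standalone, via the argument parser defined above, to pre-process a dataset once before launching training. One possible invocation for the Terabyte dataset (the paths are placeholders, not part of the repository):

python3 data_utils.py --data-set=terabyte --max-ind-range=-1 --data-sub-sample-rate=0.875 --data-randomize=total --memory-map --raw-data-file=/path/to/terabyte/day --processed-data-file=/path/to/terabyte/terabyte_processed.npz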