├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── dlrm_data_parallel.iml
├── tricks
│   ├── __pycache__
│   │   ├── md_embedding_bag.cpython-36.pyc
│   │   ├── md_embedding_bag.cpython-38.pyc
│   │   ├── qr_embedding_bag.cpython-36.pyc
│   │   └── qr_embedding_bag.cpython-38.pyc
│   ├── md_embedding_bag.py
│   └── qr_embedding_bag.py
├── README.md
├── cache_manager.py
├── data_loader_terabyte.py
├── model_no_ddp.py
├── main_no_ddp.py
├── dlrm_data_pytorch.py
└── data_utils.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cDLRM
2 |
3 | cDLRM enables pure data-parallel training of DLRM by caching and prefetching embedding table rows.
4 |
5 | Example launch command:
6 |
7 | python3 main_no_ddp.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="512-512-256-1" --max-ind-range=-1 --data-generation=dataset --data-set=terabyte --raw-data-file=../../../../datasets/terabyte/day --processed-data-file=../../../../datasets/terabyte/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.8 --lr-embeds=0.8 --mini-batch-size=8192 --print-freq=8192 --print-time --test-mini-batch-size=4096 --test-num-workers=16 --test-freq=16384 --memory-map --data-sub-sample-rate=0.875 --cache-workers=4 --lookahead=3000 --cache-size=150000 --num-ways=16 --table-agg-freq=100 --batch-fifo-size=8 --large-batch --world-size=8 --master-port=12345
8 |
9 | Code for the paper *cDLRM: Look Ahead Caching for Scalable Training of Recommendation Models*, accepted at RecSys '21: https://dl.acm.org/doi/pdf/10.1145/3460231.3474246
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/tricks/md_embedding_bag.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 | # Mixed-Dimensions Trick
7 | #
8 | # Description: Applies mixed dimension trick to embeddings to reduce
9 | # embedding sizes.
10 | #
11 | # References:
12 | # [1] Antonio Ginart, Maxim Naumov, Dheevatsa Mudigere, Jiyan Yang, James Zou,
13 | # "Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation
14 | # Systems", CoRR, arXiv:1909.11810, 2019
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | import torch
17 | import torch.nn as nn
18 |
19 |
20 | def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None):
21 | '''
22 | An external facing function call for mixed-dimension assignment
23 | with the alpha power temperature heuristic
24 | Inputs:
25 | n -- (torch.LongTensor) ; Vector of num of rows for each embedding matrix
26 | alpha -- (torch.FloatTensor); Scalar, non-negative, controls dim. skew
27 | d0 -- (torch.FloatTensor); Scalar, baseline embedding dimension
28 | B -- (torch.FloatTensor); Scalar, parameter budget for embedding layer
29 | round_dim -- (bool); flag for rounding dims to nearest pow of 2
30 | k -- (torch.LongTensor) ; Vector of average number of queries per inference
31 | '''
32 | n, indices = torch.sort(n)
33 | k = k[indices] if k is not None else torch.ones(len(n))
34 | d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B)
35 | if round_dim:
36 | d = pow_2_round(d)
37 | return d
38 |
39 |
40 | def alpha_power_rule(n, alpha, d0=None, B=None):
41 | if d0 is not None:
42 | lamb = d0 * (n[0].type(torch.float) ** alpha)
43 | elif B is not None:
44 | lamb = B / torch.sum(n.type(torch.float) ** (1 - alpha))
45 | else:
46 | raise ValueError("Must specify either d0 or B")
47 | d = torch.ones(len(n)) * lamb * (n.type(torch.float) ** (-alpha))
48 | for i in range(len(d)):
49 | if i == 0 and d0 is not None:
50 | d[i] = d0
51 | else:
52 | d[i] = 1 if d[i] < 1 else d[i]
53 | return (torch.round(d).type(torch.long))
54 |
55 |
56 | def pow_2_round(dims):
57 | return 2 ** torch.round(torch.log2(dims.type(torch.float)))
58 |
59 |
60 | class PrEmbeddingBag(nn.Module):
61 | def __init__(self, num_embeddings, embedding_dim, base_dim):
62 | super(PrEmbeddingBag, self).__init__()
63 | self.embs = nn.EmbeddingBag(
64 | num_embeddings, embedding_dim, mode="sum", sparse=True)
65 | torch.nn.init.xavier_uniform_(self.embs.weight)
66 | if embedding_dim < base_dim:
67 | self.proj = nn.Linear(embedding_dim, base_dim, bias=False)
68 | torch.nn.init.xavier_uniform_(self.proj.weight)
69 | elif embedding_dim == base_dim:
70 | self.proj = nn.Identity()
71 | else:
72 | raise ValueError(
73 | "Embedding dim " + str(embedding_dim) + " > base dim " + str(base_dim)
74 | )
75 |
76 | def forward(self, input, offsets=None, per_sample_weights=None):
77 | return self.proj(self.embs(
78 | input, offsets=offsets, per_sample_weights=per_sample_weights))
79 |
--------------------------------------------------------------------------------
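
To make md_solver's alpha power rule concrete, here is a minimal usage sketch (not part of the repository; the table sizes and hyperparameters are made up):

    import torch
    from tricks.md_embedding_bag import md_solver

    # Row counts of four hypothetical embedding tables.
    n = torch.tensor([1_000_000, 50_000, 2_000, 100])

    # With d0=64 the smallest table keeps the base dimension; larger tables get
    # dimensions proportional to n**(-alpha), rounded to the nearest power of two.
    dims = md_solver(n, alpha=0.3, d0=64, round_dim=True)
    print(dims)  # tensor([64., 32., 8., 4.]) for tables ordered from smallest to largest
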
/cache_manager.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 |
4 | import torch
5 | import torch.multiprocessing as mp
6 |
7 |
8 | class Prefetcher(mp.Process):
9 | def __init__(self, args, emb_tables_cpu, batch_fifo, eviction_fifo, finish_event, cache_ld):
10 | mp.Process.__init__(self)
11 |
12 | # Shared variables
13 | self.args = args
14 | self.emb_tables_cpu = emb_tables_cpu
15 | self.batch_fifo = batch_fifo
16 | self.eviction_fifo = eviction_fifo
17 | self.finish_event = finish_event
18 | self.cache_ld = cache_ld
19 |
20 | @staticmethod
21 | def pin_pool(p, core):
22 | this_pid = os.getpid()
23 | os.system("taskset -p -c %d %d" % (core + 3 + p, this_pid))
24 |
25 | return 1
26 |
27 | @staticmethod
28 | def process_batch_slice(slice, emb_tables_cpu):
29 | lists_of_unique_indices = []
30 | unique_indices_maps = []
31 | for i in range(len(emb_tables_cpu.emb_l)):
32 | unique_indices_tensor = torch.unique(slice[i]) # .long()
33 | unique_indices_tensor.share_memory_()
34 | lists_of_unique_indices.append(unique_indices_tensor)
35 |
36 | idxs = torch.arange(unique_indices_tensor.shape[0])
37 | max = torch.max(unique_indices_tensor)
38 | map = -1 * torch.ones(max + 1, 1, dtype=torch.long)
39 | map[unique_indices_tensor] = idxs.view(-1, 1)
40 | map.share_memory_()
41 |
42 | unique_indices_maps.append(map)
43 |
44 | cached_entries_per_table = emb_tables_cpu.fetch_unique_idx_slices(lists_of_unique_indices)
45 |
46 | return cached_entries_per_table, lists_of_unique_indices, unique_indices_maps
47 |
48 | @staticmethod
49 | def eviction_manager(emb_tables, eviction_fifo, average_on_writeback, core, timeout):
50 | this_pid = os.getpid()
51 | print('Pinning eviction process...')
52 | os.system("taskset -p -c %d %d" % (core, this_pid))
53 | print('Done pinning eviction process')
54 |
55 | try:
56 | while (True):
57 | eviction_data = eviction_fifo.get(timeout=timeout) if timeout > 0 else eviction_fifo.get()
58 | for k, table_eviction_data in enumerate(eviction_data):
59 | idxs = table_eviction_data[0]
60 | embeddings = table_eviction_data[1]
61 | emb_tables.emb_l[k].weight.data[idxs] = (emb_tables.emb_l[k].weight.data[
62 | idxs] + embeddings) / 2 if average_on_writeback else embeddings
63 | except:
64 | print('Eviction queue empty longer than expected. Exiting eviction manager...')
65 |
66 | def run(self):
67 | this_pid = os.getpid()
68 | os.system("taskset -p -c %d %d" % (self.args.main_start_core + 1, this_pid))
69 |
70 | eviction_process = mp.Process(target=Prefetcher.eviction_manager,
71 | args=(self.emb_tables_cpu, self.eviction_fifo, self.args.average_on_writeback, self.args.main_start_core + 2,
72 | self.args.eviction_fifo_timeout))
73 | eviction_process.start()
74 |
75 | num_examples_per_process = self.args.lookahead * self.args.mini_batch_size
76 |
77 | pool = mp.Pool(processes=self.args.cache_workers)
78 |
79 | results = [pool.apply_async(Prefetcher.pin_pool, args=(p, self.args.main_start_core)) for p in range(self.args.cache_workers)]
80 | for res in results:
81 | res.get()
82 | print('Done pinning processes. Starting cache manager.')
83 |
84 |
85 | collection_limit = self.args.lookahead * self.args.cache_workers
86 |
87 | for epoch in range(self.args.nepochs):
88 | lS_i = []
89 | collected = 0
90 | for j, (_, _, sparse_idxs, _) in enumerate(self.cache_ld):
91 | if (j > 0 and collected % collection_limit == 0) or j == len(self.cache_ld) - 1:
92 | if j == len(self.cache_ld) - 1:
93 | lS_i.append(sparse_idxs)
94 |
95 | lS_i = torch.cat(lS_i, dim=1)
96 | num_processes_needed = math.ceil(lS_i.shape[1] / num_examples_per_process)
97 |
98 | processed_slices = [pool.apply_async(Prefetcher.process_batch_slice, args=(
99 | lS_i[:, p * num_examples_per_process: (p + 1) * num_examples_per_process], self.emb_tables_cpu)) for p
100 | in range(num_processes_needed)]
101 |
102 | for res in processed_slices:
103 | a = res.get()
104 | self.batch_fifo.put((a[0], a[1], a[2]))
105 |
106 | lS_i = [sparse_idxs]
107 | collected = 1
108 | else:
109 | lS_i.append(sparse_idxs)
110 | collected += 1
111 |
112 | pool.close()
113 | pool.join()
114 | eviction_process.join()
115 | self.finish_event.wait()
116 |
--------------------------------------------------------------------------------
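
The heart of Prefetcher.process_batch_slice is the per-table map from an original sparse index to its slot in the prefetched slice of unique rows. A standalone sketch of that mapping, with made-up indices:

    import torch

    # Sparse indices one table sees over a lookahead window (illustrative values).
    window_idxs = torch.tensor([7, 2, 7, 5, 2])

    unique = torch.unique(window_idxs)              # tensor([2, 5, 7]), sorted
    slots = torch.arange(unique.shape[0])           # slot of each unique index
    lookup_map = -1 * torch.ones(int(unique.max()) + 1, 1, dtype=torch.long)
    lookup_map[unique] = slots.view(-1, 1)

    # lookup_map[7] -> 2: the row fetched for original index 7 sits in slot 2 of the
    # slice pushed into batch_fifo; -1 marks indices absent from this window.
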
/tricks/qr_embedding_bag.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 | # Quotient-Remainder Trick
7 | #
8 | # Description: Applies the quotient-remainder trick to embeddings to reduce
9 | # embedding sizes.
10 | #
11 | # References:
12 | # [1] Hao-Jun Michael Shi, Dheevatsa Mudigere, Maxim Naumov, Jiyan Yang,
13 | # "Compositional Embeddings Using Complementary Partitions for Memory-Efficient
14 | # Recommendation Systems", CoRR, arXiv:1909.02107, 2019
15 |
16 |
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 | import torch
19 | import torch.nn as nn
20 | import torch.nn.functional as F
21 | from torch.nn.parameter import Parameter
22 | import numpy as np
23 |
24 |
25 | class QREmbeddingBag(nn.Module):
26 | r"""Computes sums or means over two 'bags' of embeddings, one using the quotient
27 | of the indices and the other using the remainder of the indices, without
28 | instantiating the intermediate embeddings, then performs an operation to combine these.
29 |
30 | For bags of constant length and no :attr:`per_sample_weights`, this class
31 |
32 | * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``,
33 | * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``,
34 | * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``.
35 |
36 | However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
37 | operations.
38 |
39 | QREmbeddingBag also supports per-sample weights as an argument to the forward
40 | pass. This scales the output of the Embedding before performing a weighted
41 | reduction as specified by ``mode``. If :attr:`per_sample_weights`` is passed, the
42 | only supported ``mode`` is ``"sum"``, which computes a weighted sum according to
43 | :attr:`per_sample_weights`.
44 |
45 | Known Issues:
46 | Autograd breaks with multiple GPUs. It breaks only with multiple embeddings.
47 |
48 | Args:
49 | num_categories (int): total number of unique categories. The input indices must be in
50 | 0, 1, ..., num_categories - 1.
51 | embedding_dim (list): list of sizes for each embedding vector in each table. If ``"add"``
52 | or ``"mult"`` operation are used, these embedding dimensions must be
53 | the same. If a single embedding_dim is used, then it will use this
54 | embedding_dim for both embedding tables.
55 | num_collisions (int): number of collisions to enforce.
56 | operation (string, optional): ``"concat"``, ``"add"``, or ``"mult". Specifies the operation
57 | to compose embeddings. ``"concat"`` concatenates the embeddings,
58 | ``"add"`` sums the embeddings, and ``"mult"`` multiplies
59 | (component-wise) the embeddings.
60 | Default: ``"mult"``
61 | max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
62 | is renormalized to have norm :attr:`max_norm`.
63 | norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
64 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of
65 | the words in the mini-batch. Default ``False``.
66 | Note: this option is not supported when ``mode="max"``.
67 | mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
68 | ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights`
69 | into consideration. ``"mean"`` computes the average of the values
70 | in the bag, ``"max"`` computes the max value over each bag.
71 | Default: ``"mean"``
72 | sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See
73 | Notes for more details regarding sparse gradients. Note: this option is not
74 | supported when ``mode="max"``.
75 |
76 | Attributes:
77 | weight (Tensor): the learnable weights of each embedding table is the module of shape
78 | `(num_embeddings, embedding_dim)` initialized using a uniform distribution
79 | with sqrt(1 / num_categories).
80 |
81 | Inputs: :attr:`input` (LongTensor), :attr:`offsets` (LongTensor, optional), and
82 | :attr:`per_index_weights` (Tensor, optional)
83 |
84 | - If :attr:`input` is 2D of shape `(B, N)`,
85 |
86 | it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
87 | this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
88 | :attr:`offsets` is ignored and required to be ``None`` in this case.
89 |
90 | - If :attr:`input` is 1D of shape `(N)`,
91 |
92 | it will be treated as a concatenation of multiple bags (sequences).
93 | :attr:`offsets` is required to be a 1D tensor containing the
94 | starting index positions of each bag in :attr:`input`. Therefore,
95 | for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as
96 | having ``B`` bags. Empty bags (i.e., having 0-length) will have
97 | returned vectors filled by zeros.
98 |
99 | per_sample_weights (Tensor, optional): a tensor of float / double weights, or None
100 | to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights`
101 | must have exactly the same shape as input and is treated as having the same
102 | :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``.
103 |
104 |
105 | Output shape: `(B, embedding_dim)`
106 |
107 | """
108 | __constants__ = ['num_categories', 'embedding_dim', 'num_collisions',
109 | 'operation', 'max_norm', 'norm_type', 'scale_grad_by_freq',
110 | 'mode', 'sparse']
111 |
112 | def __init__(self, num_categories, embedding_dim, num_collisions,
113 | operation='mult', max_norm=None, norm_type=2.,
114 | scale_grad_by_freq=False, mode='mean', sparse=False,
115 | _weight=None):
116 | super(QREmbeddingBag, self).__init__()
117 |
118 | assert operation in ['concat', 'mult', 'add'], 'Not valid operation!'
119 |
120 | self.num_categories = num_categories
121 | if isinstance(embedding_dim, int) or len(embedding_dim) == 1:
122 | self.embedding_dim = [embedding_dim, embedding_dim]
123 | else:
124 | self.embedding_dim = embedding_dim
125 | self.num_collisions = num_collisions
126 | self.operation = operation
127 | self.max_norm = max_norm
128 | self.norm_type = norm_type
129 | self.scale_grad_by_freq = scale_grad_by_freq
130 |
131 | if self.operation == 'add' or self.operation == 'mult':
132 | assert self.embedding_dim[0] == self.embedding_dim[1], \
133 | 'Embedding dimensions do not match!'
134 |
135 | self.num_embeddings = [int(np.ceil(num_categories / num_collisions)),
136 | num_collisions]
137 |
138 | if _weight is None:
139 | self.weight_q = Parameter(torch.Tensor(self.num_embeddings[0], self.embedding_dim[0]))
140 | self.weight_r = Parameter(torch.Tensor(self.num_embeddings[1], self.embedding_dim[1]))
141 | self.reset_parameters()
142 | else:
143 | assert list(_weight[0].shape) == [self.num_embeddings[0], self.embedding_dim[0]], \
144 | 'Shape of weight for quotient table does not match num_embeddings and embedding_dim'
145 | assert list(_weight[1].shape) == [self.num_embeddings[1], self.embedding_dim[1]], \
146 | 'Shape of weight for remainder table does not match num_embeddings and embedding_dim'
147 | self.weight_q = Parameter(_weight[0])
148 | self.weight_r = Parameter(_weight[1])
149 | self.mode = mode
150 | self.sparse = sparse
151 |
152 | def reset_parameters(self):
153 | nn.init.uniform_(self.weight_q, np.sqrt(1 / self.num_categories))
154 | nn.init.uniform_(self.weight_r, np.sqrt(1 / self.num_categories))
155 |
156 | def forward(self, input, offsets=None, per_sample_weights=None):
157 | input_q = (input / self.num_collisions).long()
158 | input_r = torch.remainder(input, self.num_collisions).long()
159 |
160 | embed_q = F.embedding_bag(input_q, self.weight_q, offsets, self.max_norm,
161 | self.norm_type, self.scale_grad_by_freq, self.mode,
162 | self.sparse, per_sample_weights)
163 | embed_r = F.embedding_bag(input_r, self.weight_r, offsets, self.max_norm,
164 | self.norm_type, self.scale_grad_by_freq, self.mode,
165 | self.sparse, per_sample_weights)
166 |
167 | if self.operation == 'concat':
168 | embed = torch.cat((embed_q, embed_r), dim=1)
169 | elif self.operation == 'add':
170 | embed = embed_q + embed_r
171 | elif self.operation == 'mult':
172 | embed = embed_q * embed_r
173 |
174 | return embed
175 |
176 | def extra_repr(self):
177 | s = '{num_embeddings}, {embedding_dim}'
178 | if self.max_norm is not None:
179 | s += ', max_norm={max_norm}'
180 | if self.norm_type != 2:
181 | s += ', norm_type={norm_type}'
182 | if self.scale_grad_by_freq is not False:
183 | s += ', scale_grad_by_freq={scale_grad_by_freq}'
184 | s += ', mode={mode}'
185 | return s.format(**self.__dict__)
186 |
--------------------------------------------------------------------------------
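
A minimal, self-contained sketch of how QREmbeddingBag is meant to be used (the sizes below are hypothetical, not taken from this repository):

    import torch
    from tricks.qr_embedding_bag import QREmbeddingBag

    # One categorical feature with 1M ids, split into a ceil(1M/4)-row quotient
    # table and a 4-row remainder table; partial embeddings are combined element-wise.
    emb = QREmbeddingBag(num_categories=1_000_000, embedding_dim=16,
                         num_collisions=4, operation="mult", mode="sum", sparse=True)

    indices = torch.tensor([3, 12345, 999_999], dtype=torch.long)
    offsets = torch.tensor([0, 1, 2], dtype=torch.long)   # one index per bag
    out = emb(indices, offsets)                           # shape (3, 16)
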
/data_loader_terabyte.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 |
7 | from __future__ import absolute_import, division, print_function, unicode_literals
8 |
9 | import os
10 | import numpy as np
11 | from torch.utils.data import Dataset
12 | import torch
13 | import time
14 | import math
15 | from tqdm import tqdm
16 | import argparse
17 |
18 |
19 | class DataLoader:
20 | """
21 | DataLoader dedicated for the Criteo Terabyte Click Logs dataset
22 | """
23 |
24 | def __init__(
25 | self,
26 | data_filename,
27 | data_directory,
28 | days,
29 | batch_size,
30 | max_ind_range=-1,
31 | split="train",
32 | drop_last_batch=False
33 | ):
34 | self.data_filename = data_filename
35 | self.data_directory = data_directory
36 | self.days = days
37 | self.batch_size = batch_size
38 | self.max_ind_range = max_ind_range
39 |
40 | total_file = os.path.join(
41 | data_directory,
42 | data_filename + "_day_count.npz"
43 | )
44 | with np.load(total_file) as data:
45 | total_per_file = data["total_per_file"][np.array(days)]
46 |
47 | self.length = sum(total_per_file)
48 | if split == "test" or split == "val":
49 | self.length = int(np.ceil(self.length / 2.))
50 | self.split = split
51 | self.drop_last_batch = drop_last_batch
52 |
53 | def __iter__(self):
54 | return iter(
55 | _batch_generator(
56 | self.data_filename, self.data_directory, self.days,
57 | self.batch_size, self.split, self.drop_last_batch, self.max_ind_range
58 | )
59 | )
60 |
61 | def __len__(self):
62 | if self.drop_last_batch:
63 | return self.length // self.batch_size
64 | else:
65 | return math.ceil(self.length / self.batch_size)
66 |
67 |
68 | def _transform_features(
69 | x_int_batch, x_cat_batch, y_batch, max_ind_range, flag_input_torch_tensor=False
70 | ):
71 | if max_ind_range > 0:
72 | x_cat_batch = x_cat_batch % max_ind_range
73 |
74 | if flag_input_torch_tensor:
75 | x_int_batch = torch.log(x_int_batch.clone().detach().type(torch.float) + 1)
76 | x_cat_batch = x_cat_batch.clone().detach().type(torch.long)
77 | y_batch = y_batch.clone().detach().type(torch.float32).view(-1, 1)
78 | else:
79 | x_int_batch = torch.log(torch.tensor(x_int_batch, dtype=torch.float) + 1)
80 | x_cat_batch = torch.tensor(x_cat_batch, dtype=torch.long)
81 | y_batch = torch.tensor(y_batch, dtype=torch.float32).view(-1, 1)
82 |
83 | batch_size = x_cat_batch.shape[0]
84 | feature_count = x_cat_batch.shape[1]
85 | lS_o = torch.arange(batch_size).reshape(1, -1).repeat(feature_count, 1)
86 |
87 | return x_int_batch, lS_o, x_cat_batch.t(), y_batch.view(-1, 1)
88 |
89 |
90 | def _batch_generator(
91 | data_filename, data_directory, days, batch_size, split, drop_last, max_ind_range
92 | ):
93 | previous_file = None
94 | for day in days:
95 | filepath = os.path.join(
96 | data_directory,
97 | data_filename + "_{}_reordered.npz".format(day)
98 | )
99 |
100 | # print('Loading file: ', filepath)
101 | with np.load(filepath) as data:
102 | x_int = data["X_int"]
103 | x_cat = data["X_cat"]
104 | y = data["y"]
105 |
106 | samples_in_file = y.shape[0]
107 | batch_start_idx = 0
108 | if split == "test" or split == "val":
109 | length = int(np.ceil(samples_in_file / 2.))
110 | if split == "test":
111 | samples_in_file = length
112 | elif split == "val":
113 | batch_start_idx = samples_in_file - length
114 |
115 | while batch_start_idx < samples_in_file - batch_size:
116 |
117 | missing_samples = batch_size
118 | if previous_file is not None:
119 | missing_samples -= previous_file['y'].shape[0]
120 |
121 | current_slice = slice(batch_start_idx, batch_start_idx + missing_samples)
122 |
123 | x_int_batch = x_int[current_slice]
124 | x_cat_batch = x_cat[current_slice]
125 | y_batch = y[current_slice]
126 |
127 | if previous_file is not None:
128 | x_int_batch = np.concatenate(
129 | [previous_file['x_int'], x_int_batch],
130 | axis=0
131 | )
132 | x_cat_batch = np.concatenate(
133 | [previous_file['x_cat'], x_cat_batch],
134 | axis=0
135 | )
136 | y_batch = np.concatenate([previous_file['y'], y_batch], axis=0)
137 | previous_file = None
138 |
139 | if x_int_batch.shape[0] != batch_size:
140 | raise ValueError('should not happen')
141 |
142 | yield _transform_features(x_int_batch, x_cat_batch, y_batch, max_ind_range)
143 |
144 | batch_start_idx += missing_samples
145 | if batch_start_idx != samples_in_file:
146 | current_slice = slice(batch_start_idx, samples_in_file)
147 | if previous_file is not None:
148 | previous_file = {
149 | 'x_int' : np.concatenate(
150 | [previous_file['x_int'], x_int[current_slice]],
151 | axis=0
152 | ),
153 | 'x_cat' : np.concatenate(
154 | [previous_file['x_cat'], x_cat[current_slice]],
155 | axis=0
156 | ),
157 | 'y' : np.concatenate([previous_file['y'], y[current_slice]], axis=0)
158 | }
159 | else:
160 | previous_file = {
161 | 'x_int' : x_int[current_slice],
162 | 'x_cat' : x_cat[current_slice],
163 | 'y' : y[current_slice]
164 | }
165 |
166 | if not drop_last:
167 | yield _transform_features(
168 | previous_file['x_int'],
169 | previous_file['x_cat'],
170 | previous_file['y'],
171 | max_ind_range
172 | )
173 |
174 |
175 | def _test():
176 | generator = _batch_generator(
177 | data_filename='day',
178 | data_directory='/input',
179 | days=range(23),
180 | split="train",
181 | batch_size=2048
182 | )
183 | t1 = time.time()
184 | for x_int, lS_o, x_cat, y in generator:
185 | t2 = time.time()
186 | time_diff = t2 - t1
187 | t1 = t2
188 | print(
189 | "time {} x_int.shape: {} lS_o.shape: {} x_cat.shape: {} y.shape: {}".format(
190 | time_diff, x_int.shape, lS_o.shape, x_cat.shape, y.shape
191 | )
192 | )
193 |
194 |
195 | class CriteoBinDataset(Dataset):
196 | """Binary version of criteo dataset."""
197 |
198 | def __init__(self, data_file, counts_file,
199 | batch_size=1, max_ind_range=-1, bytes_per_feature=4):
200 | # dataset
201 | self.tar_fea = 1 # single target
202 | self.den_fea = 13 # 13 dense features
203 | self.spa_fea = 26 # 26 sparse features
204 | self.tad_fea = self.tar_fea + self.den_fea
205 | self.tot_fea = self.tad_fea + self.spa_fea
206 |
207 | self.batch_size = batch_size
208 | self.max_ind_range = max_ind_range
209 | self.bytes_per_entry = (bytes_per_feature * self.tot_fea * batch_size)
210 |
211 | self.num_entries = math.ceil(os.path.getsize(data_file) / self.bytes_per_entry)
212 |
213 | print('data file:', data_file, 'number of batches:', self.num_entries)
214 | self.file = open(data_file, 'rb')
215 |
216 | with np.load(counts_file) as data:
217 | self.counts = data["counts"]
218 |
219 | # hardcoded for now
220 | self.m_den = 13
221 |
222 | def __len__(self):
223 | return self.num_entries
224 |
225 | def __getitem__(self, idx):
226 | self.file.seek(idx * self.bytes_per_entry, 0)
227 | raw_data = self.file.read(self.bytes_per_entry)
228 | array = np.frombuffer(raw_data, dtype=np.int32)
229 | tensor = torch.from_numpy(array).view((-1, self.tot_fea))
230 |
231 | return _transform_features(x_int_batch=tensor[:, 1:14],
232 | x_cat_batch=tensor[:, 14:],
233 | y_batch=tensor[:, 0],
234 | max_ind_range=self.max_ind_range,
235 | flag_input_torch_tensor=True)
236 |
237 |
238 | def numpy_to_binary(input_files, output_file_path, split='train'):
239 | """Convert the data to a binary format to be read with CriteoBinDataset."""
240 |
241 | # WARNING - both categorical and numerical data must fit into int32 for
242 | # the following code to work correctly
243 |
244 | with open(output_file_path, 'wb') as output_file:
245 | if split == 'train':
246 | for input_file in input_files:
247 | print('Processing file: ', input_file)
248 |
249 | np_data = np.load(input_file)
250 | np_data = np.concatenate([np_data['y'].reshape(-1, 1),
251 | np_data['X_int'],
252 | np_data['X_cat']], axis=1)
253 | np_data = np_data.astype(np.int32)
254 |
255 | output_file.write(np_data.tobytes())
256 | else:
257 | assert len(input_files) == 1
258 | np_data = np.load(input_files[0])
259 | np_data = np.concatenate([np_data['y'].reshape(-1, 1),
260 | np_data['X_int'],
261 | np_data['X_cat']], axis=1)
262 | np_data = np_data.astype(np.int32)
263 |
264 | samples_in_file = np_data.shape[0]
265 | midpoint = int(np.ceil(samples_in_file / 2.))
266 | if split == "test":
267 | begin = 0
268 | end = midpoint
269 | elif split == "val":
270 | begin = midpoint
271 | end = samples_in_file
272 | else:
273 | raise ValueError('Unknown split value: ', split)
274 |
275 | output_file.write(np_data[begin:end].tobytes())
276 |
277 |
278 | def _preprocess(args):
279 | train_files = ['{}_{}_reordered.npz'.format(args.input_data_prefix, day) for
280 | day in range(0, 23)]
281 |
282 | test_valid_file = args.input_data_prefix + '_23_reordered.npz'
283 |
284 | os.makedirs(args.output_directory, exist_ok=True)
285 | for split in ['train', 'val', 'test']:
286 | print('Running preprocessing for split =', split)
287 |
288 | output_file = os.path.join(args.output_directory,
289 | '{}_data.bin'.format(split))
290 |
291 | input_files = train_files if split == 'train' else [test_valid_file]
292 | numpy_to_binary(input_files=input_files,
293 | output_file_path=output_file,
294 | split=split)
295 |
296 |
297 | def _test_bin():
298 | parser = argparse.ArgumentParser()
299 | parser.add_argument('--output_directory', required=True)
300 | parser.add_argument('--input_data_prefix', required=True)
301 | parser.add_argument('--split', choices=['train', 'test', 'val'],
302 | required=True)
303 | args = parser.parse_args()
304 |
305 | # _preprocess(args)
306 |
307 | binary_data_file = os.path.join(args.output_directory,
308 | '{}_data.bin'.format(args.split))
309 |
310 | counts_file = os.path.join(args.output_directory, 'day_fea_count.npz')
311 | dataset_binary = CriteoBinDataset(data_file=binary_data_file,
312 | counts_file=counts_file,
313 | batch_size=2048,)
314 | from dlrm_data_pytorch import CriteoDataset, collate_wrapper_criteo
315 |
316 | binary_loader = torch.utils.data.DataLoader(
317 | dataset_binary,
318 | batch_size=None,
319 | shuffle=False,
320 | num_workers=0,
321 | collate_fn=None,
322 | pin_memory=False,
323 | drop_last=False,
324 | )
325 |
326 | original_dataset = CriteoDataset(
327 | dataset='terabyte',
328 | max_ind_range=10 * 1000 * 1000,
329 | sub_sample_rate=1,
330 | randomize=True,
331 | split=args.split,
332 | raw_path=args.input_data_prefix,
333 | pro_data='dummy_string',
334 | memory_map=True
335 | )
336 |
337 | original_loader = torch.utils.data.DataLoader(
338 | original_dataset,
339 | batch_size=2048,
340 | shuffle=False,
341 | num_workers=0,
342 | collate_fn=collate_wrapper_criteo,
343 | pin_memory=False,
344 | drop_last=False,
345 | )
346 |
347 | assert len(dataset_binary) == len(original_loader)
348 | for i, (old_batch, new_batch) in tqdm(enumerate(zip(original_loader,
349 | binary_loader)),
350 | total=len(dataset_binary)):
351 |
352 | for j in range(len(new_batch)):
353 | if not np.array_equal(old_batch[j], new_batch[j]):
354 | raise ValueError('FAILED: Datasets not equal')
355 | if i > len(dataset_binary):
356 | break
357 | print('PASSED')
358 |
359 |
360 | if __name__ == '__main__':
361 | _test()
362 | _test_bin()
363 |
--------------------------------------------------------------------------------
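
For reference, a small sketch of the flat int32 record layout that numpy_to_binary writes and CriteoBinDataset reads back (the values are random placeholders):

    import numpy as np
    import torch

    tot_fea = 1 + 13 + 26        # target + dense + sparse columns per example
    batch = np.random.randint(0, 100, size=(4, tot_fea)).astype(np.int32)
    raw = batch.tobytes()        # what numpy_to_binary appends to the .bin file

    # CriteoBinDataset.__getitem__ does the inverse for one batch-sized chunk.
    array = np.frombuffer(raw, dtype=np.int32).copy()
    tensor = torch.from_numpy(array).view(-1, tot_fea)
    y, x_int, x_cat = tensor[:, 0], tensor[:, 1:14], tensor[:, 14:]
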
/model_no_ddp.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 |
4 | import numpy as np
5 |
6 | with warnings.catch_warnings():
7 | warnings.filterwarnings("ignore", category=DeprecationWarning)
8 |
9 | import torch
10 | import torch.nn as nn
11 | from torch.nn.parallel.parallel_apply import parallel_apply
12 | from torch.nn.parallel.replicate import replicate
13 | from torch.nn.parallel.scatter_gather import gather, scatter
14 |
15 | # quotient-remainder trick
16 | from tricks.qr_embedding_bag import QREmbeddingBag
17 | # mixed-dimension trick
18 | from tricks.md_embedding_bag import PrEmbeddingBag
19 |
20 |
21 | class Embedding_Table_Group(nn.Module):
22 | def __init__(self,
23 | m_spa=None,
24 | ln_emb=None,
25 | qr_flag=False,
26 | qr_operation="mult",
27 | qr_collisions=0,
28 | qr_threshold=200,
29 | md_flag=False,
30 | md_threshold=200):
31 |
32 | super(Embedding_Table_Group, self).__init__()
33 |
34 | if (m_spa is not None) and (ln_emb is not None):
35 | self.qr_flag = qr_flag
36 | if self.qr_flag:
37 | self.qr_collisions = qr_collisions
38 | self.qr_operation = qr_operation
39 | self.qr_threshold = qr_threshold
40 | # create variables for MD embedding if applicable
41 | self.md_flag = md_flag
42 | if self.md_flag:
43 | self.md_threshold = md_threshold
44 |
45 | # create embedding tables
46 | self.emb_l = self.create_emb(m_spa, ln_emb)
47 |
48 | def create_emb(self, m, ln):
49 | emb_l = nn.ModuleList()
50 | for i in range(0, ln.size):
51 | n = ln[i]
52 | # construct embedding operator
53 | if self.qr_flag and n > self.qr_threshold:
54 | EE = QREmbeddingBag(n, m, self.qr_collisions,
55 | operation=self.qr_operation, mode="sum", sparse=True)
56 | elif self.md_flag and n > self.md_threshold:
57 | _m = m[i]
58 | base = max(m)
59 | EE = PrEmbeddingBag(n, _m, base)
60 | # use np initialization as below for consistency...
61 | W = np.random.uniform(
62 | low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m)
63 | ).astype(np.float32)
64 | EE.embs.weight.data = torch.tensor(W, requires_grad=False)
65 |
66 | else:
67 | EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True)
68 |
69 | # initialize embeddings
70 | W = np.random.uniform(
71 | low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
72 | ).astype(np.float32)
73 | EE.weight.data = torch.tensor(W, requires_grad=False)
74 | EE.weight.requires_grad = False
75 |
76 | emb_l.append(EE)
77 |
78 | return emb_l
79 |
80 | def fetch_unique_idx_slices(self, lists_of_unique_indices):
81 | cached_entries_per_table = []
82 | for k, unique_indices in enumerate(lists_of_unique_indices):
83 | E = self.emb_l[k]
84 | cached_entries = E.weight.data[unique_indices]
85 | cached_entries_per_table.append(cached_entries)
86 |
87 | return cached_entries_per_table
88 |
89 | def forward(self, lS_o, lS_i):
90 | ly = []
91 | for k, sparse_index_group_batch in enumerate(lS_i):
92 | sparse_offset_group_batch = lS_o[k]
93 | E = self.emb_l[k]
94 |
95 | V = E(sparse_index_group_batch, sparse_offset_group_batch)
96 | ly.append(V)
97 |
98 | return ly
99 |
100 |
101 | class Embedding_Table_Cache_Group(nn.Module):
102 | def __init__(self,
103 | m_spa,
104 | ln_emb,
105 | max_cache_size,
106 | aux_table_size,
107 | num_ways):
108 |
109 | super(Embedding_Table_Cache_Group, self).__init__()
110 | self.ln_emb = ln_emb
111 | self.num_ways = num_ways
112 |
113 | self.max_cache_size = self.find_next_prime(max_cache_size)
114 |
115 | self.emb_l, self.cache_sizes = self.create_emb(m_spa, ln_emb, self.max_cache_size, num_ways,
116 | aux_table_size) # emb_l[i] is a set of num_ways tables, each corresponding to 1 way. The set would just be the row itself.
117 |
118 | self.occupancy_tables = self.create_occupancy_tables(self.cache_sizes, num_ways)
119 |
120 | self.victim_cache_entries = [None] * len(self.emb_l)
121 |
122 | def find_next_prime(self, max_cache_size):
123 | for i in range(max_cache_size, 2 * max_cache_size):
124 | if isPrime(i):
125 | return i
126 |
127 | def compute_set_indices(self, table_idx, lookup_idxs):
128 | return torch.remainder(lookup_idxs, self.cache_sizes[table_idx])
129 |
130 | def create_emb(self, m, ln, max_cache_size, num_ways, aux_table_size):
131 | emb_l = nn.ModuleList()
132 | cache_sizes = []
133 |
134 | for i in range(0, ln.size):
135 | n = ln[i]
136 | num_rows = n if n < max_cache_size else max_cache_size
137 | cache_sizes.append(num_rows)
138 | EE = nn.EmbeddingBag(num_ways * num_rows + aux_table_size, m, mode="sum", sparse=True)
139 |
140 | emb_l.append(EE)
141 |
142 | return emb_l, cache_sizes
143 |
144 | def create_occupancy_tables(self, cache_sizes, num_ways):
145 | occupancy_tables = [-1 * torch.ones(cache_sizes[i], num_ways, dtype=torch.int64) for i in
146 | range(len(cache_sizes))]
147 | return occupancy_tables
148 |
149 | def forward(self, lS_o, lS_i, emb_tables, rank):
150 | # WARNING: notice that we are processing the batch at once. We implicitly
151 | # assume that the data is laid out such that:
152 | # 1. each embedding is indexed with a group of sparse indices,
153 | # corresponding to a single lookup
154 | # 2. for each embedding the lookups are further organized into a batch
155 | # 3. for a list of embedding tables there is a list of batched lookups
156 |
157 | if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
158 | sys.exit("ERROR: corrupted model input detected in parallel_forward call")
159 |
160 | ly = []
161 | per_table_hit_rates = []
162 | cache_group_idxs = []
163 | for k, sparse_index_group_batch in enumerate(lS_i):
164 | occupancy_table = self.occupancy_tables[k]
165 |
166 | set_idxs = self.compute_set_indices(k,
167 | sparse_index_group_batch) # of shape torch.Size([2048]). set_idx[i] is the set_idx that sparse_index_group_batch[i] maps to.
168 | hit_tensor = (occupancy_table[set_idxs] == sparse_index_group_batch.view(-1, 1)).any(dim=1)
169 | hit_positions = hit_tensor.nonzero(as_tuple=False).flatten()
170 | miss_positions = (hit_tensor == False).nonzero(as_tuple=False).flatten()
171 |
172 | hitting_set_idxs = set_idxs[hit_positions]
173 | hitting_ways = (occupancy_table[hitting_set_idxs] == sparse_index_group_batch[hit_positions].view(-1, 1)).nonzero(as_tuple=True)[1]
174 | hitting_cache_lookup_idxs = self.cache_sizes[k] * hitting_ways + hitting_set_idxs
175 |
176 | missing_sparse_idxs = sparse_index_group_batch[miss_positions] # Need to fetch from embedding table
177 | aux_storage_idxs = torch.tensor([self.cache_sizes[k] * self.num_ways + i for i in range(missing_sparse_idxs.shape[0])], dtype=torch.long,
178 | device=rank)
179 | self.emb_l[k].weight.data[aux_storage_idxs] = emb_tables.emb_l[k].weight.data[missing_sparse_idxs].to(rank)
180 |
181 | cache_lookup_idxs = torch.empty(sparse_index_group_batch.shape, dtype=torch.long)
182 | cache_lookup_idxs[hit_positions] = hitting_cache_lookup_idxs
183 |
184 | cache_lookup_idxs = cache_lookup_idxs.to(rank)
185 | cache_lookup_idxs[miss_positions] = aux_storage_idxs
186 |
187 | self.victim_cache_entries[k] = (aux_storage_idxs, missing_sparse_idxs)
188 |
189 | # print(k, hit_positions.shape)
190 |
191 | sparse_offset_group_batch = lS_o[k].to(rank)
192 |
193 | # embedding lookup
194 | # We are using EmbeddingBag, which implicitly uses sum operator.
195 | # The embeddings are represented as tall matrices, with sum
196 | # happening vertically across 0 axis, resulting in a row vector
197 |
198 | # import pdb; pdb.set_trace()
199 |
200 | E = self.emb_l[k]
201 |
202 | V = E(cache_lookup_idxs, sparse_offset_group_batch) # 2048 x 64 tensor
203 | ly.append(V)
204 | cache_group_idxs.append(cache_lookup_idxs.int())
205 |
206 | # hit_rate = hit_positions.shape[0] / sparse_index_group_batch.shape[0]
207 | # per_table_hit_rates.append(hit_rate)
208 |
209 | if len(self.emb_l) != len(ly):
210 | sys.exit("ERROR: corrupted intermediate result in parallel_forward call")
211 |
212 | return ly, cache_group_idxs # , sum(per_table_hit_rates) / lS_i.shape[0]
213 |
214 |
215 | class DLRM_Net(nn.Module):
216 | def __init__(
217 | self,
218 | ln_bot=None,
219 | ln_top=None,
220 | arch_interaction_op=None,
221 | arch_interaction_itself=False,
222 | sync_dense_params=True,
223 | sigmoid_bot=-1,
224 | sigmoid_top=-1,
225 | loss_threshold=0.0,
226 | ):
227 | super(DLRM_Net, self).__init__()
228 |
229 | if (ln_bot is not None) and (ln_top is not None) and (arch_interaction_op is not None):
230 | # save arguments
231 | self.output_d = 0
232 | self.parallel_model_batch_size = -1
233 | self.parallel_model_is_not_prepared = True
234 | self.arch_interaction_op = arch_interaction_op
235 | self.arch_interaction_itself = arch_interaction_itself
236 | self.sync_dense_params = sync_dense_params
237 | self.loss_threshold = loss_threshold
238 | self.cpu = torch.device('cpu')
239 |
240 | # Trainable parameters
241 | self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
242 | self.top_l = self.create_mlp(ln_top, sigmoid_top)
243 |
244 | def create_mlp(self, ln, sigmoid_layer):
245 | # build MLP layer by layer
246 | layers = nn.ModuleList()
247 | for i in range(0, ln.size - 1):
248 | n = ln[i]
249 | m = ln[i + 1]
250 |
251 | # construct fully connected operator
252 | LL = nn.Linear(int(n), int(m), bias=True)
253 |
254 | # custom Xavier input, output or two-sided fill
255 | mean = 0.0 # std_dev = np.sqrt(variance)
256 | std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n)
257 | W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
258 | std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1))
259 | bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
260 | LL.weight.data = torch.tensor(W, requires_grad=True)
261 | LL.bias.data = torch.tensor(bt, requires_grad=True)
262 | layers.append(LL)
263 |
264 | # construct sigmoid or relu operator
265 | if i == sigmoid_layer:
266 | layers.append(nn.Sigmoid())
267 | else:
268 | layers.append(nn.ReLU())
269 |
270 | return torch.nn.Sequential(*layers)
271 |
272 | def interact_features(self, x, ly):
273 | if self.arch_interaction_op == "dot":
274 | # concatenate dense and sparse features
275 | (batch_size, d) = x.shape
276 | T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
277 | # perform a dot product
278 | Z = torch.bmm(T, torch.transpose(T, 1, 2))
279 | # append dense feature with the interactions (into a row vector)
280 | # approach 1: all
281 | # Zflat = Z.view((batch_size, -1))
282 | # approach 2: unique
283 | _, ni, nj = Z.shape
284 | # approach 1: tril_indices
285 | # offset = 0 if self.arch_interaction_itself else -1
286 | # li, lj = torch.tril_indices(ni, nj, offset=offset)
287 | # approach 2: custom
288 | offset = 1 if self.arch_interaction_itself else 0
289 | li = torch.tensor([i for i in range(ni) for j in range(i + offset)], dtype=torch.long)
290 | lj = torch.tensor([j for i in range(nj) for j in range(i + offset)], dtype=torch.long)
291 | Zflat = Z[:, li, lj]
292 | # concatenate dense features and interactions
293 | R = torch.cat([x] + [Zflat], dim=1)
294 | elif self.arch_interaction_op == "cat":
295 | # concatenation features (into a row vector)
296 | R = torch.cat([x] + ly, dim=1)
297 | else:
298 | sys.exit(
299 | "ERROR: --arch-interaction-op="
300 | + self.arch_interaction_op
301 | + " is not supported"
302 | )
303 |
304 | return R
305 |
306 | def forward(self, dense_x, ly):
307 | x = self.bot_l(dense_x)
308 | z = self.interact_features(x, ly)
309 | p = self.top_l(z)
310 |
311 | if 0.0 < self.loss_threshold < 1.0:
312 | z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
313 | else:
314 | z = p
315 |
316 | return z
317 |
318 |
319 | def isPrime(n):
320 |     # Trial-division primality test, used by find_next_prime to pick a prime cache size.
321 |     if n < 2:
322 |         return False
323 |     if n % 2 == 0:
324 |         return n == 2
325 | 
326 |     i = 3
327 |     while i * i <= n:
328 |         if n % i == 0:
329 |             return False
330 |         i += 2
331 | 
332 |     return True
--------------------------------------------------------------------------------
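
The set-associative hit test in Embedding_Table_Cache_Group.forward can be illustrated in isolation (the cache geometry and indices below are made up):

    import torch

    cache_size, num_ways = 7, 2                      # 7 sets, 2 ways per set
    occupancy = -1 * torch.ones(cache_size, num_ways, dtype=torch.int64)
    occupancy[3, 0] = 10                             # original index 10 cached in set 3, way 0

    lookups = torch.tensor([10, 24])                 # 24 also maps to set 3 (24 % 7 == 3)
    set_idxs = torch.remainder(lookups, cache_size)  # tensor([3, 3])
    hits = (occupancy[set_idxs] == lookups.view(-1, 1)).any(dim=1)
    # hits == tensor([True, False]): index 10 is served from the GPU-resident cache,
    # while index 24 misses and is staged through the auxiliary rows appended to the table.
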
/main_no_ddp.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import builtins
3 | import math
4 | import os
5 | import sys
6 | import time
7 | import warnings
8 | from setproctitle import setproctitle
9 |
10 | import numpy as np
11 | import psutil
12 |
13 | import dlrm_data_pytorch as dp
14 | from cache_manager import Prefetcher
15 |
16 | with warnings.catch_warnings():
17 | warnings.filterwarnings("ignore", category=DeprecationWarning)
18 |
19 | import torch
20 | import torch.nn as nn
21 | import torch.multiprocessing as mp
22 | import torch.distributed as dist
23 |
24 | # quotient-remainder trick
25 | # mixed-dimension trick
26 | from tricks.md_embedding_bag import md_solver
27 | from model_no_ddp import Embedding_Table_Group, Embedding_Table_Cache_Group, DLRM_Net
28 |
29 | from timeit import default_timer as timer
30 |
31 | exc = getattr(builtins, "IOError", "FileNotFoundError")
32 |
33 |
34 | def ProcessArgs():
35 | parser = argparse.ArgumentParser(description="Train Deep Learning Recommendation Model (DLRM)")
36 |
37 | ################################### Model Parameters ##################################
38 | parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
39 | parser.add_argument("--arch-embedding-size", type=str, default="4-3-2")
40 | parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2")
41 | parser.add_argument("--arch-mlp-top", type=str, default="4-2-1")
42 | parser.add_argument("--arch-interaction-op", type=str, default="dot")
43 | parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
44 | #######################################################################################
45 |
46 | ################################### Activation and loss ###############################
47 | parser.add_argument("--activation-function", type=str, default="relu")
48 | parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce
49 | parser.add_argument("--loss-weights", type=str, default="1.0-1.0") # for wbce
50 | parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7
51 | parser.add_argument("--round-targets", type=bool, default=False)
52 | #######################################################################################
53 |
54 | ######################################## Data #########################################
55 | parser.add_argument("--data-size", type=int, default=1)
56 | parser.add_argument("--num-batches", type=int, default=0)
57 | parser.add_argument("--data-generation", type=str, default="random")
58 | parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
59 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
60 | parser.add_argument("--raw-data-file", type=str, default="")
61 | parser.add_argument("--processed-data-file", type=str, default="")
62 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none
63 | parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
64 | parser.add_argument("--max-ind-range", type=int, default=-1)
65 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
66 | parser.add_argument("--num-indices-per-lookup", type=int, default=10)
67 | parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
68 | parser.add_argument("--num-workers", type=int, default=0)
69 | parser.add_argument("--memory-map", action="store_true", default=False)
70 | #######################################################################################
71 |
72 | ################################# Embedding Table Args ################################
73 | parser.add_argument("--md-flag", action="store_true", default=False)
74 | parser.add_argument("--md-threshold", type=int, default=200)
75 | parser.add_argument("--md-temperature", type=float, default=0.3)
76 | parser.add_argument("--md-round-dims", action="store_true", default=False)
77 | parser.add_argument("--qr-flag", action="store_true", default=False)
78 | parser.add_argument("--qr-threshold", type=int, default=200)
79 | parser.add_argument("--qr-operation", type=str, default="mult")
80 | parser.add_argument("--qr-collisions", type=int, default=4)
81 | #######################################################################################
82 |
83 | ##################################### Training ########################################
84 | parser.add_argument("--mini-batch-size", type=int, default=1)
85 | parser.add_argument("--nepochs", type=int, default=1)
86 | parser.add_argument("--learning-rate", type=float, default=0.1)
87 | parser.add_argument("--lr-embeds", type=float, default=0.3)
88 | parser.add_argument("--print-precision", type=int, default=5)
89 | parser.add_argument("--numpy-rand-seed", type=int, default=123)
90 | parser.add_argument("--sync-dense-params", type=bool, default=True)
91 | parser.add_argument("--lookahead", type=int, default=2) # Added
92 | parser.add_argument("--cache-workers", type=int, default=2) # Added
93 | parser.add_argument("--cache-size", type=int, default=10240)
94 | parser.add_argument("--num-ways", type=int, default=4) # Added
95 | parser.add_argument("--average-on-writeback", action="store_true", default=False) # Added
96 | parser.add_argument("--evict-victim-cache", action="store_true", default=False) # Added
97 | #######################################################################################
98 |
99 | ############################### Debugging and profiling ################################
100 | parser.add_argument("--print-freq", type=int, default=1)
101 | parser.add_argument("--test-freq", type=int, default=-1)
102 | parser.add_argument("--test-mini-batch-size", type=int, default=-1)
103 | parser.add_argument("--test-num-workers", type=int, default=-1)
104 | parser.add_argument("--print-time", action="store_true", default=False)
105 | parser.add_argument("--debug-mode", action="store_true", default=False)
106 | parser.add_argument("--enable-profiling", action="store_true", default=False)
107 | parser.add_argument("--plot-compute-graph", action="store_true", default=False)
108 | ########################################################################################
109 |
110 | ################################## Store/load model ####################################
111 | parser.add_argument("--save-model", type=str, default="")
112 | parser.add_argument("--load-model", type=str, default="")
113 | ########################################################################################
114 |
115 | ################################## MLPerf Args #########################################
116 | parser.add_argument("--mlperf-logging", action="store_true", default=False)
117 | # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
118 | parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
119 | # stop at target AUC Terabyte (no subsampling) 0.8025
120 | parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
121 | parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
122 | parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False)
123 | parser.add_argument("--large-batch", action="store_true", default=False)
124 | ########################################################################################
125 |
126 | ################################## Distributed training ################################
127 | parser.add_argument("--world-size", type=int, default=2)
128 | parser.add_argument("--master-port", type=int, default=12345)
129 | parser.add_argument("--trainer-start-core", type=int, default=7)
130 | parser.add_argument("--main-start-core", type=int, default=0)
131 | parser.add_argument("--dense-threshold", type=int, default=1000)
132 | parser.add_argument("--table-agg-op", type=str, default="mean")
133 | parser.add_argument("--table-agg-freq", type=int, default=1)
134 | parser.add_argument("--batch-fifo-size", type=int, default=8)
135 | parser.add_argument("--eviction-fifo-size", type=int, default=8)
136 | parser.add_argument("--eviction-fifo-timeout", type=int, default=300)
137 | ########################################################################################
138 |
139 | ######################################## Misc ##########################################
140 | parser.add_argument("--inference-only", action="store_true", default=False)
141 | parser.add_argument("--save-onnx", action="store_true", default=False)
142 | parser.add_argument("--use-gpu", action="store_true", default=False)
143 | ########################################################################################
144 |
145 | return parser.parse_args()
146 |
147 |
148 | def CacheEmbeddings(cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps, cache_group, eviction_fifo, rank):
149 | cpu = torch.device("cpu")
150 | eviction_data = []
151 | for k, table_cache in enumerate(cached_entries_per_table):
152 | unique_idxs = lists_of_unique_idxs[k] # One dimensional tensor of unique ids (original ids)
153 | map = unique_indices_maps[k]
154 |
155 | set_idxs = cache_group.compute_set_indices(k,
156 | unique_idxs) # One dimensional tensor of set indices (new ids = row in the cached embedding tables)
157 | occupancy_table = cache_group.occupancy_tables[k]
158 |
159 | # Filter out the hitting indices
160 | hit_tensor = (occupancy_table[set_idxs] == unique_idxs.view(-1, 1)).any(dim=1)
161 | hit_positions = hit_tensor.nonzero(as_tuple=False).flatten()
162 | miss_positions = (hit_tensor == False).nonzero(as_tuple=False).flatten()
163 |
164 | hitting_set_idxs = set_idxs[hit_positions]
165 | hitting_ways = (occupancy_table[set_idxs] == unique_idxs.view(-1, 1)).nonzero(as_tuple=True)[1]
166 |
167 | necessary_unique_idxs = unique_idxs[miss_positions] # This is after cache hit evaluation
168 | necessary_set_idxs = set_idxs[miss_positions] # This is after cache hit evaluation
169 |
170 | # Compute availability tensor
171 | avail_tensor_sampler = torch.ones(occupancy_table.shape, dtype=torch.bool)
172 | avail_tensor_sampler[hitting_set_idxs, hitting_ways] = False
173 | occupied_sets = (avail_tensor_sampler.any(dim=1) == 0).nonzero(as_tuple=False).flatten()
174 |
175 | # Filter out unique indices that map to sets whose ways are all occupied
176 | to_be_used_indices = ((necessary_set_idxs.view(-1, 1) == occupied_sets).any(dim=1) == 0).nonzero(
177 | as_tuple=False).flatten()
178 |
179 | necessary_unique_idxs = necessary_unique_idxs[to_be_used_indices]
180 | necessary_set_idxs = necessary_set_idxs[to_be_used_indices]
181 |
182 | # Convert to float and sample way assignments
183 | avail_tensor_sampler = avail_tensor_sampler[necessary_set_idxs].float()
184 | dist = torch.distributions.Categorical(avail_tensor_sampler)
185 | ways_assignments = dist.sample()
186 |
187 | ############################################### EVICTION CODE ####################################################
188 |
189 | # Find unique indices being evicted and fetch their embeddings for writeback
190 | evicting_positions = ((occupancy_table[necessary_set_idxs, ways_assignments] == -1) == False).nonzero(
191 | as_tuple=False).flatten()
192 | evicting_set_idxs = necessary_set_idxs[evicting_positions]
193 | evicting_ways = ways_assignments[evicting_positions]
194 | evicting_table_idxs = cache_group.cache_sizes[k] * evicting_ways + evicting_set_idxs
195 |
196 | evicting_unique_idxs = occupancy_table[evicting_set_idxs, evicting_ways]
197 | evicting_embeddings = cache_group.emb_l[k].weight.data[evicting_table_idxs].to(cpu)
198 |
199 | eviction_data.append((evicting_unique_idxs, evicting_embeddings))
200 | ###################################################################################################################
201 |
202 | # Finally cache current window embeddings and update occupancy table
203 | table_idxs = cache_group.cache_sizes[k] * ways_assignments + necessary_set_idxs
204 | occupancy_table[necessary_set_idxs, ways_assignments] = necessary_unique_idxs
205 | cached_table_idxs = map[necessary_unique_idxs].flatten()
206 | cache_group.emb_l[k].weight.data[table_idxs] = table_cache[cached_table_idxs].to(rank)
207 |
208 | if rank == 0:
209 | eviction_fifo.put(eviction_data)
210 |
211 |
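# [Illustrative sketch, not part of the original file] CacheEmbeddings above treats every cached embedding
# table as a set-associative cache: compute_set_indices maps an original row id to a set, and the occupancy
# table (num_sets x num_ways) records which original id currently sits in each way, so a hit is a broadcasted
# compare against all ways of the selected sets. The helper below reproduces only that hit test on made-up
# numbers; the modulo set mapping is a stand-in for cache_group.compute_set_indices.
def _example_set_associative_hit_test():
    import torch
    num_sets, num_ways = 4, 2
    occupancy_table = torch.full((num_sets, num_ways), -1, dtype=torch.long)
    occupancy_table[1, 0] = 9                       # pretend original id 9 is already cached in set 1, way 0
    unique_idxs = torch.tensor([9, 5])              # ids needed by the upcoming window
    set_idxs = unique_idxs % num_sets               # hypothetical set mapping
    hit_tensor = (occupancy_table[set_idxs] == unique_idxs.view(-1, 1)).any(dim=1)
    return hit_tensor                               # tensor([True, False]): 9 hits, 5 misses
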
212 | def loss_fn_wrap(Z, T, loss_fn, args, loss_ws=None):
213 | if args.loss_function == "mse" or args.loss_function == "bce":
214 | return loss_fn(Z, T)
215 |
216 | elif args.loss_function == "wbce":
217 | loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T)
218 | loss_fn_ = loss_fn(Z, T)
219 |
220 | loss_sc_ = loss_ws_ * loss_fn_
221 | return loss_sc_.mean()
222 |
223 |
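# [Illustrative sketch, not part of the original file] For the "wbce" branch above, loss_ws holds one weight
# per class (index 0 for negatives, index 1 for positives); each sample's unreduced BCE loss is scaled by the
# weight of its own label before averaging. A tiny self-contained example with made-up values:
def _example_weighted_bce():
    import torch
    loss_ws = torch.tensor([1.0, 10.0])                  # up-weight positive (clicked) samples
    Z = torch.tensor([[0.9], [0.2]])                     # predicted click probabilities
    T = torch.tensor([[1.0], [0.0]])                     # targets
    per_sample = torch.nn.BCELoss(reduction="none")(Z, T)
    weights = loss_ws[T.view(-1).long()].view_as(T)      # weight picked by each sample's label
    return (weights * per_sample).mean()
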
224 | def time_wrap(rank):
225 | torch.cuda.synchronize(rank)
226 | return time.time()
227 |
228 |
229 | def wait_wrap(req_objs):
230 | for obj in req_objs:
231 | obj.wait()
232 |
233 |
234 | def aggregate_gradients(dlrm):
235 | # Aggregate MLPs
236 | request_objs_mlp = []
237 | for layer in dlrm.bot_l:
238 | if isinstance(layer, nn.modules.linear.Linear):
239 | layer.weight.grad /= dist.get_world_size()
240 | request_objs_mlp.append(dist.all_reduce_multigpu([layer.weight.grad], async_op=True))
241 |
242 | for layer in dlrm.top_l:
243 | if isinstance(layer, nn.modules.linear.Linear):
244 | layer.weight.grad /= dist.get_world_size()
245 | request_objs_mlp.append(dist.all_reduce_multigpu([layer.weight.grad], async_op=True))
246 |
247 | return request_objs_mlp
248 |
249 |
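# [Illustrative sketch, not part of the original file] aggregate_gradients above realizes an all-reduce mean:
# each rank first divides its local gradient by the world size, and the asynchronous SUM all-reduce then leaves
# every rank with (g_0 + ... + g_{W-1}) / W. The helper below mimics that arithmetic on plain tensors, with no
# process group, just to show the equivalence.
def _example_allreduce_mean_equivalence():
    import torch
    world_size = 4
    local_grads = [torch.randn(3) for _ in range(world_size)]    # one gradient per pretend rank
    pre_scaled = [g / world_size for g in local_grads]           # what each rank does before the SUM all-reduce
    summed = torch.stack(pre_scaled).sum(dim=0)                  # effect of the ReduceOp.SUM all-reduce
    assert torch.allclose(summed, torch.stack(local_grads).mean(dim=0))
    return summed
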
250 | @torch.no_grad()
251 | def broadcast_and_aggregate(cache_group, cache_group_idxs, rank, reduce_op="mean"):
252 | receive_tensors = []
253 | dist_request_objs = []
254 | for i in range(dist.get_world_size()):
255 | if i == rank:
256 | receive_tensors.append(cache_group_idxs)
257 | dist_request_objs.append(dist.broadcast(cache_group_idxs, src=i, async_op=True))
258 | else:
259 | tmp = torch.empty_like(cache_group_idxs, device=rank)
260 | dist_request_objs.append(dist.broadcast(tmp, src=i, async_op=True))
261 | receive_tensors.append(tmp)
262 |
263 | # Wait for broadcasts to finish
264 | wait_wrap(dist_request_objs)
265 | unique_idxs_list = []
266 | weight_slice_list = []
267 | dist_request_objs = []
268 | cache_lookups = torch.cat(receive_tensors, dim=1)
269 | for i, table in enumerate(cache_group.emb_l):
270 | unique_idxs = torch.unique(cache_lookups[i], sorted=True).long()
271 |
272 | if reduce_op == "sum":
273 | weight_slice = table.weight[unique_idxs]
274 | op = dist.ReduceOp.SUM
275 |
276 | elif reduce_op == "mean":
277 | weight_slice = table.weight[unique_idxs] / dist.get_world_size()
278 | op = dist.ReduceOp.SUM
279 |
280 | elif reduce_op == "max":
281 | weight_slice = table.weight[unique_idxs]
282 | op = dist.ReduceOp.MAX
283 |
284 | dist_request_objs.append(dist.all_reduce_multigpu([weight_slice], op=op, async_op=True))
285 | unique_idxs_list.append(unique_idxs)
286 | weight_slice_list.append(weight_slice)
287 |
288 | for i, table in enumerate(cache_group.emb_l):
289 | unique_idxs = unique_idxs_list[i]
290 | weight_slice = weight_slice_list[i]
291 | dist_request_objs[i].wait()
292 | table.weight[unique_idxs] = weight_slice
293 |
294 |
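# [Illustrative sketch, not part of the original file] broadcast_and_aggregate above avoids all-reducing whole
# cached tables: every rank broadcasts the row indices it touched, each rank forms the union of those indices,
# and only that slice of each table is averaged and written back. A single-process illustration of the
# "average only the touched rows" step, with made-up table contents and indices:
def _example_sparse_row_averaging():
    import torch
    world_size = 2
    tables = [torch.ones(8, 4) * (r + 1) for r in range(world_size)]   # per-rank copies of one cached table
    touched = torch.tensor([0, 3, 3, 5])                               # concatenated lookups from all ranks
    unique_rows = torch.unique(touched, sorted=True)
    avg_slice = sum(t[unique_rows] for t in tables) / world_size       # what the pre-scaled SUM all-reduce yields
    for t in tables:
        t[unique_rows] = avg_slice                                     # write back averaged rows; other rows stay local
    return tables[0]
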
295 | def share_occupancy_tables(cache_group, occupancy_tables_fifos, rank):
296 | if rank == 0:
297 | for table in cache_group.occupancy_tables:
298 | table.share_memory_()
299 |
300 | for fifo in occupancy_tables_fifos:
301 | fifo.put(cache_group.occupancy_tables)
302 |
303 | else:
304 | fifo = occupancy_tables_fifos[rank - 1]
305 | shared_occupancy_tables = fifo.get()
306 | cache_group.occupancy_tables = shared_occupancy_tables
307 |
308 |
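# [Illustrative sketch, not part of the original file] share_occupancy_tables above relies on torch
# shared-memory tensors: rank 0 moves its occupancy tables into shared memory and hands the handles to the
# other trainers through per-rank queues, so every process mutates the same underlying buffers. A minimal
# single-process look at the shared-memory step only:
def _example_shared_memory_tensor():
    import torch
    occupancy = torch.full((4, 2), -1, dtype=torch.long)
    occupancy.share_memory_()      # backing storage is now in shared memory
    return occupancy.is_shared()   # True; a process receiving this tensor through a Queue aliases the same storage
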
309 | @torch.no_grad()
310 | def load_caches_and_broadcast(cache_group, batch_fifo, eviction_fifo, rank):
311 | dist_req_objs = []
312 | if rank == 0:
313 | # Pull out of batch queue and cache in cache on GPU 0
314 | cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps = batch_fifo.get()
315 | CacheEmbeddings(cached_entries_per_table, lists_of_unique_idxs, unique_indices_maps, cache_group, eviction_fifo, rank)
316 |
317 | # Broadcast to all other GPUs
318 | for embedding_table_cache in cache_group.emb_l:
319 | dist_req_objs.append(dist.broadcast_multigpu([embedding_table_cache.weight], src=0, async_op=True))
320 |
321 | return dist_req_objs
322 |
323 |
324 | def Run(rank, m_spa, ln_emb, ln_bot, ln_top, train_ld, test_ld, batch_fifo, eviction_fifo, occupancy_tables_fifos, emb_tables, args):
325 | # Set proc title
326 | setproctitle("DlrmTrainer:" + str(rank))
327 |
328 | # First pin processes to avoid context switching overhead
329 | avail_cores = psutil.cpu_count() - args.trainer_start_core
330 | stride = rank if rank < avail_cores else rank % avail_cores
331 | new_core = args.trainer_start_core + stride
332 | this_pid = os.getpid()
333 | os.system("taskset -p -c %d %d" % (new_core, this_pid))
334 |
335 | np.random.seed(args.numpy_rand_seed)
336 | torch.cuda.manual_seed(args.numpy_rand_seed)
337 | torch.manual_seed(args.numpy_rand_seed)
338 | np.set_printoptions(precision=args.print_precision)
339 | torch.set_printoptions(precision=args.print_precision)
340 |
341 | os.environ['MASTER_ADDR'] = 'localhost'
342 | os.environ['MASTER_PORT'] = str(args.master_port)
343 | dist.init_process_group("nccl", rank=rank, world_size=args.world_size)
344 | local_batch_size = math.ceil(args.mini_batch_size / args.world_size)
345 |
346 | cache_group = Embedding_Table_Cache_Group(m_spa, ln_emb,
347 | max_cache_size=args.cache_size,
348 | aux_table_size=args.mini_batch_size,
349 | num_ways=args.num_ways).to(rank)
350 |
351 | dlrm = DLRM_Net(
352 | ln_bot,
353 | ln_top,
354 | arch_interaction_op=args.arch_interaction_op,
355 | arch_interaction_itself=args.arch_interaction_itself,
356 | sync_dense_params=args.sync_dense_params,
357 | sigmoid_bot=-1,
358 | sigmoid_top=ln_top.size - 2,
359 | loss_threshold=args.loss_threshold,
360 | ).to(rank)
361 |
362 | share_occupancy_tables(cache_group, occupancy_tables_fifos, rank)
363 |
364 | if args.loss_function == "mse":
365 | loss_fn = torch.nn.MSELoss(reduction="mean")
366 | loss_ws = None
367 | elif args.loss_function == "bce":
368 | loss_fn = torch.nn.BCELoss(reduction="mean")
369 | loss_ws = None
370 | elif args.loss_function == "wbce":
371 | loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-"))
372 | loss_fn = torch.nn.BCELoss(reduction="none")
373 |
374 | # Creating optimizer
375 | optimizer_mlps = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate)
376 | optimizer_embeds = torch.optim.SGD(cache_group.parameters(), lr=args.lr_embeds)
377 |
378 | total_time = 0
379 | total_iter = 0
380 | total_loss = 0
381 | total_accu = 0
382 | total_samp = 0
383 |
384 | caching_overhead = []
385 | cache_group_idxs_window = []
386 | for epoch in range(args.nepochs):
387 | for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
388 | X = X[rank * local_batch_size: (rank + 1) * local_batch_size, :].to(rank)
389 | lS_i = lS_i[:, rank * local_batch_size: (rank + 1) * local_batch_size]
390 | lS_o = lS_o[:, :local_batch_size]
391 | T = T[rank * local_batch_size: (rank + 1) * local_batch_size, :].to(rank)
392 |
393 | if j % args.lookahead == 0:
394 | # Pull from fifo and setup caches
395 | start = timer()
396 | dist_req_objs = load_caches_and_broadcast(cache_group, batch_fifo, eviction_fifo, rank)
397 | wait_wrap(dist_req_objs)
398 | end = timer()
399 | caching_overhead.append(end - start)
400 |
401 | t1 = time_wrap(rank)
402 |
403 | # Forward and Backward
404 | lookups, cache_group_idxs = cache_group(lS_o, lS_i, emb_tables, rank)
405 | Z = dlrm(X, lookups)
406 | E = loss_fn_wrap(Z, T, loss_fn, args, loss_ws)
407 | optimizer_mlps.zero_grad()
408 | optimizer_embeds.zero_grad()
409 | E.backward()
410 |
411 | # Gradient aggregation for MLPs + param update
412 | request_objs_mlps = aggregate_gradients(dlrm)
413 | optimizer_embeds.step()
414 | wait_wrap(request_objs_mlps)
415 | optimizer_mlps.step()
416 |
417 | # Aggregate parameters for cache group
418 | if j > 0 and j % args.table_agg_freq == 0:
419 | cache_group_idxs = torch.cat(cache_group_idxs_window + [torch.stack(cache_group_idxs)], dim=1)
420 | broadcast_and_aggregate(cache_group, cache_group_idxs, rank, args.table_agg_op)
421 | cache_group_idxs_window = []
422 | else:
423 | cache_group_idxs_window.append(torch.stack(cache_group_idxs))
424 |
425 | t2 = time_wrap(rank)
426 |
427 | L = E.detach().cpu().numpy() # numpy array
428 | S = Z.detach().cpu().numpy() # numpy array
429 | T = T.detach().cpu().numpy() # numpy array
430 | mbs = T.shape[0] # = args.mini_batch_size except maybe for last
431 | A = np.sum((np.round(S, 0) == T).astype(np.uint8))
432 |
433 | if rank == 0:
434 | total_loss_world = 0
435 | total_acc_world = 0
436 | # Get losses and accuracies from other processes
437 | for process in range(1, dist.get_world_size()):
438 | # Get losses
439 | L_t = torch.tensor(L, device=rank)
440 | L_p = torch.zeros_like(L_t)
441 | dist.broadcast(L_p, src=process)
442 | L_p = L_p.detach().cpu().numpy()
443 | total_loss_world += L_p * mbs
444 |
445 | # Get accuracies
446 | A_t = torch.tensor([A.item()], device=rank)
447 | A_p = torch.zeros_like(A_t)
448 | dist.broadcast(A_p, src=process)
449 | A_p = A_p.detach().cpu().item()
450 | total_acc_world += A_p
451 |
452 | total_time += t2 - t1
453 | total_loss += ((L * mbs + total_loss_world) / dist.get_world_size())
454 | total_accu += ((A + total_acc_world) / dist.get_world_size())
455 | total_iter += 1
456 | total_samp += mbs
457 |
458 | if j > 0 and j % args.print_freq == 0:
459 | gT = 1000.0 * total_time / total_iter
460 | total_time = 0
461 |
462 | gA = total_accu / total_samp
463 | total_accu = 0
464 |
465 | gL = total_loss / total_samp
466 | total_loss = 0
467 |
468 | avg_caching_overhead = np.mean(caching_overhead) / args.lookahead
469 | caching_overhead = []
470 |
471 | print('Epoch {}: Finished {}/{} in {} ms/it. Caching overhead = {}. Loss = {}, Train Acc = {}'.format(epoch, j, len(train_ld), gT,
472 | 1000 * avg_caching_overhead,
473 | gL, gA))
474 |
475 | total_samp = 0
476 | total_iter = 0
477 |
478 | # region Testing - Only rank 0 tests
479 | if (j > 0 and j % args.test_freq == 0) or j == len(train_ld) - 1:
480 | print('Testing at {}/{}....'.format(j, len(train_ld)))
481 | test_samp = 0
482 | total_test_acc = 0
483 | with torch.no_grad():
484 | for i, (X, lS_o, lS_i, T) in enumerate(test_ld):
485 | X = X.to(rank)
486 | lookups, _ = cache_group(lS_o, lS_i, emb_tables, rank)
487 | Z = dlrm(X, lookups)
488 | S = Z.cpu().numpy()
489 | T = T.cpu().numpy()
490 | test_acc = np.sum((np.round(S, 0) == T).astype(np.uint32))
491 | test_samp += T.shape[0]
492 | total_test_acc += test_acc
493 |
494 | print('Test accuracy = {}%'.format(100 * (total_test_acc / test_samp)))
495 | # endregion
496 |
497 | else:
498 | for process in range(1, dist.get_world_size()):
499 | L_t = torch.tensor(L, device=rank)
500 | A_t = torch.tensor([A.item()], device=rank)
501 | dist.broadcast(L_t, src=process)
502 | dist.broadcast(A_t, src=process)
503 |
504 |
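# [Illustrative sketch, not part of the original file] Inside Run() above, every rank slices its shard out of
# the single large global mini-batch produced by the loader: local_batch_size = ceil(mini_batch_size /
# world_size) and rank r takes rows [r * local, (r + 1) * local). A tiny illustration of that slicing with
# made-up sizes:
def _example_batch_sharding():
    import math
    import torch
    mini_batch_size, world_size = 8, 3
    local = math.ceil(mini_batch_size / world_size)            # 3 rows per rank; the last rank gets the remainder
    X = torch.arange(mini_batch_size).view(-1, 1).float()
    shards = [X[r * local:(r + 1) * local] for r in range(world_size)]
    return [s.shape[0] for s in shards]                        # [3, 3, 2]
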
505 | if __name__ == '__main__':
506 | mp.set_start_method("spawn")  # The cache manager deadlocks when "fork" is the start method, so "spawn" is required.
507 | args = ProcessArgs()
508 |
509 | np.random.seed(args.numpy_rand_seed)
510 | np.set_printoptions(precision=args.print_precision)
511 | torch.set_printoptions(precision=args.print_precision)
512 | torch.manual_seed(args.numpy_rand_seed)
513 |
514 | # region Sanity Checks
515 | if args.test_mini_batch_size < 0:
516 | # if the parameter is not set, use the training batch size
517 | args.test_mini_batch_size = args.mini_batch_size
518 | if args.test_num_workers < 0:
519 | # if the parameter is not set, use the same parameter for training
520 | args.test_num_workers = args.num_workers
521 |
522 | ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
523 | if args.data_generation == "dataset":
524 |
525 | train_data, train_ld, test_data, test_ld, cache_ld = dp.make_criteo_data_and_loaders(args)
526 |
527 | nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
528 | nbatches_test = len(test_ld)
529 |
530 | ln_emb = train_data.counts
531 | # enforce maximum limit on number of vectors per embedding
532 | if args.max_ind_range > 0:
533 | ln_emb = np.array(list(map(
534 | lambda x: x if x < args.max_ind_range else args.max_ind_range,
535 | ln_emb
536 | )))
537 | m_den = train_data.m_den
538 | ln_bot[0] = m_den
539 | else:
540 | # input and target at random
541 | if args.cache_workers > psutil.cpu_count():
542 | args.cache_workers = psutil.cpu_count()
543 |
544 | ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
545 | m_den = ln_bot[0]
546 | train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den)
547 | nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
548 |
549 | m_spa = args.arch_sparse_feature_size
550 | num_fea = ln_emb.size + 1 # num sparse + num dense features
551 | m_den_out = ln_bot[ln_bot.size - 1]
552 | if args.arch_interaction_op == "dot":
553 | # approach 1: all
554 | # num_int = num_fea * num_fea + m_den_out
555 | # approach 2: unique
556 | if args.arch_interaction_itself:
557 | num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
558 | else:
559 | num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
560 | elif args.arch_interaction_op == "cat":
561 | num_int = num_fea * m_den_out
562 | else:
563 | sys.exit(
564 | "ERROR: --arch-interaction-op="
565 | + args.arch_interaction_op
566 | + " is not supported"
567 | )
568 | arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
569 | ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
570 |
571 | # sanity check: feature sizes and mlp dimensions must match
572 | if m_den != ln_bot[0]:
573 | sys.exit(
574 | "ERROR: arch-dense-feature-size "
575 | + str(m_den)
576 | + " does not match first dim of bottom mlp "
577 | + str(ln_bot[0])
578 | )
579 | if args.qr_flag:
580 | if args.qr_operation == "concat" and 2 * m_spa != m_den_out:
581 | sys.exit(
582 | "ERROR: 2 arch-sparse-feature-size "
583 | + str(2 * m_spa)
584 | + " does not match last dim of bottom mlp "
585 | + str(m_den_out)
586 | + " (note that the last dim of bottom mlp must be 2x the embedding dim)"
587 | )
588 | if args.qr_operation != "concat" and m_spa != m_den_out:
589 | sys.exit(
590 | "ERROR: arch-sparse-feature-size "
591 | + str(m_spa)
592 | + " does not match last dim of bottom mlp "
593 | + str(m_den_out)
594 | )
595 | else:
596 | if m_spa != m_den_out:
597 | sys.exit(
598 | "ERROR: arch-sparse-feature-size "
599 | + str(m_spa)
600 | + " does not match last dim of bottom mlp "
601 | + str(m_den_out)
602 | )
603 | if num_int != ln_top[0]:
604 | sys.exit(
605 | "ERROR: # of feature interactions "
606 | + str(num_int)
607 | + " does not match first dimension of top mlp "
608 | + str(ln_top[0])
609 | )
610 |
611 | # assign mixed dimensions if applicable
612 | if args.md_flag:
613 | m_spa = md_solver(
614 | torch.tensor(ln_emb),
615 | args.md_temperature, # alpha
616 | d0=m_spa,
617 | round_dim=args.md_round_dims
618 | ).tolist()
619 | # endregion
620 |
621 | emb_tables = Embedding_Table_Group(m_spa, ln_emb)
622 | emb_tables.share_memory()
623 |
624 | batch_fifo = mp.Manager().Queue(maxsize=args.batch_fifo_size)
625 | eviction_fifo = mp.Manager().Queue(maxsize=args.eviction_fifo_size)
626 | occupancy_tables_fifos = [mp.Manager().Queue(maxsize=1)] * (args.world_size - 1)
627 | finish_event = mp.Event()
628 | barrier = mp.Barrier(args.world_size)
629 |
630 | cm = Prefetcher(args, emb_tables, batch_fifo, eviction_fifo, finish_event, cache_ld)
631 |
632 | # Pin main process
633 | this_pid = os.getpid()
634 | os.system("taskset -p -c %d %d" % (args.main_start_core, this_pid))
635 | args.trainer_start_core = args.main_start_core + args.cache_workers + 3
636 |
637 | cm.start()
638 | spawn_context = mp.spawn(Run,
639 | args=(m_spa, ln_emb, ln_bot, ln_top, train_ld, test_ld,
640 | batch_fifo, eviction_fifo, occupancy_tables_fifos,
641 | emb_tables, args),
642 | nprocs=args.world_size,
643 | join=True)
644 |
645 | finish_event.set()
646 | cm.join()
647 |
--------------------------------------------------------------------------------
/dlrm_data_pytorch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 | # Description: generate inputs and targets for the dlrm benchmark
7 | # The inputs and outputs are generated according to one of the following three options
8 | # 1) random distribution
9 | # 2) synthetic distribution, based on unique accesses and distances between them
10 | # i) R. Hassan, A. Harris, N. Topham and A. Efthymiou "Synthetic Trace-Driven
11 | # Simulation of Cache Memory", IEEE AINAM'07
12 | # 3) public data set
13 | # i) Criteo Kaggle Display Advertising Challenge Dataset
14 | # https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
15 | # ii) Criteo Terabyte Dataset
16 | # https://labs.criteo.com/2013/12/download-terabyte-click-logs
17 |
18 |
19 | from __future__ import absolute_import, division, print_function, unicode_literals
20 |
21 | # others
22 | from os import path
23 | import bisect
24 | import collections
25 |
26 | import data_utils
27 |
28 | # numpy
29 | import numpy as np
30 | from numpy import random as ra
31 |
32 | # pytorch
33 | import torch
34 | from torch.utils.data import Dataset, RandomSampler
35 |
36 | import data_loader_terabyte
37 | import os
38 | import sys  # needed for the sys.exit() error paths below
39 |
40 | # Kaggle Display Advertising Challenge Dataset
41 | # dataset (str): name of dataset (Kaggle or Terabyte)
42 | # randomize (str): determines randomization scheme
43 | # "none": no randomization
44 | # "day": randomizes each day's data (only works if split = True)
45 | # "total": randomizes total dataset
46 | # split (bool) : to split into train, test, validation data-sets
47 | class CriteoDataset(Dataset):
48 |
49 | def __init__(
50 | self,
51 | dataset,
52 | max_ind_range,
53 | sub_sample_rate,
54 | randomize,
55 | split="train",
56 | raw_path="",
57 | pro_data="",
58 | memory_map=False
59 | ):
60 | # dataset
61 | # tar_fea = 1 # single target
62 | den_fea = 13 # 13 dense features
63 | # spa_fea = 26 # 26 sparse features
64 | # tad_fea = tar_fea + den_fea
65 | # tot_fea = tad_fea + spa_fea
66 | if dataset == "kaggle":
67 | days = 7
68 | out_file = "kaggleAdDisplayChallenge_processed"
69 | elif dataset == "terabyte":
70 | days = 24
71 | out_file = "terabyte_processed"
72 | else:
73 | raise ValueError("Data set option is not supported")
74 | self.max_ind_range = max_ind_range
75 | self.memory_map = memory_map
76 |
77 | # split the datafile into path and filename
78 | lstr = raw_path.split("/")
79 | self.d_path = "/".join(lstr[0:-1]) + "/"
80 | self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
81 | self.npzfile = self.d_path + (
82 | (self.d_file + "_day") if dataset == "kaggle" else self.d_file
83 | )
84 | self.trafile = self.d_path + (
85 | (self.d_file + "_fea") if dataset == "kaggle" else "fea"
86 | )
87 |
88 | # check if pre-processed data is available
89 | data_ready = True
90 | if memory_map:
91 | for i in range(days):
92 | reo_data = self.npzfile + "_{0}_reordered.npz".format(i)
93 | if not path.exists(str(reo_data)):
94 | data_ready = False
95 | else:
96 | if not path.exists(str(pro_data)):
97 | data_ready = False
98 |
99 | # pre-process data if needed
100 | # WARNING: when memory mapping is used we get a collection of files
101 | if data_ready:
102 | print("Reading pre-processed data=%s" % (str(pro_data)))
103 | file = str(pro_data)
104 | else:
105 | print("Reading raw data=%s" % (str(raw_path)))
106 | file = data_utils.getCriteoAdData(
107 | raw_path,
108 | out_file,
109 | max_ind_range,
110 | sub_sample_rate,
111 | days,
112 | split,
113 | randomize,
114 | dataset == "kaggle",
115 | memory_map
116 | )
117 |
118 | # get a number of samples per day
119 | total_file = self.d_path + self.d_file + "_day_count.npz"
120 | with np.load(total_file) as data:
121 | total_per_file = data["total_per_file"]
122 | # compute offsets per file
123 | self.offset_per_file = np.array([0] + [x for x in total_per_file])
124 | for i in range(days):
125 | self.offset_per_file[i + 1] += self.offset_per_file[i]
126 | # print(self.offset_per_file)
127 |
128 | # setup data
129 | if memory_map:
130 | # setup the training/testing split
131 | self.split = split
132 | if split == 'none' or split == 'train':
133 | self.day = 0
134 | self.max_day_range = days if split == 'none' else days - 1
135 | elif split == 'test' or split == 'val':
136 | self.day = days - 1
137 | num_samples = self.offset_per_file[days] - \
138 | self.offset_per_file[days - 1]
139 | self.test_size = int(np.ceil(num_samples / 2.))
140 | self.val_size = num_samples - self.test_size
141 | else:
142 | sys.exit("ERROR: dataset split must be one of 'none', 'train', 'test', or 'val'.")
143 |
144 | '''
145 | # text
146 | print("text")
147 | for i in range(days):
148 | fi = self.npzfile + "_{0}".format(i)
149 | with open(fi) as data:
150 | ttt = 0; nnn = 0
151 | for _j, line in enumerate(data):
152 | ttt +=1
153 | if np.int32(line[0]) > 0:
154 | nnn +=1
155 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
156 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
157 | # processed
158 | print("processed")
159 | for i in range(days):
160 | fi = self.npzfile + "_{0}_processed.npz".format(i)
161 | with np.load(fi) as data:
162 | yyy = data["y"]
163 | ttt = len(yyy)
164 | nnn = np.count_nonzero(yyy)
165 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
166 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
167 | # reordered
168 | print("reordered")
169 | for i in range(days):
170 | fi = self.npzfile + "_{0}_reordered.npz".format(i)
171 | with np.load(fi) as data:
172 | yyy = data["y"]
173 | ttt = len(yyy)
174 | nnn = np.count_nonzero(yyy)
175 | print("day=" + str(i) + " total=" + str(ttt) + " non-zeros="
176 | + str(nnn) + " ratio=" +str((nnn * 100.) / ttt) + "%")
177 | '''
178 |
179 | # load unique counts
180 | with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
181 | self.counts = data["counts"]
182 | self.m_den = den_fea # X_int.shape[1]
183 | self.n_emb = len(self.counts)
184 | print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))
185 |
186 | # Load the test data
187 | # Only a single day is used for testing
188 | if self.split == 'test' or self.split == 'val':
189 | # only a single day is used for testing
190 | fi = self.npzfile + "_{0}_reordered.npz".format(
191 | self.day
192 | )
193 | with np.load(fi) as data:
194 | self.X_int = data["X_int"] # continuous feature
195 | self.X_cat = data["X_cat"] # categorical feature
196 | self.y = data["y"] # target
197 |
198 | else:
199 | # load and preprocess data
200 | with np.load(file) as data:
201 | X_int = data["X_int"] # continuous feature
202 | X_cat = data["X_cat"] # categorical feature
203 | y = data["y"] # target
204 | self.counts = data["counts"]
205 | self.m_den = X_int.shape[1] # den_fea
206 | self.n_emb = len(self.counts)
207 | print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den))
208 |
209 | # create reordering
210 | indices = np.arange(len(y))
211 |
212 | if split == "none":
213 | # randomize all data
214 | if randomize == "total":
215 | indices = np.random.permutation(indices)
216 | print("Randomized indices...")
217 |
218 | X_int[indices] = X_int
219 | X_cat[indices] = X_cat
220 | y[indices] = y
221 |
222 | else:
223 | indices = np.array_split(indices, self.offset_per_file[1:-1])
224 |
225 | # randomize train data (per day)
226 | if randomize == "day": # or randomize == "total":
227 | for i in range(len(indices) - 1):
228 | indices[i] = np.random.permutation(indices[i])
229 | print("Randomized indices per day ...")
230 |
231 | train_indices = np.concatenate(indices[:-1])
232 | test_indices = indices[-1]
233 | test_indices, val_indices = np.array_split(test_indices, 2)
234 |
235 | print("Defined %s indices..." % (split))
236 |
237 | # randomize train data (across days)
238 | if randomize == "total":
239 | train_indices = np.random.permutation(train_indices)
240 | print("Randomized indices across days ...")
241 |
242 | # create training, validation, and test sets
243 | if split == 'train':
244 | self.X_int = [X_int[i] for i in train_indices]
245 | self.X_cat = [X_cat[i] for i in train_indices]
246 | self.y = [y[i] for i in train_indices]
247 | elif split == 'val':
248 | self.X_int = [X_int[i] for i in val_indices]
249 | self.X_cat = [X_cat[i] for i in val_indices]
250 | self.y = [y[i] for i in val_indices]
251 | elif split == 'test':
252 | self.X_int = [X_int[i] for i in test_indices]
253 | self.X_cat = [X_cat[i] for i in test_indices]
254 | self.y = [y[i] for i in test_indices]
255 |
256 | print("Split data according to indices...")
257 |
258 | def __getitem__(self, index):
259 |
260 | if isinstance(index, slice):
261 | return [
262 | self[idx] for idx in range(
263 | index.start or 0, index.stop or len(self), index.step or 1
264 | )
265 | ]
266 |
267 | if self.memory_map:
268 | if self.split == 'none' or self.split == 'train':
269 | # check if we need to switch to the next day and load its data
270 | if index == self.offset_per_file[self.day]:
271 | # print("day_boundary switch", index)
272 | self.day_boundary = self.offset_per_file[self.day]
273 | fi = self.npzfile + "_{0}_reordered.npz".format(
274 | self.day
275 | )
276 | # print('Loading file: ', fi)
277 | with np.load(fi) as data:
278 | self.X_int = data["X_int"] # continuous feature
279 | self.X_cat = data["X_cat"] # categorical feature
280 | self.y = data["y"] # target
281 | self.day = (self.day + 1) % self.max_day_range
282 |
283 | i = index - self.day_boundary
284 | elif self.split == 'test' or self.split == 'val':
285 | # only a single day is used for testing
286 | i = index + (0 if self.split == 'test' else self.test_size)
287 | else:
288 | sys.exit("ERROR: dataset split must be one of 'none', 'train', 'test', or 'val'.")
289 | else:
290 | i = index
291 |
292 | if self.max_ind_range > 0:
293 | return self.X_int[i], self.X_cat[i] % self.max_ind_range, self.y[i]
294 | else:
295 | return self.X_int[i], self.X_cat[i], self.y[i]
296 |
297 | def _default_preprocess(self, X_int, X_cat, y):
298 | X_int = torch.log(torch.tensor(X_int, dtype=torch.float) + 1)
299 | if self.max_ind_range > 0:
300 | X_cat = torch.tensor(X_cat % self.max_ind_range, dtype=torch.long)
301 | else:
302 | X_cat = torch.tensor(X_cat, dtype=torch.long)
303 | y = torch.tensor(y.astype(np.float32))
304 |
305 | return X_int, X_cat, y
306 |
307 | def __len__(self):
308 | if self.memory_map:
309 | if self.split == 'none':
310 | return self.offset_per_file[-1]
311 | elif self.split == 'train':
312 | return self.offset_per_file[-2]
313 | elif self.split == 'test':
314 | return self.test_size
315 | elif self.split == 'val':
316 | return self.val_size
317 | else:
318 | sys.exit("ERROR: dataset split must be one of 'none', 'train', 'test', or 'val'.")
319 | else:
320 | return len(self.y)
321 |
322 |
323 | def collate_wrapper_criteo(list_of_tuples):
324 | # where each tuple is (X_int, X_cat, y)
325 | transposed_data = list(zip(*list_of_tuples))
326 | X_int = torch.log(torch.tensor(transposed_data[0], dtype=torch.float) + 1)
327 | X_cat = torch.tensor(transposed_data[1], dtype=torch.long)
328 | T = torch.tensor(transposed_data[2], dtype=torch.float32).view(-1, 1)
329 |
330 | # import pdb; pdb.set_trace()
331 |
332 | batchSize = X_cat.shape[0]
333 | featureCnt = X_cat.shape[1]
334 |
335 | lS_i = [X_cat[:, i] for i in range(featureCnt)]
336 | lS_o = [torch.tensor(range(batchSize)) for _ in range(featureCnt)]
337 |
338 | return X_int, torch.stack(lS_o), torch.stack(lS_i), T
339 |
340 |
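# [Illustrative sketch, not part of the original file] collate_wrapper_criteo above emits, for every sparse
# feature, one index per sample (lS_i) and a trivial offsets vector 0..batch_size-1 (lS_o), i.e. exactly one
# lookup per sample per table in the offsets/indices format that EmbeddingBag-style lookups expect. A tiny run
# on two made-up samples with three categorical features:
def _example_criteo_collate_shapes():
    samples = [                                               # (X_int, X_cat, y) triples, values made up
        ([1.0] * 13, [0, 2, 5], 1.0),
        ([2.0] * 13, [1, 2, 7], 0.0),
    ]
    X_int, lS_o, lS_i, T = collate_wrapper_criteo(samples)
    return X_int.shape, lS_o.shape, lS_i.shape, T.shape       # (2, 13), (3, 2), (3, 2), (2, 1)
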
341 | def criteo_worker_pin_fn(worker_id):
342 | this_pid = os.getpid()
343 | os.system("taskset -p -c %d %d" % (13 + worker_id, this_pid))
344 |
345 |
346 | def ensure_dataset_preprocessed(args, d_path):
347 | _ = CriteoDataset(
348 | args.data_set,
349 | args.max_ind_range,
350 | args.data_sub_sample_rate,
351 | args.data_randomize,
352 | "train",
353 | args.raw_data_file,
354 | args.processed_data_file,
355 | args.memory_map
356 | )
357 |
358 | _ = CriteoDataset(
359 | args.data_set,
360 | args.max_ind_range,
361 | args.data_sub_sample_rate,
362 | args.data_randomize,
363 | "test",
364 | args.raw_data_file,
365 | args.processed_data_file,
366 | args.memory_map
367 | )
368 |
369 | for split in ['train', 'val', 'test']:
370 | print('Running preprocessing for split =', split)
371 |
372 | train_files = ['{}_{}_reordered.npz'.format(args.raw_data_file, day)
373 | for
374 | day in range(0, 23)]
375 |
376 | test_valid_file = args.raw_data_file + '_23_reordered.npz'
377 |
378 | output_file = d_path + '_{}.bin'.format(split)
379 |
380 | input_files = train_files if split == 'train' else [test_valid_file]
381 | data_loader_terabyte.numpy_to_binary(input_files=input_files,
382 | output_file_path=output_file,
383 | split=split)
384 |
385 |
386 | def make_criteo_data_and_loaders(args):
387 | if args.large_batch and args.memory_map and args.data_set == "terabyte":
388 | # more efficient for larger batches
389 | data_directory = path.dirname(args.raw_data_file)
390 |
391 | if args.mlperf_bin_loader:  # note: this branch never builds a cache_loader
392 | lstr = args.processed_data_file.split("/")
393 | d_path = "/".join(lstr[0:-1]) + "/" + lstr[-1].split(".")[0]
394 | train_file = d_path + "_train.bin"
395 | test_file = d_path + "_test.bin"
396 | # val_file = d_path + "_val.bin"
397 | counts_file = args.raw_data_file + '_fea_count.npz'
398 |
399 | if any(not path.exists(p) for p in [train_file,
400 | test_file,
401 | counts_file]):
402 | ensure_dataset_preprocessed(args, d_path)
403 |
404 | train_data = data_loader_terabyte.CriteoBinDataset(
405 | data_file=train_file,
406 | counts_file=counts_file,
407 | batch_size=args.mini_batch_size,
408 | max_ind_range=args.max_ind_range
409 | )
410 |
411 | train_loader = torch.utils.data.DataLoader(
412 | train_data,
413 | batch_size=None,
414 | batch_sampler=None,
415 | shuffle=False,
416 | num_workers=0,
417 | collate_fn=None,
418 | pin_memory=False,
419 | drop_last=False,
420 | sampler=RandomSampler(train_data) if args.mlperf_bin_shuffle else None
421 | )
422 |
423 | test_data = data_loader_terabyte.CriteoBinDataset(
424 | data_file=test_file,
425 | counts_file=counts_file,
426 | batch_size=args.test_mini_batch_size,
427 | max_ind_range=args.max_ind_range
428 | )
429 |
430 | test_loader = torch.utils.data.DataLoader(
431 | test_data,
432 | batch_size=None,
433 | batch_sampler=None,
434 | shuffle=False,
435 | num_workers=0,
436 | collate_fn=None,
437 | pin_memory=False,
438 | drop_last=False,
439 | )
440 | else:
441 | data_filename = args.raw_data_file.split("/")[-1]
442 |
443 | train_data = CriteoDataset(
444 | args.data_set,
445 | args.max_ind_range,
446 | args.data_sub_sample_rate,
447 | args.data_randomize,
448 | "train",
449 | args.raw_data_file,
450 | args.processed_data_file,
451 | args.memory_map
452 | )
453 |
454 | test_data = CriteoDataset(
455 | args.data_set,
456 | args.max_ind_range,
457 | args.data_sub_sample_rate,
458 | args.data_randomize,
459 | "test",
460 | args.raw_data_file,
461 | args.processed_data_file,
462 | args.memory_map
463 | )
464 |
465 | train_loader = data_loader_terabyte.DataLoader(
466 | data_directory=data_directory,
467 | data_filename=data_filename,
468 | days=list(range(23)),
469 | batch_size=args.mini_batch_size,
470 | max_ind_range=args.max_ind_range,
471 | split="train",
472 | drop_last_batch=True
473 | )
474 |
475 | cache_loader = data_loader_terabyte.DataLoader(
476 | data_directory=data_directory,
477 | data_filename=data_filename,
478 | days=list(range(23)),
479 | batch_size=args.mini_batch_size,
480 | max_ind_range=args.max_ind_range,
481 | split="train",
482 | drop_last_batch=False
483 | )
484 |
485 | test_loader = data_loader_terabyte.DataLoader(
486 | data_directory=data_directory,
487 | data_filename=data_filename,
488 | days=[23],
489 | batch_size=args.test_mini_batch_size,
490 | max_ind_range=args.max_ind_range,
491 | split="test"
492 | )
493 | else:
494 | train_data = CriteoDataset(
495 | args.data_set,
496 | args.max_ind_range,
497 | args.data_sub_sample_rate,
498 | args.data_randomize,
499 | "train",
500 | args.raw_data_file,
501 | args.processed_data_file,
502 | args.memory_map
503 | )
504 |
505 | test_data = CriteoDataset(
506 | args.data_set,
507 | args.max_ind_range,
508 | args.data_sub_sample_rate,
509 | args.data_randomize,
510 | "test",
511 | args.raw_data_file,
512 | args.processed_data_file,
513 | args.memory_map
514 | )
515 |
516 | train_loader = torch.utils.data.DataLoader(
517 | train_data,
518 | batch_size=args.mini_batch_size,
519 | shuffle=False,
520 | num_workers=args.num_workers,
521 | collate_fn=collate_wrapper_criteo,
522 | pin_memory=False,
523 | drop_last=False, # True
524 | worker_init_fn=criteo_worker_pin_fn
525 | )
526 |
527 | cache_loader = torch.utils.data.DataLoader(
528 | train_data,
529 | batch_size=args.cache_workers * args.lookahead * args.mini_batch_size,
530 | shuffle=False,
531 | num_workers=1,
532 | collate_fn=collate_wrapper_criteo,
533 | pin_memory=False,
534 | drop_last=False
535 | )
536 |
537 | test_loader = torch.utils.data.DataLoader(
538 | test_data,
539 | batch_size=args.test_mini_batch_size,
540 | shuffle=False,
541 | num_workers=0,
542 | collate_fn=collate_wrapper_criteo,
543 | pin_memory=False,
544 | drop_last=False, # True
545 | )
546 |
547 | return train_data, train_loader, test_data, test_loader, cache_loader
548 |
549 |
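# [Illustrative sketch, not part of the original file] The cache_loader built above uses a batch size of
# cache_workers * lookahead * mini_batch_size, so one cache_loader batch spans the whole window of upcoming
# training batches whose embedding rows the prefetcher must stage before the trainers reach them. The numbers
# below are made up purely to show the arithmetic:
def _example_cache_loader_window_size():
    cache_workers, lookahead, mini_batch_size = 2, 1000, 4096
    return cache_workers * lookahead * mini_batch_size   # 8,192,000 samples per prefetch window
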
550 | # uniform distribution (input data)
551 | class RandomDataset(Dataset):
552 |
553 | def __init__(
554 | self,
555 | m_den,
556 | ln_emb,
557 | data_size,
558 | num_batches,
559 | mini_batch_size,
560 | num_indices_per_lookup,
561 | num_indices_per_lookup_fixed,
562 | num_targets=1,
563 | round_targets=False,
564 | data_generation="random",
565 | trace_file="",
566 | enable_padding=False,
567 | reset_seed_on_access=False,
568 | rand_seed=0
569 | ):
570 | # compute batch size
571 | nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
572 | if num_batches != 0:
573 | nbatches = num_batches
574 | data_size = nbatches * mini_batch_size
575 | # print("Total number of batches %d" % nbatches)
576 |
577 | # save args (recompute data_size if needed)
578 | self.m_den = m_den
579 | self.ln_emb = ln_emb
580 | self.data_size = data_size
581 | self.num_batches = nbatches
582 | self.mini_batch_size = mini_batch_size
583 | self.num_indices_per_lookup = num_indices_per_lookup
584 | self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed
585 | self.num_targets = num_targets
586 | self.round_targets = round_targets
587 | self.data_generation = data_generation
588 | self.trace_file = trace_file
589 | self.enable_padding = enable_padding
590 | self.reset_seed_on_access = reset_seed_on_access
591 | self.rand_seed = rand_seed
592 |
593 | def reset_numpy_seed(self, numpy_rand_seed):
594 | np.random.seed(numpy_rand_seed)
595 | # torch.manual_seed(numpy_rand_seed)
596 |
597 | def __getitem__(self, index):
598 |
599 | if isinstance(index, slice):
600 | return [
601 | self[idx] for idx in range(
602 | index.start or 0, index.stop or len(self), index.step or 1
603 | )
604 | ]
605 |
606 | # WARNING: reset seed on access to first element
607 | # (e.g. if same random samples needed across epochs)
608 | if self.reset_seed_on_access and index == 0:
609 | self.reset_numpy_seed(self.rand_seed)
610 |
611 | # number of data points in a batch
612 | n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size))
613 |
614 | # generate a batch of dense and sparse features
615 | if self.data_generation == "random":
616 | (X, lS_o, lS_i) = generate_uniform_input_batch(
617 | self.m_den,
618 | self.ln_emb,
619 | n,
620 | self.num_indices_per_lookup,
621 | self.num_indices_per_lookup_fixed
622 | )
623 | elif self.data_generation == "synthetic":
624 | (X, lS_o, lS_i) = generate_synthetic_input_batch(
625 | self.m_den,
626 | self.ln_emb,
627 | n,
628 | self.num_indices_per_lookup,
629 | self.num_indices_per_lookup_fixed,
630 | self.trace_file,
631 | self.enable_padding
632 | )
633 | else:
634 | sys.exit(
635 | "ERROR: --data-generation=" + self.data_generation + " is not supported"
636 | )
637 |
638 | # generate a batch of target (probability of a click)
639 | T = generate_random_output_batch(n, self.num_targets, self.round_targets)
640 |
641 | return (X, lS_o, lS_i, T)
642 |
643 | def __len__(self):
644 | # WARNING: note that we produce batches of outputs in __getitem__
645 | # therefore we should use num_batches rather than data_size below
646 | return self.num_batches
647 |
648 |
649 | def collate_wrapper_random(list_of_tuples):
650 | # where each tuple is (X, lS_o, lS_i, T)
651 | (X, lS_o, lS_i, T) = list_of_tuples[0]
652 | return (X,
653 | torch.stack(lS_o),
654 | lS_i,
655 | T)
656 |
657 |
658 | def make_random_data_and_loader(args, ln_emb, m_den):
659 | train_data = RandomDataset(
660 | m_den,
661 | ln_emb,
662 | args.data_size,
663 | args.num_batches,
664 | args.mini_batch_size,
665 | args.num_indices_per_lookup,
666 | args.num_indices_per_lookup_fixed,
667 | 1, # num_targets
668 | args.round_targets,
669 | args.data_generation,
670 | args.data_trace_file,
671 | args.data_trace_enable_padding,
672 | reset_seed_on_access=True,
673 | rand_seed=args.numpy_rand_seed
674 | ) # WARNING: generates a batch of lookups at once
675 | train_loader = torch.utils.data.DataLoader(
676 | train_data,
677 | batch_size=1,
678 | shuffle=False,
679 | num_workers=args.num_workers,
680 | collate_fn=collate_wrapper_random,
681 | pin_memory=False,
682 | drop_last=False, # True
683 | )
684 | return train_data, train_loader
685 |
686 |
687 | def generate_random_data(
688 | m_den,
689 | ln_emb,
690 | data_size,
691 | num_batches,
692 | mini_batch_size,
693 | num_indices_per_lookup,
694 | num_indices_per_lookup_fixed,
695 | num_targets=1,
696 | round_targets=False,
697 | data_generation="random",
698 | trace_file="",
699 | enable_padding=False,
700 | ):
701 | nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
702 | if num_batches != 0:
703 | nbatches = num_batches
704 | data_size = nbatches * mini_batch_size
705 | # print("Total number of batches %d" % nbatches)
706 |
707 | # inputs
708 | lT = []
709 | lX = []
710 | lS_offsets = []
711 | lS_indices = []
712 | for j in range(0, nbatches):
713 | # number of data points in a batch
714 | n = min(mini_batch_size, data_size - (j * mini_batch_size))
715 |
716 | # generate a batch of dense and sparse features
717 | if data_generation == "random":
718 | (Xt, lS_emb_offsets, lS_emb_indices) = generate_uniform_input_batch(
719 | m_den,
720 | ln_emb,
721 | n,
722 | num_indices_per_lookup,
723 | num_indices_per_lookup_fixed
724 | )
725 | elif data_generation == "synthetic":
726 | (Xt, lS_emb_offsets, lS_emb_indices) = generate_synthetic_input_batch(
727 | m_den,
728 | ln_emb,
729 | n,
730 | num_indices_per_lookup,
731 | num_indices_per_lookup_fixed,
732 | trace_file,
733 | enable_padding
734 | )
735 | else:
736 | sys.exit(
737 | "ERROR: --data-generation=" + data_generation + " is not supported"
738 | )
739 | # dense feature
740 | lX.append(Xt)
741 | # sparse feature (sparse indices)
742 | lS_offsets.append(lS_emb_offsets)
743 | lS_indices.append(lS_emb_indices)
744 |
745 | # generate a batch of target (probability of a click)
746 | P = generate_random_output_batch(n, num_targets, round_targets)
747 | lT.append(P)
748 |
749 | return (nbatches, lX, lS_offsets, lS_indices, lT)
750 |
751 |
752 | def generate_random_output_batch(n, num_targets, round_targets=False):
753 | # target (probability of a click)
754 | if round_targets:
755 | P = np.round(ra.rand(n, num_targets).astype(np.float32)).astype(np.float32)
756 | else:
757 | P = ra.rand(n, num_targets).astype(np.float32)
758 |
759 | return torch.tensor(P)
760 |
761 |
762 | # uniform distribution (input data)
763 | def generate_uniform_input_batch(
764 | m_den,
765 | ln_emb,
766 | n,
767 | num_indices_per_lookup,
768 | num_indices_per_lookup_fixed,
769 | ):
770 | # dense feature
771 | Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
772 |
773 | # sparse feature (sparse indices)
774 | lS_emb_offsets = []
775 | lS_emb_indices = []
776 | # for each embedding generate a list of n lookups,
777 | # where each lookup is composed of multiple sparse indices
778 | for size in ln_emb:
779 | lS_batch_offsets = []
780 | lS_batch_indices = []
781 | offset = 0
782 | for _ in range(n):
783 | # num of sparse indices to be used per embedding (between 1 and num_indices_per_lookup)
784 | if num_indices_per_lookup_fixed:
785 | sparse_group_size = np.int64(num_indices_per_lookup)
786 | else:
787 | # random between [1,num_indices_per_lookup])
788 | r = ra.random(1)
789 | sparse_group_size = np.int64(
790 | np.round(max([1.0], r * min(size, num_indices_per_lookup)))
791 | )
792 | # sparse indices to be used per embedding
793 | r = ra.random(sparse_group_size)
794 | sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))
795 | # reset sparse_group_size in case some index duplicates were removed
796 | sparse_group_size = np.int64(sparse_group.size)
797 | # store lengths and indices
798 | lS_batch_offsets += [offset]
799 | lS_batch_indices += sparse_group.tolist()
800 | # update offset for next iteration
801 | offset += sparse_group_size
802 | lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
803 | lS_emb_indices.append(torch.tensor(lS_batch_indices))
804 |
805 | return (Xt, lS_emb_offsets, lS_emb_indices)
806 |
807 |
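# [Illustrative sketch, not part of the original file] The (offsets, indices) pair built per table above is the
# standard EmbeddingBag input format: offsets[s] marks where sample s's variable-length group of indices starts
# inside the flat indices vector. A minimal consumption example with made-up numbers:
def _example_embeddingbag_consumes_offsets():
    import torch
    table = torch.nn.EmbeddingBag(10, 4, mode="sum")
    indices = torch.tensor([3, 7, 1, 4, 4])       # flat indices for 3 samples: [3, 7], [1], [4, 4]
    offsets = torch.tensor([0, 2, 3])             # start of each sample's group
    return table(indices, offsets).shape          # torch.Size([3, 4])
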
808 | # synthetic distribution (input data)
809 | def generate_synthetic_input_batch(
810 | m_den,
811 | ln_emb,
812 | n,
813 | num_indices_per_lookup,
814 | num_indices_per_lookup_fixed,
815 | trace_file,
816 | enable_padding=False,
817 | ):
818 | # dense feature
819 | Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
820 |
821 | # sparse feature (sparse indices)
822 | lS_emb_offsets = []
823 | lS_emb_indices = []
824 | # for each embedding generate a list of n lookups,
825 | # where each lookup is composed of multiple sparse indices
826 | for i, size in enumerate(ln_emb):
827 | lS_batch_offsets = []
828 | lS_batch_indices = []
829 | offset = 0
830 | for _ in range(n):
831 | # num of sparse indices to be used per embedding (between 1 and num_indices_per_lookup)
832 | if num_indices_per_lookup_fixed:
833 | sparse_group_size = np.int64(num_indices_per_lookup)
834 | else:
835 | # random between [1,num_indices_per_lookup])
836 | r = ra.random(1)
837 | sparse_group_size = np.int64(
838 | max(1, np.round(r * min(size, num_indices_per_lookup))[0])
839 | )
840 | # sparse indices to be used per embedding
841 | file_path = trace_file
842 | line_accesses, list_sd, cumm_sd = read_dist_from_file(
843 | file_path.replace("j", str(i))
844 | )
845 | # debug prints
846 | # print("input")
847 | # print(line_accesses); print(list_sd); print(cumm_sd);
848 | # print(sparse_group_size)
849 | # approach 1: rand
850 | # r = trace_generate_rand(
851 | # line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
852 | # )
853 | # approach 2: lru
854 | r = trace_generate_lru(
855 | line_accesses, list_sd, cumm_sd, sparse_group_size, enable_padding
856 | )
857 | # WARNING: if the distribution in the file is not consistent
858 | # with embedding table dimensions, below mod guards against out
859 | # of range access
860 | sparse_group = np.unique(r).astype(np.int64)
861 | minsg = np.min(sparse_group)
862 | maxsg = np.max(sparse_group)
863 | if (minsg < 0) or (size <= maxsg):
864 | print(
865 | "WARNING: distribution is inconsistent with embedding "
866 | + "table size (using mod to recover and continue)"
867 | )
868 | sparse_group = np.mod(sparse_group, size).astype(np.int64)
869 | # sparse_group = np.unique(np.array(np.mod(r, size-1)).astype(np.int64))
870 | # reset sparse_group_size in case some index duplicates were removed
871 | sparse_group_size = np.int64(sparse_group.size)
872 | # store lengths and indices
873 | lS_batch_offsets += [offset]
874 | lS_batch_indices += sparse_group.tolist()
875 | # update offset for next iteration
876 | offset += sparse_group_size
877 | lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
878 | lS_emb_indices.append(torch.tensor(lS_batch_indices))
879 |
880 | return (Xt, lS_emb_offsets, lS_emb_indices)
881 |
882 |
883 | def generate_stack_distance(cumm_val, cumm_dist, max_i, i, enable_padding=False):
884 | u = ra.rand(1)
885 | if i < max_i:
886 | # only generate stack distances up to the number of new references seen so far
887 | j = bisect.bisect(cumm_val, i) - 1
888 | fi = cumm_dist[j]
889 | u *= fi # shrink distribution support to exclude last values
890 | elif enable_padding:
891 | # WARNING: disable generation of new references (once all have been seen)
892 | fi = cumm_dist[0]
893 | u = (1.0 - fi) * u + fi # remap distribution support to exclude first value
894 |
895 | for (j, f) in enumerate(cumm_dist):
896 | if u <= f:
897 | return cumm_val[j]
898 |
899 |
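# [Illustrative sketch, not part of the original file] generate_stack_distance above draws a stack distance by
# inverse-CDF sampling: u ~ U(0,1) is compared against the cumulative distribution cumm_dist, and the first
# bucket whose cumulative mass covers u yields the sampled value from cumm_val; the i < max_i branch merely
# rescales u so only distances already seen can be produced. A deterministic walk-through of the final lookup:
def _example_inverse_cdf_lookup():
    cumm_val = [0, 1, 2, 3]              # candidate stack distances
    cumm_dist = [0.5, 0.75, 0.9, 1.0]    # their cumulative probabilities
    u = 0.8                              # pretend draw from U(0,1)
    for value, f in zip(cumm_val, cumm_dist):
        if u <= f:
            return value                 # returns 2, the first value whose cumulative mass covers u
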
900 | # WARNING: global define, must be consistent across all synthetic functions
901 | cache_line_size = 1
902 |
903 |
904 | def trace_generate_lru(
905 | line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
906 | ):
907 | max_sd = list_sd[-1]
908 | l = len(line_accesses)
909 | i = 0
910 | ztrace = []
911 | for _ in range(out_trace_len):
912 | sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding)
913 | mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0
914 |
915 | # generate memory reference
916 | if sd == 0: # new reference #
917 | line_ref = line_accesses.pop(0)
918 | line_accesses.append(line_ref)
919 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line)
920 | i += 1
921 | else: # existing reference #
922 | line_ref = line_accesses[l - sd]
923 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line)
924 | line_accesses.pop(l - sd)
925 | line_accesses.append(line_ref)
926 | # save generated memory reference
927 | ztrace.append(mem_ref)
928 |
929 | return ztrace
930 |
931 |
932 | def trace_generate_rand(
933 | line_accesses, list_sd, cumm_sd, out_trace_len, enable_padding=False
934 | ):
935 | max_sd = list_sd[-1]
936 | l = len(line_accesses) # !!!Unique,
937 | i = 0
938 | ztrace = []
939 | for _ in range(out_trace_len):
940 | sd = generate_stack_distance(list_sd, cumm_sd, max_sd, i, enable_padding)
941 | mem_ref_within_line = 0 # floor(ra.rand(1)*cache_line_size) #0
942 | # generate memory reference
943 | if sd == 0: # new reference #
944 | line_ref = line_accesses.pop(0)
945 | line_accesses.append(line_ref)
946 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line)
947 | i += 1
948 | else: # existing reference #
949 | line_ref = line_accesses[l - sd]
950 | mem_ref = np.uint64(line_ref * cache_line_size + mem_ref_within_line)
951 | ztrace.append(mem_ref)
952 |
953 | return ztrace
954 |
955 |
956 | def trace_profile(trace, enable_padding=False):
957 | # number of elements in the array (assuming 1D)
958 | # n = trace.size
959 |
960 | rstack = [] # S
961 | stack_distances = [] # SDS
962 | line_accesses = [] # L
963 | for x in trace:
964 | r = np.uint64(x / cache_line_size)
965 | l = len(rstack)
966 | try: # found #
967 | i = rstack.index(r)
968 | # WARNING: I believe below is the correct depth in terms of meaning of the
969 | # algorithm, but that is not what seems to be in the paper alg.
970 | # -1 can be subtracted if we defined the distance between
971 | # consecutive accesses (e.g. r, r) as 0 rather than 1.
972 | sd = l - i # - 1
973 | # push r to the end of stack_distances
974 | stack_distances.insert(0, sd)
975 | # remove r from its position and insert to the top of stack
976 | rstack.pop(i) # rstack.remove(r)
977 | rstack.insert(l - 1, r)
978 | except ValueError: # not found #
979 | sd = 0 # -1
980 | # push r to the end of stack_distances/line_accesses
981 | stack_distances.insert(0, sd)
982 | line_accesses.insert(0, r)
983 | # push r to the top of stack
984 | rstack.insert(l, r)
985 |
986 | if enable_padding:
987 | # WARNING: notice that as the ratio between the number of samples (l)
988 | # and cardinality (c) of a sample increases the probability of
989 | # generating a sample gets smaller and smaller because there are
990 | # few new samples compared to repeated samples. This means that for a
991 | # long trace with relatively small cardinality it will take longer to
992 | # generate all new samples and therefore obtain full distribution support
993 | # and hence it takes longer for distribution to resemble the original.
994 | # Therefore, we may pad the number of new samples to be on par with
995 | # average number of samples l/c artificially.
996 | l = len(stack_distances)
997 | c = max(stack_distances)
998 | padding = int(np.ceil(l / c))
999 | stack_distances = stack_distances + [0] * padding
1000 |
1001 | return (rstack, stack_distances, line_accesses)
1002 |
1003 |
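# [Illustrative sketch, not part of the original file] trace_profile above assigns every access an LRU stack
# distance: 0 marks a first-time reference, otherwise it is the depth of the line in the LRU stack. Both output
# lists are built in reverse, which is why __main__ below reverses them before fitting the distribution. A tiny
# worked example on a made-up trace:
def _example_trace_profile():
    trace = [np.uint64(x) for x in (10, 20, 10, 30, 20)]
    _, stack_distances, line_accesses = trace_profile(trace)
    stack_distances.reverse()            # now in trace order: [0, 0, 2, 0, 3]
    line_accesses.reverse()              # first-seen order:   [10, 20, 30]
    return stack_distances, line_accesses
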
1004 | # auxiliary read/write routines
1005 | def read_trace_from_file(file_path):
1006 | try:
1007 | with open(file_path) as f:
1008 | if args.trace_file_binary_type:
1009 | array = np.fromfile(f, dtype=np.uint64)
1010 | trace = array.astype(np.uint64).tolist()
1011 | else:
1012 | line = f.readline()
1013 | trace = list(map(lambda x: np.uint64(x), line.split(", ")))
1014 | return trace
1015 | except Exception:
1016 | print("ERROR: no input trace file has been provided")
1017 |
1018 |
1019 | def write_trace_to_file(file_path, trace):
1020 | try:
1021 | if args.trace_file_binary_type:
1022 | with open(file_path, "wb+") as f:
1023 | np.array(trace).astype(np.uint64).tofile(f)
1024 | else:
1025 | with open(file_path, "w+") as f:
1026 | s = str(trace)
1027 | f.write(s[1: len(s) - 1])
1028 | except Exception:
1029 | print("ERROR: no output trace file has been provided")
1030 |
1031 |
1032 | def read_dist_from_file(file_path):
1033 | try:
1034 | with open(file_path, "r") as f:
1035 | lines = f.read().splitlines()
1036 | except Exception:
1037 | print("Wrong file or file path")
1038 | # read unique accesses
1039 | unique_accesses = [int(el) for el in lines[0].split(", ")]
1040 | # read cumulative distribution (elements are passed as two separate lists)
1041 | list_sd = [int(el) for el in lines[1].split(", ")]
1042 | cumm_sd = [float(el) for el in lines[2].split(", ")]
1043 |
1044 | return unique_accesses, list_sd, cumm_sd
1045 |
1046 |
1047 | def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
1048 | try:
1049 | with open(file_path, "w") as f:
1050 | # unique_accesses
1051 | s = str(unique_accesses)
1052 | f.write(s[1: len(s) - 1] + "\n")
1053 | # list_sd
1054 | s = str(list_sd)
1055 | f.write(s[1: len(s) - 1] + "\n")
1056 | # cumm_sd
1057 | s = str(cumm_sd)
1058 | f.write(s[1: len(s) - 1] + "\n")
1059 | except Exception:
1060 | print("Wrong file or file path")
1061 |
1062 |
1063 | if __name__ == "__main__":
1064 | import sys
1065 | import operator
1066 | import argparse
1067 |
1068 | ### parse arguments ###
1069 | parser = argparse.ArgumentParser(description="Generate Synthetic Distributions")
1070 | parser.add_argument("--trace-file", type=str, default="./input/trace.log")
1071 | parser.add_argument("--trace-file-binary-type", type=bool, default=False)
1072 | parser.add_argument("--trace-enable-padding", type=bool, default=False)
1073 | parser.add_argument("--dist-file", type=str, default="./input/dist.log")
1074 | parser.add_argument(
1075 | "--synthetic-file", type=str, default="./input/trace_synthetic.log"
1076 | )
1077 | parser.add_argument("--numpy-rand-seed", type=int, default=123)
1078 | parser.add_argument("--print-precision", type=int, default=5)
1079 | args = parser.parse_args()
1080 |
1081 | ### some basic setup ###
1082 | np.random.seed(args.numpy_rand_seed)
1083 | np.set_printoptions(precision=args.print_precision)
1084 |
1085 | ### read trace ###
1086 | trace = read_trace_from_file(args.trace_file)
1087 | # print(trace)
1088 |
1089 | ### profile trace ###
1090 | (_, stack_distances, line_accesses) = trace_profile(
1091 | trace, args.trace_enable_padding
1092 | )
1093 | stack_distances.reverse()
1094 | line_accesses.reverse()
1095 | # print(line_accesses)
1096 | # print(stack_distances)
1097 |
1098 | ### compute probability distribution ###
1099 | # count items
1100 | l = len(stack_distances)
1101 | dc = sorted(
1102 | collections.Counter(stack_distances).items(), key=operator.itemgetter(0)
1103 | )
1104 |
1105 | # create a distribution
1106 | list_sd = list(map(lambda tuple_x_k: tuple_x_k[0], dc)) # x = tuple_x_k[0]
1107 | dist_sd = list(
1108 | map(lambda tuple_x_k: tuple_x_k[1] / float(l), dc)
1109 | ) # k = tuple_x_k[1]
1110 | cumm_sd = [] # np.cumsum(dc).tolist() #prefixsum
1111 | for i, (_, k) in enumerate(dc):
1112 | if i == 0:
1113 | cumm_sd.append(k / float(l))
1114 | else:
1115 | # add the 2nd element of the i-th tuple in the dist_sd list
1116 | cumm_sd.append(cumm_sd[i - 1] + (k / float(l)))
1117 |
1118 | ### write stack_distance and line_accesses to a file ###
1119 | write_dist_to_file(args.dist_file, line_accesses, list_sd, cumm_sd)
1120 |
1121 | ### generate the corresponding synthetic trace ###
1122 | # line_accesses, list_sd, cumm_sd = read_dist_from_file(args.dist_file)
1123 | synthetic_trace = trace_generate_lru(
1124 | line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
1125 | )
1126 | # synthetic_trace = trace_generate_rand(
1127 | # line_accesses, list_sd, cumm_sd, len(trace), args.trace_enable_padding
1128 | # )
1129 | write_trace_to_file(args.synthetic_file, synthetic_trace)
1130 |
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | #
6 | # Description: generate inputs and targets for the DLRM benchmark
7 | #
8 | # Utility function(s) to download and pre-process public data sets
9 | # - Criteo Kaggle Display Advertising Challenge Dataset
10 | # https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
11 | # - Criteo Terabyte Dataset
12 | # https://labs.criteo.com/2013/12/download-terabyte-click-logs
13 | #
14 | # After downloading dataset, run:
15 | # getCriteoAdData(
16 | # datafile="",
17 | # o_filename=kaggleAdDisplayChallenge_processed.npz,
18 | # max_ind_range=-1,
19 | # sub_sample_rate=0.0,
20 | # days=7,
21 | # data_split='train',
22 | # randomize='total',
23 | # criteo_kaggle=True,
24 | # memory_map=False
25 | # )
26 | # getCriteoAdData(
27 | # datafile="",
28 | # o_filename=terabyte_processed.npz,
29 | # max_ind_range=-1,
30 | # sub_sample_rate=0.0,
31 | # days=24,
32 | # data_split='train',
33 | # randomize='total',
34 | # criteo_kaggle=False,
35 | # memory_map=False
36 | # )
37 |
38 | from __future__ import absolute_import, division, print_function, unicode_literals
39 |
40 | import sys
41 | # import os
42 | from os import path
43 | # import io
44 | # from io import StringIO
45 | # import collections as coll
46 |
47 | import numpy as np
48 |
49 |
50 | def convertUStringToDistinctIntsDict(mat, convertDicts, counts):
51 | # Converts matrix of unicode strings into distinct integers.
52 | #
53 | # Inputs:
54 | # mat (np.array): array of unicode strings to convert
55 | # convertDicts (list): dictionary for each column
56 | # counts (list): number of different categories in each column
57 | #
58 | # Outputs:
59 | # out (np.array): array of output integers
60 | # convertDicts (list): dictionary for each column
61 | # counts (list): number of different categories in each column
62 |
63 | # check if convertDicts and counts match correct length of mat
64 | if len(convertDicts) != mat.shape[1] or len(counts) != mat.shape[1]:
65 | print("Length of convertDicts or counts does not match input shape")
66 | print("Generating convertDicts and counts...")
67 |
68 | convertDicts = [{} for _ in range(mat.shape[1])]
69 | counts = [0 for _ in range(mat.shape[1])]
70 |
71 | # initialize output
72 | out = np.zeros(mat.shape)
73 |
74 | for j in range(mat.shape[1]):
75 | for i in range(mat.shape[0]):
76 | # add to convertDict and increment count
77 | if mat[i, j] not in convertDicts[j]:
78 | convertDicts[j][mat[i, j]] = counts[j]
79 | counts[j] += 1
80 | out[i, j] = convertDicts[j][mat[i, j]]
81 |
82 | return out, convertDicts, counts
83 |
84 |
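# [Illustrative sketch, not part of the original file] convertUStringToDistinctIntsDict above gives each
# previously unseen string in a column the next free integer id for that column, so the same dictionaries can
# be reused across days. A tiny run on a 3x2 array of made-up category strings:
def _example_distinct_ints_dict():
    mat = np.array([["a", "x"], ["b", "x"], ["a", "y"]])
    out, convertDicts, counts = convertUStringToDistinctIntsDict(mat, [], [])
    return out.astype(int).tolist(), counts    # [[0, 0], [1, 0], [0, 1]], counts = [2, 2]
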
85 | def convertUStringToDistinctIntsUnique(mat, mat_uni, counts):
86 | # mat is an array with one row per sample; each row holds 26 categorical features
87 |
88 | # check if mat_unique and counts match correct length of mat
89 | if len(mat_uni) != mat.shape[1] or len(counts) != mat.shape[1]:
90 | print("Length of mat_unique or counts does not match input shape")
91 | print("Generating mat_unique and counts...")
92 |
93 | mat_uni = [np.array([]) for _ in range(mat.shape[1])]
94 | counts = [0 for _ in range(mat.shape[1])]
95 |
96 | # initialize output
97 | out = np.zeros(mat.shape)
98 | ind_map = [np.array([]) for _ in range(mat.shape[1])]
99 |
100 | # find out and assign unique ids to features
101 | for j in range(mat.shape[1]):
102 | m = mat_uni[j].size
103 | mat_concat = np.concatenate((mat_uni[j], mat[:, j]))
104 | mat_uni[j], ind_map[j] = np.unique(mat_concat, return_inverse=True)
105 | out[:, j] = ind_map[j][m:]
106 | counts[j] = mat_uni[j].size
107 |
108 | return out, mat_uni, counts
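# Editorial note (not part of the original source): the np.unique(..., return_inverse=True)
# call above maps raw categorical values to dense ids. A minimal sketch for a single
# column, using hypothetical values:
#   mat_uni = np.array(["a", "c"])            # uniques seen in earlier batches (m = 2)
#   batch   = np.array(["c", "b", "a"])       # current batch, i.e. mat[:, j]
#   uni, inv = np.unique(np.concatenate((mat_uni, batch)), return_inverse=True)
#   # uni == ["a", "b", "c"]; inv[m:] == [2, 1, 0] are the ids assigned to the batch
# Because np.unique keeps the vocabulary sorted, ids from earlier batches can shift once
# new values appear; the dictionary-based variant above assigns stable first-seen ids.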
109 |
110 |
111 | def processCriteoAdData(d_path, d_file, npzfile, split, convertDicts, pre_comp_counts):
112 | # Process Kaggle Display Advertising Challenge or Terabyte Dataset
113 | # by converting unicode strings in X_cat to integers and
114 |     # clipping negative integer values in X_int to zero.
115 | #
116 | # Loads data in the form "{kaggle|terabyte}_day_i.npz" where i is the day.
117 | #
118 | # Inputs:
119 | # d_path (str): path for {kaggle|terabyte}_day_i.npz files
120 | # split (int): total number of splits in the dataset (typically 7 or 24)
121 |
122 | # process data if not all files exist
123 | for i in range(split):
124 | filename_i = npzfile + "_{0}_processed.npz".format(i)
125 |
126 | if path.exists(filename_i):
127 | print("Using existing " + filename_i, end="\r")
128 | else:
129 | with np.load(npzfile + "_{0}.npz".format(i)) as data:
130 | # categorical features
131 | '''
132 | # Approach 1a: using empty dictionaries
133 | X_cat, convertDicts, counts = convertUStringToDistinctIntsDict(
134 | data["X_cat"], convertDicts, counts
135 | )
136 | '''
137 | '''
138 | # Approach 1b: using empty np.unique
139 | X_cat, convertDicts, counts = convertUStringToDistinctIntsUnique(
140 | data["X_cat"], convertDicts, counts
141 | )
142 | '''
143 | # Approach 2a: using pre-computed dictionaries
144 | X_cat_t = np.zeros(data["X_cat_t"].shape)
145 | for j in range(26):
146 | for k, x in enumerate(data["X_cat_t"][j, :]):
147 | X_cat_t[j, k] = convertDicts[j][x]
148 | # continuous features
149 | X_int = data["X_int"]
150 | X_int[X_int < 0] = 0
151 | # targets
152 | y = data["y"]
153 |
154 | np.savez_compressed(
155 | filename_i,
156 | # X_cat = X_cat,
157 | X_cat=np.transpose(X_cat_t), # transpose of the data
158 | X_int=X_int,
159 | y=y,
160 | )
161 | print("Processed " + filename_i, end="\r")
162 | print("")
163 | # sanity check (applicable only if counts have been pre-computed & are re-computed)
164 | # for j in range(26):
165 | # if pre_comp_counts[j] != counts[j]:
166 | # sys.exit("ERROR: Sanity check on counts has failed")
167 | # print("\nSanity check on counts passed")
168 |
169 | return
170 |
171 |
172 | def concatCriteoAdData(
173 | d_path,
174 | d_file,
175 | npzfile,
176 | trafile,
177 | days,
178 | data_split,
179 | randomize,
180 | total_per_file,
181 | total_count,
182 | memory_map,
183 | o_filename
184 | ):
185 | # Concatenates different days and saves the result.
186 | #
187 | # Inputs:
188 | # days (int): total number of days in the dataset (typically 7 or 24)
189 | # d_path (str): path for {kaggle|terabyte}_day_i.npz files
190 | # o_filename (str): output file name
191 | #
192 | # Output:
193 | # o_file (str): output file path
194 |
195 | if memory_map:
196 | # dataset break up per fea
197 | # tar_fea = 1 # single target
198 | den_fea = 13 # 13 dense features
199 | spa_fea = 26 # 26 sparse features
200 | # tad_fea = tar_fea + den_fea
201 | # tot_fea = tad_fea + spa_fea
202 | # create offset per file
203 | offset_per_file = np.array([0] + [x for x in total_per_file])
204 | for i in range(days):
205 | offset_per_file[i + 1] += offset_per_file[i]
206 |
207 | '''
208 | # Approach 1, 2 and 3 use indices, while Approach 4 does not use them
209 | # create indices
210 | indices = np.arange(total_count)
211 | if data_split == "none":
212 | if randomize == "total":
213 | indices = np.random.permutation(indices)
214 | else:
215 | indices = np.array_split(indices, offset_per_file[1:-1])
216 |
217 | # randomize train data (per day)
218 | if randomize == "day": # or randomize == "total":
219 | for i in range(len(indices) - 1):
220 | indices[i] = np.random.permutation(indices[i])
221 | print("Randomized indices per day ...")
222 |
223 | train_indices = np.concatenate(indices[:-1])
224 | test_indices = indices[-1]
225 |
226 | # randomize train data (across days)
227 | if randomize == "total":
228 | train_indices = np.random.permutation(train_indices)
229 | print("Randomized indices across days ...")
230 |
231 | indices = np.concatenate((train_indices, test_indices))
232 | # no reordering
233 | # indices = np.arange(total_count)
234 | '''
235 | '''
236 | # Approach 1: simple and slow (no grouping is used)
237 | # check if data already exists
238 | recreate_flag = False
239 | for j in range(tot_fea):
240 | filename_j = trafile + "_{0}_reordered.npy".format(j)
241 | if path.exists(filename_j):
242 | print("Using existing " + filename_j)
243 | else:
244 | recreate_flag = True
245 | # load, reorder and concatenate data (memmap all reordered files per feature)
246 | if recreate_flag:
247 | # init reordered files (.npy appended automatically)
248 | z = np.zeros((total_count))
249 | for j in range(tot_fea):
250 | filename_j = trafile + "_{0}_reordered".format(j)
251 | np.save(filename_j, z)
252 | print("Creating " + filename_j)
253 |
254 | for i in range(days):
255 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
256 | with np.load(filename_i) as data:
257 | X_cat_t = np.transpose(data["X_cat"])
258 | X_int_t = np.transpose(data["X_int"])
259 | y = data["y"]
260 | size = len(y)
261 | # sanity check
262 | if total_per_file[i] != size:
263 | sys.exit("ERROR: sanity check on number of samples failed")
264 | # setup start and end ranges
265 | start = offset_per_file[i]
266 | end = offset_per_file[i + 1]
267 | # print(filename_i)
268 | # print("start=" + str(start) + " end=" + str(end)
269 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i]))
270 |
271 | for j in range(tot_fea):
272 | filename_j = trafile + "_{0}_reordered.npy".format(j)
273 | fj = np.load(filename_j, mmap_mode='r+')
274 | if j < tar_fea:
275 | fj[indices[start:end]] = y
276 | elif tar_fea <= j and j < tad_fea:
277 | fj[indices[start:end]] = X_int_t[j - tar_fea, :]
278 | else:
279 | fj[indices[start:end]] = X_cat_t[j - tad_fea, :]
280 | del fj
281 | else:
282 | print("Reordered fea files already exist, skipping ...")
283 |
284 | # check if data already exists
285 | recreate_flag = False
286 | for i in range(days):
287 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
288 | if path.exists(filename_i):
289 | print("Using existing " + filename_i)
290 | else:
291 | recreate_flag = True
292 | # split reordered data by files (memmap all reordered files per feature)
293 | # on the day boundary del the file object and memmap again
294 | if recreate_flag:
295 | for i in range(days):
296 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
297 | size = total_per_file[i]
298 | X_int_t = np.zeros((den_fea, size))
299 | X_cat_t = np.zeros((spa_fea, size))
300 | # setup start and end ranges
301 | start = offset_per_file[i]
302 | end = offset_per_file[i + 1]
303 | print("Creating " + filename_i)
304 | # print("start=" + str(start) + " end=" + str(end)
305 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i]))
306 |
307 | for j in range(tot_fea):
308 | filename_j = trafile + "_{0}_reordered.npy".format(j)
309 | fj = np.load(filename_j, mmap_mode='r')
310 | if j < tar_fea:
311 | y = fj[start:end]
312 | elif tar_fea <= j and j < tad_fea:
313 | X_int_t[j - tar_fea, :] = fj[start:end]
314 | else:
315 | X_cat_t[j - tad_fea, :] = fj[start:end]
316 | del fj
317 |
318 | np.savez_compressed(
319 | filename_i,
320 | X_cat=np.transpose(X_cat_t), # transpose of the data
321 | X_int=np.transpose(X_int_t), # transpose of the data
322 | y=y,
323 | )
324 | else:
325 | print("Reordered day files already exist, skipping ...")
326 | '''
327 | '''
328 | # Approach 2: group days
329 | # check if data already exists
330 | recreate_flag = False
331 | for j in range(tot_fea):
332 | filename_j = trafile + "_{0}_reordered.npy".format(j)
333 | if path.exists(filename_j):
334 | print("Using existing " + filename_j)
335 | else:
336 | recreate_flag = True
337 | # load, reorder and concatenate data (memmap all reordered files per feature)
338 | if recreate_flag:
339 | # init reordered files (.npy appended automatically)
340 | z = np.zeros((total_count))
341 | for j in range(tot_fea):
342 | filename_j = trafile + "_{0}_reordered".format(j)
343 | np.save(filename_j, z)
344 | print("Creating " + filename_j)
345 |
346 | group_day = 3 # e.g. 8, 4 or 3
347 | group_num = days // group_day
348 | file_group = [i*group_day for i in range(group_num)] + [days]
349 | for ii in range(group_num):
350 |                 # the last group may have a different size (group_size != group_day), so recompute it here
351 | group_size = file_group[ii + 1] - file_group[ii]
352 | X_cat_t = [0]*group_size
353 | X_int_t = [0]*group_size
354 | y = [0]*group_size
355 | start = [0]*group_size
356 | end = [0]*group_size
357 | for ig in range(group_size):
358 | i = file_group[ii] + ig
359 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
360 | # setup start and end ranges
361 | start[ig] = offset_per_file[i]
362 | end[ig] = offset_per_file[i + 1]
363 | # print(filename_i)
364 | # load a group of files
365 | with np.load(filename_i) as data:
366 | X_cat_t[ig] = np.transpose(data["X_cat"])
367 | X_int_t[ig] = np.transpose(data["X_int"])
368 | y[ig] = data["y"]
369 | # sanity check
370 | if total_per_file[i] != len(y[ig]):
371 | sys.exit("ERROR: sanity check on number of samples failed")
372 | # print("start=" + str(start) + " end=" + str(end)
373 | # + " diff=" + str(end[ig]-start[ig]) + "=" + str(total_per_file[i]))
374 |
375 | for j in range(tot_fea):
376 | filename_j = trafile + "_{0}_reordered.npy".format(j)
377 | fj = np.load(filename_j, mmap_mode='r+')
378 | for ig in range(group_size):
379 | if j < tar_fea:
380 | fj[indices[start[ig]:end[ig]]] = y[ig]
381 | elif tar_fea <= j and j < tad_fea:
382 | fj[indices[start[ig]:end[ig]]] = X_int_t[ig][j - tar_fea, :]
383 | else:
384 | fj[indices[start[ig]:end[ig]]] = X_cat_t[ig][j - tad_fea, :]
385 | del fj
386 | else:
387 | print("Reordered fea files already exist, skipping ...")
388 |
389 | # check if data already exists
390 | recreate_flag = False
391 | for i in range(days):
392 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
393 | if path.exists(filename_i):
394 | print("Using existing " + filename_i)
395 | else:
396 | recreate_flag = True
397 | # split reordered data by files (memmap all reordered files per feature)
398 | # on the day boundary del the file object and memmap again
399 | if recreate_flag:
400 | for ii in range(group_num):
401 |                 # the last group may have a different size (group_size != group_day), so recompute it here
402 | group_size = file_group[ii + 1] - file_group[ii]
403 |                 X_cat_t = []; X_int_t = []
404 | for ig in range(group_size):
405 | i = file_group[ii] + ig
406 | X_int_t.append(np.zeros((den_fea, total_per_file[i])))
407 | X_cat_t.append(np.zeros((spa_fea, total_per_file[i])))
408 | y = [0]*group_size
409 | start = [0]*group_size
410 | end = [0]*group_size
411 |
412 | for j in range(tot_fea):
413 | filename_j = trafile + "_{0}_reordered.npy".format(j)
414 | fj = np.load(filename_j, mmap_mode='r')
415 | # load a group of files
416 | for ig in range(group_size):
417 | i = file_group[ii] + ig
418 | # setup start and end ranges
419 | start[ig] = offset_per_file[i]
420 | end[ig] = offset_per_file[i + 1]
421 | # load data for the group of files
422 | if j < tar_fea:
423 | y[ig] = fj[start[ig]:end[ig]]
424 | elif tar_fea <= j and j < tad_fea:
425 | X_int_t[ig][j - tar_fea, :] = fj[start[ig]:end[ig]]
426 | else:
427 | X_cat_t[ig][j - tad_fea, :] = fj[start[ig]:end[ig]]
428 | del fj
429 |
430 | for ig in range(group_size):
431 | i = file_group[ii] + ig
432 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
433 | print("Creating " + filename_i)
434 | np.savez_compressed(
435 | filename_i,
436 | X_cat=np.transpose(X_cat_t[ig]), # transpose of the data
437 | X_int=np.transpose(X_int_t[ig]), # transpose of the data
438 | y=y[ig],
439 | )
440 | else:
441 | print("Reordered day files already exist, skipping ...")
442 | '''
443 | '''
444 | # Approach 3: group features
445 | # check if data already exists
446 | group_fea = 5 # e.g. 8, 5 or 4
447 | group_num = tot_fea // group_fea
448 | if tot_fea % group_fea != 0: # sanity check
449 |             sys.exit("ERROR: group_fea must divide tot_fea evenly.")
450 | recreate_flag = False
451 | for jn in range(group_num):
452 | filename_j = trafile + "_{0}_reordered{1}.npy".format(
453 | jn, group_fea
454 | )
455 | if path.exists(filename_j):
456 | print("Using existing " + filename_j)
457 | else:
458 | recreate_flag = True
459 | # load, reorder and concatenate data (memmap all reordered files per feature)
460 | if recreate_flag:
461 | # init reordered files (.npy appended automatically)
462 | z = np.zeros((group_fea, total_count))
463 | for jn in range(group_num):
464 | filename_j = trafile + "_{0}_reordered{1}".format(
465 | jn, group_fea
466 | )
467 | np.save(filename_j, z)
468 | print("Creating " + filename_j)
469 |
470 | for i in range(days):
471 | filename_i = d_path + npzfile + "_{0}_processed.npz".format(i)
472 | with np.load(filename_i) as data:
473 | X_cat_t = np.transpose(data["X_cat"])
474 | X_int_t = np.transpose(data["X_int"])
475 | y = data["y"]
476 | size = len(y)
477 | # sanity check
478 | if total_per_file[i] != size:
479 | sys.exit("ERROR: sanity check on number of samples failed")
480 | # setup start and end ranges
481 | start = offset_per_file[i]
482 | end = offset_per_file[i + 1]
483 | # print(filename_i)
484 | # print("start=" + str(start) + " end=" + str(end)
485 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i]))
486 |
487 | for jn in range(group_num):
488 | filename_j = trafile + "_{0}_reordered{1}.npy".format(
489 | jn, group_fea
490 | )
491 | fj = np.load(filename_j, mmap_mode='r+')
492 | for jg in range(group_fea):
493 | j = jn * group_fea + jg
494 | # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
495 | if j < tar_fea:
496 | fj[jg, indices[start:end]] = y
497 | elif tar_fea <= j and j < tad_fea:
498 | fj[jg, indices[start:end]] = X_int_t[j - tar_fea, :]
499 | else:
500 | fj[jg, indices[start:end]] = X_cat_t[j - tad_fea, :]
501 | del fj
502 | else:
503 | print("Reordered fea files already exist, skipping ...")
504 |
505 | # check if data already exists
506 | recreate_flag = False
507 | for i in range(days):
508 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
509 | if path.exists(filename_i):
510 |                 print("Using existing " + filename_i)
511 | else:
512 | recreate_flag = True
513 | # split reordered data by files (memmap all reordered files per feature)
514 | # on the day boundary del the file object and memmap again
515 | if recreate_flag:
516 | for i in range(days):
517 | filename_i = d_path + npzfile + "_{0}_reordered.npz".format(i)
518 | size = total_per_file[i]
519 | X_int_t = np.zeros((den_fea, size))
520 | X_cat_t = np.zeros((spa_fea, size))
521 | # setup start and end ranges
522 | start = offset_per_file[i]
523 | end = offset_per_file[i + 1]
524 | print("Creating " + filename_i)
525 | # print("start=" + str(start) + " end=" + str(end)
526 | # + " diff=" + str(end - start) + "=" + str(total_per_file[i]))
527 |
528 | for jn in range(group_num):
529 | filename_j = trafile + "_{0}_reordered{1}.npy".format(
530 | jn, group_fea
531 | )
532 | fj = np.load(filename_j, mmap_mode='r')
533 | for jg in range(group_fea):
534 | j = jn * group_fea + jg
535 | # print("j=" + str(j) + " jn=" + str(jn) + " jg=" + str(jg))
536 | if j < tar_fea:
537 | y = fj[jg, start:end]
538 | elif tar_fea <= j and j < tad_fea:
539 | X_int_t[j - tar_fea, :] = fj[jg, start:end]
540 | else:
541 | X_cat_t[j - tad_fea, :] = fj[jg, start:end]
542 | del fj
543 |
544 | np.savez_compressed(
545 | filename_i,
546 | X_cat=np.transpose(X_cat_t), # transpose of the data
547 | X_int=np.transpose(X_int_t), # transpose of the data
548 | y=y,
549 | )
550 |
551 | else:
552 | print("Reordered day files already exist, skipping ...")
553 | '''
554 |
555 | # Approach 4: Fisher-Yates-Rao (FYR) shuffle algorithm
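        # Editorial sketch of the two passes below (added for clarity, not part of the
        # original source):
        #   1st pass: stream each day's *_processed.npz file and, when randomize == "total",
        #             scatter its samples into per-day bucket files
        #             (*_intermediate_{y,d,s}.npy) chosen uniformly at random, subject to
        #             each bucket's capacity; when data_split != "none" the last day is
        #             kept intact so it can later serve as the test/validation split.
        #   2nd pass: load each bucket, permute its rows in memory (unless it is the
        #             preserved last day), and write the final *_reordered.npz file.
        # Together the passes approximate a global shuffle without holding the full
        # dataset in memory at once.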
556 | # 1st pass of FYR shuffle
557 | # check if data already exists
558 | recreate_flag = False
559 | for j in range(days):
560 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
561 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
562 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
563 | if (
564 | path.exists(filename_j_y)
565 | and path.exists(filename_j_d)
566 | and path.exists(filename_j_s)
567 | ):
568 | print(
569 | "Using existing\n"
570 | + filename_j_y + "\n"
571 | + filename_j_d + "\n"
572 | + filename_j_s
573 | )
574 | else:
575 | recreate_flag = True
576 | # reorder across buckets using sampling
577 | if recreate_flag:
578 | # init intermediate files (.npy appended automatically)
579 | for j in range(days):
580 | filename_j_y = npzfile + "_{0}_intermediate_y".format(j)
581 | filename_j_d = npzfile + "_{0}_intermediate_d".format(j)
582 | filename_j_s = npzfile + "_{0}_intermediate_s".format(j)
583 | np.save(filename_j_y, np.zeros((total_per_file[j])))
584 | np.save(filename_j_d, np.zeros((total_per_file[j], den_fea)))
585 | np.save(filename_j_s, np.zeros((total_per_file[j], spa_fea)))
586 | # start processing files
587 | total_counter = [0] * days
588 | for i in range(days):
589 | filename_i = npzfile + "_{0}_processed.npz".format(i)
590 | with np.load(filename_i) as data:
591 | X_cat = data["X_cat"]
592 | X_int = data["X_int"]
593 | y = data["y"]
594 | size = len(y)
595 | # sanity check
596 | if total_per_file[i] != size:
597 | sys.exit("ERROR: sanity check on number of samples failed")
598 | # debug prints
599 | print("Reordering (1st pass) " + filename_i)
600 |
601 | # create buckets using sampling of random ints
602 | # from (discrete) uniform distribution
603 | buckets = []
604 | for _j in range(days):
605 | buckets.append([])
606 | counter = [0] * days
607 | days_to_sample = days if data_split == "none" else days - 1
608 | if randomize == "total":
609 | rand_u = np.random.randint(low=0, high=days_to_sample, size=size)
610 | for k in range(size):
611 |                         # sample and make sure elements per bucket do not overflow
612 | if data_split == "none" or i < days - 1:
613 | # choose bucket
614 | p = rand_u[k]
615 |                             # retry if the bucket is full
616 | while total_counter[p] + counter[p] >= total_per_file[p]:
617 | p = np.random.randint(low=0, high=days_to_sample)
618 | else: # preserve the last day/bucket if needed
619 | p = i
620 | buckets[p].append(k)
621 | counter[p] += 1
622 | else: # randomize is day or none
623 | for k in range(size):
624 | # do not sample, preserve the data in this bucket
625 | p = i
626 | buckets[p].append(k)
627 | counter[p] += 1
628 |
629 | # sanity check
630 | if np.sum(counter) != size:
631 | sys.exit("ERROR: sanity check on number of samples failed")
632 | # debug prints
633 | # print(counter)
634 | # print(str(np.sum(counter)) + " = " + str(size))
635 | # print([len(x) for x in buckets])
636 | # print(total_counter)
637 |
638 |                 # partially fill the buckets
639 | for j in range(days):
640 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
641 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
642 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
643 | start = total_counter[j]
644 | end = total_counter[j] + counter[j]
645 | # target buckets
646 | fj_y = np.load(filename_j_y, mmap_mode='r+')
647 | # print("start=" + str(start) + " end=" + str(end)
648 | # + " end - start=" + str(end - start) + " "
649 | # + str(fj_y[start:end].shape) + " "
650 | # + str(len(buckets[j])))
651 | fj_y[start:end] = y[buckets[j]]
652 | del fj_y
653 | # dense buckets
654 | fj_d = np.load(filename_j_d, mmap_mode='r+')
655 | # print("start=" + str(start) + " end=" + str(end)
656 | # + " end - start=" + str(end - start) + " "
657 | # + str(fj_d[start:end, :].shape) + " "
658 | # + str(len(buckets[j])))
659 | fj_d[start:end, :] = X_int[buckets[j], :]
660 | del fj_d
661 | # sparse buckets
662 | fj_s = np.load(filename_j_s, mmap_mode='r+')
663 | # print("start=" + str(start) + " end=" + str(end)
664 | # + " end - start=" + str(end - start) + " "
665 | # + str(fj_s[start:end, :].shape) + " "
666 | # + str(len(buckets[j])))
667 | fj_s[start:end, :] = X_cat[buckets[j], :]
668 | del fj_s
669 | # update counters for next step
670 | total_counter[j] += counter[j]
671 |
672 | # 2nd pass of FYR shuffle
673 | # check if data already exists
674 | for j in range(days):
675 | filename_j = npzfile + "_{0}_reordered.npz".format(j)
676 | if path.exists(filename_j):
677 | print("Using existing " + filename_j)
678 | else:
679 | recreate_flag = True
680 | # reorder within buckets
681 | if recreate_flag:
682 | for j in range(days):
683 | filename_j_y = npzfile + "_{0}_intermediate_y.npy".format(j)
684 | filename_j_d = npzfile + "_{0}_intermediate_d.npy".format(j)
685 | filename_j_s = npzfile + "_{0}_intermediate_s.npy".format(j)
686 | fj_y = np.load(filename_j_y)
687 | fj_d = np.load(filename_j_d)
688 | fj_s = np.load(filename_j_s)
689 |
690 | indices = range(total_per_file[j])
691 | if randomize == "day" or randomize == "total":
692 | if data_split == "none" or j < days - 1:
693 | indices = np.random.permutation(range(total_per_file[j]))
694 |
695 | filename_r = npzfile + "_{0}_reordered.npz".format(j)
696 | print("Reordering (2nd pass) " + filename_r)
697 | np.savez_compressed(
698 | filename_r,
699 | X_cat=fj_s[indices, :],
700 | X_int=fj_d[indices, :],
701 | y=fj_y[indices],
702 | )
703 |
704 | '''
705 | # sanity check (under no reordering norms should be zero)
706 | for i in range(days):
707 | filename_i_o = npzfile + "_{0}_processed.npz".format(i)
708 | print(filename_i_o)
709 | with np.load(filename_i_o) as data_original:
710 | X_cat_o = data_original["X_cat"]
711 | X_int_o = data_original["X_int"]
712 | y_o = data_original["y"]
713 | filename_i_r = npzfile + "_{0}_reordered.npz".format(i)
714 | print(filename_i_r)
715 | with np.load(filename_i_r) as data_reordered:
716 | X_cat_r = data_reordered["X_cat"]
717 | X_int_r = data_reordered["X_int"]
718 | y_r = data_reordered["y"]
719 | print(np.linalg.norm(y_o - y_r))
720 | print(np.linalg.norm(X_int_o - X_int_r))
721 | print(np.linalg.norm(X_cat_o - X_cat_r))
722 | '''
723 |
724 | else:
725 | print("Concatenating multiple days into %s.npz file" % str(d_path + o_filename))
726 |
727 | # load and concatenate data
728 | for i in range(days):
729 | filename_i = npzfile + "_{0}_processed.npz".format(i)
730 | with np.load(filename_i) as data:
731 | if i == 0:
732 | X_cat = data["X_cat"]
733 | X_int = data["X_int"]
734 | y = data["y"]
735 | else:
736 | X_cat = np.concatenate((X_cat, data["X_cat"]))
737 | X_int = np.concatenate((X_int, data["X_int"]))
738 | y = np.concatenate((y, data["y"]))
739 | print("Loaded day:", i, "y = 1:", len(y[y == 1]), "y = 0:", len(y[y == 0]))
740 |
741 | with np.load(d_path + d_file + "_fea_count.npz") as data:
742 | counts = data["counts"]
743 | print("Loaded counts!")
744 |
745 | np.savez_compressed(
746 | d_path + o_filename + ".npz",
747 | X_cat=X_cat,
748 | X_int=X_int,
749 | y=y,
750 | counts=counts,
751 | )
752 |
753 | return d_path + o_filename + ".npz"
754 |
755 |
756 | def transformCriteoAdData(X_cat, X_int, y, days, data_split, randomize, total_per_file):
757 | # Transforms Criteo Kaggle or terabyte data by applying log transformation
758 | # on dense features and converting everything to appropriate tensors.
759 | #
760 | # Inputs:
761 | # X_cat (ndarray): array of integers corresponding to preprocessed
762 | # categorical features
763 | # X_int (ndarray): array of integers corresponding to dense features
764 | # y (ndarray): array of bool corresponding to labels
765 | # data_split(str): flag for splitting dataset into training/validation/test
766 | # sets
767 | # randomize (str): determines randomization scheme
768 |     #         "none": no randomization
769 |     #         "day": randomizes each day's data (only works if split = True)
770 |     #         "total": randomizes total dataset
771 | #
772 | # Outputs:
773 | # if split:
774 | # X_cat_train (tensor): sparse features for training set
775 | # X_int_train (tensor): dense features for training set
776 | # y_train (tensor): labels for training set
777 | # X_cat_val (tensor): sparse features for validation set
778 | # X_int_val (tensor): dense features for validation set
779 | # y_val (tensor): labels for validation set
780 | # X_cat_test (tensor): sparse features for test set
781 | # X_int_test (tensor): dense features for test set
782 | # y_test (tensor): labels for test set
783 | # else:
784 | # X_cat (tensor): sparse features
785 | # X_int (tensor): dense features
786 | # y (tensor): label
787 |
788 | # define initial set of indices
789 | indices = np.arange(len(y))
790 |
791 | # create offset per file
792 | offset_per_file = np.array([0] + [x for x in total_per_file])
793 | for i in range(days):
794 | offset_per_file[i + 1] += offset_per_file[i]
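    # Worked example (editorial, not part of the original source): if total_per_file is
    # [3, 2, 4] for a 3-day dataset, the loop above turns offset_per_file into
    # [0, 3, 5, 9], so day i occupies global sample indices
    # [offset_per_file[i], offset_per_file[i + 1]).  The same result could be written as
    #   offset_per_file = np.concatenate(([0], np.cumsum(total_per_file)))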
795 |
796 | # split dataset
797 | if data_split == 'train':
798 | indices = np.array_split(indices, offset_per_file[1:-1])
799 |
800 | # randomize train data (per day)
801 | if randomize == "day": # or randomize == "total":
802 | for i in range(len(indices) - 1):
803 | indices[i] = np.random.permutation(indices[i])
804 | print("Randomized indices per day ...")
805 |
806 | train_indices = np.concatenate(indices[:-1])
807 | test_indices = indices[-1]
808 | test_indices, val_indices = np.array_split(test_indices, 2)
809 |
810 | print("Defined training and testing indices...")
811 |
812 | # randomize train data (across days)
813 | if randomize == "total":
814 | train_indices = np.random.permutation(train_indices)
815 | print("Randomized indices across days ...")
816 |
817 | # indices = np.concatenate((train_indices, test_indices))
818 |
819 | # create training, validation, and test sets
820 | X_cat_train = X_cat[train_indices]
821 | X_int_train = X_int[train_indices]
822 | y_train = y[train_indices]
823 |
824 | X_cat_val = X_cat[val_indices]
825 | X_int_val = X_int[val_indices]
826 | y_val = y[val_indices]
827 |
828 | X_cat_test = X_cat[test_indices]
829 | X_int_test = X_int[test_indices]
830 | y_test = y[test_indices]
831 |
832 | print("Split data according to indices...")
833 |
834 |         X_cat_train = X_cat_train.astype(np.int64)  # np.long was removed in newer NumPy versions
835 | X_int_train = np.log(X_int_train.astype(np.float32) + 1)
836 | y_train = y_train.astype(np.float32)
837 |
838 |         X_cat_val = X_cat_val.astype(np.int64)
839 | X_int_val = np.log(X_int_val.astype(np.float32) + 1)
840 | y_val = y_val.astype(np.float32)
841 |
842 |         X_cat_test = X_cat_test.astype(np.int64)
843 | X_int_test = np.log(X_int_test.astype(np.float32) + 1)
844 | y_test = y_test.astype(np.float32)
845 |
846 | print("Converted to tensors...done!")
847 |
848 | return (
849 | X_cat_train,
850 | X_int_train,
851 | y_train,
852 | X_cat_val,
853 | X_int_val,
854 | y_val,
855 | X_cat_test,
856 | X_int_test,
857 | y_test,
858 | )
859 |
860 | else:
861 |
862 | # randomize data
863 | if randomize == "total":
864 | indices = np.random.permutation(indices)
865 | print("Randomized indices...")
866 |
867 |         X_cat = X_cat[indices].astype(np.int64)
868 | X_int = np.log(X_int[indices].astype(np.float32) + 1)
869 | y = y[indices].astype(np.float32)
870 |
871 | print("Converted to tensors...done!")
872 |
873 | return (X_cat, X_int, y, [], [], [], [], [], [])
874 |
875 |
876 | def getCriteoAdData(
877 | datafile,
878 | o_filename,
879 | max_ind_range=-1,
880 | sub_sample_rate=0.0,
881 | days=7,
882 | data_split='train',
883 | randomize='total',
884 | criteo_kaggle=True,
885 | memory_map=False
886 | ):
887 | # Passes through entire dataset and defines dictionaries for categorical
888 | # features and determines the number of total categories.
889 | #
890 | # Inputs:
891 | # datafile : path to downloaded raw data file
892 | # o_filename (str): saves results under o_filename if filename is not ""
893 | #
894 | # Output:
895 | # o_file (str): output file path
896 |
897 | #split the datafile into path and filename
898 | lstr = datafile.split("/")
899 | d_path = "/".join(lstr[0:-1]) + "/"
900 | d_file = lstr[-1].split(".")[0] if criteo_kaggle else lstr[-1]
901 | npzfile = d_path + ((d_file + "_day") if criteo_kaggle else d_file)
902 | trafile = d_path + ((d_file + "_fea") if criteo_kaggle else "fea")
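    # Editorial example with hypothetical paths (not part of the original source):
    #   terabyte: datafile="/data/criteo/day"       -> d_path="/data/criteo/", d_file="day",
    #             npzfile="/data/criteo/day",          trafile="/data/criteo/fea"
    #   kaggle:   datafile="/data/criteo/train.txt" -> d_path="/data/criteo/", d_file="train",
    #             npzfile="/data/criteo/train_day",    trafile="/data/criteo/train_fea"
    # so all per-day splits and intermediate files are written next to the raw data.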
903 |
904 | # count number of datapoints in training set
905 | total_file = d_path + d_file + "_day_count.npz"
906 | if path.exists(total_file):
907 | with np.load(total_file) as data:
908 | total_per_file = list(data["total_per_file"])
909 | total_count = np.sum(total_per_file)
910 | print("Skipping counts per file (already exist)")
911 | else:
912 | total_count = 0
913 | total_per_file = []
914 | if criteo_kaggle:
915 | # WARNING: The raw data consists of a single train.txt file
916 | # Each line in the file is a sample, consisting of 13 continuous and
917 | # 26 categorical features (an extra space indicates that feature is
918 | # missing and will be interpreted as 0).
919 | if path.exists(datafile):
920 | print("Reading data from path=%s" % (datafile))
921 | with open(str(datafile)) as f:
922 | for _ in f:
923 | total_count += 1
924 | total_per_file.append(total_count)
925 | # reset total per file due to split
926 | num_data_per_split, extras = divmod(total_count, days)
927 | total_per_file = [num_data_per_split] * days
928 | for j in range(extras):
929 | total_per_file[j] += 1
930 | # split into days (simplifies code later on)
931 | file_id = 0
932 | boundary = total_per_file[file_id]
933 | nf = open(npzfile + "_" + str(file_id), "w")
934 | with open(str(datafile)) as f:
935 | for j, line in enumerate(f):
936 | if j == boundary:
937 | nf.close()
938 | file_id += 1
939 | nf = open(npzfile + "_" + str(file_id), "w")
940 | boundary += total_per_file[file_id]
941 | nf.write(line)
942 | nf.close()
943 | else:
944 | sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset")
945 | else:
946 |         # WARNING: The raw data consists of day_0.gz, ..., day_23.gz text files
947 | # Each line in the file is a sample, consisting of 13 continuous and
948 | # 26 categorical features (an extra space indicates that feature is
949 | # missing and will be interpreted as 0).
950 | for i in range(days):
951 | datafile_i = datafile + "_" + str(i) # + ".gz"
952 | if path.exists(str(datafile_i)):
953 | print("Reading data from path=%s" % (str(datafile_i)))
954 | # file day_
955 | total_per_file_count = 0
956 | with open(str(datafile_i)) as f:
957 | for _ in f:
958 | total_per_file_count += 1
959 | total_per_file.append(total_per_file_count)
960 | total_count += total_per_file_count
961 | else:
962 | sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs")
963 |
964 |     # process a file's worth of data and reinitialize data
965 |     # note that a file may contain a single or multiple splits
966 | def process_one_file(
967 | datfile,
968 | npzfile,
969 | split,
970 | num_data_in_split,
971 | ):
972 | with open(str(datfile)) as f:
973 | y = np.zeros(num_data_in_split, dtype="i4") # 4 byte int
974 | X_int = np.zeros((num_data_in_split, 13), dtype="i4") # 4 byte int
975 | X_cat = np.zeros((num_data_in_split, 26), dtype="i4") # 4 byte int
976 | if sub_sample_rate == 0.0:
977 | rand_u = 1.0
978 | else:
979 | rand_u = np.random.uniform(low=0.0, high=1.0, size=num_data_in_split)
980 |
981 | i = 0
982 | for k, line in enumerate(f):
983 | # process a line (data point)
984 | line = line.split('\t')
985 | # set missing values to zero
986 | for j in range(len(line)):
987 | if (line[j] == '') or (line[j] == '\n'):
988 | line[j] = '0'
989 | # sub-sample data by dropping zero targets, if needed
990 | target = np.int32(line[0])
991 | if target == 0 and \
992 | (rand_u if sub_sample_rate == 0.0 else rand_u[k]) < sub_sample_rate:
993 | continue
994 |
995 | y[i] = target
996 | X_int[i] = np.array(line[1:14], dtype=np.int32)
997 | if max_ind_range > 0:
998 | X_cat[i] = np.array(
999 | list(map(lambda x: int(x, 16) % max_ind_range, line[14:])),
1000 | dtype=np.int32
1001 | )
1002 | else:
1003 | X_cat[i] = np.array(
1004 | list(map(lambda x: int(x, 16), line[14:])),
1005 | dtype=np.int32
1006 | )
1007 | # count uniques
1008 | for j in range(26):
1009 | convertDicts[j][X_cat[i][j]] = 1
1010 |
1011 | # debug prints
1012 | print(
1013 | "Load %d/%d Split: %d Label True: %d Stored: %d"
1014 | % (
1015 | i,
1016 | num_data_in_split,
1017 | split,
1018 | target,
1019 | y[i],
1020 | ),
1021 | end="\r",
1022 | )
1023 | i += 1
1024 |
1025 | # store num_data_in_split samples or extras at the end of file
1026 | # count uniques
1027 | # X_cat_t = np.transpose(X_cat)
1028 | # for j in range(26):
1029 | # for x in X_cat_t[j,:]:
1030 | # convertDicts[j][x] = 1
1031 | # store parsed
1032 | filename_s = npzfile + "_{0}.npz".format(split)
1033 | if path.exists(filename_s):
1034 | print("\nSkip existing " + filename_s)
1035 | else:
1036 | np.savez_compressed(
1037 | filename_s,
1038 | X_int=X_int[0:i, :],
1039 | # X_cat=X_cat[0:i, :],
1040 | X_cat_t=np.transpose(X_cat[0:i, :]), # transpose of the data
1041 | y=y[0:i],
1042 | )
1043 | print("\nSaved " + npzfile + "_{0}.npz!".format(split))
1044 | return i
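    # Editorial note on process_one_file (added for clarity, not part of the original
    # source): the 26 categorical fields in the raw Criteo files are hexadecimal hash
    # strings, hence the int(x, 16) parse; when max_ind_range > 0 they are folded modulo
    # max_ind_range to bound the eventual embedding table sizes. When sub_sample_rate > 0,
    # negative samples (target == 0) are dropped with that probability, so the function
    # returns the number of samples actually stored (i) rather than the raw line count.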
1045 |
1046 | # create all splits (reuse existing files if possible)
1047 | recreate_flag = False
1048 | convertDicts = [{} for _ in range(26)]
1049 |     # WARNING: to get reproducible sub-sampling results you must reset the seed below
1050 | # np.random.seed(123)
1051 | # in this case there is a single split in each day
1052 | for i in range(days):
1053 | datfile_i = npzfile + "_{0}".format(i) # + ".gz"
1054 | npzfile_i = npzfile + "_{0}.npz".format(i)
1055 | npzfile_p = npzfile + "_{0}_processed.npz".format(i)
1056 | if path.exists(npzfile_i):
1057 | print("Skip existing " + npzfile_i)
1058 | elif path.exists(npzfile_p):
1059 | print("Skip existing " + npzfile_p)
1060 | else:
1061 | recreate_flag = True
1062 | total_per_file[i] = process_one_file(
1063 | datfile_i,
1064 | npzfile,
1065 | i,
1066 | total_per_file[i],
1067 | )
1068 |
1069 | # report and save total into a file
1070 | total_count = np.sum(total_per_file)
1071 | if not path.exists(total_file):
1072 | np.savez_compressed(total_file, total_per_file=total_per_file)
1073 | print("Total number of samples:", total_count)
1074 | print("Divided into days/splits:\n", total_per_file)
1075 |
1076 | # dictionary files
1077 | counts = np.zeros(26, dtype=np.int32)
1078 | if recreate_flag:
1079 | # create dictionaries
1080 | for j in range(26):
1081 | for i, x in enumerate(convertDicts[j]):
1082 | convertDicts[j][x] = i
1083 | dict_file_j = d_path + d_file + "_fea_dict_{0}.npz".format(j)
1084 | if not path.exists(dict_file_j):
1085 | np.savez_compressed(
1086 | dict_file_j,
1087 | unique=np.array(list(convertDicts[j]), dtype=np.int32)
1088 | )
1089 | counts[j] = len(convertDicts[j])
1090 | # store (uniques and) counts
1091 | count_file = d_path + d_file + "_fea_count.npz"
1092 | if not path.exists(count_file):
1093 | np.savez_compressed(count_file, counts=counts)
1094 | else:
1095 | # create dictionaries (from existing files)
1096 | for j in range(26):
1097 | with np.load(d_path + d_file + "_fea_dict_{0}.npz".format(j)) as data:
1098 | unique = data["unique"]
1099 | for i, x in enumerate(unique):
1100 | convertDicts[j][x] = i
1101 | # load (uniques and) counts
1102 | with np.load(d_path + d_file + "_fea_count.npz") as data:
1103 | counts = data["counts"]
1104 |
1105 | # process all splits
1106 | processCriteoAdData(d_path, d_file, npzfile, days, convertDicts, counts)
1107 | o_file = concatCriteoAdData(
1108 | d_path,
1109 | d_file,
1110 | npzfile,
1111 | trafile,
1112 | days,
1113 | data_split,
1114 | randomize,
1115 | total_per_file,
1116 | total_count,
1117 | memory_map,
1118 | o_filename
1119 | )
1120 |
1121 | return o_file
1122 |
1123 |
1124 | def loadDataset(
1125 | dataset,
1126 | max_ind_range,
1127 | sub_sample_rate,
1128 | randomize,
1129 | data_split,
1130 | raw_path="",
1131 | pro_data="",
1132 | memory_map=False
1133 | ):
1134 | # dataset
1135 | if dataset == "kaggle":
1136 | days = 7
1137 | o_filename = "kaggleAdDisplayChallenge_processed"
1138 | elif dataset == "terabyte":
1139 | days = 24
1140 | o_filename = "terabyte_processed"
1141 | else:
1142 |         raise ValueError("Data set option is not supported")
1143 |
1144 | # split the datafile into path and filename
1145 | lstr = raw_path.split("/")
1146 | d_path = "/".join(lstr[0:-1]) + "/"
1147 | d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
1148 | npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
1149 | # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")
1150 |
1151 | # check if pre-processed data is available
1152 | data_ready = True
1153 | if memory_map:
1154 | for i in range(days):
1155 |             reo_data = npzfile + "_{0}_reordered.npz".format(i)  # npzfile already includes d_path
1156 | if not path.exists(str(reo_data)):
1157 | data_ready = False
1158 | else:
1159 | if not path.exists(str(pro_data)):
1160 | data_ready = False
1161 |
1162 | # pre-process data if needed
1163 |     # WARNING: when memory mapping is used we get a collection of files
1164 | if data_ready:
1165 | print("Reading pre-processed data=%s" % (str(pro_data)))
1166 | file = str(pro_data)
1167 | else:
1168 | print("Reading raw data=%s" % (str(raw_path)))
1169 | file = getCriteoAdData(
1170 | raw_path,
1171 | o_filename,
1172 | max_ind_range,
1173 | sub_sample_rate,
1174 | days,
1175 | data_split,
1176 | randomize,
1177 | dataset == "kaggle",
1178 | memory_map
1179 | )
1180 |
1181 | return file, days
1182 |
1183 |
1184 | if __name__ == "__main__":
1185 | ### import packages ###
1186 | import argparse
1187 |
1188 | ### parse arguments ###
1189 | parser = argparse.ArgumentParser(
1190 | description="Preprocess Criteo dataset"
1191 | )
1192 | # model related parameters
1193 | parser.add_argument("--max-ind-range", type=int, default=-1)
1194 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1]
1195 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none
1196 | parser.add_argument("--memory-map", action="store_true", default=False)
1197 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte
1198 | parser.add_argument("--raw-data-file", type=str, default="")
1199 | parser.add_argument("--processed-data-file", type=str, default="")
1200 | args = parser.parse_args()
1201 |
1202 | loadDataset(
1203 | args.data_set,
1204 | args.max_ind_range,
1205 | args.data_sub_sample_rate,
1206 | args.data_randomize,
1207 | "train",
1208 | args.raw_data_file,
1209 | args.processed_data_file,
1210 | args.memory_map
1211 | )
1212 |
--------------------------------------------------------------------------------