├── .gitignore
├── README.md
├── activation.py
├── base_estimator.py
├── census_data.py
├── census_main.py
├── dataset
│   ├── raw_test.txt
│   ├── raw_train.txt
│   ├── test.csv
│   └── train.csv
├── dense_layer.py
├── dnn.py
├── embedding_layer.py
├── ftrl.py
├── initialization.py
├── input_layer.py
├── metrics.py
├── optimization.py
├── test_dense_layer.py
├── test_embed_layer.py
├── test_input_layer.py
├── test_others.py
├── utils.py
├── wide_layer.py
└── wide_n_deep.py

/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | *.pyc
3 | .idea
4 | learn_curve_*.csv
5 | log_*.log
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NumpyWDL
2 | Implement the Wide & Deep algorithm using NumPy
3 | 
4 | A hand-written, NumPy-only implementation of Wide & Deep
5 | 
6 | To run it:
7 | 
8 | python census_main.py -e wide_n_deep -n 20
9 | 
10 | python census_main.py -e wide -n 20
11 | 
12 | python census_main.py -e deep -n 20
13 | 
14 | 
--------------------------------------------------------------------------------
/activation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class Sigmoid:
5 |     def __init__(self):
6 |         self._last_forward_result = None
7 | 
8 |     def forward(self, X):
9 |         """
10 |         element-wise sigmoid
11 |         :param X: [batch_size, #neuron]
12 |         :return: same shape as X
13 |         """
14 |         self._last_forward_result = 1.0 / (1.0 + np.exp(-X))
15 |         return self._last_forward_result
16 | 
17 |     def backward(self, prev_grads):
18 |         """
19 |         :param prev_grads: gradients from loss to "last forward result"
20 |                            must have the same shape as 'last forward result'
21 |         :return: gradients from loss to X, has same shape as X
22 |         """
23 |         assert prev_grads.shape == self._last_forward_result.shape
24 | 
25 |         return prev_grads * self._last_forward_result * (1 - self._last_forward_result)
26 | 
27 | 
28 | class ReLU:
29 |     def __init__(self):
30 |         self._last_input = None
31 | 
32 |     def forward(self, X):
33 |         self._last_input = X
34 |         return np.maximum(0, X)
35 | 
36 |     def backward(self, prev_grads):
37 |         assert prev_grads.shape == self._last_input.shape
38 | 
39 |         local_grads = np.zeros_like(self._last_input)
40 |         local_grads[self._last_input > 0] = 1.0
41 | 
42 |         return prev_grads * local_grads
43 | 
--------------------------------------------------------------------------------
/base_estimator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # from sklearn.metrics import roc_auc_score, log_loss
3 | from metrics import logloss as log_loss, auc as roc_auc_score
4 | from tqdm import tqdm
5 | import logging
6 | 
7 | 
8 | class BaseEstimator:
9 | 
10 |     def __init__(self, data_source):
11 |         self._data_source = data_source
12 | 
13 |     def get_metrics(self, scores, labels, prefix):
14 |         scores = np.asarray(scores)
15 |         labels = np.asarray(labels)
16 | 
17 |         metrics = {'{}_logloss'.format(prefix): log_loss(y_true=labels, y_pred=scores),
18 |                    '{}_auc'.format(prefix): roc_auc_score(y_true=labels, y_score=scores)}
19 | 
20 |         pred_labels = (scores > 0.5).astype(int)
21 |         metrics['{}_accuracy'.format(prefix)] = np.sum(pred_labels == labels) / len(labels)
22 | 
23 |         return metrics
24 | 
25 |     def train_batch(self, features, labels):
26 |         """
27 |         :param features: dict, field_name ==> dense matrix or SparseInput
28 |         :param labels: [batch_size] ndarray
29 |         :return: [batch_size] ndarray of predicted 
probabilities in that batch 30 | """ 31 | raise NotImplementedError() 32 | 33 | def predict(self, features): 34 | """ 35 | :param features: dict, field_name ==> dense matrix or SparseInput 36 | :return: [batch_size] ndarray of predicted probabilities in that batch 37 | """ 38 | raise NotImplementedError() 39 | 40 | def _train_epoch(self): 41 | scores = [] 42 | labels = [] 43 | 44 | batch_stream = self._data_source.train_batches_per_epoch() 45 | for batch_features, batch_labels in tqdm(batch_stream): 46 | pred_probas = self.train_batch(batch_features, batch_labels) 47 | 48 | scores.extend(pred_probas) 49 | labels.extend(batch_labels) 50 | 51 | return self.get_metrics(scores=scores, labels=labels, prefix='train') 52 | 53 | def _eval_epoch(self): 54 | scores = [] 55 | labels = [] 56 | 57 | batch_stream = self._data_source.test_batches_per_epoch() 58 | for batch_features, batch_labels in tqdm(batch_stream): 59 | pred_probas = self.predict(batch_features) 60 | 61 | scores.extend(pred_probas) 62 | labels.extend(batch_labels) 63 | 64 | return self.get_metrics(scores=scores, labels=labels, prefix='test') 65 | 66 | def train(self, n_epochs): 67 | metrics_history = [] 68 | for epoch_idx in range(n_epochs): 69 | logging.info("\n=============== {}-th EPOCH".format(epoch_idx + 1)) 70 | 71 | metrics = {} 72 | metrics.update(self._train_epoch()) 73 | metrics.update(self._eval_epoch()) 74 | 75 | logging.info(metrics) 76 | metrics_history.append(metrics) 77 | 78 | return metrics_history 79 | -------------------------------------------------------------------------------- /census_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import numpy as np 4 | import random 5 | import utils 6 | from input_layer import SparseInput 7 | import bisect 8 | import argparse 9 | from tqdm import tqdm 10 | 11 | VOCAB_LISTS = { 12 | 'education': ['Bachelors', 13 | 'HS-grad', 14 | '11th', 15 | 'Masters', 16 | '9th', 17 | 'Some-college', 18 | 'Assoc-acdm', 19 | 'Assoc-voc', 20 | '7th-8th', 21 | 'Doctorate', 22 | 'Prof-school', 23 | '5th-6th', 24 | '10th', 25 | '1st-4th', 26 | 'Preschool', 27 | '12th'], 28 | 29 | 'marital_status': ['Married-civ-spouse', 30 | 'Divorced', 31 | 'Married-spouse-absent', 32 | 'Never-married', 33 | 'Separated', 34 | 'Married-AF-spouse', 35 | 'Widowed'], 36 | 37 | 'relationship': ['Husband', 38 | 'Not-in-family', 39 | 'Wife', 40 | 'Own-child', 41 | 'Unmarried', 42 | 'Other-relative'], 43 | 44 | 'workclass': ['Self-emp-not-inc', 45 | 'Private', 46 | 'State-gov', 47 | 'Federal-gov', 48 | 'Local-gov', 49 | 'Self-emp-inc', 50 | 'Without-pay', 51 | 'Never-worked'], 52 | 53 | 'occupation': ['Tech-support', 54 | 'Craft-repair', 55 | 'Other-service', 56 | 'Sales', 57 | 'Exec-managerial', 58 | 'Prof-specialty', 59 | 'Handlers-cleaners', 60 | 'Machine-op-inspct', 61 | 'Adm-clerical', 62 | 'Farming-fishing', 63 | 'Transport-moving', 64 | 'Priv-house-serv', 65 | 'Protective-serv', 66 | 'Armed-Forces'] 67 | } 68 | 69 | VOCAB_MAPPINGS = {field: {featname: idx for idx, featname in enumerate(featnames)} for field, featnames in 70 | VOCAB_LISTS.items()} 71 | 72 | AGE_BOUNDARIES = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65] 73 | 74 | DENSE_FIELDS = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'] 75 | 76 | DENSE_LOG_MEAN_STD = {'age': (3.6183599219864133, 0.35003117354646957), 77 | 'education_num': (2.372506496597371, 0.27381608590073075), 78 | 'capital_gain': (0.7346209104536965, 2.4547377400238553), 79 | 
'capital_loss': (0.35030508122367104, 1.5845809727578963),
80 |                       'hours_per_week': (3.665366478972777, 0.38701441353280025)}
81 | 
82 | CATEGORY_FIELDS = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
83 | 
84 | 
85 | class Dataset:
86 |     def __init__(self, infname):
87 |         with open(infname, "rt") as fin:
88 |             self._field_names = fin.readline().strip().split(',')
89 |             self._lines = [line.strip() for line in fin]
90 | 
91 |     @property
92 |     def n_examples(self):
93 |         return len(self._lines)
94 | 
95 |     def parse_line(self, line):
96 |         contents = dict(zip(self._field_names, line.split(',')))
97 |         features = {}
98 | 
99 |         # ------------- label
100 |         label = int(contents['income_bracket'] == '>50K')
101 | 
102 |         # ------------- categorical features
103 |         for field in ['education', 'marital_status', 'relationship', 'workclass', 'occupation']:
104 |             vocab_mapping = VOCAB_MAPPINGS[field]
105 |             txt_value = contents[field]
106 |             if txt_value in vocab_mapping:
107 |                 # values not found in the vocabulary are treated as missing and excluded from the features
108 |                 features[field] = vocab_mapping[txt_value]
109 | 
110 |         age = int(contents['age'])
111 |         features['age_buckets'] = bisect.bisect(AGE_BOUNDARIES, age)
112 | 
113 |         # ------------- numeric features
114 |         for field in DENSE_FIELDS:
115 |             raw_value = float(contents[field])
116 |             logmean, logstd = DENSE_LOG_MEAN_STD[field]
117 |             features[field] = (np.log1p(raw_value) - logmean) / logstd
118 | 
119 |         return features, label
120 | 
121 |     def get_batch_stream(self, batch_size, n_repeat=1):
122 |         n_repeat = n_repeat if n_repeat > 0 else sys.maxsize
123 | 
124 |         for _ in range(n_repeat):
125 |             random.shuffle(self._lines)
126 | 
127 |             for batch_lines in utils.chunk(self._lines, batch_size):
128 |                 Xs = {}
129 |                 ys = []
130 | 
131 |                 # ------------- allocate for categorical feature
132 |                 for field in CATEGORY_FIELDS:
133 |                     Xs[field] = SparseInput(n_total_examples=len(batch_lines),
134 |                                             example_indices=[],
135 |                                             feature_ids=[],
136 |                                             feature_values=[])
137 | 
138 |                 # ------------- allocate for numeric feature
139 |                 for field in DENSE_FIELDS:
140 |                     # Xs[field] should be a list of lists
141 |                     # the outer list corresponds to each example in the batch
142 |                     # the inner list holds that example's values under this field
143 |                     # an example may have several dense values under one field, e.g. if you insist on one-hot-encoding a categorical feature
144 |                     # here, however, each example has exactly one value per field
145 |                     Xs[field] = []
146 | 
147 |                 # ------------- loop and add
148 |                 for example_index, line in enumerate(batch_lines):
149 |                     # iterating in order guarantees that the non-zeros inserted into SparseInput are sorted by example_index, ascending
150 |                     current_features, label = self.parse_line(line)
151 |                     ys.append(label)
152 | 
153 |                     # add categorical feature
154 |                     for field in CATEGORY_FIELDS:
155 |                         if field in current_features:
156 |                             Xs[field].add(example_idx=example_index,
157 |                                           feat_id=current_features[field],
158 |                                           feat_val=1)
159 | 
160 |                     # add numeric feature
161 |                     for field in DENSE_FIELDS:
162 |                         # wrap into one-element list, since we need to add one row
163 |                         Xs[field].append([current_features[field]])
164 | 
165 |                 yield Xs, np.asarray(ys)
166 | 
167 | 
168 | def precompute_log_mean_stddev():
169 |     df = pd.read_csv('dataset/train.csv', usecols=DENSE_FIELDS)
170 |     df = np.log1p(df)  # the data has a long tail; taking log makes it closer to a normal distribution
171 | 
172 |     means = df.mean()
173 |     stddevs = df.std()
174 |     return {field: (means[field], stddevs[field]) for field in DENSE_FIELDS}
175 | 
176 | 
177 | def test_standardize(infname):
178 |     print("\n============= standardize '{}'".format(infname))
179 | 
180 |     df = pd.read_csv(infname, usecols=DENSE_FIELDS)
181 |     df = np.log1p(df)
182 | 
183 |     means = pd.Series({field: mean for field, (mean, std) in DENSE_LOG_MEAN_STD.items()})
184 
| stddevs = pd.Series({field: std for field, (mean, std) in DENSE_LOG_MEAN_STD.items()}) 185 | 186 | df = (df - means) / stddevs 187 | print(df.describe().loc[['mean', 'std'], :]) 188 | 189 | 190 | def test_batch_stream(infname): 191 | dataset = Dataset(infname) 192 | 193 | batch_stream = dataset.get_batch_stream(16) 194 | 195 | for batch_idx, (features, labels) in enumerate(batch_stream, start=1): 196 | print("\n================== {}-th batch".format(batch_idx)) 197 | print("labels: {}\n".format(labels)) 198 | 199 | for field in DENSE_FIELDS: 200 | print("[{}]: {}".format(field, features[field])) 201 | 202 | for field in CATEGORY_FIELDS: 203 | sp_input = features[field] 204 | print("\n[{}] example_indices: {}".format(field, sp_input._example_indices)) 205 | print("[{}] feature_ids: {}".format(field, sp_input._feature_ids)) 206 | print("[{}] feature_values: {}".format(field, sp_input._feature_values)) 207 | 208 | 209 | def clean_datas(infname, outfname): 210 | csv_columns = [ 211 | 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 212 | 'marital_status', 'occupation', 'relationship', 'race', 'gender', 213 | 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 214 | 'income_bracket' 215 | ] 216 | 217 | with open(infname, 'rt') as fin, open(outfname, 'wt') as fout: 218 | # write header 219 | fout.write(",".join(csv_columns) + "\n") 220 | 221 | for line in tqdm(fin): 222 | line = line.strip() 223 | line = line.replace(', ', ',') 224 | if not line or ',' not in line: 225 | continue 226 | if line[-1] == '.': 227 | line = line[:-1] 228 | line += '\n' 229 | fout.write(line) 230 | print("'{}' is cleaned, and re-save to '{}'".format(infname, outfname)) 231 | 232 | 233 | if __name__ == "__main__": 234 | parser = argparse.ArgumentParser() 235 | parser.add_argument('-j', "--job") 236 | args = parser.parse_args() 237 | 238 | if args.job == "clean": 239 | clean_datas(infname='dataset/raw_train.txt', outfname='dataset/train.csv') 240 | clean_datas(infname='dataset/raw_test.txt', outfname='dataset/test.csv') 241 | 242 | else: 243 | raise ValueError('unknown job={}'.format(args.job)) 244 | -------------------------------------------------------------------------------- /census_main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | import argparse 4 | import time 5 | 6 | from census_data import Dataset, DENSE_FIELDS, CATEGORY_FIELDS 7 | from census_data import VOCAB_LISTS, AGE_BOUNDARIES 8 | from optimization import Adagrad 9 | from wide_n_deep import WideDeepEstimator 10 | from wide_layer import WideHParams, WideEstimator 11 | from dnn import DeepHParams, DeepEstimator 12 | import utils 13 | 14 | 15 | class DataSource: 16 | def __init__(self, batch_size): 17 | self._train_dataset = Dataset('dataset/train.csv') 18 | self._test_dataset = Dataset('dataset/test.csv') 19 | self._batch_size = batch_size 20 | 21 | def train_batches_per_epoch(self): 22 | return self._train_dataset.get_batch_stream(self._batch_size, n_repeat=1) 23 | 24 | def test_batches_per_epoch(self): 25 | return self._test_dataset.get_batch_stream(self._batch_size, n_repeat=1) 26 | 27 | @property 28 | def n_train_examples(self): 29 | return self._train_dataset.n_examples 30 | 31 | @property 32 | def n_test_examples(self): 33 | return self._test_dataset.n_examples 34 | 35 | 36 | def get_deep_hparams(embed_size, hidden_units, L2, learning_rate): 37 | dense_fields = [(field, 1) for field in DENSE_FIELDS] 38 | 39 | vocab_infos = [] 40 | for 
vocab_name in CATEGORY_FIELDS: 41 | if vocab_name == 'age_buckets': 42 | vocab_size = len(AGE_BOUNDARIES) + 1 43 | else: 44 | vocab_size = len(VOCAB_LISTS[vocab_name]) 45 | vocab_infos.append((vocab_name, vocab_size, embed_size)) 46 | 47 | # 第一个field代表field name,第二个field代表vocab name 48 | # 在这个例子中,因为field与vocab是1:1,所以二者同名 49 | embed_fields = [(field, field) for field in CATEGORY_FIELDS] 50 | 51 | optimizer = Adagrad(learning_rate) 52 | 53 | return DeepHParams( 54 | dense_fields=dense_fields, 55 | vocab_infos=vocab_infos, 56 | embed_fields=embed_fields, 57 | hidden_units=hidden_units, 58 | L2=L2, 59 | optimizer=optimizer) 60 | 61 | 62 | if __name__ == "__main__": 63 | # ************ define command-line-arguments 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('-e', "--estimator") 66 | parser.add_argument('-n', "--n_epoches", type=int, default=10) 67 | args = parser.parse_args() 68 | 69 | # ************ prepare 70 | utils.config_logging('log_{}.log'.format(args.estimator)) 71 | 72 | data_source = DataSource(batch_size=32) 73 | 74 | deep_hparams = get_deep_hparams(embed_size=16, 75 | hidden_units=[64, 16], 76 | L2=0.01, 77 | learning_rate=0.001) 78 | 79 | wide_hparams = WideHParams(field_names=CATEGORY_FIELDS, 80 | alpha=0.1, 81 | beta=1, 82 | L1=0.1, 83 | L2=0.1) 84 | 85 | # ************ run 86 | if args.estimator == 'wide_n_deep': 87 | estimator = WideDeepEstimator(wide_hparams=wide_hparams, deep_hparams=deep_hparams, data_source=data_source) 88 | elif args.estimator == "deep": 89 | estimator = DeepEstimator(hparams=deep_hparams, data_source=data_source) 90 | elif args.estimator == 'wide': 91 | estimator = WideEstimator(hparams=wide_hparams, data_source=data_source) 92 | else: 93 | raise ValueError('unknown estimator type={}'.format(args.estimator)) 94 | 95 | start_time = time.time() 96 | metrics_history = estimator.train(args.n_epoches) 97 | elapsed = time.time() - start_time 98 | 99 | # ************ display result 100 | logging.info("\n************** TIME COST **************") 101 | logging.info('{:.2f} seconds for {} epoches'.format(elapsed, args.n_epoches)) 102 | logging.info('{:.2f} examples per second'.format( 103 | args.n_epoches * (data_source.n_train_examples + data_source.n_test_examples) / elapsed)) 104 | 105 | logging.info("\n************** LEARNING CURVE **************") 106 | metrics_history = pd.DataFrame(metrics_history) 107 | logging.info(metrics_history) 108 | metrics_history.to_csv('learn_curve_{}.csv'.format(args.estimator), index=False) 109 | -------------------------------------------------------------------------------- /dense_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import initialization 3 | 4 | 5 | class DenseLayer: 6 | def __init__(self, name, shape, l2reg=0, init_method='glorot_uniform'): 7 | self._name = name 8 | self._l2reg = l2reg 9 | 10 | self._W = initialization.get_global_init(init_method)(shape) 11 | self._b = initialization.get_global_init('zero')(shape[1]) 12 | 13 | self._dW = None 14 | self._db = None 15 | 16 | self._last_input = None 17 | 18 | def forward(self, X): 19 | self._last_input = X 20 | 21 | # last_input: [batch_size, fan_in] 22 | # W: [fan_in, fan_out] 23 | # b: [fanout] 24 | # result: [batch_size, fan_out] 25 | return np.dot(self._last_input, self._W) + self._b 26 | 27 | def backward(self, prev_grads): 28 | # prev_grads: [batch_size, fan_out] 29 | assert prev_grads.shape[1] == self._W.shape[1] 30 | 31 | # self._last_input.T: [fan_in, batch_size] 32 | 
# prev_grads: [batch_size, fan_out] 33 | # dW: [fan_in, fan_out], same shape as W 34 | self._dW = np.dot(self._last_input.T, prev_grads) 35 | 36 | # 加上l2_loss对W的导数 37 | self._dW += self._l2reg * self._W 38 | 39 | # 把b想像成特殊的fan_in=1的W,则套用上面的公式 40 | # db = [1,1,...,1](共batch_size个1,shape=[1,batch_size])*prev_grads([batch_size,fan_out])=各列之和([1,fan_out]) 41 | self._db = np.sum(prev_grads, axis=0) 42 | 43 | # return: dLoss/dX: [batch_size, fan_in] 44 | # prev_grads: [batch_size, fan_out] 45 | # self._W.T: [fan_out,fan_in] 46 | return np.dot(prev_grads, self._W.T) 47 | 48 | @property 49 | def l2reg_loss(self): 50 | return 0.5 * self._l2reg * np.sum(self._W ** 2) 51 | 52 | @property 53 | def shape(self): 54 | return self._W.shape 55 | 56 | @property 57 | def output_dim(self): 58 | return self._W.shape[1] 59 | 60 | @property 61 | def variables(self): 62 | return {"{}_W".format(self._name): self._W, 63 | "{}_b".format(self._name): self._b} 64 | 65 | @property 66 | def grads2var(self): 67 | return {"{}_W".format(self._name): self._dW, 68 | "{}_b".format(self._name): self._db} 69 | -------------------------------------------------------------------------------- /dnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from input_layer import DenseInputCombineLayer 3 | from embedding_layer import EmbeddingCombineLayer 4 | from dense_layer import DenseLayer 5 | from activation import ReLU 6 | import utils 7 | import logging 8 | from collections import namedtuple 9 | from base_estimator import BaseEstimator 10 | 11 | 12 | class DeepNetwork: 13 | def __init__(self, dense_fields, vocab_infos, embed_fields, hidden_units, L2, optimizer): 14 | """ 15 | :param dense_fields: a list of tuple (field_name, field's input-dim) 16 | :param vocab_infos: a list of tuple, each tuple is (vocab_name, vocab_size, embed_size) 17 | :param embed_fields: a list of tuple (field_name, vocab_name) 18 | :param hidden_units: a list of ints, n_units for each hidden layer 19 | :param L2: L2 regularization for hidden dense layer 20 | :param optimizer: optimizer instance to update the weights 21 | """ 22 | self._optimizer = optimizer 23 | 24 | # ***************** dense input layer 25 | self._dense_combine_layer = DenseInputCombineLayer(dense_fields) 26 | 27 | # ***************** embedding layers 28 | self._embed_combine_layer = EmbeddingCombineLayer(vocab_infos) 29 | for field_name, vocab_name in embed_fields: 30 | self._embed_combine_layer.add_embedding(vocab_name=vocab_name, field_name=field_name) 31 | 32 | self._optimize_layers = [self._embed_combine_layer] 33 | 34 | # ***************** MLP 35 | prev_out_dim = self._dense_combine_layer.output_dim + self._embed_combine_layer.output_dim 36 | 37 | self._hidden_layers = [] 38 | for layer_idx, n_units in enumerate(hidden_units, start=1): 39 | # ----------- add hidden layer 40 | hidden_layer = DenseLayer(name="hidden{}".format(layer_idx), shape=[prev_out_dim, n_units], l2reg=L2) 41 | self._hidden_layers.append(hidden_layer) 42 | self._optimize_layers.append(hidden_layer) 43 | logging.info("{}-th hidden layer, weight shape={}".format(layer_idx, hidden_layer.shape)) 44 | 45 | # ----------- add activation layer 46 | self._hidden_layers.append(ReLU()) 47 | 48 | # ----------- update previous dimension 49 | prev_out_dim = n_units 50 | 51 | # final logit layer 52 | final_logit_layer = DenseLayer(name="final_logit", shape=[prev_out_dim, 1], l2reg=L2) 53 | logging.info("final logit layer, weight shape={}".format(final_logit_layer.shape)) 
54 | self._hidden_layers.append(final_logit_layer) 55 | self._optimize_layers.append(final_logit_layer) 56 | 57 | def forward(self, features): 58 | """ 59 | :param features: dict, mapping from field=>dense ndarray or field=>SparseInput 60 | :return: logits, [batch_size] 61 | """ 62 | dense_input = self._dense_combine_layer.forward(features) 63 | 64 | embed_input = self._embed_combine_layer.forward(features) 65 | 66 | X = np.hstack([dense_input, embed_input]) 67 | 68 | for hidden_layer in self._hidden_layers: 69 | X = hidden_layer.forward(X) 70 | 71 | return X.flatten() 72 | 73 | def backward(self, grads2logits): 74 | """ 75 | :param grads2logits: gradients from loss to logits, [batch_size] 76 | """ 77 | # ***************** 计算所有梯度 78 | prev_grads = grads2logits.reshape([-1, 1]) # reshape to [batch_size,1] 79 | 80 | # iterate hidden layers backwards 81 | for hidden_layer in self._hidden_layers[::-1]: 82 | prev_grads = hidden_layer.backward(prev_grads) 83 | 84 | col_sizes = [self._dense_combine_layer.output_dim, self._embed_combine_layer.output_dim] 85 | # 抛弃第一个split,因为其对应的是input,无可优化 86 | _, grads_for_all_embedding = utils.split_column(prev_grads, col_sizes) 87 | 88 | self._embed_combine_layer.backward(grads_for_all_embedding) 89 | 90 | # ***************** 优化 91 | # 这个操作必须每次backward都调用,这是因为,尽管dense部分的权重是固定的 92 | # 但是sparse部分,要优化哪个变量,是随着输入不同而不同的 93 | all_vars, all_grads2var = {}, {} 94 | for opt_layer in self._optimize_layers: 95 | all_vars.update(opt_layer.variables) 96 | all_grads2var.update(opt_layer.grads2var) 97 | 98 | self._optimizer.update(variables=all_vars, gradients=all_grads2var) 99 | 100 | 101 | DeepHParams = namedtuple("DeepHParams", 102 | ['dense_fields', 'vocab_infos', 'embed_fields', 'hidden_units', 'L2', 'optimizer']) 103 | 104 | 105 | class DeepEstimator(BaseEstimator): 106 | def __init__(self, hparams, data_source): 107 | self._dnn = DeepNetwork(dense_fields=hparams.dense_fields, 108 | vocab_infos=hparams.vocab_infos, 109 | embed_fields=hparams.embed_fields, 110 | hidden_units=hparams.hidden_units, 111 | L2=hparams.L2, 112 | optimizer=hparams.optimizer) 113 | super().__init__(data_source) 114 | 115 | def train_batch(self, features, labels): 116 | # ********* forward 117 | logits = self._dnn.forward(features) 118 | pred_probas = 1 / (1 + np.exp(-logits)) 119 | 120 | # ********* backward 121 | grads2logits = pred_probas - labels 122 | self._dnn.backward(grads2logits) 123 | 124 | return pred_probas 125 | 126 | def predict(self, features): 127 | logits = self._dnn.forward(features) 128 | return 1 / (1 + np.exp(-logits)) 129 | -------------------------------------------------------------------------------- /embedding_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from initialization import TruncatedNormal 3 | import utils 4 | 5 | 6 | class EmbeddingLayer: 7 | """ 8 | 简化起见,不支持use_bias和regularization 9 | 不支持regularization的原因是:weight是稠密的,自然L2 Loss的gradient也是稠密的 10 | 为了L2 Loss而破坏稀疏性,增加内容与耗时,有些得不偿失 11 | 一种改进方案是:只正则化本batch中用到的embedding向量 12 | """ 13 | 14 | def __init__(self, W, vocab_name, field_name): 15 | """ 16 | :param W: dense weight matrix, [vocab_size,embed_size] 17 | :param b: bias, [embed_size] 18 | """ 19 | self.vocab_name = vocab_name 20 | self.field_name = field_name 21 | self._W = W 22 | self._last_input = None 23 | 24 | @property 25 | def output_dim(self): 26 | return self._W.shape[1] 27 | 28 | def forward(self, X): 29 | """ 30 | :param X: SparseInput 31 | :return: [batch_size, embed_size] 32 | """ 33 
| self._last_input = X 34 | 35 | # output: [batch_size, embed_size] 36 | output = np.zeros((X.n_total_examples, self._W.shape[1])) 37 | 38 | for example_idx, feat_id, feat_val in X.iterate_non_zeros(): 39 | embedding = self._W[feat_id, :] 40 | output[example_idx, :] += embedding * feat_val 41 | 42 | return output 43 | 44 | def backward(self, prev_grads): 45 | """ 46 | :param prev_grads: [batch_size, embed_size] 47 | :return: dw 48 | """ 49 | dW = {} 50 | 51 | for example_idx, feat_id, feat_val in self._last_input.iterate_non_zeros(): 52 | # [1,embed_size] 53 | grad_from_one_example = prev_grads[example_idx, :] * feat_val 54 | 55 | if feat_id in dW: 56 | dW[feat_id] += grad_from_one_example 57 | 58 | else: 59 | dW[feat_id] = grad_from_one_example 60 | 61 | return dW 62 | 63 | 64 | class EmbeddingCombineLayer: 65 | def __init__(self, vocab_infos): 66 | """ 67 | :param vocab_infos: a list of tuple, each tuple is (vocab_name, vocab_size, embed_size) 68 | """ 69 | self._weights = {} # vocab_name ==> weight 70 | for vocab_name, vocab_size, embed_size in vocab_infos: 71 | # TruncatedNormal是TF WDL中embedding_column的默认初始化方式 72 | # These values are similar to values from a `random_normal_initializer` 73 | # except that values more than two standard deviations from the mean are discarded and re-drawn 74 | stddev = 1 / np.sqrt(embed_size) 75 | initializer = TruncatedNormal(mean=0, 76 | stddev=stddev, 77 | lower=-2 * stddev, 78 | upper=2 * stddev) 79 | self._weights[vocab_name] = initializer(shape=[vocab_size, embed_size]) 80 | 81 | # 注意,由于embedding input的稀疏性,一次回代时,不太可能对所有embedding weight有梯度 82 | # 而是只针对某个field的embedding weight中某feature id对应的行有梯度 83 | # _grads_to_embed是一个dict, 84 | # key是"vocab_name@feature_id"的形式,value是一个[embed_size]的ndarray。 85 | # 因为vocab的weight是多个field所共享的,所以value是每个field对vocab_name@feature_id的梯度的叠加 86 | self._grads_to_embed = {} 87 | self._embed_layers = [] 88 | 89 | def add_embedding(self, vocab_name, field_name): 90 | weight = self._weights[vocab_name] 91 | layer = EmbeddingLayer(W=weight, vocab_name=vocab_name, field_name=field_name) 92 | self._embed_layers.append(layer) 93 | 94 | @property 95 | def output_dim(self): 96 | return sum(layer.output_dim for layer in self._embed_layers) 97 | 98 | def forward(self, sparse_inputs): 99 | """ 100 | :param sparse_inputs: dict {field_name: SparseInput} 101 | :return: 每个SparseInput贡献一个embedding vector,返回结果是这些embedding vector的拼接 102 | 拼接顺序由add_embedding的调用顺序决定 103 | """ 104 | embedded_outputs = [] 105 | for embed_layer in self._embed_layers: 106 | sp_input = sparse_inputs[embed_layer.field_name] 107 | embedded_outputs.append(embed_layer.forward(sp_input)) 108 | 109 | # [batch_size, sum of all embed-layer's embed_size] 110 | return np.hstack(embedded_outputs) 111 | 112 | def backward(self, prev_grads): 113 | """ 114 | :param prev_grads: [batch_size, sum of all embed-layer's embed_size] 115 | 上一层传入的, Loss对本层输出的梯度 116 | """ 117 | assert prev_grads.shape[1] == self.output_dim 118 | 119 | # 因为output是每列输出的拼接,自然上一层输入的导数也是各层所需要导数的拼接 120 | # prev_grads_splits是一个数组,存储对应各层的导数 121 | col_sizes = [layer.output_dim for layer in self._embed_layers] 122 | prev_grads_splits = utils.split_column(prev_grads, col_sizes) 123 | 124 | self._grads_to_embed.clear() # reset 125 | for layer, layer_prev_grads in zip(self._embed_layers, prev_grads_splits): 126 | # layer_prev_grads: 上一层传入的,Loss对某个layer的输出的梯度 127 | # layer_grads_to_feat_embed: dict, feat_id==>grads, 128 | # 由这一个layer造成对某vocab的embedding矩阵的某feat_id对应行的梯度 129 | layer_grads_to_embed = 
layer.backward(layer_prev_grads) 130 | 131 | for feat_id, g in layer_grads_to_embed.items(): 132 | # 表示"对某个vocab的embedding weight中的第feat_id行的总导数" 133 | key = "{}@{}".format(layer.vocab_name, feat_id) 134 | 135 | if key in self._grads_to_embed: 136 | self._grads_to_embed[key] += g 137 | else: 138 | self._grads_to_embed[key] = g 139 | 140 | @property 141 | def variables(self): 142 | """ 优化变量 143 | :return: dict from vocab_name to weight matrix 144 | """ 145 | return self._weights 146 | 147 | @property 148 | def grads2var(self): 149 | """ Loss对优化变量的梯度 150 | :return: dict, key是"vocab_name@feature_id"的形式,value是一个[embed_size]的ndarray 151 | """ 152 | return self._grads_to_embed 153 | 154 | @property 155 | def l2reg_loss(self): 156 | return 0 # 出于保持稀疏的考虑,在embedding层暂不支持正则 157 | -------------------------------------------------------------------------------- /ftrl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | class FtrlEstimator: 6 | """ 7 | 每个field对应一个FtrlEstimator。类比于在TensorFlow WDL中,一个feature column对应一个FtrlEstimator 8 | """ 9 | 10 | def __init__(self, alpha, beta, L1, L2): 11 | self._alpha = alpha 12 | self._beta = beta 13 | self._L1 = L1 14 | self._L2 = L2 15 | 16 | self._n = defaultdict(float) # n[i]: i-th feature's squared sum of past gradients 17 | self._z = defaultdict(float) 18 | 19 | # lazy weights, 实际上是一个临时变量,只在: 20 | # 1. 对应的feature value != 0, 并且 21 | # 2. 之前累积的abs(z) > L1 22 | # 两种情况都满足时,w才在feature id对应的位置上存储一个值 23 | # 而且w中数据的存储周期,只在一次前代、回代之间,在新的前代开始之前,就清空上次的w 24 | self._w = {} 25 | 26 | self._current_feat_ids = None 27 | self._current_feat_vals = None 28 | 29 | def predict_logit(self, feature_ids, feature_values): 30 | """ 31 | :param feature_ids: non-zero feature ids for one example 32 | :param feature_values: non-zero feature values for one example 33 | :return: logit for this example 34 | """ 35 | self._current_feat_ids = feature_ids 36 | self._current_feat_vals = feature_values 37 | 38 | logit = 0 39 | self._w.clear() # lazy weights,所以没有必要保留之前的weights 40 | 41 | # 如果当前样本在这个field下所有feature都为0,则feature_ids==feature_values==[] 42 | # 则没有以下循环,logit=0 43 | for feat_id, feat_val in zip(feature_ids, feature_values): 44 | z = self._z[feat_id] 45 | sign_z = -1. if z < 0 else 1. 46 | 47 | # build w on the fly using z and n, hence the name - lazy weights 48 | # this allows us for not storing the complete w 49 | # if abs(z) <= self._L1: self._w[feat_id] = 0. 
# w[i] vanishes due to L1 regularization 50 | if abs(z) > self._L1: 51 | # apply prediction time L1, L2 regularization to z and get w 52 | w = (sign_z * self._L1 - z) / ((self._beta + np.sqrt(self._n[feat_id])) / self._alpha + self._L2) 53 | self._w[feat_id] = w 54 | logit += w * feat_val 55 | 56 | return logit 57 | 58 | def update(self, pred_proba, label): 59 | """ 60 | :param pred_proba: 与last_feat_ids/last_feat_vals对应的预测CTR 61 | 注意pred_proba并不一定等于sigmoid(predict_logit(...)),因为要还要考虑deep侧贡献的logit 62 | :param label: 与last_feat_ids/last_feat_vals对应的true label 63 | """ 64 | grad2logit = pred_proba - label 65 | 66 | # 如果当前样本在这个field下所有feature都为0,则没有以下循环,没有更新 67 | for feat_id, feat_val in zip(self._current_feat_ids, self._current_feat_vals): 68 | g = grad2logit * feat_val 69 | g2 = g * g 70 | n = self._n[feat_id] 71 | 72 | self._z[feat_id] += g 73 | 74 | if feat_id in self._w: # if self._w[feat_id] != 0 75 | sigma = (np.sqrt(n + g2) - np.sqrt(n)) / self._alpha 76 | self._z[feat_id] -= sigma * self._w[feat_id] 77 | 78 | self._n[feat_id] = n + g2 79 | -------------------------------------------------------------------------------- /initialization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats 3 | 4 | 5 | class Zero: 6 | def __call__(self, shape): 7 | return np.zeros(shape) 8 | 9 | 10 | class GlorotUniform: 11 | def __call__(self, shape): 12 | fan_in, fan_out = shape 13 | scale = np.sqrt(6 / (fan_in + fan_out)) 14 | return np.random.uniform(-scale, scale, shape) 15 | 16 | 17 | class GlorotNormal: 18 | def __call__(self, shape): 19 | fan_in, fan_out = shape 20 | stdev = np.sqrt(2 / (fan_out + fan_in)) 21 | return np.random.normal(loc=0, scale=stdev, size=shape) 22 | 23 | 24 | class TruncatedNormal: 25 | def __init__(self, mean, stddev, lower, upper): 26 | self._rand = scipy.stats.truncnorm( 27 | (lower - mean) / stddev, 28 | (upper - mean) / stddev, 29 | loc=mean, 30 | scale=stddev) 31 | 32 | def __call__(self, shape): 33 | return self._rand.rvs(size=shape) 34 | 35 | 36 | _Global_Initializers = {} # initializers which can be shared 37 | 38 | 39 | def get_global_init(name): 40 | if name in _Global_Initializers: 41 | return _Global_Initializers[name] 42 | 43 | if name == "zero": 44 | initializer = Zero() 45 | elif name == "glorot_uniform": 46 | initializer = GlorotUniform() 47 | elif name == "glorot_normal": 48 | initializer = GlorotNormal() 49 | else: 50 | raise ValueError('unknown initializer={}'.format(name)) 51 | 52 | _Global_Initializers[name] = initializer 53 | return initializer 54 | -------------------------------------------------------------------------------- /input_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class DenseInputCombineLayer: 5 | def __init__(self, field_sizes): 6 | # field_sizes: a list of tuple 7 | # tuple[0]: field name 8 | # tuple[1]: input dim for this field 9 | self._field_sizes = field_sizes 10 | 11 | @property 12 | def output_dim(self): 13 | return sum(in_dim for _, in_dim in self._field_sizes) 14 | 15 | def forward(self, inputs): 16 | """ 17 | 按field_sizes的顺序从inputs提取ndarray,并拼接起来 18 | :param inputs: dict of {field_name: ndarray} 19 | """ 20 | outputs = [] 21 | for field_name, in_dim in self._field_sizes: 22 | a_input = np.asarray(inputs[field_name]) 23 | assert in_dim == a_input.shape[1] 24 | outputs.append(a_input) 25 | return np.hstack(outputs) 26 | 27 | 28 | class SparseInput: 29 | """ 30 | 
How to represent a sparse input took quite a bit of thought.
31 |     TensorFlow uses two SparseTensors, sp_ids and sp_weights, but the indices and dense_shape of those two tensors must be identical, which is redundant.
32 | 
33 |     I then considered representing each non-zero feature as KVPair = namedtuple('KVPair', ['idx_in_batch', 'id', 'value']),
34 |     so the whole sparse input becomes a list of KVPair. That is much more convenient to work with, but every KVPair is a namedtuple, which creates too many small objects and puts pressure on the GC.
35 | 
36 |     I also considered a single [n_nonzero, 3] ndarray,
37 |     with column 0 holding idx_in_batch (the row number),
38 |     column 1 holding the feature id,
39 |     and column 2 holding the value.
40 |     But an ndarray has only one dtype, so to hold the values the whole array must be float; handling integers such as row numbers and ids then becomes both inconvenient and wasteful.
41 | 
42 |     The current choice is three lists representing a sparse input whose conceptual dense shape is [batch_size, max_bag_size].
43 |     max_bag_size is purely conceptual and can be thought of as infinity; it never appears in the code and imposes no restriction on it.
44 |     For a user's behavior history, for instance, max_bag_size could be the number of articles read or items bought within some time window;
45 |     for a user's phone-usage habits, it could be the total number of apps.
46 |     Here such information is represented as a bag rather than a sequence, i.e., the temporal order is ignored.
47 | 
48 |     The first list, example_indices: an [n_non_zeros] integer array holding the row number (example index) within [batch_size, max_bag_size], >= 0 and < batch_size,
49 |     and its values are required to be sorted in ascending order.
50 |     The second list, feature_ids: an [n_non_zeros] integer array holding the feature id of each non-zero; duplicates are allowed.
51 |     The third list, feature_values: an [n_non_zeros] float array holding the value of each non-zero.
52 |     For example, the i-th non-zero (0 <= i < n_non_zeros) belongs to example example_indices[i], has feature id feature_ids[i] and feature value feature_values[i].
53 |     """
54 | 
55 |     def __init__(self, example_indices, feature_ids, feature_values, n_total_examples):
56 |         self.n_total_examples = n_total_examples
57 |         # three parallel lists of non-zeros; example_indices must already be sorted in ascending order
58 |         self._example_indices = example_indices
59 |         self._feature_ids = feature_ids
60 |         self._feature_values = feature_values
61 |         # read cursor used by get_example_in_order
62 |         self.__nnz_idx = 0
63 | 
64 |     def add(self, example_idx, feat_id, feat_val):
65 |         self._example_indices.append(example_idx)
66 |         self._feature_ids.append(feat_id)
67 |         self._feature_values.append(feat_val)
68 | 
69 |     def iterate_non_zeros(self):
70 |         """iterate over all non-zeros as (example_idx, feat_id, feat_val) triples"""
71 |         return zip(self._example_indices, self._feature_ids, self._feature_values)
72 | 
73 |     def __move_to_next_example(self, nnz_idx):
74 |         """
75 |         :param nnz_idx: position (into the three lists) of the first non-zero
76 |                         belonging to the current example
77 |         :return: None if nnz_idx is already past the last non-zero, otherwise a tuple of
78 |                  (position of the next example's first non-zero, current feat_ids, current feat_vals)
79 |         """
80 |         if nnz_idx >= len(self._example_indices):
81 |             return None
82 | 
83 |         end = nnz_idx + 1
84 |         while end < len(self._example_indices) and self._example_indices[end] == self._example_indices[nnz_idx]:
85 |             end += 1
86 | 
87 |         current_feat_ids = self._feature_ids[nnz_idx:end]
88 |         current_feat_vals = self._feature_values[nnz_idx:end]
89 | 
90 |         return end, current_feat_ids, current_feat_vals
91 | 
92 |     # def iterate_example(self):
93 |     #     nnz_idx = 0
94 |     #     for example_idx in range(self.batch_size):
95 |     # 
96 |     #         if (nnz_idx >= len(self.example_indices)) or (self.example_indices[nnz_idx] != example_idx):
97 |     #             yield example_idx, None, None
98 |     # 
99 |     #         else:
100 |     #             nnz_idx, current_feat_ids, current_feat_vals = self.__move_to_next_example(nnz_idx)
101 |     #             yield example_idx, current_feat_ids, current_feat_vals
102 | 
103 |     def get_example_in_order(self, example_idx):
104 |         """
105 |         :param example_idx: precondition: example_idx must be passed in order, from 0 up to batch_size
106 |         :return: the feat_ids and feat_vals corresponding to example_idx
107 |         """
108 |         if self.__nnz_idx >= len(self._example_indices):
109 |             return [], []
110 | 
111 |         elif self._example_indices[self.__nnz_idx] == example_idx:
112 |             self.__nnz_idx, feat_ids, feat_vals = self.__move_to_next_example(self.__nnz_idx)
113 |             return feat_ids, feat_vals
114 | 
115 |         elif self._example_indices[self.__nnz_idx] > example_idx:
116 |             # wait for the caller to pass in a larger example_idx next time
117 |             return [], []
118 | 
119 |         else:
120 |             # if the current example_index is not the example_idx the caller asked for,
121 |             # it should only ever be larger than the requested one, in which case we simply wait for a larger example_idx;
122 |             # if it is smaller than the requested one, the invocation pattern is wrong
123 |             raise ValueError("incorrect invocation")
124 | 
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def tied_rank(x):
5 |     """
6 |     Computes the tied rank of elements in x.
7 |     This function computes the tied rank of elements in x. 
8 | Parameters 9 | ---------- 10 | x : list of numbers, numpy array 11 | Returns 12 | ------- 13 | score : list of numbers 14 | The tied rank f each element in x 15 | """ 16 | sorted_x = sorted(zip(x, range(len(x)))) 17 | r = [0 for k in x] 18 | cur_val = sorted_x[0][0] 19 | last_rank = 0 20 | for i in range(len(sorted_x)): 21 | if cur_val != sorted_x[i][0]: 22 | cur_val = sorted_x[i][0] 23 | for j in range(last_rank, i): 24 | r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0 25 | last_rank = i 26 | if i == len(sorted_x) - 1: 27 | for j in range(last_rank, i + 1): 28 | r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0 29 | return r 30 | 31 | 32 | def auc(y_true, y_score): 33 | """ 34 | copy from: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 35 | 36 | Computes the area under the receiver-operater characteristic (AUC) 37 | This function computes the AUC error metric for binary classification. 38 | Parameters 39 | ---------- 40 | y_true : list of binary numbers, numpy array 41 | The ground truth value 42 | y_score : same type as actual 43 | Defines a ranking on the binary numbers, from most likely to 44 | be positive to least likely to be positive. 45 | Returns 46 | ------- 47 | score : double 48 | The mean squared error between actual and posterior 49 | """ 50 | r = tied_rank(y_score) 51 | num_positive = len([0 for x in y_true if x == 1]) 52 | num_negative = len(y_true) - num_positive 53 | sum_positive = sum([r[i] for i in range(len(r)) if y_true[i] == 1]) 54 | auc = ((sum_positive - num_positive * (num_positive + 1) / 2.0) / 55 | (num_negative * num_positive)) 56 | return auc 57 | 58 | 59 | def logloss(y_true, y_pred, normalize=True): 60 | loss_array = -y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred) 61 | if normalize: 62 | return np.mean(loss_array) 63 | else: 64 | return np.sum(loss_array) 65 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Adagrad: 5 | def __init__(self, lr): 6 | self._lr = lr 7 | # variable name => sum of gradient square (also a vector) 8 | self._sum_grad2 = {} 9 | 10 | def update(self, variables, gradients): 11 | for gradname, gradient in gradients.items(): 12 | # ------ update cache 13 | g2 = gradient * gradient 14 | if gradname in self._sum_grad2: 15 | self._sum_grad2[gradname] += g2 16 | else: 17 | self._sum_grad2[gradname] = g2 18 | 19 | # ------ calculate delta 20 | delta = self._lr * gradient / (np.sqrt(self._sum_grad2[gradname]) + 1e-6) 21 | 22 | # ------ update 23 | if '@' in gradname: 24 | # 对应着稀疏输入的权重与梯度,gradients中的key遵循着'vocab_name@feat_id'的格式 25 | varname, row = gradname.split('@') 26 | row = int(row) 27 | 28 | variable = variables[varname] 29 | variable[row, :] -= delta 30 | else: 31 | variable = variables[gradname] 32 | variable -= delta 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /test_dense_layer.py: -------------------------------------------------------------------------------- 1 | from dense_layer import DenseLayer 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | 6 | def calc_numeric_grads(variable, epsilon, loss_fn): 7 | numeric_grad = np.zeros_like(variable) 8 | 9 | if len(variable.shape) == 2: 10 | pbar = tqdm(total=variable.shape[0] * variable.shape[1]) 11 | for r in range(variable.shape[0]): 12 | for c in range(variable.shape[1]): 13 | variable[r, c] -= 
epsilon 14 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 15 | neg_loss = loss_fn() 16 | 17 | variable[r, c] += 2 * epsilon 18 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 19 | pos_loss = loss_fn() 20 | 21 | numeric_grad[r, c] = (pos_loss - neg_loss) / (2 * epsilon) 22 | 23 | variable[r, c] -= epsilon # restore to original 24 | pbar.update(1) 25 | 26 | elif len(variable.shape) == 1: 27 | pbar = tqdm(total=variable.shape[0]) 28 | for r in range(variable.shape[0]): 29 | variable[r] -= epsilon 30 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 31 | neg_loss = loss_fn() 32 | 33 | variable[r] += 2 * epsilon 34 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 35 | pos_loss = loss_fn() 36 | 37 | numeric_grad[r] = (pos_loss - neg_loss) / (2 * epsilon) 38 | 39 | variable[r] -= epsilon # restore to original 40 | pbar.update(1) 41 | 42 | else: 43 | raise ValueError('unsupported shape') 44 | 45 | return numeric_grad 46 | 47 | 48 | def test_dense_fc_layer(): 49 | batch_size = 3 50 | fan_in = 4 51 | fan_out = 2 52 | epsilon = 1e-6 53 | 54 | # ---------- forward 55 | layer = DenseLayer(name='test', shape=[fan_in,fan_out], l2reg=0.01) 56 | X = np.random.randn(batch_size, fan_in) 57 | y = layer.forward(X) 58 | assert y.shape == (batch_size, fan_out) 59 | 60 | # ---------- backward 61 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵,得到的derived_grads就是本层自身的gradients 62 | dX = layer.backward(prev_grads=np.ones((batch_size, fan_out))) 63 | 64 | # ---------- test grads on W 65 | var_grads = [('W', layer._W, layer._dW), ('b', layer._b, layer._db), ('input', X, dX)] 66 | for name, variable, grad in var_grads: 67 | print("\n************* checking numerical gradients on '{}', ......".format(name)) 68 | numeric_grad = calc_numeric_grads(variable=variable, 69 | epsilon=epsilon, 70 | loss_fn=lambda: np.sum(layer.forward(X)) + layer.l2reg_loss) 71 | 72 | print("========== derived gradients = \n{}".format(grad)) 73 | print("========== numeric gradients = \n{}".format(numeric_grad)) 74 | is_equal = np.allclose(grad, numeric_grad) 75 | assert is_equal 76 | print("Equal = {}".format(is_equal)) 77 | 78 | 79 | if __name__ == "__main__": 80 | # np.random.seed(999) 81 | 82 | test_dense_fc_layer() 83 | -------------------------------------------------------------------------------- /test_embed_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from embedding_layer import EmbeddingLayer 3 | from input_layer import SparseInput 4 | 5 | 6 | def test_embedding_forward(): 7 | layer = EmbeddingLayer(W=np.arange(12).reshape(4, 3), vocab_name=None, field_name=None) 8 | 9 | X = SparseInput(example_indices=[2, 1, 2], 10 | feature_ids=[2, 3, 2], 11 | feature_values=[1, 2, 2], 12 | n_total_examples=5) 13 | 14 | output = layer.forward(X) 15 | print(output) 16 | 17 | 18 | def test_embedding_backward(): 19 | layer = EmbeddingLayer(W=np.random.randn(4, 3), vocab_name=None, field_name=None) 20 | 21 | X = SparseInput(example_indices=[1, 1, 2, 3, 3, 3], 22 | feature_ids=[0, 3, 1, 2, 1, 0], 23 | feature_values=[1, 2, 2, 1, 1, 2], 24 | n_total_examples=5) 25 | output = layer.forward(X) 26 | 27 | grads2W = layer.backward(np.ones((X.n_total_examples, 3))) 28 | print("========== derived gradients = \n{}".format(grads2W)) 29 | 30 | # ----------- calculate numeric gradients 31 | epsilon = 1e-6 32 | variable = layer._W 33 | numeric_grads = np.zeros_like(variable) 34 | 35 | for r in range(variable.shape[0]): 36 | for c in range(variable.shape[1]): 37 | variable[r, c] -= epsilon 38 | neg_delta_loss = np.sum(layer.forward(X)) 39 | 40 
| variable[r, c] += 2 * epsilon 41 | pos_delta_loss = np.sum(layer.forward(X)) 42 | 43 | numeric_grads[r, c] = (pos_delta_loss - neg_delta_loss) / (2 * epsilon) 44 | 45 | variable[r, c] -= epsilon # restore to original 46 | 47 | print("========== numeric gradients = \n{}".format(numeric_grads)) 48 | 49 | 50 | if __name__ == "__main__": 51 | np.random.seed(999) 52 | 53 | # test_embedding_forward() 54 | test_embedding_backward() 55 | -------------------------------------------------------------------------------- /test_input_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from input_layer import SparseInput 3 | 4 | 5 | # def test_iterate_example_from_sparse_input(example_indices, batch_size): 6 | # sp_input = SparseInput(example_indices=example_indices, 7 | # feature_ids=example_indices, 8 | # feature_values=example_indices, 9 | # batch_size=batch_size) 10 | # 11 | # # for example_idx, feat_ids, feat_values in sp_input.iterate_example(): 12 | # # print("\n**************** {}-th example: ".format(example_idx)) 13 | # # print("feature ids: {}".format(feat_ids)) 14 | # # print("feature values: {}".format(feat_values)) 15 | # 16 | # iterator = sp_input.iterate_example() 17 | # while True: 18 | # try: 19 | # example_idx, feat_ids, feat_values = next(iterator) 20 | # print("\n**************** {}-th example: ".format(example_idx)) 21 | # print("feature ids: {}".format(feat_ids)) 22 | # print("feature values: {}".format(feat_values)) 23 | # except StopIteration: 24 | # break 25 | 26 | 27 | def test_get_example_in_order_from_sparse(example_indices, batch_size): 28 | sp_input = SparseInput(example_indices=example_indices, 29 | feature_ids=example_indices, 30 | feature_values=example_indices, 31 | n_total_examples=batch_size) 32 | 33 | for example_idx in range(batch_size): 34 | feat_ids, feat_vals = sp_input.get_example_in_order(example_idx) 35 | print("\n**************** {}-th example: ".format(example_idx)) 36 | print("feature ids: {}".format(feat_ids)) 37 | print("feature values: {}".format(feat_vals)) 38 | 39 | 40 | if __name__ == "__main__": 41 | test_get_example_in_order_from_sparse(example_indices=[1, 1, 1, 3, 4, 6],batch_size=10) 42 | 43 | # test_get_example_in_order_from_sparse(example_indices=[1, 1, 1, 3, 4, 6],batch_size=3) 44 | 45 | # test_get_example_in_order_from_sparse(example_indices=[0, 1, 1, 1, 3, 4, 7],batch_size=9) 46 | 47 | # test_get_example_in_order_from_sparse(example_indices=[], batch_size=9) 48 | -------------------------------------------------------------------------------- /test_others.py: -------------------------------------------------------------------------------- 1 | import initialization 2 | import numpy as np 3 | import activation 4 | from tqdm import tqdm 5 | import bisect 6 | import utils 7 | 8 | 9 | def numerical_gradient(variable, loss_fn, epsilon): 10 | # gradients must have the same shape as variable 11 | numeric_grad = np.zeros_like(variable) 12 | 13 | pbar = tqdm(total=variable.shape[0] * variable.shape[1]) 14 | for r in range(variable.shape[0]): 15 | for c in range(variable.shape[1]): 16 | variable[r, c] -= epsilon 17 | neg_loss = loss_fn(variable) 18 | 19 | variable[r, c] += 2 * epsilon 20 | pos_loss = loss_fn(variable) 21 | 22 | numeric_grad[r, c] = (pos_loss - neg_loss) / (2 * epsilon) 23 | 24 | variable[r, c] -= epsilon # restore to original 25 | pbar.update(1) 26 | 27 | return numeric_grad 28 | 29 | 30 | def check_activation(layer): 31 | # ---------- forward and backward 32 | X = 
np.random.randn(3, 4)
33 |     _ = layer.forward(X)
34 |     # use np.sum as the final loss, so that prev_grads is an all-ones matrix and derived_grads is exactly this layer's own gradients
35 |     derived_grads = layer.backward(prev_grads=np.ones_like(X))
36 | 
37 |     # ---------- calculate numeric gradients
38 |     epsilon = 1e-6
39 |     numeric_grads = numerical_gradient(variable=X,
40 |                                        loss_fn=lambda x: np.sum(layer.forward(x)),
41 |                                        epsilon=epsilon)
42 | 
43 |     # ---------- display
44 |     print("========== derived gradients = \n{}".format(derived_grads))
45 |     print("========== numeric gradients = \n{}".format(numeric_grads))
46 | 
47 |     # ---------- check
48 |     is_equal = np.allclose(numeric_grads, derived_grads)
49 |     assert is_equal
50 |     print("Equal = {}".format(is_equal))
51 | 
52 | 
53 | def test_activations():
54 |     check_activation(activation.Sigmoid())
55 |     check_activation(activation.ReLU())
56 | 
57 | 
58 | def test_initializer():
59 |     # ---------------- GlorotUniform
60 |     init_glorot_uniform = initialization.get_global_init('glorot_uniform')
61 |     w = init_glorot_uniform([2, 3])
62 |     print("\nGlorotUniform")
63 |     print(w)
64 |     print(w.sum())
65 | 
66 |     # ---------------- GlorotNormal
67 |     init_glorot_normal = initialization.get_global_init('glorot_normal')
68 |     w = init_glorot_normal([20, 50])
69 |     print("\nGlorotNormal")
70 |     # print(w)
71 |     print(w.mean())
72 | 
73 | 
74 | # def test_bce_loss_with_logits():
75 | #     bce_layer = BinaryCrossEntropy4Logits()
76 | # 
77 | #     # must convert to float; otherwise, with an in-place modification like logits[idx] -= epsilon, since logits itself is still integer,
78 | #     # logits[idx] would remain an integer after the change, e.g. 1 - 1e-6 would be forced back to 0
79 | #     logits = np.asarray([1, 2, 3], dtype=float)
80 | #     labels = [1, 1, 0]
81 | # 
82 | #     loss = bce_layer.forward(logits=logits, labels=labels)
83 | #     print("loss={}".format(loss))
84 | # 
85 | #     grad2logits = bce_layer.backward()
86 | #     print(" gradients to logits = {}".format(grad2logits))
87 | # 
88 | #     # --------- numeric loss
89 | #     epsilon = 1e-6
90 | #     numeric_grads = np.zeros_like(logits)
91 | # 
92 | #     for idx in range(len(logits)):
93 | #         logits[idx] -= epsilon
94 | #         neg_loss = bce_layer.forward(logits=logits, labels=labels)
95 | # 
96 | #         logits[idx] += 2 * epsilon
97 | #         pos_loss = bce_layer.forward(logits=logits, labels=labels)
98 | # 
99 | #         numeric_grads[idx] = (pos_loss - neg_loss) / (2 * epsilon)
100 | # 
101 | #         logits[idx] -= epsilon  # restore to original
102 | # 
103 | #     print("numerical gradients to logits = {}".format(numeric_grads))
104 | # 
105 | #     # --------- check
106 | #     is_equal = np.allclose(grad2logits, len(logits) * numeric_grads)
107 | #     assert is_equal
108 | #     print("Equal = {}".format(is_equal))
109 | 
110 | 
111 | def test_bucket_by_bisect():
112 |     age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
113 | 
114 |     for age in [5, 18, 31, 55, 42, 67]:
115 |         idx = bisect.bisect(age_boundaries, age)
116 | 
117 |         if idx == 0:
118 |             lb = '-inf'
119 |         else:
120 |             lb = age_boundaries[idx - 1]
121 | 
122 |         if idx == len(age_boundaries):
123 |             hb = 'inf'
124 |         else:
125 |             hb = age_boundaries[idx]
126 | 
127 |         print("{}<={}<{}".format(lb, age, hb))
128 | 
129 | 
130 | def test_split_columns():
131 |     a = np.arange(12).reshape(2, 6)
132 |     print(a)
133 | 
134 |     splits = utils.split_column(a, [2, 1, 3])
135 |     for idx, split in enumerate(splits, start=1):
136 |         print("\n--------- {}th split".format(idx))
137 |         print(split)
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     np.random.seed(999)
142 | 
143 |     # test_activations()
144 |     # test_initializer()
145 |     # test_bce_loss_with_logits()
146 |     # test_bucket_by_bisect()
147 |     test_split_columns()
148 | 
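149 | 
150 | 
151 | def check_grad_of_bce_wrt_logit():
152 |     # Illustrative sketch (a hypothetical helper, not referenced anywhere else in this repo):
153 |     # the estimators backpropagate grads2logits = pred_proba - label (see dnn.py and ftrl.py).
154 |     # This numerically checks that the derivative of the binary cross-entropy of sigmoid(logit)
155 |     # with respect to the logit is indeed sigmoid(logit) - label.
156 |     epsilon = 1e-6
157 |     logit, label = 0.7, 1.0
158 | 
159 |     def bce(z):
160 |         p = 1 / (1 + np.exp(-z))
161 |         return -label * np.log(p) - (1 - label) * np.log(1 - p)
162 | 
163 |     numeric_grad = (bce(logit + epsilon) - bce(logit - epsilon)) / (2 * epsilon)
164 |     analytic_grad = 1 / (1 + np.exp(-logit)) - label
165 |     print("numeric={:.6f} analytic={:.6f}".format(numeric_grad, analytic_grad))
166 |     assert np.allclose(numeric_grad, analytic_grad)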
-------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def chunk(stream, chunk_size): 5 | buf = [] 6 | 7 | for item in stream: 8 | buf.append(item) 9 | 10 | if len(buf) >= chunk_size: 11 | yield buf 12 | del buf[:] 13 | 14 | if len(buf) > 0: 15 | yield buf 16 | 17 | 18 | def split_column(m, col_sizes): 19 | offset = 0 20 | splits = [] 21 | 22 | for colsize in col_sizes: 23 | splits.append(m[:, offset:(offset + colsize)]) 24 | offset += colsize 25 | 26 | assert offset == m.shape[1] 27 | return splits 28 | 29 | 30 | def config_logging(fname): 31 | logging.basicConfig(level=logging.INFO, format='%(message)s') # re-format to remove prefix 'INFO:root' 32 | 33 | fh = logging.FileHandler(fname) 34 | fh.setLevel(logging.INFO) 35 | logging.getLogger("").addHandler(fh) 36 | -------------------------------------------------------------------------------- /wide_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ftrl import FtrlEstimator 3 | from base_estimator import BaseEstimator 4 | from collections import namedtuple 5 | 6 | 7 | class WideLayer: 8 | def __init__(self, field_names, alpha, beta, L1, L2, proba_fn): 9 | """ 10 | :param proba_fn: proba_fn(example_idx,logit)=probability 11 | 之所以用function是因为如果与DNN结合,计算probability还要考虑DNN提供的logit 12 | """ 13 | self._estimators = {field: FtrlEstimator(alpha=alpha, 14 | beta=beta, 15 | L1=L1, 16 | L2=L2) for field in (['bias'] + field_names)} 17 | self._proba_fn = proba_fn 18 | 19 | def __predict_logit(self, sp_features, example_idx): 20 | logit = 0 21 | 22 | for field, estimator in self._estimators.items(): 23 | if field == 'bias': 24 | feat_ids = [0] 25 | feat_vals = [1] 26 | else: 27 | sp_input = sp_features[field] 28 | feat_ids, feat_vals = sp_input.get_example_in_order(example_idx) 29 | 30 | logit += estimator.predict_logit(feature_ids=feat_ids, feature_values=feat_vals) 31 | 32 | return logit 33 | 34 | def train(self, sp_features, labels): 35 | """ 36 | :param sp_features: dict from field_name ==> SparseInput 37 | :return: probabilities from this train batch 38 | """ 39 | probas = [] 40 | for example_idx, label in enumerate(labels): 41 | logit = self.__predict_logit(sp_features, example_idx) 42 | 43 | pred_proba = self._proba_fn(example_idx, logit) 44 | probas.append(pred_proba) 45 | 46 | for _, estimator in self._estimators.items(): 47 | estimator.update(pred_proba=pred_proba, label=label) 48 | 49 | return np.asarray(probas) 50 | 51 | def predict_logits(self, sp_features): 52 | # 假定所有sp_feature都拥有相同的行数 53 | batch_size = None 54 | for sp_input in sp_features.values(): 55 | batch_size = sp_input.n_total_examples 56 | break 57 | 58 | logits = [self.__predict_logit(sp_features, example_idx) for example_idx in range(batch_size)] 59 | return np.asarray(logits) 60 | 61 | 62 | WideHParams = namedtuple("WideHParams", ['field_names', 'alpha', 'beta', 'L1', 'L2']) 63 | 64 | 65 | def _sigmoid(example_idx, logit): 66 | return 1 / (1 + np.exp(-logit)) 67 | 68 | 69 | class WideEstimator(BaseEstimator): 70 | def __init__(self, hparams, data_source): 71 | self._layer = WideLayer(field_names=hparams.field_names, 72 | alpha=hparams.alpha, 73 | beta=hparams.beta, 74 | L1=hparams.L1, 75 | L2=hparams.L2, 76 | proba_fn=_sigmoid) 77 | super().__init__(data_source) 78 | 79 | def train_batch(self, features, labels): 80 | return 
self._layer.train(sp_features=features, labels=labels) 81 | 82 | def predict(self, features): 83 | pred_logits = self._layer.predict_logits(sp_features=features) 84 | return 1 / (1 + np.exp(-pred_logits)) 85 | -------------------------------------------------------------------------------- /wide_n_deep.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dnn import DeepNetwork 3 | from wide_layer import WideLayer 4 | from base_estimator import BaseEstimator 5 | 6 | 7 | class WideDeepEstimator(BaseEstimator): 8 | def __init__(self, wide_hparams, deep_hparams, data_source): 9 | self._current_deep_logits = None 10 | 11 | self._wide_layer = WideLayer(field_names=wide_hparams.field_names, 12 | alpha=wide_hparams.alpha, 13 | beta=wide_hparams.beta, 14 | L1=wide_hparams.L1, 15 | L2=wide_hparams.L2, 16 | proba_fn=self._predict_proba) 17 | 18 | self._dnn = DeepNetwork(dense_fields=deep_hparams.dense_fields, 19 | vocab_infos=deep_hparams.vocab_infos, 20 | embed_fields=deep_hparams.embed_fields, 21 | hidden_units=deep_hparams.hidden_units, 22 | L2=deep_hparams.L2, 23 | optimizer=deep_hparams.optimizer) 24 | 25 | super().__init__(data_source) 26 | 27 | def _predict_proba(self, example_idx, wide_logit): 28 | deep_logit = self._current_deep_logits[example_idx] 29 | logit = deep_logit + wide_logit 30 | return 1 / (1 + np.exp(-logit)) 31 | 32 | def train_batch(self, features, labels): 33 | self._current_deep_logits = self._dnn.forward(features) 34 | 35 | pred_probas = self._wide_layer.train(features, labels) 36 | 37 | self._dnn.backward(grads2logits=pred_probas - labels) 38 | 39 | return pred_probas 40 | 41 | def predict(self, features): 42 | deep_logits = self._dnn.forward(features) 43 | 44 | wide_logits = self._wide_layer.predict_logits(features) 45 | 46 | logits = deep_logits + wide_logits 47 | 48 | return 1 / (1 + np.exp(-logits)) 49 | --------------------------------------------------------------------------------
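A standalone sketch (not part of the repository) of the lazy-weight trick used by FtrlEstimator.predict_logit in ftrl.py: the wide-side weights are rebuilt on the fly from the accumulators z and n, and any feature whose accumulated |z| <= L1 keeps a weight of exactly 0, which is what keeps the wide side sparse. The hyper-parameters mirror WideHParams in census_main.py; the accumulator values below are made up purely for illustration.

import numpy as np

alpha, beta, L1, L2 = 0.1, 1.0, 0.1, 0.1       # same hyper-parameters as WideHParams in census_main.py
z = {3: 0.8, 7: 0.05}                          # made-up accumulated (adjusted) gradients per feature id
n = {3: 2.5, 7: 0.3}                           # made-up accumulated squared gradients per feature id

logit = 0.0
for feat_id, feat_val in [(3, 1.0), (7, 1.0)]:  # one example's non-zero features in this field
    zi = z[feat_id]
    if abs(zi) > L1:                            # otherwise the lazy weight is exactly 0
        sign_z = -1.0 if zi < 0 else 1.0
        w = (sign_z * L1 - zi) / ((beta + np.sqrt(n[feat_id])) / alpha + L2)
        logit += w * feat_val

print("logit={:.4f}, proba={:.4f}".format(logit, 1 / (1 + np.exp(-logit))))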