├── .gitignore
├── README.md
├── activation.py
├── base_estimator.py
├── census_data.py
├── census_main.py
├── dataset
│   ├── raw_test.txt
│   ├── raw_train.txt
│   ├── test.csv
│   └── train.csv
├── dense_layer.py
├── dnn.py
├── embedding_layer.py
├── ftrl.py
├── initialization.py
├── input_layer.py
├── metrics.py
├── optimization.py
├── test_dense_layer.py
├── test_embed_layer.py
├── test_input_layer.py
├── test_others.py
├── utils.py
├── wide_layer.py
└── wide_n_deep.py

/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | *.pyc
3 | .idea
4 | learn_curve_*.csv
5 | log_*.log
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NumpyWDL
2 | Implement the Wide & Deep algorithm using NumPy
3 | 
4 | A hand-written, NumPy-only implementation of Wide & Deep
5 | 
6 | To run it:
7 | 
8 | python census_main.py -e wide_n_deep -n 20
9 | 
10 | python census_main.py -e wide -n 20
11 | 
12 | python census_main.py -e deep -n 20
13 | 
14 | 
--------------------------------------------------------------------------------
/activation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class Sigmoid:
5 |     def __init__(self):
6 |         self._last_forward_result = None
7 | 
8 |     def forward(self, X):
9 |         """
10 |         element-wise sigmoid
11 |         :param X: [batch_size, #neuron]
12 |         :return: same shape as X
13 |         """
14 |         self._last_forward_result = 1.0 / (1.0 + np.exp(-X))
15 |         return self._last_forward_result
16 | 
17 |     def backward(self, prev_grads):
18 |         """
19 |         :param prev_grads: gradients from loss to "last forward result"
20 |                            must have the same shape as 'last forward result'
21 |         :return: gradients from loss to X, has same shape as X
22 |         """
23 |         assert prev_grads.shape == self._last_forward_result.shape
24 | 
25 |         return prev_grads * self._last_forward_result * (1 - self._last_forward_result)
26 | 
27 | 
28 | class ReLU:
29 |     def __init__(self):
30 |         self._last_input = None
31 | 
32 |     def forward(self, X):
33 |         self._last_input = X
34 |         return np.maximum(0, X)
35 | 
36 |     def backward(self, prev_grads):
37 |         assert prev_grads.shape == self._last_input.shape
38 | 
39 |         local_grads = np.zeros_like(self._last_input)
40 |         local_grads[self._last_input > 0] = 1.0
41 | 
42 |         return prev_grads * local_grads
43 | 
--------------------------------------------------------------------------------
/base_estimator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | # from sklearn.metrics import roc_auc_score, log_loss
3 | from metrics import logloss as log_loss, auc as roc_auc_score
4 | from tqdm import tqdm
5 | import logging
6 | 
7 | 
8 | class BaseEstimator:
9 | 
10 |     def __init__(self, data_source):
11 |         self._data_source = data_source
12 | 
13 |     def get_metrics(self, scores, labels, prefix):
14 |         scores = np.asarray(scores)
15 |         labels = np.asarray(labels)
16 | 
17 |         metrics = {'{}_logloss'.format(prefix): log_loss(y_true=labels, y_pred=scores),
18 |                    '{}_auc'.format(prefix): roc_auc_score(y_true=labels, y_score=scores)}
19 | 
20 |         pred_labels = (scores > 0.5).astype(int)
21 |         metrics['{}_accuracy'.format(prefix)] = np.sum(pred_labels == labels) / len(labels)
22 | 
23 |         return metrics
24 | 
25 |     def train_batch(self, features, labels):
26 |         """
27 |         :param features: dict, field_name ==> dense matrix or SparseInput
28 |         :param labels: [batch_size] ndarray
29 |         :return: [batch_size] ndarray of predicted 
probabilities in that batch 30 | """ 31 | raise NotImplementedError() 32 | 33 | def predict(self, features): 34 | """ 35 | :param features: dict, field_name ==> dense matrix or SparseInput 36 | :return: [batch_size] ndarray of predicted probabilities in that batch 37 | """ 38 | raise NotImplementedError() 39 | 40 | def _train_epoch(self): 41 | scores = [] 42 | labels = [] 43 | 44 | batch_stream = self._data_source.train_batches_per_epoch() 45 | for batch_features, batch_labels in tqdm(batch_stream): 46 | pred_probas = self.train_batch(batch_features, batch_labels) 47 | 48 | scores.extend(pred_probas) 49 | labels.extend(batch_labels) 50 | 51 | return self.get_metrics(scores=scores, labels=labels, prefix='train') 52 | 53 | def _eval_epoch(self): 54 | scores = [] 55 | labels = [] 56 | 57 | batch_stream = self._data_source.test_batches_per_epoch() 58 | for batch_features, batch_labels in tqdm(batch_stream): 59 | pred_probas = self.predict(batch_features) 60 | 61 | scores.extend(pred_probas) 62 | labels.extend(batch_labels) 63 | 64 | return self.get_metrics(scores=scores, labels=labels, prefix='test') 65 | 66 | def train(self, n_epochs): 67 | metrics_history = [] 68 | for epoch_idx in range(n_epochs): 69 | logging.info("\n=============== {}-th EPOCH".format(epoch_idx + 1)) 70 | 71 | metrics = {} 72 | metrics.update(self._train_epoch()) 73 | metrics.update(self._eval_epoch()) 74 | 75 | logging.info(metrics) 76 | metrics_history.append(metrics) 77 | 78 | return metrics_history 79 | -------------------------------------------------------------------------------- /census_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | import numpy as np 4 | import random 5 | import utils 6 | from input_layer import SparseInput 7 | import bisect 8 | import argparse 9 | from tqdm import tqdm 10 | 11 | VOCAB_LISTS = { 12 | 'education': ['Bachelors', 13 | 'HS-grad', 14 | '11th', 15 | 'Masters', 16 | '9th', 17 | 'Some-college', 18 | 'Assoc-acdm', 19 | 'Assoc-voc', 20 | '7th-8th', 21 | 'Doctorate', 22 | 'Prof-school', 23 | '5th-6th', 24 | '10th', 25 | '1st-4th', 26 | 'Preschool', 27 | '12th'], 28 | 29 | 'marital_status': ['Married-civ-spouse', 30 | 'Divorced', 31 | 'Married-spouse-absent', 32 | 'Never-married', 33 | 'Separated', 34 | 'Married-AF-spouse', 35 | 'Widowed'], 36 | 37 | 'relationship': ['Husband', 38 | 'Not-in-family', 39 | 'Wife', 40 | 'Own-child', 41 | 'Unmarried', 42 | 'Other-relative'], 43 | 44 | 'workclass': ['Self-emp-not-inc', 45 | 'Private', 46 | 'State-gov', 47 | 'Federal-gov', 48 | 'Local-gov', 49 | 'Self-emp-inc', 50 | 'Without-pay', 51 | 'Never-worked'], 52 | 53 | 'occupation': ['Tech-support', 54 | 'Craft-repair', 55 | 'Other-service', 56 | 'Sales', 57 | 'Exec-managerial', 58 | 'Prof-specialty', 59 | 'Handlers-cleaners', 60 | 'Machine-op-inspct', 61 | 'Adm-clerical', 62 | 'Farming-fishing', 63 | 'Transport-moving', 64 | 'Priv-house-serv', 65 | 'Protective-serv', 66 | 'Armed-Forces'] 67 | } 68 | 69 | VOCAB_MAPPINGS = {field: {featname: idx for idx, featname in enumerate(featnames)} for field, featnames in 70 | VOCAB_LISTS.items()} 71 | 72 | AGE_BOUNDARIES = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65] 73 | 74 | DENSE_FIELDS = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week'] 75 | 76 | DENSE_LOG_MEAN_STD = {'age': (3.6183599219864133, 0.35003117354646957), 77 | 'education_num': (2.372506496597371, 0.27381608590073075), 78 | 'capital_gain': (0.7346209104536965, 2.4547377400238553), 79 | 
'capital_loss': (0.35030508122367104, 1.5845809727578963),
80 |                       'hours_per_week': (3.665366478972777, 0.38701441353280025)}
81 | 
82 | CATEGORY_FIELDS = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
83 | 
84 | 
85 | class Dataset:
86 |     def __init__(self, infname):
87 |         with open(infname, "rt") as fin:
88 |             self._field_names = fin.readline().strip().split(',')
89 |             self._lines = [line.strip() for line in fin]
90 | 
91 |     @property
92 |     def n_examples(self):
93 |         return len(self._lines)
94 | 
95 |     def parse_line(self, line):
96 |         contents = dict(zip(self._field_names, line.split(',')))
97 |         features = {}
98 | 
99 |         # ------------- label
100 |         label = int(contents['income_bracket'] == '>50K')
101 | 
102 |         # ------------- categorical features
103 |         for field in ['education', 'marital_status', 'relationship', 'workclass', 'occupation']:
104 |             vocab_mapping = VOCAB_MAPPINGS[field]
105 |             txt_value = contents[field]
106 |             if txt_value in vocab_mapping:
107 |                 # values not found in the vocabulary are treated as missing and excluded from the features
108 |                 features[field] = vocab_mapping[txt_value]
109 | 
110 |         age = int(contents['age'])
111 |         features['age_buckets'] = bisect.bisect(AGE_BOUNDARIES, age)
112 | 
113 |         # ------------- numeric features
114 |         for field in DENSE_FIELDS:
115 |             raw_value = float(contents[field])
116 |             logmean, logstd = DENSE_LOG_MEAN_STD[field]
117 |             features[field] = (np.log1p(raw_value) - logmean) / logstd
118 | 
119 |         return features, label
120 | 
121 |     def get_batch_stream(self, batch_size, n_repeat=1):
122 |         n_repeat = n_repeat if n_repeat > 0 else sys.maxsize
123 | 
124 |         for _ in range(n_repeat):
125 |             random.shuffle(self._lines)
126 | 
127 |             for batch_lines in utils.chunk(self._lines, batch_size):
128 |                 Xs = {}
129 |                 ys = []
130 | 
131 |                 # ------------- allocate for categorical feature
132 |                 for field in CATEGORY_FIELDS:
133 |                     Xs[field] = SparseInput(n_total_examples=len(batch_lines),
134 |                                             example_indices=[],
135 |                                             feature_ids=[],
136 |                                             feature_values=[])
137 | 
138 |                 # ------------- allocate for numeric feature
139 |                 for field in DENSE_FIELDS:
140 |                     # Xs[field] should be a list of lists
141 |                     # the outer list corresponds to each example in the batch
142 |                     # the inner list holds that example's values under this field
143 |                     # an example may have several dense values under one field, e.g. if you insist on one-hot-encoding a categorical feature
144 |                     # here, however, each example has exactly one value per field
145 |                     Xs[field] = []
146 | 
147 |                 # ------------- loop and add
148 |                 for example_index, line in enumerate(batch_lines):
149 |                     # iterating in order guarantees that the non-zeros inserted into SparseInput are sorted by example_index, ascending
150 |                     current_features, label = self.parse_line(line)
151 |                     ys.append(label)
152 | 
153 |                     # add categorical feature
154 |                     for field in CATEGORY_FIELDS:
155 |                         if field in current_features:
156 |                             Xs[field].add(example_idx=example_index,
157 |                                           feat_id=current_features[field],
158 |                                           feat_val=1)
159 | 
160 |                     # add numeric feature
161 |                     for field in DENSE_FIELDS:
162 |                         # wrap into one-element list, since we need to add one row
163 |                         Xs[field].append([current_features[field]])
164 | 
165 |                 yield Xs, np.asarray(ys)
166 | 
167 | 
168 | def precompute_log_mean_stddev():
169 |     df = pd.read_csv('dataset/train.csv', usecols=DENSE_FIELDS)
170 |     df = np.log1p(df)  # the data has a long tail; taking log makes it closer to a normal distribution
171 | 
172 |     means = df.mean()
173 |     stddevs = df.std()
174 |     return {field: (means[field], stddevs[field]) for field in DENSE_FIELDS}
175 | 
176 | 
177 | def test_standardize(infname):
178 |     print("\n============= standardize '{}'".format(infname))
179 | 
180 |     df = pd.read_csv(infname, usecols=DENSE_FIELDS)
181 |     df = np.log1p(df)
182 | 
183 |     means = pd.Series({field: mean for field, (mean, std) in DENSE_LOG_MEAN_STD.items()})
184 
| stddevs = pd.Series({field: std for field, (mean, std) in DENSE_LOG_MEAN_STD.items()}) 185 | 186 | df = (df - means) / stddevs 187 | print(df.describe().loc[['mean', 'std'], :]) 188 | 189 | 190 | def test_batch_stream(infname): 191 | dataset = Dataset(infname) 192 | 193 | batch_stream = dataset.get_batch_stream(16) 194 | 195 | for batch_idx, (features, labels) in enumerate(batch_stream, start=1): 196 | print("\n================== {}-th batch".format(batch_idx)) 197 | print("labels: {}\n".format(labels)) 198 | 199 | for field in DENSE_FIELDS: 200 | print("[{}]: {}".format(field, features[field])) 201 | 202 | for field in CATEGORY_FIELDS: 203 | sp_input = features[field] 204 | print("\n[{}] example_indices: {}".format(field, sp_input._example_indices)) 205 | print("[{}] feature_ids: {}".format(field, sp_input._feature_ids)) 206 | print("[{}] feature_values: {}".format(field, sp_input._feature_values)) 207 | 208 | 209 | def clean_datas(infname, outfname): 210 | csv_columns = [ 211 | 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 212 | 'marital_status', 'occupation', 'relationship', 'race', 'gender', 213 | 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 214 | 'income_bracket' 215 | ] 216 | 217 | with open(infname, 'rt') as fin, open(outfname, 'wt') as fout: 218 | # write header 219 | fout.write(",".join(csv_columns) + "\n") 220 | 221 | for line in tqdm(fin): 222 | line = line.strip() 223 | line = line.replace(', ', ',') 224 | if not line or ',' not in line: 225 | continue 226 | if line[-1] == '.': 227 | line = line[:-1] 228 | line += '\n' 229 | fout.write(line) 230 | print("'{}' is cleaned, and re-save to '{}'".format(infname, outfname)) 231 | 232 | 233 | if __name__ == "__main__": 234 | parser = argparse.ArgumentParser() 235 | parser.add_argument('-j', "--job") 236 | args = parser.parse_args() 237 | 238 | if args.job == "clean": 239 | clean_datas(infname='dataset/raw_train.txt', outfname='dataset/train.csv') 240 | clean_datas(infname='dataset/raw_test.txt', outfname='dataset/test.csv') 241 | 242 | else: 243 | raise ValueError('unknown job={}'.format(args.job)) 244 | -------------------------------------------------------------------------------- /census_main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | import argparse 4 | import time 5 | 6 | from census_data import Dataset, DENSE_FIELDS, CATEGORY_FIELDS 7 | from census_data import VOCAB_LISTS, AGE_BOUNDARIES 8 | from optimization import Adagrad 9 | from wide_n_deep import WideDeepEstimator 10 | from wide_layer import WideHParams, WideEstimator 11 | from dnn import DeepHParams, DeepEstimator 12 | import utils 13 | 14 | 15 | class DataSource: 16 | def __init__(self, batch_size): 17 | self._train_dataset = Dataset('dataset/train.csv') 18 | self._test_dataset = Dataset('dataset/test.csv') 19 | self._batch_size = batch_size 20 | 21 | def train_batches_per_epoch(self): 22 | return self._train_dataset.get_batch_stream(self._batch_size, n_repeat=1) 23 | 24 | def test_batches_per_epoch(self): 25 | return self._test_dataset.get_batch_stream(self._batch_size, n_repeat=1) 26 | 27 | @property 28 | def n_train_examples(self): 29 | return self._train_dataset.n_examples 30 | 31 | @property 32 | def n_test_examples(self): 33 | return self._test_dataset.n_examples 34 | 35 | 36 | def get_deep_hparams(embed_size, hidden_units, L2, learning_rate): 37 | dense_fields = [(field, 1) for field in DENSE_FIELDS] 38 | 39 | vocab_infos = [] 40 | for 
vocab_name in CATEGORY_FIELDS: 41 | if vocab_name == 'age_buckets': 42 | vocab_size = len(AGE_BOUNDARIES) + 1 43 | else: 44 | vocab_size = len(VOCAB_LISTS[vocab_name]) 45 | vocab_infos.append((vocab_name, vocab_size, embed_size)) 46 | 47 | # 第一个field代表field name,第二个field代表vocab name 48 | # 在这个例子中,因为field与vocab是1:1,所以二者同名 49 | embed_fields = [(field, field) for field in CATEGORY_FIELDS] 50 | 51 | optimizer = Adagrad(learning_rate) 52 | 53 | return DeepHParams( 54 | dense_fields=dense_fields, 55 | vocab_infos=vocab_infos, 56 | embed_fields=embed_fields, 57 | hidden_units=hidden_units, 58 | L2=L2, 59 | optimizer=optimizer) 60 | 61 | 62 | if __name__ == "__main__": 63 | # ************ define command-line-arguments 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('-e', "--estimator") 66 | parser.add_argument('-n', "--n_epoches", type=int, default=10) 67 | args = parser.parse_args() 68 | 69 | # ************ prepare 70 | utils.config_logging('log_{}.log'.format(args.estimator)) 71 | 72 | data_source = DataSource(batch_size=32) 73 | 74 | deep_hparams = get_deep_hparams(embed_size=16, 75 | hidden_units=[64, 16], 76 | L2=0.01, 77 | learning_rate=0.001) 78 | 79 | wide_hparams = WideHParams(field_names=CATEGORY_FIELDS, 80 | alpha=0.1, 81 | beta=1, 82 | L1=0.1, 83 | L2=0.1) 84 | 85 | # ************ run 86 | if args.estimator == 'wide_n_deep': 87 | estimator = WideDeepEstimator(wide_hparams=wide_hparams, deep_hparams=deep_hparams, data_source=data_source) 88 | elif args.estimator == "deep": 89 | estimator = DeepEstimator(hparams=deep_hparams, data_source=data_source) 90 | elif args.estimator == 'wide': 91 | estimator = WideEstimator(hparams=wide_hparams, data_source=data_source) 92 | else: 93 | raise ValueError('unknown estimator type={}'.format(args.estimator)) 94 | 95 | start_time = time.time() 96 | metrics_history = estimator.train(args.n_epoches) 97 | elapsed = time.time() - start_time 98 | 99 | # ************ display result 100 | logging.info("\n************** TIME COST **************") 101 | logging.info('{:.2f} seconds for {} epoches'.format(elapsed, args.n_epoches)) 102 | logging.info('{:.2f} examples per second'.format( 103 | args.n_epoches * (data_source.n_train_examples + data_source.n_test_examples) / elapsed)) 104 | 105 | logging.info("\n************** LEARNING CURVE **************") 106 | metrics_history = pd.DataFrame(metrics_history) 107 | logging.info(metrics_history) 108 | metrics_history.to_csv('learn_curve_{}.csv'.format(args.estimator), index=False) 109 | -------------------------------------------------------------------------------- /dense_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import initialization 3 | 4 | 5 | class DenseLayer: 6 | def __init__(self, name, shape, l2reg=0, init_method='glorot_uniform'): 7 | self._name = name 8 | self._l2reg = l2reg 9 | 10 | self._W = initialization.get_global_init(init_method)(shape) 11 | self._b = initialization.get_global_init('zero')(shape[1]) 12 | 13 | self._dW = None 14 | self._db = None 15 | 16 | self._last_input = None 17 | 18 | def forward(self, X): 19 | self._last_input = X 20 | 21 | # last_input: [batch_size, fan_in] 22 | # W: [fan_in, fan_out] 23 | # b: [fanout] 24 | # result: [batch_size, fan_out] 25 | return np.dot(self._last_input, self._W) + self._b 26 | 27 | def backward(self, prev_grads): 28 | # prev_grads: [batch_size, fan_out] 29 | assert prev_grads.shape[1] == self._W.shape[1] 30 | 31 | # self._last_input.T: [fan_in, batch_size] 32 | 
# prev_grads: [batch_size, fan_out] 33 | # dW: [fan_in, fan_out], same shape as W 34 | self._dW = np.dot(self._last_input.T, prev_grads) 35 | 36 | # 加上l2_loss对W的导数 37 | self._dW += self._l2reg * self._W 38 | 39 | # 把b想像成特殊的fan_in=1的W,则套用上面的公式 40 | # db = [1,1,...,1](共batch_size个1,shape=[1,batch_size])*prev_grads([batch_size,fan_out])=各列之和([1,fan_out]) 41 | self._db = np.sum(prev_grads, axis=0) 42 | 43 | # return: dLoss/dX: [batch_size, fan_in] 44 | # prev_grads: [batch_size, fan_out] 45 | # self._W.T: [fan_out,fan_in] 46 | return np.dot(prev_grads, self._W.T) 47 | 48 | @property 49 | def l2reg_loss(self): 50 | return 0.5 * self._l2reg * np.sum(self._W ** 2) 51 | 52 | @property 53 | def shape(self): 54 | return self._W.shape 55 | 56 | @property 57 | def output_dim(self): 58 | return self._W.shape[1] 59 | 60 | @property 61 | def variables(self): 62 | return {"{}_W".format(self._name): self._W, 63 | "{}_b".format(self._name): self._b} 64 | 65 | @property 66 | def grads2var(self): 67 | return {"{}_W".format(self._name): self._dW, 68 | "{}_b".format(self._name): self._db} 69 | -------------------------------------------------------------------------------- /dnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from input_layer import DenseInputCombineLayer 3 | from embedding_layer import EmbeddingCombineLayer 4 | from dense_layer import DenseLayer 5 | from activation import ReLU 6 | import utils 7 | import logging 8 | from collections import namedtuple 9 | from base_estimator import BaseEstimator 10 | 11 | 12 | class DeepNetwork: 13 | def __init__(self, dense_fields, vocab_infos, embed_fields, hidden_units, L2, optimizer): 14 | """ 15 | :param dense_fields: a list of tuple (field_name, field's input-dim) 16 | :param vocab_infos: a list of tuple, each tuple is (vocab_name, vocab_size, embed_size) 17 | :param embed_fields: a list of tuple (field_name, vocab_name) 18 | :param hidden_units: a list of ints, n_units for each hidden layer 19 | :param L2: L2 regularization for hidden dense layer 20 | :param optimizer: optimizer instance to update the weights 21 | """ 22 | self._optimizer = optimizer 23 | 24 | # ***************** dense input layer 25 | self._dense_combine_layer = DenseInputCombineLayer(dense_fields) 26 | 27 | # ***************** embedding layers 28 | self._embed_combine_layer = EmbeddingCombineLayer(vocab_infos) 29 | for field_name, vocab_name in embed_fields: 30 | self._embed_combine_layer.add_embedding(vocab_name=vocab_name, field_name=field_name) 31 | 32 | self._optimize_layers = [self._embed_combine_layer] 33 | 34 | # ***************** MLP 35 | prev_out_dim = self._dense_combine_layer.output_dim + self._embed_combine_layer.output_dim 36 | 37 | self._hidden_layers = [] 38 | for layer_idx, n_units in enumerate(hidden_units, start=1): 39 | # ----------- add hidden layer 40 | hidden_layer = DenseLayer(name="hidden{}".format(layer_idx), shape=[prev_out_dim, n_units], l2reg=L2) 41 | self._hidden_layers.append(hidden_layer) 42 | self._optimize_layers.append(hidden_layer) 43 | logging.info("{}-th hidden layer, weight shape={}".format(layer_idx, hidden_layer.shape)) 44 | 45 | # ----------- add activation layer 46 | self._hidden_layers.append(ReLU()) 47 | 48 | # ----------- update previous dimension 49 | prev_out_dim = n_units 50 | 51 | # final logit layer 52 | final_logit_layer = DenseLayer(name="final_logit", shape=[prev_out_dim, 1], l2reg=L2) 53 | logging.info("final logit layer, weight shape={}".format(final_logit_layer.shape)) 
54 | self._hidden_layers.append(final_logit_layer) 55 | self._optimize_layers.append(final_logit_layer) 56 | 57 | def forward(self, features): 58 | """ 59 | :param features: dict, mapping from field=>dense ndarray or field=>SparseInput 60 | :return: logits, [batch_size] 61 | """ 62 | dense_input = self._dense_combine_layer.forward(features) 63 | 64 | embed_input = self._embed_combine_layer.forward(features) 65 | 66 | X = np.hstack([dense_input, embed_input]) 67 | 68 | for hidden_layer in self._hidden_layers: 69 | X = hidden_layer.forward(X) 70 | 71 | return X.flatten() 72 | 73 | def backward(self, grads2logits): 74 | """ 75 | :param grads2logits: gradients from loss to logits, [batch_size] 76 | """ 77 | # ***************** 计算所有梯度 78 | prev_grads = grads2logits.reshape([-1, 1]) # reshape to [batch_size,1] 79 | 80 | # iterate hidden layers backwards 81 | for hidden_layer in self._hidden_layers[::-1]: 82 | prev_grads = hidden_layer.backward(prev_grads) 83 | 84 | col_sizes = [self._dense_combine_layer.output_dim, self._embed_combine_layer.output_dim] 85 | # 抛弃第一个split,因为其对应的是input,无可优化 86 | _, grads_for_all_embedding = utils.split_column(prev_grads, col_sizes) 87 | 88 | self._embed_combine_layer.backward(grads_for_all_embedding) 89 | 90 | # ***************** 优化 91 | # 这个操作必须每次backward都调用,这是因为,尽管dense部分的权重是固定的 92 | # 但是sparse部分,要优化哪个变量,是随着输入不同而不同的 93 | all_vars, all_grads2var = {}, {} 94 | for opt_layer in self._optimize_layers: 95 | all_vars.update(opt_layer.variables) 96 | all_grads2var.update(opt_layer.grads2var) 97 | 98 | self._optimizer.update(variables=all_vars, gradients=all_grads2var) 99 | 100 | 101 | DeepHParams = namedtuple("DeepHParams", 102 | ['dense_fields', 'vocab_infos', 'embed_fields', 'hidden_units', 'L2', 'optimizer']) 103 | 104 | 105 | class DeepEstimator(BaseEstimator): 106 | def __init__(self, hparams, data_source): 107 | self._dnn = DeepNetwork(dense_fields=hparams.dense_fields, 108 | vocab_infos=hparams.vocab_infos, 109 | embed_fields=hparams.embed_fields, 110 | hidden_units=hparams.hidden_units, 111 | L2=hparams.L2, 112 | optimizer=hparams.optimizer) 113 | super().__init__(data_source) 114 | 115 | def train_batch(self, features, labels): 116 | # ********* forward 117 | logits = self._dnn.forward(features) 118 | pred_probas = 1 / (1 + np.exp(-logits)) 119 | 120 | # ********* backward 121 | grads2logits = pred_probas - labels 122 | self._dnn.backward(grads2logits) 123 | 124 | return pred_probas 125 | 126 | def predict(self, features): 127 | logits = self._dnn.forward(features) 128 | return 1 / (1 + np.exp(-logits)) 129 | -------------------------------------------------------------------------------- /embedding_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from initialization import TruncatedNormal 3 | import utils 4 | 5 | 6 | class EmbeddingLayer: 7 | """ 8 | 简化起见,不支持use_bias和regularization 9 | 不支持regularization的原因是:weight是稠密的,自然L2 Loss的gradient也是稠密的 10 | 为了L2 Loss而破坏稀疏性,增加内容与耗时,有些得不偿失 11 | 一种改进方案是:只正则化本batch中用到的embedding向量 12 | """ 13 | 14 | def __init__(self, W, vocab_name, field_name): 15 | """ 16 | :param W: dense weight matrix, [vocab_size,embed_size] 17 | :param b: bias, [embed_size] 18 | """ 19 | self.vocab_name = vocab_name 20 | self.field_name = field_name 21 | self._W = W 22 | self._last_input = None 23 | 24 | @property 25 | def output_dim(self): 26 | return self._W.shape[1] 27 | 28 | def forward(self, X): 29 | """ 30 | :param X: SparseInput 31 | :return: [batch_size, embed_size] 32 | """ 33 
| self._last_input = X 34 | 35 | # output: [batch_size, embed_size] 36 | output = np.zeros((X.n_total_examples, self._W.shape[1])) 37 | 38 | for example_idx, feat_id, feat_val in X.iterate_non_zeros(): 39 | embedding = self._W[feat_id, :] 40 | output[example_idx, :] += embedding * feat_val 41 | 42 | return output 43 | 44 | def backward(self, prev_grads): 45 | """ 46 | :param prev_grads: [batch_size, embed_size] 47 | :return: dw 48 | """ 49 | dW = {} 50 | 51 | for example_idx, feat_id, feat_val in self._last_input.iterate_non_zeros(): 52 | # [1,embed_size] 53 | grad_from_one_example = prev_grads[example_idx, :] * feat_val 54 | 55 | if feat_id in dW: 56 | dW[feat_id] += grad_from_one_example 57 | 58 | else: 59 | dW[feat_id] = grad_from_one_example 60 | 61 | return dW 62 | 63 | 64 | class EmbeddingCombineLayer: 65 | def __init__(self, vocab_infos): 66 | """ 67 | :param vocab_infos: a list of tuple, each tuple is (vocab_name, vocab_size, embed_size) 68 | """ 69 | self._weights = {} # vocab_name ==> weight 70 | for vocab_name, vocab_size, embed_size in vocab_infos: 71 | # TruncatedNormal是TF WDL中embedding_column的默认初始化方式 72 | # These values are similar to values from a `random_normal_initializer` 73 | # except that values more than two standard deviations from the mean are discarded and re-drawn 74 | stddev = 1 / np.sqrt(embed_size) 75 | initializer = TruncatedNormal(mean=0, 76 | stddev=stddev, 77 | lower=-2 * stddev, 78 | upper=2 * stddev) 79 | self._weights[vocab_name] = initializer(shape=[vocab_size, embed_size]) 80 | 81 | # 注意,由于embedding input的稀疏性,一次回代时,不太可能对所有embedding weight有梯度 82 | # 而是只针对某个field的embedding weight中某feature id对应的行有梯度 83 | # _grads_to_embed是一个dict, 84 | # key是"vocab_name@feature_id"的形式,value是一个[embed_size]的ndarray。 85 | # 因为vocab的weight是多个field所共享的,所以value是每个field对vocab_name@feature_id的梯度的叠加 86 | self._grads_to_embed = {} 87 | self._embed_layers = [] 88 | 89 | def add_embedding(self, vocab_name, field_name): 90 | weight = self._weights[vocab_name] 91 | layer = EmbeddingLayer(W=weight, vocab_name=vocab_name, field_name=field_name) 92 | self._embed_layers.append(layer) 93 | 94 | @property 95 | def output_dim(self): 96 | return sum(layer.output_dim for layer in self._embed_layers) 97 | 98 | def forward(self, sparse_inputs): 99 | """ 100 | :param sparse_inputs: dict {field_name: SparseInput} 101 | :return: 每个SparseInput贡献一个embedding vector,返回结果是这些embedding vector的拼接 102 | 拼接顺序由add_embedding的调用顺序决定 103 | """ 104 | embedded_outputs = [] 105 | for embed_layer in self._embed_layers: 106 | sp_input = sparse_inputs[embed_layer.field_name] 107 | embedded_outputs.append(embed_layer.forward(sp_input)) 108 | 109 | # [batch_size, sum of all embed-layer's embed_size] 110 | return np.hstack(embedded_outputs) 111 | 112 | def backward(self, prev_grads): 113 | """ 114 | :param prev_grads: [batch_size, sum of all embed-layer's embed_size] 115 | 上一层传入的, Loss对本层输出的梯度 116 | """ 117 | assert prev_grads.shape[1] == self.output_dim 118 | 119 | # 因为output是每列输出的拼接,自然上一层输入的导数也是各层所需要导数的拼接 120 | # prev_grads_splits是一个数组,存储对应各层的导数 121 | col_sizes = [layer.output_dim for layer in self._embed_layers] 122 | prev_grads_splits = utils.split_column(prev_grads, col_sizes) 123 | 124 | self._grads_to_embed.clear() # reset 125 | for layer, layer_prev_grads in zip(self._embed_layers, prev_grads_splits): 126 | # layer_prev_grads: 上一层传入的,Loss对某个layer的输出的梯度 127 | # layer_grads_to_feat_embed: dict, feat_id==>grads, 128 | # 由这一个layer造成对某vocab的embedding矩阵的某feat_id对应行的梯度 129 | layer_grads_to_embed = 
layer.backward(layer_prev_grads) 130 | 131 | for feat_id, g in layer_grads_to_embed.items(): 132 | # 表示"对某个vocab的embedding weight中的第feat_id行的总导数" 133 | key = "{}@{}".format(layer.vocab_name, feat_id) 134 | 135 | if key in self._grads_to_embed: 136 | self._grads_to_embed[key] += g 137 | else: 138 | self._grads_to_embed[key] = g 139 | 140 | @property 141 | def variables(self): 142 | """ 优化变量 143 | :return: dict from vocab_name to weight matrix 144 | """ 145 | return self._weights 146 | 147 | @property 148 | def grads2var(self): 149 | """ Loss对优化变量的梯度 150 | :return: dict, key是"vocab_name@feature_id"的形式,value是一个[embed_size]的ndarray 151 | """ 152 | return self._grads_to_embed 153 | 154 | @property 155 | def l2reg_loss(self): 156 | return 0 # 出于保持稀疏的考虑,在embedding层暂不支持正则 157 | -------------------------------------------------------------------------------- /ftrl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | class FtrlEstimator: 6 | """ 7 | 每个field对应一个FtrlEstimator。类比于在TensorFlow WDL中,一个feature column对应一个FtrlEstimator 8 | """ 9 | 10 | def __init__(self, alpha, beta, L1, L2): 11 | self._alpha = alpha 12 | self._beta = beta 13 | self._L1 = L1 14 | self._L2 = L2 15 | 16 | self._n = defaultdict(float) # n[i]: i-th feature's squared sum of past gradients 17 | self._z = defaultdict(float) 18 | 19 | # lazy weights, 实际上是一个临时变量,只在: 20 | # 1. 对应的feature value != 0, 并且 21 | # 2. 之前累积的abs(z) > L1 22 | # 两种情况都满足时,w才在feature id对应的位置上存储一个值 23 | # 而且w中数据的存储周期,只在一次前代、回代之间,在新的前代开始之前,就清空上次的w 24 | self._w = {} 25 | 26 | self._current_feat_ids = None 27 | self._current_feat_vals = None 28 | 29 | def predict_logit(self, feature_ids, feature_values): 30 | """ 31 | :param feature_ids: non-zero feature ids for one example 32 | :param feature_values: non-zero feature values for one example 33 | :return: logit for this example 34 | """ 35 | self._current_feat_ids = feature_ids 36 | self._current_feat_vals = feature_values 37 | 38 | logit = 0 39 | self._w.clear() # lazy weights,所以没有必要保留之前的weights 40 | 41 | # 如果当前样本在这个field下所有feature都为0,则feature_ids==feature_values==[] 42 | # 则没有以下循环,logit=0 43 | for feat_id, feat_val in zip(feature_ids, feature_values): 44 | z = self._z[feat_id] 45 | sign_z = -1. if z < 0 else 1. 46 | 47 | # build w on the fly using z and n, hence the name - lazy weights 48 | # this allows us for not storing the complete w 49 | # if abs(z) <= self._L1: self._w[feat_id] = 0. 
# w[i] vanishes due to L1 regularization 50 | if abs(z) > self._L1: 51 | # apply prediction time L1, L2 regularization to z and get w 52 | w = (sign_z * self._L1 - z) / ((self._beta + np.sqrt(self._n[feat_id])) / self._alpha + self._L2) 53 | self._w[feat_id] = w 54 | logit += w * feat_val 55 | 56 | return logit 57 | 58 | def update(self, pred_proba, label): 59 | """ 60 | :param pred_proba: 与last_feat_ids/last_feat_vals对应的预测CTR 61 | 注意pred_proba并不一定等于sigmoid(predict_logit(...)),因为要还要考虑deep侧贡献的logit 62 | :param label: 与last_feat_ids/last_feat_vals对应的true label 63 | """ 64 | grad2logit = pred_proba - label 65 | 66 | # 如果当前样本在这个field下所有feature都为0,则没有以下循环,没有更新 67 | for feat_id, feat_val in zip(self._current_feat_ids, self._current_feat_vals): 68 | g = grad2logit * feat_val 69 | g2 = g * g 70 | n = self._n[feat_id] 71 | 72 | self._z[feat_id] += g 73 | 74 | if feat_id in self._w: # if self._w[feat_id] != 0 75 | sigma = (np.sqrt(n + g2) - np.sqrt(n)) / self._alpha 76 | self._z[feat_id] -= sigma * self._w[feat_id] 77 | 78 | self._n[feat_id] = n + g2 79 | -------------------------------------------------------------------------------- /initialization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats 3 | 4 | 5 | class Zero: 6 | def __call__(self, shape): 7 | return np.zeros(shape) 8 | 9 | 10 | class GlorotUniform: 11 | def __call__(self, shape): 12 | fan_in, fan_out = shape 13 | scale = np.sqrt(6 / (fan_in + fan_out)) 14 | return np.random.uniform(-scale, scale, shape) 15 | 16 | 17 | class GlorotNormal: 18 | def __call__(self, shape): 19 | fan_in, fan_out = shape 20 | stdev = np.sqrt(2 / (fan_out + fan_in)) 21 | return np.random.normal(loc=0, scale=stdev, size=shape) 22 | 23 | 24 | class TruncatedNormal: 25 | def __init__(self, mean, stddev, lower, upper): 26 | self._rand = scipy.stats.truncnorm( 27 | (lower - mean) / stddev, 28 | (upper - mean) / stddev, 29 | loc=mean, 30 | scale=stddev) 31 | 32 | def __call__(self, shape): 33 | return self._rand.rvs(size=shape) 34 | 35 | 36 | _Global_Initializers = {} # initializers which can be shared 37 | 38 | 39 | def get_global_init(name): 40 | if name in _Global_Initializers: 41 | return _Global_Initializers[name] 42 | 43 | if name == "zero": 44 | initializer = Zero() 45 | elif name == "glorot_uniform": 46 | initializer = GlorotUniform() 47 | elif name == "glorot_normal": 48 | initializer = GlorotNormal() 49 | else: 50 | raise ValueError('unknown initializer={}'.format(name)) 51 | 52 | _Global_Initializers[name] = initializer 53 | return initializer 54 | -------------------------------------------------------------------------------- /input_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class DenseInputCombineLayer: 5 | def __init__(self, field_sizes): 6 | # field_sizes: a list of tuple 7 | # tuple[0]: field name 8 | # tuple[1]: input dim for this field 9 | self._field_sizes = field_sizes 10 | 11 | @property 12 | def output_dim(self): 13 | return sum(in_dim for _, in_dim in self._field_sizes) 14 | 15 | def forward(self, inputs): 16 | """ 17 | 按field_sizes的顺序从inputs提取ndarray,并拼接起来 18 | :param inputs: dict of {field_name: ndarray} 19 | """ 20 | outputs = [] 21 | for field_name, in_dim in self._field_sizes: 22 | a_input = np.asarray(inputs[field_name]) 23 | assert in_dim == a_input.shape[1] 24 | outputs.append(a_input) 25 | return np.hstack(outputs) 26 | 27 | 28 | class SparseInput: 29 | """ 30 | 
How to represent a sparse input took quite a bit of thought.
31 |     TensorFlow uses two SparseTensors, sp_ids and sp_weights, but the indices and dense_shape of those two tensors must be identical, which is redundant.
32 | 
33 |     I then considered representing each non-zero feature as KVPair = namedtuple('KVPair', ['idx_in_batch', 'id', 'value']),
34 |     so the whole sparse input becomes a list of KVPair. That is much more convenient to work with, but every KVPair is a namedtuple, which creates too many small objects and puts pressure on the GC.
35 | 
36 |     I also considered a single [n_nonzero, 3] ndarray,
37 |     with column 0 holding idx_in_batch (the row number),
38 |     column 1 holding the feature id,
39 |     and column 2 holding the value.
40 |     But an ndarray has only one dtype, so to hold the values the whole array must be float; handling integers such as row numbers and ids then becomes both inconvenient and wasteful.
41 | 
42 |     The current choice is three lists representing a sparse input whose conceptual dense shape is [batch_size, max_bag_size].
43 |     max_bag_size is purely conceptual and can be thought of as infinity; it never appears in the code and imposes no restriction on it.
44 |     For a user's behavior history, for instance, max_bag_size could be the number of articles read or items bought within some time window;
45 |     for a user's phone-usage habits, it could be the total number of apps.
46 |     Here such information is represented as a bag rather than a sequence, i.e., the temporal order is ignored.
47 | 
48 |     The first list, example_indices: an [n_non_zeros] integer array holding the row number (example index) within [batch_size, max_bag_size], >= 0 and < batch_size,
49 |     and its values are required to be sorted in ascending order.
50 |     The second list, feature_ids: an [n_non_zeros] integer array holding the feature id of each non-zero; duplicates are allowed.
51 |     The third list, feature_values: an [n_non_zeros] float array holding the value of each non-zero.
52 |     For example, the i-th non-zero (0 <= i < n_non_zeros) belongs to example example_indices[i], has feature id feature_ids[i] and feature value feature_values[i].
53 |     """
54 | 
55 |     def __init__(self, example_indices, feature_ids, feature_values, n_total_examples):
56 |         self.n_total_examples = n_total_examples
57 |         # three parallel lists of non-zeros; example_indices must already be sorted in ascending order
58 |         self._example_indices = example_indices
59 |         self._feature_ids = feature_ids
60 |         self._feature_values = feature_values
61 |         # read cursor used by get_example_in_order
62 |         self.__nnz_idx = 0
63 | 
64 |     def add(self, example_idx, feat_id, feat_val):
65 |         self._example_indices.append(example_idx)
66 |         self._feature_ids.append(feat_id)
67 |         self._feature_values.append(feat_val)
68 | 
69 |     def iterate_non_zeros(self):
70 |         """iterate over all non-zeros as (example_idx, feat_id, feat_val) triples"""
71 |         return zip(self._example_indices, self._feature_ids, self._feature_values)
72 | 
73 |     def __move_to_next_example(self, nnz_idx):
74 |         """
75 |         :param nnz_idx: position (into the three lists) of the first non-zero
76 |                         belonging to the current example
77 |         :return: None if nnz_idx is already past the last non-zero, otherwise a tuple of
78 |                  (position of the next example's first non-zero, current feat_ids, current feat_vals)
79 |         """
80 |         if nnz_idx >= len(self._example_indices):
81 |             return None
82 | 
83 |         end = nnz_idx + 1
84 |         while end < len(self._example_indices) and self._example_indices[end] == self._example_indices[nnz_idx]:
85 |             end += 1
86 | 
87 |         current_feat_ids = self._feature_ids[nnz_idx:end]
88 |         current_feat_vals = self._feature_values[nnz_idx:end]
89 | 
90 |         return end, current_feat_ids, current_feat_vals
91 | 
92 |     # def iterate_example(self):
93 |     #     nnz_idx = 0
94 |     #     for example_idx in range(self.batch_size):
95 |     # 
96 |     #         if (nnz_idx >= len(self.example_indices)) or (self.example_indices[nnz_idx] != example_idx):
97 |     #             yield example_idx, None, None
98 |     # 
99 |     #         else:
100 |     #             nnz_idx, current_feat_ids, current_feat_vals = self.__move_to_next_example(nnz_idx)
101 |     #             yield example_idx, current_feat_ids, current_feat_vals
102 | 
103 |     def get_example_in_order(self, example_idx):
104 |         """
105 |         :param example_idx: precondition: example_idx must be passed in order, from 0 up to batch_size
106 |         :return: the feat_ids and feat_vals corresponding to example_idx
107 |         """
108 |         if self.__nnz_idx >= len(self._example_indices):
109 |             return [], []
110 | 
111 |         elif self._example_indices[self.__nnz_idx] == example_idx:
112 |             self.__nnz_idx, feat_ids, feat_vals = self.__move_to_next_example(self.__nnz_idx)
113 |             return feat_ids, feat_vals
114 | 
115 |         elif self._example_indices[self.__nnz_idx] > example_idx:
116 |             # wait for the caller to pass in a larger example_idx next time
117 |             return [], []
118 | 
119 |         else:
120 |             # if the current example_index is not the example_idx the caller asked for,
121 |             # it should only ever be larger than the requested one, in which case we simply wait for a larger example_idx;
122 |             # if it is smaller than the requested one, the invocation pattern is wrong
123 |             raise ValueError("incorrect invocation")
124 | 
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def tied_rank(x):
5 |     """
6 |     Computes the tied rank of elements in x.
7 |     This function computes the tied rank of elements in x. 
8 | Parameters 9 | ---------- 10 | x : list of numbers, numpy array 11 | Returns 12 | ------- 13 | score : list of numbers 14 | The tied rank f each element in x 15 | """ 16 | sorted_x = sorted(zip(x, range(len(x)))) 17 | r = [0 for k in x] 18 | cur_val = sorted_x[0][0] 19 | last_rank = 0 20 | for i in range(len(sorted_x)): 21 | if cur_val != sorted_x[i][0]: 22 | cur_val = sorted_x[i][0] 23 | for j in range(last_rank, i): 24 | r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0 25 | last_rank = i 26 | if i == len(sorted_x) - 1: 27 | for j in range(last_rank, i + 1): 28 | r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0 29 | return r 30 | 31 | 32 | def auc(y_true, y_score): 33 | """ 34 | copy from: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 35 | 36 | Computes the area under the receiver-operater characteristic (AUC) 37 | This function computes the AUC error metric for binary classification. 38 | Parameters 39 | ---------- 40 | y_true : list of binary numbers, numpy array 41 | The ground truth value 42 | y_score : same type as actual 43 | Defines a ranking on the binary numbers, from most likely to 44 | be positive to least likely to be positive. 45 | Returns 46 | ------- 47 | score : double 48 | The mean squared error between actual and posterior 49 | """ 50 | r = tied_rank(y_score) 51 | num_positive = len([0 for x in y_true if x == 1]) 52 | num_negative = len(y_true) - num_positive 53 | sum_positive = sum([r[i] for i in range(len(r)) if y_true[i] == 1]) 54 | auc = ((sum_positive - num_positive * (num_positive + 1) / 2.0) / 55 | (num_negative * num_positive)) 56 | return auc 57 | 58 | 59 | def logloss(y_true, y_pred, normalize=True): 60 | loss_array = -y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred) 61 | if normalize: 62 | return np.mean(loss_array) 63 | else: 64 | return np.sum(loss_array) 65 | -------------------------------------------------------------------------------- /optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Adagrad: 5 | def __init__(self, lr): 6 | self._lr = lr 7 | # variable name => sum of gradient square (also a vector) 8 | self._sum_grad2 = {} 9 | 10 | def update(self, variables, gradients): 11 | for gradname, gradient in gradients.items(): 12 | # ------ update cache 13 | g2 = gradient * gradient 14 | if gradname in self._sum_grad2: 15 | self._sum_grad2[gradname] += g2 16 | else: 17 | self._sum_grad2[gradname] = g2 18 | 19 | # ------ calculate delta 20 | delta = self._lr * gradient / (np.sqrt(self._sum_grad2[gradname]) + 1e-6) 21 | 22 | # ------ update 23 | if '@' in gradname: 24 | # 对应着稀疏输入的权重与梯度,gradients中的key遵循着'vocab_name@feat_id'的格式 25 | varname, row = gradname.split('@') 26 | row = int(row) 27 | 28 | variable = variables[varname] 29 | variable[row, :] -= delta 30 | else: 31 | variable = variables[gradname] 32 | variable -= delta 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /test_dense_layer.py: -------------------------------------------------------------------------------- 1 | from dense_layer import DenseLayer 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | 6 | def calc_numeric_grads(variable, epsilon, loss_fn): 7 | numeric_grad = np.zeros_like(variable) 8 | 9 | if len(variable.shape) == 2: 10 | pbar = tqdm(total=variable.shape[0] * variable.shape[1]) 11 | for r in range(variable.shape[0]): 12 | for c in range(variable.shape[1]): 13 | variable[r, c] -= 
epsilon 14 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 15 | neg_loss = loss_fn() 16 | 17 | variable[r, c] += 2 * epsilon 18 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 19 | pos_loss = loss_fn() 20 | 21 | numeric_grad[r, c] = (pos_loss - neg_loss) / (2 * epsilon) 22 | 23 | variable[r, c] -= epsilon # restore to original 24 | pbar.update(1) 25 | 26 | elif len(variable.shape) == 1: 27 | pbar = tqdm(total=variable.shape[0]) 28 | for r in range(variable.shape[0]): 29 | variable[r] -= epsilon 30 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 31 | neg_loss = loss_fn() 32 | 33 | variable[r] += 2 * epsilon 34 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵 35 | pos_loss = loss_fn() 36 | 37 | numeric_grad[r] = (pos_loss - neg_loss) / (2 * epsilon) 38 | 39 | variable[r] -= epsilon # restore to original 40 | pbar.update(1) 41 | 42 | else: 43 | raise ValueError('unsupported shape') 44 | 45 | return numeric_grad 46 | 47 | 48 | def test_dense_fc_layer(): 49 | batch_size = 3 50 | fan_in = 4 51 | fan_out = 2 52 | epsilon = 1e-6 53 | 54 | # ---------- forward 55 | layer = DenseLayer(name='test', shape=[fan_in,fan_out], l2reg=0.01) 56 | X = np.random.randn(batch_size, fan_in) 57 | y = layer.forward(X) 58 | assert y.shape == (batch_size, fan_out) 59 | 60 | # ---------- backward 61 | # 最终的loss选择用np.sum,从而prev_grads是全1矩阵,得到的derived_grads就是本层自身的gradients 62 | dX = layer.backward(prev_grads=np.ones((batch_size, fan_out))) 63 | 64 | # ---------- test grads on W 65 | var_grads = [('W', layer._W, layer._dW), ('b', layer._b, layer._db), ('input', X, dX)] 66 | for name, variable, grad in var_grads: 67 | print("\n************* checking numerical gradients on '{}', ......".format(name)) 68 | numeric_grad = calc_numeric_grads(variable=variable, 69 | epsilon=epsilon, 70 | loss_fn=lambda: np.sum(layer.forward(X)) + layer.l2reg_loss) 71 | 72 | print("========== derived gradients = \n{}".format(grad)) 73 | print("========== numeric gradients = \n{}".format(numeric_grad)) 74 | is_equal = np.allclose(grad, numeric_grad) 75 | assert is_equal 76 | print("Equal = {}".format(is_equal)) 77 | 78 | 79 | if __name__ == "__main__": 80 | # np.random.seed(999) 81 | 82 | test_dense_fc_layer() 83 | -------------------------------------------------------------------------------- /test_embed_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from embedding_layer import EmbeddingLayer 3 | from input_layer import SparseInput 4 | 5 | 6 | def test_embedding_forward(): 7 | layer = EmbeddingLayer(W=np.arange(12).reshape(4, 3), vocab_name=None, field_name=None) 8 | 9 | X = SparseInput(example_indices=[2, 1, 2], 10 | feature_ids=[2, 3, 2], 11 | feature_values=[1, 2, 2], 12 | n_total_examples=5) 13 | 14 | output = layer.forward(X) 15 | print(output) 16 | 17 | 18 | def test_embedding_backward(): 19 | layer = EmbeddingLayer(W=np.random.randn(4, 3), vocab_name=None, field_name=None) 20 | 21 | X = SparseInput(example_indices=[1, 1, 2, 3, 3, 3], 22 | feature_ids=[0, 3, 1, 2, 1, 0], 23 | feature_values=[1, 2, 2, 1, 1, 2], 24 | n_total_examples=5) 25 | output = layer.forward(X) 26 | 27 | grads2W = layer.backward(np.ones((X.n_total_examples, 3))) 28 | print("========== derived gradients = \n{}".format(grads2W)) 29 | 30 | # ----------- calculate numeric gradients 31 | epsilon = 1e-6 32 | variable = layer._W 33 | numeric_grads = np.zeros_like(variable) 34 | 35 | for r in range(variable.shape[0]): 36 | for c in range(variable.shape[1]): 37 | variable[r, c] -= epsilon 38 | neg_delta_loss = np.sum(layer.forward(X)) 39 | 40 
| variable[r, c] += 2 * epsilon 41 | pos_delta_loss = np.sum(layer.forward(X)) 42 | 43 | numeric_grads[r, c] = (pos_delta_loss - neg_delta_loss) / (2 * epsilon) 44 | 45 | variable[r, c] -= epsilon # restore to original 46 | 47 | print("========== numeric gradients = \n{}".format(numeric_grads)) 48 | 49 | 50 | if __name__ == "__main__": 51 | np.random.seed(999) 52 | 53 | # test_embedding_forward() 54 | test_embedding_backward() 55 | -------------------------------------------------------------------------------- /test_input_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from input_layer import SparseInput 3 | 4 | 5 | # def test_iterate_example_from_sparse_input(example_indices, batch_size): 6 | # sp_input = SparseInput(example_indices=example_indices, 7 | # feature_ids=example_indices, 8 | # feature_values=example_indices, 9 | # batch_size=batch_size) 10 | # 11 | # # for example_idx, feat_ids, feat_values in sp_input.iterate_example(): 12 | # # print("\n**************** {}-th example: ".format(example_idx)) 13 | # # print("feature ids: {}".format(feat_ids)) 14 | # # print("feature values: {}".format(feat_values)) 15 | # 16 | # iterator = sp_input.iterate_example() 17 | # while True: 18 | # try: 19 | # example_idx, feat_ids, feat_values = next(iterator) 20 | # print("\n**************** {}-th example: ".format(example_idx)) 21 | # print("feature ids: {}".format(feat_ids)) 22 | # print("feature values: {}".format(feat_values)) 23 | # except StopIteration: 24 | # break 25 | 26 | 27 | def test_get_example_in_order_from_sparse(example_indices, batch_size): 28 | sp_input = SparseInput(example_indices=example_indices, 29 | feature_ids=example_indices, 30 | feature_values=example_indices, 31 | n_total_examples=batch_size) 32 | 33 | for example_idx in range(batch_size): 34 | feat_ids, feat_vals = sp_input.get_example_in_order(example_idx) 35 | print("\n**************** {}-th example: ".format(example_idx)) 36 | print("feature ids: {}".format(feat_ids)) 37 | print("feature values: {}".format(feat_vals)) 38 | 39 | 40 | if __name__ == "__main__": 41 | test_get_example_in_order_from_sparse(example_indices=[1, 1, 1, 3, 4, 6],batch_size=10) 42 | 43 | # test_get_example_in_order_from_sparse(example_indices=[1, 1, 1, 3, 4, 6],batch_size=3) 44 | 45 | # test_get_example_in_order_from_sparse(example_indices=[0, 1, 1, 1, 3, 4, 7],batch_size=9) 46 | 47 | # test_get_example_in_order_from_sparse(example_indices=[], batch_size=9) 48 | -------------------------------------------------------------------------------- /test_others.py: -------------------------------------------------------------------------------- 1 | import initialization 2 | import numpy as np 3 | import activation 4 | from tqdm import tqdm 5 | import bisect 6 | import utils 7 | 8 | 9 | def numerical_gradient(variable, loss_fn, epsilon): 10 | # gradients must have the same shape as variable 11 | numeric_grad = np.zeros_like(variable) 12 | 13 | pbar = tqdm(total=variable.shape[0] * variable.shape[1]) 14 | for r in range(variable.shape[0]): 15 | for c in range(variable.shape[1]): 16 | variable[r, c] -= epsilon 17 | neg_loss = loss_fn(variable) 18 | 19 | variable[r, c] += 2 * epsilon 20 | pos_loss = loss_fn(variable) 21 | 22 | numeric_grad[r, c] = (pos_loss - neg_loss) / (2 * epsilon) 23 | 24 | variable[r, c] -= epsilon # restore to original 25 | pbar.update(1) 26 | 27 | return numeric_grad 28 | 29 | 30 | def check_activation(layer): 31 | # ---------- forward and backward 32 | X = 
np.random.randn(3, 4)
33 |     _ = layer.forward(X)
34 |     # use np.sum as the final loss, so that prev_grads is an all-ones matrix and derived_grads is exactly this layer's own gradients
35 |     derived_grads = layer.backward(prev_grads=np.ones_like(X))
36 | 
37 |     # ---------- calculate numeric gradients
38 |     epsilon = 1e-6
39 |     numeric_grads = numerical_gradient(variable=X,
40 |                                        loss_fn=lambda x: np.sum(layer.forward(x)),
41 |                                        epsilon=epsilon)
42 | 
43 |     # ---------- display
44 |     print("========== derived gradients = \n{}".format(derived_grads))
45 |     print("========== numeric gradients = \n{}".format(numeric_grads))
46 | 
47 |     # ---------- check
48 |     is_equal = np.allclose(numeric_grads, derived_grads)
49 |     assert is_equal
50 |     print("Equal = {}".format(is_equal))
51 | 
52 | 
53 | def test_activations():
54 |     check_activation(activation.Sigmoid())
55 |     check_activation(activation.ReLU())
56 | 
57 | 
58 | def test_initializer():
59 |     # ---------------- GlorotUniform
60 |     init_glorot_uniform = initialization.get_global_init('glorot_uniform')
61 |     w = init_glorot_uniform([2, 3])
62 |     print("\nGlorotUniform")
63 |     print(w)
64 |     print(w.sum())
65 | 
66 |     # ---------------- GlorotNormal
67 |     init_glorot_normal = initialization.get_global_init('glorot_normal')
68 |     w = init_glorot_normal([20, 50])
69 |     print("\nGlorotNormal")
70 |     # print(w)
71 |     print(w.mean())
72 | 
73 | 
74 | # def test_bce_loss_with_logits():
75 | #     bce_layer = BinaryCrossEntropy4Logits()
76 | # 
77 | #     # must convert to float; otherwise, with an in-place modification like logits[idx] -= epsilon, since logits itself is still integer,
78 | #     # logits[idx] would remain an integer after the change, e.g. 1 - 1e-6 would be forced back to 0
79 | #     logits = np.asarray([1, 2, 3], dtype=float)
80 | #     labels = [1, 1, 0]
81 | # 
82 | #     loss = bce_layer.forward(logits=logits, labels=labels)
83 | #     print("loss={}".format(loss))
84 | # 
85 | #     grad2logits = bce_layer.backward()
86 | #     print(" gradients to logits = {}".format(grad2logits))
87 | # 
88 | #     # --------- numeric loss
89 | #     epsilon = 1e-6
90 | #     numeric_grads = np.zeros_like(logits)
91 | # 
92 | #     for idx in range(len(logits)):
93 | #         logits[idx] -= epsilon
94 | #         neg_loss = bce_layer.forward(logits=logits, labels=labels)
95 | # 
96 | #         logits[idx] += 2 * epsilon
97 | #         pos_loss = bce_layer.forward(logits=logits, labels=labels)
98 | # 
99 | #         numeric_grads[idx] = (pos_loss - neg_loss) / (2 * epsilon)
100 | # 
101 | #         logits[idx] -= epsilon  # restore to original
102 | # 
103 | #     print("numerical gradients to logits = {}".format(numeric_grads))
104 | # 
105 | #     # --------- check
106 | #     is_equal = np.allclose(grad2logits, len(logits) * numeric_grads)
107 | #     assert is_equal
108 | #     print("Equal = {}".format(is_equal))
109 | 
110 | 
111 | def test_bucket_by_bisect():
112 |     age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
113 | 
114 |     for age in [5, 18, 31, 55, 42, 67]:
115 |         idx = bisect.bisect(age_boundaries, age)
116 | 
117 |         if idx == 0:
118 |             lb = '-inf'
119 |         else:
120 |             lb = age_boundaries[idx - 1]
121 | 
122 |         if idx == len(age_boundaries):
123 |             hb = 'inf'
124 |         else:
125 |             hb = age_boundaries[idx]
126 | 
127 |         print("{}<={}<{}".format(lb, age, hb))
128 | 
129 | 
130 | def test_split_columns():
131 |     a = np.arange(12).reshape(2, 6)
132 |     print(a)
133 | 
134 |     splits = utils.split_column(a, [2, 1, 3])
135 |     for idx, split in enumerate(splits, start=1):
136 |         print("\n--------- {}th split".format(idx))
137 |         print(split)
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     np.random.seed(999)
142 | 
143 |     # test_activations()
144 |     # test_initializer()
145 |     # test_bce_loss_with_logits()
146 |     # test_bucket_by_bisect()
147 |     test_split_columns()
148 | 
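149 | 
150 | 
151 | def check_grad_of_bce_wrt_logit():
152 |     # Illustrative sketch (a hypothetical helper, not referenced anywhere else in this repo):
153 |     # the estimators backpropagate grads2logits = pred_proba - label (see dnn.py and ftrl.py).
154 |     # This numerically checks that the derivative of the binary cross-entropy of sigmoid(logit)
155 |     # with respect to the logit is indeed sigmoid(logit) - label.
156 |     epsilon = 1e-6
157 |     logit, label = 0.7, 1.0
158 | 
159 |     def bce(z):
160 |         p = 1 / (1 + np.exp(-z))
161 |         return -label * np.log(p) - (1 - label) * np.log(1 - p)
162 | 
163 |     numeric_grad = (bce(logit + epsilon) - bce(logit - epsilon)) / (2 * epsilon)
164 |     analytic_grad = 1 / (1 + np.exp(-logit)) - label
165 |     print("numeric={:.6f} analytic={:.6f}".format(numeric_grad, analytic_grad))
166 |     assert np.allclose(numeric_grad, analytic_grad)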
-------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def chunk(stream, chunk_size): 5 | buf = [] 6 | 7 | for item in stream: 8 | buf.append(item) 9 | 10 | if len(buf) >= chunk_size: 11 | yield buf 12 | del buf[:] 13 | 14 | if len(buf) > 0: 15 | yield buf 16 | 17 | 18 | def split_column(m, col_sizes): 19 | offset = 0 20 | splits = [] 21 | 22 | for colsize in col_sizes: 23 | splits.append(m[:, offset:(offset + colsize)]) 24 | offset += colsize 25 | 26 | assert offset == m.shape[1] 27 | return splits 28 | 29 | 30 | def config_logging(fname): 31 | logging.basicConfig(level=logging.INFO, format='%(message)s') # re-format to remove prefix 'INFO:root' 32 | 33 | fh = logging.FileHandler(fname) 34 | fh.setLevel(logging.INFO) 35 | logging.getLogger("").addHandler(fh) 36 | -------------------------------------------------------------------------------- /wide_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ftrl import FtrlEstimator 3 | from base_estimator import BaseEstimator 4 | from collections import namedtuple 5 | 6 | 7 | class WideLayer: 8 | def __init__(self, field_names, alpha, beta, L1, L2, proba_fn): 9 | """ 10 | :param proba_fn: proba_fn(example_idx,logit)=probability 11 | 之所以用function是因为如果与DNN结合,计算probability还要考虑DNN提供的logit 12 | """ 13 | self._estimators = {field: FtrlEstimator(alpha=alpha, 14 | beta=beta, 15 | L1=L1, 16 | L2=L2) for field in (['bias'] + field_names)} 17 | self._proba_fn = proba_fn 18 | 19 | def __predict_logit(self, sp_features, example_idx): 20 | logit = 0 21 | 22 | for field, estimator in self._estimators.items(): 23 | if field == 'bias': 24 | feat_ids = [0] 25 | feat_vals = [1] 26 | else: 27 | sp_input = sp_features[field] 28 | feat_ids, feat_vals = sp_input.get_example_in_order(example_idx) 29 | 30 | logit += estimator.predict_logit(feature_ids=feat_ids, feature_values=feat_vals) 31 | 32 | return logit 33 | 34 | def train(self, sp_features, labels): 35 | """ 36 | :param sp_features: dict from field_name ==> SparseInput 37 | :return: probabilities from this train batch 38 | """ 39 | probas = [] 40 | for example_idx, label in enumerate(labels): 41 | logit = self.__predict_logit(sp_features, example_idx) 42 | 43 | pred_proba = self._proba_fn(example_idx, logit) 44 | probas.append(pred_proba) 45 | 46 | for _, estimator in self._estimators.items(): 47 | estimator.update(pred_proba=pred_proba, label=label) 48 | 49 | return np.asarray(probas) 50 | 51 | def predict_logits(self, sp_features): 52 | # 假定所有sp_feature都拥有相同的行数 53 | batch_size = None 54 | for sp_input in sp_features.values(): 55 | batch_size = sp_input.n_total_examples 56 | break 57 | 58 | logits = [self.__predict_logit(sp_features, example_idx) for example_idx in range(batch_size)] 59 | return np.asarray(logits) 60 | 61 | 62 | WideHParams = namedtuple("WideHParams", ['field_names', 'alpha', 'beta', 'L1', 'L2']) 63 | 64 | 65 | def _sigmoid(example_idx, logit): 66 | return 1 / (1 + np.exp(-logit)) 67 | 68 | 69 | class WideEstimator(BaseEstimator): 70 | def __init__(self, hparams, data_source): 71 | self._layer = WideLayer(field_names=hparams.field_names, 72 | alpha=hparams.alpha, 73 | beta=hparams.beta, 74 | L1=hparams.L1, 75 | L2=hparams.L2, 76 | proba_fn=_sigmoid) 77 | super().__init__(data_source) 78 | 79 | def train_batch(self, features, labels): 80 | return 
self._layer.train(sp_features=features, labels=labels) 81 | 82 | def predict(self, features): 83 | pred_logits = self._layer.predict_logits(sp_features=features) 84 | return 1 / (1 + np.exp(-pred_logits)) 85 | -------------------------------------------------------------------------------- /wide_n_deep.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dnn import DeepNetwork 3 | from wide_layer import WideLayer 4 | from base_estimator import BaseEstimator 5 | 6 | 7 | class WideDeepEstimator(BaseEstimator): 8 | def __init__(self, wide_hparams, deep_hparams, data_source): 9 | self._current_deep_logits = None 10 | 11 | self._wide_layer = WideLayer(field_names=wide_hparams.field_names, 12 | alpha=wide_hparams.alpha, 13 | beta=wide_hparams.beta, 14 | L1=wide_hparams.L1, 15 | L2=wide_hparams.L2, 16 | proba_fn=self._predict_proba) 17 | 18 | self._dnn = DeepNetwork(dense_fields=deep_hparams.dense_fields, 19 | vocab_infos=deep_hparams.vocab_infos, 20 | embed_fields=deep_hparams.embed_fields, 21 | hidden_units=deep_hparams.hidden_units, 22 | L2=deep_hparams.L2, 23 | optimizer=deep_hparams.optimizer) 24 | 25 | super().__init__(data_source) 26 | 27 | def _predict_proba(self, example_idx, wide_logit): 28 | deep_logit = self._current_deep_logits[example_idx] 29 | logit = deep_logit + wide_logit 30 | return 1 / (1 + np.exp(-logit)) 31 | 32 | def train_batch(self, features, labels): 33 | self._current_deep_logits = self._dnn.forward(features) 34 | 35 | pred_probas = self._wide_layer.train(features, labels) 36 | 37 | self._dnn.backward(grads2logits=pred_probas - labels) 38 | 39 | return pred_probas 40 | 41 | def predict(self, features): 42 | deep_logits = self._dnn.forward(features) 43 | 44 | wide_logits = self._wide_layer.predict_logits(features) 45 | 46 | logits = deep_logits + wide_logits 47 | 48 | return 1 / (1 + np.exp(-logits)) 49 | --------------------------------------------------------------------------------
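A standalone sketch (not part of the repository) of the lazy-weight trick used by FtrlEstimator.predict_logit in ftrl.py: the wide-side weights are rebuilt on the fly from the accumulators z and n, and any feature whose accumulated |z| <= L1 keeps a weight of exactly 0, which is what keeps the wide side sparse. The hyper-parameters mirror WideHParams in census_main.py; the accumulator values below are made up purely for illustration.

import numpy as np

alpha, beta, L1, L2 = 0.1, 1.0, 0.1, 0.1       # same hyper-parameters as WideHParams in census_main.py
z = {3: 0.8, 7: 0.05}                          # made-up accumulated (adjusted) gradients per feature id
n = {3: 2.5, 7: 0.3}                           # made-up accumulated squared gradients per feature id

logit = 0.0
for feat_id, feat_val in [(3, 1.0), (7, 1.0)]:  # one example's non-zero features in this field
    zi = z[feat_id]
    if abs(zi) > L1:                            # otherwise the lazy weight is exactly 0
        sign_z = -1.0 if zi < 0 else 1.0
        w = (sign_z * L1 - zi) / ((beta + np.sqrt(n[feat_id])) / alpha + L2)
        logit += w * feat_val

print("logit={:.4f}, proba={:.4f}".format(logit, 1 / (1 + np.exp(-logit))))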