├── .gitignore ├── README.md ├── activation ├── __init__.py ├── entmax.py └── glu.py ├── config ├── __init__.py └── covertype.py ├── download_prepare_covertype.py ├── model ├── tabnet.py └── tabnet_reduced.py ├── requirements.txt ├── run.sh ├── train_classifier.py └── util ├── __init__.py ├── data_helper.py ├── logging.py └── tfutils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | *.csv 132 | *.gz 133 | tflog/ 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TabNet Reduced 2 | 3 | Most of the code is taken from [here](https://github.com/google-research/google-research/tree/master/tabnet) for "TabNet: Attentive Interpretable Tabular Learning" by Sercan O. Arik and Tomas Pfister (paper: https://arxiv.org/abs/1908.07442). 4 | 5 | The modified model, reduced TabNet, is defined in `model/tabnet_reduced.py`. 
There are two modifications: 6 | * there is now 1 shared feature transformer block and 1 decision-step-dependent feature transformer block (down from 2 and 2, respectively), and 7 | * the sparsemax mask for feature selection has been replaced by entmax 1.5 (TensorFlow implementation from [here](https://gist.github.com/justheuristic/60167e77a95221586be315ae527c3cbd)). 8 | 9 | Together, these modifications improve the performance of TabNet with fewer parameters, in particular by producing a sharper mask for feature selection. 10 | 11 | ## Training and Evaluation 12 | 13 | As in the original repository, this repository contains an example implementation of TabNet on the Forest Covertype dataset (https://archive.ics.uci.edu/ml/datasets/covertype). 14 | 15 | To run everything end to end, execute `run.sh`. Otherwise, the steps can be run manually as follows. 16 | 17 | First, run `python download_prepare_covertype.py` to download and prepare the Forest Covertype dataset. 18 | This command creates `train_covertype.csv`, `val_covertype.csv`, and `test_covertype.csv` under the `data/` directory (the directory is created if it does not exist). 19 | 20 | To run the training and evaluation pipeline, use `python train_classifier.py`. Note that TensorBoard logs are written to `tflog/`. 21 | 22 | For simplicity, the hyperparameters for the reduced TabNet and the original TabNet model are kept the same. They can be found in `config/covertype.py`. To train the reduced TabNet, 23 | set `REDUCED = True`; to train the original TabNet, set `REDUCED = False`. 24 | 25 | ## Modifications for Other Datasets 26 | 27 | To adapt the experiment to other tabular datasets: 28 | - Place the new dataset's train, validation, and test CSV files under the `data/` directory, 29 | - Create a new config in `config/` by copying `config/covertype.py` and updating the numerical and categorical feature columns, file paths, and hyperparameters for the new dataset (a sketch is given below), 30 | - Re-optimize the TabNet hyperparameters for the new dataset in your config, 31 | - Import the new config's parameters in `train_classifier.py` (in place of `config.covertype`), 32 | - Select the reduced TabNet architecture by setting `REDUCED = True`, and 33 | - Change `MODEL_NAME` in your config to a name of your choice. 
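
For concreteness, a minimal sketch of such a config is shown below. Every column name, file path, and sample count is a placeholder for illustration only, and the hyperparameter values are simply copied from `config/covertype.py` as a starting point before re-optimization.

```python
# config/mydata.py -- hypothetical example config (all names are placeholders)

# Dataset size
N_VAL_SAMPLES = 10000   # placeholder
N_TEST_SAMPLES = 10000  # placeholder; the test set is read as one batch of this size
NUM_FEATURES = 4        # total number of feature columns listed below
NUM_CLASSES = 3

# All feature columns in the data
LABEL_COLUMN = "target"
INT_COLUMNS = ["age", "num_visits"]
FLOAT_COLUMNS = ["balance"]
BOOL_COLUMNS = ["is_active"]
STR_COLUMNS = []
STR_NUNIQUESS = []
ENCODED_CATEGORICAL_COLUMNS = []

# Model hyperparameters (copied from covertype; re-optimize for the new dataset)
FEATURE_DIM = 64
OUTPUT_DIM = 64
NUM_DECISION_STEPS = 5
RELAXATION_FACTOR = 1.5
BATCH_MOMENTUM = 0.7
VIRTUAL_BATCH_SIZE = 512  # BATCH_SIZE must be an integer multiple of this

# Training parameters
TRAIN_FILE = "data/train_mydata.csv"
VAL_FILE = "data/val_mydata.csv"
TEST_FILE = "data/test_mydata.csv"
MAX_STEPS = 1000000
DISPLAY_STEP = 1000
VAL_STEP = 10000
SAVE_STEP = 40000
TEST_STEP = 1000
INIT_LEARNING_RATE = 0.02
DECAY_EVERY = 500
DECAY_RATE = 0.95
BATCH_SIZE = 16384
SPARSITY_LOSS_WEIGHT = 0.0001
GRADIENT_THRESH = 2000.0
SEED = 1
REDUCED = True
MODEL_NAME = "tabnet_mydata_reduced_model" if REDUCED else "tabnet_mydata_model"
```

With the file saved as, say, `config/mydata.py`, the wildcard import at the top of `train_classifier.py` (`from config.covertype import *`) is switched to the new module.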
34 | -------------------------------------------------------------------------------- /activation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ptuls/tabnet-modified/5b1f8d13584b552f4808d95ffd253830c696fb4e/activation/__init__.py -------------------------------------------------------------------------------- /activation/entmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | 5 | # taken from https://gist.github.com/justheuristic/60167e77a95221586be315ae527c3cbd 6 | def entmax15(inputs, axis=-1): 7 | """ 8 | Entmax 1.5 implementation, heavily inspired by 9 | * paper: https://arxiv.org/pdf/1905.05702.pdf 10 | * pytorch code: https://github.com/deep-spin/entmax 11 | :param inputs: similar to softmax logits, but for entmax1.5 12 | :param axis: entmax1.5 outputs will sum to 1 over this axis 13 | :return: entmax activations of same shape as inputs 14 | """ 15 | 16 | @tf.custom_gradient 17 | def _entmax_inner(inputs): 18 | with tf.name_scope("entmax"): 19 | inputs = inputs / 2 # divide by 2 so as to solve actual entmax 20 | # subtract max for stability 21 | inputs -= tf.reduce_max(inputs, axis, keep_dims=True) 22 | 23 | threshold, _ = entmax_threshold_and_support(inputs, axis) 24 | outputs_sqrt = tf.nn.relu(inputs - threshold) 25 | outputs = tf.square(outputs_sqrt) 26 | 27 | def grad_fn(d_outputs): 28 | with tf.name_scope("entmax_grad"): 29 | d_inputs = d_outputs * outputs_sqrt 30 | q = tf.reduce_sum(d_inputs, axis=axis, keep_dims=True) 31 | q = q / tf.reduce_sum(outputs_sqrt, axis=axis, keep_dims=True) 32 | d_inputs -= q * outputs_sqrt 33 | return d_inputs 34 | 35 | return outputs, grad_fn 36 | 37 | return _entmax_inner(inputs) 38 | 39 | 40 | @tf.custom_gradient 41 | def sparse_entmax15_loss_with_logits(labels, logits): 42 | """ 43 | Computes sample-wise entmax1.5 loss 44 | :param labels: reference answers vector int64[batch_size] \in [0, num_classes) 45 | :param logits: output matrix float32[batch_size, num_classes] (not actually logits :) 46 | :returns: elementwise loss, float32[batch_size] 47 | """ 48 | assert logits.shape.ndims == 2 and labels.shape.ndims == 1 49 | with tf.name_scope("entmax_loss"): 50 | p_star = entmax15(logits, axis=-1) 51 | omega_entmax15 = (1 - (tf.reduce_sum(p_star * tf.sqrt(p_star), axis=-1))) / 0.75 52 | p_incr = p_star - tf.one_hot(labels, depth=tf.shape(logits)[-1], axis=-1) 53 | loss = omega_entmax15 + tf.einsum("ij,ij->i", p_incr, logits) 54 | 55 | def grad_fn(grad_output): 56 | with tf.name_scope("entmax_loss_grad"): 57 | return None, grad_output[..., None] * p_incr 58 | 59 | return loss, grad_fn 60 | 61 | 62 | @tf.custom_gradient 63 | def entmax15_loss_with_logits(labels, logits): 64 | """ 65 | Computes sample-wise entmax1.5 loss 66 | :param logits: "logits" matrix float32[batch_size, num_classes] 67 | :param labels: reference answers indicators, float32[batch_size, num_classes] 68 | :returns: elementwise loss, float32[batch_size] 69 | 70 | WARNING: this function does not propagate gradients through :labels: 71 | This behavior is the same as like softmax_crossentropy_with_logits v1 72 | It may become an issue if you do something like co-distillation 73 | """ 74 | assert labels.shape.ndims == logits.shape.ndims == 2 75 | with tf.name_scope("entmax_loss"): 76 | p_star = entmax15(logits, axis=-1) 77 | omega_entmax15 = (1 - (tf.reduce_sum(p_star * tf.sqrt(p_star), axis=-1))) / 0.75 78 | 
p_incr = p_star - labels 79 | loss = omega_entmax15 + tf.einsum("ij,ij->i", p_incr, logits) 80 | 81 | def grad_fn(grad_output): 82 | with tf.name_scope("entmax_loss_grad"): 83 | return None, grad_output[..., None] * p_incr 84 | 85 | return loss, grad_fn 86 | 87 | 88 | def top_k_over_axis(inputs, k, axis=-1, **kwargs): 89 | """ performs tf.nn.top_k over any chosen axis """ 90 | with tf.name_scope("top_k_along_axis"): 91 | if axis == -1: 92 | return tf.nn.top_k(inputs, k, **kwargs) 93 | 94 | perm_order = list(range(inputs.shape.ndims)) 95 | perm_order.append(perm_order.pop(axis)) 96 | inv_order = [perm_order.index(i) for i in range(len(perm_order))] 97 | 98 | input_perm = tf.transpose(inputs, perm_order) 99 | input_perm_sorted, sort_indices_perm = tf.nn.top_k(input_perm, k=k, **kwargs) 100 | 101 | input_sorted = tf.transpose(input_perm_sorted, inv_order) 102 | sort_indices = tf.transpose(sort_indices_perm, inv_order) 103 | return input_sorted, sort_indices 104 | 105 | 106 | def _make_ix_like(inputs, axis=-1): 107 | """ creates indices 0, ... , input[axis] unsqueezed to input dimensios """ 108 | assert inputs.shape.ndims is not None 109 | rho = tf.cast(tf.range(1, tf.shape(inputs)[axis] + 1), dtype=inputs.dtype) 110 | view = [1] * inputs.shape.ndims 111 | view[axis] = -1 112 | return tf.reshape(rho, view) 113 | 114 | 115 | def gather_over_axis(values, indices, gather_axis): 116 | """ 117 | replicates the behavior of torch.gather for tf<=1.8; 118 | for newer versions use tf.gather with batch_dims 119 | :param values: tensor [d0, ..., dn] 120 | :param indices: int64 tensor of same shape as values except for gather_axis 121 | :param gather_axis: performs gather along this axis 122 | :returns: gathered values, same shape as values except for gather_axis 123 | If gather_axis == 2 124 | gathered_values[i, j, k, ...] = values[i, j, indices[i, j, k, ...], ...] 
125 | see torch.gather for more detils 126 | """ 127 | assert indices.shape.ndims is not None 128 | assert indices.shape.ndims == values.shape.ndims 129 | 130 | ndims = indices.shape.ndims 131 | gather_axis = gather_axis % ndims 132 | shape = tf.shape(indices) 133 | 134 | selectors = [] 135 | for axis_i in range(ndims): 136 | if axis_i == gather_axis: 137 | selectors.append(indices) 138 | else: 139 | index_i = tf.range(tf.cast(shape[axis_i], dtype=indices.dtype), dtype=indices.dtype) 140 | index_i = tf.reshape(index_i, [-1 if i == axis_i else 1 for i in range(ndims)]) 141 | index_i = tf.tile(index_i, [shape[i] if i != axis_i else 1 for i in range(ndims)]) 142 | selectors.append(index_i) 143 | 144 | return tf.gather_nd(values, tf.stack(selectors, axis=-1)) 145 | 146 | 147 | def entmax_threshold_and_support(inputs, axis=-1): 148 | """ 149 | Computes clipping threshold for entmax1.5 over specified axis 150 | NOTE this implementation uses the same heuristic as 151 | the original code: https://tinyurl.com/pytorch-entmax-line-203 152 | :param inputs: (entmax1.5 inputs - max) / 2 153 | :param axis: entmax1.5 outputs will sum to 1 over this axis 154 | """ 155 | 156 | with tf.name_scope("entmax_threshold_and_support"): 157 | num_outcomes = tf.shape(inputs)[axis] 158 | inputs_sorted, _ = top_k_over_axis(inputs, k=num_outcomes, axis=axis, sorted=True) 159 | 160 | rho = _make_ix_like(inputs, axis=axis) 161 | 162 | mean = tf.cumsum(inputs_sorted, axis=axis) / rho 163 | 164 | mean_sq = tf.cumsum(tf.square(inputs_sorted), axis=axis) / rho 165 | delta = (1 - rho * (mean_sq - tf.square(mean))) / rho 166 | 167 | delta_nz = tf.nn.relu(delta) 168 | tau = mean - tf.sqrt(delta_nz) 169 | 170 | support_size = tf.reduce_sum( 171 | tf.to_int64(tf.less_equal(tau, inputs_sorted)), axis=axis, keep_dims=True 172 | ) 173 | 174 | tau_star = gather_over_axis(tau, support_size - 1, axis) 175 | return tau_star, support_size 176 | -------------------------------------------------------------------------------- /activation/glu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | 5 | def glu(act, n_units): 6 | """Generalized linear unit nonlinear activation.""" 7 | return act[:, :n_units] * tf.nn.sigmoid(act[:, n_units:]) 8 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ptuls/tabnet-modified/5b1f8d13584b552f4808d95ffd253830c696fb4e/config/__init__.py -------------------------------------------------------------------------------- /config/covertype.py: -------------------------------------------------------------------------------- 1 | # Dataset size 2 | # N_TRAIN_SAMPLES = 309871 3 | N_VAL_SAMPLES = 154937 4 | N_TEST_SAMPLES = 116203 5 | NUM_FEATURES = 54 6 | NUM_CLASSES = 7 7 | 8 | # All feature columns in the data 9 | LABEL_COLUMN = "Covertype" 10 | 11 | BOOL_COLUMNS = [ 12 | "Wilderness_Area1", 13 | "Wilderness_Area2", 14 | "Wilderness_Area3", 15 | "Wilderness_Area4", 16 | "Soil_Type1", 17 | "Soil_Type2", 18 | "Soil_Type3", 19 | "Soil_Type4", 20 | "Soil_Type5", 21 | "Soil_Type6", 22 | "Soil_Type7", 23 | "Soil_Type8", 24 | "Soil_Type9", 25 | "Soil_Type10", 26 | "Soil_Type11", 27 | "Soil_Type12", 28 | "Soil_Type13", 29 | "Soil_Type14", 30 | "Soil_Type15", 31 | "Soil_Type16", 32 | "Soil_Type17", 33 | "Soil_Type18", 34 | "Soil_Type19", 35 | "Soil_Type20", 36 | 
"Soil_Type21", 37 | "Soil_Type22", 38 | "Soil_Type23", 39 | "Soil_Type24", 40 | "Soil_Type25", 41 | "Soil_Type26", 42 | "Soil_Type27", 43 | "Soil_Type28", 44 | "Soil_Type29", 45 | "Soil_Type30", 46 | "Soil_Type31", 47 | "Soil_Type32", 48 | "Soil_Type33", 49 | "Soil_Type34", 50 | "Soil_Type35", 51 | "Soil_Type36", 52 | "Soil_Type37", 53 | "Soil_Type38", 54 | "Soil_Type39", 55 | "Soil_Type40", 56 | ] 57 | 58 | INT_COLUMNS = [ 59 | "Elevation", 60 | "Aspect", 61 | "Slope", 62 | "Horizontal_Distance_To_Hydrology", 63 | "Vertical_Distance_To_Hydrology", 64 | "Horizontal_Distance_To_Roadways", 65 | "Hillshade_9am", 66 | "Hillshade_Noon", 67 | "Hillshade_3pm", 68 | "Horizontal_Distance_To_Fire_Points", 69 | ] 70 | 71 | STR_COLUMNS = [] 72 | STR_NUNIQUESS = [] 73 | 74 | FLOAT_COLUMNS = [] 75 | 76 | ENCODED_CATEGORICAL_COLUMNS = [] 77 | 78 | # Model hyperparameters 79 | FEATURE_DIM = 64 80 | OUTPUT_DIM = 64 81 | NUM_DECISION_STEPS = 5 82 | RELAXATION_FACTOR = 1.5 83 | BATCH_MOMENTUM = 0.7 84 | VIRTUAL_BATCH_SIZE = 512 85 | 86 | # Training parameters 87 | TRAIN_FILE = "data/train_covertype.csv" 88 | VAL_FILE = "data/val_covertype.csv" 89 | TEST_FILE = "data/test_covertype.csv" 90 | MAX_STEPS = 1000000 91 | DISPLAY_STEP = 1000 92 | VAL_STEP = 10000 93 | SAVE_STEP = 40000 94 | TEST_STEP = 1000 95 | INIT_LEARNING_RATE = 0.02 96 | DECAY_EVERY = 500 97 | DECAY_RATE = 0.95 98 | BATCH_SIZE = 16384 99 | SPARSITY_LOSS_WEIGHT = 0.0001 100 | GRADIENT_THRESH = 2000.0 101 | SEED = 1 102 | REDUCED = True 103 | MODEL_NAME = "tabnet_forest_covertype_reduced_model" if REDUCED else "tabnet_forest_covertype_model" 104 | -------------------------------------------------------------------------------- /download_prepare_covertype.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # This script was taken from TabNet's repository 17 | 18 | """Downloads and prepares the Forest Covertype dataset.""" 19 | 20 | import gzip 21 | import os 22 | import shutil 23 | import pandas as pd 24 | from sklearn.model_selection import train_test_split 25 | import wget 26 | 27 | URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz" 28 | 29 | 30 | def main(): 31 | 32 | if not os.path.exists("./data"): 33 | os.makedirs("./data") 34 | 35 | filename = wget.download(URL) 36 | with gzip.open(filename, "rb") as f_in: 37 | with open("data/covtype.csv", "wb") as f_out: 38 | shutil.copyfileobj(f_in, f_out) 39 | 40 | df = pd.read_csv("data/covtype.csv") 41 | n_total = len(df) 42 | 43 | # Train, val and test split follows 44 | # Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank. 45 | # Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018. 
46 | 47 | train_val_indices, test_indices = train_test_split( 48 | range(n_total), test_size=0.2, random_state=0 49 | ) 50 | train_indices, val_indices = train_test_split( 51 | train_val_indices, test_size=0.2 / 0.6, random_state=0 52 | ) 53 | 54 | traindf = df.iloc[train_indices] 55 | valdf = df.iloc[val_indices] 56 | testdf = df.iloc[test_indices] 57 | traindf = traindf.sample(frac=1) 58 | 59 | traindf.to_csv("data/train_covertype.csv", index=False, header=False) 60 | valdf.to_csv("data/val_covertype.csv", index=False, header=False) 61 | testdf.to_csv("data/test_covertype.csv", index=False, header=False) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /model/tabnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """TabNet model.""" 17 | import numpy as np 18 | import tensorflow.compat.v1 as tf 19 | import sys 20 | 21 | sys.path.append("..") 22 | 23 | from activation.glu import glu 24 | 25 | 26 | class TabNet(object): 27 | """TabNet model class.""" 28 | 29 | def __init__( 30 | self, 31 | columns, 32 | num_features, 33 | feature_dim, 34 | output_dim, 35 | num_decision_steps, 36 | relaxation_factor, 37 | batch_momentum, 38 | virtual_batch_size, 39 | num_classes, 40 | epsilon=0.00001, 41 | ): 42 | """Initializes a TabNet instance. 43 | 44 | Args: 45 | columns: The Tensorflow column names for the dataset. 46 | num_features: The number of input features (i.e the number of columns for 47 | tabular data assuming each feature is represented with 1 dimension). 48 | feature_dim: Dimensionality of the hidden representation in feature 49 | transformation block. Each layer first maps the representation to a 50 | 2*feature_dim-dimensional output and half of it is used to determine the 51 | nonlinearity of the GLU activation where the other half is used as an 52 | input to GLU, and eventually feature_dim-dimensional output is 53 | transferred to the next layer. 54 | output_dim: Dimensionality of the outputs of each decision step, which is 55 | later mapped to the final classification or regression output. 56 | num_decision_steps: Number of sequential decision steps. 57 | relaxation_factor: Relaxation factor that promotes the reuse of each 58 | feature at different decision steps. When it is 1, a feature is enforced 59 | to be used only at one decision step and as it increases, more 60 | flexibility is provided to use a feature at multiple decision steps. 61 | batch_momentum: Momentum in ghost batch normalization. 62 | virtual_batch_size: Virtual batch size in ghost batch normalization. The 63 | overall batch size should be an integer multiple of virtual_batch_size. 64 | num_classes: Number of output classes. 65 | epsilon: A small number for numerical stability of the entropy calcations. 
66 | 67 | Returns: 68 | A TabNet instance. 69 | """ 70 | 71 | self.columns = columns 72 | self.num_features = num_features 73 | self.feature_dim = feature_dim 74 | self.output_dim = output_dim 75 | self.num_decision_steps = num_decision_steps 76 | self.relaxation_factor = relaxation_factor 77 | self.batch_momentum = batch_momentum 78 | self.virtual_batch_size = virtual_batch_size 79 | self.num_classes = num_classes 80 | self.epsilon = epsilon 81 | 82 | def encoder(self, data, reuse, is_training): 83 | """TabNet encoder model.""" 84 | 85 | with tf.variable_scope("Encoder", reuse=reuse): 86 | 87 | # Reads and normalizes input features. 88 | features = tf.feature_column.input_layer(data, self.columns) 89 | features = tf.layers.batch_normalization( 90 | features, training=is_training, momentum=self.batch_momentum 91 | ) 92 | batch_size = tf.shape(features)[0] 93 | 94 | # Initializes decision-step dependent variables. 95 | output_aggregated = tf.zeros([batch_size, self.output_dim]) 96 | masked_features = features 97 | mask_values = tf.zeros([batch_size, self.num_features]) 98 | aggregated_mask_values = tf.zeros([batch_size, self.num_features]) 99 | complemantary_aggregated_mask_values = tf.ones([batch_size, self.num_features]) 100 | total_entropy = 0 101 | 102 | v_b = self.virtual_batch_size if is_training else 1 103 | for ni in range(self.num_decision_steps): 104 | # Feature transformer with two shared and two decision step dependent 105 | # blocks is used below. 106 | reuse_flag = ni > 0 107 | 108 | transform_f1 = tf.layers.dense( 109 | masked_features, 110 | self.feature_dim * 2, 111 | name="Transform_f1", 112 | reuse=reuse_flag, 113 | use_bias=False, 114 | ) 115 | transform_f1 = tf.layers.batch_normalization( 116 | transform_f1, 117 | training=is_training, 118 | momentum=self.batch_momentum, 119 | virtual_batch_size=v_b, 120 | ) 121 | transform_f1 = glu(transform_f1, self.feature_dim) 122 | 123 | transform_f2 = tf.layers.dense( 124 | transform_f1, 125 | self.feature_dim * 2, 126 | name="Transform_f2", 127 | reuse=reuse_flag, 128 | use_bias=False, 129 | ) 130 | transform_f2 = tf.layers.batch_normalization( 131 | transform_f2, 132 | training=is_training, 133 | momentum=self.batch_momentum, 134 | virtual_batch_size=v_b, 135 | ) 136 | transform_f2 = (glu(transform_f2, self.feature_dim) + transform_f1) * np.sqrt(0.5) 137 | 138 | transform_f3 = tf.layers.dense( 139 | transform_f2, 140 | self.feature_dim * 2, 141 | name="Transform_f3" + str(ni), 142 | use_bias=False, 143 | ) 144 | transform_f3 = tf.layers.batch_normalization( 145 | transform_f3, 146 | training=is_training, 147 | momentum=self.batch_momentum, 148 | virtual_batch_size=v_b, 149 | ) 150 | transform_f3 = (glu(transform_f3, self.feature_dim) + transform_f2) * np.sqrt(0.5) 151 | 152 | transform_f4 = tf.layers.dense( 153 | transform_f3, 154 | self.feature_dim * 2, 155 | name="Transform_f4" + str(ni), 156 | use_bias=False, 157 | ) 158 | transform_f4 = tf.layers.batch_normalization( 159 | transform_f4, 160 | training=is_training, 161 | momentum=self.batch_momentum, 162 | virtual_batch_size=v_b, 163 | ) 164 | transform_f4 = (glu(transform_f4, self.feature_dim) + transform_f3) * np.sqrt(0.5) 165 | 166 | if ni > 0: 167 | 168 | decision_out = tf.nn.relu(transform_f4[:, : self.output_dim]) 169 | 170 | # Decision aggregation. 171 | output_aggregated += decision_out 172 | 173 | # Aggregated masks are used for visualization of the 174 | # feature importance attributes. 
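                    # Each step's mask is weighted by the magnitude of that step's
                    # decision output (summed over its units), averaged over the
                    # (num_decision_steps - 1) aggregating steps.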
175 | scale_agg = tf.reduce_sum(decision_out, axis=1, keep_dims=True) / ( 176 | self.num_decision_steps - 1 177 | ) 178 | aggregated_mask_values += mask_values * scale_agg 179 | 180 | features_for_coef = transform_f4[:, self.output_dim :] 181 | 182 | if ni < self.num_decision_steps - 1: 183 | 184 | # Determines the feature masks via linear and nonlinear 185 | # transformations, taking into account of aggregated feature use. 186 | mask_values = tf.layers.dense( 187 | features_for_coef, 188 | self.num_features, 189 | name="Transform_coef" + str(ni), 190 | use_bias=False, 191 | ) 192 | mask_values = tf.layers.batch_normalization( 193 | mask_values, 194 | training=is_training, 195 | momentum=self.batch_momentum, 196 | virtual_batch_size=v_b, 197 | ) 198 | mask_values *= complemantary_aggregated_mask_values 199 | mask_values = tf.contrib.sparsemax.sparsemax(mask_values) 200 | 201 | # Relaxation factor controls the amount of reuse of features between 202 | # different decision blocks and updated with the values of 203 | # coefficients. 204 | complemantary_aggregated_mask_values *= self.relaxation_factor - mask_values 205 | 206 | # Entropy is used to penalize the amount of sparsity in feature 207 | # selection. 208 | total_entropy += tf.reduce_mean( 209 | tf.reduce_sum(-mask_values * tf.log(mask_values + self.epsilon), axis=1) 210 | ) / (self.num_decision_steps - 1) 211 | 212 | # Feature selection. 213 | masked_features = tf.multiply(mask_values, features) 214 | 215 | # Visualization of the feature selection mask at decision step ni 216 | tf.summary.image( 217 | "Mask_for_step" + str(ni), 218 | tf.expand_dims(tf.expand_dims(mask_values, 0), 3), 219 | max_outputs=1, 220 | ) 221 | 222 | # Visualization of the aggregated feature importances 223 | tf.summary.image( 224 | "Aggregated_mask", 225 | tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3), 226 | max_outputs=1, 227 | ) 228 | 229 | return output_aggregated, total_entropy 230 | 231 | def classify(self, activations, reuse): 232 | """TabNet classify block.""" 233 | 234 | with tf.variable_scope("Classify", reuse=reuse): 235 | logits = tf.layers.dense(activations, self.num_classes, use_bias=False) 236 | predictions = tf.nn.softmax(logits) 237 | return logits, predictions 238 | 239 | def regress(self, activations, reuse): 240 | """TabNet regress block.""" 241 | 242 | with tf.variable_scope("Regress", reuse=reuse): 243 | predictions = tf.layers.dense(activations, 1) 244 | return predictions 245 | -------------------------------------------------------------------------------- /model/tabnet_reduced.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Modified reduced TabNet model.""" 3 | import numpy as np 4 | import sys 5 | 6 | sys.path.append("..") 7 | 8 | import tensorflow.compat.v1 as tf 9 | 10 | from activation.entmax import entmax15 11 | from activation.glu import glu 12 | 13 | 14 | class TabNetReduced(object): 15 | """Reduced TabNet model class.""" 16 | 17 | def __init__( 18 | self, 19 | columns, 20 | num_features, 21 | feature_dim, 22 | output_dim, 23 | num_decision_steps, 24 | relaxation_factor, 25 | batch_momentum, 26 | virtual_batch_size, 27 | num_classes, 28 | epsilon=0.00001, 29 | ): 30 | """Initializes a reduced TabNet instance. 31 | 32 | Args: 33 | columns: The Tensorflow column names for the dataset. 34 | num_features: The number of input features (i.e the number of columns for 35 | tabular data assuming each feature is represented with 1 dimension). 
36 | feature_dim: Dimensionality of the hidden representation in feature 37 | transformation block. Each layer first maps the representation to a 38 | 2*feature_dim-dimensional output and half of it is used to determine the 39 | nonlinearity of the GLU activation where the other half is used as an 40 | input to GLU, and eventually feature_dim-dimensional output is 41 | transferred to the next layer. 42 | output_dim: Dimensionality of the outputs of each decision step, which is 43 | later mapped to the final classification or regression output. 44 | num_decision_steps: Number of sequential decision steps. 45 | relaxation_factor: Relaxation factor that promotes the reuse of each 46 | feature at different decision steps. When it is 1, a feature is enforced 47 | to be used only at one decision step and as it increases, more 48 | flexibility is provided to use a feature at multiple decision steps. 49 | batch_momentum: Momentum in ghost batch normalization. 50 | virtual_batch_size: Virtual batch size in ghost batch normalization. The 51 | overall batch size should be an integer multiple of virtual_batch_size. 52 | num_classes: Number of output classes. 53 | epsilon: A small number for numerical stability of the entropy calcations. 54 | 55 | Returns: 56 | A reduced TabNet instance. 57 | """ 58 | 59 | self.columns = columns 60 | self.num_features = num_features 61 | self.feature_dim = feature_dim 62 | self.output_dim = output_dim 63 | self.num_decision_steps = num_decision_steps 64 | self.relaxation_factor = relaxation_factor 65 | self.batch_momentum = batch_momentum 66 | self.virtual_batch_size = virtual_batch_size 67 | self.num_classes = num_classes 68 | self.epsilon = epsilon 69 | 70 | def encoder(self, data, reuse, is_training): 71 | """Reduced TabNet encoder model.""" 72 | 73 | with tf.variable_scope("Encoder", reuse=reuse): 74 | 75 | # Reads and normalizes input features. 76 | features = tf.feature_column.input_layer(data, self.columns) 77 | features = tf.layers.batch_normalization( 78 | features, training=is_training, momentum=self.batch_momentum 79 | ) 80 | batch_size = tf.shape(features)[0] 81 | 82 | # Initializes decision-step dependent variables. 83 | output_aggregated = tf.zeros([batch_size, self.output_dim]) 84 | masked_features = features 85 | mask_values = tf.zeros([batch_size, self.num_features]) 86 | aggregated_mask_values = tf.zeros([batch_size, self.num_features]) 87 | complementary_aggregated_mask_values = tf.ones([batch_size, self.num_features]) 88 | total_entropy = 0 89 | 90 | v_b = self.virtual_batch_size if is_training else 1 91 | # Feature transformer: a sort of recurrent structure 92 | # TODO: can we automate number of decision steps needed? 93 | for ni in range(self.num_decision_steps): 94 | # Feature transformer with one shared and one decision step dependent 95 | # blocks is used below. 
This departs from the original model 96 | reuse_flag = ni > 0 97 | 98 | # shared because of the same name 99 | transform_f1 = tf.layers.dense( 100 | masked_features, 101 | self.feature_dim * 2, 102 | name="Transform_f1", 103 | reuse=reuse_flag, 104 | use_bias=False, 105 | ) 106 | transform_f1 = tf.layers.batch_normalization( 107 | transform_f1, 108 | training=is_training, 109 | momentum=self.batch_momentum, 110 | virtual_batch_size=v_b, 111 | ) 112 | transform_f1 = glu(transform_f1, self.feature_dim) 113 | 114 | # step dependent 115 | transform_f2 = tf.layers.dense( 116 | transform_f1, 117 | self.feature_dim * 2, 118 | name="Transform_f1" + str(ni), 119 | use_bias=False, 120 | ) 121 | transform_f2 = tf.layers.batch_normalization( 122 | transform_f2, 123 | training=is_training, 124 | momentum=self.batch_momentum, 125 | virtual_batch_size=v_b, 126 | ) 127 | transform_f2 = (glu(transform_f2, self.feature_dim) + transform_f1) * np.sqrt(0.5) 128 | 129 | if ni > 0: 130 | decision_out = tf.nn.relu(transform_f2[:, : self.output_dim]) 131 | 132 | # Decision aggregation. 133 | output_aggregated += decision_out 134 | 135 | # Aggregated masks are used for visualization of the 136 | # feature importance attributes. 137 | scale_agg = tf.reduce_sum(decision_out, axis=1, keep_dims=True) / ( 138 | self.num_decision_steps - 1 139 | ) 140 | aggregated_mask_values += mask_values * scale_agg 141 | 142 | features_for_coef = transform_f2[:, self.output_dim :] 143 | 144 | # Attentive transformer 145 | if ni < self.num_decision_steps - 1: 146 | 147 | # Determines the feature masks via linear and nonlinear 148 | # transformations, taking into account of aggregated feature use. 149 | mask_values = tf.layers.dense( 150 | features_for_coef, 151 | self.num_features, 152 | name="Transform_coef" + str(ni), 153 | use_bias=False, 154 | ) 155 | mask_values = tf.layers.batch_normalization( 156 | mask_values, 157 | training=is_training, 158 | momentum=self.batch_momentum, 159 | virtual_batch_size=v_b, 160 | ) 161 | mask_values *= complementary_aggregated_mask_values 162 | # replace sparsemax with entmax 1.5 163 | mask_values = entmax15(mask_values) 164 | 165 | # Relaxation factor controls the amount of reuse of features between 166 | # different decision blocks and updated with the values of 167 | # coefficients. 168 | complementary_aggregated_mask_values *= self.relaxation_factor - mask_values 169 | 170 | # Entropy is used to penalize the amount of sparsity in feature 171 | # selection. 172 | total_entropy += tf.reduce_mean( 173 | tf.reduce_sum(-mask_values * tf.log(mask_values + self.epsilon), axis=1) 174 | ) / (self.num_decision_steps - 1) 175 | 176 | # Feature selection. 
177 | masked_features = tf.multiply(mask_values, features) 178 | 179 | # Visualization of the feature selection mask at decision step ni 180 | tf.summary.image( 181 | "Mask_for_step" + str(ni), 182 | tf.expand_dims(tf.expand_dims(mask_values, 0), 3), 183 | max_outputs=1, 184 | ) 185 | 186 | # Visualization of the aggregated feature importances 187 | tf.summary.image( 188 | "Aggregated_mask", 189 | tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3), 190 | max_outputs=1, 191 | ) 192 | 193 | return output_aggregated, total_entropy 194 | 195 | def classify(self, activations, reuse): 196 | """Reduced TabNet classify block.""" 197 | 198 | with tf.variable_scope("Classify", reuse=reuse): 199 | logits = tf.layers.dense(activations, self.num_classes, use_bias=False) 200 | predictions = tf.nn.softmax(logits) 201 | return logits, predictions 202 | 203 | def regress(self, activations, reuse): 204 | """Reduced TabNet regress block.""" 205 | 206 | with tf.variable_scope("Regress", reuse=reuse): 207 | predictions = tf.layers.dense(activations, 1) 208 | return predictions 209 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu==1.15.4 2 | absl-py>=0.5.0 3 | numpy==1.15.1 4 | scikit-learn==0.20.1 5 | wget>=3.2 6 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | 5 | virtualenv -p python3 . 6 | source ./bin/activate 7 | 8 | pip install tensorflow 9 | pip install -r requirements.txt 10 | python -m train_classifier 11 | -------------------------------------------------------------------------------- /train_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train the TabNet or reduced TabNet model on various datasets.""" 3 | import os 4 | from absl import app 5 | import numpy as np 6 | import tensorflow.compat.v1 as tf 7 | 8 | from datetime import datetime 9 | from config.covertype import * 10 | from model import tabnet, tabnet_reduced 11 | from util import data_helper, logging 12 | 13 | logger = logging.create_logger() 14 | 15 | # Run Tensorflow on GPU 0 16 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 17 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 18 | 19 | 20 | def sort_col_names(feature_cols): 21 | column_names = sorted(feature_cols) 22 | logger.info("Ordered column names, corresponding to the indexing in Tensorboard visualization") 23 | for fi in range(len(column_names)): 24 | logger.info(str(fi) + " : " + column_names[fi]) 25 | 26 | 27 | def main(unused_argv): 28 | # column order 29 | feature_columns = ( 30 | INT_COLUMNS + ENCODED_CATEGORICAL_COLUMNS + BOOL_COLUMNS + STR_COLUMNS + FLOAT_COLUMNS 31 | ) 32 | all_columns = feature_columns + [LABEL_COLUMN] 33 | 34 | # Fix random seeds 35 | tf.set_random_seed(SEED) 36 | np.random.seed(SEED) 37 | 38 | input_columns = data_helper.get_columns( 39 | INT_COLUMNS, ENCODED_CATEGORICAL_COLUMNS, BOOL_COLUMNS, FLOAT_COLUMNS, STR_COLUMNS 40 | ) 41 | 42 | # Define the TabNet model 43 | tabnet_model = ( 44 | ( 45 | tabnet_reduced.TabNetReduced( 46 | columns=input_columns, 47 | num_features=NUM_FEATURES, 48 | feature_dim=FEATURE_DIM, 49 | output_dim=OUTPUT_DIM, 50 | num_decision_steps=NUM_DECISION_STEPS, 51 | relaxation_factor=RELAXATION_FACTOR, 52 | 
batch_momentum=BATCH_MOMENTUM, 53 | virtual_batch_size=VIRTUAL_BATCH_SIZE, 54 | num_classes=NUM_CLASSES, 55 | ) 56 | ) 57 | if REDUCED 58 | else ( 59 | tabnet.TabNet( 60 | columns=input_columns, 61 | num_features=NUM_FEATURES, 62 | feature_dim=FEATURE_DIM, 63 | output_dim=OUTPUT_DIM, 64 | num_decision_steps=NUM_DECISION_STEPS, 65 | relaxation_factor=RELAXATION_FACTOR, 66 | batch_momentum=BATCH_MOMENTUM, 67 | virtual_batch_size=VIRTUAL_BATCH_SIZE, 68 | num_classes=NUM_CLASSES, 69 | ) 70 | ) 71 | ) 72 | 73 | sort_col_names(feature_columns) 74 | 75 | # Input sampling 76 | train_batch = data_helper.input_fn( 77 | TRAIN_FILE, 78 | INT_COLUMNS, 79 | BOOL_COLUMNS, 80 | FLOAT_COLUMNS, 81 | STR_COLUMNS, 82 | LABEL_COLUMN, 83 | num_epochs=MAX_STEPS, 84 | shuffle=True, 85 | batch_size=BATCH_SIZE, 86 | ) 87 | test_batch = data_helper.input_fn( 88 | TEST_FILE, 89 | INT_COLUMNS, 90 | BOOL_COLUMNS, 91 | FLOAT_COLUMNS, 92 | STR_COLUMNS, 93 | LABEL_COLUMN, 94 | num_epochs=MAX_STEPS, 95 | shuffle=False, 96 | batch_size=N_TEST_SAMPLES, 97 | ) 98 | 99 | train_iter = train_batch.make_initializable_iterator() 100 | test_iter = test_batch.make_initializable_iterator() 101 | 102 | feature_train_batch, label_train_batch = train_iter.get_next() 103 | feature_test_batch, label_test_batch = test_iter.get_next() 104 | 105 | # Define the model and losses 106 | encoded_train_batch, total_entropy = tabnet_model.encoder( 107 | feature_train_batch, reuse=False, is_training=True 108 | ) 109 | 110 | logits_orig_batch, _ = tabnet_model.classify(encoded_train_batch, reuse=False) 111 | 112 | softmax_orig_key_op = tf.reduce_mean( 113 | tf.nn.sparse_softmax_cross_entropy_with_logits( 114 | logits=logits_orig_batch, labels=label_train_batch 115 | ) 116 | ) 117 | 118 | train_loss_op = softmax_orig_key_op + SPARSITY_LOSS_WEIGHT * total_entropy 119 | tf.summary.scalar("Total loss", train_loss_op) 120 | 121 | # Optimization step 122 | global_step = tf.train.get_or_create_global_step() 123 | learning_rate = tf.train.exponential_decay( 124 | INIT_LEARNING_RATE, global_step=global_step, decay_steps=DECAY_EVERY, decay_rate=DECAY_RATE 125 | ) 126 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 127 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 128 | with tf.control_dependencies(update_ops): 129 | gvs = optimizer.compute_gradients(train_loss_op) 130 | capped_gvs = [ 131 | (tf.clip_by_value(grad, -GRADIENT_THRESH, GRADIENT_THRESH), var) for grad, var in gvs 132 | ] 133 | train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step) 134 | 135 | # Model evaluation 136 | # Test performance 137 | encoded_test_batch, _ = tabnet_model.encoder(feature_test_batch, reuse=True, is_training=False) 138 | 139 | _, prediction_test = tabnet_model.classify(encoded_test_batch, reuse=True) 140 | 141 | predicted_labels = tf.cast(tf.argmax(prediction_test, 1), dtype=tf.int32) 142 | test_eq_op = tf.equal(predicted_labels, label_test_batch) 143 | test_acc_op = tf.reduce_mean(tf.cast(test_eq_op, dtype=tf.float32)) 144 | tf.summary.scalar("Test accuracy", test_acc_op) 145 | 146 | # Training setup 147 | current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 148 | model_name = MODEL_NAME + f"_{current_time}" 149 | init = tf.initialize_all_variables() 150 | init_local = tf.local_variables_initializer() 151 | init_table = tf.tables_initializer(name="Initialize_all_tables") 152 | saver = tf.train.Saver() 153 | summaries = tf.summary.merge_all() 154 | 155 | with tf.Session() as sess: 156 | summary_writer = 
tf.summary.FileWriter("./tflog/" + model_name, sess.graph) 157 | 158 | sess.run(init) 159 | sess.run(init_local) 160 | sess.run(init_table) 161 | sess.run(train_iter.initializer) 162 | sess.run(test_iter.initializer) 163 | 164 | for step in range(1, MAX_STEPS + 1): 165 | if step % DISPLAY_STEP == 0: 166 | _, train_loss, merged_summary = sess.run([train_op, train_loss_op, summaries]) 167 | summary_writer.add_summary(merged_summary, step) 168 | logger.info( 169 | "Step " + str(step) + ", Training Loss = " + "{:.4f}".format(train_loss) 170 | ) 171 | else: 172 | _ = sess.run(train_op) 173 | 174 | if step % TEST_STEP == 0: 175 | feed_arr = [vars()["summaries"], vars()["test_acc_op"]] 176 | 177 | test_arr = sess.run(feed_arr) 178 | merged_summary = test_arr[0] 179 | test_acc = test_arr[1] 180 | 181 | logger.info("Step " + str(step) + ", Test Accuracy = " + "{:.4f}".format(test_acc)) 182 | summary_writer.add_summary(merged_summary, step) 183 | 184 | if step % SAVE_STEP == 0: 185 | saver.save(sess, "./checkpoints/" + model_name + ".ckpt") 186 | 187 | 188 | if __name__ == "__main__": 189 | app.run(main) 190 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ptuls/tabnet-modified/5b1f8d13584b552f4808d95ffd253830c696fb4e/util/__init__.py -------------------------------------------------------------------------------- /util/data_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import functools 3 | import tensorflow as tf 4 | from tensorflow.python.framework import dtypes 5 | 6 | 7 | def set_defaults(int_columns, bool_columns, float_columns, str_columns): 8 | return ( 9 | [[0] for col in int_columns] 10 | + [[""] for col in bool_columns] 11 | + [[0.0] for col in float_columns] 12 | + [[""] for col in str_columns] 13 | + [[-1]] 14 | ) 15 | 16 | 17 | def get_columns(int_columns, encoded_categorical_columns, bool_columns, float_columns, str_columns): 18 | """Get the representations for all input columns.""" 19 | 20 | columns = [] 21 | if float_columns: 22 | columns += [ 23 | tf.feature_column.numeric_column(ci, dtype=dtypes.float32) for ci in float_columns 24 | ] 25 | if int_columns: 26 | columns += [tf.feature_column.numeric_column(ci, dtype=dtypes.int32) for ci in int_columns] 27 | if encoded_categorical_columns: 28 | columns += [ 29 | tf.feature_column.numeric_column(ci, dtype=dtypes.int32) 30 | for ci in encoded_categorical_columns 31 | ] 32 | if str_columns: 33 | # pylint: disable=g-complex-comprehension 34 | str_nuniquess = len(set(str_columns)) 35 | columns += [ 36 | tf.feature_column.embedding_column( 37 | tf.feature_column.categorical_column_with_hash_bucket( 38 | ci, hash_bucket_size=int(3 * num) 39 | ), 40 | dimension=1, 41 | ) 42 | for ci, num in zip(str_columns, str_nuniquess) 43 | ] 44 | if bool_columns: 45 | # pylint: disable=g-complex-comprehension 46 | columns += [ 47 | tf.feature_column.embedding_column( 48 | tf.feature_column.categorical_column_with_hash_bucket(ci, hash_bucket_size=3), 49 | dimension=1, 50 | ) 51 | for ci in bool_columns 52 | ] 53 | return columns 54 | 55 | 56 | def parse_csv(int_columns, bool_columns, float_columns, str_columns, label_column, value_column): 57 | """Parses a CSV file based on the provided column types.""" 58 | defaults = set_defaults(int_columns, bool_columns, float_columns, str_columns) 59 | all_columns = 
int_columns + bool_columns + float_columns + str_columns + [label_column] 60 | columns = tf.decode_csv(value_column, record_defaults=defaults) 61 | features = dict(zip(all_columns, columns)) 62 | label = features.pop(label_column) 63 | classes = tf.cast(label, tf.int32) - 1 64 | return features, classes 65 | 66 | 67 | def input_fn( 68 | data_file, 69 | int_columns, 70 | bool_columns, 71 | float_columns, 72 | str_columns, 73 | label_column, 74 | num_epochs, 75 | shuffle, 76 | batch_size, 77 | n_buffer=50, 78 | n_parallel=16, 79 | ): 80 | """Function to read the input file and return the dataset. 81 | 82 | Args: 83 | data_file: Name of the file. 84 | num_epochs: Number of epochs. 85 | shuffle: Whether to shuffle the data. 86 | batch_size: Batch size. 87 | n_buffer: Buffer size. 88 | n_parallel: Number of cores for multi-core processing option. 89 | 90 | Returns: 91 | The Tensorflow dataset. 92 | """ 93 | 94 | # Extract lines from input files using the Dataset API. 95 | dataset = tf.data.TextLineDataset(data_file) 96 | 97 | if shuffle: 98 | dataset = dataset.shuffle(buffer_size=n_buffer) 99 | 100 | parse_csv_partial = functools.partial( 101 | parse_csv, 102 | int_columns, 103 | bool_columns, 104 | float_columns, 105 | str_columns, 106 | label_column, 107 | ) 108 | 109 | dataset = dataset.batch(batch_size, drop_remainder=True) 110 | dataset = dataset.map(parse_csv_partial, num_parallel_calls=n_parallel) 111 | 112 | # Repeat after shuffling, to prevent separate epochs from blending together. 113 | dataset = dataset.repeat(num_epochs) 114 | return dataset 115 | -------------------------------------------------------------------------------- /util/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import sys 4 | 5 | 6 | def create_logger(): 7 | log = logging.getLogger() 8 | log.setLevel(logging.INFO) 9 | 10 | ch = logging.StreamHandler(sys.stdout) 11 | ch.setLevel(logging.INFO) 12 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 13 | ch.setFormatter(formatter) 14 | log.addHandler(ch) 15 | return log 16 | -------------------------------------------------------------------------------- /util/tfutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | 5 | def create_tf_example(row, int_columns, float_columns, label_column): 6 | features = {} 7 | 8 | for feat_name in int_columns: 9 | features[feat_name] = tf.train.Feature( 10 | int64_list=tf.train.Int64List(value=[row[feat_name].astype(dtype=int)]) 11 | ) 12 | 13 | for feat_name in float_columns: 14 | features[feat_name] = tf.train.Feature( 15 | float_list=tf.train.FloatList(value=[row[feat_name]]) 16 | ) 17 | 18 | features[label_column] = tf.train.Feature(float_list=tf.train.FloatList(value=[row[feat_name]])) 19 | 20 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 21 | return tf_example 22 | 23 | 24 | def write_tfrecords(df, path): 25 | with tf.python_io.TFRecordWriter(path) as writer: 26 | for index, row in df.iterrows(): 27 | tf_example = create_tf_example(row) 28 | writer.write(tf_example.SerializeToString()) 29 | --------------------------------------------------------------------------------