├── .gitignore
├── README.md
├── autorecsys
├── __init__.py
├── auto_search.py
├── pipeline
│ ├── __init__.py
│ ├── base.py
│ ├── graph.py
│ ├── interactor.py
│ ├── mapper.py
│ ├── node.py
│ ├── optimizer.py
│ ├── preprocessor.py
│ └── utils.py
├── recommender.py
├── searcher
│ ├── __init__.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── hyperparameters.py
│ │ ├── oracle.py
│ │ ├── trial.py
│ │ └── utils.py
│ └── tuners
│ │ ├── __init__.py
│ │ ├── bayesian.py
│ │ ├── greedy.py
│ │ ├── randomsearch.py
│ │ └── tuner.py
└── utils
│ ├── __init__.py
│ ├── common.py
│ ├── display.py
│ └── metric.py
├── docs
├── autogen.py
├── index.md
├── mkdocs.yml
├── readme.md
├── requirements.txt
├── templates
│ ├── about.md
│ ├── benchmark.md
│ ├── index.md
│ └── install.md
└── tutobooks.py
├── examples
├── README.md
├── ctr_autoint.py
├── ctr_autorec.py
├── ctr_benchmark.py
├── ctr_crossnet.py
├── ctr_deepfm.py
├── ctr_dlrm.py
├── ctr_neumf.py
├── example_datasets
│ ├── avazu
│ │ └── train-10k
│ ├── criteo
│ │ └── train-10k.txt
│ ├── movielens
│ │ └── ratings-10k.dat
│ └── netflix
│ │ └── combined_data_1-10k.txt
├── rp_autorec.py
├── rp_benchmark.py
├── rp_mf.py
└── rp_neumf.py
├── mkdocs.yml
├── requirements.txt
├── setup.py
└── tests
├── __init__.py
├── common.py
├── integration_tests.py
├── integration_tests
└── test_models.py
├── pipeline_tests
├── __init__.py
├── test_graph.py
├── test_interactor.py
├── test_mapper.py
├── test_node.py
├── test_optimizer.py
├── test_preprocessor.py
└── test_utils.py
├── searcher_tests
├── __init__.py
├── core_tests
│ ├── __init__.py
│ ├── test_hyperparameters.py
│ ├── test_oracle.py
│ ├── test_trial.py
│ └── test_tuner.py
└── searchers_test.py
└── utils_test
├── __init__.py
├── test.csv
└── test_common.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /tests/datasets/*
2 | /examples/datasets/*
3 | .idea/*
4 | .DS_Store
5 | *.pyc
6 | /tests/tmp_autokaggle-tmp-3/
7 | /examples/old/
8 | __pycache__/
9 | /tests/config/
10 | /.pytest_cache/
11 | ._*
12 | test.py
13 | *.sh
14 | *.log.txt
15 | *.log
16 | _rp_benchmark_latest.py
17 | _rp_benchmark_10m.py
18 | search_1/
19 | 
20 | # MKdocs
21 | /docs/sources
22 | /docs/site
23 | 
24 | # Local test scripts
25 | ctr_benchmark-gpu*.py
26 | examples/netflix.py
27 | examples/ctr_deepfm_test_criteo.py
28 | examples/ctr_deepfm_test_avazu.py
29 | examples/ctr_test_criteo.py
30 | 
31 | # Others
32 | autorecsys/utils/config.py
33 | examples/example_datasets/netflix/combined_data_1-10k.csv

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoRec
2 | 
3 | 
4 | AutoRec is a Keras-based implementation of automated recommendation algorithms for both the rating prediction and the click-through rate (CTR) prediction tasks.
5 | 
6 | 
7 | For more details, see the [Documentation](http://autorec.ai).
8 | 
9 | 
10 | ## Installation
11 | Install from `pip`:
12 | ```
13 | pip install autorec
14 | ```
15 | 
16 | 
17 | ## Quickstart
18 | Building a rating prediction model that automatically searches the model architecture on the MovieLens dataset is as easy as the following:
19 | ```python
20 | # -*- coding: utf-8 -*-
21 | import tensorflow as tf
22 | from autorecsys.auto_search import Search
23 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, ElementwiseInteraction
24 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor, NetflixPrizePreprocessor
25 | from autorecsys.recommender import RPRecommender
26 | 
27 | # load dataset
28 | # MovieLens 1M dataset
29 | data = MovielensPreprocessor("./examples/datasets/ml-1m/ratings.dat")
30 | data.preprocessing(val_test_size=0.1, random_state=1314)
31 | train_X, train_y = data.train_X, data.train_y
32 | val_X, val_y = data.val_X, data.val_y
33 | test_X, test_y = data.test_X, data.test_y
34 | user_num, item_num = data.user_num, data.item_num
35 | 
36 | # build the pipeline.
37 | input = Input(shape=[2])
38 | user_emb = LatentFactorMapper(column_id=0,
39 |                               num_of_entities=user_num,
40 |                               embedding_dim=64)(input)
41 | item_emb = LatentFactorMapper(column_id=1,
42 |                               num_of_entities=item_num,
43 |                               embedding_dim=64)(input)
44 | output = ElementwiseInteraction(elementwise_type="innerproduct")([user_emb, item_emb])
45 | output = RatingPredictionOptimizer()(output)
46 | model = RPRecommender(inputs=input, outputs=output)
47 | 
48 | # AutoML search and predict
49 | searcher = Search(model=model,
50 |                   tuner='greedy',  # random, greedy, bayesian
51 |                   tuner_params={"max_trials": 5}
52 |                   )
53 | 
54 | searcher.search(x=train_X,
55 |                 y=train_y,
56 |                 x_val=val_X,
57 |                 y_val=val_y,
58 |                 objective='val_mse',
59 |                 batch_size=1024,
60 |                 epochs=10,
61 |                 callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
62 | ```
63 | 
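After the search finishes, the best model found can be used directly through the ``Search`` object, which also exposes `predict()` and `evaluate()`. A minimal sketch, continuing from the variables above:
```python
# Predict ratings for the held-out test set with the best searched model.
y_pred = searcher.predict(x=test_X)

# Score the predictions with the metric behind the search objective (here, MSE).
print(searcher.evaluate(x=test_X, y_true=test_y))
```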
--------------------------------------------------------------------------------
/autorecsys/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/__init__.py

--------------------------------------------------------------------------------
/autorecsys/auto_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | 
3 | import os
4 | import logging
5 | import tempfile
6 | import tensorflow as tf
7 | 
8 | from autorecsys.utils.common import to_snake_case, create_directory, load_dataframe_input
9 | from autorecsys.searcher.tuners.tuner import METRIC, PipeTuner
10 | from autorecsys.searcher import tuners
11 | from autorecsys.recommender import CTRRecommender, RPRecommender
12 | 
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
14 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | class Search(object):
19 |     """ A search object to search on a Recommender HyperModel (CTRRecommender/RPRecommender)
20 |     defined by inputs and outputs.
21 | 
22 |     ``Search`` combines a Recommender and a Tuner to tune the Recommender. The user can
23 |     use ``search()`` to perform the search, and can then use the best discovered model
24 |     much like a Keras model, as it also has `fit()`/`predict()`/`evaluate()` methods.
25 |     The user should pass a Recommender HyperModel (CTRRecommender/RPRecommender) and a
26 |     selected tuning method to initialize the ``Search`` object, and pass the dataset when
27 |     calling the ``search`` method to discover the best architecture.
28 | 
29 |     # Arguments
30 |         model: A Recommender HyperModel (CTRRecommender/RPRecommender).
31 |         name: String. The name of the project, which is used for saving and loading purposes.
32 |         tuner: String. The name of the tuner. It should be one of 'greedy', 'bayesian' or
33 |             'random'. Defaults to 'random'.
34 | 
35 | 
36 |         tuner_params: Dict. The hyperparameters of the tuner. The common ones are:
37 |             'max_trials': Int. Specify the number of search epochs.
38 |             'overwrite': Boolean. Whether we want to overwrite an existing
39 |                 tuner or not.
40 | 
41 |         directory: String. The path to a directory for storing the search outputs.
42 |             Defaults to '.', which creates a folder with the name of the
43 |             project in the current directory, i.e., ``directory/name``.
44 |         overwrite: Boolean. Defaults to `True`. Whether we want to overwrite an existing
45 |             project with the name defined as ``directory/name`` or not.
46 |     """
47 |     def __init__(self, model=None, name=None, tuner='random', tuner_params=None, directory='.', overwrite=True):
48 |         self.pipe = model
49 |         self.tuner = tuner
50 |         self.tuner_params = tuner_params
51 |         if not name:
52 |             prefix = self.__class__.__name__
53 |             name = prefix + '_' + str(tf.keras.backend.get_uid(prefix))
54 |             name = to_snake_case(name)
55 |         self.name = name
56 |         directory = directory or tempfile.gettempdir()
57 |         self.dir = os.path.join(directory, self.name)
58 | 
59 |         self.overwrite = overwrite
60 |         create_directory(self.dir, remove_existing=overwrite)
61 |         self.logger = logging.getLogger(self.name)
62 |         self.logger.info('Project directory: {}'.format(self.dir))
63 |         self.best_keras_graph = None
64 |         self.best_model = None
65 |         self.need_fully_train = False
66 | 
67 |     def search(self, x=None, y=None, x_val=None, y_val=None, objective='mse', **fit_kwargs):
68 |         """Search the best deep recommendation model.
69 | 
70 |         # Arguments
71 |             x: numpy array. Training features.
72 |             y: numpy array. Training targets.
73 |             x_val: numpy array. Validation features.
74 |             y_val: numpy array. Validation targets.
75 |             objective: String. Name of the model metric to minimize or maximize,
76 |                 e.g. 'val_BinaryCrossentropy'. Defaults to 'mse'.
77 |             **fit_kwargs: Any arguments supported by the fit method of a Keras model, such as:
78 |                 ``batch_size``, ``epochs``, ``callbacks``.
79 |         """
80 | 
81 |         # overwrite the objective
82 |         self.objective = objective
83 |         tuner = self._build_tuner(self.tuner, self.tuner_params)
84 | 
85 |         # TODO search on a small piece of train data, currently it uses whole train data
86 |         tuner.search(x=x, y=y, x_val=x_val, y_val=y_val, **fit_kwargs)
87 |         # show the search space
88 |         tuner.search_space_summary()
89 |         # show the search results
90 |         tuner.results_summary()
91 |         best_pipe_lists = tuner.get_best_models(1)
92 |         # len(best_pipe_lists) == 0 means that this pipeline does not have tunable parameters
93 |         self.best_model = best_pipe_lists[0]
94 |         return self.best_model
95 | 
96 |     def _build_tuner(self, tuner, tuner_params):
97 |         """Build a tuner based on its name and hyperparameters.
98 | 
99 |         # Arguments
100 |             tuner: String. The name of the tuner. It should be one of 'greedy', 'bayesian' or
101 |                 'random'. Defaults to 'random'.
102 | 
103 |             tuner_params: Dict. The hyperparameters of the tuner. The common ones are:
104 |                 'max_trials': Int. Specify the number of search epochs.
105 |                 'overwrite': Boolean. Whether we want to overwrite an existing
106 |                     tuner or not.
107 |         """
108 |         tuner_cls = tuners.get_tuner_class(tuner)
109 |         hps = self.pipe.get_hyperparameters()
110 |         tuner = tuner_cls(hypergraph=self.pipe,
111 |                           objective=self.objective,
112 |                           hyperparameters=hps,
113 |                           directory=self.dir,
114 |                           **tuner_params)
115 |         return tuner
116 | 
117 |     def predict(self, x):
118 |         """Use the best searched model to conduct prediction on the dataset x.
119 | 
120 |         # Arguments
121 |             x: numpy array / data frame / string path of a csv file.
122 |                 Features used to do the prediction.
123 |         """
124 |         if isinstance(self.pipe, RPRecommender):
125 |             x = load_dataframe_input(x)
126 |         return self.best_model.predict(x)
127 | 
128 |     def evaluate(self, x, y_true):
129 |         """Evaluate the best searched model.
130 | 
131 |         # Arguments
132 |             x: numpy array / data frame / string path of a csv file.
133 |                 Features used to do the prediction.
134 |             y_true: numpy array / data frame / string path of a csv file.
135 |                 Ground-truth labels.
136 |         """
137 |         y_pred = self.predict(x)
138 |         score_func = METRIC[self.objective.split('_')[-1]]
139 |         y_true = load_dataframe_input(y_true)
140 |         y_true = y_true.values.reshape(-1, 1)
141 |         self.logger.info(f'evaluate prediction results using {self.objective}')
142 |         return score_func(y_true, y_pred)
143 | 

--------------------------------------------------------------------------------
/autorecsys/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from autorecsys.pipeline.mapper import LatentFactorMapper, DenseFeatureMapper, SparseFeatureMapper
2 | from autorecsys.pipeline.interactor import MLPInteraction, ConcatenateInteraction, FMInteraction,\
3 |     ElementwiseInteraction, CrossNetInteraction, SelfAttentionInteraction, HyperInteraction, InnerProductInteraction
4 | from autorecsys.pipeline.optimizer import RatingPredictionOptimizer, CTRPredictionOptimizer
5 | from autorecsys.pipeline.node import Input, StructuredDataInput
6 | 
--------------------------------------------------------------------------------
/autorecsys/pipeline/base.py:
--------------------------------------------------------------------------------
1 | import types
2 | import tensorflow as tf
3 | from autorecsys.searcher.core import hyperparameters as hp_module
4 | from autorecsys.searcher.core.trial import Stateful
5 | from autorecsys.utils.common import to_snake_case
6 | from tensorflow.python.util import nest
7 | 
8 | 
9 | class Node(Stateful):
10 |     """The nodes in a network connecting the blocks."""
11 | 
12 |     def __init__(self, shape=None):
13 |         super().__init__()
14 |         self.in_blocks = []
15 |         self.out_blocks = []
16 |         self.shape = shape
17 | 
18 |     def add_in_block(self, hypermodel):
19 |         self.in_blocks.append(hypermodel)
20 | 
21 |     def add_out_block(self, hypermodel):
22 |         self.out_blocks.append(hypermodel)
23 | 
24 |     def build(self):
25 |         return tf.keras.Input(shape=self.shape)
26 | 
27 |     def get_state(self):
28 |         return {'shape': self.shape}
29 | 
30 |     def set_state(self, state):
31 |         self.shape = state['shape']
32 | 
33 | 
34 | class HyperModel(object):
35 |     """Defines a searchable space of Models and builds Models from this space.
36 |     # Attributes:
37 |         name: The name of this HyperModel.
38 |         tunable: Whether the hyperparameters defined in this hypermodel
39 |             should be added to the search space. If `False`, either the search
40 |             space for these parameters must be defined in advance, or the
41 |             default values will be used.
42 |     """
43 | 
44 |     def __init__(self, name=None, tunable=True):
45 |         self.name = name
46 |         self.tunable = tunable
47 | 
48 |         self._build = self.build
49 |         self.build = self._build_wrapper
50 | 
51 |     def build(self, hp):
52 |         """Builds a model.
53 |         # Arguments:
54 |             hp: A `HyperParameters` instance.
55 |         # Returns:
56 |             A model instance.
57 |         """
58 |         raise NotImplementedError
59 | 
60 |     def _build_wrapper(self, hp, *args, **kwargs):
61 |         if not self.tunable:
62 |             # Copy `HyperParameters` object so that new entries are not added
63 |             # to the search space.
64 |             hp = hp.copy()
65 |         return self._build(hp, *args, **kwargs)
66 | 
67 | 
68 | class Block(HyperModel, Stateful):
69 |     def __init__(self, name=None, **kwargs):
70 |         super().__init__(**kwargs)
71 |         self.fixed_params = None
72 |         self.tunable_candidates = None
73 |         if not name:
74 |             prefix = self.__class__.__name__
75 |             name = prefix + '_' + str(tf.keras.backend.get_uid(prefix))
76 |             name = to_snake_case(name)
77 |         self._hyperparameters = None
78 |         self.name = name
79 |         self.inputs = None
80 |         self.outputs = None
81 |         self._num_output_node = 1
82 | 
83 |     def __new__(cls, *args, **kwargs):
84 |         obj = super().__new__(cls)
85 |         build_fn = obj.build
86 | 
87 |         def build_wrapper(obj, hp, *args, **kwargs):
88 |             with hp.name_scope(obj.name):
89 |                 return build_fn(hp, *args, **kwargs)
90 | 
91 |         obj.build = types.MethodType(build_wrapper, obj)
92 |         return obj
93 | 
94 |     def __str__(self):
95 |         return self.name
96 | 
97 |     @property
98 |     def hyperparameters(self):
99 |         return self._hyperparameters
100 | 
101 |     def __call__(self, inputs):
102 |         """Functional API.
103 |         # Arguments
104 |             inputs: A list of input node(s) or a single input node for the block.
105 |         # Returns
106 |             list: A list of output node(s) of the Block.
107 |         """
108 |         inputs = nest.flatten(inputs)
109 |         self.inputs = inputs
110 |         for input_node in self.inputs:
111 |             if not isinstance(input_node, Node):
112 |                 raise TypeError('Expect the inputs to layer {name} to be '
113 |                                 'a Node, but got {type}.'.format(
114 |                                     name=self.name,
115 |                                     type=type(input_node)))
116 |             input_node.add_out_block(self)
117 |         self.outputs = []
118 |         for _ in range(self._num_output_node):
119 |             output_node = Node()
120 |             output_node.add_in_block(self)
121 |             self.outputs.append(output_node)
122 |         return self.outputs
123 | 
124 |     def get_state(self):
125 |         """Get the configuration of the block.
126 |         # Returns
127 |             A dictionary of configurations of the block.
128 |         """
129 |         return {'name': self.name}
130 | 
131 |     def set_state(self, state):
132 |         """Set the configuration of the block.
133 |         # Arguments
134 |             state: A dictionary of the configurations of the block.
135 |         """
136 |         if 'name' in state:
137 |             self.name = state['name']
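# A minimal sketch of a custom block following the pattern above (the class name
# `DenseBlock` and its hyperparameter are illustrative, not part of the library):
# a subclass declares its search space inside `build` through the `hp` argument
# and is connected to other blocks with the functional `__call__` API.
#
#     class DenseBlock(Block):
#         def build(self, hp, inputs=None):
#             units = hp.Choice('units', [16, 32, 64], default=32)
#             return tf.keras.layers.Dense(units)(nest.flatten(inputs)[0])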
138 | 
139 | 
140 | class HyperBlock(Block):
141 |     """HyperBlock uses hyperparameters to decide the inner Block graph.
142 |     A HyperBlock should be built into connected Blocks instead of individual Keras
143 |     layers. The main purpose of creating the HyperBlock class is for the ease of
144 |     parsing the graph for preprocessors. The graph would be hard to parse if a Block,
145 |     whose inner structure is decided by hyperparameters dynamically, contains both
146 |     preprocessors and Keras layers.
147 |     When the preprocessing layers of Keras are ready to cover all the preprocessors
148 |     in AutoKeras, the preprocessors should be handled by the Keras Model. The
149 |     HyperBlock class should be removed. The subclasses should extend the Block class
150 |     directly and the build function should build connected Keras layers instead of
151 |     Blocks.
152 |     # Arguments
153 |         output_shape: Tuple of int(s). Defaults to None. If None, the output shape
154 |             will be inferred from the AutoModel.
155 |         name: String. The name of the block. If unspecified, it will be set
156 |             automatically with the class name.
157 |     """
158 | 
159 |     def __init__(self, output_shape=None, **kwargs):
160 |         super().__init__(**kwargs)
161 |         self.output_shape = output_shape
162 | 
163 |     def build(self, hp, inputs=None):
164 |         """Build the HyperModel instead of the Keras Model.
165 |         # Arguments
166 |             hp: HyperParameters. The hyperparameters for building the model.
167 |             inputs: A list of instances of Node.
168 |         # Returns
169 |             A Node instance, the output node of the output Block.
170 |         """
171 |         raise NotImplementedError
172 | 
173 | 
174 | class Preprocessor(Block):
175 |     """Hyper preprocessing block base class.
176 |     It extends Block which extends Hypermodel. A preprocessor is a Hypermodel, which
177 |     means it is a search space. However, different from other Hypermodels, it is
178 |     also a model which can be fit.
179 |     """
180 | 
181 |     def build(self, hp):
182 |         """Get the values of the required HyperParameters.
183 |         It does not build and return a Keras Model, but initializes the
184 |         HyperParameters for the preprocessor to be fit.
185 |         """
186 |         pass
187 | 
188 |     def update(self, x, y=None):
189 |         """Incrementally fit the preprocessor with a single training instance.
190 |         # Arguments
191 |             x: EagerTensor. A single instance in the training dataset.
192 |             y: EagerTensor. The targets of the tasks. Defaults to None.
193 |         """
194 |         raise NotImplementedError
195 | 
196 |     def transform(self, x, fit=False):
197 |         """Transform a single instance with the fitted preprocessor.
198 |         # Arguments
199 |             x: EagerTensor. A single instance in the training dataset.
200 |             fit: Boolean. Whether it is in fit mode.
201 |         Returns:
202 |             A transformed instance which can be converted to a tf.Tensor.
203 |         """
204 |         raise NotImplementedError
205 | 
206 |     def output_types(self):
207 |         """The output types of the transformed data, e.g. tf.int64.
208 |         The output types are required by tf.py_function, which is used for transforming
209 |         the dataset into a new one with a map function.
210 |         # Returns
211 |             A tuple of data types.
212 |         """
213 |         raise NotImplementedError
214 | 
215 |     @property
216 |     def output_shape(self):
217 |         """The output shape of the transformed data.
218 |         The output shape is needed to build the Keras Model from the AutoModel.
219 |         The output shape of the preprocessor is the input shape of the Keras Model.
220 |         # Returns
221 |             A tuple of int(s) or a TensorShape.
222 |         """
223 |         raise NotImplementedError
224 | 
225 |     def finalize(self):
226 |         """Training process of the preprocessor after update with all instances."""
227 |         pass
228 | 
229 |     def get_config(self):
230 |         """Get the configuration of the preprocessor.
231 |         # Returns
232 |             A dictionary of configurations of the preprocessor.
233 |         """
234 |         return {}
235 | 
236 |     def set_config(self, config):
237 |         """Set the configuration of the preprocessor.
238 |         # Arguments
239 |             config: A dictionary of the configurations of the preprocessor.
240 | """ 241 | pass 242 | 243 | def get_weights(self): 244 | """Get the trained weights of the preprocessor. 245 | # Returns 246 | A dictionary of trained weights of the preprocessor. 247 | """ 248 | return {} 249 | 250 | def set_weights(self, weights): 251 | """Set the trained weights of the preprocessor. 252 | # Arguments 253 | weights: A dictionary of trained weights of the preprocessor. 254 | """ 255 | pass 256 | 257 | def get_state(self): 258 | state = super().get_state() 259 | state.update(self.get_config()) 260 | return {'config': state, 261 | 'weights': self.get_weights()} 262 | 263 | def set_state(self, state): 264 | self.set_config(state['config']) 265 | super().set_state(state['config']) 266 | self.set_weights(state['weights']) 267 | -------------------------------------------------------------------------------- /autorecsys/pipeline/mapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import tensorflow as tf 4 | from autorecsys.pipeline.base import Block 5 | 6 | 7 | class LatentFactorMapper(Block): 8 | """ This module maps the user (item) entity into embeddings (latent factors). 9 | 10 | # Note 11 | Data-wise, the similarity b/t class LatentFactorMapper and class SparseFeatureMapper is that both user (item) 12 | identifiers and indexed categorical data are sparse and devoid of numerical meaning. 13 | Functionally, the difference b/t class LatentFactorMapper and class SparseFeatureMapper is that they handle one 14 | sparse column (either user or item) and multiple sparse columns (categorical features), respectively. 15 | In terms of nomenclature, the difference b/t class LatentFactorMapper and class SparseFeatureMapper is to 16 | distinguish the host of features (user and item) from the features themselves. 17 | The use of the term "latent factor" can be traced back to early matrix factorization models for recommendation, 18 | which involve only user and item. 19 | Reference: https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf 20 | 21 | # Arguments 22 | column_id (int): The index of the user (item) entity column. 23 | num_of_entities (int): The number of the user (item) entity. 24 | embedding_dim (int): The dimension of the embeddings (latent factors). 25 | 26 | # Attributes 27 | column_id (int): The index of the user (item) entity column. 28 | num_of_entities (int): The number of the user (item) entities. 29 | embedding_dim (int): The dimension of the embeddings (latent factors). 
30 | """ 31 | 32 | def __init__(self, 33 | column_id=None, 34 | num_of_entities=None, 35 | embedding_dim=None, 36 | **kwargs): 37 | super().__init__(**kwargs) 38 | self.column_id = column_id 39 | self.num_of_entities = num_of_entities 40 | self.embedding_dim = embedding_dim 41 | 42 | def get_state(self): 43 | state = super().get_state() 44 | state.update({ 45 | 'column_id': self.column_id, 46 | 'num_of_entities': self.num_of_entities, 47 | 'embedding_dim': self.embedding_dim}) 48 | return state 49 | 50 | def set_state(self, state): 51 | super().set_state(state) 52 | self.column_id = state['column_id'] 53 | self.num_of_entities = state['num_of_entities'] 54 | self.embedding_dim = state['embedding_dim'] 55 | 56 | def build(self, hp, inputs=None): 57 | input_node = inputs 58 | num_of_entities = self.num_of_entities or hp.Choice('num_of_entities', [10000], default=10000) 59 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16, 32, 64, 128], default=32) 60 | output_node = tf.keras.layers.Embedding(num_of_entities, embedding_dim)(input_node[0][:, self.column_id]) 61 | return output_node 62 | 63 | 64 | class SparseFeatureMapper(Block): 65 | """ This module maps the categorical data of sparse feature columns into embeddings. 66 | 67 | # Arguments 68 | num_of_fields (int): The number of sparse feature columns (fields). 69 | hash_size (list): The numbers of categories used in each sparse feature column. 70 | embedding_dim (int): The dimension of the embeddings. 71 | 72 | # Attributes 73 | num_of_fields (int): The number of sparse feature columns (fields). 74 | hash_size (list): The list of numbers of categories used in each sparse feature column. 75 | embedding_dim (int): The dimension of the embeddings. 76 | """ 77 | 78 | def __init__(self, 79 | num_of_fields=None, 80 | hash_size=None, 81 | embedding_dim=None, 82 | **kwargs): 83 | super().__init__(**kwargs) 84 | self.num_of_fields = num_of_fields 85 | self.hash_size = hash_size 86 | self.embedding_dim = embedding_dim 87 | 88 | def get_state(self): 89 | """ Get information about the mapper layer, including name, level, and hyperparameters. 90 | 91 | # Returns 92 | Dictionary where key=attribute name and val=attribute value. 93 | """ 94 | state = super().get_state() 95 | state.update({ 96 | 'num_of_fields': self.num_of_fields, 97 | 'hash_size': self.hash_size, 98 | 'embedding_dim': self.embedding_dim}) 99 | return state 100 | 101 | def set_state(self, state): 102 | """ Set information about the mapper layer, including name, level, and hyperparameters. 103 | 104 | # Arguments 105 | state (dict): Map attribute names to attribute values. 106 | """ 107 | super().set_state(state) 108 | self.num_of_fields = state['num_of_fields'] 109 | self.hash_size = state['hash_size'] 110 | self.embedding_dim = state['embedding_dim'] 111 | 112 | def build(self, hp, inputs=None): 113 | """ Build the mapper layer. 114 | 115 | Note: 116 | Attribute "hash_size" has search space [10000]. Default is 10000. 117 | Attribute "embedding_dim" has search space [8, 16]. Default is 8. 118 | 119 | # Arguments 120 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters. 121 | inputs (Tensor): List of batch input tensors. 122 | 123 | # Returns 124 | The defined mapper block. 
125 | """ 126 | input_node = inputs 127 | # TODO: modify default hash_size, current version is wrong when category of a feature is more than 10000 128 | hash_size = self.hash_size or [hp.Choice('hash_size', [10000], default=10000) 129 | for _ in range(self.num_of_fields)] 130 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16], default=8) 131 | output_node = tf.stack( 132 | [ 133 | tf.keras.layers.Embedding(hash_size[col_id], embedding_dim)(input_node[0][:, col_id]) 134 | for col_id in range(self.num_of_fields) 135 | ], 136 | axis=1 137 | ) 138 | return output_node 139 | 140 | 141 | class DenseFeatureMapper(Block): 142 | """ This module maps the numerical data of dense feature columns into embeddings. 143 | 144 | # Arguments 145 | num_of_fields (int): The number of dense feature columns. 146 | embedding_dim (int): The dimension of the embeddings. 147 | 148 | # Attributes 149 | num_of_fields (int): The number of dense feature columns. 150 | embedding_dim (int): The dimension of the embeddings. 151 | """ 152 | 153 | def __init__(self, 154 | num_of_fields=None, 155 | embedding_dim=None, 156 | **kwargs): 157 | super().__init__(**kwargs) 158 | self.num_of_fields = num_of_fields 159 | self.embedding_dim = embedding_dim 160 | 161 | def get_state(self): 162 | """ Get information about the mapper layer, including name, level, and hyperparameters. 163 | 164 | # Returns 165 | Dictionary where key=attribute name and val=attribute value. 166 | """ 167 | state = super().get_state() 168 | state.update({ 169 | 'num_of_fields': self.num_of_fields, 170 | 'embedding_dim': self.embedding_dim}) 171 | return state 172 | 173 | def set_state(self, state): 174 | """ Set information about the mapper layer, including name, level, and hyperparameters. 175 | 176 | # Arguments 177 | state (dict): Map attribute names to attribute values. 178 | """ 179 | super().set_state(state) 180 | self.num_of_fields = state['num_of_fields'] 181 | self.embedding_dim = state['embedding_dim'] 182 | 183 | def build(self, hp, inputs=None): 184 | """ Build the mapper layer. 185 | 186 | Note: 187 | Attribute "embedding_dim" has search space [8, 16, 32]. Default is 8. 188 | 189 | # Arguments 190 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters. 191 | inputs (Tensor): List of batch input tensors. 192 | 193 | # Returns 194 | The defined mapper block. 195 | """ 196 | input_node = inputs 197 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16], default=8) 198 | output_node = tf.stack( 199 | [ 200 | tf.tensordot(input_node[0][:, col_id], tf.keras.layers.Embedding(1, embedding_dim)(0), axes=0) 201 | for col_id in range(self.num_of_fields) 202 | ], 203 | axis=1 204 | ) 205 | return output_node 206 | 207 | -------------------------------------------------------------------------------- /autorecsys/pipeline/node.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from tensorflow.python.util import nest 5 | 6 | from autorecsys.utils.common import dataset_shape 7 | from autorecsys.pipeline import base 8 | 9 | 10 | 11 | class Input(base.Node): 12 | """Input node for tensor data. 13 | The data should be numpy.ndarray or tf.data.Dataset. 
14 | """ 15 | 16 | def _check(self, x): 17 | """Record any information needed by transform.""" 18 | if not isinstance(x, (np.ndarray, tf.data.Dataset)): 19 | raise TypeError('Expect the data to Input to be numpy.ndarray or ' 20 | 'tf.data.Dataset, but got {type}.'.format(type=type(x))) 21 | if isinstance(x, np.ndarray) and not np.issubdtype(x.dtype, np.number): 22 | raise TypeError('Expect the data to Input to be numerical, but got ' 23 | '{type}.'.format(type=x.dtype)) 24 | 25 | def _convert_to_dataset(self, x): 26 | if isinstance(x, tf.data.Dataset): 27 | return x 28 | if isinstance(x, np.ndarray): 29 | x = x.astype(np.float32) 30 | return tf.data.Dataset.from_tensor_slices(x) 31 | 32 | def _record_dataset_shape(self, dataset): 33 | self.shape = dataset_shape(dataset) 34 | 35 | def fit_transform(self, x): 36 | dataset = self.transform(x) 37 | self._record_dataset_shape(dataset) 38 | return dataset 39 | 40 | def transform(self, x): 41 | """Transform x into a compatible type (tf.data.Dataset).""" 42 | self._check(x) 43 | dataset = self._convert_to_dataset(x) 44 | return dataset 45 | 46 | 47 | class StructuredDataInput(Input): 48 | """Input node for structured data. 49 | The input data should be numpy.ndarray, pandas.DataFrame or tensorflow.Dataset. 50 | # Arguments 51 | column_names: A list of strings specifying the names of the columns. The 52 | length of the list should be equal to the number of columns of the data. 53 | Defaults to None. If None, it will obtained from the header of the csv 54 | file or the pandas.DataFrame. 55 | column_types: Dict. The keys are the column names. The values should either 56 | be 'numerical' or 'categorical', indicating the type of that column. 57 | Defaults to None. If not None, the column_names need to be specified. 58 | If None, it will be inferred from the data. A column will be judged as 59 | categorical if the number of different values is less than 5% of the 60 | number of instances. 61 | """ 62 | 63 | def __init__(self, column_names=None, column_types=None, **kwargs): 64 | super().__init__(**kwargs) 65 | self.column_names = column_names 66 | self.column_types = column_types 67 | # Variables for inferring column types. 68 | self.count_nan = None 69 | self.count_numerical = None 70 | self.count_categorical = None 71 | self.count_unique_numerical = [] 72 | self.num_col = None 73 | 74 | def get_state(self): 75 | state = super().get_state() 76 | state.update({ 77 | 'column_names': self.column_names, 78 | 'column_types': self.column_types, 79 | 'count_nan': self.count_nan, 80 | 'count_numerical': self.count_numerical, 81 | 'count_categorical': self.count_categorical, 82 | 'count_unique_numerical': self.count_unique_numerical, 83 | 'num_col': self.num_col 84 | }) 85 | return state 86 | 87 | def set_state(self, state): 88 | super().set_state(state) 89 | self.column_names = state['column_names'] 90 | self.column_types = state['column_types'] 91 | self.count_nan = state['count_nan'] 92 | self.count_numerical = state['count_numerical'] 93 | self.count_categorical = state['count_categorical'] 94 | self.count_unique_numerical = state['count_unique_numerical'] 95 | self.num_col = state['num_col'] 96 | 97 | def _check(self, x): 98 | if not isinstance(x, (pd.DataFrame, np.ndarray)): 99 | raise TypeError('Unsupported type {type} for ' 100 | '{name}.'.format(type=type(x), 101 | name=self.__class__.__name__)) 102 | 103 | # Extract column_names from pd.DataFrame. 
104 | if isinstance(x, pd.DataFrame) and self.column_names is None: 105 | self.column_names = list(x.columns) 106 | # column_types is provided by user 107 | if self.column_types: 108 | for column_name in self.column_types: 109 | if column_name not in self.column_names: 110 | raise ValueError('Column_names and column_types are ' 111 | 'mismatched. Cannot find column name ' 112 | '{name} in the data.'.format( 113 | name=column_name)) 114 | 115 | # Generate column_names. 116 | if self.column_names is None: 117 | if self.column_types: 118 | raise ValueError('Column names must be specified.') 119 | self.column_names = [index for index in range(x.shape[1])] 120 | 121 | # Check if column_names has the correct length. 122 | if len(self.column_names) != x.shape[1]: 123 | raise ValueError('Expect column_names to have length {expect} ' 124 | 'but got {actual}.'.format( 125 | expect=x.shape[1], 126 | actual=len(self.column_names))) 127 | 128 | def _convert_to_dataset(self, x): 129 | if isinstance(x, pd.DataFrame): 130 | # Convert x, y, validation_data to tf.Dataset. 131 | x = tf.data.Dataset.from_tensor_slices( 132 | x.values.astype(np.unicode)) 133 | if isinstance(x, np.ndarray): 134 | x = tf.data.Dataset.from_tensor_slices(x.astype(np.unicode)) 135 | dataset = super()._convert_to_dataset(x) 136 | for x in dataset: 137 | self.update(x) 138 | self.infer_column_types() 139 | return dataset 140 | 141 | def update(self, x): 142 | # Calculate the statistics. 143 | x = nest.flatten(x)[0].numpy() 144 | if self.num_col is None: 145 | self.num_col = len(x) 146 | self.count_nan = np.zeros(self.num_col) 147 | self.count_numerical = np.zeros(self.num_col) 148 | self.count_categorical = np.zeros(self.num_col) 149 | for i in range(len(x)): 150 | self.count_unique_numerical.append({}) 151 | for i in range(self.num_col): 152 | x[i] = x[i].decode('utf-8') 153 | if x[i] == 'nan': 154 | self.count_nan[i] += 1 155 | elif x[i] == 'True': 156 | self.count_categorical[i] += 1 157 | elif x[i] == 'False': 158 | self.count_categorical[i] += 1 159 | else: 160 | try: 161 | tmp_num = float(x[i]) 162 | self.count_numerical[i] += 1 163 | if tmp_num not in self.count_unique_numerical[i]: 164 | self.count_unique_numerical[i][tmp_num] = 1 165 | else: 166 | self.count_unique_numerical[i][tmp_num] += 1 167 | except ValueError: 168 | self.count_categorical[i] += 1 169 | 170 | def infer_column_types(self): 171 | column_types = {} 172 | for i in range(self.num_col): 173 | if self.count_categorical[i] > 0: 174 | column_types[self.column_names[i]] = 'categorical' 175 | elif len(self.count_unique_numerical[i])/self.count_numerical[i] < 0.05: 176 | column_types[self.column_names[i]] = 'categorical' 177 | else: 178 | column_types[self.column_names[i]] = 'numerical' 179 | # Partial column_types is provided. 
180 |         if self.column_types is None:
181 |             self.column_types = {}
182 |         for key, value in column_types.items():
183 |             if key not in self.column_types:
184 |                 self.column_types[key] = value

--------------------------------------------------------------------------------
/autorecsys/pipeline/optimizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | 
3 | import tensorflow as tf
4 | from autorecsys.pipeline.base import Block
5 | 
6 | 
7 | class RatingPredictionOptimizer(Block):
8 |     """ For the rating prediction task, this module employs the default 'linear' activation function and the 'mse' (mean
9 |     square error) loss and metric for training and evaluation.
10 | 
11 |     # Note
12 |         This module takes a list containing a single tensor batch as input. When the input is a list of multiple tensor batches,
13 |         they are concatenated into a single tensor batch.
14 |     """
15 | 
16 |     def __init__(self, **kwargs):
17 |         super().__init__(**kwargs)
18 | 
19 |     def build(self, hp, inputs=None):
20 |         """ Build the optimization layer.
21 | 
22 |         # Arguments
23 |             hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
24 |             inputs (Tensor): List of batch input tensors.
25 | 
26 |         # Returns
27 |             The defined optimizer block.
28 |         """
29 |         input_node = tf.concat(inputs, axis=1)
30 |         output_node = tf.keras.layers.Dense(1)(input_node)
31 |         output_node = tf.reshape(output_node, [-1])
32 |         return output_node
33 | 
34 |     @property
35 |     def metric(self):
36 |         """ Define the metric used for model evaluation.
37 | 
38 |         # Returns
39 |             The defined metric object.
40 |         """
41 |         return tf.keras.metrics.MeanSquaredError(name='mse')
42 | 
43 |     @property
44 |     def loss(self):
45 |         """ Define the loss used for model training.
46 | 
47 |         # Returns
48 |             The defined loss object.
49 |         """
50 |         return tf.keras.losses.MeanSquaredError(name='mse')
51 | 
52 | 
53 | class CTRPredictionOptimizer(Block):
54 |     """ For the CTR (click-through rate) prediction task, this module employs the 'sigmoid' activation function and
55 |     the 'BinaryCrossentropy' loss and metric for training and evaluation.
56 | 
57 |     # Note
58 |         This module takes a list containing a single tensor batch as input. When the input is a list of multiple tensor batches,
59 |         they are concatenated into a single tensor batch.
60 |     """
61 | 
62 |     def build(self, hp, inputs=None):
63 |         """ Build the optimization layer.
64 | 
65 |         # Arguments
66 |             hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
67 |             inputs (Tensor): List of batch input tensors.
68 | 
69 |         # Returns
70 |             The defined optimizer block.
71 |         """
72 |         input_node = tf.concat(inputs, axis=1)
73 |         output_node = tf.keras.layers.Dense(1, activation='sigmoid')(input_node)
74 |         output_node = tf.reshape(output_node, [-1, 1])
75 |         return output_node
76 | 
77 |     @property
78 |     def metric(self):
79 |         """ Define the metric used for model evaluation.
80 | 
81 |         # Returns
82 |             The defined metric object.
83 |         """
84 |         return tf.keras.metrics.BinaryCrossentropy(name='BinaryCrossentropy')
85 | 
86 |     @property
87 |     def loss(self):
88 |         """ Define the loss used for model training.
89 | 
90 |         # Returns
91 |             The defined loss object.
92 |         """
93 |         return tf.keras.losses.BinaryCrossentropy(name='BinaryCrossentropy')
92 | """ 93 | return tf.keras.losses.BinaryCrossentropy(name='BinaryCrossentropy') 94 | -------------------------------------------------------------------------------- /autorecsys/pipeline/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras.layers import Layer 5 | 6 | 7 | class Bias(Layer): 8 | """ This module builds a Keras layer of bias terms (e.g., MLP layer with zero weight matrix). 9 | 10 | # Arguments 11 | units (int): The units of all layer in the Bias layer. 12 | 13 | # Attributes 14 | bias (Tensor): The bias layer. 15 | """ 16 | 17 | def __init__(self, units=32): 18 | super(Bias, self).__init__() 19 | bias_init = tf.zeros_initializer() 20 | self.bias = tf.Variable(initial_value=bias_init(shape=(units,), dtype='float32'), trainable=True) 21 | 22 | def call(self, inputs): 23 | """ Add the bias layer to the input tensor layer. 24 | 25 | # Arguments 26 | inputs (Tensor): List of batch input tensors. 27 | 28 | # Returns 29 | List of batch input tensors added with bias tensors. 30 | """ 31 | return inputs + self.bias 32 | -------------------------------------------------------------------------------- /autorecsys/recommender.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | from autorecsys.pipeline.graph import HyperGraph 4 | 5 | 6 | class RPRecommender(HyperGraph): # pragma: no cover 7 | """A rating prediction HyperModel based on connected Blocks and HyperBlocks. 8 | 9 | # Arguments 10 | inputs (list): A list of input node(s) for the HyperGraph. 11 | outputs (list): A list of output node(s) for the HyperGraph. 12 | """ 13 | def __init__(self, **kwargs): 14 | super().__init__(**kwargs) 15 | 16 | 17 | class CTRRecommender(HyperGraph): # pragma: no cover 18 | """A CTR (click-through rate) prediction HyperModel based on connected Blocks and HyperBlocks. 19 | 20 | # Arguments 21 | inputs (list): A list of input node(s) for the HyperGraph. 22 | outputs (list): A list of output node(s) for the HyperGraph. 23 | """ 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | -------------------------------------------------------------------------------- /autorecsys/searcher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/__init__.py -------------------------------------------------------------------------------- /autorecsys/searcher/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/core/__init__.py -------------------------------------------------------------------------------- /autorecsys/searcher/core/trial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # This codes are migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/. 3 | # The copyright belows to the Keras Tuner authors. 
--------------------------------------------------------------------------------
/autorecsys/searcher/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/__init__.py

--------------------------------------------------------------------------------
/autorecsys/searcher/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/core/__init__.py

--------------------------------------------------------------------------------
/autorecsys/searcher/core/trial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 | 
5 | 
6 | from __future__ import absolute_import, division, print_function, unicode_literals
7 | 
8 | import random
9 | import tensorflow as tf
10 | import time
11 | import json
12 | 
13 | from autorecsys.searcher.core import hyperparameters as hp_module
14 | from autorecsys.utils import display, metric
15 | 
16 | 
17 | class Stateful(object):
18 | 
19 |     def get_state(self):
20 |         raise NotImplementedError
21 | 
22 |     def set_state(self, state):
23 |         raise NotImplementedError
24 | 
25 |     def save(self, fname):
26 |         state = self.get_state()
27 |         state_json = json.dumps(state)
28 |         with open(fname, 'w') as fp:
29 |             fp.write(state_json)
30 |         return str(fname)
31 | 
32 |     def reload(self, fname):
33 |         with open(fname, 'r') as fp:
34 |             state = json.load(fp)
35 |         self.set_state(state)
36 | 
37 | 
38 | class TrialStatus:
39 |     RUNNING = 'RUNNING'
40 |     IDLE = 'IDLE'
41 |     INVALID = 'INVALID'
42 |     STOPPED = 'STOPPED'
43 |     COMPLETED = 'COMPLETED'
44 | 
45 | 
46 | class Trial(Stateful):
47 | 
48 |     def __init__(self,
49 |                  hyperparameters,
50 |                  trial_id=None,
51 |                  status=TrialStatus.RUNNING):
52 |         self.hyperparameters = hyperparameters
53 |         self.trial_id = generate_trial_id() if trial_id is None else trial_id
54 |         self.metrics = metric.MetricsTracker()
55 |         self.score = None
56 |         self.best_step = None
57 |         self.status = status
58 | 
59 |     def summary(self):
60 |         display.section('Trial summary')
61 |         if self.hyperparameters.values:
62 |             display.subsection('Hp values:')
63 |             value_need_display = {k: v for k, v in self.hyperparameters.values.items()
64 |                                   if k in self.hyperparameters._space and
65 |                                   self.hyperparameters._space[k].__class__.__name__ != 'Fixed'}
66 |             display.display_settings(value_need_display)
67 |         else:
68 |             display.subsection('Hp values: default configuration.')
69 |         if self.score is not None:
70 |             display.display_setting('Score: {}'.format(self.score))
71 |         if self.best_step is not None:
72 |             display.display_setting('Best step: {}'.format(self.best_step))
73 | 
74 |     def get_state(self):
75 |         return {
76 |             'trial_id': self.trial_id,
77 |             'hyperparameters': self.hyperparameters.get_config(),
78 |             'metrics': self.metrics.get_config(),
79 |             'score': self.score,
80 |             'best_step': self.best_step,
81 |             'status': self.status
82 |         }
83 | 
84 |     def set_state(self, state):
85 |         self.trial_id = state['trial_id']
86 |         hp = hp_module.HyperParameters.from_config(
87 |             state['hyperparameters']
88 |         )
89 |         self.hyperparameters = hp
90 |         self.metrics = metric.MetricsTracker.from_config(state['metrics'])
91 |         self.score = state['score']
92 |         self.best_step = state['best_step']
93 |         self.status = state['status']
94 | 
95 |     @classmethod
96 |     def from_state(cls, state):
97 |         trial = cls(hyperparameters=None)
98 |         trial.set_state(state)
99 |         return trial
100 | 
101 |     @classmethod
102 |     def load(cls, fname):
103 |         with tf.io.gfile.GFile(fname, 'r') as f:
104 |             state_data = f.read()
105 |         return cls.from_state(json.loads(state_data))
106 | 
107 | 
108 | def generate_trial_id():
109 |     s = str(time.time()) + str(random.randint(1, 1e7))
110 |     # return hashlib.sha256(s.encode('utf-8')).hexdigest()[:32]
111 |     return hash(s) % 1045543567
112 | 
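# Persistence sketch: any `Stateful` subclass round-trips its state through JSON
# via the `save`/`reload` pair above, e.g. (assuming `trial` is a `Trial` whose
# state is JSON-serializable):
#
#     trial.save('trial_state.json')
#     trial.reload('trial_state.json')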
--------------------------------------------------------------------------------
/autorecsys/searcher/core/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | LOGGER = logging.getLogger(__name__)
4 | TYPE_MAP = {'int': int, 'float': float, 'str': str, 'list': list, 'tuple': tuple, 'bool': bool}
5 | CANT_BE_SET = -1
6 | 
7 | 
8 | def check_valid_params(name, x, param_info, skip_range_detect):
9 |     param_type = TYPE_MAP[param_info['type']]
10 |     try:
11 |         x = param_type(x)
12 |     except ValueError as e:
13 |         LOGGER.exception(f'cannot cast {name} to {param_type}')
14 |         raise e
15 |     param_range = param_info.get('range', None)
16 |     if param_range == CANT_BE_SET:
17 |         raise TypeError(f'{name} cannot be set from config files')
18 |     if not skip_range_detect:
19 |         if isinstance(param_range, tuple):
20 |             if x not in param_range:
21 |                 raise ValueError(f'{name} must be in {param_range}, {x} doesn\'t')
22 |         elif isinstance(param_range, list):
23 |             low, high = param_range
24 |             if x < low or x > high:
25 |                 raise ValueError(f'{name} valid range: x>={low} && x<={high}')
26 |         else:
27 |             raise NotImplementedError(f"code error: the param's range of a model must be a tuple or a list")
28 |     return x

--------------------------------------------------------------------------------
/autorecsys/searcher/tuners/__init__.py:
--------------------------------------------------------------------------------
1 | from .randomsearch import RandomSearch
2 | from .bayesian import BayesianOptimization
3 | from .greedy import Greedy
4 | 
5 | TUNER_CLASSES = {
6 |     'random': RandomSearch,
7 |     'bayesian': BayesianOptimization,
8 |     'greedy': Greedy
9 | }
10 | 
11 | 
12 | def get_tuner_class(tuner):
13 |     if isinstance(tuner, str) and tuner in TUNER_CLASSES:
14 |         return TUNER_CLASSES.get(tuner)
15 |     else:
16 |         raise ValueError('The value {tuner} passed for argument tuner is invalid, '
17 |                          'expected one of "random", "bayesian", "greedy".'.format(tuner=tuner))

--------------------------------------------------------------------------------
/autorecsys/searcher/tuners/greedy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 | 
5 | 
6 | from __future__ import absolute_import, division, print_function, unicode_literals
7 | 
8 | import random
9 | import numpy as np
10 | 
11 | from autorecsys.searcher.tuners.tuner import PipeTuner
12 | from autorecsys.searcher.core import hyperparameters as hp_module
13 | from autorecsys.searcher.core import oracle as oracle_module
14 | from autorecsys.searcher.core import trial as trial_lib
15 | 
16 | 
17 | class GreedyOracle(oracle_module.Oracle):
18 |     """An oracle combining random search and a greedy algorithm.
19 |     It groups the HyperParameters into several categories, namely, HyperGraph,
20 |     Preprocessor, Architecture, and Optimization. The oracle tunes each group
21 |     separately using random search. In each trial, it uses a greedy strategy to
22 |     generate new values for one of the categories of HyperParameters and uses the best
23 |     trial so far for the rest of the HyperParameters values.
24 |     # Arguments
25 |         initial_hps: A list of dictionaries in the form of
26 |             {HyperParameter name (String): HyperParameter value}.
27 |             Each dictionary is one set of HyperParameters, which are used as the
28 |             initial trials for the search. Defaults to None.
29 |         seed: Int. Random seed.
30 | """ 31 | 32 | HYPER = 'HYPER' 33 | PREPROCESS = 'PREPROCESS' 34 | OPT = 'OPT' 35 | ARCH = 'ARCH' 36 | STAGES = [HYPER, PREPROCESS, OPT, ARCH] 37 | 38 | @staticmethod 39 | def next_stage(stage): 40 | stages = GreedyOracle.STAGES 41 | return stages[(stages.index(stage) + 1) % len(stages)] 42 | 43 | def __init__(self, 44 | hypermodel, 45 | initial_hps=None, 46 | seed=None, 47 | **kwargs): 48 | super().__init__(**kwargs) 49 | self.initial_hps = initial_hps or [] 50 | self._tried_initial_hps = [False] * len(self.initial_hps) 51 | self.hypermodel = hypermodel 52 | # Sets of HyperParameter names. 53 | self._hp_names = { 54 | GreedyOracle.HYPER: set(), 55 | GreedyOracle.PREPROCESS: set(), 56 | GreedyOracle.OPT: set(), 57 | GreedyOracle.ARCH: set(), 58 | } 59 | # The quota used to tune each category of hps. 60 | self.seed = seed or random.randint(1, 1e4) 61 | # Incremented at every call to `populate_space`. 62 | self._seed_state = self.seed 63 | self._tried_so_far = set() 64 | self._max_collisions = 5 65 | 66 | def update_space(self, hyperparameters): 67 | # Get the block names. 68 | self.hypermodel.build(hyperparameters) 69 | 70 | # Add the new Hyperparameters to different categories. 71 | ref_names = {hp.name for hp in self.hyperparameters.space} 72 | for hp in hyperparameters.space: 73 | if hp.name not in ref_names: 74 | hp_type = GreedyOracle.ARCH 75 | self._hp_names[hp_type].add(hp.name) 76 | super().update_space(hyperparameters) 77 | 78 | def _generate_stage(self): 79 | probabilities = np.array([pow(len(value), 2) 80 | for value in self._hp_names.values()]) 81 | sum_p = np.sum(probabilities) 82 | if sum_p == 0: 83 | probabilities = np.array([1] * len(probabilities)) 84 | sum_p = np.sum(probabilities) 85 | probabilities = probabilities / sum_p 86 | return np.random.choice(list(self._hp_names.keys()), p=probabilities) 87 | 88 | def _next_initial_hps(self): 89 | for index, hps in enumerate(self.initial_hps): 90 | if not self._tried_initial_hps[index]: 91 | self._tried_initial_hps[index] = True 92 | return hps 93 | 94 | def _populate_space(self, trial_id): 95 | if not all(self._tried_initial_hps): 96 | return {'status': trial_lib.TrialStatus.RUNNING, 97 | 'values': self._next_initial_hps()} 98 | 99 | stage = self._generate_stage() 100 | for _ in range(len(GreedyOracle.STAGES)): 101 | values = self._generate_stage_values(stage) 102 | # Reached max collisions. 103 | if values is None: 104 | # Try next stage. 105 | stage = GreedyOracle.next_stage(stage) 106 | continue 107 | # Values found. 108 | return {'status': trial_lib.TrialStatus.RUNNING, 109 | 'values': values} 110 | # All stages reached max collisions. 111 | return {'status': trial_lib.TrialStatus.STOPPED, 112 | 'values': None} 113 | 114 | def _generate_stage_values(self, stage): 115 | best_trials = self.get_best_trials() 116 | if best_trials: 117 | best_values = best_trials[0].hyperparameters.values 118 | else: 119 | best_values = self.hyperparameters.values 120 | collisions = 0 121 | while True: 122 | # Generate new values for the current stage. 123 | values = {} 124 | for p in self.hyperparameters.space: 125 | if p.name in self._hp_names[stage]: 126 | values[p.name] = p.random_sample(self._seed_state) 127 | self._seed_state += 1 128 | values = {**best_values, **values} 129 | # Keep trying until the set of values is unique, 130 | # or until we exit due to too many collisions. 
131 |             values_hash = self._compute_values_hash(values)
132 |             if values_hash not in self._tried_so_far:
133 |                 self._tried_so_far.add(values_hash)
134 |                 break
135 |             collisions += 1
136 |             if collisions > self._max_collisions:
137 |                 # Reached max collisions. No value to return.
138 |                 return None
139 |         return values
140 | 
141 | 
142 | class Greedy(PipeTuner):
143 | 
144 |     def __init__(self,
145 |                  hypergraph,
146 |                  objective,
147 |                  max_trials,
148 |                  initial_hps=None,
149 |                  seed=None,
150 |                  hyperparameters=None,
151 |                  tune_new_entries=True,
152 |                  allow_new_entries=True,
153 |                  **kwargs):
154 |         self.seed = seed
155 |         oracle = GreedyOracle(hypermodel=hypergraph,
156 |                               objective=objective,
157 |                               max_trials=max_trials,
158 |                               initial_hps=initial_hps,
159 |                               seed=seed,
160 |                               hyperparameters=hyperparameters,
161 |                               tune_new_entries=tune_new_entries,
162 |                               allow_new_entries=allow_new_entries)
163 |         super(Greedy, self).__init__(oracle,
164 |                                      hypergraph,
165 |                                      **kwargs)
166 | 
167 |     @classmethod
168 |     def get_name(cls):
169 |         return 'greedy'

--------------------------------------------------------------------------------
/autorecsys/searcher/tuners/randomsearch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 | 
5 | 
6 | "Basic random search searcher."
7 | 
8 | from __future__ import absolute_import, division, print_function, unicode_literals
9 | 
10 | import random
11 | 
12 | from autorecsys.searcher.tuners.tuner import PipeTuner
13 | from autorecsys.searcher.core import hyperparameters as hp_module
14 | from autorecsys.searcher.core import oracle as oracle_module
15 | from autorecsys.searcher.core import trial as trial_lib
16 | 
17 | 
18 | class RandomSearchOracle(oracle_module.Oracle):
19 |     """Random search oracle.
20 |     Attributes:
21 |         objective: String or `kerastuner.Objective`. If a string,
22 |             the direction of the optimization (min or max) will be
23 |             inferred.
24 |         max_trials: Int. Total number of trials
25 |             (model configurations) to test at most.
26 |             Note that the oracle may interrupt the search
27 |             before `max_trials` models have been tested.
28 |         seed: Int. Random seed.
29 |         hyperparameters: HyperParameters class instance.
30 |             Can be used to override (or register in advance)
31 |             hyperparameters in the search space.
32 |         tune_new_entries: Whether hyperparameter entries
33 |             that are requested by the hypermodel
34 |             but that were not specified in `hyperparameters`
35 |             should be added to the search space, or not.
36 |             If not, then the default value for these parameters
37 |             will be used.
38 |         allow_new_entries: Whether the hypermodel is allowed
39 |             to request hyperparameter entries not listed in
40 |             `hyperparameters`.
41 |     """
42 | 
43 |     def __init__(self,
44 |                  objective,
45 |                  max_trials,
46 |                  seed=None,
47 |                  hyperparameters=None,
48 |                  allow_new_entries=True,
49 |                  tune_new_entries=True):
50 |         super(RandomSearchOracle, self).__init__(
51 |             objective=objective,
52 |             max_trials=max_trials,
53 |             hyperparameters=hyperparameters,
54 |             tune_new_entries=tune_new_entries,
55 |             allow_new_entries=allow_new_entries)
56 |         self.seed = seed or random.randint(1, 1e4)
57 |         # Incremented at every call to `populate_space`.
58 |         self._seed_state = self.seed
59 |         # Hashes of values tried so far.
60 |         self._tried_so_far = set()
61 |         # Maximum number of identical values that can be generated
62 |         # before we consider the space to be exhausted.
63 |         self._max_collisions = 5
64 | 
65 |     def _populate_space(self, _):
66 |         """Fill the hyperparameter space with values.
67 |         Args:
68 |             `trial_id`: The id for this Trial.
69 |         Returns:
70 |             A dictionary with keys "values" and "status", where "values" is
71 |             a mapping of parameter names to suggested values, and "status"
72 |             is the TrialStatus that should be returned for this trial (one
73 |             of "RUNNING", "IDLE", or "STOPPED").
74 |         """
75 |         collisions = 0
76 |         while True:
77 |             # Generate a set of random values.
78 |             values = {}
79 |             if all(isinstance(p, hp_module.Fixed) for p in self.hyperparameters.space):
80 |                 break
81 |             for p in self.hyperparameters.space:
82 |                 values[p.name] = p.random_sample(self._seed_state)
83 |                 self._seed_state += 1
84 |             # Keep trying until the set of values is unique,
85 |             # or until we exit due to too many collisions.
86 |             values_hash = self._compute_values_hash(values)
87 |             if values_hash in self._tried_so_far:
88 |                 collisions += 1
89 |                 if collisions > self._max_collisions:
90 |                     return {'status': trial_lib.TrialStatus.STOPPED,
91 |                             'values': None}
92 |                 continue
93 |             self._tried_so_far.add(values_hash)
94 |             break
95 |         return {'status': trial_lib.TrialStatus.RUNNING,
96 |                 'values': values}
97 | 
98 |     def get_state(self):
99 |         state = super(RandomSearchOracle, self).get_state()
100 |         state.update({
101 |             'seed': self.seed,
102 |             'seed_state': self._seed_state,
103 |             'tried_so_far': list(self._tried_so_far),
104 |         })
105 |         return state
106 | 
107 |     def set_state(self, state):
108 |         super(RandomSearchOracle, self).set_state(state)
109 |         self.seed = state['seed']
110 |         self._seed_state = state['seed_state']
111 |         self._tried_so_far = set(state['tried_so_far'])
112 | 
113 | 
114 | class RandomSearch(PipeTuner):
115 |     """Random search tuner.
116 |     # Arguments:
117 |         config: Dictionary. Specify the search configurations
118 |             including TrainOptions, ModelOptions, Search Options.
119 |         objective: String. Name of model metric to minimize
120 |             or maximize, e.g. "val_accuracy".
121 |         max_trials: Int. Total number of trials
122 |             (model configurations) to test at most.
123 |             Note that the oracle may interrupt the search
124 |             before `max_trials` models have been tested.
125 |         seed: Int. Random seed.
126 |         hyperparameters: HyperParameters class instance.
127 |             Can be used to override (or register in advance)
128 |             hyperparameters in the search space.
129 |         tune_new_entries: Whether hyperparameter entries
130 |             that are requested by the hypermodel
131 |             but that were not specified in `hyperparameters`
132 |             should be added to the search space, or not.
133 |             If not, then the default value for these parameters
134 |             will be used.
135 |         allow_new_entries: Whether the hypermodel is allowed
136 |             to request hyperparameter entries not listed in
137 |             `hyperparameters`.
138 |         **kwargs: Keyword arguments relevant to all `Tuner` subclasses.
139 |             Please see the docstring for `Tuner`.
140 | """ 141 | 142 | def __init__(self, 143 | hypergraph, 144 | objective, 145 | max_trials, 146 | seed=None, 147 | hyperparameters=None, 148 | tune_new_entries=True, 149 | allow_new_entries=True, 150 | **kwargs): 151 | self.seed = seed 152 | oracle = RandomSearchOracle(objective=objective, 153 | max_trials=max_trials, 154 | seed=seed, 155 | hyperparameters=hyperparameters, 156 | tune_new_entries=tune_new_entries, 157 | allow_new_entries=allow_new_entries) 158 | super(RandomSearch, self).__init__(oracle, 159 | hypergraph, 160 | **kwargs) 161 | 162 | @classmethod 163 | def get_name(cls): 164 | return 'random' 165 | -------------------------------------------------------------------------------- /autorecsys/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/utils/__init__.py -------------------------------------------------------------------------------- /autorecsys/utils/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import re 4 | import os 5 | import shutil 6 | import pandas as pd 7 | import numpy as np 8 | import tensorflow as tf 9 | import random 10 | import pickle 11 | import string 12 | 13 | 14 | def dataset_shape(dataset): 15 | """ Get the shape of the dataset. 16 | 17 | Args: 18 | dataset (tf.data.Dataset or Tf.data.Iterator): A TensorFlow Dataset or Iterator. 19 | 20 | Returns: 21 | A nested structure of tf.TensorShape object matching the structure of the dataset / iterator elements and 22 | specifying the shape of the individual components. 23 | """ 24 | return tf.compat.v1.data.get_output_shapes(dataset) 25 | 26 | 27 | def to_snake_case(name): 28 | """ Convert the given class name to snake case. 29 | 30 | # Arguments 31 | name (str): The name of the class. 32 | 33 | # Returns 34 | String name of the class in snake case. 35 | """ 36 | insecure = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 37 | insecure = re.sub('([a-z0-9])([A-Z])', r'\1_\2', insecure).lower() 38 | for p in string.punctuation: 39 | insecure = insecure.replace(p, "_") 40 | 41 | if insecure[0] != '_': 42 | return insecure 43 | # A private class (starts with "_") is not secure for creating scopes and is thus prefixed w/ "private". 44 | return 'private' + insecure 45 | 46 | 47 | def create_directory(path, remove_existing=False): 48 | """ Create the designated directory. 49 | 50 | # Arguments 51 | path (str): Path to create the directory. 52 | remove_existing (bool): Whether to remove the directory if it already exists. 53 | """ 54 | # Create the directory if it doesn't exist. 55 | if not os.path.exists(path): 56 | os.mkdir(path) 57 | # Remove the preexisting directory if allowed. 58 | elif remove_existing: 59 | shutil.rmtree(path) 60 | os.mkdir(path) 61 | 62 | 63 | def set_device(device_name): 64 | """ Set the computational devices used to run models. 65 | 66 | # Arguments 67 | device_name (str): Name of the CPU or GPU. 
68 | """ 69 | if device_name[0:3] == "cpu": 70 | cpus = tf.config.experimental.list_physical_devices('CPU') 71 | print("Available CPUs: {}".format(cpus)) 72 | assert len(cpus) > 0, "Not enough CPU hardware devices available" 73 | cpu_idx = int(device_name[-1]) 74 | tf.config.experimental.set_visible_devices(cpus[cpu_idx], 'CPU') 75 | else: 76 | gpus = tf.config.experimental.list_physical_devices('GPU') 77 | for gpu in gpus: 78 | tf.config.experimental.set_memory_growth(gpu, True) 79 | print("Available GPUs: {}".format(gpus)) 80 | assert len(gpus) > 0, "Not enough GPU hardware devices available" 81 | gpu_idx = int(device_name[-1]) 82 | tf.config.experimental.set_visible_devices(gpus[gpu_idx], 'GPU') 83 | 84 | 85 | def load_dataframe_input(x): 86 | """ Load the input object as a DataFrame or a Series. 87 | 88 | # Note 89 | Cover the following classes: None, DataFrame, Series, ndarray, and str. 90 | 91 | # Arguments 92 | x (object): The object to be loaded as a DataFrame or Series. 93 | 94 | # Returns 95 | The loaded DataFrame or Series. 96 | """ 97 | if x is None: 98 | return None 99 | if isinstance(x, pd.DataFrame) or isinstance(x, pd.Series): 100 | res = x 101 | elif isinstance(x, np.ndarray): 102 | res = pd.Series(x) if len(x.shape) == 1 else pd.DataFrame(x) 103 | elif isinstance(x, str): 104 | if not x.endswith('.csv'): 105 | raise TypeError(f'ONLY accept path to the local csv files') 106 | res = pd.read_csv(x) 107 | else: 108 | raise TypeError(f"cannot load {type(x)} into pandas dataframe") 109 | 110 | # Ensure the type of column names is string 111 | if isinstance(res, pd.DataFrame): 112 | res.columns = res.columns.astype('str') 113 | return res 114 | 115 | 116 | def set_seed(seed=42): 117 | """ Set the seed for randomization functions. 118 | 119 | # Note 120 | Cover the following libraries: Python, Numpy, and TensorFlow 121 | 122 | # Arguments 123 | seed (float): The seed number used to create fixed randomization. 124 | """ 125 | random.seed(seed) 126 | np.random.seed(seed) 127 | tf.random.set_seed(seed) 128 | 129 | 130 | def save_pickle(path, obj): 131 | """ Save the input object to the designated path. 132 | 133 | # Arguments 134 | path (str): Designated path to save the object. 135 | obj (object): The object to be saved. 136 | """ 137 | with open(path, 'wb') as f: 138 | pickle.dump(obj, f) 139 | 140 | 141 | def load_pickle(path): 142 | """ Load the object file from the designated path. 143 | 144 | # Arguments 145 | path: Designated path to load the object. 146 | 147 | Returns: 148 | The loaded object. 149 | """ 150 | with open(path, 'rb') as f: 151 | return pickle.load(f) 152 | -------------------------------------------------------------------------------- /autorecsys/utils/display.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # This codes are migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/. 3 | # The copyright belows to the Keras Tuner authors. 
4 | 5 | 6 | 7 | """Display utilities.""" 8 | 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | from terminaltables import SingleTable, AsciiTable 14 | from tabulate import tabulate 15 | from colorama import init, Fore, Back, Style 16 | 17 | init() # colorama init 18 | 19 | # Check if we are in an ipython/colab environment 20 | try: 21 | class_name = get_ipython().__class__.__name__ 22 | if "Terminal" in class_name: 23 | IS_NOTEBOOK = False 24 | else: 25 | IS_NOTEBOOK = True 26 | 27 | except NameError: 28 | IS_NOTEBOOK = False 29 | 30 | if IS_NOTEBOOK: 31 | from tqdm import tqdm_notebook as tqdm 32 | from IPython.display import HTML 33 | from IPython.display import display as ipython_display 34 | 35 | 36 | def display(text): 37 | ipython_display(HTML(text)) 38 | else: 39 | from tqdm import tqdm 40 | 41 | display = print 42 | 43 | FG = 0 44 | BG = 1 45 | 46 | # TODO: create a set of HTML colors to allow richer display in colab 47 | colors = { 48 | 'black': [Fore.BLACK, Back.BLACK], 49 | 'red': [Fore.RED, Back.RED], 50 | 'green': [Fore.GREEN, Back.GREEN], 51 | 'yellow': [Fore.YELLOW, Back.YELLOW], 52 | 'blue': [Fore.BLUE, Back.BLUE], 53 | 'magenta': [Fore.MAGENTA, Back.MAGENTA], 54 | 'cyan': [Fore.CYAN, Back.CYAN], 55 | 'white': [Fore.WHITE, Back.WHITE], 56 | } 57 | 58 | styles = { 59 | "dim": Style.DIM, 60 | "normal": Style.NORMAL, 61 | "bright": Style.BRIGHT, 62 | "reset": Style.RESET_ALL 63 | } 64 | 65 | 66 | # Shorthand functions 67 | def info(text, render=1): 68 | """ Display an info message 69 | 70 | Args: 71 | text (str): info message 72 | render (bool, optional): Defaults to True. Render or return the message 73 | 74 | Returns: 75 | str: colorized message if render=False, None otherwise 76 | """ 77 | color = 'blue' 78 | s = "[Info] %s" % text 79 | 80 | if render: 81 | cprint(s, color) 82 | else: 83 | return colorize(s + '\n', color) 84 | 85 | 86 | def warning(text, render=1): 87 | """ Display a warning 88 | 89 | Args: 90 | text (str): warning message 91 | render (bool, optional): Defaults to True. Render or return the message 92 | 93 | Returns: 94 | str: colorized message if render=False, None otherwise 95 | """ 96 | color = 'yellow' 97 | s = "[Warning] %s" % text 98 | 99 | if render: 100 | cprint(s, color) 101 | else: 102 | return colorize(s + '\n', color) 103 | 104 | 105 | def fatal(text, render=True, raise_exception=True): 106 | """ Display a fatal error, and die 107 | 108 | Args: 109 | text (str): Fatal message 110 | render (bool, optional): Render or return settings. Defaults to True. 111 | raise_exception (bool, optional): Raise a ValueError. Defaults to True. 112 | Returns: 113 | str: Formatted fatal message 114 | """ 115 | color = 'white' 116 | bgcolor = 'red' 117 | s = "[FATAL] %s" % text 118 | 119 | if render: 120 | cprint(s, color, bgcolor) 121 | if raise_exception: 122 | raise ValueError(s) 123 | return colorize(s + '\n', color, bgcolor) 124 | 125 | 126 | def section(text): 127 | """ Render a section 128 | 129 | Args: 130 | text (str): Section name 131 | """ 132 | if IS_NOTEBOOK: 133 | section = '<h1>' + text + '</h1>' 134 | cprint(section, '#4527A0') 135 | else: 136 | section = '[' + text + ']' 137 | cprint(section, 'yellow') 138 | 139 | 140 | def subsection(text): 141 | """ Render a subsection. 142 | 143 | Args: 144 | text (str): Subsection name 145 | """ 146 | if IS_NOTEBOOK: 147 | section = '<h2>' + text + '</h2>'
148 | cprint(section, '#7E57C2') 149 | else: 150 | section = ' > ' + text 151 | cprint(section, 'magenta', brightness='dim') 152 | 153 | 154 | def display_setting(text, indent_level=1, idx=0, render=True): 155 | """ Print a single setting 156 | 157 | Args: 158 | text (str): Setting key:value as string 159 | indent_level (int, optional): Num indentation spaces. Defaults to 1. 160 | idx (int, optional): Index of setting to rotate color. Defaults to 0. 161 | render (bool, optional): Render or return settings. Defaults to True. 162 | 163 | Returns: 164 | str: colorized settings. 165 | """ 166 | s = ' ' * indent_level 167 | s += '|-' + text 168 | if idx % 2: 169 | color = 'blue' 170 | else: 171 | color = 'cyan' 172 | 173 | if render: 174 | cprint(s, color) 175 | return colorize(s + '\n', color) 176 | 177 | 178 | def display_settings(mysettings, indent_level=1, render=True): 179 | """ 180 | Render a collection of settings 181 | 182 | Args: 183 | mysettings (dict): Dictionary of settings 184 | indent_level (int): Indentation level. Defaults to 1. 185 | render (bool, optional): Print? Defaults to True. 186 | """ 187 | s = "" 188 | idx = 0 189 | for name in sorted(mysettings.keys()): 190 | value = mysettings[name] 191 | txt = "%s: %s" % (name, value) 192 | s += display_setting(txt, idx=idx, indent_level=indent_level, 193 | render=render) 194 | idx += 1 195 | return s 196 | 197 | 198 | def highlight(text): 199 | if IS_NOTEBOOK: 200 | text = '<b>' + text + '</b>' 201 | cprint(text, '#64DD17') 202 | else: 203 | cprint(text, 'green', brightness="bright") 204 | 205 | 206 | # Charts 207 | 208 | 209 | def display_bar_chart(val, max_val, title=None, left='', right='', 210 | color='green', length=80): 211 | bar = make_bar_chart(val, max_val, title=title, left=left, right=right, 212 | color=color, length=length) 213 | display(bar) 214 | 215 | 216 | def make_bar_chart(val, max_val, title=None, left='', right='', 217 | color='green', length=80): 218 | full_block = '█' 219 | empty_block = '░' 220 | half_block = '▒' 221 | 222 | # building the bar 223 | bar = '' 224 | num_full = length * val / float(max_val) 225 | bar += full_block * int(num_full) 226 | if not (num_full).is_integer(): 227 | bar += half_block 228 | bar += empty_block * (length - len(bar)) 229 | 230 | # colorize 231 | bar = colorize(bar, color) 232 | 233 | # adding left/right text if needed 234 | row = [] 235 | if left: 236 | row.append(left) 237 | row.append(bar) 238 | if right: 239 | row.append(right) 240 | 241 | st = SingleTable([row], title) 242 | st.inner_column_border = False 243 | return st.table 244 | 245 | 246 | # Low level function 247 | 248 | 249 | def cprint(text, color, bg_color=None, brightness='normal'): 250 | """ Print a given piece of text with color 251 | 252 | Args: 253 | text (str): text to colorize 254 | color (str): foreground color 255 | bg_color (str, optional): Defaults to None. background color. 256 | brightness (str, optional): Defaults to normal. Text brightness. 257 | """ 258 | 259 | text = colorize(text, color, bg_color, brightness) 260 | 261 | # HTMLify if needed 262 | display(text) 263 | 264 | 265 | def colorize_row(row, color, bg_color=None, brightness='normal'): 266 | """Colorize a table row. 267 | 268 | Args: 269 | row (list): The row to colorize. 270 | color (str): Foreground color. 271 | bg_color (str): Background color. Defaults to None. 272 | brightness (str, optional): Defaults to normal. Text brightness.
273 | Returns: 274 | list: colorized row 275 | """ 276 | colored_row = [] 277 | for v in row: 278 | colored_row.append(colorize(v, color, bg_color, brightness)) 279 | return colored_row 280 | 281 | 282 | def colorize_default(text): 283 | """Colorize a given piece of text with the terminal default color 284 | Args: 285 | text (str): text to colorize 286 | """ 287 | if IS_NOTEBOOK: 288 | text = text + '</span>' 289 | else: 290 | text = text + styles['reset'] 291 | return text 292 | 293 | 294 | def colorize(text, color, bg_color=None, brightness='normal'): 295 | """ Colorize a given piece of text 296 | Args: 297 | text (str): text to colorize 298 | color (str): foreground color 299 | bg_color (str, optional): Defaults to None. background color. 300 | brightness (str, optional): Defaults to normal. Text brightness. 301 | 302 | Returns: 303 | str: colorized text 304 | """ 305 | 306 | text = str(text) # in case the user passes a float/int 307 | 308 | # we need a special case as the term default color/bgcolor is unknown 309 | if color == 'default': 310 | return colorize_default(text) 311 | 312 | if color not in colors and not IS_NOTEBOOK: 313 | msg = "Foreground color invalid:%s" % color 314 | raise ValueError(msg) 315 | 316 | if bg_color and bg_color not in colors and not IS_NOTEBOOK: 317 | msg = "Background color invalid:%s" % bg_color 318 | raise ValueError(msg) 319 | 320 | if brightness not in styles and not IS_NOTEBOOK: 321 | raise ValueError("Brightness invalid:" + brightness) 322 | 323 | # foreground color 324 | if IS_NOTEBOOK: 325 | text = text.replace('\n', '<br>') 326 | h = '<span style="color:%s">' % color 327 | text = h + text 328 | else: 329 | text = colors[color][FG] + text 330 | # background if needed 331 | if bg_color and not IS_NOTEBOOK: 332 | text = colors[bg_color][BG] + text 333 | 334 | # brightness if needed 335 | if brightness != 'normal' and not IS_NOTEBOOK: 336 | text = styles[brightness] + text 337 | 338 | # reset 339 | if IS_NOTEBOOK: 340 | text = text + '</span>' 341 | else: 342 | text = text + styles['reset'] 343 | 344 | return text 345 | 346 | 347 | # TABLE 348 | def display_table(rows, title=None, indent=0): 349 | """ Print data as a nicely formatted ascii table 350 | Args: 351 | rows (list(list)): data to display as list of lists. 352 | title (str, optional): Defaults to None. Table title 353 | """ 354 | table = make_table(rows, title) 355 | 356 | if indent and not IS_NOTEBOOK: 357 | indent = " " * indent 358 | out = [] 359 | for line in table.split("\n"): 360 | out.append(indent + line) 361 | table = "\n".join(out) 362 | display(table) 363 | 364 | 365 | def make_table(rows, title=None): 366 | """ Format list as a pretty ascii table 367 | Args: 368 | rows (list(list)): data to display as list of lists. 369 | title (str, optional): Defaults to None. Table title 370 | Returns: 371 | str: string representing table 372 | """ 373 | if IS_NOTEBOOK: 374 | headers = rows[0] 375 | body = rows[1:] 376 | table = tabulate(body, headers, tablefmt="html") 377 | else: 378 | st = SingleTable(rows, title) 379 | table = st.table 380 | return table 381 | 382 | 383 | def make_combined_table(array_rows): 384 | """ Build a table of tables 385 | 386 | Args: 387 | array_rows (list(list)): Array of tables rows to combine 388 | Returns: 389 | str: string representing table 390 | """ 391 | 392 | if IS_NOTEBOOK: 393 | # compute the size for each col 394 | col_size = str(int(100 / len(array_rows)) - 5) + '%' 395 | gtc = [col_size] * len(array_rows) 396 | table = """ 397 | <style> 398 | .combined_table { 399 | display: grid; 400 | grid-template-columns: %s; 401 | } 402 | </style> 403 | <div class="combined_table"> 404 |
405 | """ % (" ".join(gtc)) 406 | for rows in array_rows: 407 | table += '
' 408 | headers = rows[0] 409 | body = rows[1:] 410 | table += tabulate(body, headers, tablefmt="html") 411 | table += '
' 412 | table += "
" 413 | return table 414 | else: 415 | tables = [] 416 | for rows in array_rows: 417 | tables.append(make_table(rows)) 418 | combined_table = AsciiTable([tables]) 419 | combined_table.outer_border = False 420 | combined_table.inner_column_border = False 421 | return combined_table.table 422 | 423 | 424 | def display_combined_table(array_rows): 425 | """ Build a table of tables and print it 426 | 427 | Args: 428 | array_rows (list(list)): Array of tables rows to combine 429 | """ 430 | table = make_combined_table(array_rows) 431 | display(table) 432 | 433 | 434 | def progress_bar(*args, **kwargs): 435 | """ Returns a new tqdm progress bar appropriate for the current display. 436 | 437 | Returns: 438 | tqdm progress bar. 439 | """ 440 | 441 | return tqdm(*args, **kwargs) 442 | -------------------------------------------------------------------------------- /autorecsys/utils/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # This codes are migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/. 3 | # The copyright belows to the Keras Tuner authors. 4 | 5 | 6 | from __future__ import absolute_import, division, print_function, unicode_literals 7 | 8 | 9 | import numpy as np 10 | from tensorflow import keras 11 | 12 | 13 | 14 | class MetricObservation(object): 15 | 16 | def __init__(self, value, step): 17 | if not isinstance(value, list): 18 | value = [value] 19 | self.value = value 20 | self.step = step 21 | 22 | def append(self, value): 23 | if not isinstance(value, list): 24 | value = [value] 25 | self.value += value 26 | 27 | def mean(self): 28 | return np.mean(self.value) 29 | 30 | def get_config(self): 31 | return {'value': self.value, 32 | 'step': self.step} 33 | 34 | @classmethod 35 | def from_config(cls, config): 36 | return cls(**config) 37 | 38 | def __eq__(self, other): 39 | if not isinstance(other, MetricObservation): 40 | return False 41 | return (other.value == self.value and 42 | other.step == self.step) 43 | 44 | def __repr__(self): 45 | return 'MetricObservation(value={}, step={})'.format( 46 | self.value, self.step) 47 | 48 | 49 | class MetricHistory(object): 50 | 51 | def __init__(self, direction='min'): 52 | if direction not in {'min', 'max'}: 53 | raise ValueError( 54 | '`direction` should be one of ' 55 | '{"min", "max"}, but got: %s' % (direction,)) 56 | self.direction = direction 57 | self._observations = {} 58 | 59 | def update(self, value, step): 60 | if step in self._observations: 61 | self._observations[step].append(value) 62 | else: 63 | self._observations[step] = MetricObservation( 64 | value, step=step) 65 | 66 | def get_best_value(self): 67 | values = list( 68 | obs.mean() for obs in self._observations.values()) 69 | if not values: 70 | return None 71 | if self.direction == 'min': 72 | return np.nanmin(values) 73 | return np.nanmax(values) 74 | 75 | def get_best_step(self): 76 | best_value = self.get_best_value() 77 | if best_value is None: 78 | return None 79 | for obs in self._observations.values(): 80 | if obs.mean() == best_value: 81 | return obs.step 82 | 83 | def get_history(self): 84 | return sorted(self._observations.values(), 85 | key=lambda obs: obs.step) 86 | 87 | def set_history(self, observations): 88 | for obs in observations: 89 | self.update(obs.value, step=obs.step) 90 | 91 | def get_statistics(self): 92 | history = self.get_history() 93 | history_values = [obs.mean() for obs in history] 94 | if not len(history_values): 95 | return {} 96 | return { 97 | 
'min': float(np.nanmin(history_values)), 98 | 'max': float(np.nanmax(history_values)), 99 | 'mean': float(np.nanmean(history_values)), 100 | 'median': float(np.nanmedian(history_values)), 101 | 'var': float(np.nanvar(history_values)), 102 | 'std': float(np.nanstd(history_values)) 103 | } 104 | 105 | def get_last_value(self): 106 | history = self.get_history() 107 | if history: 108 | last_obs = history[-1] 109 | return last_obs.mean() 110 | else: 111 | return None 112 | 113 | def get_config(self): 114 | config = {'direction': self.direction, 115 | 'observations': [obs.get_config() for obs in self.get_history()]} 116 | return config 117 | 118 | @classmethod 119 | def from_config(cls, config): 120 | instance = cls(config['direction']) 121 | instance.set_history([MetricObservation.from_config(obs) 122 | for obs in config['observations']]) 123 | return instance 124 | 125 | 126 | class MetricsTracker(object): 127 | 128 | def __init__(self, metrics=None): 129 | # str -> MetricHistory 130 | self.metrics = {} 131 | self.register_metrics(metrics) 132 | 133 | def exists(self, name): 134 | return name in self.metrics 135 | 136 | def register_metrics(self, metrics=None): 137 | metrics = metrics or [] 138 | for metric in metrics: 139 | self.register(metric.name) 140 | 141 | def register(self, name, direction=None): 142 | if self.exists(name): 143 | raise ValueError('Metric already exists: %s' % (name,)) 144 | if direction is None: 145 | direction = infer_metric_direction(name) 146 | self.metrics[name] = MetricHistory(direction) 147 | 148 | def update(self, name, value, step=0): 149 | value = float(value) 150 | if not self.exists(name): 151 | self.register(name) 152 | 153 | prev_best = self.metrics[name].get_best_value() 154 | self.metrics[name].update(value, step=step) 155 | new_best = self.metrics[name].get_best_value() 156 | 157 | improved = new_best != prev_best 158 | return improved 159 | 160 | def get_history(self, name): 161 | self._assert_exists(name) 162 | return self.metrics[name].get_history() 163 | 164 | def set_history(self, name, observations): 165 | assert type(observations) == list 166 | if not self.exists(name): 167 | self.register(name) 168 | self.metrics[name].set_history(observations) 169 | 170 | def get_best_value(self, name): 171 | self._assert_exists(name) 172 | return self.metrics[name].get_best_value() 173 | 174 | def get_best_step(self, name): 175 | self._assert_exists(name) 176 | return self.metrics[name].get_best_step() 177 | 178 | def get_statistics(self, name): 179 | self._assert_exists(name) 180 | return self.metrics[name].get_statistics() 181 | 182 | def get_last_value(self, name): 183 | self._assert_exists(name) 184 | return self.metrics[name].get_last_value() 185 | 186 | def get_direction(self, name): 187 | self._assert_exists(name) 188 | return self.metrics[name].direction 189 | 190 | def get_config(self): 191 | return { 192 | 'metrics': { 193 | name: metric_history.get_config() 194 | for name, metric_history in self.metrics.items()}} 195 | 196 | @classmethod 197 | def from_config(cls, config): 198 | instance = cls() 199 | instance.metrics = { 200 | name: MetricHistory.from_config(metric_history) 201 | for name, metric_history in config['metrics'].items()} 202 | return instance 203 | 204 | def _assert_exists(self, name): 205 | if name not in self.metrics: 206 | raise ValueError('Unknown metric: %s' % (name,)) 207 | 208 | 209 | _MAX_METRICS = { 210 | 'Accuracy', 'BinaryAccuracy', 211 | 'CategoricalAccuracy', 'SparseCategoricalAccuracy', 212 | 'TopKCategoricalAccuracy', 
'SparseTopKCategoricalAccuracy', 213 | 'TruePositives', 'TrueNegatives', 214 | 'Precision', 'Recall', 'AUC', 215 | 'SensitivityAtSpecificity', 'SpecificityAtSensitivity' 216 | } 217 | 218 | _MAX_METRIC_FNS = { 219 | 'accuracy', 'categorical_accuracy', 'binary_accuracy', 220 | 'sparse_categorical_accuracy' 221 | } 222 | 223 | 224 | def infer_metric_direction(metric): 225 | # Handle str input and get canonical object. 226 | if isinstance(metric, str): 227 | metric_name = metric 228 | if len(metric_name) > 4 and metric_name[:4] == 'val_': 229 | metric_name = metric_name[4:] 230 | if metric_name == 'loss': 231 | # Special-case the overall loss. 232 | return 'min' 233 | try: 234 | metric = keras.metrics.get(metric_name) 235 | except ValueError: 236 | # Default to minimization for unknown metric. 237 | return 'min' 238 | 239 | # Metric class or function. 240 | if isinstance(metric, keras.metrics.Metric): 241 | name = metric.__class__.__name__ 242 | if name == 'MeanMetricWrapper': 243 | name = metric._fn.__name__ 244 | else: 245 | name = metric.__name__ 246 | 247 | if name in _MAX_METRICS or name in _MAX_METRIC_FNS: 248 | return 'max' 249 | return 'min' 250 | -------------------------------------------------------------------------------- /docs/autogen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import shutil 4 | 5 | import keras_autodoc 6 | import tutobooks 7 | 8 | 9 | PAGES = { 10 | 'preprocessor.md': [ 11 | 'autorecsys.pipeline.preprocessor.BasePreprocessor', 12 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.format_dataset', 13 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.load_dataset', 14 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.transform_categorical', 15 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.transform_numerical', 16 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_hash_size', 17 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x', 18 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x_numerical', 19 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x_categorical', 20 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_y', 21 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_numerical_count', 22 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_categorical_count', 23 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.split_data', 24 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.preprocess', 25 | 'autorecsys.pipeline.preprocessor.AvazuPreprocessor', 26 | 'autorecsys.pipeline.preprocessor.AvazuPreprocessor.preprocess', 27 | 'autorecsys.pipeline.preprocessor.CriteoPreprocessor', 28 | 'autorecsys.pipeline.preprocessor.CriteoPreprocessor.preprocess', 29 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor', 30 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor.format_dataset', 31 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor.preprocess', 32 | 'autorecsys.pipeline.preprocessor.MovielensPreprocessor', 33 | 'autorecsys.pipeline.preprocessor.MovielensPreprocessor.preprocess', 34 | ], 35 | 'node.md': [ 36 | 'autorecsys.pipeline.node.Input', 37 | 'autorecsys.pipeline.node.Input.fit_transform', 38 | 'autorecsys.pipeline.node.Input.transform', 39 | 'autorecsys.pipeline.node.StructuredDataInput', 40 | 'autorecsys.pipeline.node.StructuredDataInput.get_state', 41 | 'autorecsys.pipeline.node.StructuredDataInput.set_state', 42 | 'autorecsys.pipeline.node.StructuredDataInput.update', 43 | 
'autorecsys.pipeline.node.StructuredDataInput.infer_column_types', 44 | ], 45 | 'mapper.md': [ 46 | 'autorecsys.pipeline.mapper.LatentFactorMapper', 47 | 'autorecsys.pipeline.mapper.LatentFactorMapper.get_state', 48 | 'autorecsys.pipeline.mapper.LatentFactorMapper.set_state', 49 | 'autorecsys.pipeline.mapper.LatentFactorMapper.build', 50 | 'autorecsys.pipeline.mapper.DenseFeatureMapper', 51 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.get_state', 52 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.set_state', 53 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.build', 54 | 'autorecsys.pipeline.mapper.SparseFeatureMapper', 55 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.get_state', 56 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.set_state', 57 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.build', 58 | ], 59 | 'interactor.md': [ 60 | 'autorecsys.pipeline.interactor.RandomSelectInteraction', 61 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.get_state', 62 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.set_state', 63 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.build', 64 | 'autorecsys.pipeline.interactor.ConcatenateInteraction', 65 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.get_state', 66 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.set_state', 67 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.build', 68 | 'autorecsys.pipeline.interactor.InnerProductInteraction', 69 | 'autorecsys.pipeline.interactor.InnerProductInteraction.get_state', 70 | 'autorecsys.pipeline.interactor.InnerProductInteraction.set_state', 71 | 'autorecsys.pipeline.interactor.InnerProductInteraction.build', 72 | 'autorecsys.pipeline.interactor.ElementwiseInteraction', 73 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.get_state', 74 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.set_state', 75 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.build', 76 | 'autorecsys.pipeline.interactor.MLPInteraction', 77 | 'autorecsys.pipeline.interactor.MLPInteraction.get_state', 78 | 'autorecsys.pipeline.interactor.MLPInteraction.set_state', 79 | 'autorecsys.pipeline.interactor.MLPInteraction.build', 80 | 'autorecsys.pipeline.interactor.HyperInteraction', 81 | 'autorecsys.pipeline.interactor.HyperInteraction.get_state', 82 | 'autorecsys.pipeline.interactor.HyperInteraction.set_state', 83 | 'autorecsys.pipeline.interactor.HyperInteraction.build', 84 | 'autorecsys.pipeline.interactor.FMInteraction', 85 | 'autorecsys.pipeline.interactor.FMInteraction.get_state', 86 | 'autorecsys.pipeline.interactor.FMInteraction.set_state', 87 | 'autorecsys.pipeline.interactor.FMInteraction.build', 88 | 'autorecsys.pipeline.interactor.CrossNetInteraction', 89 | 'autorecsys.pipeline.interactor.CrossNetInteraction.get_state', 90 | 'autorecsys.pipeline.interactor.CrossNetInteraction.set_state', 91 | 'autorecsys.pipeline.interactor.CrossNetInteraction.build', 92 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction', 93 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.get_state', 94 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.set_state', 95 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.build', 96 | ], 97 | 'optimizer.md': [ 98 | 'autorecsys.pipeline.optimizer.RatingPredictionOptimizer', 99 | 'autorecsys.pipeline.optimizer.RatingPredictionOptimizer.build', 100 | 'autorecsys.pipeline.optimizer.CTRPredictionOptimizer', 101 | 'autorecsys.pipeline.optimizer.CTRPredictionOptimizer.build', 102 | ], 103 | 
'recommender.md': [ 104 | 'autorecsys.recommender.RPRecommender', 105 | 'autorecsys.recommender.CTRRecommender', 106 | ], 107 | 'auto_search.md': [ 108 | 'autorecsys.auto_search.Search', 109 | 'autorecsys.auto_search.Search.search', 110 | 'autorecsys.auto_search.Search.predict', 111 | 'autorecsys.auto_search.Search.evaluate', 112 | ], 113 | 114 | } 115 | 116 | 117 | aliases_needed = [ 118 | 'tensorflow.keras.callbacks.Callback', 119 | 'tensorflow.keras.losses.Loss', 120 | 'tensorflow.keras.metrics.Metric', 121 | 'tensorflow.data.Dataset' 122 | ] 123 | 124 | 125 | ROOT = 'http://autorecsys.com/' 126 | 127 | project_dir = pathlib.Path(__file__).resolve().parents[1] 128 | 129 | def py_to_nb_md(dest_dir): 130 | for file_path in os.listdir('py/'): 131 | dir_path = 'py' 132 | file_name = file_path 133 | py_path = os.path.join(dir_path, file_path) 134 | file_name_no_ext = os.path.splitext(file_name)[0] 135 | ext = os.path.splitext(file_name)[1] 136 | 137 | if ext != '.py': 138 | continue 139 | 140 | nb_path = os.path.join('ipynb', file_name_no_ext + '.ipynb') 141 | md_path = os.path.join(dest_dir, 'tutorial', file_name_no_ext + '.md') 142 | 143 | tutobooks.py_to_md(py_path, nb_path, md_path, 'templates/img') 144 | 145 | github_repo_dir = 'keras-team/autokeras/blob/master/docs/' 146 | with open(md_path, 'r') as md_file: 147 | button_lines = [ 148 | ':material-link: ' 149 | "[**View in Colab**](https://colab.research.google.com/github/" 150 | + github_repo_dir 151 | + "ipynb/" 152 | + file_name_no_ext + ".ipynb" 153 | + ")    " 154 | # + '' 155 | + ':octicons-octoface: ' 156 | "[**GitHub source**](https://github.com/" + github_repo_dir + "py/" 157 | + file_name_no_ext + ".py)", 158 | "\n", 159 | ] 160 | md_content = ''.join(button_lines) + '\n' + md_file.read() 161 | 162 | with open(md_path, 'w') as md_file: 163 | md_file.write(md_content) 164 | 165 | 166 | def generate(dest_dir): 167 | template_dir = project_dir / 'docs' / 'templates' 168 | doc_generator = keras_autodoc.DocumentationGenerator( 169 | PAGES, 170 | 'https://github.com/datamllab/AutoRecSys', 171 | template_dir, 172 | project_dir / 'examples' 173 | ) 174 | doc_generator.generate(dest_dir) 175 | readme = (project_dir / 'README.md').read_text() 176 | index = (template_dir / 'index.md').read_text() 177 | index = index.replace('{{autogenerated}}', readme[readme.find('##'):]) 178 | (dest_dir / 'index.md').write_text(index, encoding='utf-8') 179 | # shutil.copyfile(project_dir / '.github' / 'CONTRIBUTING.md', 180 | # dest_dir / 'contributing.md') 181 | 182 | # py_to_nb_md(dest_dir) 183 | 184 | 185 | if __name__ == '__main__': 186 | generate(project_dir / 'docs' / 'sources') 187 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](https://mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs help` - Print this help message. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 
18 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AutoRec 2 | theme: 3 | name: 'material' 4 | palette: 5 | primary: 'green' 6 | accent: 'green' 7 | 8 | docs_dir: sources 9 | repo_url: https://github.com/datamllab/AutoRecSys 10 | site_url: http://autokeras.org 11 | edit_uri: "" 12 | site_description: 'Documentation for AutoRec.' 13 | # google_analytics: ['UA-44322747-3', 'autokeras.com'] 14 | markdown_extensions: 15 | - codehilite 16 | - pymdownx.superfences: 17 | custom_fences: 18 | - name: mermaid 19 | class: mermaid 20 | format: !!python/name:pymdownx.superfences.fence_div_format 21 | - pymdownx.emoji: 22 | emoji_index: !!python/name:materialx.emoji.twemoji 23 | emoji_generator: !!python/name:materialx.emoji.to_svg 24 | - admonition 25 | 26 | extra_javascript: 27 | - https://unpkg.com/mermaid@8.4.4/dist/mermaid.min.js 28 | 29 | nav: 30 | - Home: index.md 31 | - Installation: install.md 32 | - Documentation: 33 | - Preprocessor: preprocessor.md 34 | - Mapper: mapper.md 35 | - Node: node.md 36 | - Interactor: interactor.md 37 | - Optimizer: optimizer.md 38 | - Recommender: recommender.md 39 | - Auto Search: auto_search.md 40 | - About: about.md 41 | -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/docs/readme.md -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | keras-autodoc==0.5.1 2 | mkdocs 3 | mkdocs-material 4 | pygments 5 | jupyter 6 | pymdown-extensions 7 | Sphinx<3.1.0 8 | -------------------------------------------------------------------------------- /docs/templates/about.md: -------------------------------------------------------------------------------- 1 | This package is developed by [DATA LAB](http://faculty.cs.tamu.edu/xiahu/) at Texas A&M University. 2 | 3 | ## Core Team 4 | 5 | [**Ting-Hsiang Wang**](https://github.com/thwang1231): 6 | 7 | [**Qingquan Song**](https://github.com/song3134): 8 | 9 | [**Xiaotian Han**](https://github.com/ahxt): 10 | 11 | [**Zirui Liu**](https://github.com/warai-otoko): 12 | 13 | [**Haifeng Jin**](https://github.com/haifeng-jin): 14 | 15 | [**Xia "Ben" Hu**](http://faculty.cs.tamu.edu/xiahu/): 16 | Project lead and maintainer. 17 | 18 | -------------------------------------------------------------------------------- /docs/templates/benchmark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/docs/templates/benchmark.md -------------------------------------------------------------------------------- /docs/templates/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to AutoRec 2 | 3 | ## Abstract 4 | 5 | Realistic recommender systems are often required to adapt to ever-changing 6 | data and tasks or to explore different models systematically. 
7 | To address the need, we present **AutoRec**, an open-source automated machine 8 | learning (AutoML) platform extended from the TensorFlow ecosystem and, to our 9 | knowledge, the first framework to leverage AutoML for model search and 10 | hyperparameter tuning in deep recommendation models. 11 | 12 | AutoRec also supports a highly flexible pipeline that accommodates both sparse 13 | and dense inputs, rating prediction and click-through rate (CTR) prediction 14 | tasks, and an array of recommendation models. 15 | Lastly, AutoRec provides a simple, user-friendly API. 16 | 17 | Experiments conducted on the benchmark datasets reveal that AutoRec is reliable and 18 | can identify models that resemble the best model without prior knowledge. 19 | 20 | -------------------------------------------------------------------------------- /docs/templates/install.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | 3 | **Python 3**: Follow the TensorFlow install steps to install Python 3. 4 | 5 | **Pip**: Follow the TensorFlow install steps to install Pip. 6 | 7 | **TensorFlow >= 2.2.0**: AutoRec is based on TensorFlow. 8 | Please follow 9 | [this tutorial](https://www.tensorflow.org/install/pip) to install TensorFlow for Python 3. 10 | 11 | **GPU Setup (Optional)**: 12 | If you have GPUs on your machine and want to use them to accelerate the training, 13 | you can follow [this tutorial](https://www.tensorflow.org/install/gpu) to set them up. 14 | 15 | ## Install AutoRec 16 | 17 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | 3 | Benchmarks for popular recommendation methods. 4 | 5 | ## Rating Prediction Task 6 | 7 | We adopt two datasets to evaluate our AutoRec. 8 | 9 | - **Movielens**: GroupLens Research has collected and made available rating data sets from the MovieLens web site (http://movielens.org). The data sets were collected over various periods of time, depending on the size of the set. In our experiments, we use several versions of this dataset. 10 | - **Netflix**: Netflix held the Netflix Prize open competition for the best algorithm to predict user ratings for films. The grand prize was $1,000,000 and was won by BellKor's Pragmatic Chaos team. This is the dataset that was used in that competition. 11 | 12 | The statistics of the datasets are as follows: 13 | 14 | |Dataset|#user|#item|#interaction| 15 | |---|---:|---:|---:| 16 | |[Movielens1M](#movielens1m)|6,040|3,900|1,000,209| 17 | |[Movielens10M](#movielens10m)|71,567|10,681|10,000,054| 18 | |[Movielens_latest](#movielens_latest)|283,228|58,098|27,753,444| 19 | |[Netflix](#netflix)|480,189|17,770|100,480,507| 20 | 21 | 22 | Some popular models for rating prediction: 23 | 24 | - **MF**: Matrix factorization is a class of collaborative filtering algorithms used in recommender systems. Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower-dimensionality rectangular matrices. 25 | - **MLP**: The Multi-Layer Perceptron model from Neural Collaborative Filtering (NCF), which learns the user-item interaction function with stacked dense layers; for implicit feedback and ranking, it is optimized with log loss and negative sampling.
26 | - **GMF**: The Generalized Matrix Factorization model from NCF, which generalizes matrix factorization with a learnable element-wise interaction; it is likewise optimized with log loss and negative sampling. 27 | - **NeuMF**: The Neural Matrix Factorization model from NCF, which fuses GMF and MLP to combine their strengths; it is likewise optimized with log loss and negative sampling. 28 | - **HP search**: Our AutoRec with hyperparameter search. 29 | - **Block search**: Our AutoRec with both block search and hyperparameter search. 30 | 31 | All benchmarks were run with our AutoRec package. 32 | The benchmark experiments were run on a machine with dual 33 | Intel Xeon E5-2630 v3 processors (8 cores each; with hyperthreading, 32 34 | threads) and one RTX 2080 Ti, running Ubuntu 16.04 with TensorFlow 2.1.0 and CUDA 10.0. 35 | 36 | We benchmark all models with a minibatch size of 256; 37 | this allows fair comparisons between different models. 38 | 39 | The following models are benchmarked: 40 | 41 | ## Movielens1M 42 | |Movielens1M|Val_MSE|Test_MSE|Time(s)| 43 | |---|---:|---:|---:| 44 | |MF_random|0.7553643584251404|0.7550543546676636|103.57773876190186| 45 | |MF_greedy|0.7503780722618103|0.7502530217170715|85.47167634963989| 46 | |MF_bayesian|0.7521297335624695|0.7517699599266052|1031.2954790592194| 47 | |MLP_random|0.7676995396614075|0.7681054472923279|1383.5030148029327| 48 | |MLP_greedy|0.769902765750885|0.7706407308578491|1292.7048692703247| 49 | |MLP_bayesian|0.758850634098053|0.7597852945327759|1353.2627713680267| 50 | |NeuMF_random|0.7707042694091797|0.7720282077789307|1025.5578093528748| 51 | |NeuMF_greedy|0.7517987489700317|0.7520723342895508|1276.7933542728424| 52 | |NeuMF_bayesian|0.7721487879753113|0.7723560333251953|1098.1503052711487| 53 | |AutoRec_random|0.7500635981559753|0.749731719493866|1577.6531774997711| 54 | |AutoRec_greedy|0.7496007084846497|0.7510735392570496|1689.560632944107| 55 | |AutoRec_bayesian|0.7484513521194458|0.7494882345199585|5405.682264328003| 56 | 57 | 58 | ## Movielens10M 59 | |Movielens10M|Val_MSE|Test_MSE|Time(s)| 60 | |---|---:|---:|---:| 61 | |MF_random|0.6472423672676086|0.6456527709960938|795.4746537208557| 62 | |MF_greedy|0.6473642587661743|0.6467021107673645|838.2489671707153| 63 | |MF_bayesian|0.6490539312362671|0.6481097936630249|7755.805980920792| 64 | |MLP_random|||| 65 | |MLP_greedy|0.6532657742500305|0.652294397354126|10709.204501867294| 66 | |MLP_bayesian|||| 67 | |NeuMF_random|0.6536459922790527|0.6527888774871826|16713.71854186058| 68 | |NeuMF_greedy|0.6541951298713684|0.6537747979164124|11205.822769880295| 69 | |NeuMF_bayesian|0.650793194770813|0.6504989862442017|15727.56122994423| 70 | |AutoRec_random|||| 71 | |AutoRec_greedy|||| 72 | |AutoRec_bayesian|||| 73 | 74 | 75 | 76 | 77 | ## Movielens_latest 78 | |Movielens_latest|Val_MSE|Test_MSE|Time(s)| 79 | |---|---:|---:|---:| 80 | |MF_random|0.6520289182662964|0.6528090238571167|68519.18232417107| 81 | |MF_greedy|||| 82 | |MF_bayesian|||| 83 | |MLP_random|||| 84 | |MLP_greedy|||| 85 | |MLP_bayesian|||| 86 | |NeuMF_random|||| 87 | |NeuMF_greedy|0.6434351801872253|0.6440964937210083|56383.871745824814| 88 | |NeuMF_bayesian|||| 89 | |AutoRec_random|0.6365838050842285|0.6371557712554932|133145.96114301682| 90 | |AutoRec_greedy|||| 91 |
|AutoRec_bayesian|0.6448036432266235|0.6453331708908081|133532.19134521484| 92 | 93 | 94 | ## Netflix 95 | |Netflix|Val_MSE|Test_MSE|Time(s)| 96 | |---|---:|---:|---:| 97 | |MF_random|0.7473645806312561|0.74784255027771|8169.921831846237| 98 | |MF_greedy|0.7397633790969849|0.7402286529541016|8646.685072422028| 99 | |MF_bayesian|0.7282611727714539|0.7287141680717468|82759.47434949875| 100 | |MLP_random|0.7549719214439392|0.7553735971450806|59066.82922792435| 101 | |MLP_greedy|0.7648082375526428|0.7652896046638489|56700.02964758873| 102 | |MLP_bayesian|0.7546935081481934|0.755224347114563|46708.42347598076| 103 | |NeuMF_random|0.7073774337768555|0.7063089609146118|50333.9074454409| 104 | |NeuMF_greedy|0.6434351801872253|0.6440964937210083|56383.871745824814| 105 | |NeuMF_bayesian|0.7060461044311523|0.706568717956543|73228.66933822632| 106 | |AutoRec_random|0.6365838050842285|0.6371557712554932|133145.96114301682| 107 | |AutoRec_greedy|0.739780068397522|0.7401751279830933|105307.948792696| 108 | |AutoRec_bayesian|0.6448036432266235|0.6453331708908081|133532.19134521484| 109 | 110 | 111 | MSE and MAE are the mean squared error and mean absolute error. 112 | 113 | Time is the total training time for the baseline models, and the total search-plus-training time for the automated models. 114 | 115 | 116 | 117 | ## Click-Through Rate Task 118 | 119 | We adopt two datasets to evaluate our AutoRec. 120 | 121 | - **Criteo**: Display advertising is a billion-dollar effort and one of the central uses of machine learning on the Internet; however, its data and methods are usually kept under lock and key. For this research competition, CriteoLabs shared a week's worth of data for developing models that predict ad click-through rate (CTR): given a user and the page being visited, what is the probability of a click on a given ad? 122 | - **Avazu**: For this competition, 11 days' worth of Avazu data were provided to build and test click-through prediction models against standard classification algorithms; the winning models from the competition were released under an open-source license. 123 | 124 | The statistics of the datasets are as follows: 125 | 126 | |Dataset|#user|#item|#interaction| 127 | |---|---:|---:|---:| 128 | |[Movielens](#movielens)|10000|10000|10000| 129 | 130 | 131 | |Dataset|#dense field|#sparse field|#instance| 132 | |---|---:|---:|---:| 133 | |[Criteo](#criteo)|10000|10000|10000| 134 | |[Avazu](#avazu)|10000|10000|10000| 135 | 136 | 137 | Some popular models for CTR prediction: 138 | 139 | - **NeuMF**: The Neural Matrix Factorization model from NCF, which fuses GMF and MLP; it is optimized with log loss and negative sampling. 140 | - **HP search**: Our AutoRec with hyperparameter search. 141 | - **Block search**: Our AutoRec with both block search and hyperparameter search. 142 | 143 | All benchmarks were run with our AutoRec package. 144 | The benchmark experiments were run on a machine with dual 145 | Intel Xeon E5-2630 v3 processors (8 cores each; with hyperthreading, 32 146 | threads) and one RTX 2080 Ti, running Ubuntu 16.04 with TensorFlow 2.1.0 and CUDA 10.0. 147 | 148 | We benchmark all models with a minibatch size of 256; this allows fair comparisons between different models.
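As a concrete reference point, the following sketch shows how a single CTR benchmark entry is assembled and searched with the AutoRec API. It is a condensed, illustrative version of the `examples/ctr_deepfm.py`-style scripts; the embedding size, batch size, epoch count, and trial budget below are placeholders rather than the exact settings behind the numbers reported here.

```python
import tensorflow as tf
from autorecsys.auto_search import Search
from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, \
    FMInteraction, MLPInteraction, CTRPredictionOptimizer
from autorecsys.pipeline.preprocessor import CriteoPreprocessor
from autorecsys.recommender import CTRRecommender

# Preprocess the Criteo sample and split it into train/val/test.
criteo = CriteoPreprocessor()
train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()

# Map dense (numerical) and sparse (categorical) fields to embeddings.
dense_input = Input(shape=[criteo.get_numerical_count()])
sparse_input = Input(shape=[criteo.get_categorical_count()])
dense_emb = DenseFeatureMapper(num_of_fields=criteo.get_numerical_count(),
                               embedding_dim=16)(dense_input)
sparse_emb = SparseFeatureMapper(num_of_fields=criteo.get_categorical_count(),
                                 hash_size=criteo.get_hash_size(),
                                 embedding_dim=16)(sparse_input)

# DeepFM-style interaction: an FM block over the sparse embeddings plus an
# MLP block over the dense embeddings, merged by a top MLP.
fm_out = FMInteraction()([sparse_emb])
mlp_out = MLPInteraction()([dense_emb])
output = CTRPredictionOptimizer()(MLPInteraction()([fm_out, mlp_out]))
model = CTRRecommender(inputs=[dense_input, sparse_input], outputs=output)

# Search the model space with a small trial budget, then evaluate on test.
searcher = Search(model=model, tuner='random',
                  tuner_params={'max_trials': 2, 'overwrite': True})
searcher.search(x=[criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)],
                y=train_y,
                x_val=[criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)],
                y_val=val_y,
                objective='val_BinaryCrossentropy',
                batch_size=256,
                epochs=2,
                callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
print(searcher.evaluate(x=[criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)],
                        y_true=test_y))
```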
149 | The following models are benchmarked: 150 | 151 | ## Movielens 152 | |Movielens|logloss|AUC|Time(s)| 153 | |---|---:|---:|---:| 154 | |MF|0.0000|0.0000|0.0000| 155 | |GMF|0.0000|0.0000|0.0000| 156 | |MLP|0.0000|0.0000|0.0000| 157 | |NeuMF|0.0000|0.0000|0.0000| 158 | |AutoRec_random|0.0000|0.0000|0.0000| 159 | |AutoRec_bayesian|0.0000|0.0000|0.0000| 160 | |AutoRec_hyperband|0.0000|0.0000|0.0000| 161 | 162 | ## Criteo 163 | |Criteo|logloss|AUC|Time(s)| 164 | |---|---:|---:|---:| 165 | |FM|0.0000|0.0000|0.0000| 166 | |AutoRec_random|0.0000|0.0000|0.0000| 167 | |AutoRec_bayesian|0.0000|0.0000|0.0000| 168 | |AutoRec_hyperband|0.0000|0.0000|0.0000| 169 | 170 | 171 | ## Avazu 172 | |Avazu|logloss|AUC|Time(s)| 173 | |---|---:|---:|---:| 174 | |FM|0.0000|0.0000|0.0000| 175 | |AutoRec_random|0.0000|0.0000|0.0000| 176 | |AutoRec_bayesian|0.0000|0.0000|0.0000| 177 | |AutoRec_hyperband|0.0000|0.0000|0.0000| 178 | 179 | Logloss and AUC are the binary cross-entropy loss and the area under the ROC curve, respectively. 180 | 181 | Time is the total training time for the baseline models, and the total search-plus-training time for the automated models. -------------------------------------------------------------------------------- /examples/ctr_autoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7" 6 | import logging 7 | import tensorflow as tf 8 | from autorecsys.auto_search import Search 9 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, SelfAttentionInteraction,\ 10 | MLPInteraction, CTRPredictionOptimizer 11 | from autorecsys.recommender import CTRRecommender 12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 13 | 14 | 15 | # logging setting 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | # Step 1: Preprocess data 21 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset 22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 26 | numerical_count = criteo.get_numerical_count() 27 | categorical_count = criteo.get_categorical_count() 28 | hash_size = criteo.get_hash_size() 29 | 30 | # Step 2: Build the recommender, which provides search space 31 | # Step 2.1: Setup mappers to handle inputs 32 | dense_input_node = Input(shape=[numerical_count]) 33 | sparse_input_node = Input(shape=[categorical_count]) 34 | dense_feat_emb = DenseFeatureMapper( 35 | num_of_fields=numerical_count, 36 | embedding_dim=2)(dense_input_node) 37 | sparse_feat_emb = SparseFeatureMapper( 38 | num_of_fields=categorical_count, 39 | hash_size=hash_size, 40 | embedding_dim=2)(sparse_input_node) 41 | 42 | # Step 2.2: Setup interactors to handle models 43 | attention_output = SelfAttentionInteraction()([dense_feat_emb, sparse_feat_emb]) 44 |
bottom_mlp_output = MLPInteraction()([dense_feat_emb]) 45 | top_mlp_output = MLPInteraction()([attention_output, bottom_mlp_output]) 46 | 47 | # Step 2.3: Setup optimizer to handle the target task 48 | output = CTRPredictionOptimizer()(top_mlp_output) 49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output) 50 | 51 | # Step 3: Build the searcher, which provides search algorithm 52 | searcher = Search(model=model, 53 | tuner='random', 54 | tuner_params={'max_trials': 2, 'overwrite': True}, 55 | ) 56 | 57 | # Step 4: Use the searcher to search the recommender 58 | searcher.search(x=[train_X_numerical, train_X_categorical], 59 | y=train_y, 60 | x_val=[val_X_numerical, val_X_categorical], 61 | y_val=val_y, 62 | objective='val_BinaryCrossentropy', 63 | batch_size=10000, 64 | epochs=2, 65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 66 | ) 67 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical], 68 | y_true=val_y))) 69 | 70 | # Step 5: Evaluate the searched model 71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical], 72 | y_true=test_y))) 73 | -------------------------------------------------------------------------------- /examples/ctr_autorec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, HyperInteraction, CTRPredictionOptimizer 11 | from autorecsys.recommender import CTRRecommender 12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 13 | 14 | 15 | # logging setting 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | # Step 1: Preprocess data 21 | criteo = CriteoPreprocessor() # the default arguments are setup to preprocess the Criteo example dataset 22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 26 | numerical_count = criteo.get_numerical_count() 27 | categorical_count = criteo.get_categorical_count() 28 | hash_size = criteo.get_hash_size() 29 | 30 | # Step 2: Build the recommender, which provides search space 31 | # Step 2.1: Setup mappers to handle inputs 32 | dense_input_node = Input(shape=[numerical_count]) 33 | sparse_input_node = Input(shape=[categorical_count]) 34 | dense_feat_emb = DenseFeatureMapper( 35 | num_of_fields=numerical_count, 36 | embedding_dim=2)(dense_input_node) 37 | sparse_feat_emb = SparseFeatureMapper( 38 | num_of_fields=categorical_count, 39 | hash_size=hash_size, 40 | embedding_dim=2)(sparse_input_node) 41 | 42 | # Step 2.2: Setup interactors to handle models 43 | sparse_feat_bottom_output = 
HyperInteraction(meta_interactor_num=2)([sparse_feat_emb]) 44 | dense_feat_bottom_output = HyperInteraction(meta_interactor_num=2)([dense_feat_emb]) 45 | hyper_output = HyperInteraction(meta_interactor_num=2)([sparse_feat_bottom_output, dense_feat_bottom_output]) 46 | 47 | # Step 2.3: Setup optimizer to handle the target task 48 | output = CTRPredictionOptimizer()(hyper_output) 49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output) 50 | 51 | # Step 3: Build the searcher, which provides search algorithm 52 | searcher = Search(model=model, 53 | tuner='random', 54 | tuner_params={'max_trials': 2, 'overwrite': True}, 55 | ) 56 | 57 | # Step 4: Use the searcher to search the recommender 58 | searcher.search(x=[train_X_numerical, train_X_categorical], 59 | y=train_y, 60 | x_val=[val_X_numerical, val_X_categorical], 61 | y_val=val_y, 62 | objective='val_BinaryCrossentropy', 63 | batch_size=10000, 64 | epochs=2, 65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 66 | ) 67 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical], 68 | y_true=val_y))) 69 | 70 | # Step 5: Evaluate the searched model 71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical], 72 | y_true=test_y))) 73 | -------------------------------------------------------------------------------- /examples/ctr_benchmark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import argparse 5 | import time 6 | import os 7 | 8 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 9 | 10 | import logging 11 | import tensorflow as tf 12 | from autorecsys.auto_search import Search 13 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, FMInteraction, MLPInteraction, \ 14 | CrossNetInteraction, SelfAttentionInteraction, HyperInteraction, InnerProductInteraction, CTRPredictionOptimizer 15 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor, AvazuPreprocessor 16 | from autorecsys.recommender import CTRRecommender 17 | 18 | # logging setting 19 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 20 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def build_dlrm(emb_dict): 25 | if 'user' in emb_dict or 'item' in emb_dict: 26 | emb_list = [emb for _, emb in emb_dict.items()] 27 | output = MLPInteraction(num_layers=2)(emb_list) 28 | else: 29 | sparse_feat_mlp_output = [MLPInteraction()( [emb_dict['sparse']] )] if 'sparse' in emb_dict else [] 30 | dense_feat_mlp_output = [MLPInteraction()( [emb_dict['dense']] )] if 'dense' in emb_dict else [] 31 | output = MLPInteraction(num_layers=2)(sparse_feat_mlp_output + dense_feat_mlp_output) 32 | return output 33 | 34 | 35 | def build_deepfm(emb_dict): 36 | if 'user' in emb_dict or 'item' in emb_dict: 37 | emb_list = [emb for _, emb in emb_dict.items()] 38 | fm_output = [FMInteraction()(emb_list)] 39 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)] 40 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 41 | else: 42 | fm_output = [FMInteraction()( [emb_dict['sparse']] )] if 'sparse' in emb_dict else [] 43 | bottom_mlp_output = [MLPInteraction()( 
[emb_dict['dense']] )] if 'dense' in emb_dict else [] 44 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 45 | return output 46 | 47 | 48 | def build_crossnet(emb_dict): 49 | if 'user' in emb_dict or 'item' in emb_dict: 50 | emb_list = [emb for _, emb in emb_dict.items()] 51 | fm_output = [CrossNetInteraction()(emb_list)] 52 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)] 53 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 54 | else: 55 | fm_output = [CrossNetInteraction()( [emb_dict['sparse']] )] if 'sparse' in emb_dict else [] 56 | bottom_mlp_output = [MLPInteraction()( [emb_dict['dense']] )] if 'dense' in emb_dict else [] 57 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 58 | return output 59 | 60 | 61 | def build_autoint(emb_dict): 62 | if 'user' in emb_dict or 'item' in emb_dict: 63 | emb_list = [emb for _, emb in emb_dict.items()] 64 | fm_output = [SelfAttentionInteraction()(emb_list)] 65 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)] 66 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 67 | else: 68 | fm_output = [SelfAttentionInteraction()( [emb_dict['sparse']] )] if 'sparse' in emb_dict else [] 69 | bottom_mlp_output = [MLPInteraction()( [emb_dict['dense']] )] if 'dense' in emb_dict else [] 70 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output) 71 | return output 72 | 73 | 74 | def build_neumf(emb_dict): 75 | emb_list = [emb for _, emb in emb_dict.items()] 76 | innerproduct_output = [InnerProductInteraction()(emb_list)] 77 | mlp_output = [MLPInteraction(num_layers=2)(emb_list)] 78 | output = innerproduct_output + mlp_output 79 | return output 80 | 81 | 82 | def build_autorec(emb_dict): 83 | if 'user' in emb_dict or 'item' in emb_dict: 84 | emb_list = [emb for _, emb in emb_dict.items()] 85 | output = HyperInteraction()(emb_list) 86 | else: 87 | sparse_feat_bottom_output = [HyperInteraction(meta_interactor_num=2)([emb_dict['sparse']])] if 'sparse' in emb_dict else [] 88 | dense_feat_bottom_output = [HyperInteraction(meta_interactor_num=2)([emb_dict['dense']])] if 'dense' in emb_dict else [] 89 | top_mlp_output = HyperInteraction(meta_interactor_num=2)(sparse_feat_bottom_output + dense_feat_bottom_output) 90 | output = HyperInteraction(meta_interactor_num=2)([top_mlp_output]) 91 | return output 92 | 93 | 94 | if __name__ == '__main__': 95 | # parse args 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument('-model', type=str, help='input a model name', default='dlrm') 98 | parser.add_argument('-data', type=str, help='dataset name', default="avazu") 99 | parser.add_argument('-data_path', type=str, help='dataset path', default='./example_datasets/avazu/train-10k') 100 | parser.add_argument('-sep', type=str, help='dataset sep') 101 | parser.add_argument('-search', type=str, help='input a search method name', default='random') 102 | parser.add_argument('-batch_size', type=int, help='batch size', default=256) 103 | parser.add_argument('-trials', type=int, help='try number', default=2) 104 | parser.add_argument('-gpu_index', type=int, help='the index of gpu to use', default=0) 105 | args = parser.parse_args() 106 | print("args:", args) 107 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index) 108 | 109 | if args.sep is None: 110 | args.sep = '::' 111 | 112 | if args.data == "avazu": 113 | # Step 1: Preprocess data 114 | avazu = AvazuPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1) 115 | train_X, train_y,
val_X, val_y, test_X, test_y = avazu.preprocess() 116 | train_X_categorical = avazu.get_x_categorical(train_X) 117 | val_X_categorical = avazu.get_x_categorical(val_X) 118 | test_X_categorical = avazu.get_x_categorical(test_X) 119 | categorical_count = avazu.get_categorical_count() 120 | hash_size = avazu.get_hash_size() 121 | 122 | # Step 2: Build the recommender, which provides search space 123 | # Step 2.1: Setup mappers to handle inputs 124 | # dense_input_node = None 125 | sparse_input_node = Input(shape=[categorical_count]) 126 | input = [sparse_input_node] 127 | 128 | # dense_feat_emb = None 129 | sparse_feat_emb = SparseFeatureMapper( 130 | num_of_fields=categorical_count, 131 | hash_size=hash_size, 132 | embedding_dim=64)(sparse_input_node) 133 | 134 | emb_dict = {'sparse': sparse_feat_emb} 135 | 136 | if args.data == "criteo": 137 | # Step 1: Preprocess data 138 | criteo = CriteoPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1) 139 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 140 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 141 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 142 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 143 | numerical_count = criteo.get_numerical_count() 144 | categorical_count = criteo.get_categorical_count() 145 | hash_size = criteo.get_hash_size() 146 | 147 | # Step 2: Build the recommender, which provides search space 148 | # Step 2.1: Setup mappers to handle inputs 149 | dense_input_node = Input(shape=[numerical_count]) 150 | sparse_input_node = Input(shape=[categorical_count]) 151 | input = [dense_input_node, sparse_input_node] 152 | 153 | dense_feat_emb = DenseFeatureMapper( 154 | num_of_fields=numerical_count, 155 | embedding_dim=64)(dense_input_node) 156 | 157 | sparse_feat_emb = SparseFeatureMapper( 158 | num_of_fields=categorical_count, 159 | hash_size=hash_size, 160 | embedding_dim=64)(sparse_input_node) 161 | 162 | emb_dict = {'dense': dense_feat_emb, 'sparse': sparse_feat_emb} 163 | 164 | # Step 2.2: Setup interactors to handle models 165 | if args.model == 'dlrm': 166 | output = build_dlrm(emb_dict) 167 | if args.model == 'deepfm': 168 | output = build_deepfm(emb_dict) 169 | if args.model == 'crossnet': 170 | output = build_crossnet(emb_dict) 171 | if args.model == 'autoint': 172 | output = build_autoint(emb_dict) 173 | if args.model == 'autorec': 174 | output = build_autorec(emb_dict) 175 | 176 | # Step 2.3: Setup optimizer to handle the target task 177 | output = CTRPredictionOptimizer()(output) 178 | model = CTRRecommender(inputs=input, outputs=output) 179 | 180 | # Step 3: Build the searcher, which provides search algorithm 181 | searcher = Search(model=model, 182 | tuner=args.search, 183 | tuner_params={'max_trials': args.trials, 'overwrite': True} 184 | ) 185 | 186 | # Step 4: Use the searcher to search the recommender 187 | start_time = time.time() 188 | searcher.search(x=train_X, 189 | y=train_y, 190 | x_val=val_X, 191 | y_val=val_y, 192 | objective='val_BinaryCrossentropy', 193 | batch_size=args.batch_size, 194 | epochs=1, 195 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 196 | ) 197 | end_time = time.time() 198 | print("running time:", end_time - start_time) 199 | print("args", args) 200 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=val_X,
201 | y_true=val_y))) 202 | 203 | # Step 5: Evaluate the searched model 204 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=test_X, 205 | y_true=test_y))) 206 | -------------------------------------------------------------------------------- /examples/ctr_crossnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, CrossNetInteraction, MLPInteraction,\ 11 | CTRPredictionOptimizer 12 | from autorecsys.recommender import CTRRecommender 13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 14 | 15 | 16 | # logging setting 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | logger = logging.getLogger(__name__) 20 | 21 | # Step 1: Preprocess data 22 | criteo = CriteoPreprocessor() # the default arguments are setup to preprocess the Criteo example dataset 23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 24 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 25 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 26 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 27 | numerical_count = criteo.get_numerical_count() 28 | categorical_count = criteo.get_categorical_count() 29 | hash_size = criteo.get_hash_size() 30 | 31 | # Step 2: Build the recommender, which provides search space 32 | # Step 2.1: Setup mappers to handle inputs 33 | dense_input_node = Input(shape=[numerical_count]) 34 | sparse_input_node = Input(shape=[categorical_count]) 35 | dense_feat_emb = DenseFeatureMapper( 36 | num_of_fields=numerical_count, 37 | embedding_dim=2)(dense_input_node) 38 | sparse_feat_emb = SparseFeatureMapper( 39 | num_of_fields=categorical_count, 40 | hash_size=hash_size, 41 | embedding_dim=2)(sparse_input_node) 42 | 43 | # Step 2.2: Setup interactors to handle models 44 | crossnet_output = CrossNetInteraction()([dense_feat_emb, sparse_feat_emb]) 45 | bottom_mlp_output = MLPInteraction()([dense_feat_emb]) 46 | top_mlp_output = MLPInteraction()([crossnet_output, bottom_mlp_output]) 47 | 48 | # Step 2.3: Setup optimizer to handle the target task 49 | output = CTRPredictionOptimizer()(top_mlp_output) 50 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output) 51 | 52 | # Step 3: Build the searcher, which provides search algorithm 53 | searcher = Search(model=model, 54 | tuner='random', 55 | tuner_params={'max_trials': 2, 'overwrite': True}, 56 | ) 57 | 58 | # Step 4: Use the searcher to search the recommender 59 | searcher.search(x=[train_X_numerical, train_X_categorical], 60 | y=train_y, 61 | x_val=[val_X_numerical, val_X_categorical], 62 | y_val=val_y, 63 | objective='val_BinaryCrossentropy', 64 | batch_size=10000, 65 | epochs=2, 66 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 67 | ) 68 | logger.info('Validation Accuracy (logloss): 
{}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical], 69 | y_true=val_y))) 70 | 71 | # Step 5: Evaluate the searched model 72 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical], 73 | y_true=test_y))) 74 | -------------------------------------------------------------------------------- /examples/ctr_deepfm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, FMInteraction, MLPInteraction,\ 11 | CTRPredictionOptimizer 12 | from autorecsys.recommender import CTRRecommender 13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 14 | 15 | 16 | # logging setting 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | logger = logging.getLogger(__name__) 20 | 21 | # Step 1: Preprocess data 22 | criteo = CriteoPreprocessor() # the default arguments are setup to preprocess the Criteo example dataset 23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 24 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 25 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 26 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 27 | numerical_count = criteo.get_numerical_count() 28 | categorical_count = criteo.get_categorical_count() 29 | hash_size = criteo.get_hash_size() 30 | 31 | # Step 2: Build the recommender, which provides search space 32 | # Step 2.1: Setup mappers to handle inputs 33 | dense_input_node = Input(shape=[numerical_count]) 34 | sparse_input_node = Input(shape=[categorical_count]) 35 | dense_feat_emb = DenseFeatureMapper( 36 | num_of_fields=numerical_count, 37 | embedding_dim=2)(dense_input_node) 38 | sparse_feat_emb = SparseFeatureMapper( 39 | num_of_fields=categorical_count, 40 | hash_size=hash_size, 41 | embedding_dim=2)(sparse_input_node) 42 | 43 | # Step 2.2: Setup interactors to handle models 44 | fm_output = FMInteraction()([sparse_feat_emb]) 45 | bottom_mlp_output = MLPInteraction()([dense_feat_emb]) 46 | top_mlp_output = MLPInteraction()([fm_output, bottom_mlp_output]) 47 | 48 | # Step 2.3: Setup optimizer to handle the target task 49 | output = CTRPredictionOptimizer()(top_mlp_output) 50 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output) 51 | 52 | # Step 3: Build the searcher, which provides search algorithm 53 | searcher = Search(model=model, 54 | tuner='random', 55 | tuner_params={'max_trials': 2, 'overwrite': True}, 56 | ) 57 | 58 | # Step 4: Use the searcher to search the recommender 59 | searcher.search(x=[train_X_numerical, train_X_categorical], 60 | y=train_y, 61 | x_val=[val_X_numerical, val_X_categorical], 62 | y_val=val_y, 63 | objective='val_BinaryCrossentropy', 64 | batch_size=10000, 65 | epochs=2, 66 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 67 | ) 68 | logger.info('Validation Accuracy 
(logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical], 69 | y_true=val_y))) 70 | 71 | # Step 5: Evaluate the searched model 72 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical], 73 | y_true=test_y))) 74 | -------------------------------------------------------------------------------- /examples/ctr_dlrm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, MLPInteraction, CTRPredictionOptimizer 11 | from autorecsys.recommender import CTRRecommender 12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 13 | 14 | 15 | # logging setting 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | # Step 1: Preprocess data 21 | criteo = CriteoPreprocessor() # the default arguments are setup to preprocess the Criteo example dataset 22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X) 24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X) 25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X) 26 | numerical_count = criteo.get_numerical_count() 27 | categorical_count = criteo.get_categorical_count() 28 | hash_size = criteo.get_hash_size() 29 | 30 | # Step 2: Build the recommender, which provides search space 31 | # Step 2.1: Setup mappers to handle inputs 32 | dense_input_node = Input(shape=[numerical_count]) 33 | sparse_input_node = Input(shape=[categorical_count]) 34 | dense_feat_emb = DenseFeatureMapper( 35 | num_of_fields=numerical_count, 36 | embedding_dim=2)(dense_input_node) 37 | sparse_feat_emb = SparseFeatureMapper( 38 | num_of_fields=categorical_count, 39 | hash_size=hash_size, 40 | embedding_dim=2)(sparse_input_node) 41 | 42 | # Step 2.2: Setup interactors to handle models 43 | sparse_feat_mlp_output = MLPInteraction()([sparse_feat_emb]) 44 | dense_feat_mlp_output = MLPInteraction()([dense_feat_emb]) 45 | top_mlp_output = MLPInteraction(num_layers=2)([sparse_feat_mlp_output, dense_feat_mlp_output]) 46 | 47 | # Step 2.3: Setup optimizer to handle the target task 48 | output = CTRPredictionOptimizer()(top_mlp_output) 49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output) 50 | 51 | # Step 3: Build the searcher, which provides search algorithm 52 | searcher = Search(model=model, 53 | tuner='random', 54 | tuner_params={'max_trials': 2, 'overwrite': True}, 55 | ) 56 | 57 | # Step 4: Use the searcher to search the recommender 58 | searcher.search(x=[train_X_numerical, train_X_categorical], 59 | y=train_y, 60 | x_val=[val_X_numerical, val_X_categorical], 61 | y_val=val_y, 62 | objective='val_BinaryCrossentropy', 63 | batch_size=10000, 64 | epochs=2, 65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 66 | ) 67 
| logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical], 68 | y_true=val_y))) 69 | 70 | # Step 5: Evaluate the searched model 71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical], 72 | y_true=test_y))) 73 | -------------------------------------------------------------------------------- /examples/ctr_neumf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "6" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, LatentFactorMapper, MLPInteraction, InnerProductInteraction, \ 11 | CTRPredictionOptimizer 12 | from autorecsys.recommender import CTRRecommender 13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor 14 | 15 | 16 | # logging setting 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | logger = logging.getLogger(__name__) 20 | 21 | # load dataset 22 | criteo = CriteoPreprocessor() # automatically set up for preprocessing the Criteo dataset 23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess() 24 | 25 | # build the pipeline. 26 | input = Input(shape=[criteo.get_categorical_count()]) 27 | user_emb_gmf = LatentFactorMapper(column_id=0, 28 | num_of_entities=10000, 29 | embedding_dim=64)(input) 30 | item_emb_gmf = LatentFactorMapper(column_id=1, 31 | num_of_entities=10000, 32 | embedding_dim=64)(input) 33 | 34 | user_emb_mlp = LatentFactorMapper(column_id=0, 35 | num_of_entities=10000, 36 | embedding_dim=64)(input) 37 | item_emb_mlp = LatentFactorMapper(column_id=1, 38 | num_of_entities=10000, 39 | embedding_dim=64)(input) 40 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf]) 41 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp]) 42 | output = CTRPredictionOptimizer()([innerproduct_output, mlp_output]) 43 | model = CTRRecommender(inputs=input, outputs=output) 44 | 45 | # AutoML search and predict. 
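# Note: this example fixes tuner='random' below. As a hedged sketch, the same call
# with the greedy tuner used elsewhere in this repository (see rp_neumf.py), assuming
# identical Search semantics, would be:
#
#     searcher = Search(model=model,
#                       tuner='greedy',  # other examples also use 'random' and 'bayesian'
#                       tuner_params={'max_trials': 5, 'overwrite': True})
#
# The tuner name selects the search algorithm implemented under
# autorecsys/searcher/tuners (randomsearch.py, greedy.py, bayesian.py).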
46 | searcher = Search(model=model, 47 | tuner='random', 48 | tuner_params={'max_trials': 10, 'overwrite': True}, 49 | ) 50 | searcher.search(x=[criteo.get_x_categorical(train_X)], 51 | y=train_y, 52 | x_val=[criteo.get_x_categorical(val_X)], 53 | y_val=val_y, 54 | objective='val_BinaryCrossentropy', 55 | batch_size=256, 56 | epochs=20, 57 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)] 58 | ) 59 | logger.info('Predicted CTR: {}'.format(searcher.predict(x=[criteo.get_x_categorical(val_X)]))) 60 | logger.info('Prediction Accuracy (logloss): {}'.format(searcher.evaluate(x=[criteo.get_x_categorical(val_X)], y_true=val_y))) 61 | -------------------------------------------------------------------------------- /examples/rp_autorec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, HyperInteraction 11 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor 12 | from autorecsys.recommender import RPRecommender 13 | 14 | # logging setting 15 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 16 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | logger = logging.getLogger(__name__) 18 | 19 | # Step 1: Preprocess data 20 | movielens = MovielensPreprocessor() 21 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess() 22 | train_X_categorical = movielens.get_x_categorical(train_X) 23 | val_X_categorical = movielens.get_x_categorical(val_X) 24 | test_X_categorical = movielens.get_x_categorical(test_X) 25 | user_num, item_num = movielens.get_hash_size() 26 | 27 | # Step 2: Build the recommender, which provides search space 28 | # Step 2.1: Setup mappers to handle inputs 29 | input = Input(shape=[2]) 30 | user_emb = LatentFactorMapper(column_id=0, 31 | num_of_entities=user_num, 32 | embedding_dim=64)(input) 33 | item_emb = LatentFactorMapper(column_id=1, 34 | num_of_entities=item_num, 35 | embedding_dim=64)(input) 36 | 37 | # Step 2.2: Setup interactors to handle models 38 | output1 = HyperInteraction()([user_emb, item_emb]) 39 | output2 = HyperInteraction()([output1, user_emb, item_emb]) 40 | output3 = HyperInteraction()([output1, output2, user_emb, item_emb]) 41 | output4 = HyperInteraction()([output1, output2, output3, user_emb, item_emb]) 42 | 43 | # Step 2.3: Setup optimizer to handle the target task 44 | output = RatingPredictionOptimizer()(output4) 45 | model = RPRecommender(inputs=input, outputs=output) 46 | 47 | # Step 3: Build the searcher, which provides search algorithm 48 | searcher = Search(model=model, 49 | tuner='random', 50 | tuner_params={'max_trials': 2, 'overwrite': True},) 51 | 52 | # Step 4: Use the searcher to search the recommender 53 | searcher.search(x=[train_X_categorical], 54 | y=train_y, 55 | x_val=[val_X_categorical], 56 | y_val=val_y, 57 | objective='val_mse', 58 | batch_size=1024, 59 | epochs=1, 60 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]) 61 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical, 62 | y_true=val_y))) 63 | 64 | # 
Step 5: Evaluate the searched model 65 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical, 66 | y_true=test_y))) 67 | -------------------------------------------------------------------------------- /examples/rp_benchmark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import argparse 5 | import time 6 | import os 7 | import sys 8 | # os.environ["CUDA_VISIBLE_DEVICES"] = "5" 9 | 10 | import logging 11 | # logging setting 12 | logging.basicConfig(stream=sys.stdout, 13 | level=logging.DEBUG, 14 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 15 | # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | import tensorflow as tf 20 | from autorecsys.auto_search import Search 21 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, HyperInteraction, MLPInteraction,\ 22 | InnerProductInteraction 23 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor 24 | from autorecsys.recommender import RPRecommender 25 | 26 | 27 | 28 | 29 | def build_mf(user_num, item_num): 30 | input = Input(shape=[2]) 31 | user_emb = LatentFactorMapper(column_id=0, 32 | num_of_entities=user_num, 33 | embedding_dim=64)(input) 34 | item_emb = LatentFactorMapper(column_id=1, 35 | num_of_entities=item_num, 36 | embedding_dim=64)(input) 37 | output = InnerProductInteraction()([user_emb, item_emb]) 38 | output = RatingPredictionOptimizer()(output) 39 | model = RPRecommender(inputs=input, outputs=output) 40 | return model 41 | 42 | 43 | def build_gmf(user_num, item_num): 44 | input = Input(shape=[2]) 45 | user_emb = LatentFactorMapper(column_id=0, 46 | num_of_entities=user_num, 47 | embedding_dim=64)(input) 48 | item_emb = LatentFactorMapper(column_id=1, 49 | num_of_entities=item_num, 50 | embedding_dim=64)(input) 51 | output = InnerProductInteraction()([user_emb, item_emb]) 52 | output = RatingPredictionOptimizer()(output) 53 | model = RPRecommender(inputs=input, outputs=output) 54 | return model 55 | 56 | 57 | def build_mlp(user_num, item_num): 58 | input = Input(shape=[2]) 59 | user_emb_mlp = LatentFactorMapper(column_id=0, 60 | num_of_entities=user_num, 61 | embedding_dim=64)(input) 62 | item_emb_mlp = LatentFactorMapper(column_id=1, 63 | num_of_entities=item_num, 64 | embedding_dim=64)(input) 65 | output = MLPInteraction()([user_emb_mlp, item_emb_mlp]) 66 | output = RatingPredictionOptimizer()(output) 67 | model = RPRecommender(inputs=input, outputs=output) 68 | return model 69 | 70 | 71 | def build_neumf(user_num, item_num): 72 | input = Input(shape=[2]) 73 | user_emb_gmf = LatentFactorMapper(column_id=0, 74 | num_of_entities=user_num, 75 | embedding_dim=64)(input) 76 | item_emb_gmf = LatentFactorMapper(column_id=1, 77 | num_of_entities=item_num, 78 | embedding_dim=64)(input) 79 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf]) 80 | 81 | user_emb_mlp = LatentFactorMapper(column_id=0, 82 | num_of_entities=user_num, 83 | embedding_dim=64)(input) 84 | item_emb_mlp = LatentFactorMapper(column_id=1, 85 | num_of_entities=item_num, 86 | embedding_dim=64)(input) 87 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp]) 88 | 89 | output = RatingPredictionOptimizer()([innerproduct_output, mlp_output]) 90 | model = 
RPRecommender(inputs=input, outputs=output) 91 | return model 92 | 93 | 94 | def build_autorec(user_num, item_num): 95 | input = Input(shape=[2]) 96 | user_emb_1 = LatentFactorMapper(column_id=0, 97 | num_of_entities=user_num, 98 | embedding_dim=64)(input) 99 | item_emb_1 = LatentFactorMapper(column_id=1, 100 | num_of_entities=item_num, 101 | embedding_dim=64)(input) 102 | 103 | user_emb_2 = LatentFactorMapper(column_id=0, 104 | num_of_entities=user_num, 105 | embedding_dim=64)(input) 106 | item_emb_2 = LatentFactorMapper(column_id=1, 107 | num_of_entities=item_num, 108 | embedding_dim=64)(input) 109 | 110 | output = HyperInteraction()([user_emb_1, item_emb_1, user_emb_2, item_emb_2]) 111 | output = RatingPredictionOptimizer()(output) 112 | model = RPRecommender(inputs=input, outputs=output) 113 | return model 114 | 115 | 116 | if __name__ == '__main__': 117 | # parse args 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument('-model', type=str, help='input a model name') 120 | parser.add_argument('-data', type=str, help='dataset name') 121 | parser.add_argument('-data_path', type=str, help='dataset path') 122 | parser.add_argument('-sep', type=str, help='dataset sep') 123 | parser.add_argument('-search', type=str, help='input a search method name') 124 | parser.add_argument('-batch_size', type=int, help='batch size') 125 | parser.add_argument('-epochs', type=int, help='epochs') 126 | parser.add_argument('-early_stop', type=int, help='early stop') 127 | parser.add_argument('-trials', type=int, help='try number') 128 | args = parser.parse_args() 129 | 130 | if args.sep is None: 131 | args.sep = '::' 132 | 133 | # Step 1: Preprocess data 134 | if args.data == "ml": 135 | data = MovielensPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1) 136 | train_X, train_y, val_X, val_y, test_X, test_y = data.preprocess() 137 | train_X_categorical = data.get_x_categorical(train_X) 138 | val_X_categorical = data.get_x_categorical(val_X) 139 | test_X_categorical = data.get_x_categorical(test_X) 140 | user_num, item_num = data.get_hash_size() 141 | 142 | # Step 2: Build the recommender, which provides search space 143 | 144 | if args.model == 'mf': 145 | model = build_mf(user_num, item_num) 146 | if args.model == 'mlp': 147 | model = build_mlp(user_num, item_num) 148 | if args.model == 'gmf': 149 | model = build_gmf(user_num, item_num) 150 | if args.model == 'neumf': 151 | model = build_neumf(user_num, item_num) 152 | if args.model == 'autorec': 153 | model = build_autorec(user_num, item_num) 154 | 155 | # Step 3: Build the searcher, which provides search algorithm 156 | searcher = Search(model=model, 157 | tuner=args.search, 158 | tuner_params={'max_trials': args.trials, 'overwrite': True} 159 | ) 160 | 161 | # Step 4: Use the searcher to search the recommender 162 | start_time = time.time() 163 | searcher.search(x=train_X_categorical, 164 | y=train_y, 165 | x_val=val_X_categorical, 166 | y_val=val_y, 167 | objective='val_mse', 168 | batch_size=args.batch_size, 169 | epochs=args.epochs, 170 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=args.early_stop)]) 171 | end_time = time.time() 172 | print("Running time:", end_time - start_time) 173 | print("Args", args) 174 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical, 175 | y_true=val_y))) 176 | 177 | # Step 5: Evaluate the searched model 178 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical, 179 | 
y_true=test_y))) 180 | 181 | -------------------------------------------------------------------------------- /examples/rp_mf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "5" 7 | 8 | import tensorflow as tf 9 | # gpus = tf.config.experimental.list_physical_devices(device_type='GPU') 10 | # for gpu in gpus: 11 | # tf.config.experimental.set_memory_growth(gpu, True) 12 | # import tensorflow as tf 13 | # physical_devices = tf.config.list_physical_devices('GPU') 14 | # tf.config.experimental.set_memory_growth(physical_devices[0], True) 15 | 16 | import logging 17 | from autorecsys.auto_search import Search 18 | from autorecsys.pipeline import Input, LatentFactorMapper, InnerProductInteraction, RatingPredictionOptimizer 19 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor 20 | from autorecsys.recommender import RPRecommender 21 | 22 | # logging setting 23 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 24 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 25 | logger = logging.getLogger(__name__) 26 | 27 | # load dataset 28 | ##Netflix Dataset 29 | # dataset_paths = ["./examples/datasets/netflix-prize-data/combined_data_" + str(i) + ".txt" for i in range(1, 5)] 30 | # data = NetflixPrizePreprocessor(dataset_paths) 31 | 32 | # Step 1: Preprocess data 33 | movielens = MovielensPreprocessor() 34 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess() 35 | train_X_categorical = movielens.get_x_categorical(train_X) 36 | val_X_categorical = movielens.get_x_categorical(val_X) 37 | test_X_categorical = movielens.get_x_categorical(test_X) 38 | user_num, item_num = movielens.get_hash_size() 39 | 40 | # Step 2: Build the recommender, which provides search space 41 | # Step 2.1: Setup mappers to handle inputs 42 | input = Input(shape=[2]) 43 | user_emb = LatentFactorMapper(column_id=0, 44 | num_of_entities=user_num, 45 | embedding_dim=64)(input) 46 | item_emb = LatentFactorMapper(column_id=1, 47 | num_of_entities=item_num, 48 | embedding_dim=64)(input) 49 | 50 | # Step 2.2: Setup interactors to handle models 51 | output = InnerProductInteraction()([user_emb, item_emb]) 52 | 53 | # Step 2.3: Setup optimizer to handle the target task 54 | output = RatingPredictionOptimizer()(output) 55 | model = RPRecommender(inputs=input, outputs=output) 56 | 57 | # Step 3: Build the searcher, which provides search algorithm 58 | searcher = Search(model=model, 59 | tuner='greedy', # hyperband, greedy, bayesian 60 | tuner_params={"max_trials": 5} 61 | ) 62 | 63 | # Step 4: Use the searcher to search the recommender 64 | searcher.search(x=[train_X_categorical], 65 | y=train_y, 66 | x_val=[val_X_categorical], 67 | y_val=val_y, 68 | objective='val_mse', 69 | batch_size=1024, 70 | epochs=10, 71 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]) 72 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical, 73 | y_true=val_y))) 74 | 75 | # Step 5: Evaluate the searched model 76 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical, 77 | y_true=test_y))) 78 | -------------------------------------------------------------------------------- /examples/rp_neumf.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function, unicode_literals 3 | 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "6" 6 | 7 | import logging 8 | import tensorflow as tf 9 | from autorecsys.auto_search import Search 10 | from autorecsys.pipeline import Input, LatentFactorMapper, MLPInteraction, InnerProductInteraction,\ 11 | RatingPredictionOptimizer 12 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor 13 | from autorecsys.recommender import RPRecommender 14 | 15 | # logging setting 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 17 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | # load dataset 21 | ##Netflix Dataset 22 | # dataset_paths = ["./examples/datasets/netflix-prize-data/combined_data_" + str(i) + ".txt" for i in range(1, 5)] 23 | # data = NetflixPrizePreprocessor(dataset_paths) 24 | 25 | # Step 1: Preprocess data 26 | movielens = MovielensPreprocessor() 27 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess() 28 | train_X_categorical = movielens.get_x_categorical(train_X) 29 | val_X_categorical = movielens.get_x_categorical(val_X) 30 | test_X_categorical = movielens.get_x_categorical(test_X) 31 | user_num, item_num = movielens.get_hash_size() 32 | 33 | # Step 2: Build the recommender, which provides search space 34 | # Step 2.1: Setup mappers to handle inputs 35 | input = Input(shape=[2]) 36 | user_emb_gmf = LatentFactorMapper(column_id=0, 37 | num_of_entities=user_num, 38 | embedding_dim=64)(input) 39 | item_emb_gmf = LatentFactorMapper(column_id=1, 40 | num_of_entities=item_num, 41 | embedding_dim=64)(input) 42 | user_emb_mlp = LatentFactorMapper(column_id=0, 43 | num_of_entities=user_num, 44 | embedding_dim=64)(input) 45 | item_emb_mlp = LatentFactorMapper(column_id=1, 46 | num_of_entities=item_num, 47 | embedding_dim=64)(input) 48 | 49 | # Step 2.2: Setup interactors to handle models 50 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf]) 51 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp]) 52 | 53 | # Step 2.3: Setup optimizer to handle the target task 54 | output = RatingPredictionOptimizer()([innerproduct_output, mlp_output]) 55 | model = RPRecommender(inputs=input, outputs=output) 56 | 57 | # Step 3: Build the searcher, which provides search algorithm 58 | searcher = Search(model=model, 59 | tuner='greedy', # random, greedy 60 | tuner_params={"max_trials": 5, 'overwrite': True} 61 | ) 62 | 63 | # Step 4: Use the searcher to search the recommender 64 | searcher.search(x=[train_X_categorical], 65 | y=train_y, 66 | x_val=[val_X_categorical], 67 | y_val=val_y, 68 | objective='val_mse', 69 | batch_size=1024, 70 | epochs=1, 71 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]) 72 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical, 73 | y_true=val_y))) 74 | 75 | # Step 5: Evaluate the searched model 76 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical, 77 | y_true=test_y))) 78 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AutoRecSys 2 | pages: 3 | - 
Home: index.md 4 | - About: about.md 5 | theme: readthedocs -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.3 2 | pandas==0.25.2 3 | pytest==5.2.2 4 | scikit-learn==0.21.3 5 | scipy>=1.4.1 6 | tabulate==0.8.5 7 | tensorboard>=2.2.0 8 | tensorflow-gpu==2.4.0 9 | termcolor==1.1.0 10 | terminaltables==3.1.0 11 | tqdm==4.36.1 12 | colorama==0.4.3 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | import subprocess 4 | import logging 5 | 6 | setup( 7 | name='autorec', 8 | version='0.0.2', 9 | description='AutoRec: An Automated Recommender System', 10 | author='DATA Lab@Texas A&M University', 11 | author_email='thwang1231@tamu.edu', 12 | url='https://github.com/datamllab/AutoRec.git', 13 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']), 14 | # package_data={ 15 | # 'tods': ['resources/.entry_points.ini', 16 | # 'resources/.requirements.txt', 17 | # 'resources/default_pipeline.json' 18 | # ] 19 | # }, 20 | install_requires=[ 21 | 'numpy>=1.17.3', 22 | 'pandas==0.25.2', 23 | 'pytest==5.2.2', 24 | 'scikit-learn==0.21.3', 25 | 'scipy>=1.4.1', 26 | 'tabulate==0.8.5', 27 | 'tensorboard>=2.2.0', 28 | 'tensorflow-gpu==2.4.0', 29 | 'termcolor==1.1.0', 30 | 'terminaltables==3.1.0', 31 | 'tqdm==4.36.1', 32 | 'colorama==0.4.3', 33 | ], 34 | 35 | ) 36 | 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/__init__.py -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/common.py -------------------------------------------------------------------------------- /tests/integration_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from autorecsys.auto_search import Search 4 | 5 | @pytest.fixture(scope='module') 6 | def tmp_dir(tmpdir_factory): 7 | return tmpdir_factory.mktemp('integration_test') 8 | 9 | 10 | def test_Search(tmp_dir): 11 | # TODO 12 | pass -------------------------------------------------------------------------------- /tests/integration_tests/test_models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import os 4 | import logging 5 | import unittest 6 | 7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU 8 | os.chdir("../../examples") 9 | 10 | logger = logging.getLogger(__name__) 11 | # tf.random.set_seed(1) 12 | 13 | 14 | class CTRTestModels(unittest.TestCase): 15 | 16 | def setUp(self): 17 | super(CTRTestModels, self).setUp() 18 | self.ctr_model = {'autoint': 'ctr_autoint.py', 19 | 'autorec': 'ctr_autorec.py', 20 | 'crossnet': 'ctr_crossnet.py', 21 | 'deepfm': 'ctr_deepfm.py', 22 | 'dlrm': 'ctr_dlrm.py', 23 | 'neumf': 'ctr_neumf.py'} 24 | 25 | def 
test_ctr_autoint(self): 26 | """ 27 | Test class in ctr_autoint.py 28 | """ 29 | try: 30 | exec(open(self.ctr_model['autoint']).read()) 31 | except RuntimeError: 32 | assert False, 'Runtime Error' 33 | 34 | def test_ctr_autorec(self): 35 | """ 36 | Test class in ctr_autorec.py 37 | """ 38 | try: 39 | exec(open(self.ctr_model['autorec']).read()) 40 | except RuntimeError: 41 | assert False, 'Runtime Error' 42 | 43 | def test_ctr_crossnet(self): 44 | """ 45 | Test class in ctr_crossnet.py 46 | """ 47 | try: 48 | exec(open(self.ctr_model['crossnet']).read()) 49 | except RuntimeError: 50 | assert False, 'Runtime Error' 51 | 52 | def test_ctr_deepfm(self): 53 | """ 54 | Test class in ctr_deepfm.py 55 | """ 56 | try: 57 | exec(open(self.ctr_model['deepfm']).read()) 58 | except RuntimeError: 59 | assert False, 'Runtime Error' 60 | 61 | def test_ctr_dlrm(self): 62 | """ 63 | Test class in ctr_dlrm.py 64 | """ 65 | try: 66 | exec(open(self.ctr_model['dlrm']).read()) 67 | except RuntimeError: 68 | assert False, 'Runtime Error' 69 | 70 | def test_ctr_neumf(self): 71 | """ 72 | Test class in ctr_neumf.py 73 | """ 74 | try: 75 | exec(open(self.ctr_model['neumf']).read()) 76 | except RuntimeError: 77 | assert False, 'Runtime Error' 78 | 79 | 80 | class RPTestModels(unittest.TestCase): 81 | 82 | def setUp(self): 83 | super(RPTestModels, self).setUp() 84 | self.rp_model = {'autorec': 'rp_autorec.py', 85 | 'mf': 'rp_mf.py', 86 | 'neumf': 'rp_neumf.py'} 87 | 88 | def test_rp_autorec(self): 89 | """ 90 | Test class in rp_autorec.py 91 | """ 92 | try: 93 | exec(open(self.rp_model['autorec']).read()) 94 | except RuntimeError: 95 | assert False, 'Runtime Error' 96 | 97 | def test_rp_mf(self): 98 | """ 99 | Test class in rp_mf.py 100 | """ 101 | try: 102 | exec(open(self.rp_model['mf']).read()) 103 | except RuntimeError: 104 | assert False, 'Runtime Error' 105 | 106 | def test_rp_neumf(self): 107 | """ 108 | Test class in rp_neumf.py 109 | """ 110 | try: 111 | exec(open(self.rp_model['neumf']).read()) 112 | except RuntimeError: 113 | assert False, 'Runtime Error' 114 | -------------------------------------------------------------------------------- /tests/pipeline_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/pipeline_tests/__init__.py -------------------------------------------------------------------------------- /tests/pipeline_tests/test_graph.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | from autorecsys.searcher.core import hyperparameters as hp_module 4 | 5 | from autorecsys.pipeline import Input, MLPInteraction, ConcatenateInteraction, RatingPredictionOptimizer 6 | from autorecsys.pipeline import graph as graph_module 7 | 8 | # TODO: we don't support overriding hp for graph yet. 
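# The block commented out below documents the intended override API: passing
# `override_hps` (e.g. a list with hp_module.Choice entries) when constructing the
# HyperGraph, to pin part of the search space to fixed values. It is preserved
# as-is for reference until that feature is supported.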
9 | # def test_set_hp(): 10 | # input_node = Input((32,)) 11 | # output_node = input_node 12 | # output_node = MLPInteraction()(output_node) 13 | # output_node = RatingPredictionOptimizer()[output_node] 14 | 15 | # graph = graph_module.HyperGraph( 16 | # input_node, 17 | # output_node, 18 | # override_hps=[hp_module.Choice('dense_block_1/num_layers', [6], default=6)]) 19 | # hp = hp_module.HyperParameters() 20 | # plain_graph = graph.hyper_build(hp) 21 | # plain_graph.build_keras_graph().build(hp) 22 | 23 | # for single_hp in hp.space: 24 | # if single_hp.name == 'dense_block_1/num_layers': 25 | # assert len(single_hp.values) == 1 26 | # assert single_hp.values[0] == 6 27 | # return 28 | # assert False 29 | 30 | 31 | def test_input_output_disconnect(): 32 | input_node1 = Input() 33 | output_node = input_node1 34 | _ = MLPInteraction()(output_node) 35 | 36 | input_node = Input() 37 | output_node = input_node 38 | output_node = MLPInteraction()(output_node) 39 | output_node = RatingPredictionOptimizer()(output_node) 40 | 41 | with pytest.raises(ValueError) as info: 42 | graph_module.HyperGraph(input_node1, output_node) 43 | assert 'Inputs and outputs not connected.' in str(info.value) 44 | 45 | 46 | # def test_hyper_graph_cycle(): 47 | # input_node1 = Input() 48 | # input_node2 = Input() 49 | # output_node1 = MLPInteraction()(input_node1) 50 | # output_node2 = MLPInteraction()(input_node2) 51 | # output_node = ConcatenateInteraction()([output_node1, output_node2]) 52 | # head = RatingPredictionOptimizer() 53 | # output_node = head(output_node) 54 | # head.outputs = output_node1 55 | 56 | # with pytest.raises(ValueError) as info: 57 | # graph_module.HyperGraph([input_node1, input_node2], output_node) 58 | # assert 'The network has a cycle.' in str(info.value) 59 | 60 | # TODO: this test criterion may have some problem 61 | def test_input_missing(): 62 | input_node1 = Input() 63 | input_node2 = Input() 64 | output_node1 = MLPInteraction()(input_node1) 65 | output_node2 = MLPInteraction()(input_node2) 66 | output_node = ConcatenateInteraction()([output_node1, output_node2]) 67 | output_node = RatingPredictionOptimizer()(output_node) 68 | 69 | with pytest.raises(ValueError) as info: 70 | graph_module.HyperGraph(input_node1, output_node) 71 | assert 'A required input is missing for HyperModel' in str(info.value) 72 | 73 | 74 | def test_graph_basics(): 75 | input_node = Input(shape=(30,)) 76 | output_node = input_node 77 | output_node = MLPInteraction()(output_node) 78 | output_node = RatingPredictionOptimizer()(output_node) 79 | 80 | graph = graph_module.PlainGraph(input_node, output_node) 81 | model = graph.build_keras_graph().build(hp_module.HyperParameters()) 82 | assert model.input_shape == (None, 30) 83 | assert model.output_shape == (None, ) 84 | 85 | -------------------------------------------------------------------------------- /tests/pipeline_tests/test_mapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import os 4 | import logging 5 | import pytest 6 | import unittest 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | from autorecsys.pipeline.mapper import ( 12 | LatentFactorMapper, 13 | DenseFeatureMapper, 14 | SparseFeatureMapper 15 | ) 16 | from autorecsys.searcher.core import hyperparameters as hp_module 17 | from tensorflow.python.util import nest 18 | 19 | 20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress 
warning for running TF with CPU 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class TestMappers(unittest.TestCase): 26 | @pytest.fixture(autouse=True) 27 | def initdir(self, tmpdir): 28 | tmpdir.chdir() # change to pytest-provided temporary directory 29 | tmpdir.join("test_mapper.ini").write("# testdata") 30 | 31 | def setUp(self): 32 | super(TestMappers, self).setUp() 33 | self.column_id = 1 34 | self.input_shape = 13 35 | self.batch = 2 36 | self.embed_dim = 8 37 | self.tensor_inputs = [tf.random.uniform([self.batch, self.input_shape])] # standard input type 38 | self.df_inputs = pd.DataFrame(np.random.rand(self.batch, self.input_shape)) # for the ease of getting hash size 39 | 40 | def test_LatentFactorMapper(self): 41 | # test constructor and get_state 42 | p = { 43 | 'column_id': 0, 44 | 'num_of_entities': 3, 45 | 'embedding_dim': 4} 46 | mapper = LatentFactorMapper(**p) 47 | sol_get_state = { 48 | 'name': 'latent_factor_mapper_1', 49 | 'column_id': 0, 50 | 'num_of_entities': 3, 51 | 'embedding_dim': 4} 52 | assert mapper.get_state() == sol_get_state 53 | 54 | # test set_state 55 | p = { 56 | 'column_id': self.column_id, 57 | 'num_of_entities': 10, 58 | 'embedding_dim': self.embed_dim} 59 | sol_set_state = { 60 | 'name': 'latent_factor_mapper_1', 61 | 'column_id': self.column_id, 62 | 'num_of_entities': 10, 63 | 'embedding_dim': self.embed_dim} 64 | mapper.set_state(p) 65 | ans_set_state = mapper.get_state() 66 | assert ans_set_state == sol_set_state 67 | 68 | # test build 69 | hp = hp_module.HyperParameters() 70 | output = mapper.build(hp, self.tensor_inputs) 71 | assert len(nest.flatten(output)) == 1 72 | assert output.shape == (self.batch, self.embed_dim) # LatentFactorMapper does not have input shape dimension 73 | 74 | def test_DenseFeatureMapper(self): 75 | # test constructor and get_state 76 | p = { 77 | 'num_of_fields': 10, 78 | 'embedding_dim': 4} 79 | mapper = DenseFeatureMapper(**p) 80 | sol_get_state = { 81 | 'name': 'dense_feature_mapper_1', 82 | 'num_of_fields': 10, 83 | 'embedding_dim': 4} 84 | assert mapper.get_state() == sol_get_state 85 | 86 | # test set_state 87 | p = { 88 | 'num_of_fields': self.input_shape, 89 | 'embedding_dim': self.embed_dim} 90 | sol_set_state = { 91 | 'name': 'dense_feature_mapper_1', 92 | 'num_of_fields': self.input_shape, 93 | 'embedding_dim': self.embed_dim} 94 | mapper.set_state(p) 95 | ans_set_state = mapper.get_state() 96 | assert ans_set_state == sol_set_state 97 | 98 | # test build 99 | hp = hp_module.HyperParameters() 100 | output = mapper.build(hp, self.tensor_inputs) # Act 101 | assert len(nest.flatten(output)) == 1 102 | assert output.shape == (self.batch, self.input_shape, self.embed_dim) 103 | 104 | def test_SparseFeatureMapper(self): 105 | # test constructor and get_state 106 | p = { 107 | 'num_of_fields': 10, 108 | 'hash_size': [2, 4, 10], 109 | 'embedding_dim': 4} 110 | mapper = SparseFeatureMapper(**p) 111 | sol_get_state = { 112 | 'name': 'sparse_feature_mapper_1', 113 | 'num_of_fields': 10, 114 | 'hash_size': [2, 4, 10], 115 | 'embedding_dim': 4} 116 | assert mapper.get_state() == sol_get_state 117 | 118 | # test set_state 119 | hash_size = self.df_inputs.nunique().tolist() 120 | p = { 121 | 'num_of_fields': self.input_shape, 122 | 'hash_size': hash_size, 123 | 'embedding_dim': self.embed_dim} 124 | sol_set_state = { 125 | 'name': 'sparse_feature_mapper_1', 126 | 'num_of_fields': self.input_shape, 127 | 'hash_size': hash_size, 128 | 'embedding_dim': self.embed_dim} 129 | mapper.set_state(p) 130 | 
ans_set_state = mapper.get_state() 131 | assert ans_set_state == sol_set_state 132 | 133 | # test build 134 | hp = hp_module.HyperParameters() 135 | tensor_inputs = [tf.convert_to_tensor(self.df_inputs.values)] 136 | mapper = SparseFeatureMapper(**p) 137 | output = mapper.build(hp, tensor_inputs) # Act 138 | assert len(nest.flatten(output)) == 1 139 | assert output.shape == (self.batch, self.input_shape, self.embed_dim) 140 | -------------------------------------------------------------------------------- /tests/pipeline_tests/test_node.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | import pytest 5 | import tensorflow as tf 6 | 7 | from autorecsys.pipeline import node 8 | 9 | 10 | def test_input_type_error(): 11 | x = 'unknown' 12 | input_node = node.Input() 13 | with pytest.raises(TypeError) as info: 14 | input_node._check(x) 15 | x = input_node.transform(x) 16 | assert 'Expect the data to Input to be numpy' in str(info.value) 17 | 18 | 19 | def test_input_numerical(): 20 | x = np.array([[['unknown']]]) 21 | input_node = node.Input() 22 | with pytest.raises(TypeError) as info: 23 | input_node._check(x) 24 | x = input_node.transform(x) 25 | assert 'Expect the data to Input to be numerical' in str(info.value) 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/pipeline_tests/test_optimizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import os 4 | import logging 5 | import pytest 6 | import unittest 7 | import tensorflow as tf 8 | from autorecsys.pipeline.optimizer import ( 9 | CTRPredictionOptimizer, 10 | RatingPredictionOptimizer, 11 | ) 12 | from autorecsys.searcher.core import hyperparameters as hp_module 13 | from tensorflow.python.util import nest 14 | 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class TestOptimizers(unittest.TestCase): 21 | @pytest.fixture(autouse=True) 22 | def initdir(self, tmpdir): 23 | tmpdir.chdir() # change to pytest-provided temporary directory 24 | tmpdir.join("test_optimizer.ini").write("# testdata") 25 | 26 | def setUp(self): 27 | super(TestOptimizers, self).setUp() 28 | self.batch = 2 29 | self.emb = 4 30 | self.inputs = [tf.random.uniform([self.batch, self.emb], dtype=tf.float32), 31 | tf.random.uniform([self.batch, self.emb], dtype=tf.float32)] 32 | 33 | def test_RatingPredictionOptimizer(self): 34 | hp = hp_module.HyperParameters() 35 | optimizer = RatingPredictionOptimizer() 36 | output = optimizer.build(hp, self.inputs) 37 | assert len(nest.flatten(output)) == 1 38 | assert output.shape == self.batch 39 | 40 | def test_CTRPredictionOptimizer(self): 41 | hp = hp_module.HyperParameters() # Arrange 42 | optimizer = CTRPredictionOptimizer() 43 | output = optimizer.build(hp, self.inputs) # Act 44 | assert len(tf.nest.flatten(output)) == 1 # Assert 45 | assert output.shape == (self.batch, 1) 46 | -------------------------------------------------------------------------------- /tests/pipeline_tests/test_preprocessor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | from sklearn.utils import shuffle 3 | 4 | import os 5 | import random 6 | import functools 7 | import 
logging 8 | import pytest 9 | import unittest 10 | 11 | import math 12 | import pandas as pd 13 | import numpy as np 14 | import tensorflow as tf 15 | 16 | from autorecsys.pipeline.preprocessor import BasePreprocessor, NetflixPrizePreprocessor, CriteoPreprocessor, AvazuPreprocessor, MovielensPreprocessor 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | # directory of this test file so that datasets are imported no matter where the code is run 22 | current_directory = os.path.dirname(os.path.abspath(__file__)) 23 | dataset_directory = os.path.join( 24 | current_directory, '../../examples/example_datasets') 25 | 26 | 27 | class DummyPreprocessor(BasePreprocessor): 28 | """ Dummy class for testing base functions """ 29 | 30 | def __init__(self, 31 | data_df=None, 32 | non_csv_path=None, 33 | csv_path=None, 34 | header=0, 35 | columns=None, 36 | delimiter='\t', 37 | filler=0.0, 38 | dtype_dict=None, # inferred in load_data() 39 | ignored_columns=None, 40 | target_column='rating', 41 | numerical_columns=None, 42 | categorical_columns=None, 43 | categorical_filter=0, # all categories are counted 44 | fit_dictionary_path=None, 45 | transform_path=None, 46 | test_percentage=0.1, 47 | validate_percentage=0.1, 48 | train_path=None, 49 | validate_path=None, 50 | test_path=None): 51 | 52 | if columns is None: 53 | columns = range(3) 54 | if dtype_dict is None: 55 | dtype_dict = {} 56 | if ignored_columns is None: 57 | ignored_columns = [] 58 | if numerical_columns is None: 59 | numerical_columns = ['num_people'] 60 | if categorical_columns is None: 61 | categorical_columns = ['user_id'] 62 | 63 | super().__init__(non_csv_path=non_csv_path, 64 | csv_path=csv_path, 65 | header=header, 66 | delimiter=delimiter, 67 | filler=filler, 68 | dtype_dict=dtype_dict, 69 | columns=columns, 70 | ignored_columns=ignored_columns, 71 | target_column=target_column, 72 | numerical_columns=numerical_columns, 73 | categorical_columns=categorical_columns, 74 | categorical_filter=categorical_filter, 75 | fit_dictionary_path=fit_dictionary_path, 76 | transform_path=transform_path, 77 | test_percentage=test_percentage, 78 | validate_percentage=validate_percentage, 79 | train_path=train_path, 80 | validate_path=validate_path, 81 | test_path=test_path) 82 | self.data_df = data_df 83 | 84 | def preprocess(self): 85 | return [] 86 | 87 | 88 | class TestPreprocessors(unittest.TestCase): 89 | @pytest.fixture(autouse=True) 90 | def initdir(self, tmpdir): 91 | tmpdir.chdir() # change to pytest-provided temporary directory 92 | tmpdir.join("test_preprocessor.ini").write("# testdata") 93 | 94 | def setUp(self): 95 | super(TestPreprocessors, self).setUp() 96 | 97 | column_names = ["user_id", "num_people", "rating"] 98 | tabular_data = np.array([ 99 | [1, 1, 1], [1, 2, 1], [1, 3, 1], [1, 4, 1], 100 | [2, 1, 1], [2, 2, 1], [2, 3, 1], 101 | [3, 1, 1], [3, 2, 1], 102 | [4, 1, 1] 103 | ]) 104 | small_data = np.array([[1, 1, 1], [1, 2, 1], [2, 3, 1]]) 105 | self.input_df = pd.DataFrame(tabular_data, columns=column_names) 106 | self.small_input_df = pd.DataFrame(small_data, columns=column_names) 107 | self.x_df = self.small_input_df.drop(["rating"], axis=1) 108 | 109 | def test_split_data(self): 110 | base = DummyPreprocessor(data_df=self.input_df) 111 | train_X, test_X, train_y, test_y = base.split_data(base.get_x(), base.get_y(), 0.2) 112 | assert train_X.shape[0] == 8 113 | assert train_y.shape[0] == 8 114 | assert test_X.shape[0] == 2 115 | assert test_y.shape[0] == 2 116 | 117 | def test_transform_numerical(self): 118 | sol 
117 |     def test_transform_numerical(self):
118 |         sol = np.array([[1, 1, 1], [1, 2, 1], [2, math.log(float(3)) ** 2, 1]])
119 |         base = DummyPreprocessor(data_df=self.small_input_df)
120 |         base.transform_numerical()
121 |         assert base.data_df.shape == (3, 3)
122 |         assert np.array_equal(sol, base.data_df.values)
123 | 
124 |     def test_transform_categorical(self):
125 |         sol = np.array([[0, 1, 1], [0, 2, 1], [1, 3, 1]])
126 |         base = DummyPreprocessor(data_df=self.small_input_df)
127 |         base.transform_categorical()
128 |         assert base.data_df.shape == (3, 3)
129 |         assert np.array_equal(sol, base.data_df.values)
130 | 
131 |     def test_get_hash_size(self):
132 |         base = DummyPreprocessor(data_df=self.small_input_df)
133 |         base.transform_categorical()
134 |         assert base.get_hash_size() == [2]
135 | 
136 |     def test_get_x(self):
137 |         sol = self.x_df
138 |         base = DummyPreprocessor(data_df=self.small_input_df)
139 |         pd.testing.assert_frame_equal(sol, base.get_x())
140 | 
141 |     def test_get_x_numerical(self):
142 |         sol = self.x_df[['num_people']].values
143 |         base = DummyPreprocessor(data_df=self.small_input_df)
144 |         assert np.array_equal(base.get_x_numerical(
145 |             self.x_df), sol)
146 | 
147 |     def test_get_x_categorical(self):
148 |         sol = self.x_df[['user_id']].values
149 |         base = DummyPreprocessor(data_df=self.small_input_df)
150 |         assert np.array_equal(base.get_x_categorical(
151 |             self.x_df), sol)
152 | 
153 |     def test_get_y(self):
154 |         sol = np.ones(3)
155 |         base = DummyPreprocessor(data_df=self.small_input_df)
156 |         assert np.array_equal(base.get_y(), sol)
157 | 
158 |     def test_get_categorical_count(self):
159 |         base = DummyPreprocessor(data_df=self.small_input_df)
160 |         assert base.get_categorical_count() == 1
161 | 
162 |     def test_get_numerical_count(self):
163 |         base = DummyPreprocessor(data_df=self.small_input_df)
164 |         assert base.get_numerical_count() == 1
165 | 
166 |     def test_MovielensPreprocessor(self):
167 |         movielens = MovielensPreprocessor(csv_path=os.path.join(
168 |             dataset_directory, 'movielens/ratings-10k.dat'))
169 |         movielens.preprocess()
170 |         assert movielens.data_df.shape == (10000, 3)
171 | 
172 |     def test_CriteoPreprocessor(self):
173 |         criteo = CriteoPreprocessor(csv_path=os.path.join(
174 |             dataset_directory, 'criteo/train-10k.txt'))
175 |         criteo.preprocess()
176 |         assert criteo.data_df.shape == (10000, 40)
177 | 
178 |     def test_NetflixPreprocessor(self):
179 |         netflix = NetflixPrizePreprocessor(
180 |             non_csv_path=os.path.join(dataset_directory, 'netflix/combined_data_1-10k.txt'),
181 |             csv_path=os.path.join(dataset_directory, 'netflix/combined_data_1-10k.csv'))
182 |         netflix.preprocess()
183 |         assert netflix.data_df.shape == (10000, 3)
184 | 
185 |     def test_AvazuPreprocessor(self):
186 |         avazu = AvazuPreprocessor(csv_path=os.path.join(
187 |             dataset_directory, 'avazu/train-10k'))
188 |         avazu.preprocess()
189 |         assert avazu.data_df.shape == (9999, 23)
190 | 
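The four dataset tests above share one three-step contract: construct with a path, call `preprocess()`, inspect `data_df`. As an editorial sketch (not a file in this repository), here is that contract combined with the `BasePreprocessor` helpers the `Dummy` tests exercise; treating those helpers as inherited unchanged by `MovielensPreprocessor` is an assumption read off `DummyPreprocessor`, not separately documented:

```python
import os
from autorecsys.pipeline.preprocessor import MovielensPreprocessor

# Same bundled 10k sample the tests above load.
dataset_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 '../../examples/example_datasets')

movielens = MovielensPreprocessor(
    csv_path=os.path.join(dataset_directory, 'movielens/ratings-10k.dat'))
movielens.preprocess()                        # parses the raw file into data_df
assert movielens.data_df.shape == (10000, 3)  # (user, item, rating) rows

# Feature/target split plus a 20% test holdout, as in test_split_data above.
train_X, test_X, train_y, test_y = movielens.split_data(
    movielens.get_x(), movielens.get_y(), 0.2)
```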
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | 
3 | import os
4 | import logging
5 | import pytest
6 | import unittest
7 | 
8 | import numpy as np
9 | import tensorflow as tf
10 | from autorecsys.pipeline.utils import Bias
11 | 
12 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress warnings when running TF on CPU
13 | 
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | class TestBias(unittest.TestCase):
18 |     @pytest.fixture(autouse=True)
19 |     def initdir(self, tmpdir):
20 |         tmpdir.chdir()  # change to the pytest-provided temporary directory
21 |         tmpdir.join("test_utils.ini").write("# testdata")
22 | 
23 |     def setUp(self):
24 |         super(TestBias, self).setUp()
25 |         self.inputs = tf.constant([[1, 2, 3], [4, 5, 6]], dtype="float32")
26 |         self.test_units = 4
27 | 
28 |     def test_Bias(self):
29 |         bias = Bias(units=self.test_units)
30 |         assert bias.bias.shape == (self.test_units,)
31 | 
32 |     def test_call(self):
33 |         """
34 |         Test Bias.call()
35 |         """
36 |         bias = Bias(self.inputs.shape[-1])  # the input's last dimension serves as the units argument
37 |         ans = bias(self.inputs)
38 |         tf.assert_equal(self.inputs, ans)  # output equals input because the bias weights are zero-initialized
39 | 
--------------------------------------------------------------------------------
/tests/searcher_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/searcher_tests/__init__.py
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/searcher_tests/core_tests/__init__.py
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_hyperparameters.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 The Keras Tuner Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import math
16 | import numpy as np
17 | import pytest
18 | 
19 | from autorecsys.searcher.core import hyperparameters as hp_module
20 | 
21 | 
22 | def test_base_hyperparameter():
23 |     base_param = hp_module.HyperParameter(name='base', default=0)
24 |     assert base_param.name == 'base'
25 |     assert base_param.default == 0
26 |     assert base_param.get_config() == {'name': 'base', 'default': 0}
27 |     base_param = hp_module.HyperParameter.from_config(
28 |         base_param.get_config())
29 |     assert base_param.name == 'base'
30 |     assert base_param.default == 0
31 | 
32 | 
33 | def test_hyperparameters():
34 |     hp = hp_module.HyperParameters()
35 |     assert hp.values == {}
36 |     assert hp.space == []
37 |     hp.Choice('choice', [1, 2, 3], default=2)
38 |     assert hp.values == {'choice': 2}
39 |     assert len(hp.space) == 1
40 |     assert hp.space[0].name == 'choice'
41 |     hp.values['choice'] = 3
42 |     assert hp.get('choice') == 3
43 |     hp = hp.copy()
44 |     assert hp.values == {'choice': 3}
45 |     assert len(hp.space) == 1
46 |     assert hp.space[0].name == 'choice'
47 |     with pytest.raises(ValueError, match='Unknown parameter'):
48 |         hp.get('wrong')
49 | 
50 | 
51 | def test_name_collision():
52 |     # TODO: figure out how name collision checks
53 |     # should work.
54 |     pass
55 | 
56 | 
57 | def test_name_scope():
58 |     hp = hp_module.HyperParameters()
59 |     hp.Choice('choice', [1, 2, 3], default=2)
60 |     with hp.name_scope('scope1'):
61 |         hp.Choice('choice', [4, 5, 6], default=5)
62 |         with hp.name_scope('scope2'):
63 |             hp.Choice('choice', [7, 8, 9], default=8)
64 |         hp.Int('range', min_value=0, max_value=10, default=0)
65 | 
66 |     assert hp.values == {
67 |         'choice': 2,
68 |         'scope1/choice': 5,
69 |         'scope1/scope2/choice': 8,
70 |         'scope1/range': 0
71 |     }
72 |     assert hp.get_value_in_nested_format() == {
73 |         'choice': 2,
74 |         'scope1': {'choice': 5,
75 |                    'scope2': {'choice': 8},
76 |                    'range': 0,
77 |                    },
78 |     }
79 | 
80 | 
81 | def test_parent_name():
82 |     hp = hp_module.HyperParameters()
83 |     hp.Choice('a', [1, 2, 3], default=2)
84 |     b1 = hp.Int(
85 |         'b', 0, 10, parent_name='a', parent_values=1, default=5)
86 |     b2 = hp.Int(
87 |         'b', 0, 100, parent_name='a', parent_values=2, default=4)
88 |     assert b1 is None
89 |     assert b2 == 4
90 |     assert hp.values == {
91 |         'a': 2,
92 |         'a=1/b': 5,
93 |         'a=2/b': 4
94 |     }
95 | 
96 | 
97 | def test_conditional_scope():
98 |     hp = hp_module.HyperParameters()
99 |     hp.Choice('choice', [1, 2, 3], default=2)
100 |     with hp.conditional_scope('choice', [1, 3]):
101 |         child1 = hp.Choice('child_choice', [4, 5, 6])
102 |     with hp.conditional_scope('choice', 2):
103 |         child2 = hp.Choice('child_choice', [7, 8, 9])
104 |     assert hp.values == {
105 |         'choice': 2,
106 |         'choice=1,3/child_choice': 4,
107 |         'choice=2/child_choice': 7
108 |     }
109 |     # Assignment to a non-active conditional hyperparameter returns `None`.
110 |     assert child1 is None
111 |     # Assignment to an active conditional hyperparameter returns the value.
112 |     assert child2 == 7
113 | 
114 | 
115 | def test_nested_conditional_scopes_and_name_scopes():
116 |     hp = hp_module.HyperParameters()
117 |     a = hp.Choice('a', [1, 2, 3], default=2)
118 |     with hp.conditional_scope('a', [1, 3]):
119 |         b = hp.Choice('b', [4, 5, 6])
120 |         with hp.conditional_scope('b', 6):
121 |             c = hp.Choice('c', [7, 8, 9])
122 |             with hp.name_scope('d'):
123 |                 e = hp.Choice('e', [10, 11, 12])
124 |     with hp.conditional_scope('a', 2):
125 |         f = hp.Choice('f', [13, 14, 15])
126 | 
127 |     assert hp.values == {
128 |         'a': 2,
129 |         'a=1,3/b': 4,
130 |         'a=1,3/b=6/c': 7,
131 |         'a=1,3/b=6/d/e': 10,
132 |         'a=2/f': 13
133 |     }
134 |     # Assignment to an active conditional hyperparameter returns the value.
135 |     assert a == 2
136 |     assert f == 13
137 |     # Assignment to a non-active conditional hyperparameter returns `None`.
138 |     assert b is None
139 |     assert c is None
140 |     assert e is None
141 | 
142 | 
143 | def test_get_with_conditional_scopes():
144 |     hp = hp_module.HyperParameters()
145 |     hp.Choice('a', [1, 2, 3], default=2)
146 |     assert hp.get('a') == 2
147 |     with hp.conditional_scope('a', 2):
148 |         assert hp.get('a') == 2
149 | 
150 | 
151 | def test_Choice():
152 |     choice = hp_module.Choice('choice', [1, 2, 3], default=2)
153 |     choice = hp_module.Choice.from_config(choice.get_config())
154 |     assert choice.default == 2
155 |     assert choice.random_sample() in [1, 2, 3]
156 |     assert choice.random_sample(123) == choice.random_sample(123)
157 |     # No default
158 |     choice = hp_module.Choice('choice', [1, 2, 3])
159 |     assert choice.default == 1
160 |     with pytest.raises(ValueError, match='default value should be'):
161 |         hp_module.Choice('choice', [1, 2, 3], default=4)
162 | 
163 | 
164 | @pytest.mark.parametrize(
165 |     "values,ordered_arg,ordered_val",
166 |     [([1, 2, 3], True, True),
167 |      ([1, 2, 3], False, False),
168 |      ([1, 2, 3], None, True),
169 |      (['a', 'b', 'c'], False, False),
170 |      (['a', 'b', 'c'], None, False)])
171 | def test_Choice_ordered(values, ordered_arg, ordered_val):
172 |     choice = hp_module.Choice('choice', values, ordered=ordered_arg)
173 |     assert choice.ordered == ordered_val
174 |     choice_new = hp_module.Choice(**choice.get_config())
175 |     assert choice_new.ordered == ordered_val
176 | 
177 | 
178 | def test_Choice_ordered_invalid():
179 |     with pytest.raises(ValueError, match='must be `False`'):
180 |         hp_module.Choice('a', ['a', 'b'], ordered=True)
181 | 
182 | 
183 | def test_Choice_types():
184 |     values1 = ['a', 'b', 0]
185 |     with pytest.raises(TypeError, match='can contain only one'):
186 |         hp_module.Choice('a', values1)
187 |     values2 = [{'a': 1}, {'a': 2}]
188 |     with pytest.raises(TypeError, match='can contain only `int`'):
189 |         hp_module.Choice('a', values2)
190 | 
191 | 
192 | def test_Float():
193 |     # With an explicit default
194 |     linear = hp_module.Float(
195 |         'linear', min_value=0.5, max_value=9.5, default=9.)
196 |     linear = hp_module.Float.from_config(linear.get_config())
197 |     assert linear.default == 9.
198 |     assert 0.5 <= linear.random_sample() < 9.5
199 |     assert isinstance(linear.random_sample(), float)
200 |     assert linear.random_sample(123) == linear.random_sample(123)
201 | 
202 |     # No default
203 |     linear = hp_module.Float(
204 |         'linear', min_value=0.5, max_value=9.5)
205 |     assert linear.default == 0.5
206 | 
207 | 
208 | def test_sampling_arg():
209 |     f = hp_module.Float('f', 1e-20, 1e10, sampling='loguniform')
210 |     f = hp_module.Float.from_config(f.get_config())
211 |     assert f.sampling == 'loguniform'
212 | 
213 |     i = hp_module.Int('i', 0, 10, sampling='uniform')
214 |     i = hp_module.Int.from_config(i.get_config())
215 |     assert i.sampling == 'uniform'
216 | 
217 |     with pytest.raises(ValueError, match='`sampling` must be one of'):
218 |         hp_module.Int('j', 0, 10, sampling='invalid')
219 | 
220 | 
221 | def test_sampling_random_state():
222 |     f = hp_module.Float('f', 1e-3, 1e3, sampling='loguniform')
223 |     rand_sample = f.random_sample()
224 |     assert rand_sample >= f.min_value
225 |     assert rand_sample <= f.max_value
226 | 
227 |     def log_scale(x, min_value, max_value):
228 |         return math.log(x/min_value) / math.log(max_value/min_value)
229 | 
230 |     x = 1e-1
231 |     min_value, max_value = 1e-10, 1e10
232 |     # Scale x to [0, 1].
233 |     x_scaled = log_scale(x, min_value, max_value)
234 |     # Scale back.
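# (Editor's note, unnumbered, not part of the source file.) The round-trip
# below only closes if _log_sample inverts log_scale, presumably
#     _log_sample(s, lo, hi) = lo * (hi / lo) ** s,
# so x = 1e-1, pushed into [0, 1] over [1e-10, 1e10] and mapped back, should
# return to its original value within floating-point tolerance.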
235 |     x_rescaled = hp_module._log_sample(x_scaled, min_value, max_value)
236 |     assert np.allclose(x, x_rescaled)
237 | 
238 |     f = hp_module.Float('f', 1e-3, 1e3, sampling='uniform')
239 |     rand_sample = f.random_sample()
240 |     assert rand_sample >= f.min_value
241 |     assert rand_sample <= f.max_value
242 | 
243 | 
244 | def test_Int():
245 |     rg = hp_module.Int(
246 |         'rg', min_value=5, max_value=9, default=6)
247 |     rg = hp_module.Int.from_config(rg.get_config())
248 |     assert rg.default == 6
249 |     assert 5 <= rg.random_sample() < 9
250 |     assert isinstance(rg.random_sample(), int)
251 |     assert rg.random_sample(123) == rg.random_sample(123)
252 |     # No default
253 |     rg = hp_module.Int(
254 |         'rg', min_value=5, max_value=9)
255 |     assert rg.default == 5
256 | 
257 | 
258 | def test_Boolean():
259 |     # Test default default
260 |     boolean = hp_module.Boolean('bool')
261 |     assert boolean.default is False
262 |     # Test default setting
263 |     boolean = hp_module.Boolean('bool', default=True)
264 |     assert boolean.default is True
265 |     # Wrong default type
266 |     with pytest.raises(ValueError, match='must be a Python boolean'):
267 |         hp_module.Boolean('bool', default=None)
268 |     # Test serialization
269 |     boolean = hp_module.Boolean('bool', default=True)
270 |     boolean = hp_module.Boolean.from_config(boolean.get_config())
271 |     assert boolean.default is True
272 |     assert boolean.name == 'bool'
273 | 
274 |     # Test random_sample
275 |     assert boolean.random_sample() in {True, False}
276 |     assert boolean.random_sample(123) == boolean.random_sample(123)
277 | 
278 | 
279 | def test_merge():
280 |     hp = hp_module.HyperParameters()
281 |     hp.Int('a', 0, 100)
282 |     hp.Float('b', min_value=0.5, max_value=9.5, default=2)
283 | 
284 |     hp2 = hp_module.HyperParameters()
285 |     hp2.Int('a', 3, 4, default=3)
286 |     hp.Int('c', 10, 100, default=30)
287 |     hp.merge(hp2)
288 | 
289 |     assert hp.get('a') == 3
290 |     assert hp.get('b') == 2
291 |     assert hp.get('c') == 30
292 | 
293 |     hp3 = hp_module.HyperParameters()
294 |     hp3.Float('a', 3.5, 4.5)
295 |     hp3.Choice('d', [1, 2, 3], default=1)
296 | 
297 |     hp.merge(hp3, overwrite=False)
298 | 
299 |     assert hp.get('a') == 3
300 |     assert hp.get('b') == 2
301 |     assert hp.get('c') == 30
302 |     assert hp.get('d') == 1
303 | 
304 | 
305 | def _sort_space(hps):
306 |     space = hps.get_config()['space']
307 |     return sorted(space, key=lambda hp: hp['config']['name'])
308 | 
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_oracle.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from autorecsys.searcher.core.oracle import Oracle, Objective
5 | from autorecsys.searcher.core import hyperparameters as hps_module
6 | from autorecsys.searcher.core import trial as trial_module
7 | 
8 | from tensorflow.keras import metrics
9 | 
10 | 
11 | @pytest.fixture(scope='function')
12 | def tmp_dir(tmpdir_factory):
13 |     return tmpdir_factory.mktemp('oracle_test', numbered=True)
14 | 
15 | 
16 | class OracleTest(Oracle):
17 |     def _populate_space(self, trial_id):
18 |         return {'status': trial_module.TrialStatus.IDLE,
19 |                 'values': self.hyperparameters.values}
20 | 
21 | def test_oracle(tmp_dir):
22 |     hps = hps_module.HyperParameters()
23 |     hps.Choice('iyo_koiyo', values=[1, 2, 3, 4, 5, 6], ordered=False)
24 |     oracle_tst = OracleTest(objective=['mse', 'auc_roc_score'], max_trials=50, hyperparameters=hps)
25 |     assert oracle_tst.objective == [Objective(name='mse', direction='min'), Objective(name='auc_roc_score', direction='min')]
26 |     trial1 = oracle_tst.create_trial(tuner_id='114514')
27 |     trial2 = oracle_tst.create_trial(tuner_id='114514')
28 |     oracle_tst.set_project_dir(directory=tmp_dir, project_name='test', overwrite=False)
29 |     oracle_tst.save()
30 |     assert os.path.exists(os.path.join(tmp_dir, oracle_tst._get_oracle_fname()))
31 |     oracle_tst._save_trial(trial1)
32 |     oracle_tst._save_trial(trial2)
33 |     assert os.path.exists(os.path.join(oracle_tst._project_dir, f'trial_{trial1.trial_id}'))
34 |     assert os.path.exists(os.path.join(oracle_tst._project_dir, f'trial_{trial2.trial_id}'))
35 |     oracle_tst.reload()
36 |     assert all(_id in oracle_tst.trials for _id in [trial1.trial_id, trial2.trial_id])
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_trial.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from autorecsys.utils import metric
5 | from autorecsys.searcher.core import hyperparameters as hps_module
6 | from autorecsys.searcher.core import trial as trial_module
7 | 
8 | from tensorflow.keras import metrics
9 | 
10 | 
11 | @pytest.fixture(scope='function')
12 | def tmp_dir(tmpdir_factory):
13 |     return tmpdir_factory.mktemp('trial_test', numbered=True)
14 | 
15 | 
16 | @pytest.mark.skip(reason="TODO Later")
17 | def test_register_from_metrics():
18 |     # As well as direction inference.
19 |     tracker = metric.MetricsTracker(
20 |         metrics=[metrics.CategoricalAccuracy(),
21 |                  metrics.MeanSquaredError()]
22 |     )
23 |     assert set(tracker.metrics.keys()) == {'categorical_accuracy',
24 |                                            'mean_squared_error'}
25 |     assert tracker.metrics['categorical_accuracy'].direction == 'max'
26 |     assert tracker.metrics['mean_squared_error'].direction == 'min'
27 | 
28 | 
29 | def test_register():
30 |     tracker = metric.MetricsTracker()
31 |     tracker.register('new_metric', direction='max')
32 |     assert set(tracker.metrics.keys()) == {'new_metric'}
33 |     assert tracker.metrics['new_metric'].direction == 'max'
34 |     with pytest.raises(ValueError,
35 |                        match='`direction` should be one of'):
36 |         tracker.register('another_metric', direction='wrong')
37 |     with pytest.raises(ValueError,
38 |                        match='already exists'):
39 |         tracker.register('new_metric', direction='max')
40 | 
41 | 
42 | def test_exists():
43 |     tracker = metric.MetricsTracker()
44 |     tracker.register('new_metric', direction='max')
45 |     assert tracker.exists('new_metric')
46 |     assert not tracker.exists('another_metric')
47 | 
48 | 
49 | def test_update():
50 |     tracker = metric.MetricsTracker()
51 |     tracker.update('new_metric', 0.5)  # automatic registration
52 |     assert set(tracker.metrics.keys()) == {'new_metric'}
53 |     assert tracker.metrics['new_metric'].direction == 'min'  # default direction
54 |     assert (tracker.get_history('new_metric') ==
55 |             [metric.MetricObservation(0.5, step=0)])
56 | 
57 | 
58 | def test_get_history():
59 |     tracker = metric.MetricsTracker()
60 |     tracker.update('new_metric', 0.5, step=0)
61 |     tracker.update('new_metric', 1.5, step=1)
62 |     tracker.update('new_metric', 2., step=2)
63 |     assert tracker.get_history('new_metric') == [
64 |         metric.MetricObservation(0.5, 0),
65 |         metric.MetricObservation(1.5, 1),
66 |         metric.MetricObservation(2., 2),
67 |     ]
68 |     with pytest.raises(ValueError, match='Unknown metric'):
69 |         tracker.get_history('another_metric')
70 | 
71 | 
72 | def test_get_last_value():
73 |     tracker = metric.MetricsTracker()
74 |     tracker.register('new_metric', 'min')
75 |     assert tracker.get_last_value('new_metric') is None
76 |     tracker.set_history(
77 |         'new_metric',
78 |         [metric.MetricObservation(1., 0),
79 |          metric.MetricObservation(2., 1),
80 |          metric.MetricObservation(3., 2)])
81 |     assert tracker.get_last_value('new_metric') == 3.
82 | 
83 | 
84 | def test_serialization():
85 |     tracker = metric.MetricsTracker()
86 |     tracker.register('metric_min', 'min')
87 |     tracker.register('metric_max', 'max')
88 |     tracker.set_history(
89 |         'metric_min',
90 |         [metric.MetricObservation(1., 0),
91 |          metric.MetricObservation(2., 1),
92 |          metric.MetricObservation(3., 2)])
93 |     tracker.set_history(
94 |         'metric_max',
95 |         [metric.MetricObservation(1., 0),
96 |          metric.MetricObservation(2., 1),
97 |          metric.MetricObservation(3., 2)])
98 | 
99 |     new_tracker = metric.MetricsTracker.from_config(
100 |         tracker.get_config())
101 |     assert new_tracker.metrics.keys() == tracker.metrics.keys()
102 | 
103 | 
104 | def test_trial():
105 |     hps = hps_module.HyperParameters()
106 |     hps.Int('a', 0, 10, default=3)
107 |     trial = trial_module.Trial(
108 |         hps, trial_id='trial1', status='COMPLETED')
109 |     trial.metrics.register('score', direction='max')
110 |     trial.metrics.update('score', 10, step=1)
111 |     assert len(trial.hyperparameters.space) == 1
112 |     _trial = trial_module.Trial.from_state(trial.get_state())
113 |     assert _trial.hyperparameters.get('a') == 3
114 |     assert _trial.trial_id == 'trial1'
115 |     assert _trial.score is None
116 |     assert _trial.best_step is None
117 |     assert _trial.metrics.get_best_value('score') == 10
118 |     assert _trial.metrics.get_history('score') == [metric.MetricObservation(10, step=1)]
119 | 
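Read together, the tests above fix the whole `MetricsTracker` contract. A compact editorial sketch (names exactly as used above; the `step=0` and `direction='min'` defaults are read off the assertions rather than any documentation):

```python
from autorecsys.utils import metric

tracker = metric.MetricsTracker()
tracker.update('val_mse', 0.9)            # auto-registers; default direction='min', step=0
tracker.update('val_mse', 0.7, step=1)
assert tracker.exists('val_mse')
assert tracker.get_last_value('val_mse') == 0.7
assert tracker.get_history('val_mse') == [
    metric.MetricObservation(0.9, 0),
    metric.MetricObservation(0.7, 1),
]
```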
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_tuner.py:
--------------------------------------------------------------------------------
1 | # TODO
--------------------------------------------------------------------------------
/tests/searcher_tests/searchers_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
4 | @pytest.fixture(scope='module')
5 | def tmp_dir(tmpdir_factory):
6 |     return tmpdir_factory.mktemp('searcher_test')
7 | 
8 | 
9 | def test_randomsearch(tmp_dir):
10 |     # TODO
11 |     pass
12 | 
13 | 
--------------------------------------------------------------------------------
/tests/utils_test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/utils_test/__init__.py
--------------------------------------------------------------------------------
/tests/utils_test/test.csv:
--------------------------------------------------------------------------------
1 | Sample 1 2 3 4 5
2 | 
--------------------------------------------------------------------------------
/tests/utils_test/test_common.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 | from autorecsys.utils.common import (
4 |     set_device,
5 |     # dataset_shape,
6 |     to_snake_case,
7 |     create_directory,
8 |     load_dataframe_input,
9 |     set_seed,
10 |     save_pickle,
11 |     load_pickle,
12 | )
13 | import tensorflow as tf
14 | from tensorflow.python.client import device_lib
15 | import pandas as pd
16 | import numpy as np
17 | import random
18 | import unittest
19 | 
20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
21 | 
22 | class test_common(unittest.TestCase):
23 |     device_info = "cpu:0"
24 |     def test_set_cpu(self):
25 |         set_device("cpu:0")
26 |         # sanity check: TF still exposes at least one physical device after pinning to the CPU
27 |         assert (len(tf.config.experimental.list_physical_devices()) > 0)
28 | 
29 |     def test_to_snake_case(self):
30 |         temp = to_snake_case("i am a string")
31 |         assert(temp == "i_am_a_string")
32 |         temp = to_snake_case("_i am a private string")
33 |         assert(temp == "private_i_am_a_private_string")
34 |         temp = to_snake_case("IAmStringWithCaps")
35 |         assert(temp == "i_am_string_with_caps")
36 |         temp = to_snake_case("I#am%a&string(with*special+characters")
37 |         assert(temp == "i_am_a_string_with_special_characters")
38 |         temp = to_snake_case("MLPInteractor")
39 |         assert(temp == "mlp_interactor")
40 | 
41 |     # Creates a directory and checks that it now exists
42 |     def test_create_directory(self):
43 |         assert(not os.path.exists("test_dir"))
44 |         create_directory("test_dir")
45 |         assert(os.path.exists("test_dir"))
46 | 
47 |     # Tests load_dataframe_input() on five kinds of input
48 |     def test_load_dataframe_input(self):
49 |         # Test for a pandas DataFrame
50 |         assert(isinstance(load_dataframe_input(pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})), pd.DataFrame))
51 | 
52 |         # Test for np.ndarray: 1-D gives a Series, 2-D a DataFrame
53 |         assert(isinstance(load_dataframe_input(np.array([1, 2, 3])), pd.Series))
54 | 
55 |         assert(isinstance(load_dataframe_input(np.array([[1, 2, 3], [4, 2, 5]])), pd.DataFrame))
56 |         # Test for strings: an unsupported path should raise
57 | 
58 |         with pytest.raises(TypeError):
59 |             load_dataframe_input("wrong_file.exe")
60 | 
61 |         # while a readable CSV path should come back as a DataFrame
62 |         assert(isinstance(load_dataframe_input("test.csv"), pd.DataFrame))
63 | 
64 |     # Sets each seed, then checks that reseeding reproduces the same output
65 |     def test_set_seed(self):
66 |         set_seed(10)
67 |         temp = random.random()
68 |         random.seed(10)
69 |         assert(random.random() == temp)
70 | 
71 |         temp = np.random.rand(1)
72 |         np.random.seed(10)
73 |         assert(np.random.rand(1) == temp)
74 | 
75 |         temp = tf.random.uniform([1])
76 |         tf.random.set_seed(10)
77 |         assert(tf.random.uniform([1]) == temp)
78 | 
79 |     # Test the save/load pickle round-trip
80 |     def test_save_pickle(self):
81 |         save_pickle("test_pickle", {"lion": "yellow", "kitty": "red"})
82 |         assert(os.path.exists("test_pickle"))
83 | 
84 |     def test_load_pickle(self):
85 |         save_pickle("test_pickle", {"lion": "yellow", "kitty": "red"})
86 |         temp = load_pickle("test_pickle")
87 |         assert(temp == {"lion": "yellow", "kitty": "red"})
--------------------------------------------------------------------------------
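To close the section: the `autorecsys.utils.common` contracts that `test_common` establishes reduce to a handful of one-liners. An editorial recap sketch (same functions and semantics as asserted above; the pickle file name is illustrative):

```python
import random
from autorecsys.utils.common import to_snake_case, set_seed, save_pickle, load_pickle

assert to_snake_case('MLPInteractor') == 'mlp_interactor'

set_seed(10)                     # seeds Python's random, NumPy, and TensorFlow together
first = random.random()
set_seed(10)
assert random.random() == first  # reseeding reproduces the same draw

save_pickle('params.pkl', {'embedding_dim': 64})
assert load_pickle('params.pkl') == {'embedding_dim': 64}
```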