├── .gitignore
├── README.md
├── autorecsys
│   ├── __init__.py
│   ├── auto_search.py
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── graph.py
│   │   ├── interactor.py
│   │   ├── mapper.py
│   │   ├── node.py
│   │   ├── optimizer.py
│   │   ├── preprocessor.py
│   │   └── utils.py
│   ├── recommender.py
│   ├── searcher
│   │   ├── __init__.py
│   │   ├── core
│   │   │   ├── __init__.py
│   │   │   ├── hyperparameters.py
│   │   │   ├── oracle.py
│   │   │   ├── trial.py
│   │   │   └── utils.py
│   │   └── tuners
│   │       ├── __init__.py
│   │       ├── bayesian.py
│   │       ├── greedy.py
│   │       ├── randomsearch.py
│   │       └── tuner.py
│   └── utils
│       ├── __init__.py
│       ├── common.py
│       ├── display.py
│       └── metric.py
├── docs
│   ├── autogen.py
│   ├── index.md
│   ├── mkdocs.yml
│   ├── readme.md
│   ├── requirements.txt
│   ├── templates
│   │   ├── about.md
│   │   ├── benchmark.md
│   │   ├── index.md
│   │   └── install.md
│   └── tutobooks.py
├── examples
│   ├── README.md
│   ├── ctr_autoint.py
│   ├── ctr_autorec.py
│   ├── ctr_benchmark.py
│   ├── ctr_crossnet.py
│   ├── ctr_deepfm.py
│   ├── ctr_dlrm.py
│   ├── ctr_neumf.py
│   ├── example_datasets
│   │   ├── avazu
│   │   │   └── train-10k
│   │   ├── criteo
│   │   │   └── train-10k.txt
│   │   ├── movielens
│   │   │   └── ratings-10k.dat
│   │   └── netflix
│   │       └── combined_data_1-10k.txt
│   ├── rp_autorec.py
│   ├── rp_benchmark.py
│   ├── rp_mf.py
│   └── rp_neumf.py
├── mkdocs.yml
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── common.py
    ├── integration_tests.py
    ├── integration_tests
    │   └── test_models.py
    ├── pipeline_tests
    │   ├── __init__.py
    │   ├── test_graph.py
    │   ├── test_interactor.py
    │   ├── test_mapper.py
    │   ├── test_node.py
    │   ├── test_optimizer.py
    │   ├── test_preprocessor.py
    │   └── test_utils.py
    ├── searcher_tests
    │   ├── __init__.py
    │   ├── core_tests
    │   │   ├── __init__.py
    │   │   ├── test_hyperparameters.py
    │   │   ├── test_oracle.py
    │   │   ├── test_trial.py
    │   │   └── test_tuner.py
    │   └── searchers_test.py
    └── utils_test
        ├── __init__.py
        ├── test.csv
        └── test_common.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /tests/datasets/*
2 | /examples/datasets/*
3 | .idea/*
4 | .DS_Store
5 | *.pyc
6 | /tests/tmp_autokaggle-tmp-3/
7 | /examples/old/
8 | __pycache__/
9 | /tests/config/
10 | /.pytest_cache/
11 | ._*
12 | test.py
13 | *.sh
14 | *.log.txt
15 | *.log
16 | _rp_benchmark_latest.py
17 | _rp_benchmark_10m.py
18 | search_1/
19 |
20 | # MKdocs
21 | /docs/sources
22 | /docs/site
23 |
24 | # Local test scripts
25 | ctr_benchmark-gpu*.py
26 | examples/netflix.py
27 | examples/ctr_deepfm_test_criteo.py
28 | examples/ctr_deepfm_test_avazu.py
29 | examples/ctr_test_criteo.py
30 |
31 | # Others
32 | autorecsys/utils/config.py
33 | examples/example_datasets/netflix/combined_data_1-10k.csv
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoRec
2 |
3 |
4 | AutoRec is a Keras-based implementation of automated recommendation algorithms for both the rating prediction and click-through rate (CTR) prediction tasks.
5 |
6 |
7 | For more details, see the [Documentation](http://autorec.ai).
8 |
9 |
10 | ## Installation
11 | Install from `pip`:
12 | ```
13 | pip install autorec
14 | ```
15 |
16 |
17 | ## Quickstart
18 | Building a rating prediction model that automatically searches for its architecture on the MovieLens dataset is as easy as the following:
19 | ```python
20 | # -*- coding: utf-8 -*-
21 | import tensorflow as tf
22 | from autorecsys.auto_search import Search
23 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, ElementwiseInteraction
24 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor, NetflixPrizePreprocessor
25 | from autorecsys.recommender import RPRecommender
26 |
27 | # load dataset
28 | # MovieLens 1M Dataset
29 | data = MovielensPreprocessor("./examples/datasets/ml-1m/ratings.dat")
30 | data.preprocessing(val_test_size=0.1, random_state=1314)
31 | train_X, train_y = data.train_X, data.train_y
32 | val_X, val_y = data.val_X, data.val_y
33 | test_X, test_y = data.test_X, data.test_y
34 | user_num, item_num = data.user_num, data.item_num
35 |
36 | # build the pipeline.
37 | input = Input(shape=[2])
38 | user_emb = LatentFactorMapper(column_id=0,
39 | num_of_entities=user_num,
40 | embedding_dim=64)(input)
41 | item_emb = LatentFactorMapper(column_id=1,
42 | num_of_entities=item_num,
43 | embedding_dim=64)(input)
44 | output = ElementwiseInteraction(elementwise_type="innerproduct")([user_emb, item_emb])
45 | output = RatingPredictionOptimizer()(output)
46 | model = RPRecommender(inputs=input, outputs=output)
47 |
48 | # AutoML search and predict
49 | searcher = Search(model=model,
50 | tuner='greedy', # 'random', 'greedy', or 'bayesian'
51 | tuner_params={"max_trials": 5}
52 | )
53 |
54 | searcher.search(x=train_X,
55 | y=train_y,
56 | x_val=val_X,
57 | y_val=val_y,
58 | objective='val_mse',
59 | batch_size=1024,
60 | epochs=10,
61 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
62 | ```
63 |
--------------------------------------------------------------------------------
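Continuing the quickstart, the best model found by the search can then be used for prediction and evaluation. The snippet below is a minimal sketch that assumes `searcher`, `test_X`, and `test_y` from the quickstart are still in scope:

```python
# Minimal sketch, assuming `searcher`, `test_X`, and `test_y`
# from the quickstart above are in scope.
y_pred = searcher.predict(x=test_X)                  # predictions from the best searched model
score = searcher.evaluate(x=test_X, y_true=test_y)   # scored with the metric named in the objective
print(score)
```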
/autorecsys/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/__init__.py
--------------------------------------------------------------------------------
/autorecsys/auto_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import os
4 | import logging
5 | import tempfile
6 | import tensorflow as tf
7 |
8 | from autorecsys.utils.common import to_snake_case, create_directory, load_dataframe_input
9 | from autorecsys.searcher.tuners.tuner import METRIC, PipeTuner
10 | from autorecsys.searcher import tuners
11 | from autorecsys.recommender import CTRRecommender, RPRecommender
12 |
13 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 |
18 | class Search(object):
19 | """ A search object to search on a Recommender HyperModel (CTRRecommender/RPRecommender)
20 | defined by inputs and outputs.
21 |
22 | ``Search`` combines a Recommender and a Tuner to tune the Recommender. The user can
23 | use ``search()`` to perform the search and then use the best discovered model much
24 | like a Keras model, since it also has `fit()`/`predict()`/`evaluate()` methods.
25 | The user should input a Recommender HyperModel (CTRRecommender/RPRecommender) and a
26 | selected tuning method to initialize the ``Search`` object, and pass the dataset when
27 | calling the ``search`` method to discover the best architecture.
28 |
29 | # Arguments
30 | model: A Recommender HyperModel (CTRRecommender/RPRecommender).
31 | name: String. The name of the project, which is used for saving and loading purposes.
32 | tuner: String. The name of the tuner. It should be one of 'greedy', 'bayesian' or
33 | 'random'. Defaults to 'random'.
34 |
35 |
36 | tuner_params: Dict. The hyperparameters of the tuner. The common ones are:
37 | 'max_trials': Int. The maximum number of search trials.
38 | 'overwrite': Boolean. Whether to overwrite an existing
39 | tuner or not.
40 |
41 | directory: String. The path to a directory for storing the search outputs.
42 | Defaults to None, which would create a folder with the name of the
43 | project in the current directory, i.e., ``directory/name``.
44 | overwrite: Boolean. Defaults to `True`. Whether to overwrite an existing
45 | project with the name defined as ``directory/name`` or not.
46 | """
47 | def __init__(self, model=None, name=None, tuner='random', tuner_params=None, directory='.', overwrite=True):
48 | self.pipe = model
49 | self.tuner = tuner
50 | self.tuner_params = tuner_params
51 | if not name:
52 | prefix = self.__class__.__name__
53 | name = prefix + '_' + str(tf.keras.backend.get_uid(prefix))
54 | name = to_snake_case(name)
55 | self.name = name
56 | directory = directory or tempfile.gettempdir()
57 | self.dir = os.path.join(directory, self.name)
58 |
59 | self.overwrite = overwrite
60 | create_directory(self.dir, remove_existing=overwrite)
61 | self.logger = logging.getLogger(self.name)
62 | self.logger.info('Project directory: {}'.format(self.dir))
63 | self.best_keras_graph = None
64 | self.best_model = None
65 | self.need_fully_train = False
66 |
67 | def search(self, x=None, y=None, x_val=None, y_val=None, objective='mse', **fit_kwargs):
68 | """Search the best deep recommendation model.
69 |
70 | # Arguments
71 | x: numpy array. Training features.
72 | y: numpy array. Training targets.
73 | x_val: numpy array. Validation features.
74 | y_val: numpy array. Validation targets.
75 | objective: String. Name of model metric to minimize or maximize,
76 | e.g. 'val_BinaryCrossentropy'. Defaults to 'mse'.
77 | **fit_kwargs: Any arguments supported by the fit method of a Keras model such as:
78 | ``batch_size``, ``epochs``, ``callbacks``.
79 | """
80 |
81 | # overwrite the objective
82 | self.objective = objective
83 | tuner = self._build_tuner(self.tuner, self.tuner_params)
84 |
85 | # TODO search on a small piece of train data, currently it uses whole train data
86 | tuner.search(x=x, y=y, x_val=x_val, y_val=y_val, **fit_kwargs)
87 | # show the search space
88 | tuner.search_space_summary()
89 | # show the search results
90 | tuner.results_summary()
91 | best_pipe_lists = tuner.get_best_models(1)
92 | # len(best_pipe_lists) == 0 means that this pipeline does not have tunable parameters
93 | self.best_model = best_pipe_lists[0]
94 | return self.best_model
95 |
96 | def _build_tuner(self, tuner, tuner_params):
97 | """Build a tuner based on its name and hyperparameters.
98 |
99 | # Arguments
100 | tuner: String. The name of the tuner. It should be one of 'greedy', 'bayesian' or
101 | 'random'. Defaults to 'random'.
102 |
103 | tuner_params: Dict. The hyperparameters of the tuner. The common ones are:
104 | 'max_trials': Int. The maximum number of search trials.
105 | 'overwrite': Boolean. Whether to overwrite an existing
106 | tuner or not.
107 | """
108 | tuner_cls = tuners.get_tuner_class(tuner)
109 | hps = self.pipe.get_hyperparameters()
110 | tuner = tuner_cls(hypergraph=self.pipe,
111 | objective=self.objective,
112 | hyperparameters=hps,
113 | directory=self.dir,
114 | **tuner_params)
115 | return tuner
116 |
117 | def predict(self, x):
118 | """Use the best searched model to conduct prediction on the dataset x.
119 |
120 | # Arguments
121 | x: numpy array / data frame / string path of a csv file.
122 | Features used to do the prediction.
123 | """
124 | if isinstance(self.pipe, RPRecommender):
125 | x = load_dataframe_input(x)
126 | return self.best_model.predict(x)
127 |
128 | def evaluate(self, x, y_true):
129 | """Evaluate the best searched model.
130 |
131 | # Arguments
132 | x: numpy array / data frame / string path of a csv file.
133 | Features used to do the prediction.
134 | y_true: numpy array / data frame / string path of a csv file.
135 | Ground-truth labels.
136 | """
137 | y_pred = self.predict(x)
138 | score_func = METRIC[self.objective.split('_')[-1]]
139 | y_true = load_dataframe_input(y_true)
140 | y_true = y_true.values.reshape(-1, 1)
141 | self.logger.info(f'evaluate prediction results using {self.objective}')
142 | return score_func(y_true, y_pred)
143 |
--------------------------------------------------------------------------------
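To make the `Search` workflow above concrete, here is a minimal sketch of driving it end to end with a `CTRRecommender`. The pipeline wiring (`input_node`, `output_node`) and the numpy splits are assumptions standing in for the full scripts under `examples/`:

```python
from autorecsys.auto_search import Search
from autorecsys.recommender import CTRRecommender

# Assumes `input_node`/`output_node` were wired with pipeline blocks and
# that the train/validation splits are numpy arrays, as in examples/ctr_*.py.
model = CTRRecommender(inputs=input_node, outputs=output_node)
searcher = Search(model=model,
                  tuner='bayesian',  # 'random', 'greedy', or 'bayesian'
                  tuner_params={'max_trials': 3, 'overwrite': True})
searcher.search(x=train_X, y=train_y,
                x_val=val_X, y_val=val_y,
                objective='val_BinaryCrossentropy',
                batch_size=256, epochs=2)
```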
/autorecsys/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from autorecsys.pipeline.mapper import LatentFactorMapper, DenseFeatureMapper, SparseFeatureMapper
2 | from autorecsys.pipeline.interactor import MLPInteraction, ConcatenateInteraction, FMInteraction,\
3 | ElementwiseInteraction, CrossNetInteraction, SelfAttentionInteraction, HyperInteraction, InnerProductInteraction
4 | from autorecsys.pipeline.optimizer import RatingPredictionOptimizer, CTRPredictionOptimizer
5 | from autorecsys.pipeline.node import Input, StructuredDataInput
6 |
--------------------------------------------------------------------------------
/autorecsys/pipeline/base.py:
--------------------------------------------------------------------------------
1 | import types
2 | import tensorflow as tf
3 | from autorecsys.searcher.core import hyperparameters as hp_module
4 | from autorecsys.searcher.core.trial import Stateful
5 | from autorecsys.utils.common import to_snake_case
6 | from tensorflow.python.util import nest
7 |
8 |
9 | class Node(Stateful):
10 | """The nodes in a network connecting the blocks."""
11 |
12 | def __init__(self, shape=None):
13 | super().__init__()
14 | self.in_blocks = []
15 | self.out_blocks = []
16 | self.shape = shape
17 |
18 | def add_in_block(self, hypermodel):
19 | self.in_blocks.append(hypermodel)
20 |
21 | def add_out_block(self, hypermodel):
22 | self.out_blocks.append(hypermodel)
23 |
24 | def build(self):
25 | return tf.keras.Input(shape=self.shape)
26 |
27 | def get_state(self):
28 | return {'shape': self.shape}
29 |
30 | def set_state(self, state):
31 | self.shape = state['shape']
32 |
33 |
34 | class HyperModel(object):
35 | """Defines a searchable space of Models and builds Models from this space.
36 | # Attributes:
37 | name: The name of this HyperModel.
38 | tunable: Whether the hyperparameters defined in this hypermodel
39 | should be added to search space. If `False`, either the search
40 | space for these parameters must be defined in advance, or the
41 | default values will be used.
42 | """
43 |
44 | def __init__(self, name=None, tunable=True):
45 | self.name = name
46 | self.tunable = tunable
47 |
48 | self._build = self.build
49 | self.build = self._build_wrapper
50 |
51 | def build(self, hp):
52 | """Builds a model.
53 | # Arguments:
54 | hp: A `HyperParameters` instance.
55 | # Returns:
56 | A model instance.
57 | """
58 | raise NotImplementedError
59 |
60 | def _build_wrapper(self, hp, *args, **kwargs):
61 | if not self.tunable:
62 | # Copy `HyperParameters` object so that new entries are not added
63 | # to the search space.
64 | hp = hp.copy()
65 | return self._build(hp, *args, **kwargs)
66 |
67 |
68 | class Block(HyperModel, Stateful):
69 | def __init__(self, name=None, **kwargs):
70 | super().__init__(**kwargs)
71 | self.fixed_params = None
72 | self.tunable_candidates = None
73 | if not name:
74 | prefix = self.__class__.__name__
75 | name = prefix + '_' + str(tf.keras.backend.get_uid(prefix))
76 | name = to_snake_case(name)
77 | self._hyperparameters = None
78 | self.name = name
79 | self.inputs = None
80 | self.outputs = None
81 | self._num_output_node = 1
82 |
83 | def __new__(cls, *args, **kwargs):
84 | obj = super().__new__(cls)
85 | build_fn = obj.build
86 |
87 | def build_wrapper(obj, hp, *args, **kwargs):
88 | with hp.name_scope(obj.name):
89 | return build_fn(hp, *args, **kwargs)
90 |
91 | obj.build = types.MethodType(build_wrapper, obj)
92 | return obj
93 |
94 | def __str__(self):
95 | return self.name
96 |
97 | @property
98 | def hyperparameters(self):
99 | return self._hyperparameters
100 |
101 | def __call__(self, inputs):
102 | """Functional API.
103 | # Arguments
104 | inputs: A list of input node(s) or a single input node for the block.
105 | # Returns
106 | list: A list of output node(s) of the Block.
107 | """
108 | inputs = nest.flatten(inputs)
109 | self.inputs = inputs
110 | for input_node in self.inputs:
111 | if not isinstance(input_node, Node):
112 | raise TypeError('Expect the inputs to layer {name} to be '
113 | 'a Node, but got {type}.'.format(
114 | name=self.name,
115 | type=type(input_node)))
116 | input_node.add_out_block(self)
117 | self.outputs = []
118 | for _ in range(self._num_output_node):
119 | output_node = Node()
120 | output_node.add_in_block(self)
121 | self.outputs.append(output_node)
122 | return self.outputs
123 |
124 | def get_state(self):
125 | """Get the configuration of the preprocessor.
126 | # Returns
127 | A dictionary of configurations of the preprocessor.
128 | """
129 | return {'name': self.name}
130 |
131 | def set_state(self, state):
132 | """Set the configuration of the preprocessor.
133 | # Arguments
134 | state: A dictionary of the configurations of the preprocessor.
135 | """
136 | if 'name' in state:
137 | self.name = state['name']
138 |
139 |
140 | class HyperBlock(Block):
141 | """HyperBlock uses hyperparameters to decide inner Block graph.
142 | A HyperBlock should be built into connected Blocks instead of individual Keras
143 | layers. The main purpose of creating the HyperBlock class is for the ease of
144 | parsing the graph for preprocessors. The graph would be hard to parse if a Block,
145 | whose inner structure is decided by hyperparameters dynamically, contains both
146 | preprocessors and Keras layers.
147 | When the preprocessing layers of Keras are ready to cover all the preprocessors
148 | in AutoKeras, the preprocessors should be handled by the Keras Model. The
149 | HyperBlock class should be removed. The subclasses should extend Block class
150 | directly and the build function should build connected Keras layers instead of
151 | Blocks.
152 | # Arguments
153 | output_shape: Tuple of int(s). Defaults to None. If None, the output shape
154 | will be inferred from the AutoModel.
155 | name: String. The name of the block. If unspecified, it will be set
156 | automatically with the class name.
157 | """
158 |
159 | def __init__(self, output_shape=None, **kwargs):
160 | super().__init__(**kwargs)
161 | self.output_shape = output_shape
162 |
163 | def build(self, hp, inputs=None):
164 | """Build the HyperModel instead of Keras Model.
165 | # Arguments
166 | hp: HyperParameters. The hyperparameters for building the model.
167 | inputs: A list of instances of Node.
168 | # Returns
169 | A Node instance, the output node of the output Block.
170 | """
171 | raise NotImplementedError
172 |
173 |
174 | class Preprocessor(Block):
175 | """Hyper preprocessing block base class.
176 | It extends Block which extends Hypermodel. A preprocessor is a Hypermodel, which
177 | means it is a search space. However, different from other Hypermodels, it is
178 | also a model which can be fit.
179 | """
180 |
181 | def build(self, hp):
182 | """Get the values of the required HyperParameters.
183 | It does not build and return a Keras Model, but initialize the
184 | HyperParameters for the preprocessor to be fit.
185 | """
186 | pass
187 |
188 | def update(self, x, y=None):
189 | """Incrementally fit the preprocessor with a single training instance.
190 | # Arguments
191 | x: EagerTensor. A single instance in the training dataset.
192 | y: EagerTensor. The targets of the tasks. Defaults to None.
193 | """
194 | raise NotImplementedError
195 |
196 | def transform(self, x, fit=False):
197 | """Incrementally fit the preprocessor with a single training instance.
198 | # Arguments
199 | x: EagerTensor. A single instance in the training dataset.
200 | fit: Boolean. Whether it is in fit mode.
201 | Returns:
202 | A transformed instance which can be converted to a tf.Tensor.
203 | """
204 | raise NotImplementedError
205 |
206 | def output_types(self):
207 | """The output types of the transformed data, e.g. tf.int64.
208 | The output types are required by tf.py_function, which is used to transform
209 | the dataset into a new one with a map function.
210 | # Returns
211 | A tuple of data types.
212 | """
213 | raise NotImplementedError
214 |
215 | @property
216 | def output_shape(self):
217 | """The output shape of the transformed data.
218 | The output shape is needed to build the Keras Model from the AutoModel.
219 | The output shape of the preprocessor is the input shape of the Keras Model.
220 | # Returns
221 | A tuple of int(s) or a TensorShape.
222 | """
223 | raise NotImplementedError
224 |
225 | def finalize(self):
226 | """Training process of the preprocessor after update with all instances."""
227 | pass
228 |
229 | def get_config(self):
230 | """Get the configuration of the preprocessor.
231 | # Returns
232 | A dictionary of configurations of the preprocessor.
233 | """
234 | return {}
235 |
236 | def set_config(self, config):
237 | """Set the configuration of the preprocessor.
238 | # Arguments
239 | config: A dictionary of the configurations of the preprocessor.
240 | """
241 | pass
242 |
243 | def get_weights(self):
244 | """Get the trained weights of the preprocessor.
245 | # Returns
246 | A dictionary of trained weights of the preprocessor.
247 | """
248 | return {}
249 |
250 | def set_weights(self, weights):
251 | """Set the trained weights of the preprocessor.
252 | # Arguments
253 | weights: A dictionary of trained weights of the preprocessor.
254 | """
255 | pass
256 |
257 | def get_state(self):
258 | state = super().get_state()
259 | state.update(self.get_config())
260 | return {'config': state,
261 | 'weights': self.get_weights()}
262 |
263 | def set_state(self, state):
264 | self.set_config(state['config'])
265 | super().set_state(state['config'])
266 | self.set_weights(state['weights'])
267 |
--------------------------------------------------------------------------------
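The `Block` base class above is what user-facing components subclass: `__call__` wires `Node` objects into the graph, while `build(hp, inputs)` later consumes real Keras tensors and may register hyperparameters through `hp`. As a minimal sketch, a hypothetical one-layer block with a tunable width could look like this:

```python
import tensorflow as tf

from autorecsys.pipeline.base import Block


class DenseBlock(Block):
    """A hypothetical one-layer MLP block with a tunable width (illustration only)."""

    def build(self, hp, inputs=None):
        # At build time, `inputs` is a list of Keras tensors.
        input_node = tf.concat(inputs, axis=1)
        # Register a searchable hyperparameter, as the built-in mappers do.
        units = hp.Choice('units', [16, 32, 64], default=32)
        return tf.keras.layers.Dense(units, activation='relu')(input_node)
```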
/autorecsys/pipeline/mapper.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import tensorflow as tf
4 | from autorecsys.pipeline.base import Block
5 |
6 |
7 | class LatentFactorMapper(Block):
8 | """ This module maps the user (item) entity into embeddings (latent factors).
9 |
10 | # Note
11 | Data-wise, the similarity between class LatentFactorMapper and class SparseFeatureMapper is that both user (item)
12 | identifiers and indexed categorical data are sparse and devoid of numerical meaning.
13 | Functionally, the difference between class LatentFactorMapper and class SparseFeatureMapper is that they handle one
14 | sparse column (either user or item) and multiple sparse columns (categorical features), respectively.
15 | In terms of nomenclature, the difference between class LatentFactorMapper and class SparseFeatureMapper is to
16 | distinguish the host of features (user and item) from the features themselves.
17 | The use of the term "latent factor" can be traced back to early matrix factorization models for recommendation,
18 | which involve only user and item.
19 | Reference: https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf
20 |
21 | # Arguments
22 | column_id (int): The index of the user (item) entity column.
23 | num_of_entities (int): The number of the user (item) entities.
24 | embedding_dim (int): The dimension of the embeddings (latent factors).
25 |
26 | # Attributes
27 | column_id (int): The index of the user (item) entity column.
28 | num_of_entities (int): The number of the user (item) entities.
29 | embedding_dim (int): The dimension of the embeddings (latent factors).
30 | """
31 |
32 | def __init__(self,
33 | column_id=None,
34 | num_of_entities=None,
35 | embedding_dim=None,
36 | **kwargs):
37 | super().__init__(**kwargs)
38 | self.column_id = column_id
39 | self.num_of_entities = num_of_entities
40 | self.embedding_dim = embedding_dim
41 |
42 | def get_state(self):
43 | state = super().get_state()
44 | state.update({
45 | 'column_id': self.column_id,
46 | 'num_of_entities': self.num_of_entities,
47 | 'embedding_dim': self.embedding_dim})
48 | return state
49 |
50 | def set_state(self, state):
51 | super().set_state(state)
52 | self.column_id = state['column_id']
53 | self.num_of_entities = state['num_of_entities']
54 | self.embedding_dim = state['embedding_dim']
55 |
56 | def build(self, hp, inputs=None):
57 | input_node = inputs
58 | num_of_entities = self.num_of_entities or hp.Choice('num_of_entities', [10000], default=10000)
59 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16, 32, 64, 128], default=32)
60 | output_node = tf.keras.layers.Embedding(num_of_entities, embedding_dim)(input_node[0][:, self.column_id])
61 | return output_node
62 |
63 |
64 | class SparseFeatureMapper(Block):
65 | """ This module maps the categorical data of sparse feature columns into embeddings.
66 |
67 | # Arguments
68 | num_of_fields (int): The number of sparse feature columns (fields).
69 | hash_size (list): The numbers of categories used in each sparse feature column.
70 | embedding_dim (int): The dimension of the embeddings.
71 |
72 | # Attributes
73 | num_of_fields (int): The number of sparse feature columns (fields).
74 | hash_size (list): The list of numbers of categories used in each sparse feature column.
75 | embedding_dim (int): The dimension of the embeddings.
76 | """
77 |
78 | def __init__(self,
79 | num_of_fields=None,
80 | hash_size=None,
81 | embedding_dim=None,
82 | **kwargs):
83 | super().__init__(**kwargs)
84 | self.num_of_fields = num_of_fields
85 | self.hash_size = hash_size
86 | self.embedding_dim = embedding_dim
87 |
88 | def get_state(self):
89 | """ Get information about the mapper layer, including name, level, and hyperparameters.
90 |
91 | # Returns
92 | Dictionary where key=attribute name and val=attribute value.
93 | """
94 | state = super().get_state()
95 | state.update({
96 | 'num_of_fields': self.num_of_fields,
97 | 'hash_size': self.hash_size,
98 | 'embedding_dim': self.embedding_dim})
99 | return state
100 |
101 | def set_state(self, state):
102 | """ Set information about the mapper layer, including name, level, and hyperparameters.
103 |
104 | # Arguments
105 | state (dict): Map attribute names to attribute values.
106 | """
107 | super().set_state(state)
108 | self.num_of_fields = state['num_of_fields']
109 | self.hash_size = state['hash_size']
110 | self.embedding_dim = state['embedding_dim']
111 |
112 | def build(self, hp, inputs=None):
113 | """ Build the mapper layer.
114 |
115 | Note:
116 | Attribute "hash_size" has search space [10000]. Default is 10000.
117 | Attribute "embedding_dim" has search space [8, 16]. Default is 8.
118 |
119 | # Arguments
120 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
121 | inputs (Tensor): List of batch input tensors.
122 |
123 | # Returns
124 | The defined mapper block.
125 | """
126 | input_node = inputs
127 | # TODO: modify default hash_size, current version is wrong when category of a feature is more than 10000
128 | hash_size = self.hash_size or [hp.Choice('hash_size', [10000], default=10000)
129 | for _ in range(self.num_of_fields)]
130 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16], default=8)
131 | output_node = tf.stack(
132 | [
133 | tf.keras.layers.Embedding(hash_size[col_id], embedding_dim)(input_node[0][:, col_id])
134 | for col_id in range(self.num_of_fields)
135 | ],
136 | axis=1
137 | )
138 | return output_node
139 |
140 |
141 | class DenseFeatureMapper(Block):
142 | """ This module maps the numerical data of dense feature columns into embeddings.
143 |
144 | # Arguments
145 | num_of_fields (int): The number of dense feature columns.
146 | embedding_dim (int): The dimension of the embeddings.
147 |
148 | # Attributes
149 | num_of_fields (int): The number of dense feature columns.
150 | embedding_dim (int): The dimension of the embeddings.
151 | """
152 |
153 | def __init__(self,
154 | num_of_fields=None,
155 | embedding_dim=None,
156 | **kwargs):
157 | super().__init__(**kwargs)
158 | self.num_of_fields = num_of_fields
159 | self.embedding_dim = embedding_dim
160 |
161 | def get_state(self):
162 | """ Get information about the mapper layer, including name, level, and hyperparameters.
163 |
164 | # Returns
165 | Dictionary where key=attribute name and val=attribute value.
166 | """
167 | state = super().get_state()
168 | state.update({
169 | 'num_of_fields': self.num_of_fields,
170 | 'embedding_dim': self.embedding_dim})
171 | return state
172 |
173 | def set_state(self, state):
174 | """ Set information about the mapper layer, including name, level, and hyperparameters.
175 |
176 | # Arguments
177 | state (dict): Map attribute names to attribute values.
178 | """
179 | super().set_state(state)
180 | self.num_of_fields = state['num_of_fields']
181 | self.embedding_dim = state['embedding_dim']
182 |
183 | def build(self, hp, inputs=None):
184 | """ Build the mapper layer.
185 |
186 | Note:
187 | Attribute "embedding_dim" has search space [8, 16, 32]. Default is 8.
188 |
189 | # Arguments
190 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
191 | inputs (Tensor): List of batch input tensors.
192 |
193 | # Returns
194 | The defined mapper block.
195 | """
196 | input_node = inputs
197 | embedding_dim = self.embedding_dim or hp.Choice('embedding_dim', [8, 16], default=8)
198 | output_node = tf.stack(
199 | [
200 | tf.tensordot(input_node[0][:, col_id], tf.keras.layers.Embedding(1, embedding_dim)(0), axes=0)
201 | for col_id in range(self.num_of_fields)
202 | ],
203 | axis=1
204 | )
205 | return output_node
206 |
207 |
--------------------------------------------------------------------------------
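A sketch of how these mappers are typically wired for a CTR pipeline, using the Criteo layout from `examples/` (13 dense and 26 sparse columns); the hash sizes here are illustrative placeholders:

```python
from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper

# One input node per feature group, following the Criteo layout.
dense_input = Input(shape=[13])
sparse_input = Input(shape=[26])

# Map raw columns to embedding tensors; the hash_size values are placeholders.
dense_emb = DenseFeatureMapper(num_of_fields=13, embedding_dim=16)(dense_input)
sparse_emb = SparseFeatureMapper(num_of_fields=26,
                                 hash_size=[10000] * 26,
                                 embedding_dim=16)(sparse_input)
```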
/autorecsys/pipeline/node.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tensorflow as tf
4 | from tensorflow.python.util import nest
5 |
6 | from autorecsys.utils.common import dataset_shape
7 | from autorecsys.pipeline import base
8 |
9 |
10 |
11 | class Input(base.Node):
12 | """Input node for tensor data.
13 | The data should be numpy.ndarray or tf.data.Dataset.
14 | """
15 |
16 | def _check(self, x):
17 | """Record any information needed by transform."""
18 | if not isinstance(x, (np.ndarray, tf.data.Dataset)):
19 | raise TypeError('Expect the data to Input to be numpy.ndarray or '
20 | 'tf.data.Dataset, but got {type}.'.format(type=type(x)))
21 | if isinstance(x, np.ndarray) and not np.issubdtype(x.dtype, np.number):
22 | raise TypeError('Expect the data to Input to be numerical, but got '
23 | '{type}.'.format(type=x.dtype))
24 |
25 | def _convert_to_dataset(self, x):
26 | if isinstance(x, tf.data.Dataset):
27 | return x
28 | if isinstance(x, np.ndarray):
29 | x = x.astype(np.float32)
30 | return tf.data.Dataset.from_tensor_slices(x)
31 |
32 | def _record_dataset_shape(self, dataset):
33 | self.shape = dataset_shape(dataset)
34 |
35 | def fit_transform(self, x):
36 | dataset = self.transform(x)
37 | self._record_dataset_shape(dataset)
38 | return dataset
39 |
40 | def transform(self, x):
41 | """Transform x into a compatible type (tf.data.Dataset)."""
42 | self._check(x)
43 | dataset = self._convert_to_dataset(x)
44 | return dataset
45 |
46 |
47 | class StructuredDataInput(Input):
48 | """Input node for structured data.
49 | The input data should be numpy.ndarray, pandas.DataFrame or tensorflow.Dataset.
50 | # Arguments
51 | column_names: A list of strings specifying the names of the columns. The
52 | length of the list should be equal to the number of columns of the data.
53 | Defaults to None. If None, it will be obtained from the header of the csv
54 | file or the pandas.DataFrame.
55 | column_types: Dict. The keys are the column names. The values should either
56 | be 'numerical' or 'categorical', indicating the type of that column.
57 | Defaults to None. If not None, the column_names need to be specified.
58 | If None, it will be inferred from the data. A column will be judged as
59 | categorical if the number of different values is less than 5% of the
60 | number of instances.
61 | """
62 |
63 | def __init__(self, column_names=None, column_types=None, **kwargs):
64 | super().__init__(**kwargs)
65 | self.column_names = column_names
66 | self.column_types = column_types
67 | # Variables for inferring column types.
68 | self.count_nan = None
69 | self.count_numerical = None
70 | self.count_categorical = None
71 | self.count_unique_numerical = []
72 | self.num_col = None
73 |
74 | def get_state(self):
75 | state = super().get_state()
76 | state.update({
77 | 'column_names': self.column_names,
78 | 'column_types': self.column_types,
79 | 'count_nan': self.count_nan,
80 | 'count_numerical': self.count_numerical,
81 | 'count_categorical': self.count_categorical,
82 | 'count_unique_numerical': self.count_unique_numerical,
83 | 'num_col': self.num_col
84 | })
85 | return state
86 |
87 | def set_state(self, state):
88 | super().set_state(state)
89 | self.column_names = state['column_names']
90 | self.column_types = state['column_types']
91 | self.count_nan = state['count_nan']
92 | self.count_numerical = state['count_numerical']
93 | self.count_categorical = state['count_categorical']
94 | self.count_unique_numerical = state['count_unique_numerical']
95 | self.num_col = state['num_col']
96 |
97 | def _check(self, x):
98 | if not isinstance(x, (pd.DataFrame, np.ndarray)):
99 | raise TypeError('Unsupported type {type} for '
100 | '{name}.'.format(type=type(x),
101 | name=self.__class__.__name__))
102 |
103 | # Extract column_names from pd.DataFrame.
104 | if isinstance(x, pd.DataFrame) and self.column_names is None:
105 | self.column_names = list(x.columns)
106 | # column_types is provided by user
107 | if self.column_types:
108 | for column_name in self.column_types:
109 | if column_name not in self.column_names:
110 | raise ValueError('Column_names and column_types are '
111 | 'mismatched. Cannot find column name '
112 | '{name} in the data.'.format(
113 | name=column_name))
114 |
115 | # Generate column_names.
116 | if self.column_names is None:
117 | if self.column_types:
118 | raise ValueError('Column names must be specified.')
119 | self.column_names = [index for index in range(x.shape[1])]
120 |
121 | # Check if column_names has the correct length.
122 | if len(self.column_names) != x.shape[1]:
123 | raise ValueError('Expect column_names to have length {expect} '
124 | 'but got {actual}.'.format(
125 | expect=x.shape[1],
126 | actual=len(self.column_names)))
127 |
128 | def _convert_to_dataset(self, x):
129 | if isinstance(x, pd.DataFrame):
130 | # Convert x, y, validation_data to tf.Dataset.
131 | x = tf.data.Dataset.from_tensor_slices(
132 | x.values.astype(np.unicode))
133 | if isinstance(x, np.ndarray):
134 | x = tf.data.Dataset.from_tensor_slices(x.astype(np.unicode))
135 | dataset = super()._convert_to_dataset(x)
136 | for x in dataset:
137 | self.update(x)
138 | self.infer_column_types()
139 | return dataset
140 |
141 | def update(self, x):
142 | # Calculate the statistics.
143 | x = nest.flatten(x)[0].numpy()
144 | if self.num_col is None:
145 | self.num_col = len(x)
146 | self.count_nan = np.zeros(self.num_col)
147 | self.count_numerical = np.zeros(self.num_col)
148 | self.count_categorical = np.zeros(self.num_col)
149 | for i in range(len(x)):
150 | self.count_unique_numerical.append({})
151 | for i in range(self.num_col):
152 | x[i] = x[i].decode('utf-8')
153 | if x[i] == 'nan':
154 | self.count_nan[i] += 1
155 | elif x[i] == 'True':
156 | self.count_categorical[i] += 1
157 | elif x[i] == 'False':
158 | self.count_categorical[i] += 1
159 | else:
160 | try:
161 | tmp_num = float(x[i])
162 | self.count_numerical[i] += 1
163 | if tmp_num not in self.count_unique_numerical[i]:
164 | self.count_unique_numerical[i][tmp_num] = 1
165 | else:
166 | self.count_unique_numerical[i][tmp_num] += 1
167 | except ValueError:
168 | self.count_categorical[i] += 1
169 |
170 | def infer_column_types(self):
171 | column_types = {}
172 | for i in range(self.num_col):
173 | if self.count_categorical[i] > 0:
174 | column_types[self.column_names[i]] = 'categorical'
175 | elif len(self.count_unique_numerical[i])/self.count_numerical[i] < 0.05:
176 | column_types[self.column_names[i]] = 'categorical'
177 | else:
178 | column_types[self.column_names[i]] = 'numerical'
179 | # Partial column_types is provided.
180 | if self.column_types is None:
181 | self.column_types = {}
182 | for key, value in column_types.items():
183 | if key not in self.column_types:
184 | self.column_types[key] = value
185 |
--------------------------------------------------------------------------------
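A small sketch of the column-type inference above: string-like values mark a column categorical, and numeric columns whose distinct-value count falls below 5% of the rows are also treated as categorical. Assuming a toy DataFrame:

```python
import pandas as pd

from autorecsys.pipeline.node import StructuredDataInput

df = pd.DataFrame({'age': [21.0, 35.0, 47.0], 'city': ['a', 'b', 'a']})
node = StructuredDataInput()
node.fit_transform(df)      # runs _check, update, and infer_column_types
print(node.column_types)    # expected: {'age': 'numerical', 'city': 'categorical'}
```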
/autorecsys/pipeline/optimizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import tensorflow as tf
4 | from autorecsys.pipeline.base import Block
5 |
6 |
7 | class RatingPredictionOptimizer(Block):
8 | """ For the rating prediction task, this module employs the default 'linear' activation function and the 'mse' (mean
9 | square error) loss and metric for training and evaluation.
10 |
11 | # Note
12 | This module takes a list of tensor batches as input. When the input list contains multiple tensor batches,
13 | they are concatenated into a single tensor batch.
14 | """
15 |
16 | def __init__(self, **kwargs):
17 | super().__init__(**kwargs)
18 |
19 | def build(self, hp, inputs=None):
20 | """ Build the optimization layer.
21 |
22 | # Arguments
23 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
24 | inputs (Tensor): List of batch input tensors.
25 |
26 | # Returns
27 | The defined optimizer block.
28 | """
29 | input_node = tf.concat(inputs, axis=1)
30 | output_node = tf.keras.layers.Dense(1)(input_node)
31 | output_node = tf.reshape(output_node, [-1])
32 | return output_node
33 |
34 | @property
35 | def metric(self):
36 | """ Define the metric used for model evaluation.
37 |
38 | # Returns
39 | The defined metric object.
40 | """
41 | return tf.keras.metrics.MeanSquaredError(name='mse')
42 |
43 | @property
44 | def loss(self):
45 | """ Define the loss used for model training.
46 |
47 | # Returns
48 | The defined loss object.
49 | """
50 | return tf.keras.losses.MeanSquaredError(name='mse')
51 |
52 |
53 | class CTRPredictionOptimizer(Block):
54 | """ For the CTR (click-through rate) prediction task, this module employs the 'sigmoid' activation function and
55 | the 'BinaryCrossentropy' loss and metric for training and evaluation.
56 |
57 | # Note
58 | This module takes a list of tensor batches as input. When the input list contains multiple tensor batches,
59 | they are concatenated into a single tensor batch.
60 | """
61 |
62 | def build(self, hp, inputs=None):
63 | """ Build the optimization layer.
64 |
65 | # Arguments
66 | hp (HyperParameters): Specifies the search space and default value for the block's hyperparameters.
67 | inputs (Tensor): List of batch input tensors.
68 |
69 | # Returns
70 | The defined optimizer block.
71 | """
72 | input_node = tf.concat(inputs, axis=1)
73 | output_node = tf.keras.layers.Dense(1, activation='sigmoid')(input_node)
74 | output_node = tf.reshape(output_node, [-1, 1])
75 | return output_node
76 |
77 | @property
78 | def metric(self):
79 | """ Define the metric used for model evaluation.
80 |
81 | # Returns
82 | The defined metric object.
83 | """
84 | return tf.keras.metrics.BinaryCrossentropy(name='BinaryCrossentropy')
85 |
86 | @property
87 | def loss(self):
88 | """ Define the loss used for model training.
89 |
90 | # Returns
91 | The defined loss object.
92 | """
93 | return tf.keras.losses.BinaryCrossentropy(name='BinaryCrossentropy')
94 |
--------------------------------------------------------------------------------
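As an illustration of how these heads terminate a pipeline, the sketch below attaches `CTRPredictionOptimizer` to the embedding nodes from the mapper sketch earlier; `dense_input`, `sparse_input`, `dense_emb`, and `sparse_emb` are assumptions carried over from that sketch:

```python
from autorecsys.pipeline import CTRPredictionOptimizer, MLPInteraction
from autorecsys.recommender import CTRRecommender

# Assumes dense_input/sparse_input and dense_emb/sparse_emb
# from the mapper sketch above.
interaction = MLPInteraction()([dense_emb, sparse_emb])
output = CTRPredictionOptimizer()(interaction)  # sigmoid head, BinaryCrossentropy loss/metric
model = CTRRecommender(inputs=[dense_input, sparse_input], outputs=output)
```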
/autorecsys/pipeline/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import tensorflow as tf
4 | from tensorflow.keras.layers import Layer
5 |
6 |
7 | class Bias(Layer):
8 | """ This module builds a Keras layer of bias terms (e.g., MLP layer with zero weight matrix).
9 |
10 | # Arguments
11 | units (int): The units of all layer in the Bias layer.
12 |
13 | # Attributes
14 | bias (Tensor): The bias layer.
15 | """
16 |
17 | def __init__(self, units=32):
18 | super(Bias, self).__init__()
19 | bias_init = tf.zeros_initializer()
20 | self.bias = tf.Variable(initial_value=bias_init(shape=(units,), dtype='float32'), trainable=True)
21 |
22 | def call(self, inputs):
23 | """ Add the bias layer to the input tensor layer.
24 |
25 | # Arguments
26 | inputs (Tensor): List of batch input tensors.
27 |
28 | # Returns
29 | List of batch input tensors added with bias tensors.
30 | """
31 | return inputs + self.bias
32 |
--------------------------------------------------------------------------------
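A quick sketch of the `Bias` layer above: it adds a trainable, zero-initialized bias vector to its input, so its output initially equals the input:

```python
import tensorflow as tf

from autorecsys.pipeline.utils import Bias

layer = Bias(units=4)
x = tf.ones((2, 4))
y = layer(x)  # equal to x until the bias is trained away from zero
```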
/autorecsys/recommender.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | from autorecsys.pipeline.graph import HyperGraph
4 |
5 |
6 | class RPRecommender(HyperGraph): # pragma: no cover
7 | """A rating prediction HyperModel based on connected Blocks and HyperBlocks.
8 |
9 | # Arguments
10 | inputs (list): A list of input node(s) for the HyperGraph.
11 | outputs (list): A list of output node(s) for the HyperGraph.
12 | """
13 | def __init__(self, **kwargs):
14 | super().__init__(**kwargs)
15 |
16 |
17 | class CTRRecommender(HyperGraph): # pragma: no cover
18 | """A CTR (click-through rate) prediction HyperModel based on connected Blocks and HyperBlocks.
19 |
20 | # Arguments
21 | inputs (list): A list of input node(s) for the HyperGraph.
22 | outputs (list): A list of output node(s) for the HyperGraph.
23 | """
24 | def __init__(self, **kwargs):
25 | super().__init__(**kwargs)
26 |
--------------------------------------------------------------------------------
/autorecsys/searcher/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/__init__.py
--------------------------------------------------------------------------------
/autorecsys/searcher/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/searcher/core/__init__.py
--------------------------------------------------------------------------------
/autorecsys/searcher/core/trial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 |
5 |
6 | from __future__ import absolute_import, division, print_function, unicode_literals
7 |
8 | import random
9 | import tensorflow as tf
10 | import time
11 | import json
12 |
13 | from autorecsys.searcher.core import hyperparameters as hp_module
14 | from autorecsys.utils import display, metric
15 |
16 |
17 | class Stateful(object):
18 |
19 | def get_state(self):
20 | raise NotImplementedError
21 |
22 | def set_state(self, state):
23 | raise NotImplementedError
24 |
25 | def save(self, fname):
26 | state = self.get_state()
27 | state_json = json.dumps(state)
28 | with open(fname, 'w') as fp:
29 | fp.write(state_json)
30 | return str(fname)
31 |
32 | def reload(self, fname):
33 | with open(fname, 'r') as fp:
34 | state = json.load(fp)
35 | self.set_state(state)
36 |
37 |
38 | class TrialStatus:
39 | RUNNING = 'RUNNING'
40 | IDLE = 'IDLE'
41 | INVALID = 'INVALID'
42 | STOPPED = 'STOPPED'
43 | COMPLETED = 'COMPLETED'
44 |
45 |
46 | class Trial(Stateful):
47 |
48 | def __init__(self,
49 | hyperparameters,
50 | trial_id=None,
51 | status=TrialStatus.RUNNING):
52 | self.hyperparameters = hyperparameters
53 | self.trial_id = generate_trial_id() if trial_id is None else trial_id
54 | self.metrics = metric.MetricsTracker()
55 | self.score = None
56 | self.best_step = None
57 | self.status = status
58 |
59 | def summary(self):
60 | display.section('Trial summary')
61 | if self.hyperparameters.values:
62 | display.subsection('Hp values:')
63 | value_need_display = {k: v for k, v in self.hyperparameters.values.items()
64 | if k in self.hyperparameters._space and
65 | self.hyperparameters._space[k].__class__.__name__ != 'Fixed'}
66 | display.display_settings(value_need_display)
67 | else:
68 | display.subsection('Hp values: default configuration.')
69 | if self.score is not None:
70 | display.display_setting('Score: {}'.format(self.score))
71 | if self.best_step is not None:
72 | display.display_setting('Best step: {}'.format(self.best_step))
73 |
74 | def get_state(self):
75 | return {
76 | 'trial_id': self.trial_id,
77 | 'hyperparameters': self.hyperparameters.get_config(),
78 | 'metrics': self.metrics.get_config(),
79 | 'score': self.score,
80 | 'best_step': self.best_step,
81 | 'status': self.status
82 | }
83 |
84 | def set_state(self, state):
85 | self.trial_id = state['trial_id']
86 | hp = hp_module.HyperParameters.from_config(
87 | state['hyperparameters']
88 | )
89 | self.hyperparameters = hp
90 | self.metrics = metric.MetricsTracker.from_config(state['metrics'])
91 | self.score = state['score']
92 | self.best_step = state['best_step']
93 | self.status = state['status']
94 |
95 | @classmethod
96 | def from_state(cls, state):
97 | trial = cls(hyperparameters=None)
98 | trial.set_state(state)
99 | return trial
100 |
101 | @classmethod
102 | def load(cls, fname):
103 | with tf.io.gfile.GFile(fname, 'r') as f:
104 | state_data = f.read()
105 | return cls.from_state(json.loads(state_data))
106 |
107 |
108 | def generate_trial_id():
109 | s = str(time.time()) + str(random.randint(1, 10**7))
110 | # return hashlib.sha256(s.encode('utf-8')).hexdigest()[:32]
111 | return hash(s) % 1045543567
112 |
--------------------------------------------------------------------------------
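The `Stateful` contract above gives any object that can serialize itself to a JSON-compatible dict a `save()`/`reload()` round trip for free. A minimal sketch with a hypothetical subclass:

```python
from autorecsys.searcher.core.trial import Stateful


class Counter(Stateful):
    """A hypothetical stateful object, for illustration only."""

    def __init__(self):
        self.count = 0

    def get_state(self):
        return {'count': self.count}

    def set_state(self, state):
        self.count = state['count']


c = Counter()
c.count = 3
c.save('counter.json')      # writes {"count": 3}

c2 = Counter()
c2.reload('counter.json')   # restores the saved state
assert c2.count == 3
```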
/autorecsys/searcher/core/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | LOGGER = logging.getLogger(__name__)
4 | TYPE_MAP = {'int': int, 'float': float, 'str': str, 'list': list, 'tuple': tuple, 'bool': bool}
5 | CANT_BE_SET = -1
6 |
7 |
8 | def check_valid_params(name, x, param_info, skip_range_detect):
9 | param_type = TYPE_MAP[param_info['type']]
10 | try:
11 | x = param_type(x)
12 | except ValueError as e:
13 | LOGGER.exception(f'can not cast {name} to {param_type}')
14 | raise e
15 | param_range = param_info.get('range', None)
16 | if param_range == CANT_BE_SET:
17 | raise TypeError(f'{name} can not be set from config files')
18 | if not skip_range_detect:
19 | if isinstance(param_range, tuple):
20 | if x not in param_range:
21 | raise ValueError(f'{name} must be one of {param_range}, but got {x}')
22 | elif isinstance(param_range, list):
23 | low, high = param_range
24 | if x < low or x > high:
25 | raise ValueError(f'{name} must be within [{low}, {high}], but got {x}')
26 | else:
27 | raise NotImplementedError(f'code error: the range of param {name} must be a tuple or a list')
28 | return x
29 |
--------------------------------------------------------------------------------
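For reference, a tuple range in `check_valid_params` acts as a discrete set of allowed values while a `[low, high]` list acts as an interval; the parameter names and values below are illustrative:

```python
from autorecsys.searcher.core.utils import check_valid_params

# Casts '32' to int and checks it falls inside the [8, 128] interval.
check_valid_params('embedding_dim', '32',
                   {'type': 'int', 'range': [8, 128]},
                   skip_range_detect=False)   # -> 32

# Membership check against a tuple of allowed values.
check_valid_params('tuner', 'greedy',
                   {'type': 'str', 'range': ('random', 'bayesian', 'greedy')},
                   skip_range_detect=False)   # -> 'greedy'
```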
/autorecsys/searcher/tuners/__init__.py:
--------------------------------------------------------------------------------
1 | from .randomsearch import RandomSearch
2 | from .bayesian import BayesianOptimization
3 | from .greedy import Greedy
4 |
5 | TUNER_CLASSES = {
6 | 'random': RandomSearch,
7 | 'bayesian': BayesianOptimization,
8 | "greedy": Greedy
9 | }
10 |
11 |
12 | def get_tuner_class(tuner):
13 | if isinstance(tuner, str) and tuner in TUNER_CLASSES:
14 | return TUNER_CLASSES.get(tuner)
15 | else:
16 | raise ValueError('The value {tuner} passed for argument tuner is invalid, '
17 | 'expected one of "random", "bayesian", "greedy".'.format(tuner=tuner))
18 |
--------------------------------------------------------------------------------
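This registry is what `Search._build_tuner` consults; a direct lookup looks like the following:

```python
from autorecsys.searcher.tuners import get_tuner_class

tuner_cls = get_tuner_class('greedy')   # -> Greedy; 'random' and 'bayesian' are the other keys
```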
/autorecsys/searcher/tuners/greedy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 |
5 |
6 | from __future__ import absolute_import, division, print_function, unicode_literals
7 |
8 | import random
9 | import numpy as np
10 |
11 | from autorecsys.searcher.tuners.tuner import PipeTuner
12 | from autorecsys.searcher.core import hyperparameters as hp_module
13 | from autorecsys.searcher.core import oracle as oracle_module
14 | from autorecsys.searcher.core import trial as trial_lib
15 |
16 |
17 | class GreedyOracle(oracle_module.Oracle):
18 | """An oracle combining random search and greedy algorithm.
19 | It groups the HyperParameters into several categories, namely, HyperGraph,
20 | Preprocessor, Architecture, and Optimization. The oracle tunes each group
21 | separately using random search. In each trial, it uses a greedy strategy to
22 | generate new values for one of the categories of HyperParameters and reuses the best
23 | trial so far for the rest of the HyperParameters values.
24 | # Arguments
25 | initial_hps: A list of dictionaries in the form of
26 | {HyperParameter name (String): HyperParameter value}.
27 | Each dictionary is one set of HyperParameters, which are used as the
28 | initial trials for the search. Defaults to None.
29 | seed: Int. Random seed.
30 | """
31 |
32 | HYPER = 'HYPER'
33 | PREPROCESS = 'PREPROCESS'
34 | OPT = 'OPT'
35 | ARCH = 'ARCH'
36 | STAGES = [HYPER, PREPROCESS, OPT, ARCH]
37 |
38 | @staticmethod
39 | def next_stage(stage):
40 | stages = GreedyOracle.STAGES
41 | return stages[(stages.index(stage) + 1) % len(stages)]
42 |
43 | def __init__(self,
44 | hypermodel,
45 | initial_hps=None,
46 | seed=None,
47 | **kwargs):
48 | super().__init__(**kwargs)
49 | self.initial_hps = initial_hps or []
50 | self._tried_initial_hps = [False] * len(self.initial_hps)
51 | self.hypermodel = hypermodel
52 | # Sets of HyperParameter names.
53 | self._hp_names = {
54 | GreedyOracle.HYPER: set(),
55 | GreedyOracle.PREPROCESS: set(),
56 | GreedyOracle.OPT: set(),
57 | GreedyOracle.ARCH: set(),
58 | }
59 | # Random seed used when sampling new hyperparameter values.
60 | self.seed = seed or random.randint(1, 10**4)
61 | # Incremented at every call to `populate_space`.
62 | self._seed_state = self.seed
63 | self._tried_so_far = set()
64 | self._max_collisions = 5
65 |
66 | def update_space(self, hyperparameters):
67 | # Get the block names.
68 | self.hypermodel.build(hyperparameters)
69 |
70 | # Add the new Hyperparameters to different categories.
71 | ref_names = {hp.name for hp in self.hyperparameters.space}
72 | for hp in hyperparameters.space:
73 | if hp.name not in ref_names:
74 | hp_type = GreedyOracle.ARCH
75 | self._hp_names[hp_type].add(hp.name)
76 | super().update_space(hyperparameters)
77 |
78 | def _generate_stage(self):
79 | probabilities = np.array([pow(len(value), 2)
80 | for value in self._hp_names.values()])
81 | sum_p = np.sum(probabilities)
82 | if sum_p == 0:
83 | probabilities = np.array([1] * len(probabilities))
84 | sum_p = np.sum(probabilities)
85 | probabilities = probabilities / sum_p
86 | return np.random.choice(list(self._hp_names.keys()), p=probabilities)
87 |
88 | def _next_initial_hps(self):
89 | for index, hps in enumerate(self.initial_hps):
90 | if not self._tried_initial_hps[index]:
91 | self._tried_initial_hps[index] = True
92 | return hps
93 |
94 | def _populate_space(self, trial_id):
95 | if not all(self._tried_initial_hps):
96 | return {'status': trial_lib.TrialStatus.RUNNING,
97 | 'values': self._next_initial_hps()}
98 |
99 | stage = self._generate_stage()
100 | for _ in range(len(GreedyOracle.STAGES)):
101 | values = self._generate_stage_values(stage)
102 | # Reached max collisions.
103 | if values is None:
104 | # Try next stage.
105 | stage = GreedyOracle.next_stage(stage)
106 | continue
107 | # Values found.
108 | return {'status': trial_lib.TrialStatus.RUNNING,
109 | 'values': values}
110 | # All stages reached max collisions.
111 | return {'status': trial_lib.TrialStatus.STOPPED,
112 | 'values': None}
113 |
114 | def _generate_stage_values(self, stage):
115 | best_trials = self.get_best_trials()
116 | if best_trials:
117 | best_values = best_trials[0].hyperparameters.values
118 | else:
119 | best_values = self.hyperparameters.values
120 | collisions = 0
121 | while True:
122 | # Generate new values for the current stage.
123 | values = {}
124 | for p in self.hyperparameters.space:
125 | if p.name in self._hp_names[stage]:
126 | values[p.name] = p.random_sample(self._seed_state)
127 | self._seed_state += 1
128 | values = {**best_values, **values}
129 | # Keep trying until the set of values is unique,
130 | # or until we exit due to too many collisions.
131 | values_hash = self._compute_values_hash(values)
132 | if values_hash not in self._tried_so_far:
133 | self._tried_so_far.add(values_hash)
134 | break
135 | collisions += 1
136 | if collisions > self._max_collisions:
137 | # Reached max collisions. No value to return.
138 | return None
139 | return values
140 |
141 |
142 | class Greedy(PipeTuner):
143 |
144 | def __init__(self,
145 | hypergraph,
146 | objective,
147 | max_trials,
148 | initial_hps=None,
149 | seed=None,
150 | hyperparameters=None,
151 | tune_new_entries=True,
152 | allow_new_entries=True,
153 | **kwargs):
154 | self.seed = seed
155 | oracle = GreedyOracle(hypermodel=hypergraph,
156 | objective=objective,
157 | max_trials=max_trials,
158 | initial_hps=initial_hps,
159 | seed=seed,
160 | hyperparameters=hyperparameters,
161 | tune_new_entries=tune_new_entries,
162 | allow_new_entries=allow_new_entries)
163 | super(Greedy, self).__init__(oracle,
164 | hypergraph,
165 | **kwargs)
166 |
167 | @classmethod
168 | def get_name(cls):
169 | return 'greedy'
170 |
--------------------------------------------------------------------------------
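A quick illustration of `GreedyOracle`'s stage handling: when a stage exhausts its collision budget, `next_stage` cycles to the next category in a fixed round-robin order:

```python
from autorecsys.searcher.tuners.greedy import GreedyOracle

stage = GreedyOracle.HYPER
for _ in range(5):
    print(stage)                            # HYPER, PREPROCESS, OPT, ARCH, HYPER
    stage = GreedyOracle.next_stage(stage)  # wraps around after ARCH
```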
/autorecsys/searcher/tuners/randomsearch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 |
5 |
6 | "Basic random search searcher."
7 |
8 | from __future__ import absolute_import, division, print_function, unicode_literals
9 |
10 | import random
11 |
12 | from autorecsys.searcher.tuners.tuner import PipeTuner
13 | from autorecsys.searcher.core import hyperparameters as hp_module
14 | from autorecsys.searcher.core import oracle as oracle_module
15 | from autorecsys.searcher.core import trial as trial_lib
16 |
17 |
18 | class RandomSearchOracle(oracle_module.Oracle):
19 | """Random search oracle.
20 | Attributes:
21 | objective: String or `kerastuner.Objective`. If a string,
22 | the direction of the optimization (min or max) will be
23 | inferred.
24 | max_trials: Int. Total number of trials
25 | (model configurations) to test at most.
26 | Note that the oracle may interrupt the search
27 | before `max_trials` models have been tested.
28 | seed: Int. Random seed.
29 | hyperparameters: HyperParameters class instance.
30 | Can be used to override (or register in advance)
31 | hyperparameters in the search space.
32 | tune_new_entries: Whether hyperparameter entries
33 | that are requested by the hypermodel
34 | but that were not specified in `hyperparameters`
35 | should be added to the search space, or not.
36 | If not, then the default value for these parameters
37 | will be used.
38 | allow_new_entries: Whether the hypermodel is allowed
39 | to request hyperparameter entries not listed in
40 | `hyperparameters`.
41 | """
42 |
43 | def __init__(self,
44 | objective,
45 | max_trials,
46 | seed=None,
47 | hyperparameters=None,
48 | allow_new_entries=True,
49 | tune_new_entries=True):
50 | super(RandomSearchOracle, self).__init__(
51 | objective=objective,
52 | max_trials=max_trials,
53 | hyperparameters=hyperparameters,
54 | tune_new_entries=tune_new_entries,
55 | allow_new_entries=allow_new_entries)
56 | self.seed = seed or random.randint(1, 10**4)
57 | # Incremented at every call to `populate_space`.
58 | self._seed_state = self.seed
59 | # Hashes of values tried so far.
60 | self._tried_so_far = set()
61 | # Maximum number of identical values that can be generated
62 | # before we consider the space to be exhausted.
63 | self._max_collisions = 5
64 |
65 | def _populate_space(self, _):
66 | """Fill the hyperparameter space with values.
67 | Args:
68 | `trial_id`: The id for this Trial.
69 | Returns:
70 | A dictionary with keys "values" and "status", where "values" is
71 | a mapping of parameter names to suggested values, and "status"
72 | is the TrialStatus that should be returned for this trial (one
73 | of "RUNNING", "IDLE", or "STOPPED").
74 | """
75 | collisions = 0
76 | while True:
77 | # Generate a set of random values.
78 | values = {}
79 | if all(isinstance(p, hp_module.Fixed) for p in self.hyperparameters.space):
80 | break
81 | for p in self.hyperparameters.space:
82 | values[p.name] = p.random_sample(self._seed_state)
83 | self._seed_state += 1
84 | # Keep trying until the set of values is unique,
85 | # or until we exit due to too many collisions.
86 | values_hash = self._compute_values_hash(values)
87 | if values_hash in self._tried_so_far:
88 | collisions += 1
89 | if collisions > self._max_collisions:
90 | return {'status': trial_lib.TrialStatus.STOPPED,
91 | 'values': None}
92 | continue
93 | self._tried_so_far.add(values_hash)
94 | break
95 | return {'status': trial_lib.TrialStatus.RUNNING,
96 | 'values': values}
97 |
98 | def get_state(self):
99 | state = super(RandomSearchOracle, self).get_state()
100 | state.update({
101 | 'seed': self.seed,
102 | 'seed_state': self._seed_state,
103 | 'tried_so_far': list(self._tried_so_far),
104 | })
105 | return state
106 |
107 | def set_state(self, state):
108 | super(RandomSearchOracle, self).set_state(state)
109 | self.seed = state['seed']
110 | self._seed_state = state['seed_state']
111 | self._tried_so_far = set(state['tried_so_far'])
112 |
113 |
114 | class RandomSearch(PipeTuner):
115 | """Random search tuner.
116 | # Arguments:
117 | hypergraph: HyperGraph. The hypermodel graph that defines
118 | the search space to tune.
119 | objective: String. Name of model metric to minimize
120 | or maximize, e.g. "val_accuracy".
121 | max_trials: Int. Total number of trials
122 | (model configurations) to test at most.
123 | Note that the oracle may interrupt the search
124 | before `max_trials` models have been tested.
125 | seed: Int. Random seed.
126 | hyperparameters: HyperParameters class instance.
127 | Can be used to override (or register in advance)
128 | hyperparameters in the search space.
129 | tune_new_entries: Whether hyperparameter entries
130 | that are requested by the hypermodel
131 | but that were not specified in `hyperparameters`
132 | should be added to the search space, or not.
133 | If not, then the default value for these parameters
134 | will be used.
135 | allow_new_entries: Whether the hypermodel is allowed
136 | to request hyperparameter entries not listed in
137 | `hyperparameters`.
138 | **kwargs: Keyword arguments relevant to all `Tuner` subclasses.
139 | Please see the docstring for `Tuner`.
140 | """
141 |
142 | def __init__(self,
143 | hypergraph,
144 | objective,
145 | max_trials,
146 | seed=None,
147 | hyperparameters=None,
148 | tune_new_entries=True,
149 | allow_new_entries=True,
150 | **kwargs):
151 | self.seed = seed
152 | oracle = RandomSearchOracle(objective=objective,
153 | max_trials=max_trials,
154 | seed=seed,
155 | hyperparameters=hyperparameters,
156 | tune_new_entries=tune_new_entries,
157 | allow_new_entries=allow_new_entries)
158 | super(RandomSearch, self).__init__(oracle,
159 | hypergraph,
160 | **kwargs)
161 |
162 | @classmethod
163 | def get_name(cls):
164 | return 'random'
165 |
--------------------------------------------------------------------------------
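The heart of `RandomSearchOracle._populate_space` above is a draw-hash-retry loop: sample a random configuration, hash it, skip duplicates, and give up once the collision budget is spent. A self-contained sketch of that logic with hypothetical names (`sample_unique` is not part of the library):

```python
# Illustrative reimplementation of the collision-aware random sampling loop.
import hashlib
import random

def sample_unique(space, tried, max_collisions=5):
    """Draw a random config not seen before; None once the budget is spent.

    space: dict mapping hyperparameter name -> list of candidate values.
    tried: set of hashes of configurations already evaluated.
    """
    collisions = 0
    while True:
        values = {name: random.choice(options) for name, options in space.items()}
        values_hash = hashlib.sha256(repr(sorted(values.items())).encode()).hexdigest()
        if values_hash in tried:
            collisions += 1
            if collisions > max_collisions:
                return None  # treat the space as exhausted (trial STOPPED)
            continue
        tried.add(values_hash)
        return values  # a fresh configuration for a RUNNING trial

tried = set()
space = {'embedding_dim': [2, 4, 8], 'learning_rate': [1e-3, 1e-2]}
for _ in range(3):
    print(sample_unique(space, tried))
```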
/autorecsys/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/autorecsys/utils/__init__.py
--------------------------------------------------------------------------------
/autorecsys/utils/common.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import re
4 | import os
5 | import shutil
6 | import pandas as pd
7 | import numpy as np
8 | import tensorflow as tf
9 | import random
10 | import pickle
11 | import string
12 |
13 |
14 | def dataset_shape(dataset):
15 | """ Get the shape of the dataset.
16 |
17 | Args:
18 | dataset (tf.data.Dataset or tf.data.Iterator): A TensorFlow Dataset or Iterator.
19 |
20 | Returns:
21 | A nested structure of tf.TensorShape object matching the structure of the dataset / iterator elements and
22 | specifying the shape of the individual components.
23 | """
24 | return tf.compat.v1.data.get_output_shapes(dataset)
25 |
26 |
27 | def to_snake_case(name):
28 | """ Convert the given class name to snake case.
29 |
30 | # Arguments
31 | name (str): The name of the class.
32 |
33 | # Returns
34 | String name of the class in snake case.
35 | """
36 | insecure = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
37 | insecure = re.sub('([a-z0-9])([A-Z])', r'\1_\2', insecure).lower()
38 | for p in string.punctuation:
39 | insecure = insecure.replace(p, "_")
40 |
41 | if insecure[0] != '_':
42 | return insecure
43 | # A private class (starts with "_") is not secure for creating scopes and is thus prefixed w/ "private".
44 | return 'private' + insecure
45 |
46 |
47 | def create_directory(path, remove_existing=False):
48 | """ Create the designated directory.
49 |
50 | # Arguments
51 | path (str): Path to create the directory.
52 | remove_existing (bool): Whether to remove the directory if it already exists.
53 | """
54 | # Create the directory if it doesn't exist.
55 | if not os.path.exists(path):
56 | os.mkdir(path)
57 | # Remove the preexisting directory if allowed.
58 | elif remove_existing:
59 | shutil.rmtree(path)
60 | os.mkdir(path)
61 |
62 |
63 | def set_device(device_name):
64 | """ Set the computational devices used to run models.
65 |
66 | # Arguments
67 | device_name (str): Name of the CPU or GPU.
68 | """
69 | if device_name[0:3] == "cpu":
70 | cpus = tf.config.experimental.list_physical_devices('CPU')
71 | print("Available CPUs: {}".format(cpus))
72 | assert len(cpus) > 0, "Not enough CPU hardware devices available"
73 | cpu_idx = int(device_name[-1])
74 | tf.config.experimental.set_visible_devices(cpus[cpu_idx], 'CPU')
75 | else:
76 | gpus = tf.config.experimental.list_physical_devices('GPU')
77 | for gpu in gpus:
78 | tf.config.experimental.set_memory_growth(gpu, True)
79 | print("Available GPUs: {}".format(gpus))
80 | assert len(gpus) > 0, "Not enough GPU hardware devices available"
81 | gpu_idx = int(device_name[-1])
82 | tf.config.experimental.set_visible_devices(gpus[gpu_idx], 'GPU')
83 |
84 |
85 | def load_dataframe_input(x):
86 | """ Load the input object as a DataFrame or a Series.
87 |
88 | # Note
89 | Cover the following classes: None, DataFrame, Series, ndarray, and str.
90 |
91 | # Arguments
92 | x (object): The object to be loaded as a DataFrame or Series.
93 |
94 | # Returns
95 | The loaded DataFrame or Series.
96 | """
97 | if x is None:
98 | return None
99 | if isinstance(x, pd.DataFrame) or isinstance(x, pd.Series):
100 | res = x
101 | elif isinstance(x, np.ndarray):
102 | res = pd.Series(x) if len(x.shape) == 1 else pd.DataFrame(x)
103 | elif isinstance(x, str):
104 | if not x.endswith('.csv'):
105 | raise TypeError('Only a path to a local .csv file is accepted')
106 | res = pd.read_csv(x)
107 | else:
108 | raise TypeError(f"cannot load {type(x)} into pandas dataframe")
109 |
110 | # Ensure the type of column names is string
111 | if isinstance(res, pd.DataFrame):
112 | res.columns = res.columns.astype('str')
113 | return res
114 |
115 |
116 | def set_seed(seed=42):
117 | """ Set the seed for randomization functions.
118 |
119 | # Note
120 | Cover the following libraries: Python, Numpy, and TensorFlow
121 |
122 | # Arguments
123 | seed (int): The seed number used to create fixed randomization.
124 | """
125 | random.seed(seed)
126 | np.random.seed(seed)
127 | tf.random.set_seed(seed)
128 |
129 |
130 | def save_pickle(path, obj):
131 | """ Save the input object to the designated path.
132 |
133 | # Arguments
134 | path (str): Designated path to save the object.
135 | obj (object): The object to be saved.
136 | """
137 | with open(path, 'wb') as f:
138 | pickle.dump(obj, f)
139 |
140 |
141 | def load_pickle(path):
142 | """ Load the object file from the designated path.
143 |
144 | # Arguments
145 | path: Designated path to load the object.
146 |
147 | Returns:
148 | The loaded object.
149 | """
150 | with open(path, 'rb') as f:
151 | return pickle.load(f)
152 |
--------------------------------------------------------------------------------
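A brief usage sketch of the helpers in `common.py` (assumes `autorecsys` and its dependencies are installed; the pickle path is constructed just for illustration):

```python
import os
import tempfile

import numpy as np
from autorecsys.utils.common import (
    load_dataframe_input, load_pickle, save_pickle, set_seed, to_snake_case)

set_seed(42)  # fixes the Python, NumPy, and TensorFlow RNGs in one call

print(to_snake_case('LatentFactorMapper'))       # -> 'latent_factor_mapper'

frame = load_dataframe_input(np.arange(6).reshape(3, 2))  # ndarray -> DataFrame
print(frame.columns.tolist())                    # -> ['0', '1'] (string names)

path = os.path.join(tempfile.gettempdir(), 'frame.pkl')   # illustrative path
save_pickle(path, frame)
print(load_pickle(path).shape)                   # -> (3, 2)
```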
/autorecsys/utils/display.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 |
5 |
6 |
7 | """Display utilities."""
8 |
9 | from __future__ import absolute_import
10 | from __future__ import division
11 | from __future__ import print_function
12 |
13 | from terminaltables import SingleTable, AsciiTable
14 | from tabulate import tabulate
15 | from colorama import init, Fore, Back, Style
16 |
17 | init() # colorama init
18 |
19 | # Check if we are in an IPython/Colab environment
20 | try:
21 | class_name = get_ipython().__class__.__name__
22 | if "Terminal" in class_name:
23 | IS_NOTEBOOK = False
24 | else:
25 | IS_NOTEBOOK = True
26 |
27 | except NameError:
28 | IS_NOTEBOOK = False
29 |
30 | if IS_NOTEBOOK:
31 | from tqdm import tqdm_notebook as tqdm
32 | from IPython.display import HTML
33 | from IPython.display import display as ipython_display
34 |
35 |
36 | def display(text):
37 | ipython_display(HTML(text))
38 | else:
39 | from tqdm import tqdm
40 |
41 | display = print
42 |
43 | FG = 0
44 | BG = 1
45 |
46 | # TODO: create a set of HTML color to allows richer display in colab
47 | colors = {
48 | 'black': [Fore.BLACK, Back.BLACK],
49 | 'red': [Fore.RED, Back.RED],
50 | 'green': [Fore.GREEN, Back.GREEN],
51 | 'yellow': [Fore.YELLOW, Back.YELLOW],
52 | 'blue': [Fore.BLUE, Back.BLUE],
53 | 'magenta': [Fore.MAGENTA, Back.MAGENTA],
54 | 'cyan': [Fore.CYAN, Back.CYAN],
55 | 'white': [Fore.WHITE, Back.WHITE],
56 | }
57 |
58 | styles = {
59 | "dim": Style.DIM,
60 | "normal": Style.NORMAL,
61 | "bright": Style.BRIGHT,
62 | "reset": Style.RESET_ALL
63 | }
64 |
65 |
66 | # Shorthand functions
67 | def info(text, render=1):
68 | """ display a info
69 |
70 | Args:
71 | text (str): info message
72 | display (bool, optional): Defaults to True. Display or return settings
73 |
74 | Returns:
75 | str: setting value if display=False, None otherwise
76 | """
77 | color = 'blue'
78 | s = "[Info] %s" % text
79 |
80 | if render:
81 | cprint(s, color)
82 | else:
83 | return colorize(s + '\n', color)
84 |
85 |
86 | def warning(text, render=1):
87 | """ display a warning
88 |
89 | Args:
90 | text (str): warning message
91 | render (bool, optional): Defaults to True. render or return settings
92 |
93 | Returns:
94 | str: setting value if render=False, None otherwise
95 | """
96 | color = 'yellow'
97 | s = "[Warning] %s" % text
98 |
99 | if render:
100 | cprint(s, color)
101 | else:
102 | return colorize(s + '\n', color)
103 |
104 |
105 | def fatal(text, render=True, raise_exception=True):
106 | """ Display a fatal error, and die
107 |
108 | Args:
109 | text (str): Fatal message
110 | render (bool, optional): Render or return settings. Defaults to True.
111 | raise_exception (bool, optional): Raise a ValueError. Defaults to True.
112 | Returns:
113 | str: Formatted fatal message
114 | """
115 | color = 'white'
116 | bgcolor = 'red'
117 | s = "[FATAL] %s" % text
118 |
119 | if render:
120 | cprint(s, color, bgcolor)
121 | if raise_exception:
122 | raise ValueError(s)
123 | return colorize(s + '\n', color, bgcolor)
124 |
125 |
126 | def section(text):
127 | """ Render a section
128 |
129 | Args:
130 | text (str): Section name
131 | """
132 | if IS_NOTEBOOK:
133 | section = '<h1>' + text + '</h1>'
134 | cprint(section, '#4527A0')
135 | else:
136 | section = '[' + text + ']'
137 | cprint(section, 'yellow')
138 |
139 |
140 | def subsection(text):
141 | """ Render a subsection.
142 |
143 | Args:
144 | text (str): Subsection name
145 | """
146 | if IS_NOTEBOOK:
147 | section = '<h2>' + text + '</h2>'
148 | cprint(section, '#7E57C2')
149 | else:
150 | section = ' > ' + text
151 | cprint(section, 'magenta', brightness='dim')
152 |
153 |
154 | def display_setting(text, indent_level=1, idx=0, render=True):
155 | """ Print a single setting
156 |
157 | Args:
158 | text (str): Setting key:value as string
159 | indent_level (int, optional): Number of indentation spaces. Defaults to 1.
160 | idx (int, optional): Index of setting to rotate color. Defaults to 0.
161 | render (bool, optional): Render or return settings. Defaults to True.
162 |
163 | Returns:
164 | str: colorized settings.
165 | """
166 | s = ' ' * indent_level
167 | s += '|-' + text
168 | if idx % 2:
169 | color = 'blue'
170 | else:
171 | color = 'cyan'
172 |
173 | if render:
174 | cprint(s, color)
175 | return colorize(s + '\n', color)
176 |
177 |
178 | def display_settings(mysettings, indent_level=1, render=True):
179 | """
180 | Render a collection of settings
181 |
182 | Args:
183 | mysettings (dict): Dictionary of settings
184 | indent_level (int): Indentation level. Defaults to 1.
185 | render (bool, optional): Print? Defaults to True.
186 | """
187 | s = ""
188 | idx = 0
189 | for name in sorted(mysettings.keys()):
190 | value = mysettings[name]
191 | txt = "%s: %s" % (name, value)
192 | s += display_setting(txt, idx=idx, indent_level=indent_level,
193 | render=render)
194 | idx += 1
195 | return s
196 |
197 |
198 | def highlight(text):
199 | if IS_NOTEBOOK:
200 | text = '<b>' + text + '</b>'
201 | cprint(text, '#64DD17')
202 | else:
203 | cprint(text, 'green', brightness="bright")
204 |
205 |
206 | # Charts
207 |
208 |
209 | def display_bar_chart(val, max_val, title=None, left='', right='',
210 | color='green', length=80):
211 | bar = make_bar_chart(val, max_val, title=title, left=left, right=right,
212 | color=color, length=length)
213 | display(bar)
214 |
215 |
216 | def make_bar_chart(val, max_val, title=None, left='', right='',
217 | color='green', length=80):
218 | full_block = '█'
219 | empty_block = '░'
220 | half_block = '▒'
221 |
222 | # building the bar
223 | bar = ''
224 | num_full = length * val / float(max_val)
225 | bar += full_block * int(num_full)
226 | if not (num_full).is_integer():
227 | bar += half_block
228 | bar += empty_block * (length - len(bar))
229 |
230 | # colorize
231 | bar = colorize(bar, color)
232 |
233 | # adding left/right text if needed
234 | row = []
235 | if left:
236 | row.append(left)
237 | row.append(bar)
238 | if right:
239 | row.append(right)
240 |
241 | st = SingleTable([row], title)
242 | st.inner_column_border = False
243 | return st.table
244 |
245 |
246 | # Low level function
247 |
248 |
249 | def cprint(text, color, bg_color=None, brightness='normal'):
250 | """ Print given piece of text with color
251 |
252 | Args:
253 | text (str): text to colorize
254 | color (str): foreground color
255 | bg_color (str, optional): Defaults to None. background color.
256 | brightness (str, optional): Defaults to normal. Text brightness.
257 | """
258 |
259 | text = colorize(text, color, bg_color, brightness)
260 |
261 | # HTMLify if needed
262 | display(text)
263 |
264 |
265 | def colorize_row(row, color, bg_color=None, brightness='normal'):
266 | """Colorize a table row.
267 |
268 | Args:
269 | row (list): The row to colorize.
270 | color (str): Foreground color.
271 | bg_color (str): Background color. Defaults to None.
272 | brightness (str, optional): Defaults to normal. Text brightness.
273 | Returns:
274 | list: colorized row
275 | """
276 | colored_row = []
277 | for v in row:
278 | colored_row.append(colorize(v, color, bg_color, brightness))
279 | return colored_row
280 |
281 |
282 | def colorize_default(text):
283 | """Colorize a given piece of text with the terminal default color
284 | Args:
285 | text (str): text to colorize
286 | """
287 | if IS_NOTEBOOK:
288 | text = text + '</span>'
289 | else:
290 | text = text + styles['reset']
291 | return text
292 |
293 |
294 | def colorize(text, color, bg_color=None, brightness='normal'):
295 | """ Colorize a given piece of text
296 | Args:
297 | text (str): text to colorize
298 | color (str): foreground color
299 | bg_color (str, optional): Defaults to None. background color.
300 | brightness (str, optional): Defaults to normal. Text brightness.
301 |
302 | Returns:
303 | str: colorized text
304 | """
305 |
306 | text = str(text) # in case user pass a float/int
307 |
308 | # we need a special case as term default color/bgcolor is unknown
309 | if color == 'default':
310 | return colorize_default(text)
311 |
312 | if color not in colors and not IS_NOTEBOOK:
313 | msg = "Foreground color invalid:%s" % color
314 | raise ValueError(msg)
315 |
316 | if bg_color and bg_color not in colors and not IS_NOTEBOOK:
317 | msg = "Background color invalid:%s" % bg_color
318 | raise ValueError(msg)
319 |
320 | if brightness not in styles and not IS_NOTEBOOK:
321 | raise ValueError("Brightness invalid:" + brightness)
322 |
323 | # foreground color
324 | if IS_NOTEBOOK:
325 | text = text.replace('\n', '<br>')
326 | h = '<span style="color:%s">' % color
327 | text = h + text
328 | else:
329 | text = colors[color][FG] + text
330 | # background if needed
331 | if bg_color and not IS_NOTEBOOK:
332 | text = colors[bg_color][BG] + text
333 |
334 | # brightness if needed
335 | if brightness != 'normal' and not IS_NOTEBOOK:
336 | text = styles[brightness] + text
337 |
338 | # reset
339 | if IS_NOTEBOOK:
340 | text = text + '</span>'
341 | else:
342 | text = text + styles['reset']
343 |
344 | return text
345 |
346 |
347 | # TABLE
348 | def display_table(rows, title=None, indent=0):
349 | """ Print data as a nicely formated ascii table
350 | Args:
351 | rows (list(list)): data to display as list of lists.
352 | title (str, optional): Defaults to None. Table title
353 | """
354 | table = make_table(rows, title)
355 |
356 | if indent and not IS_NOTEBOOK:
357 | indent = " " * indent
358 | out = []
359 | for line in table.split("\n"):
360 | out.append(indent + line)
361 | table = "\n".join(out)
362 | display(table)
363 |
364 |
365 | def make_table(rows, title=None):
366 | """ Format list as a pretty ascii table
367 | Args:
368 | rows (list(list)): data to display as list of lists.
369 | title (str, optional): Defaults to None. Table title
370 | Returns:
371 | str: string representing table
372 | """
373 | if IS_NOTEBOOK:
374 | headers = rows[0]
375 | body = rows[1:]
376 | table = tabulate(body, headers, tablefmt="html")
377 | else:
378 | st = SingleTable(rows, title)
379 | table = st.table
380 | return table
381 |
382 |
383 | def make_combined_table(array_rows):
384 | """ Build a table of tables
385 |
386 | Args:
387 | array_rows (list(list)): Array of tables rows to combine
388 | Returns:
389 | str: string representing table
390 | """
391 |
392 | if IS_NOTEBOOK:
393 | # compute the size for each col
394 | col_size = str(int(100 / len(array_rows)) - 5) + '%'
395 | gtc = [col_size] * len(array_rows)
396 | table = """
397 |
404 |
405 | """ % (" ".join(gtc))
406 | for rows in array_rows:
407 | table += '
'
408 | headers = rows[0]
409 | body = rows[1:]
410 | table += tabulate(body, headers, tablefmt="html")
411 | table += '
'
412 | table += "
"
413 | return table
414 | else:
415 | tables = []
416 | for rows in array_rows:
417 | tables.append(make_table(rows))
418 | combined_table = AsciiTable([tables])
419 | combined_table.outer_border = False
420 | combined_table.inner_column_border = False
421 | return combined_table.table
422 |
423 |
424 | def display_combined_table(array_rows):
425 | """ Build a table of tables and print it
426 |
427 | Args:
428 | array_rows (list(list)): Array of tables rows to combine
429 | """
430 | table = make_combined_table(array_rows)
431 | display(table)
432 |
433 |
434 | def progress_bar(*args, **kwargs):
435 | """ Returns a new tqdm progress bar appropriate for the current display.
436 |
437 | Returns:
438 | tqdm progress bar.
439 | """
440 |
441 | return tqdm(*args, **kwargs)
442 |
--------------------------------------------------------------------------------
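A quick sketch of how the display helpers compose (assumes `autorecsys` plus its `terminaltables`, `tabulate`, and `colorama` dependencies are installed; the values shown are made up):

```python
from autorecsys.utils.display import (
    display_table, highlight, info, make_bar_chart, warning)

info('search started')                  # blue "[Info] ..." line
warning('metric not registered yet')    # yellow "[Warning] ..." line
highlight('new best trial!')            # bright green text

display_table([['trial', 'val_mse'],    # first row is treated as the header in notebooks
               ['1', '0.7551'],
               ['2', '0.7504']],
              title='Results')

# 3 of 10 trials done, rendered as a one-row table around a colored bar
print(make_bar_chart(3, 10, left='progress', right='3/10'))
```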
/autorecsys/utils/metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This code is migrated from Keras Tuner: https://keras-team.github.io/keras-tuner/.
3 | # The copyright belongs to the Keras Tuner authors.
4 |
5 |
6 | from __future__ import absolute_import, division, print_function, unicode_literals
7 |
8 |
9 | import numpy as np
10 | from tensorflow import keras
11 |
12 |
13 |
14 | class MetricObservation(object):
15 |
16 | def __init__(self, value, step):
17 | if not isinstance(value, list):
18 | value = [value]
19 | self.value = value
20 | self.step = step
21 |
22 | def append(self, value):
23 | if not isinstance(value, list):
24 | value = [value]
25 | self.value += value
26 |
27 | def mean(self):
28 | return np.mean(self.value)
29 |
30 | def get_config(self):
31 | return {'value': self.value,
32 | 'step': self.step}
33 |
34 | @classmethod
35 | def from_config(cls, config):
36 | return cls(**config)
37 |
38 | def __eq__(self, other):
39 | if not isinstance(other, MetricObservation):
40 | return False
41 | return (other.value == self.value and
42 | other.step == self.step)
43 |
44 | def __repr__(self):
45 | return 'MetricObservation(value={}, step={})'.format(
46 | self.value, self.step)
47 |
48 |
49 | class MetricHistory(object):
50 |
51 | def __init__(self, direction='min'):
52 | if direction not in {'min', 'max'}:
53 | raise ValueError(
54 | '`direction` should be one of '
55 | '{"min", "max"}, but got: %s' % (direction,))
56 | self.direction = direction
57 | self._observations = {}
58 |
59 | def update(self, value, step):
60 | if step in self._observations:
61 | self._observations[step].append(value)
62 | else:
63 | self._observations[step] = MetricObservation(
64 | value, step=step)
65 |
66 | def get_best_value(self):
67 | values = list(
68 | obs.mean() for obs in self._observations.values())
69 | if not values:
70 | return None
71 | if self.direction == 'min':
72 | return np.nanmin(values)
73 | return np.nanmax(values)
74 |
75 | def get_best_step(self):
76 | best_value = self.get_best_value()
77 | if best_value is None:
78 | return None
79 | for obs in self._observations.values():
80 | if obs.mean() == best_value:
81 | return obs.step
82 |
83 | def get_history(self):
84 | return sorted(self._observations.values(),
85 | key=lambda obs: obs.step)
86 |
87 | def set_history(self, observations):
88 | for obs in observations:
89 | self.update(obs.value, step=obs.step)
90 |
91 | def get_statistics(self):
92 | history = self.get_history()
93 | history_values = [obs.mean() for obs in history]
94 | if not len(history_values):
95 | return {}
96 | return {
97 | 'min': float(np.nanmin(history_values)),
98 | 'max': float(np.nanmax(history_values)),
99 | 'mean': float(np.nanmean(history_values)),
100 | 'median': float(np.nanmedian(history_values)),
101 | 'var': float(np.nanvar(history_values)),
102 | 'std': float(np.nanstd(history_values))
103 | }
104 |
105 | def get_last_value(self):
106 | history = self.get_history()
107 | if history:
108 | last_obs = history[-1]
109 | return last_obs.mean()
110 | else:
111 | return None
112 |
113 | def get_config(self):
114 | config = {'direction': self.direction,
115 | 'observations': [obs.get_config() for obs in self.get_history()]}
116 | return config
117 |
118 | @classmethod
119 | def from_config(cls, config):
120 | instance = cls(config['direction'])
121 | instance.set_history([MetricObservation.from_config(obs)
122 | for obs in config['observations']])
123 | return instance
124 |
125 |
126 | class MetricsTracker(object):
127 |
128 | def __init__(self, metrics=None):
129 | # str -> MetricHistory
130 | self.metrics = {}
131 | self.register_metrics(metrics)
132 |
133 | def exists(self, name):
134 | return name in self.metrics
135 |
136 | def register_metrics(self, metrics=None):
137 | metrics = metrics or []
138 | for metric in metrics:
139 | self.register(metric.name)
140 |
141 | def register(self, name, direction=None):
142 | if self.exists(name):
143 | raise ValueError('Metric already exists: %s' % (name,))
144 | if direction is None:
145 | direction = infer_metric_direction(name)
146 | self.metrics[name] = MetricHistory(direction)
147 |
148 | def update(self, name, value, step=0):
149 | value = float(value)
150 | if not self.exists(name):
151 | self.register(name)
152 |
153 | prev_best = self.metrics[name].get_best_value()
154 | self.metrics[name].update(value, step=step)
155 | new_best = self.metrics[name].get_best_value()
156 |
157 | improved = new_best != prev_best
158 | return improved
159 |
160 | def get_history(self, name):
161 | self._assert_exists(name)
162 | return self.metrics[name].get_history()
163 |
164 | def set_history(self, name, observations):
165 | assert isinstance(observations, list)
166 | if not self.exists(name):
167 | self.register(name)
168 | self.metrics[name].set_history(observations)
169 |
170 | def get_best_value(self, name):
171 | self._assert_exists(name)
172 | return self.metrics[name].get_best_value()
173 |
174 | def get_best_step(self, name):
175 | self._assert_exists(name)
176 | return self.metrics[name].get_best_step()
177 |
178 | def get_statistics(self, name):
179 | self._assert_exists(name)
180 | return self.metrics[name].get_statistics()
181 |
182 | def get_last_value(self, name):
183 | self._assert_exists(name)
184 | return self.metrics[name].get_last_value()
185 |
186 | def get_direction(self, name):
187 | self._assert_exists(name)
188 | return self.metrics[name].direction
189 |
190 | def get_config(self):
191 | return {
192 | 'metrics': {
193 | name: metric_history.get_config()
194 | for name, metric_history in self.metrics.items()}}
195 |
196 | @classmethod
197 | def from_config(cls, config):
198 | instance = cls()
199 | instance.metrics = {
200 | name: MetricHistory.from_config(metric_history)
201 | for name, metric_history in config['metrics'].items()}
202 | return instance
203 |
204 | def _assert_exists(self, name):
205 | if name not in self.metrics:
206 | raise ValueError('Unknown metric: %s' % (name,))
207 |
208 |
209 | _MAX_METRICS = {
210 | 'Accuracy', 'BinaryAccuracy',
211 | 'CategoricalAccuracy', 'SparseCategoricalAccuracy',
212 | 'TopKCategoricalAccuracy', 'SparseTopKCategoricalAccuracy',
213 | 'TruePositives', 'TrueNegatives',
214 | 'Precision', 'Recall', 'AUC',
215 | 'SensitivityAtSpecificity', 'SpecificityAtSensitivity'
216 | }
217 |
218 | _MAX_METRIC_FNS = {
219 | 'accuracy', 'categorical_accuracy', 'binary_accuracy',
220 | 'sparse_categorical_accuracy'
221 | }
222 |
223 |
224 | def infer_metric_direction(metric):
225 | # Handle str input and get canonical object.
226 | if isinstance(metric, str):
227 | metric_name = metric
228 | if len(metric_name) > 4 and metric_name[:4] == 'val_':
229 | metric_name = metric_name[4:]
230 | if metric_name == 'loss':
231 | # Special-case the overall loss.
232 | return 'min'
233 | try:
234 | metric = keras.metrics.get(metric_name)
235 | except ValueError:
236 | # Default to minimization for unknown metric.
237 | return 'min'
238 |
239 | # Metric class or function.
240 | if isinstance(metric, keras.metrics.Metric):
241 | name = metric.__class__.__name__
242 | if name == 'MeanMetricWrapper':
243 | name = metric._fn.__name__
244 | else:
245 | name = metric.__name__
246 |
247 | if name in _MAX_METRICS or name in _MAX_METRIC_FNS:
248 | return 'max'
249 | return 'min'
250 |
--------------------------------------------------------------------------------
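A short usage sketch of the metric tracking above (assumes `autorecsys` and TensorFlow are installed; the metric names and values are examples):

```python
from autorecsys.utils.metric import MetricsTracker, infer_metric_direction

print(infer_metric_direction('val_loss'))             # -> 'min'
print(infer_metric_direction('val_binary_accuracy'))  # -> 'max'

tracker = MetricsTracker()
tracker.update('val_mse', 0.80, step=0)    # auto-registers with direction 'min'
improved = tracker.update('val_mse', 0.75, step=1)
print(improved)                            # -> True (a new best value)
print(tracker.get_best_value('val_mse'))   # -> 0.75
print(tracker.get_best_step('val_mse'))    # -> 1

state = tracker.get_config()               # plain-dict state, checkpoint-friendly
restored = MetricsTracker.from_config(state)
print(restored.get_best_value('val_mse'))  # -> 0.75
```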
/docs/autogen.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 | import shutil
4 |
5 | import keras_autodoc
6 | import tutobooks
7 |
8 |
9 | PAGES = {
10 | 'preprocessor.md': [
11 | 'autorecsys.pipeline.preprocessor.BasePreprocessor',
12 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.format_dataset',
13 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.load_dataset',
14 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.transform_categorical',
15 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.transform_numerical',
16 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_hash_size',
17 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x',
18 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x_numerical',
19 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_x_categorical',
20 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_y',
21 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_numerical_count',
22 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.get_categorical_count',
23 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.split_data',
24 | 'autorecsys.pipeline.preprocessor.BasePreprocessor.preprocess',
25 | 'autorecsys.pipeline.preprocessor.AvazuPreprocessor',
26 | 'autorecsys.pipeline.preprocessor.AvazuPreprocessor.preprocess',
27 | 'autorecsys.pipeline.preprocessor.CriteoPreprocessor',
28 | 'autorecsys.pipeline.preprocessor.CriteoPreprocessor.preprocess',
29 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor',
30 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor.format_dataset',
31 | 'autorecsys.pipeline.preprocessor.NetflixPrizePreprocessor.preprocess',
32 | 'autorecsys.pipeline.preprocessor.MovielensPreprocessor',
33 | 'autorecsys.pipeline.preprocessor.MovielensPreprocessor.preprocess',
34 | ],
35 | 'node.md': [
36 | 'autorecsys.pipeline.node.Input',
37 | 'autorecsys.pipeline.node.Input.fit_transform',
38 | 'autorecsys.pipeline.node.Input.transform',
39 | 'autorecsys.pipeline.node.StructuredDataInput',
40 | 'autorecsys.pipeline.node.StructuredDataInput.get_state',
41 | 'autorecsys.pipeline.node.StructuredDataInput.set_state',
42 | 'autorecsys.pipeline.node.StructuredDataInput.update',
43 | 'autorecsys.pipeline.node.StructuredDataInput.infer_column_types',
44 | ],
45 | 'mapper.md': [
46 | 'autorecsys.pipeline.mapper.LatentFactorMapper',
47 | 'autorecsys.pipeline.mapper.LatentFactorMapper.get_state',
48 | 'autorecsys.pipeline.mapper.LatentFactorMapper.set_state',
49 | 'autorecsys.pipeline.mapper.LatentFactorMapper.build',
50 | 'autorecsys.pipeline.mapper.DenseFeatureMapper',
51 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.get_state',
52 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.set_state',
53 | 'autorecsys.pipeline.mapper.DenseFeatureMapper.build',
54 | 'autorecsys.pipeline.mapper.SparseFeatureMapper',
55 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.get_state',
56 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.set_state',
57 | 'autorecsys.pipeline.mapper.SparseFeatureMapper.build',
58 | ],
59 | 'interactor.md': [
60 | 'autorecsys.pipeline.interactor.RandomSelectInteraction',
61 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.get_state',
62 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.set_state',
63 | 'autorecsys.pipeline.interactor.RandomSelectInteraction.build',
64 | 'autorecsys.pipeline.interactor.ConcatenateInteraction',
65 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.get_state',
66 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.set_state',
67 | 'autorecsys.pipeline.interactor.ConcatenateInteraction.build',
68 | 'autorecsys.pipeline.interactor.InnerProductInteraction',
69 | 'autorecsys.pipeline.interactor.InnerProductInteraction.get_state',
70 | 'autorecsys.pipeline.interactor.InnerProductInteraction.set_state',
71 | 'autorecsys.pipeline.interactor.InnerProductInteraction.build',
72 | 'autorecsys.pipeline.interactor.ElementwiseInteraction',
73 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.get_state',
74 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.set_state',
75 | 'autorecsys.pipeline.interactor.ElementwiseInteraction.build',
76 | 'autorecsys.pipeline.interactor.MLPInteraction',
77 | 'autorecsys.pipeline.interactor.MLPInteraction.get_state',
78 | 'autorecsys.pipeline.interactor.MLPInteraction.set_state',
79 | 'autorecsys.pipeline.interactor.MLPInteraction.build',
80 | 'autorecsys.pipeline.interactor.HyperInteraction',
81 | 'autorecsys.pipeline.interactor.HyperInteraction.get_state',
82 | 'autorecsys.pipeline.interactor.HyperInteraction.set_state',
83 | 'autorecsys.pipeline.interactor.HyperInteraction.build',
84 | 'autorecsys.pipeline.interactor.FMInteraction',
85 | 'autorecsys.pipeline.interactor.FMInteraction.get_state',
86 | 'autorecsys.pipeline.interactor.FMInteraction.set_state',
87 | 'autorecsys.pipeline.interactor.FMInteraction.build',
88 | 'autorecsys.pipeline.interactor.CrossNetInteraction',
89 | 'autorecsys.pipeline.interactor.CrossNetInteraction.get_state',
90 | 'autorecsys.pipeline.interactor.CrossNetInteraction.set_state',
91 | 'autorecsys.pipeline.interactor.CrossNetInteraction.build',
92 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction',
93 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.get_state',
94 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.set_state',
95 | 'autorecsys.pipeline.interactor.SelfAttentionInteraction.build',
96 | ],
97 | 'optimizer.md': [
98 | 'autorecsys.pipeline.optimizer.RatingPredictionOptimizer',
99 | 'autorecsys.pipeline.optimizer.RatingPredictionOptimizer.build',
100 | 'autorecsys.pipeline.optimizer.CTRPredictionOptimizer',
101 | 'autorecsys.pipeline.optimizer.CTRPredictionOptimizer.build',
102 | ],
103 | 'recommender.md': [
104 | 'autorecsys.recommender.RPRecommender',
105 | 'autorecsys.recommender.CTRRecommender',
106 | ],
107 | 'auto_search.md': [
108 | 'autorecsys.auto_search.Search',
109 | 'autorecsys.auto_search.Search.search',
110 | 'autorecsys.auto_search.Search.predict',
111 | 'autorecsys.auto_search.Search.evaluate',
112 | ],
113 |
114 | }
115 |
116 |
117 | aliases_needed = [
118 | 'tensorflow.keras.callbacks.Callback',
119 | 'tensorflow.keras.losses.Loss',
120 | 'tensorflow.keras.metrics.Metric',
121 | 'tensorflow.data.Dataset'
122 | ]
123 |
124 |
125 | ROOT = 'http://autorecsys.com/'
126 |
127 | project_dir = pathlib.Path(__file__).resolve().parents[1]
128 |
129 | def py_to_nb_md(dest_dir):
130 | for file_path in os.listdir('py/'):
131 | dir_path = 'py'
132 | file_name = file_path
133 | py_path = os.path.join(dir_path, file_path)
134 | file_name_no_ext = os.path.splitext(file_name)[0]
135 | ext = os.path.splitext(file_name)[1]
136 |
137 | if ext != '.py':
138 | continue
139 |
140 | nb_path = os.path.join('ipynb', file_name_no_ext + '.ipynb')
141 | md_path = os.path.join(dest_dir, 'tutorial', file_name_no_ext + '.md')
142 |
143 | tutobooks.py_to_md(py_path, nb_path, md_path, 'templates/img')
144 |
145 | github_repo_dir = 'keras-team/autokeras/blob/master/docs/'
146 | with open(md_path, 'r') as md_file:
147 | button_lines = [
148 | ':material-link: '
149 | "[**View in Colab**](https://colab.research.google.com/github/"
150 | + github_repo_dir
151 | + "ipynb/"
152 | + file_name_no_ext + ".ipynb"
153 | + ") "
154 | # + '•'
155 | + ':octicons-octoface: '
156 | "[**GitHub source**](https://github.com/" + github_repo_dir + "py/"
157 | + file_name_no_ext + ".py)",
158 | "\n",
159 | ]
160 | md_content = ''.join(button_lines) + '\n' + md_file.read()
161 |
162 | with open(md_path, 'w') as md_file:
163 | md_file.write(md_content)
164 |
165 |
166 | def generate(dest_dir):
167 | template_dir = project_dir / 'docs' / 'templates'
168 | doc_generator = keras_autodoc.DocumentationGenerator(
169 | PAGES,
170 | 'https://github.com/datamllab/AutoRecSys',
171 | template_dir,
172 | project_dir / 'examples'
173 | )
174 | doc_generator.generate(dest_dir)
175 | readme = (project_dir / 'README.md').read_text()
176 | index = (template_dir / 'index.md').read_text()
177 | index = index.replace('{{autogenerated}}', readme[readme.find('##'):])
178 | (dest_dir / 'index.md').write_text(index, encoding='utf-8')
179 | # shutil.copyfile(project_dir / '.github' / 'CONTRIBUTING.md',
180 | # dest_dir / 'contributing.md')
181 |
182 | # py_to_nb_md(dest_dir)
183 |
184 |
185 | if __name__ == '__main__':
186 | generate(project_dir / 'docs' / 'sources')
187 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to MkDocs
2 |
3 | For full documentation visit [mkdocs.org](https://mkdocs.org).
4 |
5 | ## Commands
6 |
7 | * `mkdocs new [dir-name]` - Create a new project.
8 | * `mkdocs serve` - Start the live-reloading docs server.
9 | * `mkdocs build` - Build the documentation site.
10 | * `mkdocs help` - Print this help message.
11 |
12 | ## Project layout
13 |
14 | mkdocs.yml # The configuration file.
15 | docs/
16 | index.md # The documentation homepage.
17 | ... # Other markdown pages, images and other files.
18 |
--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: AutoRec
2 | theme:
3 | name: 'material'
4 | palette:
5 | primary: 'green'
6 | accent: 'green'
7 |
8 | docs_dir: sources
9 | repo_url: https://github.com/datamllab/AutoRecSys
10 | site_url: http://autokeras.org
11 | edit_uri: ""
12 | site_description: 'Documentation for AutoRec.'
13 | # google_analytics: ['UA-44322747-3', 'autokeras.com']
14 | markdown_extensions:
15 | - codehilite
16 | - pymdownx.superfences:
17 | custom_fences:
18 | - name: mermaid
19 | class: mermaid
20 | format: !!python/name:pymdownx.superfences.fence_div_format
21 | - pymdownx.emoji:
22 | emoji_index: !!python/name:materialx.emoji.twemoji
23 | emoji_generator: !!python/name:materialx.emoji.to_svg
24 | - admonition
25 |
26 | extra_javascript:
27 | - https://unpkg.com/mermaid@8.4.4/dist/mermaid.min.js
28 |
29 | nav:
30 | - Home: index.md
31 | - Installation: install.md
32 | - Documentation:
33 | - Preprocessor: preprocessor.md
34 | - Mapper: mapper.md
35 | - Node: node.md
36 | - Interactor: interactor.md
37 | - Optimizer: optimizer.md
38 | - Recommender: recommender.md
39 | - Auto Search: auto_search.md
40 | - About: about.md
41 |
--------------------------------------------------------------------------------
/docs/readme.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/docs/readme.md
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | keras-autodoc==0.5.1
2 | mkdocs
3 | mkdocs-material
4 | pygments
5 | jupyter
6 | pymdown-extensions
7 | Sphinx<3.1.0
8 |
--------------------------------------------------------------------------------
/docs/templates/about.md:
--------------------------------------------------------------------------------
1 | This package is developed by [DATA LAB](http://faculty.cs.tamu.edu/xiahu/) at Texas A&M University.
2 |
3 | ## Core Team
4 |
5 | [**Ting-Hsiang Wang**](https://github.com/thwang1231):
6 |
7 | [**Qingquan Song**](https://github.com/song3134):
8 |
9 | [**Xiaotian Han**](https://github.com/ahxt):
10 |
11 | [**Zirui Liu**](https://github.com/warai-otoko):
12 |
13 | [**Haifeng Jin**](https://github.com/haifeng-jin):
14 |
15 | [**Xia "Ben" Hu**](http://faculty.cs.tamu.edu/xiahu/):
16 | Project lead and maintainer.
17 |
18 |
--------------------------------------------------------------------------------
/docs/templates/benchmark.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/docs/templates/benchmark.md
--------------------------------------------------------------------------------
/docs/templates/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to AutoRec
2 |
3 | ## Abstract
4 |
5 | Realistic recommender systems are often required to adapt to ever-changing
6 | data and tasks or to explore different models systematically.
7 | To address the need, we present **AutoRec**, an open-source automated machine
8 | learning (AutoML) platform extended from the TensorFlow ecosystem and, to our
9 | knowledge, the first framework to leverage AutoML for model search and
10 | hyperparameter tuning in deep recommendation models.
11 |
12 | AutoRec also supports a highly flexible pipeline that accommodates both sparse
13 | and dense inputs, rating prediction and click-through rate (CTR) prediction
14 | tasks, and an array of recommendation models.
15 | Lastly, AutoRec provides a simple, user-friendly API.
16 |
17 | Experiments conducted on benchmark datasets show that AutoRec is reliable and
18 | can identify models which resemble the best model without prior knowledge.
19 |
20 |
--------------------------------------------------------------------------------
/docs/templates/install.md:
--------------------------------------------------------------------------------
1 | ## Requirements
2 |
3 | **Python 3**: Follow the TensorFlow install steps to install Python 3.
4 |
5 | **Pip**: Follow the TensorFlow install steps to install Pip.
6 |
7 | **Tensorflow >= 2.2.0**: AutoRec is based on TensorFlow.
8 | Please follow
9 | [this tutorial](https://www.tensorflow.org/install/pip) to install TensorFlow for python3.
10 |
11 | **GPU Setup (Optional)**:
12 | If you have GPUs on your machine and want to use them to accelerate the training,
13 | you can follow [this tutorial](https://www.tensorflow.org/install/gpu) to setup.
14 |
15 | ## Install AutoRec
16 |
17 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking
2 |
3 | Benchmarks for popular recommendation methods.
4 |
5 | ## Rating Prediction Task
6 |
7 | We adopt two datasets to evaluate our AutoRec.
8 |
9 | - **Movielens**: GroupLens Research has collected and made available rating datasets from the MovieLens web site (http://movielens.org). The datasets were collected over various periods of time, depending on the size of the set. In our experiments, we use different versions of this dataset.
10 | - **Netflix**: Netflix held the Netflix Prize open competition for the best algorithm to predict user ratings for films. The grand prize was $1,000,000 and was won by BellKor's Pragmatic Chaos team. This is the dataset that was used in that competition.
11 |
12 | The statistics of the datasets are as follows:
13 |
14 | |Dataset|#user|#item|#interaction|
15 | |---|---:|---:|---:|
16 | |[Movielens1M](#movielens1m)|6,040|3,900|1,000,209|
17 | |[Movielens10M](#movielens10m)|71,567|10,681|10,000,054|
18 | |[Movielens_latest](#movielens_latest)|283,228|58,098|27,753,444|
19 | |[Netflix](#netflix)|480,189|17,770|100,480,507|
20 |
21 |
22 | Some popular models for rating prediction:
23 |
24 | - **MF**: Matrix factorization is a class of collaborative filtering algorithms used in recommender systems. Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower-dimensional rectangular matrices (see the NumPy sketch after this list).
25 | - **MLP**: A Multi-Layer Perceptron collaborative filtering model that learns the user-item interaction function with stacked fully-connected layers.
26 | - **GMF**: Generalized Matrix Factorization, which generalizes MF with a learnable, weighted elementwise product of the user and item embeddings.
27 | - **NeuMF**: Neural Matrix Factorization, which fuses GMF and MLP to combine their strengths. To target implicit feedback and ranking tasks, these models are optimized using log loss with negative sampling.
28 | - **HP search**: Our AutoRec with hyperparameter search.
29 | - **Block search**: Our AutoRec with both block search and hyperparameter search.
30 |
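To make the MF bullet above concrete, here is a tiny self-contained NumPy sketch (illustrative only, not the AutoRec implementation): it factorizes a 3x3 rating matrix into user and item factors by gradient descent on the observed entries.

```python
# Factorize R into user factors U and item factors V so that U @ V.T
# approximates R on the observed entries (0 marks an unobserved rating).
import numpy as np

rng = np.random.default_rng(0)
R = np.array([[5., 3., 0.],
              [4., 0., 1.],
              [1., 1., 5.]])
mask = R > 0
k, lr, reg = 2, 0.01, 0.1          # latent dim, learning rate, L2 penalty
U = rng.normal(scale=0.1, size=(R.shape[0], k))
V = rng.normal(scale=0.1, size=(R.shape[1], k))

for _ in range(2000):
    E = mask * (R - U @ V.T)       # error on observed entries only
    U += lr * (E @ V - reg * U)    # gradient step on user factors
    V += lr * (E.T @ U - reg * V)  # gradient step on item factors

print(np.round(U @ V.T, 2))        # reconstruction, incl. predicted blanks
```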
31 | All benchmarks were run with our AutoRec package.
32 | The benchmark experiments were run on a machine with dual
33 | Intel Xeon E5-2630 v3 processors (8 cores each; 32 threads in total with
34 | hyperthreading) and one RTX 2080 Ti, running Ubuntu 16.04 with TensorFlow 2.1.0 and CUDA 10.0.
35 |
36 | We benchmark all models with a minibatch size of 256;
37 | this allows fair comparisons between different models.
38 |
39 | The following models are benchmarked:
40 |
41 | ## Movielens1M
42 | |Movielens1M|Val_MSE|Test_MSE|Time(s)|
43 | |---|---:|---:|---:|
44 | |MF_random|0.7553643584251404|0.7550543546676636|103.57773876190186|
45 | |MF_greedy|0.7503780722618103|0.7502530217170715|85.47167634963989|
46 | |MF_bayesian|0.7521297335624695|0.7517699599266052|1031.2954790592194|
47 | |MLP_random|0.7676995396614075|0.7681054472923279|1383.5030148029327|
48 | |MLP_greedy|0.769902765750885|0.7706407308578491|1292.7048692703247|
49 | |MLP_bayesian|0.758850634098053|0.7597852945327759|1353.2627713680267|
50 | |NeuMF_random|0.7707042694091797|0.7720282077789307|1025.5578093528748|
51 | |NeuMF_greedy|0.7517987489700317|0.7520723342895508|1276.7933542728424|
52 | |NeuMF_bayesian|0.7721487879753113|0.7723560333251953|1098.1503052711487|
53 | |AutoRec_random|0.7500635981559753|0.749731719493866|1577.6531774997711|
54 | |AutoRec_greedy|0.7496007084846497|0.7510735392570496|1689.560632944107|
55 | |AutoRec_bayesian|0.7484513521194458|0.7494882345199585|5405.682264328003|
56 |
57 |
58 | ## Movielens10M
59 | |Movielens10M|Val_MSE|Test_MSE|Time(s)|
60 | |---|---:|---:|---:|
61 | |MF_random|0.6472423672676086|0.6456527709960938|795.4746537208557|
62 | |MF_greedy|0.6473642587661743|0.6467021107673645|838.2489671707153|
63 | |MF_bayesian|0.6490539312362671|0.6481097936630249|7755.805980920792|
64 | |MLP_random||||
65 | |MLP_greedy|0.6532657742500305|0.652294397354126|10709.204501867294|
66 | |MLP_bayesian||||
67 | |NeuMF_random|0.6536459922790527|0.6527888774871826|16713.71854186058|
68 | |NeuMF_greedy|0.6541951298713684|0.6537747979164124|11205.822769880295|
69 | |NeuMF_bayesian|0.650793194770813|0.6504989862442017|15727.56122994423|
70 | |AutoRec_random||||
71 | |AutoRec_greedy||||
72 | |AutoRec_bayesian||||
73 |
74 |
75 |
76 |
77 | ## Movielens_latest
78 | |Movielens_latest|Val_MSE|Test_MSE|Time(s)|
79 | |---|---:|---:|---:|
80 | |MF_random|0.6520289182662964|0.6528090238571167|68519.18232417107|
81 | |MF_greedy||||
82 | |MF_bayesian||||
83 | |MLP_random||||
84 | |MLP_greedy||||
85 | |MLP_bayesian||||
86 | |NeuMF_random||||
87 | |NeuMF_greedy|0.6434351801872253|0.6440964937210083|56383.871745824814|
88 | |NeuMF_bayesian||||
89 | |AutoRec_random|0.6365838050842285|0.6371557712554932|133145.96114301682|
90 | |AutoRec_greedy||||
91 | |AutoRec_bayesian|0.6448036432266235|0.6453331708908081|133532.19134521484|
92 |
93 |
94 | ## Netflix
95 | |Netflix|Val_MSE|Test_MSE|Time(s)|
96 | |---|---:|---:|---:|
97 | |MF_random|0.7473645806312561|0.74784255027771|8169.921831846237|
98 | |MF_greedy|0.7397633790969849|0.7402286529541016|8646.685072422028|
99 | |MF_bayesian|0.7282611727714539|0.7287141680717468|82759.47434949875|
100 | |MLP_random|0.7549719214439392| 0.7553735971450806|59066.82922792435|
101 | |MLP_greedy|0.7648082375526428|0.7652896046638489|56700.0296475887|
102 | |MLP_bayesian|0.7546935081481934|0.755224347114563|46708.42347598076|
103 | |NeuMF_random|0.7073774337768555|0.7063089609146118|50333.9074454409|
104 | |NeuMF_greedy|0.6434351801872253|0.6440964937210083|56383.871745824814|
105 | |NeuMF_bayesian|0.7060461044311523|0.706568717956543|73228.66933822632|
106 | |AutoRec_random|0.6365838050842285|0.6371557712554932|133145.96114301682|
107 | |AutoRec_greedy|0.739780068397522|0.7401751279830933|105307.948792696|
108 | |AutoRec_bayesian|0.6448036432266235|0.6453331708908081|133532.19134521484|
109 |
110 |
111 | MSE and MAE are the mean squared error and mean absolute error.
112 |
113 | Time is the total training time for the baseline models, and the total search-and-training time for the automated models.
114 |
115 |
116 |
117 | ## Click-Through Rate Task
118 |
119 | We adopt two datasets to evaluate our AutoRec.
120 |
121 | - **Criteo**: Display advertising is a billion dollar effort and one of the central uses of machine learning on the Internet. However, its data and methods are usually kept under lock and key. In this research competition, CriteoLabs is sharing a week’s worth of data for you to develop models predicting ad click-through rate (CTR). Given a user and the page he is visiting, what is the probability that he will click on a given ad?
122 | - **Avazu**: For this competition, we have provided 11 days worth of Avazu data to build and test prediction models. Can you find a strategy that beats standard classification algorithms? The winning models from this competition will be released under an open-source license.
123 |
124 | The statistics of the datasets are as follows:
125 |
126 | |Dataset|#user|#item|#interaction|
127 | |---|---:|---:|---:|
128 | |[Movielens](#Movielens)|10000|10000|10000|
129 |
130 |
131 | |Dataset|#dense field|#sparse field|#instance|
132 | |---|---:|---:|---:|
133 | |[Criteo](#Criteo)|10000|10000|10000|
134 | |[Avazu](#Avazu)|10000|10000|10000|
135 |
136 |
137 | Some popular models for CTR prediction:
138 |
139 | - **NeuMF**: Neural Matrix Factorization, which fuses Generalized Matrix Factorization (GMF) and a Multi-Layer Perceptron (MLP). To target implicit feedback and ranking tasks, it is optimized using log loss with negative sampling.
140 | - **HP search**: Our AutoRec with hyperparameter search.
141 | - **Block search**: Our AutoRec with both block search and hyperparameter search.
142 |
143 | All benchmarks were run with our AutoRec package.
144 | The benchmark experiments were run on a machine with dual
145 | Intel Xeon E5-2630 v3 processors (8 cores each; 32 threads in total with
146 | hyperthreading) and one RTX 2080 Ti, running Ubuntu 16.04 with TensorFlow 2.1.0 and CUDA 10.0.
147 |
148 | We benchmark all models with a minibatch size of 256; this allows fair comparisons between different models.
149 | The following models are benchmarked:
150 |
151 | ## Movielens
152 | |Movielens|logloss|AUC|Time(s)|
153 | |---|---:|---:|---:|
154 | |MF|0.0000|0.0000|0.0000|
155 | |GMF|0.0000|0.0000|0.0000|
156 | |MLP|0.0000|0.0000|0.0000|
157 | |NeuMF|0.0000|0.0000|0.0000|
158 | |AutoRec_random|0.0000|0.0000|0.0000|
159 | |AutoRec_bayesian|0.0000|0.0000|0.0000|
160 | |AutoRec_hyperband|0.0000|0.0000|0.0000|
161 |
162 | ## Criteo
163 | |Criteo|logloss|AUC|Time(s)|
164 | |---|---:|---:|---:|
165 | |FM|0.0000|0.0000|0.0000|
166 | |AutoRec_random|0.0000|0.0000|0.0000|
167 | |AutoRec_bayesian|0.0000|0.0000|0.0000|
168 | |AutoRec_hyperband|0.0000|0.0000|0.0000|
169 |
170 |
171 | ## Avazu
172 | |Avazu|logloss|AUC|Time(s)|
173 | |---|---:|---:|---:|
174 | |FM|0.0000|0.0000|0.0000|
175 | |AutoRec_random|0.0000|0.0000|0.0000|
176 | |AutoRec_bayesian|0.0000|0.0000|0.0000|
177 | |AutoRec_hyperband|0.0000|0.0000|0.0000|
178 |
179 | Logloss and AUC are the binary cross-entropy loss and the area under the receiver operating characteristic curve (ROC AUC) score.
180 |
181 | Time is the total training time for the baseline models, and the total search-and-training time for the automated models.
--------------------------------------------------------------------------------
/examples/ctr_autoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 | import logging
7 | import tensorflow as tf
8 | from autorecsys.auto_search import Search
9 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, SelfAttentionInteraction,\
10 | MLPInteraction, CTRPredictionOptimizer
11 | from autorecsys.recommender import CTRRecommender
12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
13 |
14 |
15 | # logging setting
16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17 | # Note: logging.basicConfig takes effect only once; set level=logging.DEBUG above for verbose output.
18 | logger = logging.getLogger(__name__)
19 |
20 | # Step 1: Preprocess data
21 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset
22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
26 | numerical_count = criteo.get_numerical_count()
27 | categorical_count = criteo.get_categorical_count()
28 | hash_size = criteo.get_hash_size()
29 |
30 | # Step 2: Build the recommender, which provides search space
31 | # Step 2.1: Setup mappers to handle inputs
32 | dense_input_node = Input(shape=[numerical_count])
33 | sparse_input_node = Input(shape=[categorical_count])
34 | dense_feat_emb = DenseFeatureMapper(
35 | num_of_fields=numerical_count,
36 | embedding_dim=2)(dense_input_node)
37 | sparse_feat_emb = SparseFeatureMapper(
38 | num_of_fields=categorical_count,
39 | hash_size=hash_size,
40 | embedding_dim=2)(sparse_input_node)
41 |
42 | # Step 2.2: Setup interactors to handle models
43 | attention_output = SelfAttentionInteraction()([dense_feat_emb, sparse_feat_emb])
44 | bottom_mlp_output = MLPInteraction()([dense_feat_emb])
45 | top_mlp_output = MLPInteraction()([attention_output, bottom_mlp_output])
46 |
47 | # Step 2.3: Setup optimizer to handle the target task
48 | output = CTRPredictionOptimizer()(top_mlp_output)
49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output)
50 |
51 | # Step 3: Build the searcher, which provides search algorithm
52 | searcher = Search(model=model,
53 | tuner='random',
54 | tuner_params={'max_trials': 2, 'overwrite': True},
55 | )
56 |
57 | # Step 4: Use the searcher to search the recommender
58 | searcher.search(x=[train_X_numerical, train_X_categorical],
59 | y=train_y,
60 | x_val=[val_X_numerical, val_X_categorical],
61 | y_val=val_y,
62 | objective='val_BinaryCrossentropy',
63 | batch_size=10000,
64 | epochs=2,
65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
66 | )
67 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical],
68 | y_true=val_y)))
69 |
70 | # Step 5: Evaluate the searched model
71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical],
72 | y_true=test_y)))
73 |
--------------------------------------------------------------------------------
/examples/ctr_autorec.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, HyperInteraction, CTRPredictionOptimizer
11 | from autorecsys.recommender import CTRRecommender
12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
13 |
14 |
15 | # logging setting
16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17 | # Note: logging.basicConfig takes effect only once; set level=logging.DEBUG above for verbose output.
18 | logger = logging.getLogger(__name__)
19 |
20 | # Step 1: Preprocess data
21 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset
22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
26 | numerical_count = criteo.get_numerical_count()
27 | categorical_count = criteo.get_categorical_count()
28 | hash_size = criteo.get_hash_size()
29 |
30 | # Step 2: Build the recommender, which provides search space
31 | # Step 2.1: Setup mappers to handle inputs
32 | dense_input_node = Input(shape=[numerical_count])
33 | sparse_input_node = Input(shape=[categorical_count])
34 | dense_feat_emb = DenseFeatureMapper(
35 | num_of_fields=numerical_count,
36 | embedding_dim=2)(dense_input_node)
37 | sparse_feat_emb = SparseFeatureMapper(
38 | num_of_fields=categorical_count,
39 | hash_size=hash_size,
40 | embedding_dim=2)(sparse_input_node)
41 |
42 | # Step 2.2: Setup interactors to handle models
43 | sparse_feat_bottom_output = HyperInteraction(meta_interactor_num=2)([sparse_feat_emb])
44 | dense_feat_bottom_output = HyperInteraction(meta_interactor_num=2)([dense_feat_emb])
45 | hyper_output = HyperInteraction(meta_interactor_num=2)([sparse_feat_bottom_output, dense_feat_bottom_output])
46 |
47 | # Step 2.3: Setup optimizer to handle the target task
48 | output = CTRPredictionOptimizer()(hyper_output)
49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output)
50 |
51 | # Step 3: Build the searcher, which provides search algorithm
52 | searcher = Search(model=model,
53 | tuner='random',
54 | tuner_params={'max_trials': 2, 'overwrite': True},
55 | )
56 |
57 | # Step 4: Use the searcher to search the recommender
58 | searcher.search(x=[train_X_numerical, train_X_categorical],
59 | y=train_y,
60 | x_val=[val_X_numerical, val_X_categorical],
61 | y_val=val_y,
62 | objective='val_BinaryCrossentropy',
63 | batch_size=10000,
64 | epochs=2,
65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
66 | )
67 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical],
68 | y_true=val_y)))
69 |
70 | # Step 5: Evaluate the searched model
71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical],
72 | y_true=test_y)))
73 |
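74 | # A minimal variation sketch: to try another search algorithm, only the tuner changes; the
75 | # repo also ships 'greedy' and 'bayesian' tuners (see autorecsys/searcher/tuners), e.g.:
76 | #   searcher = Search(model=model, tuner='bayesian', tuner_params={'max_trials': 2, 'overwrite': True})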
--------------------------------------------------------------------------------
/examples/ctr_benchmark.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import argparse
5 | import time
6 | import os
7 |
8 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
9 |
10 | import logging
11 | import tensorflow as tf
12 | from autorecsys.auto_search import Search
13 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, FMInteraction, MLPInteraction, \
14 | CrossNetInteraction, SelfAttentionInteraction, HyperInteraction, InnerProductInteraction, CTRPredictionOptimizer
15 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor, AvazuPreprocessor
16 | from autorecsys.recommender import CTRRecommender
17 |
18 | # logging setting
19 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 |
24 | def build_dlrm(emb_dict):
25 | if 'user' in emb_dict or 'item' in emb_dict:
26 | emb_list = [emb for _, emb in emb_dict.items()]
27 | output = MLPInteraction(num_layers=2)(emb_list)
28 | else:
29 | sparse_feat_mlp_output = [MLPInteraction()([emb_dict['sparse']])] if 'sparse' in emb_dict else []
30 | dense_feat_mlp_output = [MLPInteraction()([emb_dict['dense']])] if 'dense' in emb_dict else []
31 | output = MLPInteraction(num_layers=2)(sparse_feat_mlp_output + dense_feat_mlp_output)
32 | return output
33 |
34 |
35 | def build_deepfm(emb_dict):
36 | if 'user' in emb_dict or 'item' in emb_dict:
37 | emb_list = [emb for _, emb in emb_dict.items()]
38 | fm_output = [FMInteraction()(emb_list)]
39 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)]
40 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output)
41 | else:
42 | fm_output = [FMInteraction()([emb_dict['sparse']])] if 'sparse' in emb_dict else []
43 | bottom_mlp_output = [MLPInteraction()([emb_dict['dense']])] if 'dense' in emb_dict else []
44 | output = MLPInteraction(num_layers=2)(fm_output + bottom_mlp_output)
45 | return output
46 |
47 |
48 | def build_crossnet(emb_dict):
49 | if 'user' in emb_dict or 'item' in emb_dict:
50 | emb_list = [emb for _, emb in emb_dict.items()]
51 | crossnet_output = [CrossNetInteraction()(emb_list)]
52 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)]
53 | output = MLPInteraction(num_layers=2)(crossnet_output + bottom_mlp_output)
54 | else:
55 | crossnet_output = [CrossNetInteraction()([emb_dict['sparse']])] if 'sparse' in emb_dict else []
56 | bottom_mlp_output = [MLPInteraction()([emb_dict['dense']])] if 'dense' in emb_dict else []
57 | output = MLPInteraction(num_layers=2)(crossnet_output + bottom_mlp_output)
58 | return output
59 |
60 |
61 | def build_autoint(emb_dict):
62 | if 'user' in emb_dict or 'item' in emb_dict:
63 | emb_list = [emb for _, emb in emb_dict.items()]
64 | attention_output = [SelfAttentionInteraction()(emb_list)]
65 | bottom_mlp_output = [MLPInteraction(num_layers=2)(emb_list)]
66 | output = MLPInteraction(num_layers=2)(attention_output + bottom_mlp_output)
67 | else:
68 | attention_output = [SelfAttentionInteraction()([emb_dict['sparse']])] if 'sparse' in emb_dict else []
69 | bottom_mlp_output = [MLPInteraction()([emb_dict['dense']])] if 'dense' in emb_dict else []
70 | output = MLPInteraction(num_layers=2)(attention_output + bottom_mlp_output)
71 | return output
72 |
73 |
74 | def build_neumf(emb_dict):
75 | emb_list = [emb for _, emb in emb_dict.items()]
76 | innerproduct_output = [InnerProductInteraction()(emb_list)]
77 | mlp_output = [MLPInteraction(num_layers=2)(emb_list)]
78 | output = innerproduct_output + mlp_output
79 | return output
80 |
81 |
82 | def build_autorec(emb_dict):
83 | if 'user' in emb_dict or 'item' in emb_dict:
84 | emb_list = [emb for _, emb in emb_dict.items()]
85 | output = HyperInteraction()(emb_list)
86 | else:
87 | sparse_feat_bottom_output = [HyperInteraction(meta_interactor_num=2)([emb_dict['sparse']])] if 'sparse' in emb_dict else []
88 | dense_feat_bottom_output = [HyperInteraction(meta_interactor_num=2)([emb_dict['dense']])] if 'dense' in emb_dict else []
89 | top_mlp_output = HyperInteraction(meta_interactor_num=2)(sparse_feat_bottom_output + dense_feat_bottom_output)
90 | output = HyperInteraction(meta_interactor_num=2)([top_mlp_output])
91 | return output
92 |
93 |
94 | if __name__ == '__main__':
95 | # parse args
96 | parser = argparse.ArgumentParser()
97 | parser.add_argument('-model', type=str, help='input a model name', default='dlrm')
98 | parser.add_argument('-data', type=str, help='dataset name', default="avazu")
99 | parser.add_argument('-data_path', type=str, help='dataset path', default='./example_datasets/avazu/train-10k')
100 | parser.add_argument('-sep', type=str, help='dataset sep')
101 | parser.add_argument('-search', type=str, help='input a search method name', default='random')
102 | parser.add_argument('-batch_size', type=int, help='batch size', default=256)
103 | parser.add_argument('-trials', type=int, help='try number', default=2)
104 | parser.add_argument('-gpu_index', type=int, help='the index of gpu to use', default=0)
105 | args = parser.parse_args()
106 | print("args:", args)
107 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
108 |
109 | if args.sep is None:
110 | args.sep = '::'
111 |
112 | if args.data == "avazu":
113 | # Step 1: Preprocess data
114 | avazu = AvazuPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1)
115 | train_X, train_y, val_X, val_y, test_X, test_y = avazu.preprocess()
116 | train_X_categorical = avazu.get_x_categorical(train_X)
117 | val_X_categorical = avazu.get_x_categorical(val_X)
118 | test_X_categorical = avazu.get_x_categorical(test_X)
119 | categorical_count = avazu.get_categorical_count()
120 | hash_size = avazu.get_hash_size()
121 |
122 | # Step 2: Build the recommender, which provides search space
123 | # Step 2.1: Setup mappers to handle inputs
124 | # dense_input_node = None
125 | sparse_input_node = Input(shape=[categorical_count])
126 | input = [sparse_input_node]
127 |
128 | # dense_feat_emb = None
129 | sparse_feat_emb = SparseFeatureMapper(
130 | num_of_fields=categorical_count,
131 | hash_size=hash_size,
132 | embedding_dim=64)(sparse_input_node)
133 |
134 | emb_dict = {'sparse': sparse_feat_emb}
135 |
136 | elif args.data == "criteo":
137 | # Step 1: Preprocess data
138 | criteo = CriteoPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1)
139 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
140 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
141 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
142 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
143 | numerical_count = criteo.get_numerical_count()
144 | categorical_count = criteo.get_categorical_count()
145 | hash_size = criteo.get_hash_size()
146 |
147 | # Step 2: Build the recommender, which provides search space
148 | # Step 2.1: Setup mappers to handle inputs
149 | dense_input_node = Input(shape=[numerical_count])
150 | sparse_input_node = Input(shape=[categorical_count])
151 | input = [dense_input_node, sparse_input_node]
152 |
153 | dense_feat_emb = DenseFeatureMapper(
154 | num_of_fields=numerical_count,
155 | embedding_dim=64)(dense_input_node)
156 |
157 | sparse_feat_emb = SparseFeatureMapper(
158 | num_of_fields=categorical_count,
159 | hash_size=hash_size,
160 | embedding_dim=64)(sparse_input_node)
161 |
162 | emb_dict = {'dense': dense_feat_emb, 'sparse': sparse_feat_emb}
163 |
164 | # Step 2.2: Setup interactors to handle models
165 | if args.model == 'dlrm':
166 | output = build_dlrm(emb_dict)
167 | elif args.model == 'deepfm':
168 | output = build_deepfm(emb_dict)
169 | elif args.model == 'crossnet':
170 | output = build_crossnet(emb_dict)
171 | elif args.model == 'autoint':
172 | output = build_autoint(emb_dict)
173 | elif args.model == 'autorec':
174 | output = build_autorec(emb_dict)
175 |
176 | # Step 2.3: Setup optimizer to handle the target task
177 | output = CTRPredictionOptimizer()(output)
178 | model = CTRRecommender(inputs=input, outputs=output)
179 |
180 | # Step 3: Build the searcher, which provides search algorithm
181 | searcher = Search(model=model,
182 | tuner=args.search,
183 | tuner_params={'max_trials': args.trials, 'overwrite': True}
184 | )
185 |
186 | # Step 4: Use the searcher to search the recommender
187 | start_time = time.time()
188 | searcher.search(x=train_X,
189 | y=train_y,
190 | x_val=val_X,
191 | y_val=val_y,
192 | objective='val_BinaryCrossentropy',
193 | batch_size=args.batch_size,
194 | epochs=1,
195 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
196 | )
197 | end_time = time.time()
198 | print("running time:", end_time - start_time)
199 | print("args", args)
200 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=val_X,
201 | y_true=val_y)))
202 |
203 | # Step 5: Evaluate the searched model
204 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=test_X,
205 | y_true=test_y)))
206 |
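207 | # Example invocation (a sketch; the flags correspond to the argparse options defined above):
208 | #   python ctr_benchmark.py -model deepfm -data criteo \
209 | #       -data_path ./example_datasets/criteo/train-10k.txt -search random -batch_size 256 -trials 2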
--------------------------------------------------------------------------------
/examples/ctr_crossnet.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, CrossNetInteraction, MLPInteraction,\
11 | CTRPredictionOptimizer
12 | from autorecsys.recommender import CTRRecommender
13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
14 |
15 |
16 | # logging setting
17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | # Step 1: Preprocess data
22 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset
23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
24 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
25 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
26 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
27 | numerical_count = criteo.get_numerical_count()
28 | categorical_count = criteo.get_categorical_count()
29 | hash_size = criteo.get_hash_size()
30 |
31 | # Step 2: Build the recommender, which provides search space
32 | # Step 2.1: Setup mappers to handle inputs
33 | dense_input_node = Input(shape=[numerical_count])
34 | sparse_input_node = Input(shape=[categorical_count])
35 | dense_feat_emb = DenseFeatureMapper(
36 | num_of_fields=numerical_count,
37 | embedding_dim=2)(dense_input_node)
38 | sparse_feat_emb = SparseFeatureMapper(
39 | num_of_fields=categorical_count,
40 | hash_size=hash_size,
41 | embedding_dim=2)(sparse_input_node)
42 |
43 | # Step 2.2: Setup interactors to handle models
44 | crossnet_output = CrossNetInteraction()([dense_feat_emb, sparse_feat_emb])
45 | bottom_mlp_output = MLPInteraction()([dense_feat_emb])
46 | top_mlp_output = MLPInteraction()([crossnet_output, bottom_mlp_output])
47 |
48 | # Step 2.3: Setup optimizer to handle the target task
49 | output = CTRPredictionOptimizer()(top_mlp_output)
50 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output)
51 |
52 | # Step 3: Build the searcher, which provides search algorithm
53 | searcher = Search(model=model,
54 | tuner='random',
55 | tuner_params={'max_trials': 2, 'overwrite': True},
56 | )
57 |
58 | # Step 4: Use the searcher to search the recommender
59 | searcher.search(x=[train_X_numerical, train_X_categorical],
60 | y=train_y,
61 | x_val=[val_X_numerical, val_X_categorical],
62 | y_val=val_y,
63 | objective='val_BinaryCrossentropy',
64 | batch_size=10000,
65 | epochs=2,
66 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
67 | )
68 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical],
69 | y_true=val_y)))
70 |
71 | # Step 5: Evaluate the searched model
72 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical],
73 | y_true=test_y)))
74 |
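75 | # Note: CrossNetInteraction provides the explicit feature-crossing branch (in the spirit of
76 | # Deep & Cross Network), the MLP branch models implicit interactions, and the top MLP fuses both.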
--------------------------------------------------------------------------------
/examples/ctr_deepfm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, FMInteraction, MLPInteraction,\
11 | CTRPredictionOptimizer
12 | from autorecsys.recommender import CTRRecommender
13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
14 |
15 |
16 | # logging setting
17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | # Step 1: Preprocess data
22 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset
23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
24 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
25 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
26 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
27 | numerical_count = criteo.get_numerical_count()
28 | categorical_count = criteo.get_categorical_count()
29 | hash_size = criteo.get_hash_size()
30 |
31 | # Step 2: Build the recommender, which provides search space
32 | # Step 2.1: Setup mappers to handle inputs
33 | dense_input_node = Input(shape=[numerical_count])
34 | sparse_input_node = Input(shape=[categorical_count])
35 | dense_feat_emb = DenseFeatureMapper(
36 | num_of_fields=numerical_count,
37 | embedding_dim=2)(dense_input_node)
38 | sparse_feat_emb = SparseFeatureMapper(
39 | num_of_fields=categorical_count,
40 | hash_size=hash_size,
41 | embedding_dim=2)(sparse_input_node)
42 |
43 | # Step 2.2: Setup interactors to handle models
44 | fm_output = FMInteraction()([sparse_feat_emb])
45 | bottom_mlp_output = MLPInteraction()([dense_feat_emb])
46 | top_mlp_output = MLPInteraction()([fm_output, bottom_mlp_output])
47 |
48 | # Step 2.3: Setup optimizer to handle the target task
49 | output = CTRPredictionOptimizer()(top_mlp_output)
50 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output)
51 |
52 | # Step 3: Build the searcher, which provides search algorithm
53 | searcher = Search(model=model,
54 | tuner='random',
55 | tuner_params={'max_trials': 2, 'overwrite': True},
56 | )
57 |
58 | # Step 4: Use the searcher to search the recommender
59 | searcher.search(x=[train_X_numerical, train_X_categorical],
60 | y=train_y,
61 | x_val=[val_X_numerical, val_X_categorical],
62 | y_val=val_y,
63 | objective='val_BinaryCrossentropy',
64 | batch_size=10000,
65 | epochs=2,
66 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
67 | )
68 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical],
69 | y_true=val_y)))
70 |
71 | # Step 5: Evaluate the searched model
72 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical],
73 | y_true=test_y)))
74 |
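75 | # Note: this mirrors the DeepFM layout: the FM branch consumes the sparse embeddings, the
76 | # bottom MLP consumes the dense embeddings, and the top MLP combines both before the CTR head.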
--------------------------------------------------------------------------------
/examples/ctr_dlrm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, DenseFeatureMapper, SparseFeatureMapper, MLPInteraction, CTRPredictionOptimizer
11 | from autorecsys.recommender import CTRRecommender
12 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
13 |
14 |
15 | # logging setting
16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 | # Step 1: Preprocess data
21 | criteo = CriteoPreprocessor() # the default arguments are set up to preprocess the Criteo example dataset
22 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
23 | train_X_numerical, train_X_categorical = criteo.get_x_numerical(train_X), criteo.get_x_categorical(train_X)
24 | val_X_numerical, val_X_categorical = criteo.get_x_numerical(val_X), criteo.get_x_categorical(val_X)
25 | test_X_numerical, test_X_categorical = criteo.get_x_numerical(test_X), criteo.get_x_categorical(test_X)
26 | numerical_count = criteo.get_numerical_count()
27 | categorical_count = criteo.get_categorical_count()
28 | hash_size = criteo.get_hash_size()
29 |
30 | # Step 2: Build the recommender, which provides search space
31 | # Step 2.1: Setup mappers to handle inputs
32 | dense_input_node = Input(shape=[numerical_count])
33 | sparse_input_node = Input(shape=[categorical_count])
34 | dense_feat_emb = DenseFeatureMapper(
35 | num_of_fields=numerical_count,
36 | embedding_dim=2)(dense_input_node)
37 | sparse_feat_emb = SparseFeatureMapper(
38 | num_of_fields=categorical_count,
39 | hash_size=hash_size,
40 | embedding_dim=2)(sparse_input_node)
41 |
42 | # Step 2.2: Setup interactors to handle models
43 | sparse_feat_mlp_output = MLPInteraction()([sparse_feat_emb])
44 | dense_feat_mlp_output = MLPInteraction()([dense_feat_emb])
45 | top_mlp_output = MLPInteraction(num_layers=2)([sparse_feat_mlp_output, dense_feat_mlp_output])
46 |
47 | # Step 2.3: Setup optimizer to handle the target task
48 | output = CTRPredictionOptimizer()(top_mlp_output)
49 | model = CTRRecommender(inputs=[dense_input_node, sparse_input_node], outputs=output)
50 |
51 | # Step 3: Build the searcher, which provides search algorithm
52 | searcher = Search(model=model,
53 | tuner='random',
54 | tuner_params={'max_trials': 2, 'overwrite': True},
55 | )
56 |
57 | # Step 4: Use the searcher to search the recommender
58 | searcher.search(x=[train_X_numerical, train_X_categorical],
59 | y=train_y,
60 | x_val=[val_X_numerical, val_X_categorical],
61 | y_val=val_y,
62 | objective='val_BinaryCrossentropy',
63 | batch_size=10000,
64 | epochs=2,
65 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
66 | )
67 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[val_X_numerical, val_X_categorical],
68 | y_true=val_y)))
69 |
70 | # Step 5: Evaluate the searched model
71 | logger.info('Test Accuracy (logloss): {}'.format(searcher.evaluate(x=[test_X_numerical, test_X_categorical],
72 | y_true=test_y)))
73 |
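74 | # Note: this follows the DLRM pattern: separate bottom MLPs embed the sparse and dense
75 | # features, and a top MLP consumes their outputs before the CTR prediction head.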
--------------------------------------------------------------------------------
/examples/ctr_neumf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "6"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, LatentFactorMapper, MLPInteraction, InnerProductInteraction, \
11 | CTRPredictionOptimizer
12 | from autorecsys.recommender import CTRRecommender
13 | from autorecsys.pipeline.preprocessor import CriteoPreprocessor
14 |
15 |
16 | # logging setting
17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | # load dataset
22 | criteo = CriteoPreprocessor() # automatically set up for preprocessing the Criteo dataset
23 | train_X, train_y, val_X, val_y, test_X, test_y = criteo.preprocess()
24 |
25 | # build the pipeline.
26 | input = Input(shape=[criteo.get_categorical_count()])
27 | user_emb_gmf = LatentFactorMapper(column_id=0,
28 | num_of_entities=10000,
29 | embedding_dim=64)(input)
30 | item_emb_gmf = LatentFactorMapper(column_id=1,
31 | num_of_entities=10000,
32 | embedding_dim=64)(input)
33 |
34 | user_emb_mlp = LatentFactorMapper(column_id=0,
35 | num_of_entities=10000,
36 | embedding_dim=64)(input)
37 | item_emb_mlp = LatentFactorMapper(column_id=1,
38 | num_of_entities=10000,
39 | embedding_dim=64)(input)
40 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf])
41 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp])
42 | output = CTRPredictionOptimizer()([innerproduct_output, mlp_output])
43 | model = CTRRecommender(inputs=input, outputs=output)
44 |
45 | # AutoML search and predict.
46 | searcher = Search(model=model,
47 | tuner='random',
48 | tuner_params={'max_trials': 10, 'overwrite': True},
49 | )
50 | searcher.search(x=[criteo.get_x_categorical(train_X)],
51 | y=train_y,
52 | x_val=[criteo.get_x_categorical(val_X)],
53 | y_val=val_y,
54 | objective='val_BinaryCrossentropy',
55 | batch_size=256,
56 | epochs=20,
57 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)]
58 | )
59 | logger.info('Predicted CTR: {}'.format(searcher.predict(x=[criteo.get_x_categorical(val_X)])))
60 | logger.info('Validation Accuracy (logloss): {}'.format(searcher.evaluate(x=[criteo.get_x_categorical(val_X)], y_true=val_y)))
61 |
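62 | # Note: the two embedding pairs above form the two NeuMF towers: a GMF tower (inner product)
63 | # and an MLP tower. num_of_entities=10000 is a demo-sized cap here, not a value derived from
64 | # the dataset; other examples size it with the preprocessor's hash counts instead.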
--------------------------------------------------------------------------------
/examples/rp_autorec.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, HyperInteraction
11 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor
12 | from autorecsys.recommender import RPRecommender
13 |
14 | # logging setting
15 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 | # Step 1: Preprocess data
20 | movielens = MovielensPreprocessor()
21 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess()
22 | train_X_categorical = movielens.get_x_categorical(train_X)
23 | val_X_categorical = movielens.get_x_categorical(val_X)
24 | test_X_categorical = movielens.get_x_categorical(test_X)
25 | user_num, item_num = movielens.get_hash_size()
26 |
27 | # Step 2: Build the recommender, which provides search space
28 | # Step 2.1: Setup mappers to handle inputs
29 | input = Input(shape=[2])
30 | user_emb = LatentFactorMapper(column_id=0,
31 | num_of_entities=user_num,
32 | embedding_dim=64)(input)
33 | item_emb = LatentFactorMapper(column_id=1,
34 | num_of_entities=item_num,
35 | embedding_dim=64)(input)
36 |
37 | # Step 2.2: Setup interactors to handle models
38 | output1 = HyperInteraction()([user_emb, item_emb])
39 | output2 = HyperInteraction()([output1, user_emb, item_emb])
40 | output3 = HyperInteraction()([output1, output2, user_emb, item_emb])
41 | output4 = HyperInteraction()([output1, output2, output3, user_emb, item_emb])
42 |
43 | # Step 2.3: Setup optimizer to handle the target task
44 | output = RatingPredictionOptimizer()(output4)
45 | model = RPRecommender(inputs=input, outputs=output)
46 |
47 | # Step 3: Build the searcher, which provides search algorithm
48 | searcher = Search(model=model,
49 | tuner='random',
50 | tuner_params={'max_trials': 2, 'overwrite': True},)
51 |
52 | # Step 4: Use the searcher to search the recommender
53 | searcher.search(x=[train_X_categorical],
54 | y=train_y,
55 | x_val=[val_X_categorical],
56 | y_val=val_y,
57 | objective='val_mse',
58 | batch_size=1024,
59 | epochs=1,
60 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
61 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical,
62 | y_true=val_y)))
63 |
64 | # Step 5: Evaluate the searched model
65 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical,
66 | y_true=test_y)))
67 |
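68 | # Note: chaining HyperInteraction blocks as above lets the searcher compose interactions of
69 | # interactions, so the architecture search space deepens with each stacked block.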
--------------------------------------------------------------------------------
/examples/rp_benchmark.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import argparse
5 | import time
6 | import os
7 | import sys
8 | # os.environ["CUDA_VISIBLE_DEVICES"] = "5"
9 |
10 | import logging
11 | # logging setting
12 | logging.basicConfig(stream=sys.stdout,
13 | level=logging.DEBUG,
14 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15 | # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | import tensorflow as tf
20 | from autorecsys.auto_search import Search
21 | from autorecsys.pipeline import Input, LatentFactorMapper, RatingPredictionOptimizer, HyperInteraction, MLPInteraction,\
22 | InnerProductInteraction
23 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor
24 | from autorecsys.recommender import RPRecommender
25 |
26 |
27 |
28 |
29 | def build_mf(user_num, item_num):
30 | input = Input(shape=[2])
31 | user_emb = LatentFactorMapper(column_id=0,
32 | num_of_entities=user_num,
33 | embedding_dim=64)(input)
34 | item_emb = LatentFactorMapper(column_id=1,
35 | num_of_entities=item_num,
36 | embedding_dim=64)(input)
37 | output = InnerProductInteraction()([user_emb, item_emb])
38 | output = RatingPredictionOptimizer()(output)
39 | model = RPRecommender(inputs=input, outputs=output)
40 | return model
41 |
42 |
43 | def build_gmf(user_num, item_num):
44 | input = Input(shape=[2])
45 | user_emb = LatentFactorMapper(column_id=0,
46 | num_of_entities=user_num,
47 | embedding_dim=64)(input)
48 | item_emb = LatentFactorMapper(column_id=1,
49 | num_of_entities=item_num,
50 | embedding_dim=64)(input)
51 | output = InnerProductInteraction()([user_emb, item_emb])
52 | output = RatingPredictionOptimizer()(output)
53 | model = RPRecommender(inputs=input, outputs=output)
54 | return model
55 |
56 |
57 | def build_mlp(user_num, item_num):
58 | input = Input(shape=[2])
59 | user_emb_mlp = LatentFactorMapper(column_id=0,
60 | num_of_entities=user_num,
61 | embedding_dim=64)(input)
62 | item_emb_mlp = LatentFactorMapper(column_id=1,
63 | num_of_entities=item_num,
64 | embedding_dim=64)(input)
65 | output = MLPInteraction()([user_emb_mlp, item_emb_mlp])
66 | output = RatingPredictionOptimizer()(output)
67 | model = RPRecommender(inputs=input, outputs=output)
68 | return model
69 |
70 |
71 | def build_neumf(user_num, item_num):
72 | input = Input(shape=[2])
73 | user_emb_gmf = LatentFactorMapper(column_id=0,
74 | num_of_entities=user_num,
75 | embedding_dim=64)(input)
76 | item_emb_gmf = LatentFactorMapper(column_id=1,
77 | num_of_entities=item_num,
78 | embedding_dim=64)(input)
79 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf])
80 |
81 | user_emb_mlp = LatentFactorMapper(column_id=0,
82 | num_of_entities=user_num,
83 | embedding_dim=64)(input)
84 | item_emb_mlp = LatentFactorMapper(column_id=1,
85 | num_of_entities=item_num,
86 | embedding_dim=64)(input)
87 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp])
88 |
89 | output = RatingPredictionOptimizer()([innerproduct_output, mlp_output])
90 | model = RPRecommender(inputs=input, outputs=output)
91 | return model
92 |
93 |
94 | def build_autorec(user_num, item_num):
95 | input = Input(shape=[2])
96 | user_emb_1 = LatentFactorMapper(column_id=0,
97 | num_of_entities=user_num,
98 | embedding_dim=64)(input)
99 | item_emb_1 = LatentFactorMapper(column_id=1,
100 | num_of_entities=item_num,
101 | embedding_dim=64)(input)
102 |
103 | user_emb_2 = LatentFactorMapper(column_id=0,
104 | num_of_entities=user_num,
105 | embedding_dim=64)(input)
106 | item_emb_2 = LatentFactorMapper(column_id=1,
107 | num_of_entities=item_num,
108 | embedding_dim=64)(input)
109 |
110 | output = HyperInteraction()([user_emb_1, item_emb_1, user_emb_2, item_emb_2])
111 | output = RatingPredictionOptimizer()(output)
112 | model = RPRecommender(inputs=input, outputs=output)
113 | return model
114 |
115 |
116 | if __name__ == '__main__':
117 | # parse args
118 | parser = argparse.ArgumentParser()
119 | parser.add_argument('-model', type=str, help='input a model name')
120 | parser.add_argument('-data', type=str, help='dataset name')
121 | parser.add_argument('-data_path', type=str, help='dataset path')
122 | parser.add_argument('-sep', type=str, help='dataset sep')
123 | parser.add_argument('-search', type=str, help='input a search method name')
124 | parser.add_argument('-batch_size', type=int, help='batch size')
125 | parser.add_argument('-epochs', type=int, help='epochs')
126 | parser.add_argument('-early_stop', type=int, help='early stop')
127 | parser.add_argument('-trials', type=int, help='try number')
128 | args = parser.parse_args()
129 |
130 | if args.sep is None:
131 | args.sep = '::'
132 |
133 | # Step 1: Preprocess data
134 | if args.data == "ml":
135 | data = MovielensPreprocessor(csv_path=args.data_path, validate_percentage=0.1, test_percentage=0.1)
136 | train_X, train_y, val_X, val_y, test_X, test_y = data.preprocess()
137 | train_X_categorical = data.get_x_categorical(train_X)
138 | val_X_categorical = data.get_x_categorical(val_X)
139 | test_X_categorical = data.get_x_categorical(test_X)
140 | user_num, item_num = data.get_hash_size()
141 |
142 | # Step 2: Build the recommender, which provides search space
143 |
144 | if args.model == 'mf':
145 | model = build_mf(user_num, item_num)
146 | elif args.model == 'mlp':
147 | model = build_mlp(user_num, item_num)
148 | elif args.model == 'gmf':
149 | model = build_gmf(user_num, item_num)
150 | elif args.model == 'neumf':
151 | model = build_neumf(user_num, item_num)
152 | elif args.model == 'autorec':
153 | model = build_autorec(user_num, item_num)
154 |
155 | # Step 3: Build the searcher, which provides search algorithm
156 | searcher = Search(model=model,
157 | tuner=args.search,
158 | tuner_params={'max_trials': args.trials, 'overwrite': True}
159 | )
160 |
161 | # Step 4: Use the searcher to search the recommender
162 | start_time = time.time()
163 | searcher.search(x=train_X_categorical,
164 | y=train_y,
165 | x_val=val_X_categorical,
166 | y_val=val_y,
167 | objective='val_mse',
168 | batch_size=args.batch_size,
169 | epochs=args.epochs,
170 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=args.early_stop)])
171 | end_time = time.time()
172 | print("Runing time:", end_time - start_time)
173 | print("Args", args)
174 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical,
175 | y_true=val_y)))
176 |
177 | # Step 5: Evaluate the searched model
178 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical,
179 | y_true=test_y)))
180 |
181 |
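182 | # Example invocation (a sketch; the flags correspond to the argparse options defined above):
183 | #   python rp_benchmark.py -model mf -data ml -data_path ./example_datasets/movielens/ratings-10k.dat \
184 | #       -search random -batch_size 1024 -epochs 1 -early_stop 1 -trials 2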
--------------------------------------------------------------------------------
/examples/rp_mf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 |
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "5"
7 |
8 | import tensorflow as tf
9 | # gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
10 | # for gpu in gpus:
11 | # tf.config.experimental.set_memory_growth(gpu, True)
12 | # import tensorflow as tf
13 | # physical_devices = tf.config.list_physical_devices('GPU')
14 | # tf.config.experimental.set_memory_growth(physical_devices[0], True)
15 |
16 | import logging
17 | from autorecsys.auto_search import Search
18 | from autorecsys.pipeline import Input, LatentFactorMapper, InnerProductInteraction, RatingPredictionOptimizer
19 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor
20 | from autorecsys.recommender import RPRecommender
21 |
22 | # logging setting
23 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 | # load dataset
28 | ## Netflix Dataset
29 | # dataset_paths = ["./examples/datasets/netflix-prize-data/combined_data_" + str(i) + ".txt" for i in range(1, 5)]
30 | # data = NetflixPrizePreprocessor(dataset_paths)
31 |
32 | # Step 1: Preprocess data
33 | movielens = MovielensPreprocessor()
34 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess()
35 | train_X_categorical = movielens.get_x_categorical(train_X)
36 | val_X_categorical = movielens.get_x_categorical(val_X)
37 | test_X_categorical = movielens.get_x_categorical(test_X)
38 | user_num, item_num = movielens.get_hash_size()
39 |
40 | # Step 2: Build the recommender, which provides search space
41 | # Step 2.1: Setup mappers to handle inputs
42 | input = Input(shape=[2])
43 | user_emb = LatentFactorMapper(column_id=0,
44 | num_of_entities=user_num,
45 | embedding_dim=64)(input)
46 | item_emb = LatentFactorMapper(column_id=1,
47 | num_of_entities=item_num,
48 | embedding_dim=64)(input)
49 |
50 | # Step 2.2: Setup interactors to handle models
51 | output = InnerProductInteraction()([user_emb, item_emb])
52 |
53 | # Step 2.3: Setup optimizer to handle the target task
54 | output = RatingPredictionOptimizer()(output)
55 | model = RPRecommender(inputs=input, outputs=output)
56 |
57 | # Step 3: Build the searcher, which provides search algorithm
58 | searcher = Search(model=model,
59 | tuner='greedy', # hyperband, greedy, bayesian
60 | tuner_params={"max_trials": 5}
61 | )
62 |
63 | # Step 4: Use the searcher to search the recommender
64 | searcher.search(x=[train_X_categorical],
65 | y=train_y,
66 | x_val=[val_X_categorical],
67 | y_val=val_y,
68 | objective='val_mse',
69 | batch_size=1024,
70 | epochs=10,
71 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
72 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical,
73 | y_true=val_y)))
74 |
75 | # Step 5: Evaluate the searched model
76 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical,
77 | y_true=test_y)))
78 |
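79 | # Note: Input(shape=[2]) reflects the (user_id, item_id) pairs produced by the preprocessor;
80 | # LatentFactorMapper selects column 0 (users) and column 1 (items) from that input.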
--------------------------------------------------------------------------------
/examples/rp_neumf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function, unicode_literals
3 |
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "6"
6 |
7 | import logging
8 | import tensorflow as tf
9 | from autorecsys.auto_search import Search
10 | from autorecsys.pipeline import Input, LatentFactorMapper, MLPInteraction, InnerProductInteraction,\
11 | RatingPredictionOptimizer
12 | from autorecsys.pipeline.preprocessor import MovielensPreprocessor
13 | from autorecsys.recommender import RPRecommender
14 |
15 | # logging setting
16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 | # load dataset
22 | ## Netflix Dataset
22 | # dataset_paths = ["./examples/datasets/netflix-prize-data/combined_data_" + str(i) + ".txt" for i in range(1, 5)]
23 | # data = NetflixPrizePreprocessor(dataset_paths)
24 |
25 | # Step 1: Preprocess data
26 | movielens = MovielensPreprocessor()
27 | train_X, train_y, val_X, val_y, test_X, test_y = movielens.preprocess()
28 | train_X_categorical = movielens.get_x_categorical(train_X)
29 | val_X_categorical = movielens.get_x_categorical(val_X)
30 | test_X_categorical = movielens.get_x_categorical(test_X)
31 | user_num, item_num = movielens.get_hash_size()
32 |
33 | # Step 2: Build the recommender, which provides search space
34 | # Step 2.1: Setup mappers to handle inputs
35 | input = Input(shape=[2])
36 | user_emb_gmf = LatentFactorMapper(column_id=0,
37 | num_of_entities=user_num,
38 | embedding_dim=64)(input)
39 | item_emb_gmf = LatentFactorMapper(column_id=1,
40 | num_of_entities=item_num,
41 | embedding_dim=64)(input)
42 | user_emb_mlp = LatentFactorMapper(column_id=0,
43 | num_of_entities=user_num,
44 | embedding_dim=64)(input)
45 | item_emb_mlp = LatentFactorMapper(column_id=1,
46 | num_of_entities=item_num,
47 | embedding_dim=64)(input)
48 |
49 | # Step 2.2: Setup interactors to handle models
50 | innerproduct_output = InnerProductInteraction()([user_emb_gmf, item_emb_gmf])
51 | mlp_output = MLPInteraction()([user_emb_mlp, item_emb_mlp])
52 |
53 | # Step 2.3: Setup optimizer to handle the target task
54 | output = RatingPredictionOptimizer()([innerproduct_output, mlp_output])
55 | model = RPRecommender(inputs=input, outputs=output)
56 |
57 | # Step 3: Build the searcher, which provides search algorithm
58 | searcher = Search(model=model,
59 | tuner='greedy', # random, greedy
60 | tuner_params={"max_trials": 5, 'overwrite': True}
61 | )
62 |
63 | # Step 4: Use the searcher to search the recommender
64 | searcher.search(x=[train_X_categorical],
65 | y=train_y,
66 | x_val=[val_X_categorical],
67 | y_val=val_y,
68 | objective='val_mse',
69 | batch_size=1024,
70 | epochs=1,
71 | callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)])
72 | logger.info('Validation Accuracy (mse): {}'.format(searcher.evaluate(x=val_X_categorical,
73 | y_true=val_y)))
74 |
75 | # Step 5: Evaluate the searched model
76 | logger.info('Test Accuracy (mse): {}'.format(searcher.evaluate(x=test_X_categorical,
77 | y_true=test_y)))
78 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: AutoRecSys
2 | pages:
3 | - Home: index.md
4 | - About: about.md
5 | theme: readthedocs
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.17.3
2 | pandas==0.25.2
3 | pytest==5.2.2
4 | scikit-learn==0.21.3
5 | scipy>=1.4.1
6 | tabulate==0.8.5
7 | tensorboard>=2.2.0
8 | tensorflow-gpu==2.4.0
9 | termcolor==1.1.0
10 | terminaltables==3.1.0
11 | tqdm==4.36.1
12 | colorama==0.4.3
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 |
5 |
6 | setup(
7 | name='autorec',
8 | version='0.0.2',
9 | description='AutoRec: An Automated Recommender System',
10 | author='DATA Lab@Texas A&M University',
11 | author_email='thwang1231@tamu.edu',
12 | url='https://github.com/datamllab/AutoRec.git',
13 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
14 | # package_data={
15 | # 'tods': ['resources/.entry_points.ini',
16 | # 'resources/.requirements.txt',
17 | # 'resources/default_pipeline.json'
18 | # ]
19 | # },
20 | install_requires=[
21 | 'numpy>=1.17.3',
22 | 'pandas==0.25.2',
23 | 'pytest==5.2.2',
24 | 'scikit-learn==0.21.3',
25 | 'scipy>=1.4.1',
26 | 'tabulate==0.8.5',
27 | 'tensorboard>=2.2.0',
28 | 'tensorflow-gpu==2.4.0',
29 | 'termcolor==1.1.0',
30 | 'terminaltables==3.1.0',
31 | 'tqdm==4.36.1',
32 | 'colorama==0.4.3',
33 | ],
34 |
35 | )
36 |
37 |
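38 | # For local development, a standard setuptools editable install works from the repo root:
39 | #   pip install -e .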
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/__init__.py
--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/common.py
--------------------------------------------------------------------------------
/tests/integration_tests.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from autorecsys.auto_search import Search
4 |
5 | @pytest.fixture(scope='module')
6 | def tmp_dir(tmpdir_factory):
7 | return tmpdir_factory.mktemp('integration_test')
8 |
9 |
10 | def test_Search(tmp_dir):
11 | # TODO
12 | pass
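13 |
14 | # This suite is a placeholder; once filled in, it can be run with pytest from the repo root:
15 | #   pytest tests/integration_tests.py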
--------------------------------------------------------------------------------
/tests/integration_tests/test_models.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import os
4 | import logging
5 | import unittest
6 |
7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU
8 | os.chdir("../../examples")
9 |
10 | logger = logging.getLogger(__name__)
11 | # tf.random.set_seed(1)
12 |
13 |
14 | class CTRTestModels(unittest.TestCase):
15 |
16 | def setUp(self):
17 | super(CTRTestModels, self).setUp()
18 | self.ctr_model = {'autoint': 'ctr_autoint.py',
19 | 'autorec': 'ctr_autorec.py',
20 | 'crossnet': 'ctr_crossnet.py',
21 | 'deepfm': 'ctr_deepfm.py',
22 | 'dlrm': 'ctr_dlrm.py',
23 | 'neumf': 'ctr_neumf.py'}
24 |
25 | def test_ctr_autoint(self):
26 | """
27 | Test class in ctr_autoint.py
28 | """
29 | try:
30 | exec(open(self.ctr_model['autoint']).read())
31 | except RuntimeError:
32 | assert False, 'Runtime Error'
33 |
34 | def test_ctr_autorec(self):
35 | """
36 | Test class in ctr_autorec.py
37 | """
38 | try:
39 | exec(open(self.ctr_model['autorec']).read())
40 | except RuntimeError:
41 | assert False, 'Runtime Error'
42 |
43 | def test_ctr_crossnet(self):
44 | """
45 | Test class in ctr_crossnet.py
46 | """
47 | try:
48 | exec(open(self.ctr_model['crossnet']).read())
49 | except RuntimeError:
50 | assert False, 'Runtime Error'
51 |
52 | def test_ctr_deepfm(self):
53 | """
54 | Test class in ctr_deepfm.py
55 | """
56 | try:
57 | exec(open(self.ctr_model['deepfm']).read())
58 | except RuntimeError:
59 | assert False, 'Runtime Error'
60 |
61 | def test_ctr_dlrm(self):
62 | """
63 | Test class in ctr_dlrm.py
64 | """
65 | try:
66 | exec(open(self.ctr_model['dlrm']).read())
67 | except RuntimeError:
68 | assert False, 'Runtime Error'
69 |
70 | def test_ctr_neumf(self):
71 | """
72 | Test class in ctr_neumf.py
73 | """
74 | try:
75 | exec(open(self.ctr_model['neumf']).read())
76 | except RuntimeError:
77 | assert False, 'Runtime Error'
78 |
79 |
80 | class RPTestModels(unittest.TestCase):
81 |
82 | def setUp(self):
83 | super(RPTestModels, self).setUp()
84 | self.rp_model = {'autorec': 'rp_autorec.py',
85 | 'mf': 'rp_mf.py',
86 | 'neumf': 'rp_neumf.py'}
87 |
88 | def test_rp_autorec(self):
89 | """
90 | Test class in rp_autorec.py
91 | """
92 | try:
93 | exec(open(self.rp_model['autorec']).read())
94 | except RuntimeError:
95 | assert False, 'Runtime Error'
96 |
97 | def test_rp_mf(self):
98 | """
99 | Test class in rp_mf.py
100 | """
101 | try:
102 | exec(open(self.rp_model['mf']).read())
103 | except RuntimeError:
104 | assert False, 'Runtime Error'
105 |
106 | def test_rp_neumf(self):
107 | """
108 | Test class in rp_neumf.py
109 | """
110 | try:
111 | exec(open(self.rp_model['neumf']).read())
112 | except RuntimeError:
113 | assert False, 'Runtime Error'
114 |
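115 | # Note: the os.chdir("../../examples") at the top means these tests assume they are launched
116 | # from this directory, e.g.:
117 | #   cd tests/integration_tests && python -m unittest test_models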
--------------------------------------------------------------------------------
/tests/pipeline_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/pipeline_tests/__init__.py
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_graph.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tensorflow as tf
3 | from autorecsys.searcher.core import hyperparameters as hp_module
4 |
5 | from autorecsys.pipeline import Input, MLPInteraction, ConcatenateInteraction, RatingPredictionOptimizer
6 | from autorecsys.pipeline import graph as graph_module
7 |
8 | # TODO: we don't support overwrite hp for graph now.
9 | # def test_set_hp():
10 | # input_node = Input((32,))
11 | # output_node = input_node
12 | # output_node = MLPInteraction()(output_node)
13 | # output_node = RatingPredictionOptimizer()(output_node)
14 |
15 | # graph = graph_module.HyperGraph(
16 | # input_node,
17 | # output_node,
18 | # override_hps=[hp_module.Choice('dense_block_1/num_layers', [6], default=6)])
19 | # hp = hp_module.HyperParameters()
20 | # plain_graph = graph.hyper_build(hp)
21 | # plain_graph.build_keras_graph().build(hp)
22 |
23 | # for single_hp in hp.space:
24 | # if single_hp.name == 'dense_block_1/num_layers':
25 | # assert len(single_hp.values) == 1
26 | # assert single_hp.values[0] == 6
27 | # return
28 | # assert False
29 |
30 |
31 | def test_input_output_disconnect():
32 | input_node1 = Input()
33 | output_node = input_node1
34 | _ = MLPInteraction()(output_node)
35 |
36 | input_node = Input()
37 | output_node = input_node
38 | output_node = MLPInteraction()(output_node)
39 | output_node = RatingPredictionOptimizer()(output_node)
40 |
41 | with pytest.raises(ValueError) as info:
42 | graph_module.HyperGraph(input_node1, output_node)
43 | assert 'Inputs and outputs not connected.' in str(info.value)
44 |
45 |
46 | # def test_hyper_graph_cycle():
47 | # input_node1 = Input()
48 | # input_node2 = Input()
49 | # output_node1 = MLPInteraction()(input_node1)
50 | # output_node2 = MLPInteraction()(input_node2)
51 | # output_node = ConcatenateInteraction()([output_node1, output_node2])
52 | # head = RatingPredictionOptimizer()
53 | # output_node = head(output_node)
54 | # head.outputs = output_node1
55 |
56 | # with pytest.raises(ValueError) as info:
57 | # graph_module.HyperGraph([input_node1, input_node2], output_node)
58 | # assert 'The network has a cycle.' in str(info.value)
59 |
60 | # TODO: this test criterion may be problematic
61 | def test_input_missing():
62 | input_node1 = Input()
63 | input_node2 = Input()
64 | output_node1 = MLPInteraction()(input_node1)
65 | output_node2 = MLPInteraction()(input_node2)
66 | output_node = ConcatenateInteraction()([output_node1, output_node2])
67 | output_node = RatingPredictionOptimizer()(output_node)
68 |
69 | with pytest.raises(ValueError) as info:
70 | graph_module.HyperGraph(input_node1, output_node)
71 | assert 'A required input is missing for HyperModel' in str(info.value)
72 |
73 |
74 | def test_graph_basics():
75 | input_node = Input(shape=(30,))
76 | output_node = input_node
77 | output_node = MLPInteraction()(output_node)
78 | output_node = RatingPredictionOptimizer()(output_node)
79 |
80 | graph = graph_module.PlainGraph(input_node, output_node)
81 | model = graph.build_keras_graph().build(hp_module.HyperParameters())
82 | assert model.input_shape == (None, 30)
83 | assert model.output_shape == (None, )
84 |
85 |
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_mapper.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import os
4 | import logging
5 | import pytest
6 | import unittest
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | from autorecsys.pipeline.mapper import (
12 | LatentFactorMapper,
13 | DenseFeatureMapper,
14 | SparseFeatureMapper
15 | )
16 | from autorecsys.searcher.core import hyperparameters as hp_module
17 | from tensorflow.python.util import nest
18 |
19 |
20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
25 | class TestMappers(unittest.TestCase):
26 | @pytest.fixture(autouse=True)
27 | def initdir(self, tmpdir):
28 | tmpdir.chdir() # change to pytest-provided temporary directory
29 | tmpdir.join("test_mapper.ini").write("# testdata")
30 |
31 | def setUp(self):
32 | super(TestMappers, self).setUp()
33 | self.column_id = 1
34 | self.input_shape = 13
35 | self.batch = 2
36 | self.embed_dim = 8
37 | self.tensor_inputs = [tf.random.uniform([self.batch, self.input_shape])] # standard input type
38 | self.df_inputs = pd.DataFrame(np.random.rand(self.batch, self.input_shape)) # for ease of computing the hash size
39 |
40 | def test_LatentFactorMapper(self):
41 | # test constructor and get_state
42 | p = {
43 | 'column_id': 0,
44 | 'num_of_entities': 3,
45 | 'embedding_dim': 4}
46 | mapper = LatentFactorMapper(**p)
47 | sol_get_state = {
48 | 'name': 'latent_factor_mapper_1',
49 | 'column_id': 0,
50 | 'num_of_entities': 3,
51 | 'embedding_dim': 4}
52 | assert mapper.get_state() == sol_get_state
53 |
54 | # test set_state
55 | p = {
56 | 'column_id': self.column_id,
57 | 'num_of_entities': 10,
58 | 'embedding_dim': self.embed_dim}
59 | sol_set_state = {
60 | 'name': 'latent_factor_mapper_1',
61 | 'column_id': self.column_id,
62 | 'num_of_entities': 10,
63 | 'embedding_dim': self.embed_dim}
64 | mapper.set_state(p)
65 | ans_set_state = mapper.get_state()
66 | assert ans_set_state == sol_set_state
67 |
68 | # test build
69 | hp = hp_module.HyperParameters()
70 | output = mapper.build(hp, self.tensor_inputs)
71 | assert len(nest.flatten(output)) == 1
72 | assert output.shape == (self.batch, self.embed_dim) # LatentFactorMapper output has no field (input-shape) dimension
73 |
74 | def test_DenseFeatureMapper(self):
75 | # test constructor and get_state
76 | p = {
77 | 'num_of_fields': 10,
78 | 'embedding_dim': 4}
79 | mapper = DenseFeatureMapper(**p)
80 | sol_get_state = {
81 | 'name': 'dense_feature_mapper_1',
82 | 'num_of_fields': 10,
83 | 'embedding_dim': 4}
84 | assert mapper.get_state() == sol_get_state
85 |
86 | # test set_state
87 | p = {
88 | 'num_of_fields': self.input_shape,
89 | 'embedding_dim': self.embed_dim}
90 | sol_set_state = {
91 | 'name': 'dense_feature_mapper_1',
92 | 'num_of_fields': self.input_shape,
93 | 'embedding_dim': self.embed_dim}
94 | mapper.set_state(p)
95 | ans_set_state = mapper.get_state()
96 | assert ans_set_state == sol_set_state
97 |
98 | # test build
99 | hp = hp_module.HyperParameters()
100 | output = mapper.build(hp, self.tensor_inputs) # Act
101 | assert len(nest.flatten(output)) == 1
102 | assert output.shape == (self.batch, self.input_shape, self.embed_dim)
103 |
104 | def test_SparseFeatureMapper(self):
105 | # test constructor and get_state
106 | p = {
107 | 'num_of_fields': 10,
108 | 'hash_size': [2, 4, 10],
109 | 'embedding_dim': 4}
110 | mapper = SparseFeatureMapper(**p)
111 | sol_get_state = {
112 | 'name': 'sparse_feature_mapper_1',
113 | 'num_of_fields': 10,
114 | 'hash_size': [2, 4, 10],
115 | 'embedding_dim': 4}
116 | assert mapper.get_state() == sol_get_state
117 |
118 | # test set_state
119 | hash_size = self.df_inputs.nunique().tolist()
120 | p = {
121 | 'num_of_fields': self.input_shape,
122 | 'hash_size': hash_size,
123 | 'embedding_dim': self.embed_dim}
124 | sol_set_state = {
125 | 'name': 'sparse_feature_mapper_1',
126 | 'num_of_fields': self.input_shape,
127 | 'hash_size': hash_size,
128 | 'embedding_dim': self.embed_dim}
129 | mapper.set_state(p)
130 | ans_set_state = mapper.get_state()
131 | assert ans_set_state == sol_set_state
132 |
133 | # test build
134 | hp = hp_module.HyperParameters()
135 | tensor_inputs = [tf.convert_to_tensor(self.df_inputs.values)]
136 | mapper = SparseFeatureMapper(**p)
137 | output = mapper.build(hp, tensor_inputs) # Act
138 | assert len(nest.flatten(output)) == 1
139 | assert output.shape == (self.batch, self.input_shape, self.embed_dim)
140 |
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_node.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | import numpy as np
4 | import pytest
5 | import tensorflow as tf
6 |
7 | from autorecsys.pipeline import node
8 |
9 |
10 | def test_input_type_error():
11 | x = 'unknown'
12 | input_node = node.Input()
13 | with pytest.raises(TypeError) as info:
14 | input_node._check(x)
15 | x = input_node.transform(x)
16 | assert 'Expect the data to Input to be numpy' in str(info.value)
17 |
18 |
19 | def test_input_numerical():
20 | x = np.array([[['unknown']]])
21 | input_node = node.Input()
22 | with pytest.raises(TypeError) as info:
23 | input_node._check(x)
24 | x = input_node.transform(x)
25 | assert 'Expect the data to Input to be numerical' in str(info.value)
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_optimizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import os
4 | import logging
5 | import pytest
6 | import unittest
7 | import tensorflow as tf
8 | from autorecsys.pipeline.optimizer import (
9 | CTRPredictionOptimizer,
10 | RatingPredictionOptimizer,
11 | )
12 | from autorecsys.searcher.core import hyperparameters as hp_module
13 | from tensorflow.python.util import nest
14 |
15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | class TestOptimizers(unittest.TestCase):
21 | @pytest.fixture(autouse=True)
22 | def initdir(self, tmpdir):
23 | tmpdir.chdir() # change to pytest-provided temporary directory
24 | tmpdir.join("test_optimizer.ini").write("# testdata")
25 |
26 | def setUp(self):
27 | super(TestOptimizers, self).setUp()
28 | self.batch = 2
29 | self.emb = 4
30 | self.inputs = [tf.random.uniform([self.batch, self.emb], dtype=tf.float32),
31 | tf.random.uniform([self.batch, self.emb], dtype=tf.float32)]
32 |
33 | def test_RatingPredictionOptimizer(self):
34 | hp = hp_module.HyperParameters()
35 | optimizer = RatingPredictionOptimizer()
36 | output = optimizer.build(hp, self.inputs)
37 | assert len(nest.flatten(output)) == 1
38 | assert output.shape == (self.batch,)
39 |
40 | def test_CTRPredictionOptimizer(self):
41 | hp = hp_module.HyperParameters() # Arrange
42 | optimizer = CTRPredictionOptimizer()
43 | output = optimizer.build(hp, self.inputs) # Act
44 | assert len(tf.nest.flatten(output)) == 1 # Assert
45 | assert output.shape == (self.batch, 1)
46 |
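Both optimizer blocks consume a list of embedding tensors and differ only in their output head, as the shape assertions show. A minimal sketch using the same calls as the tests:

    import tensorflow as tf
    from autorecsys.pipeline.optimizer import CTRPredictionOptimizer, RatingPredictionOptimizer
    from autorecsys.searcher.core import hyperparameters as hp_module

    inputs = [tf.random.uniform([2, 4]), tf.random.uniform([2, 4])]
    hp = hp_module.HyperParameters()
    ratings = RatingPredictionOptimizer().build(hp, inputs)  # shape (batch,): one score per example
    clicks = CTRPredictionOptimizer().build(hp, inputs)      # shape (batch, 1): one click score per example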
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_preprocessor.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 | from sklearn.utils import shuffle
3 |
4 | import os
5 | import random
6 | import functools
7 | import logging
8 | import pytest
9 | import unittest
10 |
11 | import math
12 | import pandas as pd
13 | import numpy as np
14 | import tensorflow as tf
15 |
16 | from autorecsys.pipeline.preprocessor import BasePreprocessor, NetflixPrizePreprocessor, CriteoPreprocessor, AvazuPreprocessor, MovielensPreprocessor
17 |
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 | # resolve the dataset directory relative to this test file so the data is found no matter where the tests are run
22 | current_directory = os.path.dirname(os.path.abspath(__file__))
23 | dataset_directory = os.path.join(
24 | current_directory, '../../examples/example_datasets')
25 |
26 |
27 | class DummyPreprocessor(BasePreprocessor):
28 | """ Dummy class for testing base functions """
29 |
30 | def __init__(self,
31 | data_df=None,
32 | non_csv_path=None,
33 | csv_path=None,
34 | header=0,
35 | columns=None,
36 | delimiter='\t',
37 | filler=0.0,
38 | dtype_dict=None, # inferred in load_data()
39 | ignored_columns=None,
40 | target_column='rating',
41 | numerical_columns=None,
42 | categorical_columns=None,
43 | categorical_filter=0, # all categories are counted
44 | fit_dictionary_path=None,
45 | transform_path=None,
46 | test_percentage=0.1,
47 | validate_percentage=0.1,
48 | train_path=None,
49 | validate_path=None,
50 | test_path=None):
51 |
52 | if columns is None:
53 | columns = range(3)
54 | if dtype_dict is None:
55 | dtype_dict = {}
56 | if ignored_columns is None:
57 | ignored_columns = []
58 | if numerical_columns is None:
59 | numerical_columns = ['num_people']
60 | if categorical_columns is None:
61 | categorical_columns = ['user_id']
62 |
63 | super().__init__(non_csv_path=non_csv_path,
64 | csv_path=csv_path,
65 | header=header,
66 | delimiter=delimiter,
67 | filler=filler,
68 | dtype_dict=dtype_dict,
69 | columns=columns,
70 | ignored_columns=ignored_columns,
71 | target_column=target_column,
72 | numerical_columns=numerical_columns,
73 | categorical_columns=categorical_columns,
74 | categorical_filter=categorical_filter,
75 | fit_dictionary_path=fit_dictionary_path,
76 | transform_path=transform_path,
77 | test_percentage=test_percentage,
78 | validate_percentage=validate_percentage,
79 | train_path=train_path,
80 | validate_path=validate_path,
81 | test_path=test_path)
82 | self.data_df = data_df
83 |
84 | def preprocess(self):
85 | return []
86 |
87 |
88 | class TestPreprocessors(unittest.TestCase):
89 | @pytest.fixture(autouse=True)
90 | def initdir(self, tmpdir):
91 | tmpdir.chdir() # change to pytest-provided temporary directory
92 | tmpdir.join("test_preprocessor.ini").write("# testdata")
93 |
94 | def setUp(self):
95 | super(TestPreprocessors, self).setUp()
96 |
97 | column_names = ["user_id", "num_people", "rating"]
98 | tabular_data = np.array([
99 | [1, 1, 1], [1, 2, 1], [1, 3, 1], [1, 4, 1],
100 | [2, 1, 1], [2, 2, 1], [2, 3, 1],
101 | [3, 1, 1], [3, 2, 1],
102 | [4, 1, 1]
103 | ])
104 | small_data = np.array([[1, 1, 1], [1, 2, 1], [2, 3, 1]])
105 | self.input_df = pd.DataFrame(tabular_data, columns=column_names)
106 | self.small_input_df = pd.DataFrame(small_data, columns=column_names)
107 | self.x_df = self.small_input_df.drop(["rating"], axis=1)
108 |
109 | def test_split_data(self):
110 | base = DummyPreprocessor(data_df=self.input_df)
111 | train_X, test_X, train_y, test_y = base.split_data(base.get_x(), base.get_y(), 0.2)
112 | assert train_X.shape[0] == 8
113 | assert train_y.shape[0] == 8
114 | assert test_X.shape[0] == 2
115 | assert test_y.shape[0] == 2
116 |
117 | def test_transform_numerical(self):
118 | sol = np.array([[1, 1, 1], [1, 2, 1], [2, math.log(float(3)) ** 2, 1]])
119 | base = DummyPreprocessor(data_df=self.small_input_df)
120 | base.transform_numerical()
121 | assert base.data_df.shape == (3, 3)
122 | assert np.array_equal(sol, base.data_df.values)
123 |
124 | def test_transform_categorical(self):
125 | sol = np.array([[0, 1, 1], [0, 2, 1], [1, 3, 1]])
126 | base = DummyPreprocessor(data_df=self.small_input_df)
127 | base.transform_categorical()
128 | assert base.data_df.shape == (3, 3)
129 | assert np.array_equal(sol, base.data_df.values)
130 |
131 | def test_get_hash_size(self):
132 | base = DummyPreprocessor(data_df=self.small_input_df)
133 | base.transform_categorical()
134 | assert base.get_hash_size() == [2]
135 |
136 | def test_get_x(self):
137 | sol = self.x_df
138 | base = DummyPreprocessor(data_df=self.small_input_df)
139 | pd.testing.assert_frame_equal(sol, base.get_x())
140 |
141 | def test_get_x_numerical(self):
142 | sol = self.x_df[['num_people']].values
143 | base = DummyPreprocessor(data_df=self.small_input_df)
144 | assert np.array_equal(base.get_x_numerical(
145 | self.x_df), sol)
146 |
147 | def test_get_x_categorical(self):
148 | sol = self.x_df[['user_id']].values
149 | base = DummyPreprocessor(data_df=self.small_input_df)
150 | assert np.array_equal(base.get_x_categorical(
151 | self.x_df), sol)
152 |
153 | def test_get_y(self):
154 | sol = np.ones(3)
155 | base = DummyPreprocessor(data_df=self.small_input_df)
156 | assert np.array_equal(base.get_y(), sol)
157 |
158 | def test_get_categorical_count(self):
159 | base = DummyPreprocessor(data_df=self.small_input_df)
160 | assert base.get_categorical_count() == 1
161 |
162 | def test_get_numerical_count(self):
163 | base = DummyPreprocessor(data_df=self.small_input_df)
164 | assert base.get_numerical_count() == 1
165 |
166 | def test_MovielensPreprocessor(self):
167 | movielens = MovielensPreprocessor(csv_path=os.path.join(
168 | dataset_directory, 'movielens/ratings-10k.dat'))
169 | movielens.preprocess()
170 | assert movielens.data_df.shape == (10000, 3)
171 |
172 | def test_CriteoPreprocessor(self):
173 | criteo = CriteoPreprocessor(csv_path=os.path.join(
174 | dataset_directory, 'criteo/train-10k.txt'))
175 | criteo.preprocess()
176 | assert criteo.data_df.shape == (10000, 40)
177 |
178 | def test_NetflixPreprocessor(self):
179 | netflix = NetflixPrizePreprocessor(
180 | non_csv_path=os.path.join(dataset_directory, 'netflix/combined_data_1-10k.txt'),
181 | csv_path=os.path.join(dataset_directory, 'netflix/combined_data_1-10k.csv'))
182 | netflix.preprocess()
183 | assert netflix.data_df.shape == (10000, 3)
184 |
185 | def test_AvazuPreprocessor(self):
186 | avazu = AvazuPreprocessor(csv_path=os.path.join(
187 | dataset_directory, 'avazu/train-10k'))
188 | avazu.preprocess()
189 | assert avazu.data_df.shape == (9999, 23)
190 |
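End to end, a concrete preprocessor is driven as below; a sketch assuming the bundled 10k MovieLens sample and using only the methods these tests call:

    import os
    from autorecsys.pipeline.preprocessor import MovielensPreprocessor

    dataset_directory = '../../examples/example_datasets'  # adjust to your checkout
    movielens = MovielensPreprocessor(
        csv_path=os.path.join(dataset_directory, 'movielens/ratings-10k.dat'))
    movielens.preprocess()  # populates movielens.data_df, here of shape (10000, 3)

    x, y = movielens.get_x(), movielens.get_y()  # feature columns / rating target
    train_x, test_x, train_y, test_y = movielens.split_data(x, y, 0.1)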
--------------------------------------------------------------------------------
/tests/pipeline_tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import os
4 | import logging
5 | import pytest
6 | import unittest
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | from autorecsys.pipeline.utils import Bias
11 |
12 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress warning for running TF with CPU
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 | class TestBias(unittest.TestCase):
18 | @pytest.fixture(autouse=True)
19 | def initdir(self, tmpdir):
20 | tmpdir.chdir() # change to pytest-provided temporary directory
21 | tmpdir.join("test_utils.ini").write("# testdata")
22 |
23 | def setUp(self):
24 | super(TestBias, self).setUp()
25 | self.inputs = tf.constant([[1, 2, 3], [4, 5, 6]], dtype="float32")
26 | self.test_units = 4
27 |
28 | def test_Bias(self):
29 | bias = Bias(units=self.test_units)
30 | assert bias.bias.shape == (self.test_units,)
31 |
32 | def test_call(self):
33 | """
34 | Test Bias.call()
35 | """
36 | bias = Bias(self.inputs.shape[-1]) # Pass the input width as the units argument
37 | ans = bias(self.inputs)
38 | tf.assert_equal(self.inputs, ans) # Output equals input because the bias weights initialize to zeros
39 |
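The equality assertion above holds because Bias is an additive layer whose weights initialize to zeros. A sketch of the same behavior outside the harness:

    import tensorflow as tf
    from autorecsys.pipeline.utils import Bias

    x = tf.constant([[1., 2., 3.], [4., 5., 6.]])
    bias = Bias(units=x.shape[-1])  # one learnable offset per feature column
    y = bias(x)                     # equals x until the offsets are trained away from zero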
--------------------------------------------------------------------------------
/tests/searcher_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/searcher_tests/__init__.py
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/searcher_tests/core_tests/__init__.py
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_hyperparameters.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 The Keras Tuner Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import math
16 | import numpy as np
17 | import pytest
18 |
19 | from autorecsys.searcher.core import hyperparameters as hp_module
20 |
21 |
22 | def test_base_hyperparameter():
23 | base_param = hp_module.HyperParameter(name='base', default=0)
24 | assert base_param.name == 'base'
25 | assert base_param.default == 0
26 | assert base_param.get_config() == {'name': 'base', 'default': 0}
27 | base_param = hp_module.HyperParameter.from_config(
28 | base_param.get_config())
29 | assert base_param.name == 'base'
30 | assert base_param.default == 0
31 |
32 |
33 | def test_hyperparameters():
34 | hp = hp_module.HyperParameters()
35 | assert hp.values == {}
36 | assert hp.space == []
37 | hp.Choice('choice', [1, 2, 3], default=2)
38 | assert hp.values == {'choice': 2}
39 | assert len(hp.space) == 1
40 | assert hp.space[0].name == 'choice'
41 | hp.values['choice'] = 3
42 | assert hp.get('choice') == 3
43 | hp = hp.copy()
44 | assert hp.values == {'choice': 3}
45 | assert len(hp.space) == 1
46 | assert hp.space[0].name == 'choice'
47 | with pytest.raises(ValueError, match='Unknown parameter'):
48 | hp.get('wrong')
49 |
50 |
51 | def test_name_collision():
52 | # TODO: figure out how name collision checks
53 | # should work.
54 | pass
55 |
56 |
57 | def test_name_scope():
58 | hp = hp_module.HyperParameters()
59 | hp.Choice('choice', [1, 2, 3], default=2)
60 | with hp.name_scope('scope1'):
61 | hp.Choice('choice', [4, 5, 6], default=5)
62 | with hp.name_scope('scope2'):
63 | hp.Choice('choice', [7, 8, 9], default=8)
64 | hp.Int('range', min_value=0, max_value=10, default=0)
65 |
66 | assert hp.values == {
67 | 'choice': 2,
68 | 'scope1/choice': 5,
69 | 'scope1/scope2/choice': 8,
70 | 'scope1/range': 0
71 | }
72 | assert hp.get_value_in_nested_format() == {
73 | 'choice': 2,
74 | 'scope1': {'choice': 5,
75 | 'scope2': {'choice': 8},
76 | 'range': 0,
77 | },
78 | }
79 |
80 |
81 | def test_parent_name():
82 | hp = hp_module.HyperParameters()
83 | hp.Choice('a', [1, 2, 3], default=2)
84 | b1 = hp.Int(
85 | 'b', 0, 10, parent_name='a', parent_values=1, default=5)
86 | b2 = hp.Int(
87 | 'b', 0, 100, parent_name='a', parent_values=2, default=4)
88 | assert b1 is None
89 | assert b2 == 4
90 | assert hp.values == {
91 | 'a': 2,
92 | 'a=1/b': 5,
93 | 'a=2/b': 4
94 | }
95 |
96 |
97 | def test_conditional_scope():
98 | hp = hp_module.HyperParameters()
99 | hp.Choice('choice', [1, 2, 3], default=2)
100 | with hp.conditional_scope('choice', [1, 3]):
101 | child1 = hp.Choice('child_choice', [4, 5, 6])
102 | with hp.conditional_scope('choice', 2):
103 | child2 = hp.Choice('child_choice', [7, 8, 9])
104 | assert hp.values == {
105 | 'choice': 2,
106 | 'choice=1,3/child_choice': 4,
107 | 'choice=2/child_choice': 7
108 | }
109 | # Assignment to a non-active conditional hyperparameter returns `None`.
110 | assert child1 is None
111 | # Assignment to an active conditional hyperparameter returns the value.
112 | assert child2 == 7
113 |
114 |
115 | def test_nested_conditional_scopes_and_name_scopes():
116 | hp = hp_module.HyperParameters()
117 | a = hp.Choice('a', [1, 2, 3], default=2)
118 | with hp.conditional_scope('a', [1, 3]):
119 | b = hp.Choice('b', [4, 5, 6])
120 | with hp.conditional_scope('b', 6):
121 | c = hp.Choice('c', [7, 8, 9])
122 | with hp.name_scope('d'):
123 | e = hp.Choice('e', [10, 11, 12])
124 | with hp.conditional_scope('a', 2):
125 | f = hp.Choice('f', [13, 14, 15])
126 |
127 | assert hp.values == {
128 | 'a': 2,
129 | 'a=1,3/b': 4,
130 | 'a=1,3/b=6/c': 7,
131 | 'a=1,3/b=6/d/e': 10,
132 | 'a=2/f': 13
133 | }
134 | # Assignment to an active conditional hyperparameter returns the value.
135 | assert a == 2
136 | assert f == 13
137 | # Assignment to a non-active conditional hyperparameter returns `None`.
138 | assert b is None
139 | assert c is None
140 | assert e is None
141 |
142 |
143 | def test_get_with_conditional_scopes():
144 | hp = hp_module.HyperParameters()
145 | hp.Choice('a', [1, 2, 3], default=2)
146 | assert hp.get('a') == 2
147 | with hp.conditional_scope('a', 2):
148 | assert hp.get('a') == 2
149 |
150 |
151 | def test_Choice():
152 | choice = hp_module.Choice('choice', [1, 2, 3], default=2)
153 | choice = hp_module.Choice.from_config(choice.get_config())
154 | assert choice.default == 2
155 | assert choice.random_sample() in [1, 2, 3]
156 | assert choice.random_sample(123) == choice.random_sample(123)
157 | # No default
158 | choice = hp_module.Choice('choice', [1, 2, 3])
159 | assert choice.default == 1
160 | with pytest.raises(ValueError, match='default value should be'):
161 | hp_module.Choice('choice', [1, 2, 3], default=4)
162 |
163 |
164 | @pytest.mark.parametrize(
165 | "values,ordered_arg,ordered_val",
166 | [([1, 2, 3], True, True),
167 | ([1, 2, 3], False, False),
168 | ([1, 2, 3], None, True),
169 | (['a', 'b', 'c'], False, False),
170 | (['a', 'b', 'c'], None, False)])
171 | def test_Choice_ordered(values, ordered_arg, ordered_val):
172 | choice = hp_module.Choice('choice', values, ordered=ordered_arg)
173 | assert choice.ordered == ordered_val
174 | choice_new = hp_module.Choice(**choice.get_config())
175 | assert choice_new.ordered == ordered_val
176 |
177 |
178 | def test_Choice_ordered_invalid():
179 | with pytest.raises(ValueError, match='must be `False`'):
180 | hp_module.Choice('a', ['a', 'b'], ordered=True)
181 |
182 |
183 | def test_Choice_types():
184 | values1 = ['a', 'b', 0]
185 | with pytest.raises(TypeError, match='can contain only one'):
186 | hp_module.Choice('a', values1)
187 | values2 = [{'a': 1}, {'a': 2}]
188 | with pytest.raises(TypeError, match='can contain only `int`'):
189 | hp_module.Choice('a', values2)
190 |
191 |
192 | def test_Float():
193 |     # Test with an explicit default
194 | linear = hp_module.Float(
195 | 'linear', min_value=0.5, max_value=9.5, default=9.)
196 | linear = hp_module.Float.from_config(linear.get_config())
197 | assert linear.default == 9.
198 | assert 0.5 <= linear.random_sample() < 9.5
199 | assert isinstance(linear.random_sample(), float)
200 | assert linear.random_sample(123) == linear.random_sample(123)
201 |
202 | # No default
203 | linear = hp_module.Float(
204 | 'linear', min_value=0.5, max_value=9.5)
205 | assert linear.default == 0.5
206 |
207 |
208 | def test_sampling_arg():
209 | f = hp_module.Float('f', 1e-20, 1e10, sampling='loguniform')
210 | f = hp_module.Float.from_config(f.get_config())
211 | assert f.sampling == 'loguniform'
212 |
213 | i = hp_module.Int('i', 0, 10, sampling='uniform')
214 | i = hp_module.Int.from_config(i.get_config())
215 | assert i.sampling == 'uniform'
216 |
217 | with pytest.raises(ValueError, match='`sampling` must be one of'):
218 | hp_module.Int('j', 0, 10, sampling='invalid')
219 |
220 |
221 | def test_sampling_random_state():
222 | f = hp_module.Float('f', 1e-3, 1e3, sampling='loguniform')
223 | rand_sample = f.random_sample()
224 | assert rand_sample >= f.min_value
225 | assert rand_sample <= f.max_value
226 |
227 | def log_scale(x, min_value, max_value):
228 | return math.log(x/min_value) / math.log(max_value/min_value)
229 |
230 | x = 1e-1
231 | min_value, max_value = 1e-10, 1e10
232 | # Scale x to [0, 1].
233 | x_scaled = log_scale(x, min_value, max_value)
234 | # Scale back.
235 | x_rescaled = hp_module._log_sample(x_scaled, min_value, max_value)
236 | assert np.allclose(x, x_rescaled)
237 |
238 | f = hp_module.Float('f', 1e-3, 1e3, sampling='uniform')
239 | rand_sample = f.random_sample()
240 | assert rand_sample >= f.min_value
241 | assert rand_sample <= f.max_value
242 |
243 |
244 | def test_Int():
245 | rg = hp_module.Int(
246 | 'rg', min_value=5, max_value=9, default=6)
247 | rg = hp_module.Int.from_config(rg.get_config())
248 | assert rg.default == 6
249 | assert 5 <= rg.random_sample() < 9
250 | assert isinstance(rg.random_sample(), int)
251 | assert rg.random_sample(123) == rg.random_sample(123)
252 | # No default
253 | rg = hp_module.Int(
254 | 'rg', min_value=5, max_value=9)
255 | assert rg.default == 5
256 |
257 |
258 | def test_Boolean():
259 | # Test default default
260 | boolean = hp_module.Boolean('bool')
261 | assert boolean.default is False
262 | # Test default setting
263 | boolean = hp_module.Boolean('bool', default=True)
264 | assert boolean.default is True
265 | # Wrong default type
266 | with pytest.raises(ValueError, match='must be a Python boolean'):
267 | hp_module.Boolean('bool', default=None)
268 | # Test serialization
269 | boolean = hp_module.Boolean('bool', default=True)
270 | boolean = hp_module.Boolean.from_config(boolean.get_config())
271 | assert boolean.default is True
272 | assert boolean.name == 'bool'
273 |
274 | # Test random_sample
275 | assert boolean.random_sample() in {True, False}
276 | assert boolean.random_sample(123) == boolean.random_sample(123)
277 |
278 |
279 | def test_merge():
280 | hp = hp_module.HyperParameters()
281 | hp.Int('a', 0, 100)
282 | hp.Float('b', min_value=0.5, max_value=9.5, default=2)
283 |
284 | hp2 = hp_module.HyperParameters()
285 | hp2.Int('a', 3, 4, default=3)
286 | hp.Int('c', 10, 100, default=30)
287 | hp.merge(hp2)
288 |
289 | assert hp.get('a') == 3
290 | assert hp.get('b') == 2
291 | assert hp.get('c') == 30
292 |
293 | hp3 = hp_module.HyperParameters()
294 | hp3.Float('a', 3.5, 4.5)
295 | hp3.Choice('d', [1, 2, 3], default=1)
296 |
297 | hp.merge(hp3, overwrite=False)
298 |
299 | assert hp.get('a') == 3
300 | assert hp.get('b') == 2
301 | assert hp.get('c') == 30
302 | assert hp.get('d') == 1
303 |
304 |
305 | def _sort_space(hps):
306 | space = hps.get_config()['space']
307 | return sorted(space, key=lambda hp: hp['config']['name'])
308 |
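Taken together, these tests pin down the search-space API: choices and ranges, name scopes that prefix keys, and conditional scopes that gate retrieval. A compact sketch built only from calls covered above (the exact key layout of `hp.values` is illustrative):

    from autorecsys.searcher.core import hyperparameters as hp_module

    hp = hp_module.HyperParameters()
    model = hp.Choice('model', ['mf', 'mlp'], default='mf')
    with hp.conditional_scope('model', 'mlp'):
        units = hp.Int('units', 16, 128, default=32)
    with hp.name_scope('opt'):
        lr = hp.Float('lr', 1e-4, 1e-1, sampling='loguniform', default=1e-3)

    assert units is None  # inactive: the active value of 'model' is 'mf'
    print(hp.values)      # e.g. {'model': 'mf', 'model=mlp/units': 32, 'opt/lr': 0.001}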
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_oracle.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 |
4 | from autorecsys.searcher.core.oracle import Oracle, Objective
5 | from autorecsys.searcher.core import hyperparameters as hps_module
6 | from autorecsys.searcher.core import trial as trial_module
7 |
8 | from tensorflow.keras import metrics
9 |
10 |
11 | @pytest.fixture(scope='function')
12 | def tmp_dir(tmpdir_factory):
13 | return tmpdir_factory.mktemp('oracle_test', numbered=True)
14 |
15 |
16 | class OracleTest(Oracle):
17 | def _populate_space(self, trial_id):
18 | return {'status': trial_module.TrialStatus.IDLE,
19 | 'values': self.hyperparameters.values}
20 |
21 | def test_oracle(tmp_dir):
22 | hps = hps_module.HyperParameters()
23 | hps.Choice('iyo_koiyo', values=[1, 2, 3, 4, 5, 6], ordered=False)
24 | oracle_tst = OracleTest(objective=['mse', 'auc_roc_score'], max_trials=50, hyperparameters=hps)
25 | assert oracle_tst.objective == [Objective(name='mse', direction='min'), Objective(name='auc_roc_score', direction='min')]
26 | trial1 = oracle_tst.create_trial(tuner_id='114514')
27 | trial2 = oracle_tst.create_trial(tuner_id='114514')
28 | oracle_tst.set_project_dir(directory=tmp_dir, project_name='test', overwrite=False)
29 | oracle_tst.save()
30 | assert os.path.exists(os.path.join(tmp_dir, oracle_tst._get_oracle_fname()))
31 | oracle_tst._save_trial(trial1)
32 | oracle_tst._save_trial(trial2)
33 | assert os.path.exists(os.path.join(oracle_tst._project_dir, f'trial_{trial1.trial_id}'))
34 | assert os.path.exists(os.path.join(oracle_tst._project_dir, f'trial_{trial2.trial_id}'))
35 | oracle_tst.reload()
36 | assert all(_id in oracle_tst.trials for _id in [trial1.trial_id, trial2.trial_id])
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_trial.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 |
4 | from autorecsys.utils import metric
5 | from autorecsys.searcher.core import hyperparameters as hps_module
6 | from autorecsys.searcher.core import trial as trial_module
7 |
8 | from tensorflow.keras import metrics
9 |
10 |
11 | @pytest.fixture(scope='function')
12 | def tmp_dir(tmpdir_factory):
13 | return tmpdir_factory.mktemp('trial_test', numbered=True)
14 |
15 |
16 | @pytest.mark.skip(reason="TODO Later")
17 | def test_register_from_metrics():
18 | # As well as direction inference.
19 | tracker = metric.MetricsTracker(
20 | metrics=[metrics.CategoricalAccuracy(),
21 | metrics.MeanSquaredError()]
22 | )
23 | assert set(tracker.metrics.keys()) == {'categorical_accuracy',
24 | 'mean_squared_error'}
25 | assert tracker.metrics['categorical_accuracy'].direction == 'max'
26 | assert tracker.metrics['mean_squared_error'].direction == 'min'
27 |
28 |
29 | def test_register():
30 | tracker = metric.MetricsTracker()
31 | tracker.register('new_metric', direction='max')
32 | assert set(tracker.metrics.keys()) == {'new_metric'}
33 | assert tracker.metrics['new_metric'].direction == 'max'
34 | with pytest.raises(ValueError,
35 | match='`direction` should be one of'):
36 | tracker.register('another_metric', direction='wrong')
37 | with pytest.raises(ValueError,
38 | match='already exists'):
39 | tracker.register('new_metric', direction='max')
40 |
41 |
42 | def test_exists():
43 | tracker = metric.MetricsTracker()
44 | tracker.register('new_metric', direction='max')
45 | assert tracker.exists('new_metric')
46 | assert not tracker.exists('another_metric')
47 |
48 |
49 | def test_update():
50 | tracker = metric.MetricsTracker()
51 | tracker.update('new_metric', 0.5) # automatic registration
52 | assert set(tracker.metrics.keys()) == {'new_metric'}
53 | assert tracker.metrics['new_metric'].direction == 'min' # default direction
54 | assert (tracker.get_history('new_metric') ==
55 | [metric.MetricObservation(0.5, step=0)])
56 |
57 |
58 | def test_get_history():
59 | tracker = metric.MetricsTracker()
60 | tracker.update('new_metric', 0.5, step=0)
61 | tracker.update('new_metric', 1.5, step=1)
62 | tracker.update('new_metric', 2., step=2)
63 | assert tracker.get_history('new_metric') == [
64 | metric.MetricObservation(0.5, 0),
65 | metric.MetricObservation(1.5, 1),
66 | metric.MetricObservation(2., 2),
67 | ]
68 | with pytest.raises(ValueError, match='Unknown metric'):
69 | tracker.get_history('another_metric')
70 |
71 |
72 | def test_get_last_value():
73 | tracker = metric.MetricsTracker()
74 | tracker.register('new_metric', 'min')
75 | assert tracker.get_last_value('new_metric') is None
76 | tracker.set_history(
77 | 'new_metric',
78 | [metric.MetricObservation(1., 0),
79 | metric.MetricObservation(2., 1),
80 | metric.MetricObservation(3., 2)])
81 | assert tracker.get_last_value('new_metric') == 3.
82 |
83 |
84 | def test_serialization():
85 | tracker = metric.MetricsTracker()
86 | tracker.register('metric_min', 'min')
87 | tracker.register('metric_max', 'max')
88 | tracker.set_history(
89 | 'metric_min',
90 | [metric.MetricObservation(1., 0),
91 | metric.MetricObservation(2., 1),
92 | metric.MetricObservation(3., 2)])
93 | tracker.set_history(
94 | 'metric_max',
95 | [metric.MetricObservation(1., 0),
96 | metric.MetricObservation(2., 1),
97 | metric.MetricObservation(3., 2)])
98 |
99 | new_tracker = metric.MetricsTracker.from_config(
100 | tracker.get_config())
101 | assert new_tracker.metrics.keys() == tracker.metrics.keys()
102 |
103 |
104 | def test_trial():
105 | hps = hps_module.HyperParameters()
106 | hps.Int('a', 0, 10, default=3)
107 | trial = trial_module.Trial(
108 | hps, trial_id='trial1', status='COMPLETED')
109 | trial.metrics.register('score', direction='max')
110 | trial.metrics.update('score', 10, step=1)
111 | assert len(trial.hyperparameters.space) == 1
112 |     _trial = trial_module.Trial.from_state(trial.get_state())
113 |     assert _trial.hyperparameters.get('a') == 3
114 |     assert _trial.trial_id == 'trial1'
115 |     assert _trial.score is None
116 |     assert _trial.best_step is None
117 |     assert _trial.metrics.get_best_value('score') == 10
118 |     assert _trial.metrics.get_history('score') == [metric.MetricObservation(10, step=1)]
119 |
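A trial's bookkeeping reduces to the MetricsTracker protocol exercised above: register a metric with a direction (or let update() auto-register it), stream observations, then query the history, last, or best value. A short sketch:

    from autorecsys.utils import metric

    tracker = metric.MetricsTracker()
    tracker.register('val_mse', direction='min')
    for step, value in enumerate([0.9, 0.7, 0.8]):
        tracker.update('val_mse', value, step=step)

    assert tracker.get_last_value('val_mse') == 0.8  # most recent observation
    assert tracker.get_best_value('val_mse') == 0.7  # best under the 'min' direction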
--------------------------------------------------------------------------------
/tests/searcher_tests/core_tests/test_tuner.py:
--------------------------------------------------------------------------------
1 | # TODO
--------------------------------------------------------------------------------
/tests/searcher_tests/searchers_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.fixture(scope='module')
5 | def tmp_dir(tmpdir_factory):
6 | return tmpdir_factory.mktemp('searcher_test')
7 |
8 |
9 | def test_randomsearch(tmp_dir):
10 | # TODO
11 | pass
12 |
13 |
--------------------------------------------------------------------------------
/tests/utils_test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datamllab/AutoRec/2dbc8778cfb597402d8b0337186bf9152663b20a/tests/utils_test/__init__.py
--------------------------------------------------------------------------------
/tests/utils_test/test.csv:
--------------------------------------------------------------------------------
1 | Sample 1 2 3 4 5
2 |
--------------------------------------------------------------------------------
/tests/utils_test/test_common.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 | from autorecsys.utils.common import (
4 | set_device,
5 | # dataset_shape,
6 | to_snake_case,
7 | create_directory,
8 | load_dataframe_input,
9 | set_seed,
10 | save_pickle,
11 | load_pickle,
12 | )
13 | import tensorflow as tf
14 | from tensorflow.python.client import device_lib
15 | import pandas as pd
16 | import numpy as np
17 | import random
18 | import unittest
19 |
20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
21 |
22 | class TestCommon(unittest.TestCase):
23 | device_info = "cpu:0"
24 | def test_set_cpu(self):
25 | set_device("cpu:0")
26 | # check that TensorFlow still enumerates at least one physical device after pinning to the CPU
27 | assert len(tf.config.experimental.list_physical_devices()) > 0
28 |
29 | def test_to_snake_case(self):
30 | temp = to_snake_case("i am a string")
31 | assert temp == "i_am_a_string"
32 | temp = to_snake_case("_i am a private string")
33 | assert temp == "private_i_am_a_private_string"
34 | temp = to_snake_case("IAmStringWithCaps")
35 | assert temp == "i_am_string_with_caps"
36 | temp = to_snake_case("I#am%a&string(with*special+characters")
37 | assert temp == "i_am_a_string_with_special_characters"
38 | temp = to_snake_case("MLPInteractor")
39 | assert temp == "mlp_interactor"
40 |
41 | # Create a directory and verify it now exists
42 | def test_create_directory(self):
43 | assert not os.path.exists("test_dir")
44 | create_directory("test_dir")
45 | assert os.path.exists("test_dir")
46 |
47 | # Test load_dataframe_input on the five supported inputs:
48 | # a DataFrame, 1-D and 2-D ndarrays, a non-CSV path, and a CSV path
49 | def test_load_dataframe_input(self):
50 | # a pandas DataFrame passes through unchanged
51 | assert isinstance(load_dataframe_input(pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})), pd.DataFrame)
52 |
53 | # a 1-D ndarray becomes a Series, a 2-D ndarray a DataFrame
54 | assert isinstance(load_dataframe_input(np.array([1, 2, 3])), pd.Series)
55 | assert isinstance(load_dataframe_input(np.array([[1, 2, 3], [4, 2, 5]])), pd.DataFrame)
56 |
57 | # an unsupported file extension should raise TypeError rather than pass silently
58 | with pytest.raises(TypeError):
59 | load_dataframe_input("wrong_file.exe")
60 |
61 | # a CSV path loads into a DataFrame
62 | assert isinstance(load_dataframe_input("test.csv"), pd.DataFrame)
63 |
64 | # Set the seed, then check that reseeding reproduces the same random draws
65 | def test_set_seed(self):
66 | set_seed(10)
67 | temp = random.random()
68 | random.seed(10)
69 | assert random.random() == temp
70 |
71 | temp = np.random.rand(1)
72 | np.random.seed(10)
73 | assert np.random.rand(1) == temp
74 |
75 | temp = tf.random.uniform([1])
76 | tf.random.set_seed(10)
77 | assert tf.random.uniform([1]) == temp
78 |
79 | # Test the pickle save/load round-trip
80 | def test_save_pickle(self):
81 | save_pickle("test_pickle", {"lion": "yellow", "kitty": "red"})
82 | assert os.path.exists("test_pickle")
83 |
84 | def test_load_pickle(self):
85 | save_pickle("test_pickle", {"lion": "yellow", "kitty": "red"})
86 | temp = load_pickle("test_pickle")
87 | assert temp == {"lion": "yellow", "kitty": "red"}
--------------------------------------------------------------------------------