├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── base_image └── Dockerfile ├── doc ├── contribute_models.md └── customized+model.md ├── runnables ├── Dockerfile ├── bin │ ├── __init__.py │ ├── binning_calculator.py │ └── psi_calculator.py ├── binning.py ├── extract_ts_features.py ├── psi.py ├── requirements.txt ├── run_io │ ├── __init__.py │ └── db_adapter.py ├── time_series_processing │ ├── __init__.py │ └── ts_feature_extractor.py └── two_dim_binning.py ├── scripts ├── data │ └── iris.recordio ├── elasticdl_travis_test_job.sh └── test_elasticdl_submit.sh ├── setup.cfg ├── setup.py ├── sqlflow_models ├── Dockerfile ├── __init__.py ├── _version.py ├── arima_with_stl_decomposition.py ├── auto_estimator.py ├── custom_model_example.py ├── deep_embedding_cluster.py ├── dnnclassifier.py ├── dnnclassifier_functional_api_example.py ├── dnnregressor.py ├── gcn.py ├── native_keras.py ├── one_class_svm.py ├── rnn_based_time_series.py ├── rnnclassifier.py ├── score_card.py └── simple_dnn_generator.py └── tests ├── __init__.py ├── base.py ├── test_arima_with_stl_decomposition.py ├── test_auto_estimator.py ├── test_deep_embedding_cluster.py ├── test_dnnclassifier.py ├── test_dnnclassifier_functional_api_example.py ├── test_dnnregressor.py ├── test_gcn.py ├── test_one_class_svm.py ├── test_rnn.py ├── test_rnnts.py ├── test_score_card.py └── test_version.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | build/ 3 | dist/ 4 | 5 | .eggs/ 6 | *.egg-info/ 7 | .pytest_cache 8 | __pycache__/ 9 | 10 | .idea/ 11 | 12 | *.swp 13 | *.vim 14 | *.pyc 15 | *.log 16 | 17 | .DS_Store 18 | 19 | .vscode -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | cache: pip 3 | branches: 4 | only: 5 | - master 6 | - develop 7 | - "/^v\\d+\\.\\d+(\\.\\d+)?(-\\S*)?$/" 8 | language: python 9 | python: 10 | - 3.6 11 | - 3.7 12 | service: 13 | - docker 14 | install: 15 | - python -m pip install --upgrade pip 16 | - python -m pip install --upgrade setuptools>=41.0.0 17 | - python setup.py install 18 | script: 19 | - python setup.py -q test 20 | 21 | jobs: 22 | include: 23 | - stage: ElasticDLTest 24 | script: 25 | - cd base_image && docker build -t sqlflow/modelzoo_base . && cd .. 26 | - cd sqlflow_models && docker build -t sqlflow/sqlflow_models . && cd .. 27 | - bash scripts/elasticdl_travis_test_job.sh 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | 3 | setup: ## Setup virtual environment for local development 4 | python3 -m venv venv 5 | source venv/bin/activate && \ 6 | pip install -U pip && \ 7 | $(MAKE) install-requirements 8 | 9 | install-requirements: 10 | pip install -U -e . 11 | 12 | test: ## Run tests 13 | python3 setup.py test 14 | 15 | clean: ## Clean up temporary folders 16 | rm -rf build dist .eggs *.egg-info .pytest_cache sqlflow/proto 17 | 18 | help: 19 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 20 | 21 | .PHONY: help 22 | .DEFAULT_GOAL := help 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQLFlow Models 2 | 3 | [![Build Status](https://travis-ci.com/sql-machine-learning/models.svg?branch=develop)](https://travis-ci.org/sql-machine-learning) [![PyPI Package](https://img.shields.io/pypi/v/sqlflow_models.svg)](https://pypi.python.org/pypi/sqlflow_models) 4 | 5 | Premade Models for [SQLFlow](https://github.com/sql-machine-learning/sqlflow). 6 | 7 | ## Installation 8 | 9 | This package is available on PyPI as `sqlflow_models`. So you can install it by running the following command: 10 | 11 | ```bash 12 | pip install sqlflow_models 13 | ``` 14 | 15 | ## Development 16 | 17 | ## Prerequisite 18 | ### Python 3 19 | `brew install python` 20 | 21 | ### Setup Environment 22 | `make setup` 23 | 24 | ### Test 25 | `make test` 26 | -------------------------------------------------------------------------------- /base_image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # install PAI python support 4 | RUN pip install pypai 5 | 6 | # install go needed by installing ElasticDL 7 | ENV GOPATH /root/go 8 | ENV PATH /usr/local/go/bin:$GOPATH/bin:$PATH 9 | RUN curl --silent https://dl.google.com/go/go1.13.4.linux-amd64.tar.gz | tar -C /usr/local -xzf - 10 | 11 | # install ElasticDL to manage ElasticDL jobs 12 | RUN git clone https://github.com/sql-machine-learning/elasticdl.git && \ 13 | cd elasticdl && \ 14 | git checkout 62b255a918df5b6594c888b19aebbcc74bbce6e4 && \ 15 | pip install -r elasticdl/requirements.txt && \ 16 | python setup.py install && \ 17 | cd .. && rm -rf elasticdl 18 | -------------------------------------------------------------------------------- /doc/contribute_models.md: -------------------------------------------------------------------------------- 1 | # How to Contribute SQLFLow Models 2 | 3 | This guide will introduce how to contribute to SQLFlow models. 
You can find the design doc [Define SQLFlow Models](/doc/customized+model.md); feel free to check it out. 4 | 5 | ## Develop an SQLFlow Model 6 | 7 | 1. Open the [SQLFlow models repo](https://github.com/sql-machine-learning/models) in your web browser and fork the official repo to your account. 8 | 9 | 1. Clone the forked repo to your host: 10 | 11 | ``` bash 12 | > git clone https://github.com/<Your Github ID>/models.git 13 | ``` 14 | 15 | 1. Set up your local Python environment with `make setup && source venv/bin/activate`. If you are using [PyCharm](https://www.jetbrains.com/pycharm/), you can simply run `make setup` and then import the `models` folder as a new project. 16 | 17 | 1. Add a new model definition Python script under the folder [sqlflow_models](/sqlflow_models). For example, add a new Python script `mydnnclassifier.py`: 18 | 19 | ``` text 20 | `-sqlflow_models 21 | |- dnnclassifier.py 22 | `- mydnnclassifier.py 23 | ``` 24 | 25 | 1. You can choose whatever name you like for your model. Your model definition should be a [Keras subclassed model](https://keras.io/models/about-keras-models/#model-subclassing): 26 | 27 | ``` python 28 | import tensorflow as tf 29 | 30 | class MyDNNClassifier(tf.keras.Model): 31 | def __init__(self, feature_columns, hidden_units=[10,10], n_classes=2): 32 | ... 33 | ... 34 | ``` 35 | 36 | 1. Import `MyDNNClassifier` in [sqlflow_models/\_\_init__.py](/sqlflow_models/__init__.py): 37 | 38 | ``` python 39 | ... 40 | from .mydnnclassifier import MyDNNClassifier 41 | ``` 42 | 43 | 1. You can test your `MyDNNClassifier` by adding a new Python unit test script `tests/test_mydnnclassifier.py` and running the test as `python tests/test_mydnnclassifier.py`: 44 | 45 | ``` python 46 | from sqlflow_models import MyDNNClassifier 47 | from tests.base import BaseTestCases 48 | 49 | import tensorflow as tf 50 | import unittest 51 | 52 | 53 | class TestMyDNNClassifier(BaseTestCases.BaseTest): 54 | def setUp(self): 55 | self.features = {...} 56 | self.label = [...] 57 | feature_columns = [...] 58 | self.model = MyDNNClassifier(feature_columns=feature_columns) 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | ``` 63 | 64 | ## Test and Debug Your Model With SQLFlow 65 | 66 | If you have developed a new model, please run an integration test with the SQLFlow gRPC server to make sure it works well with SQLFlow. 67 | 68 | 1. Launch an SQLFlow all-in-one Docker container: 69 | 70 | ``` bash 71 | cd ./models 72 | > docker run --rm -it -v $PWD:/models -e PYTHONPATH=/models -p 8888:8888 sqlflow/sqlflow 73 | ``` 74 | 75 | 1. Open a web browser and go to `localhost:8888` to access the Jupyter Notebook. Use your custom model by modifying the `TRAIN` clause of the SQLFlow extended SQL statement: `TRAIN sqlflow_models.MyDNNClassifier`: 76 | 77 | ``` sql 78 | SELECT * from iris.train 79 | TRAIN sqlflow_models.MyDNNClassifier 80 | WITH n_classes = 3, hidden_units = [10, 20] 81 | COLUMN sepal_length, sepal_width, petal_length, petal_width 82 | LABEL class 83 | INTO sqlflow_models.my_dnn_model; 84 | ``` 85 | 86 | 1. When you need to update the model and test again, just modify the model Python file on your host and then run the SQL statement in the notebook one more time. 87 | 88 | ## Publish your model in the SQLFlow all-in-one Docker image 89 | 90 | If you have already tested your code, please create a pull request and invite other developers to review it. Once one of the developers **approves** your pull request, you can merge it into the develop branch. 
91 | Travis CI builds the SQLFlow all-in-one Docker image with the latest model code every night and pushes it to Docker Hub with the tag `sqlflow/sqlflow:nightly`, so you can find your latest models in it the next day. 92 | -------------------------------------------------------------------------------- /doc/customized+model.md: -------------------------------------------------------------------------------- 1 | # Design Doc: Define Models for SQLFlow 2 | 3 | SQLFlow enables SQL programs to call deep learning models defined in Python. This document is about how to define models for SQLFlow. 4 | 5 | ## Keras v.s. Estimator 6 | 7 | Many deep learning practitioners define models using the Keras API or as an Estimator-derived class. 8 | We prefer [Keras](https://keras.io/) over [Estimator](https://www.tensorflow.org/guide/estimators) for several reasons: 9 | 10 | 1. [TensorFlow Dev Summit 2019](https://www.youtube.com/watch?v=k5c-vg4rjBw) announced that TensorFlow 2.x will closely integrate with Keras. 11 | 12 | 2. We found more documentation about Keras than about Estimator. 13 | 14 | 3. We found more models defined using Keras than using Estimator. 15 | 16 | ## Keras APIs 17 | 18 | Keras provides three approaches to define models. 19 | 20 | ### 1. Subclassing `tf.keras.Model` 21 | 22 | ```python 23 | class DNNClassifier(tf.keras.Model): 24 | def __init__(self, feature_columns, hidden_units, n_classes): 25 | super(DNNClassifier, self).__init__() 26 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 27 | self.hidden_layers = [] 28 | for hidden_unit in hidden_units: 29 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit)) 30 | self.prediction_layer = tf.keras.layers.Dense(n_classes, activation='softmax') 31 | 32 | def call(self, inputs): 33 | x = self.feature_layer(inputs) 34 | for hidden_layer in self.hidden_layers: 35 | x = hidden_layer(x) 36 | return self.prediction_layer(x) 37 | 38 | model = DNNClassifier(feature_columns, hidden_units, n_classes) 39 | ``` 40 | 41 | Please be aware that `tf.keras.Model` has methods `save_weights` and `load_weights`, which save/load model parameters but not the topology, as explained in [this guidance](https://stackoverflow.com/questions/51806852/cant-save-custom-subclassed-model) and [this example list](https://stackoverflow.com/questions/52826134/keras-model-subclassing-examples). 42 | 43 | ### 2. Functional API 44 | 45 | ```python 46 | x = tf.feature_column.input_layer(shape=(5,)) 47 | for n in hidden_units: 48 | x = tf.keras.layers.Dense(n, activation='relu')(x) 49 | pred = tf.keras.layers.Dense(n_classes, activation='softmax')(x) 50 | model = tf.keras.models.Model(inputs=feature_columns, outputs=pred) 51 | ``` 52 | 53 | Please be aware that the functional API doesn't work with the feature column API, as reported [here](https://github.com/tensorflow/tensorflow/issues/27416). However, the approach of deriving classes from `keras.Model` does work with the feature column API. 54 | 55 | ### 3. `keras.Sequential` 56 | 57 | ```python 58 | model = tf.keras.Sequential() 59 | model.add(tf.keras.layers.DenseFeatures(feature_columns)) 60 | for n in hidden_units: 61 | model.add(tf.keras.layers.Dense(n, activation='relu')) 62 | model.add(tf.keras.layers.Dense(n_classes, activation='softmax')) 63 | ``` 64 | 65 | Please be aware that `tf.keras.Sequential()` only covers a small variety of models. It doesn't cover many well-known models, including ResNet, Transformer, and WideAndDeep. 
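
To make the save/load difference concrete, here is a minimal sketch (not part of this repo) that assumes the `DNNClassifier` subclass from approach 1 plus a couple of hypothetical numeric feature columns with random toy data. It shows that `save_weights`/`load_weights` persist parameters only, so the class has to be re-instantiated with the same arguments before the weights can be restored:

```python
import numpy as np
import tensorflow as tf

# Hypothetical feature columns and toy data, only to exercise the API.
feature_columns = [tf.feature_column.numeric_column(name)
                   for name in ['sepal_length', 'sepal_width']]
features = {'sepal_length': np.random.rand(8).astype('float32'),
            'sepal_width': np.random.rand(8).astype('float32')}
labels = np.random.randint(0, 3, size=8)
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(4)

model = DNNClassifier(feature_columns, hidden_units=[10, 10], n_classes=3)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(dataset, epochs=1, verbose=0)

# Only the parameters go to disk (TensorFlow checkpoint format); the topology
# lives in the Python class, so restoring needs a fresh instance first.
model.save_weights('./dnn_classifier_ckpt')
restored = DNNClassifier(feature_columns, hidden_units=[10, 10], n_classes=3)
restored.load_weights('./dnn_classifier_ckpt')
```

By contrast, functional and `Sequential` models can be serialized together with their topology via `model.save`.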
66 | 67 | ### The Choice 68 | 69 | We chose the approach of subclassing `tf.keras.Model` according to the following table. 70 | 71 | | Keras APIs | Work with feature column API | Save/load models | Model coverage | 72 | | ------------------ | ---------------------------- | -------------------------- | -------------- | 73 | | `tf.keras.Model` | ☑️ | weights-only, no topology | High | 74 | | Functional API | ❌ | ☑️ | High | 75 | | Sequential Model | ☑️ | ☑️ | Low | 76 | 77 | 78 | ## An Example 79 | 80 | Here is an example `DNNClassifier` with multiple hidden layers, written as a Python class derived from `tf.keras.Model`. To run it, please use TensorFlow 2.0 alpha or a newer version. 81 | 82 | ```python 83 | class DNNClassifier(tf.keras.Model): 84 | def __init__(self, feature_columns, hidden_units, n_classes): 85 | """DNNClassifier 86 | :param feature_columns: feature columns. 87 | :type feature_columns: list[tf.feature_column]. 88 | :param hidden_units: list of hidden units per layer. 89 | :type hidden_units: list[int]. 90 | :param n_classes: number of classes. 91 | :type n_classes: int. 92 | """ 93 | super(DNNClassifier, self).__init__() 94 | 95 | # combines all the input features into a dense tensor 96 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 97 | self.hidden_layers = [] 98 | for hidden_unit in hidden_units: 99 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit)) 100 | self.prediction_layer = tf.keras.layers.Dense(n_classes, activation='softmax') 101 | 102 | def call(self, inputs): 103 | x = self.feature_layer(inputs) 104 | for hidden_layer in self.hidden_layers: 105 | x = hidden_layer(x) 106 | return self.prediction_layer(x) 107 | 108 | def default_optimizer(self): 109 | """Default optimizer name. Used in model.compile.""" 110 | return 'adam' 111 | 112 | def default_loss(self): 113 | """Default loss function. Used in model.compile.""" 114 | return 'categorical_crossentropy' 115 | 116 | def default_training_epochs(self): 117 | """Default training epochs. Used in model.fit.""" 118 | return 5 119 | 120 | def prepare_prediction_column(self, prediction): 121 | """Return the class label of highest probability.""" 122 | return prediction.argmax(axis=-1) 123 | ``` 124 | 125 | ## Further Reading 126 | 127 | We read the following Keras source code files: [model.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/models.py), [network.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/engine/network.py), and [training.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/engine/training.py). 128 | -------------------------------------------------------------------------------- /runnables/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM sqlflow/sqlflow:step 2 | 3 | RUN apt-get clean && apt-get update && \ 4 | apt-get -qq install libmysqld-dev libmysqlclient-dev 5 | 6 | ADD ./requirements.txt / 7 | RUN pip3 install --no-cache-dir -r /requirements.txt && rm -rf /requirements.txt 8 | 9 | ADD . 
/opt/sqlflow/run 10 | ENV PYTHONPATH "${PYTHONPATH}:/opt/sqlflow/run" 11 | -------------------------------------------------------------------------------- /runnables/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/bin/__init__.py -------------------------------------------------------------------------------- /runnables/bin/binning_calculator.py: -------------------------------------------------------------------------------- 1 | import mars.dataframe as md 2 | import mars.tensor as mt 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | class BinningMethod(object): 8 | BUCKET = "bucket" 9 | QUANTILE = "quantile" 10 | LOG_BUCKET = "log_bucket" 11 | 12 | 13 | def binning( 14 | in_md, 15 | col_name, 16 | bin_method, 17 | bins, 18 | boundaries): 19 | if boundaries: 20 | bin_o, bins = md.cut(in_md[col_name], bins=boundaries, labels=False, retbins=True) 21 | bins_np = bins.to_numpy() 22 | else: 23 | if bin_method.lower() == BinningMethod.BUCKET.lower(): 24 | bin_o, bins = md.cut(in_md[col_name], bins=bins, labels=False, retbins=True) 25 | bins_np = bins.to_numpy() 26 | elif bin_method.lower() == BinningMethod.LOG_BUCKET.lower(): 27 | bin_o, bins = md.cut(mt.log(in_md[col_name]), bins=bins, labels=False, retbins=True) 28 | bins_np = np.exp(bins.to_numpy()) 29 | else: 30 | raise ValueError("Unsupport binning method: {}".format(bin_method)) 31 | 32 | return bin_o, bins_np 33 | 34 | 35 | def cumsum(arr, reverse): 36 | if type(arr) == np.ndarray: 37 | sum_arr = arr 38 | elif type(arr) == pd.DataFrame: 39 | sum_arr = arr.to_numpy() 40 | else: 41 | raise ValueError("Invalid input type: {}".format(type(arr))) 42 | 43 | for i in range(np.ndim(arr)): 44 | sum_arr = np.flip(np.cumsum(np.flip(sum_arr, i), i), i) if reverse else np.cumsum(sum_arr, i) 45 | 46 | if type(arr) == np.ndarray: 47 | return sum_arr 48 | elif type(arr) == pd.DataFrame: 49 | return pd.DataFrame(sum_arr) 50 | else: 51 | raise ValueError("Invalid input type: {}".format(type(arr))) 52 | 53 | 54 | def calc_binning_stats( 55 | in_md, 56 | sel_cols, 57 | bin_methods, 58 | bin_nums, 59 | cols_bin_boundaries, 60 | reverse_cumsum): 61 | cols_bin_stats = [] 62 | for i in range(len(sel_cols)): 63 | sel_col = sel_cols[i] 64 | bin_o, bins = binning(in_md, sel_col, bin_methods[i], bin_nums[i], cols_bin_boundaries.get(sel_col, None)) 65 | bin_num = len(bins) - 1 66 | bin_prob_df = bin_o.value_counts(normalize=True).to_pandas().to_frame() 67 | bin_prob_df = bin_prob_df.reindex(range(bin_num), fill_value=0) 68 | bin_cumsum_prob_df = cumsum(bin_prob_df, reverse_cumsum) 69 | 70 | cols_bin_stats.append( 71 | { 72 | "name": sel_col, 73 | "bin_boundaries": ','.join(bins.astype(str)), 74 | "bin_prob": ','.join(bin_prob_df[bin_prob_df.columns[0]].to_numpy().astype(str)), 75 | "bin_cumsum_prob": ','.join(bin_cumsum_prob_df[bin_cumsum_prob_df.columns[0]].to_numpy().astype(str)) 76 | } 77 | ) 78 | 79 | return pd.DataFrame(cols_bin_stats) 80 | 81 | 82 | def calc_basic_stats( 83 | in_md, 84 | sel_cols): 85 | stats_data = [ 86 | { 87 | "name": sel_col, 88 | "min": mt.min(in_md[sel_col]).to_numpy(), 89 | "max": mt.max(in_md[sel_col]).to_numpy(), 90 | "mean": mt.mean(in_md[sel_col]).to_numpy(), 91 | "median": mt.median(in_md[sel_col]).to_numpy(), 92 | "std": mt.std(in_md[sel_col]).to_numpy(), 93 | } for sel_col in sel_cols 94 | ] 95 | 96 | return pd.DataFrame(stats_data) 97 | 98 | 99 | def 
calc_stats( 100 | in_md, 101 | sel_cols, 102 | bin_methods, 103 | bin_nums, 104 | cols_bin_boundaries, 105 | reverse_cumsum): 106 | basic_stats_df = calc_basic_stats(in_md, sel_cols) 107 | cols_bin_stats_df = calc_binning_stats(in_md, sel_cols, bin_methods, bin_nums, cols_bin_boundaries, reverse_cumsum) 108 | 109 | stats_df = pd.merge(basic_stats_df, cols_bin_stats_df, how='inner', on='name') 110 | 111 | return stats_df 112 | 113 | 114 | def calc_two_dim_binning_stats( 115 | in_md, 116 | sel_col_1, 117 | sel_col_2, 118 | bin_method_1, 119 | bin_method_2, 120 | bin_num_1, 121 | bin_num_2, 122 | bin_boundaries_1, 123 | bin_boundaries_2, 124 | reverse_cumsum): 125 | bin_o1, bins_1 = binning(in_md, sel_col_1, bin_method_1, bin_num_1, bin_boundaries_1) 126 | bin_o2, bins_2 = binning(in_md, sel_col_2, bin_method_2, bin_num_2, bin_boundaries_2) 127 | 128 | bin_num_1 = len(bins_1) - 1 129 | bin_num_2 = len(bins_2) - 1 130 | 131 | bin_o = bin_o1 * bin_num_2 + bin_o2 132 | bin_prob_df = bin_o.value_counts(normalize=True).to_pandas().to_frame() 133 | bin_prob_df = bin_prob_df.reindex(range(bin_num_1 * bin_num_2), fill_value=0) 134 | two_dim_bin_prob_np = bin_prob_df.to_numpy().reshape((bin_num_1, bin_num_2)) 135 | two_dim_bin_cumsum_prob_np = cumsum(two_dim_bin_prob_np, reverse_cumsum) 136 | 137 | return pd.DataFrame(two_dim_bin_prob_np), pd.DataFrame(two_dim_bin_cumsum_prob_np) 138 | 139 | 140 | def get_cols_bin_boundaries(stats_df): 141 | col_boundaries = {} 142 | for _, row in stats_df.iterrows(): 143 | col_name = row['name'] 144 | boundaries = [float(item) for item in row['bin_boundaries'].split(',')] 145 | col_boundaries[col_name] = boundaries 146 | 147 | return col_boundaries 148 | -------------------------------------------------------------------------------- /runnables/bin/psi_calculator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def calc_psi_per_bin( 6 | expected_prob, 7 | actual_prob): 8 | FALLBACK_VALUE = 0.001 9 | expected_prob = FALLBACK_VALUE if expected_prob == 0.0 else expected_prob 10 | actual_prob = FALLBACK_VALUE if actual_prob == 0.0 else actual_prob 11 | 12 | return (expected_prob - actual_prob) * np.log(expected_prob * 1.0 / actual_prob) 13 | 14 | 15 | def calc_psi( 16 | expected_bin_probs, 17 | actual_bin_probs): 18 | assert(len(expected_bin_probs) == len(actual_bin_probs)) 19 | 20 | result = 0.0 21 | for i in range(len(expected_bin_probs)): 22 | result += calc_psi_per_bin(expected_bin_probs[i], actual_bin_probs[i]) 23 | 24 | return result 25 | 26 | 27 | def get_cols_bin_probs( 28 | stats_df, 29 | bin_prob_column_name): 30 | col_bin_probs = {} 31 | for _, row in stats_df.iterrows(): 32 | col_name = row['name'] 33 | bin_probs = [float(item) for item in row[bin_prob_column_name].split(',')] 34 | col_bin_probs[col_name] = bin_probs 35 | 36 | return col_bin_probs 37 | -------------------------------------------------------------------------------- /runnables/binning.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mars.dataframe as md 3 | import os 4 | import pandas as pd 5 | from bin.binning_calculator import calc_stats, calc_two_dim_binning_stats, get_cols_bin_boundaries 6 | from run_io.db_adapter import convertDSNToRfc1738 7 | from sqlalchemy import create_engine 8 | 9 | 10 | def build_argument_parser(): 11 | parser = argparse.ArgumentParser(allow_abbrev=False) 12 | parser.add_argument("--dbname", type=str, 
required=True) 13 | parser.add_argument("--columns", type=str, required=True) 14 | parser.add_argument("--bin_method", type=str, required=False) 15 | parser.add_argument("--bin_num", type=str, required=False) 16 | parser.add_argument("--bin_input_table", type=str, required=False) 17 | parser.add_argument("--reverse_cumsum", type=bool, default=False) 18 | parser.add_argument("--two_dim_bin_cols", type=str, required=False) 19 | 20 | return parser 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = build_argument_parser() 25 | args, _ = parser.parse_known_args() 26 | columns = args.columns.split(',') 27 | bin_method_array = args.bin_method.split(',') if args.bin_method else None 28 | bin_num_array = [int(item) for item in args.bin_num.split(',')] if args.bin_num else None 29 | 30 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 31 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 32 | output_tables = output.split(',') 33 | datasource = os.getenv("SQLFLOW_DATASOURCE") 34 | 35 | assert len(output_tables) == 1, "The output tables shouldn't be null and can contain only one." 36 | 37 | url = convertDSNToRfc1738(datasource, args.dbname) 38 | engine = create_engine(url) 39 | input_md = md.read_sql( 40 | sql=select_input, 41 | con=engine) 42 | input_md.execute() 43 | 44 | cols_bin_boundaries = {} 45 | if args.bin_input_table: 46 | print("Get provided bin boundaries from table {}".format(args.bin_input_table)) 47 | bin_input_df = pd.read_sql_table( 48 | table_name=args.bin_input_table, 49 | con=engine) 50 | cols_bin_boundaries = get_cols_bin_boundaries(bin_input_df) 51 | 52 | if set(columns) > cols_bin_boundaries.keys(): 53 | raise ValueError("The provided bin boundaries contains keys: {}. But they cannot cover all the \ 54 | input columns: {}".format(cols_bin_boundaries.keys(), columns)) 55 | 56 | print("Ignore the bin_num and bin_method arguments") 57 | bin_num_array = [None] * len(columns) 58 | bin_method_array = [None] * len(columns) 59 | else: 60 | if len(bin_num_array) == 1: 61 | bin_num_array = bin_num_array * len(columns) 62 | else: 63 | assert(len(bin_num_array) == len(columns)) 64 | 65 | if len(bin_method_array) == 1: 66 | bin_method_array = bin_method_array * len(columns) 67 | else: 68 | assert(len(bin_method_array) == len(columns)) 69 | 70 | print("Calculate the statistics result for columns: {}".format(columns)) 71 | stats_df = calc_stats( 72 | input_md, 73 | columns, 74 | bin_method_array, 75 | bin_num_array, 76 | cols_bin_boundaries, 77 | args.reverse_cumsum) 78 | 79 | print("Persist the statistics result into the table {}".format(output_tables[0])) 80 | stats_df.to_sql( 81 | name=output_tables[0], 82 | con=engine, 83 | index=False 84 | ) 85 | -------------------------------------------------------------------------------- /runnables/extract_ts_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from run_io.db_adapter import convertDSNToRfc1738 5 | from sqlalchemy import create_engine 6 | from time_series_processing.ts_feature_extractor import add_features_extracted_from_ts_data, add_lag_columns 7 | 8 | 9 | def build_argument_parser(): 10 | parser = argparse.ArgumentParser(allow_abbrev=False) 11 | parser.add_argument("--dbname", type=str, required=True) 12 | parser.add_argument("--column_id", type=str, required=True) 13 | parser.add_argument("--column_time", type=str, required=True) 14 | parser.add_argument("--columns_value", type=str, required=True) 15 | 
parser.add_argument("--lag_num", type=int, default=1) 16 | parser.add_argument("--windows", type=str, required=True) 17 | parser.add_argument("--min_window", type=str, default=0) 18 | parser.add_argument("--extract_setting", type=str, default="minimal", choices=["minimal", "efficient", "comprehensive"]) 19 | 20 | return parser 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = build_argument_parser() 25 | args, _ = parser.parse_known_args() 26 | columns_value = args.columns_value.split(',') 27 | windows = [int(item) for item in args.windows.split(',')] 28 | 29 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 30 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 31 | datasource = os.getenv("SQLFLOW_DATASOURCE") 32 | 33 | url = convertDSNToRfc1738(datasource, args.dbname) 34 | engine = create_engine(url) 35 | input = pd.read_sql( 36 | sql=select_input, 37 | con=engine) 38 | 39 | df_with_lag_columns, lag_column_names = add_lag_columns(input, columns_value, args.lag_num) 40 | 41 | print("Start extracting the features from the time series data.") 42 | df_with_extracted_features = add_features_extracted_from_ts_data( 43 | df_with_lag_columns, 44 | column_id=args.column_id, 45 | column_time=args.column_time, 46 | columns_value=lag_column_names, 47 | windows=windows, 48 | min_window=args.min_window, 49 | extract_setting=args.extract_setting) 50 | print("Complete the feature extraction.") 51 | 52 | df_with_extracted_features = df_with_extracted_features.drop(columns=lag_column_names) 53 | 54 | df_with_extracted_features.to_sql( 55 | name=output, 56 | con=engine, 57 | index=False) 58 | print("Complete save the result data into table {}.".format(output)) 59 | -------------------------------------------------------------------------------- /runnables/psi.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from bin.psi_calculator import calc_psi, get_cols_bin_probs 5 | from run_io.db_adapter import convertDSNToRfc1738 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def build_argument_parser(): 10 | parser = argparse.ArgumentParser(allow_abbrev=False) 11 | parser.add_argument("--dbname", type=str, required=True) 12 | parser.add_argument("--refer_stats_table", type=str, required=True) 13 | parser.add_argument("--bin_prob_column", type=str, default="bin_prob") 14 | 15 | return parser 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = build_argument_parser() 20 | args, _ = parser.parse_known_args() 21 | 22 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 23 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 24 | datasource = os.getenv("SQLFLOW_DATASOURCE") 25 | 26 | url = convertDSNToRfc1738(datasource, args.dbname) 27 | engine = create_engine(url) 28 | 29 | input_df = pd.read_sql( 30 | sql=select_input, 31 | con=engine) 32 | refer_stats_df = pd.read_sql_table( 33 | table_name=args.refer_stats_table, 34 | con=engine) 35 | 36 | actual_cols_bin_probs = get_cols_bin_probs(input_df, args.bin_prob_column) 37 | expected_cols_bin_probs = get_cols_bin_probs(input_df, args.bin_prob_column) 38 | 39 | common_column_names = set.intersection( 40 | set(actual_cols_bin_probs.keys()), 41 | set(expected_cols_bin_probs.keys())) 42 | 43 | print("Calculate the PSI value for {} fields.".format(len(common_column_names))) 44 | cols_psi_data = [] 45 | for column_name in common_column_names: 46 | psi_value = calc_psi(actual_cols_bin_probs[column_name], expected_cols_bin_probs[column_name]) 47 | cols_psi_data.append( 48 | { 49 | 
"name": column_name, 50 | "psi": psi_value 51 | } 52 | ) 53 | cols_psi_df = pd.DataFrame(cols_psi_data) 54 | 55 | print("Persist the PSI result into the table {}".format(output)) 56 | cols_psi_df.to_sql( 57 | name=output, 58 | con=engine, 59 | index=False 60 | ) 61 | -------------------------------------------------------------------------------- /runnables/requirements.txt: -------------------------------------------------------------------------------- 1 | tsfresh==0.16.0 2 | sqlalchemy==1.3.19 3 | mysql==0.0.2 4 | pymars==0.5.1 5 | pandas>=1.0.0 -------------------------------------------------------------------------------- /runnables/run_io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/run_io/__init__.py -------------------------------------------------------------------------------- /runnables/run_io/db_adapter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parseMySQLDSN(dsn): 4 | # [username[:password]@][protocol[(address)]]/dbname[?param1=value1&...¶mN=valueN] 5 | pattern = "^(\w*):(\w*)@tcp\(([.a-zA-Z0-9\-]*):([0-9]*)\)/(\w*)(\?.*)?$" # noqa: W605, E501 6 | found_result = re.findall(pattern, dsn) 7 | user, passwd, host, port, database, config_str = found_result[0] 8 | config = {} 9 | if len(config_str) > 1: 10 | for c in config_str[1:].split("&"): 11 | k, v = c.split("=") 12 | config[k] = v 13 | return user, passwd, host, port, database, config 14 | 15 | # TODO(brightcoder01): Should we put this kind of common method 16 | # in sqlflow runtime? While writing the runnable code, users can 17 | # import the runtime library. 
18 | def convertDSNToRfc1738(driver_dsn, defaultDbName): 19 | driver, dsn = driver_dsn.split("://") 20 | user, passwd, host, port, database, config = parseMySQLDSN(dsn) 21 | 22 | if not database: 23 | database = defaultDbName 24 | 25 | # mysql://root:root@127.0.0.1:3306/dbname 26 | return "{}://{}:{}@{}:{}/{}".format(driver, user, passwd, host, port, database) 27 | -------------------------------------------------------------------------------- /runnables/time_series_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/time_series_processing/__init__.py -------------------------------------------------------------------------------- /runnables/time_series_processing/ts_feature_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from functools import reduce 3 | from tsfresh import extract_features 4 | from tsfresh.feature_extraction.settings import MinimalFCParameters, ComprehensiveFCParameters, EfficientFCParameters 5 | from tsfresh.utilities.dataframe_functions import roll_time_series 6 | 7 | 8 | EXTRACT_SETTING_NAME_TO_CLASS_DICT = { 9 | "minimal": MinimalFCParameters, 10 | "efficient": EfficientFCParameters, 11 | "comprehensive": ComprehensiveFCParameters 12 | } 13 | 14 | ROLLED_TS_ID_COLUMN_NAME = "id" 15 | ORIGIN_JOIN_ID_COLUMN_NAME = "join_id" 16 | ROLLED_TS_ID_FORMAT = "id={},timeshift={}" 17 | 18 | 19 | def _roll_ts_and_extract_features( 20 | input, 21 | column_id, 22 | column_time, 23 | columns_value, 24 | max_window, 25 | min_window, 26 | extract_setting): 27 | rolled_ts = roll_time_series( 28 | input, 29 | column_id=column_id, 30 | column_kind=None, 31 | column_sort=column_time, 32 | rolling_direction=1, 33 | max_timeshift=max_window, 34 | min_timeshift=min_window, 35 | n_jobs=0) 36 | 37 | rename_columns = { 38 | value_column: "{}_w_{}".format(value_column, max_window) 39 | for value_column in columns_value 40 | } 41 | rolled_ts = rolled_ts.rename(columns=rename_columns) 42 | rolled_ts = rolled_ts.drop(columns=[column_id]) 43 | 44 | extract_setting_clz = EXTRACT_SETTING_NAME_TO_CLASS_DICT.get(extract_setting, MinimalFCParameters) 45 | extracted_features = extract_features( 46 | rolled_ts, 47 | column_id=ROLLED_TS_ID_COLUMN_NAME, 48 | column_sort=column_time, 49 | n_jobs=0, 50 | default_fc_parameters=extract_setting_clz()) 51 | 52 | return extracted_features 53 | 54 | 55 | def add_lag_columns( 56 | input, 57 | columns_value, 58 | lag_num): 59 | lag_column_names = [] 60 | for column_value in columns_value: 61 | lag_column_name = "{}_lag_{}".format(column_value, lag_num) 62 | input[lag_column_name] = input[column_value].shift(lag_num) 63 | lag_column_names.append(lag_column_name) 64 | 65 | return input[lag_num:], lag_column_names 66 | 67 | 68 | def add_features_extracted_from_ts_data( 69 | input, 70 | column_id, 71 | column_time, 72 | columns_value, 73 | windows, 74 | min_window=0, 75 | extract_setting="minimal"): 76 | """Extract features from the time series data and append them to the 77 | original data. 78 | 79 | Build the rolled time series data with various window sizes, extract 80 | the features using TSFresh and then append the derived features to 81 | the original data. 82 | 83 | Args: 84 | input: A pandas DataFrame for the input data. 85 | column_id: The name of the id column to group by the time series data. 
86 | The input data can contain the time series for various entities. 87 | For example, the UV for different websites. 88 | column_time: The name of the time column. 89 | columns_value: Array. The names of the columns for the time series data. 90 | windows: Array of window sizes. The time series data will be rolled with 91 | each window size. 92 | min_window: The extract forecast windows smaller or equal than this will 93 | be throwed away. 94 | extract_setting: minimal | efficient | comprehensive. Control which features 95 | will be extracted. The order of feature numbers is: 96 | minimal < efficient < comprehensive 97 | 98 | Returns: 99 | A pandas DataFrame containing the original input data and extracted features. 100 | """ 101 | 102 | input_with_join_id = pd.DataFrame() 103 | input_with_join_id[ORIGIN_JOIN_ID_COLUMN_NAME] = input.apply( 104 | lambda row: ROLLED_TS_ID_FORMAT.format(row[column_id], row[column_time]), 105 | axis=1) 106 | 107 | input_with_join_id = pd.concat( 108 | [input, input_with_join_id], 109 | axis=1) 110 | 111 | input = input[[column_id, column_time] + columns_value] 112 | input.sort_values(by=[column_id, column_time]) 113 | 114 | extracted_features_multi_windows = [ 115 | _roll_ts_and_extract_features( 116 | input=input, 117 | column_id=column_id, 118 | column_time=column_time, 119 | columns_value=columns_value, 120 | max_window=window, 121 | min_window=min_window, 122 | extract_setting=extract_setting 123 | ) for window in windows 124 | ] 125 | 126 | extracted_features_multi_windows = reduce(lambda left, right: pd.merge( 127 | left=left, 128 | right=right, 129 | how="left", 130 | on=ROLLED_TS_ID_COLUMN_NAME 131 | ), extracted_features_multi_windows) 132 | 133 | original_data_with_extracted_features = pd.merge( 134 | input_with_join_id, 135 | extracted_features_multi_windows, 136 | how='inner', 137 | left_on=ORIGIN_JOIN_ID_COLUMN_NAME, 138 | right_on=ROLLED_TS_ID_COLUMN_NAME 139 | ) 140 | 141 | original_data_with_extracted_features.sort_values(by=[column_id, column_time]) 142 | original_data_with_extracted_features = original_data_with_extracted_features.drop(columns=[ORIGIN_JOIN_ID_COLUMN_NAME]) 143 | 144 | return original_data_with_extracted_features 145 | -------------------------------------------------------------------------------- /runnables/two_dim_binning.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mars.dataframe as md 3 | import os 4 | import pandas as pd 5 | from bin.binning_calculator import calc_stats, calc_two_dim_binning_stats, get_cols_bin_boundaries 6 | from run_io.db_adapter import convertDSNToRfc1738 7 | from sqlalchemy import create_engine 8 | 9 | 10 | def build_argument_parser(): 11 | parser = argparse.ArgumentParser(allow_abbrev=False) 12 | parser.add_argument("--dbname", type=str, required=True) 13 | parser.add_argument("--columns", type=str, required=True) 14 | parser.add_argument("--bin_method", type=str, required=False) 15 | parser.add_argument("--bin_num", type=str, required=False) 16 | parser.add_argument("--bin_input_table", type=str, required=False) 17 | parser.add_argument("--reverse_cumsum", type=bool, default=False) 18 | 19 | return parser 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = build_argument_parser() 24 | args, _ = parser.parse_known_args() 25 | columns = args.columns.split(',') 26 | bin_method_array = args.bin_method.split(',') if args.bin_method else None 27 | bin_num_array = [int(item) for item in args.bin_num.split(',')] if args.bin_num 
else None 28 | 29 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 30 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 31 | output_tables = output.split(',') 32 | datasource = os.getenv("SQLFLOW_DATASOURCE") 33 | 34 | # Check arguments 35 | assert len(columns) == 2, "The column number should only be 2" 36 | assert len(output_tables) == 3, "The output table number should only be 3" 37 | 38 | url = convertDSNToRfc1738(datasource, args.dbname) 39 | engine = create_engine(url) 40 | input_md = md.read_sql( 41 | sql=select_input, 42 | con=engine) 43 | input_md.execute() 44 | 45 | cols_bin_boundaries = {} 46 | if args.bin_input_table: 47 | print("Get provided bin boundaries from table {}".format(args.bin_input_table)) 48 | bin_input_df = pd.read_sql_table( 49 | table_name=args.bin_input_table, 50 | con=engine) 51 | cols_bin_boundaries = get_cols_bin_boundaries(bin_input_df) 52 | 53 | if set(columns) > cols_bin_boundaries.keys(): 54 | raise ValueError("The provided bin boundaries contains keys: {}. But they cannot cover all the \ 55 | input columns: {}".format(cols_bin_boundaries.keys(), columns)) 56 | 57 | print("Ignore the bin_num and bin_method arguments") 58 | bin_num_array = [None] * len(columns) 59 | bin_method_array = [None] * len(columns) 60 | else: 61 | if len(bin_num_array) == 1: 62 | bin_num_array = bin_num_array * len(columns) 63 | else: 64 | assert(len(bin_num_array) == len(columns)) 65 | 66 | if len(bin_method_array) == 1: 67 | bin_method_array = bin_method_array * len(columns) 68 | else: 69 | assert(len(bin_method_array) == len(columns)) 70 | 71 | print("Calculate the statistics result for columns: {}".format(columns)) 72 | stats_df = calc_stats( 73 | input_md, 74 | columns, 75 | bin_method_array, 76 | bin_num_array, 77 | cols_bin_boundaries, 78 | args.reverse_cumsum) 79 | 80 | print("Persist the statistics result into the table {}".format(output_tables[0])) 81 | stats_df.to_sql( 82 | name=output_tables[0], 83 | con=engine, 84 | index=False 85 | ) 86 | 87 | print("Calculate two dimension binning result for columns: {}".format(columns)) 88 | bin_prob_df, bin_cumsum_prob_df = calc_two_dim_binning_stats( 89 | input_md, 90 | columns[0], 91 | columns[1], 92 | bin_method_array[0], 93 | bin_method_array[1], 94 | bin_num_array[0], 95 | bin_num_array[1], 96 | cols_bin_boundaries.get(columns[0], None), 97 | cols_bin_boundaries.get(columns[1], None), 98 | args.reverse_cumsum) 99 | 100 | print("Persist the binning probabilities into table {}".format(output_tables[1])) 101 | bin_prob_df.to_sql( 102 | name=output_tables[1], 103 | con=engine, 104 | index=False 105 | ) 106 | print("Persist the binning accumulated probabilities into table {}".format(output_tables[2])) 107 | bin_cumsum_prob_df.to_sql( 108 | name=output_tables[2], 109 | con=engine, 110 | index=False 111 | ) 112 | -------------------------------------------------------------------------------- /scripts/data/iris.recordio: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/scripts/data/iris.recordio -------------------------------------------------------------------------------- /scripts/elasticdl_travis_test_job.sh: -------------------------------------------------------------------------------- 1 | if [ "$SQLFLOW_TEST_DB_MAXCOMPUTE_AK" = "" ] || [ "$SQLFLOW_TEST_DB_MAXCOMPUTE_SK" == "" ]; then 2 | echo "skip maxcompute test because the env SQLFLOW_TEST_DB_MAXCOMPUTE_AK or SQLFLOW_TEST_DB_MAXCOMPUTE_SK is 
empty" 3 | exit 0 4 | fi 5 | 6 | curl -s https://raw.githubusercontent.com/sql-machine-learning/elasticdl/4a995fe7eaf91bc5a9d50181e9aaaa14d15c8a09/scripts/setup_k8s_env.sh | bash 7 | kubectl apply -f https://raw.githubusercontent.com/sql-machine-learning/elasticdl/develop/elasticdl/manifests/examples/elasticdl-rbac.yaml 8 | 9 | docker run --rm -it --net=host \ 10 | -v $HOME/.kube:/root/.kube \ 11 | -v /home/$USER/.minikube/:/home/$USER/.minikube/ \ 12 | -v /var/run/docker.sock:/var/run/docker.sock \ 13 | -v $PWD:/workspace \ 14 | -e ODPS_ACCESS_ID=$MAXCOMPUTE_AK \ 15 | -e ODPS_ACCESS_KEY=$MAXCOMPUTE_SK \ 16 | sqlflow/sqlflow_models bash /workspace/scripts/test_elasticdl_submit.sh 17 | 18 | docker run --rm -it --net=host \ 19 | -v $HOME/.kube:/root/.kube \ 20 | -v /home/$USER/.minikube/:/home/$USER/.minikube/ \ 21 | sqlflow/sqlflow_models \ 22 | bash -c "curl -s https://raw.githubusercontent.com/sql-machine-learning/elasticdl/62b255a918df5b6594c888b19aebbcc74bbce6e4/scripts/validate_job_status.py | python - odps 1 2" 23 | -------------------------------------------------------------------------------- /scripts/test_elasticdl_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | elasticdl train --image_base=sqlflow/sqlflow_models \ 4 | --model_def=dnnclassifier.DNNClassifier \ 5 | --training_data=sqlflow_test_iris_train \ 6 | --data_reader_params='columns=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"];label_col="class"' \ 7 | --envs="ODPS_PROJECT_NAME=gomaxcompute_driver_w7u,ODPS_ACCESS_ID=$ODPS_ACCESS_ID,ODPS_ACCESS_KEY=$ODPS_ACCESS_KEY" \ 8 | --minibatch_size=32 \ 9 | --num_epochs=2 \ 10 | --model_zoo=/sqlflow_models \ 11 | --job_name=test-odps \ 12 | --num_minibatches_per_task=2 \ 13 | --image_pull_policy=Never \ 14 | --num_workers=2 \ 15 | --num_ps_pods=1 \ 16 | --master_resource_request="cpu=200m,memory=128Mi" \ 17 | --master_resource_limit="cpu=1,memory=2048Mi" \ 18 | --worker_resource_request="cpu=200m,memory=128Mi" \ 19 | --worker_resource_limit="cpu=1,memory=3072Mi" \ 20 | --ps_resource_request="cpu=200m,memory=128Mi" \ 21 | --ps_resource_limit="cpu=1,memory=2048Mi" \ 22 | --grads_to_wait=2 \ 23 | --output=model_output 24 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | rootdir=tests 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = 'sqlflow_models' 16 | DESCRIPTION = 'Premade Models for SQLFlow.' 17 | URL = 'https://github.com/sql-machine-learning/models' 18 | EMAIL = 'yzhdoudou@gmail.com' 19 | AUTHOR = 'Yang Yang' 20 | REQUIRES_PYTHON = '>=3.6.0' 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 
24 | REQUIRED = [ 25 | 'protobuf==3.7.1', 26 | 'tensorflow==2.0.1', 27 | 'scikit-learn==0.21.0', 28 | 'numpy==1.16.2', 29 | 'pandas>=0.25.1', 30 | 'adanet==0.8.0', 31 | "tensorflow-datasets==3.0.0", 32 | "statsmodels==0.11.1", 33 | "scipy==1.4.1", 34 | "tensorflow-metadata<0.23.0", 35 | ] 36 | 37 | SETUP_REQUIRED = [ 38 | 'pytest-runner' 39 | ] 40 | TEST_REQUIRED = [ 41 | 'pytest', 42 | ] 43 | 44 | # What packages are optional? 45 | EXTRAS = { 46 | } 47 | 48 | # The rest you shouldn't have to touch too much :) 49 | # ------------------------------------------------ 50 | # Except, perhaps the License and Trove Classifiers! 51 | # If you do change the License, remember to change the Trove Classifier for that! 52 | 53 | here = os.path.abspath(os.path.dirname(__file__)) 54 | 55 | # Import the README and use it as the long-description. 56 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 57 | try: 58 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 59 | long_description = '\n' + f.read() 60 | except FileNotFoundError: 61 | long_description = DESCRIPTION 62 | 63 | # Load the package's __version__.py module as a dictionary. 64 | about = {} 65 | if not VERSION: 66 | with open(os.path.join(here, NAME, '_version.py')) as f: 67 | exec(f.read(), about) 68 | else: 69 | about['__version__'] = VERSION 70 | 71 | 72 | class UploadCommand(Command): 73 | """Support setup.py upload.""" 74 | 75 | description = 'Build and publish the package.' 76 | user_options = [] 77 | 78 | @staticmethod 79 | def status(s): 80 | """Prints things in bold.""" 81 | print('\033[1m{0}\033[0m'.format(s)) 82 | 83 | def initialize_options(self): 84 | pass 85 | 86 | def finalize_options(self): 87 | pass 88 | 89 | def run(self): 90 | try: 91 | self.status('Removing previous builds…') 92 | rmtree(os.path.join(here, 'dist')) 93 | except OSError: 94 | pass 95 | 96 | self.status('Building Source and Wheel (universal) distribution…') 97 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 98 | 99 | self.status('Uploading the package to PyPI via Twine…') 100 | os.system('twine upload dist/*') 101 | 102 | self.status('Pushing git tags…') 103 | os.system('git tag v{0}'.format(about['__version__'])) 104 | os.system('git push --tags') 105 | 106 | sys.exit() 107 | 108 | 109 | # Where the magic happens: 110 | setup( 111 | name=NAME, 112 | version=about['__version__'], 113 | description=DESCRIPTION, 114 | long_description=long_description, 115 | long_description_content_type='text/markdown', 116 | author=AUTHOR, 117 | author_email=EMAIL, 118 | python_requires=REQUIRES_PYTHON, 119 | url=URL, 120 | packages=find_packages(exclude=('tests',)), 121 | # If your package is a single module, use this instead of 'packages': 122 | # py_modules=['mypackage'], 123 | 124 | # entry_points={ 125 | # 'console_scripts': ['mycli=mymodule:cli'], 126 | # }, 127 | install_requires=REQUIRED, 128 | setup_requires=SETUP_REQUIRED, 129 | tests_require=TEST_REQUIRED, 130 | extras_require=EXTRAS, 131 | license='Apache License 2.0', 132 | classifiers=[ 133 | # Trove classifiers 134 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 135 | 'License :: OSI Approved :: Apache Software License', 136 | 'Programming Language :: Python', 137 | 'Programming Language :: Python :: 3', 138 | 'Programming Language :: Python :: 3.6', 139 | 'Programming Language :: Python :: Implementation :: CPython', 140 | 'Programming Language :: Python :: Implementation :: PyPy' 141 | ], 142 | # $ 
setup.py publish support.
143 |     cmdclass={
144 |         'upload': UploadCommand,
145 |     },
146 |     zip_safe=False,
147 | )
148 |
--------------------------------------------------------------------------------
/sqlflow_models/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM sqlflow/modelzoo_base
2 |
3 | RUN pip install tensorflow==2.0.0 scikit-learn==0.21.0 numpy==1.16.2 pandas==0.25.1
4 | ADD *.py /sqlflow_models/
5 |
--------------------------------------------------------------------------------
/sqlflow_models/__init__.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from ._version import __version__
3 | from .dnnclassifier import DNNClassifier
4 | from .dnnregressor import DNNRegressor
5 | from .rnnclassifier import StackedRNNClassifier
6 | from .deep_embedding_cluster import DeepEmbeddingClusterModel
7 | from .dnnclassifier_functional_api_example import dnnclassifier_functional_model
8 | from .rnn_based_time_series import RNNBasedTimeSeriesModel
9 | from .auto_estimator import AutoClassifier, AutoRegressor
10 | from .score_card import ScoreCard
11 | from .native_keras import RawDNNClassifier
12 | from .custom_model_example import CustomClassifier
13 | from .gcn import GCN
14 | from .one_class_svm import OneClassSVM
15 | try:
16 |     # NOTE: statsmodels has a version conflict on PAI
17 |     from .arima_with_stl_decomposition import ARIMAWithSTLDecomposition
18 | except:
19 |     print("model ARIMAWithSTLDecomposition is not imported")
20 |     traceback.print_exc()
21 |
--------------------------------------------------------------------------------
/sqlflow_models/_version.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 |
3 | __version__ = '.'.join(map(str, VERSION))
4 |
--------------------------------------------------------------------------------
/sqlflow_models/arima_with_stl_decomposition.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import six
3 | from statsmodels.tsa.arima_model import ARIMA
4 | from statsmodels.tsa.seasonal import STL
5 | from datetime import datetime
6 | import tensorflow as tf
7 | import pandas as pd
8 |
9 | class ARIMAWithSTLDecomposition(tf.keras.Model):
10 |     def __init__(self,
11 |                  order,
12 |                  period,
13 |                  date_format,
14 |                  forecast_start,
15 |                  forecast_end,
16 |                  **kwargs):
17 |         super(ARIMAWithSTLDecomposition, self).__init__()
18 |
19 |         self.order = order
20 |         if not isinstance(period, (list, tuple)):
21 |             period = [period]
22 |         self.period = period
23 |         self.date_format = date_format
24 |         self.forecast_start = self._str2date(forecast_start)
25 |         self.forecast_end = self._str2date(forecast_end)
26 |         self.seasonal = []
27 |         self.kwargs = kwargs
28 |
29 |     def _str2date(self, date_str):
30 |         if isinstance(date_str, bytes):
31 |             date_str = date_str.decode('utf-8')
32 |         return datetime.strptime(str(date_str), self.date_format)
33 |
34 |     def _read_all_data(self, dataset):
35 |         data = None
36 |         for batch_idx, items in enumerate(dataset):
37 |             if data is None:
38 |                 data = [[] for _ in six.moves.range(len(items))]
39 |
40 |             for i, item in enumerate(items):
41 |                 if isinstance(item, dict):
42 |                     assert len(item) == 1
43 |                     dict_values = list(item.values())
44 |                     item = dict_values[0]
45 |
46 |                 if isinstance(item, tf.Tensor):
47 |                     item = item.numpy()
48 |
49 |                 item = np.reshape(item, [-1]).tolist()
50 |                 data[i].extend(item)
51 |
52 |         dates, values = data
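        # At this point `data` holds one flat Python list per dataset component; the
        # dataset is expected to yield (date, value) pairs, so `dates` are the raw date
        # strings and `values` the raw series values, e.g. [b'20200101', b'20200102', ...]
        # and [1.2, 3.4, ...] (illustrative values only).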
53 | sorted_dates_index = sorted(range(len(dates)), key=lambda k: dates[k]) 54 | dates = np.array([self._str2date(dates[i]) for i in sorted_dates_index]) 55 | values = np.array([values[i] for i in sorted_dates_index]).astype('float32') 56 | 57 | return dates, values 58 | 59 | def _stl_decompose(self, values): 60 | left_values = values 61 | self.seasonal = [] 62 | for p in self.period: 63 | stl_model = STL(left_values, period=p).fit() 64 | seasonal = np.array(stl_model.seasonal) 65 | self.seasonal.append(seasonal) 66 | left_values -= seasonal 67 | 68 | return left_values 69 | 70 | def _addup_seasonal(self, dates, values): 71 | time_interval = dates[1] - dates[0] 72 | start_interval = self.forecast_start - dates[0] 73 | start_index = int(start_interval.total_seconds() / time_interval.total_seconds()) 74 | 75 | length = len(values) 76 | 77 | for p, seasonal in six.moves.zip(self.period, self.seasonal): 78 | if length % p == 0: 79 | offset = length 80 | else: 81 | offset = (int(length / p) + 1) * p 82 | 83 | idx = start_index - offset 84 | values += seasonal[idx:idx+length] 85 | 86 | return values 87 | 88 | def _normalize(self, values): 89 | min_value = np.min(values) 90 | max_value = np.max(values) 91 | values = (values - min_value) / (max_value - min_value) 92 | return values, min_value, max_value 93 | 94 | def print_prediction_result(self, prediction, interval): 95 | t_strs = [] 96 | for i, p in enumerate(prediction): 97 | t = self.forecast_start + i * interval 98 | t_str = datetime.strftime(t, self.date_format) 99 | t_strs.append(t_str) 100 | 101 | df = pd.DataFrame(data={'time': t_strs, 'prediction': prediction}) 102 | with pd.option_context('display.max_columns', None): 103 | print(df) 104 | 105 | def sqlflow_train_loop(self, dataset): 106 | dates, values = self._read_all_data(dataset) 107 | 108 | left_values = self._stl_decompose(values) 109 | left_values, min_value, max_value = self._normalize(left_values) 110 | 111 | model = ARIMA(left_values, order=self.order, dates=dates).fit(disp=-1) 112 | 113 | prediction = model.predict(start=self.forecast_start, end=self.forecast_end, typ='levels') 114 | 115 | prediction = prediction * (max_value - min_value) + min_value 116 | prediction = self._addup_seasonal(dates, prediction) 117 | self.print_prediction_result(prediction, interval=dates[1] - dates[0]) 118 | return prediction 119 | 120 | def loss(*args, **kwargs): 121 | return None 122 | 123 | def optimizer(*args, **kwargs): 124 | return None 125 | -------------------------------------------------------------------------------- /sqlflow_models/auto_estimator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | from collections import defaultdict 3 | 4 | import absl 5 | import logging 6 | import tensorflow as tf 7 | import warnings 8 | 9 | absl.logging.set_verbosity(absl.logging.ERROR) 10 | tf.get_logger().setLevel(logging.ERROR) 11 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 12 | warnings.warn = lambda *args, **kargs:None 13 | import adanet 14 | 15 | from tensorflow import keras 16 | from tensorflow_estimator.python.estimator.canned import optimizers 17 | from .simple_dnn_generator import SimpleDNNGenerator 18 | 19 | 20 | LEARN_MIXTURE_WEIGHTS=True 21 | RANDOM_SEED = 42 22 | 23 | class AutoClassifier(adanet.Estimator): 24 | def __init__(self, feature_columns, layer_size=50, optimizer='Adagrad', linear_optimizer='Ftrl', 25 | model_dir=None, n_classes=2, 
activation_fn=tf.nn.relu, complexity_penalty=0.01, 26 | search_every_n_steps=1000, max_iterations=10, config=None): 27 | """AutoClassifier 28 | :param feature_columns: Feature columns. 29 | :type feature_columns: list[tf.feature_column]. 30 | :param layer_size: Number of hidden_units in each layers. 31 | :type layer_size: int. 32 | :param n_classes: Number of label classes. Defaults to 2, namely binary classification. 33 | :type n_classes: int. 34 | :param optimizer: Optimizer for the the neural multi-layer parts of the generated network. 35 | :type optimizer: str. 36 | :param linear_optimizer: Optimizer for the linear part of the generated network. 37 | :type linear_optimizer: str. 38 | :param model_dir: Directory to save or restore model checkpoints. 39 | :type model_dir: str. 40 | :param activation_fn: Activation function. 41 | :type activation_fn: function. 42 | :param complexity_penalty: Regularization of the complexity of the network. 43 | :type complexity_penalty: float. 44 | :param search_every_n_steps: Search new architecture every n steps. 45 | :type search_every_n_steps: int. 46 | :param max_iterations: Max times of architecture searching. 47 | :type max_iterations: int. 48 | :param config: Estimator configuration. 49 | :type config: dict. 50 | """ 51 | if n_classes == 2: 52 | head = tf.estimator.BinaryClassHead() 53 | else: 54 | head = tf.estimator.MultiClassHead(n_classes=n_classes) 55 | 56 | opts= defaultdict(lambda: optimizers.get_optimizer_instance(optimizer, 0.001)) 57 | opts[0] = optimizers.get_optimizer_instance(linear_optimizer, 0.1) 58 | # Define the generator, which defines the search space of subnetworks 59 | # to train as candidates to add to the final AdaNet model. 60 | subnetwork_generator = SimpleDNNGenerator( 61 | feature_columns=feature_columns, 62 | layer_size=layer_size, 63 | optimizers=opts, 64 | learn_mixture_weights=LEARN_MIXTURE_WEIGHTS, 65 | seed=RANDOM_SEED) 66 | super(AutoClassifier, self).__init__(head=head, 67 | model_dir=model_dir, 68 | adanet_lambda=complexity_penalty, 69 | subnetwork_generator=subnetwork_generator, 70 | max_iteration_steps=search_every_n_steps, 71 | max_iterations=max_iterations) 72 | 73 | class AutoRegressor(adanet.Estimator): 74 | def __init__(self, feature_columns, layer_size=50, optimizer='Adagrad', linear_optimizer='Ftrl', 75 | model_dir=None, activation_fn=tf.nn.relu, complexity_penalty=0.01, 76 | search_every_n_steps=1000, max_iterations=10, config=None): 77 | """AutoRegressor 78 | :param feature_columns: Feature columns. 79 | :type feature_columns: list[tf.feature_column]. 80 | :param layer_size: Number of hidden_units in each layers. 81 | :type layer_size: int. 82 | :param optimizer: Optimizer for the the neural multi-layer parts of the generated network. 83 | :type optimizer: str. 84 | :param linear_optimizer: Optimizer for the linear part of the generated network. 85 | :type linear_optimizer: str. 86 | :param model_dir: Directory to save or restore model checkpoints. 87 | :type model_dir: str. 88 | :param activation_fn: Activation function. 89 | :type activation_fn: function. 90 | :param complexity_penalty: Regularization of the complexity of the network. 91 | :type complexity_penalty: float. 92 | :param search_every_n_steps: Search new architecture every n steps. 93 | :type search_every_n_steps: int. 94 | :param max_iterations: Max times of architecture searching. 95 | :type max_iterations: int. 96 | :param config: Estimator configuration. 97 | :type config: dict. 
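        Example (a minimal sketch outside of SQLFlow; the feature column name 'x'
        and the hyper-parameter values are illustrative only):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x')]
            model = AutoRegressor(feature_columns=fc, layer_size=32, max_iterations=3)
            # AutoRegressor is an adanet.Estimator, so it is trained and evaluated
            # through the usual tf.estimator API, e.g. model.train(input_fn=...).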
98 | """ 99 | head = tf.estimator.RegressionHead() 100 | 101 | opts= defaultdict(lambda: optimizers.get_optimizer_instance(optimizer, 0.001)) 102 | opts[0] = optimizers.get_optimizer_instance(linear_optimizer, 0.1) 103 | # Define the generator, which defines the search space of subnetworks 104 | # to train as candidates to add to the final AdaNet model. 105 | subnetwork_generator = SimpleDNNGenerator( 106 | feature_columns=feature_columns, 107 | layer_size=layer_size, 108 | optimizers=opts, 109 | learn_mixture_weights=LEARN_MIXTURE_WEIGHTS, 110 | seed=RANDOM_SEED) 111 | super(AutoRegressor, self).__init__(head=head, 112 | model_dir=model_dir, 113 | adanet_lambda=complexity_penalty, 114 | subnetwork_generator=subnetwork_generator, 115 | max_iteration_steps=search_every_n_steps, 116 | max_iterations=max_iterations) 117 | -------------------------------------------------------------------------------- /sqlflow_models/custom_model_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import random 3 | import numpy as np 4 | 5 | class CustomClassifier(tf.keras.Model): 6 | def __init__(self, feature_columns=None): 7 | """The model init function. You can define any model parameter in the function's argument list. 8 | You can also add custom training routines together with a Keras 9 | model (see deep_embedding_cluster.py), or define a model with out Keras layers 10 | (e.g. use sklearn or numpy only). 11 | """ 12 | pass 13 | 14 | def sqlflow_train_loop(self, x): 15 | """The custom model traininig loop, input x is a tf.dataset object that generates training data. 16 | """ 17 | pass 18 | 19 | def sqlflow_predict_one(self, sample): 20 | """Run prediction with one sample and return the prediction result. The result must be a 21 | list of numpy array. SQLFlow determine the output type by: 22 | - if the array have only one element, the model must be regression model. 23 | - if the array have multiple elements: 24 | - if the sum of all the elements are close to 1, it is likely to be a classification model. 25 | - else the model is a regression model with multiple outputs. 26 | """ 27 | pos = random.random() 28 | neg = 1 - pos 29 | array = np.array([pos, neg]) 30 | return [array] 31 | 32 | def sqlflow_evaluate_loop(self, x, metric_names): 33 | """Run evaluation on the validation dataset and return a list of metrics. 34 | NOTE: the first result metric is always loss. If no loss is defined, add 0. 
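        For example, with metric_names == ["Accuracy"] the returned list could look like
        [0.0, 0.83]: a leading loss entry (0 when no loss is defined) followed by one
        value per requested metric (the numbers are illustrative only).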
35 | """ 36 | metric_len = len(metric_names) 37 | result = [] 38 | for i in range(metric_len+1): 39 | result.append(random.random()) 40 | return result 41 | -------------------------------------------------------------------------------- /sqlflow_models/deep_embedding_cluster.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | 4 | """ 5 | __author__ : chenxiang 6 | __email__ : alfredchenxiang@didichuxing.com 7 | __file_name__ : deep_embedding_cluster.py 8 | __create_time__ : 2019/09/03 9 | """ 10 | from datetime import datetime 11 | import tensorflow as tf 12 | from tensorflow import keras 13 | from tensorflow.python.data import make_one_shot_iterator 14 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 15 | from tensorflow.keras.layers import Dense, Layer, DenseFeatures, InputSpec 16 | from tensorflow.keras import backend 17 | import numpy as np 18 | from sklearn.cluster import KMeans 19 | from tensorflow.keras.losses import kld 20 | from tensorflow.keras.optimizers import SGD 21 | import tensorflow_datasets as tfds 22 | import pandas as pd 23 | 24 | _train_lr = 0.01 25 | _default_loss = kld 26 | 27 | class DeepEmbeddingClusterModel(keras.Model): 28 | 29 | def __init__(self, 30 | feature_columns, 31 | n_clusters=10, 32 | kmeans_init=20, 33 | run_pretrain=True, 34 | existed_pretrain_model=None, 35 | pretrain_dims=[100, 100, 10], 36 | pretrain_activation_func='relu', 37 | pretrain_use_callbacks=False, 38 | pretrain_cbearlystop_patience=30, 39 | pretrain_cbearlystop_mindelta=0.0001, 40 | pretrain_cbreduce_patience=10, 41 | pretrain_cbreduce_factor=0.1, 42 | pretrain_epochs=30, 43 | pretrain_initializer='glorot_uniform', 44 | pretrain_lr=1, 45 | train_lr=0.01, 46 | train_max_iters=8000, 47 | update_interval=100, 48 | train_use_tol=True, 49 | tol=0.0001, 50 | loss=kld): 51 | 52 | """ 53 | Implement cluster model mostly based on DEC. 54 | :param feature_columns: a list of tf.feature_column 55 | :param n_clusters: Number of clusters. 56 | :param kmeans_init: Number of running K-Means to get best choice of centroids. 57 | :param run_pretrain: Run pre-train process or not. 58 | :param existed_pretrain_model: Path of existed pre-train model. Not used now. 59 | :param pretrain_dims: Dims of layers which is used for build autoencoder. 60 | :param pretrain_activation_func: Active function of autoencoder layers. 61 | :param pretrain_use_callbacks: Use callbacks when pre-train or not. 62 | :param pretrain_cbearlystop_patience: Patience value of EarlyStopping when use callbacks. 63 | :param pretrain_cbearlystop_mindelta: Min_delta value of EarlyStopping when use callbacks. 64 | :param pretrain_cbreduce_patience: Patience value of ReduceLROnPlateau when use callbacks. 65 | :param pretrain_cbreduce_factor: Factor value of ReduceLROnPlateau when use callbacks. 66 | :param pretrain_epochs: Number of epochs when pre-train. 67 | :param pretrain_initializer: Initialize function for autoencoder layers. 68 | :param pretrain_lr: learning rate to train the auto encoder. 69 | :param train_lr: learning rate to train the cluster network. 70 | :param train_max_iters: Number of iterations when train. 71 | :param update_interval: Interval between updating target distribution. 72 | :param train_use_tol: Use tolerance during clusteringlayer or not. 73 | :param tol: Tolerance of earlystopping when train during clusteringlayer. 74 | :param loss: Default 'kld' when init. 
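        Example (a minimal sketch; the column name 'x1' and the parameter values are
        illustrative only):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x1')]
            model = DeepEmbeddingClusterModel(feature_columns=fc, n_clusters=3,
                                              pretrain_epochs=5, train_max_iters=500)
            # SQLFlow drives training through model.sqlflow_train_loop(dataset), where
            # `dataset` is a tf.data.Dataset yielding dicts keyed by feature name.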
75 | """ 76 | global _train_lr 77 | global _default_loss 78 | super(DeepEmbeddingClusterModel, self).__init__(name='DECModel') 79 | 80 | # Common 81 | self._feature_columns = feature_columns 82 | self._feature_columns_dims = len(self._feature_columns) 83 | self._n_clusters = n_clusters 84 | _default_loss = loss 85 | self._train_max_iters = train_max_iters 86 | self._update_interval = update_interval 87 | self._current_interval = 0 88 | self._train_use_tol = train_use_tol 89 | self._tol = tol 90 | 91 | # Pre-train 92 | self._run_pretrain = run_pretrain 93 | self._existed_pretrain_model = existed_pretrain_model 94 | self._pretrain_activation_func = pretrain_activation_func 95 | self._pretrain_dims = pretrain_dims 96 | self._pretrain_epochs = pretrain_epochs 97 | self._pretrain_initializer = pretrain_initializer 98 | self._pretrain_lr = pretrain_lr 99 | self._pretrain_optimizer = SGD(lr=self._pretrain_lr, momentum=0.9) 100 | 101 | # Pre-train-callbacks 102 | self._pretrain_use_callbacks = pretrain_use_callbacks 103 | self._pretrain_cbearlystop_patience = pretrain_cbearlystop_patience 104 | self._pretrain_cbearlystop_mindelta = pretrain_cbearlystop_mindelta 105 | self._pretrain_cbreduce_patience = pretrain_cbreduce_patience 106 | self._pretrain_cbreduce_factor = pretrain_cbreduce_factor 107 | 108 | # K-Means 109 | self._kmeans_init = kmeans_init 110 | 111 | # Cluster 112 | _train_lr = train_lr 113 | self._cluster_optimizer = SGD(lr=_train_lr, momentum=0.9) 114 | 115 | # Build model 116 | self._n_stacks = len(self._pretrain_dims) 117 | self.input_layer = DenseFeatures(feature_columns) 118 | 119 | # Layers - encoder 120 | self.encoder_layers = [] 121 | for i in range(self._n_stacks): 122 | self.encoder_layers.append(Dense(units=self._pretrain_dims[i], 123 | activation=self._pretrain_activation_func, 124 | name='encoder_%d' % i)) 125 | 126 | self.clustering_layer = ClusteringLayer(name='clustering', n_clusters=self._n_clusters) 127 | 128 | @staticmethod 129 | def target_distribution(q): 130 | """ 131 | Calculate auxiliary softer target distributions by raising q to the second power and 132 | then normalizing by frequency. 133 | :param q: Original distributions. 134 | :return: Auxiliary softer target distributions 135 | """ 136 | weight = q ** 2 / q.sum(0) 137 | return (weight.T / weight.sum(1)).T 138 | 139 | def pre_train(self, x): 140 | """ 141 | Used for preparing encoder part by loading ready-to-go model or training one. 
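        The whole training dataset is materialized into memory here (self.input_x and
        self.input_y) so that keras.Model.fit() can be used to train the autoencoder.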
142 | :param x: 143 | :return: 144 | """ 145 | print('{} Start pre_train.'.format(datetime.now())) 146 | 147 | print('{} Start preparing training dataset to save into memory.'.format(datetime.now())) 148 | # Concatenate input feature to meet requirement of keras.Model.fit() 149 | def _concate_generate(dataset_element): 150 | concate_y = tf.stack([dataset_element[feature.key] for feature in self._feature_columns], axis=1) 151 | return (dataset_element, concate_y) 152 | 153 | y = x.cache().map(map_func=_concate_generate) 154 | y.prefetch(1) 155 | 156 | self.input_x = dict() 157 | self.input_y = None 158 | for np_sample in tfds.as_numpy(y): 159 | sample_dict = np_sample[0] 160 | label = np_sample[1] 161 | if self.input_y is None: 162 | self.input_y = label 163 | else: 164 | self.input_y = np.concatenate([self.input_y, label]) 165 | if len(self.input_x) == 0: 166 | self.input_x = sample_dict 167 | else: 168 | for k in self.input_x: 169 | self.input_x[k] = np.concatenate([self.input_x[k], sample_dict[k]]) 170 | print('{} Done preparing training dataset.'.format(datetime.now())) 171 | 172 | # Layers - decoder 173 | self.decoder_layers = [] 174 | for i in range(self._n_stacks - 2, -1, -1): 175 | self.decoder_layers.append(Dense(units=self._pretrain_dims[i], 176 | activation=self._pretrain_activation_func, 177 | kernel_initializer=self._pretrain_initializer, 178 | name='decoder_%d' % (i + 1))) 179 | 180 | self.decoder_layers.append(Dense(units=self._feature_columns_dims, 181 | kernel_initializer=self._pretrain_initializer, 182 | name='decoder_0')) 183 | # Pretrain - autoencoder, encoder 184 | # autoencoder 185 | self._autoencoder = keras.Sequential(layers=[self.input_layer] + self.encoder_layers + self.decoder_layers, 186 | name='autoencoder') 187 | self._autoencoder.compile(optimizer=self._pretrain_optimizer, loss='mse') 188 | # encoder 189 | self._encoder = keras.Sequential(layers=[self.input_layer] + self.encoder_layers, name='encoder') 190 | self._encoder.compile(optimizer=self._pretrain_optimizer, loss='mse') 191 | 192 | # pretrain_callbacks 193 | print('{} Training auto-encoder.'.format(datetime.now())) 194 | if self._pretrain_use_callbacks: 195 | callbacks = [ 196 | EarlyStopping(monitor='loss', 197 | patience=self._pretrain_cbearlystop_patience, min_delta=self._pretrain_cbearlystop_mindelta), 198 | ReduceLROnPlateau(monitor='loss', 199 | factor=self._pretrain_cbreduce_factor, patience=self._pretrain_cbreduce_patience) 200 | ] 201 | self._autoencoder.fit(self.input_x, self.input_y, 202 | epochs=self._pretrain_epochs, callbacks=callbacks, verbose=1) 203 | else: 204 | self._autoencoder.fit(self.input_x, self.input_y, 205 | epochs=self._pretrain_epochs, verbose=1) 206 | # encoded_input 207 | # type : numpy.ndarray shape : (num_of_all_records,num_of_cluster) (70000,10) if mnist 208 | print('{} Calculating encoded_input.'.format(datetime.now())) 209 | self.encoded_input = self._encoder.predict(x) 210 | 211 | del self._autoencoder 212 | del self._encoder 213 | del self.decoder_layers 214 | print('{} Done pre-train.'.format(datetime.now())) 215 | 216 | def call(self, inputs, training=None, mask=None): 217 | x = self.input_layer(inputs) 218 | for encoder_layer in self.encoder_layers: 219 | x = encoder_layer(x) 220 | return self.clustering_layer(x) 221 | 222 | def init_centroids(self): 223 | """ 224 | Training K-means `_kmeans_init` times on the output of encoder to get best initial centroids. 
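        The fitted centroids (self.kmeans.cluster_centers_, of shape
        [n_clusters, encoder_output_dim]) are later used by sqlflow_train_loop() to
        initialize the weights of the clustering layer.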
225 | :return: 226 | """ 227 | self.kmeans = KMeans(n_clusters=self._n_clusters, n_init=self._kmeans_init) 228 | self.y_pred_last = self.kmeans.fit_predict(self.encoded_input) 229 | print('{} Done init centroids by k-means.'.format(datetime.now())) 230 | 231 | def sqlflow_train_loop(self, x, epochs=1, verbose=0): 232 | """ Parameter `epochs` and `verbose` will not be used in this function. """ 233 | # There is a bug which will cause build failed when using `DenseFeatures` with `keras.Model` 234 | # https://github.com/tensorflow/tensorflow/issues/28111 235 | # Using 'predict' to solve this problem here. 236 | # Preparation 237 | for features in x.take(1): 238 | self.predict(x=features) 239 | 240 | # Get train.batch_size from sqlflow 241 | for feature_name, feature_series in features.items(): 242 | self._train_batch_size = feature_series.shape[0] 243 | break 244 | 245 | # Pre-train autoencoder to prepare weights of encoder layers. 246 | self.pre_train(x) 247 | 248 | # Initialize centroids for clustering. 249 | self.init_centroids() 250 | 251 | # Setting cluster layer. 252 | self.get_layer(name='clustering').set_weights([self.kmeans.cluster_centers_]) 253 | 254 | # Train 255 | # flatten y to shape (num_samples, flattened_features) 256 | record_num = self.input_y.shape[0] 257 | feature_dims = self.input_y.shape[1:] 258 | feature_dim_total = 1 259 | for d in feature_dims: 260 | feature_dim_total = feature_dim_total * d 261 | y_reshaped = self.input_y.reshape([record_num, feature_dim_total]) 262 | print('{} Done preparing training dataset.'.format(datetime.now())) 263 | 264 | index_array = np.arange(record_num) 265 | index, loss, p = 0, 0., None 266 | 267 | for ite in range(self._train_max_iters): 268 | if ite % self._update_interval == 0: 269 | q = self.predict(self.input_x) # numpy.ndarray shape(record_num,n_clusters) 270 | p = self.target_distribution(q) # update the auxiliary target distribution p 271 | 272 | if self._train_use_tol: 273 | y_pred = q.argmax(1) 274 | # delta_percentage means the percentage of changed predictions in this train stage. 
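                # For example, if 38 of 1000 samples changed their cluster assignment
                # since the last check, delta_percentage == 0.038; training stops early
                # once this value drops below `tol` (the numbers are illustrative only).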
275 | delta_percentage = np.sum(y_pred != self.y_pred_last).astype(np.float32) / y_pred.shape[0] 276 | print('{} Updating at iter: {} -> delta_percentage: {}.'.format(datetime.now(), ite, delta_percentage)) 277 | self.y_pred_last = np.copy(y_pred) 278 | if ite > 0 and delta_percentage < self._tol: 279 | print('Early stopping since delta_table {} has reached tol {}'.format(delta_percentage, self._tol)) 280 | break 281 | idx = index_array[index * self._train_batch_size: min((index + 1) * self._train_batch_size, record_num)] 282 | 283 | loss = self.train_on_batch(x=list(y_reshaped[idx].T), y=p[idx]) 284 | if ite % 100 == 0: 285 | print('{} Training at iter:{} -> loss:{}.'.format(datetime.now(), ite, loss)) 286 | index = index + 1 if (index + 1) * self._train_batch_size <= record_num else 0 # Update index 287 | 288 | def display_model_info(self, verbose=0): 289 | if verbose >= 0: 290 | print('Summary : ') 291 | print(self.summary()) 292 | if verbose >= 1: 293 | print('Layer\'s Shape : ') 294 | for layer in self.encoder_layers: 295 | print(layer.name + ' : ') 296 | for i in layer.get_weights(): 297 | print(i.shape) 298 | print(self.clustering_layer.name + ' : ') 299 | for i in self.clustering_layer.get_weights(): 300 | print(i.shape) 301 | if verbose >= 2: 302 | print('Layer\'s Info : ') 303 | for layer in self.encoder_layers: 304 | print(layer.name + ' : ') 305 | print(layer.get_weights()) 306 | # Cluster 307 | print(self.clustering_layer.name + ' : ') 308 | print(self.clustering_layer.get_weights()) 309 | 310 | 311 | def optimizer(): 312 | global _train_lr 313 | return SGD(lr=_train_lr, momentum=0.9) 314 | 315 | def loss(labels, output): 316 | global _default_loss 317 | return _default_loss(labels, output) 318 | 319 | def prepare_prediction_column(prediction): 320 | """ Return the cluster label of the highest probability. """ 321 | return prediction.argmax(axis=-1) 322 | 323 | class ClusteringLayer(Layer): 324 | def __init__(self, n_clusters, alpha=1.0, **kwargs): 325 | """ 326 | Using clustering layer to refine the cluster centroids by learning from current high confidence assignment 327 | using auxiliary target distribution. 328 | 329 | :param n_clusters: Number of clusters. 330 | :param weights: Initial cluster centroids. 331 | :param alpha: Degrees of freedom parameters in Student's t-distribution. Default to 1.0 for all experiments. 
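        The layer outputs the soft assignment
        q_ij = (1 + ||z_i - mu_j||**2 / alpha) ** (-(alpha + 1) / 2), normalized over
        the clusters j, which is what call() computes below.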
332 |         :param kwargs:
333 |         """
334 |         self.n_clusters = n_clusters
335 |         self.alpha = alpha
336 |         self.input_spec = InputSpec(ndim=2)
337 |         super(ClusteringLayer, self).__init__(**kwargs)
338 |
339 |     def build(self, input_shape):
340 |         input_dim = input_shape[1]
341 |         self.input_spec = InputSpec(dtype=backend.floatx(), shape=(None, input_dim))
342 |         shape = tf.TensorShape(dims=(self.n_clusters, input_dim))
343 |         self.kernel = self.add_weight(name='kernel', shape=shape, initializer='glorot_uniform', trainable=True)
344 |         super(ClusteringLayer, self).build(shape)
345 |
346 |     def call(self, inputs, **kwargs):
347 |         q = 1.0 / (1.0 + (backend.sum(backend.square(backend.expand_dims(inputs, axis=1) - self.kernel),
348 |                                       axis=2) / self.alpha))
349 |         q **= (self.alpha + 1.0) / 2.0
350 |         q = backend.transpose(backend.transpose(q) / backend.sum(q, axis=1))
351 |         return q
352 |
353 |     def compute_output_shape(self, input_shape):
354 |         assert input_shape and len(input_shape) == 2
355 |         return input_shape[0], self.n_clusters
356 |
357 |     def get_config(self):
358 |         config = {'n_clusters': self.n_clusters}
359 |         base_config = super(ClusteringLayer, self).get_config()
360 |         return dict(list(base_config.items()) + list(config.items()))
361 |
--------------------------------------------------------------------------------
/sqlflow_models/dnnclassifier.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | class DNNClassifier(tf.keras.Model):
4 |     def __init__(self, feature_columns=None, hidden_units=[100,100], n_classes=3):
5 |         """DNNClassifier
6 |         :param feature_columns: feature columns.
7 |         :type feature_columns: list[tf.feature_column].
8 |         :param hidden_units: list of hidden units per layer.
9 |         :type hidden_units: list[int].
10 |         :param n_classes: number of label classes. Defaults to 3.
11 |         :type n_classes: int.
12 |         """
13 |         global _loss
14 |         super(DNNClassifier, self).__init__()
15 |         self.feature_layer = None
16 |         self.n_classes = n_classes
17 |         if feature_columns is not None:
18 |             # combines all the data as a dense tensor
19 |             self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
20 |         self.hidden_layers = []
21 |         for hidden_unit in hidden_units:
22 |             self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu'))
23 |         if self.n_classes == 2:
24 |             # special setup for binary classification
25 |             pred_act = 'sigmoid'
26 |             _loss = 'binary_crossentropy'
27 |             n_out = 1
28 |         else:
29 |             pred_act = 'softmax'
30 |             _loss = 'categorical_crossentropy'
31 |             n_out = self.n_classes
32 |         self.prediction_layer = tf.keras.layers.Dense(n_out, activation=pred_act)
33 |
34 |     def call(self, inputs, training=True):
35 |         if self.feature_layer is not None:
36 |             x = self.feature_layer(inputs)
37 |         else:
38 |             x = tf.keras.layers.Flatten()(inputs)
39 |         for hidden_layer in self.hidden_layers:
40 |             x = hidden_layer(x)
41 |         return self.prediction_layer(x)
42 |
43 | def optimizer(learning_rate=0.001):
44 |     """Default optimizer name. Used in model.compile."""
45 |     return tf.keras.optimizers.Adagrad(lr=learning_rate)
46 |
47 | def loss(labels, output):
48 |     """Default loss function.
Used in model.compile.""" 49 | global _loss 50 | if _loss == "binary_crossentropy": 51 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 52 | elif _loss == "categorical_crossentropy": 53 | return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(labels, output)) 54 | 55 | def prepare_prediction_column(prediction): 56 | """Return the class label of highest probability.""" 57 | return prediction.argmax(axis=-1) 58 | 59 | def eval_metrics_fn(): 60 | return { 61 | "accuracy": lambda labels, predictions: tf.equal( 62 | tf.argmax(predictions, 1, output_type=tf.int32), 63 | tf.cast(tf.reshape(labels, [-1]), tf.int32), 64 | ) 65 | } 66 | -------------------------------------------------------------------------------- /sqlflow_models/dnnclassifier_functional_api_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | global _loss 4 | 5 | def dnnclassifier_functional_model(feature_columns, field_metas, n_classes=2, learning_rate=0.001): 6 | feature_layer_inputs = dict() 7 | for fmkey in field_metas: 8 | fm = field_metas[fmkey] 9 | feature_layer_inputs[fm["feature_name"]] = tf.keras.Input(shape=(fm["shape"]), name=fm["feature_name"], dtype=fm["dtype"]) 10 | feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 11 | feature_layer_outputs = feature_layer(feature_layer_inputs) 12 | global _loss 13 | if n_classes == 2: 14 | # special setup for binary classification 15 | pred_act = 'sigmoid' 16 | _loss = 'binary_crossentropy' 17 | else: 18 | pred_act = 'softmax' 19 | _loss = 'categorical_crossentropy' 20 | x = tf.keras.layers.Dense(128, activation='relu')(feature_layer_outputs) 21 | x = tf.keras.layers.Dense(64, activation='relu')(x) 22 | pred = tf.keras.layers.Dense(n_classes, activation=pred_act)(x) 23 | return tf.keras.Model(inputs=[v for v in feature_layer_inputs.values()], outputs=pred) 24 | 25 | def loss(labels, output): 26 | global _loss 27 | if _loss == "binary_crossentropy": 28 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 29 | elif _loss == "categorical_crossentropy": 30 | return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(labels, output)) 31 | 32 | def epochs(): 33 | return 1 34 | 35 | def optimizer(lr=0.1): 36 | return tf.keras.optimizers.Adagrad(lr=lr) 37 | 38 | def prepare_prediction_column(self, prediction): 39 | """Return the class label of highest probability.""" 40 | return prediction.argmax(axis=-1) 41 | -------------------------------------------------------------------------------- /sqlflow_models/dnnregressor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class DNNRegressor(tf.keras.Model): 4 | def __init__(self, feature_columns=None, hidden_units=[100,100]): 5 | """DNNRegressor 6 | :param feature_columns: feature columns. 7 | :type feature_columns: list[tf.feature_column]. 8 | :param hidden_units: number of hidden units. 9 | :type hidden_units: list[int]. 
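        Example (a minimal sketch outside of SQLFlow; the column name 'x' is hypothetical):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x')]
            model = DNNRegressor(feature_columns=fc, hidden_units=[64, 32])
            model.compile(optimizer=optimizer(), loss=loss)  # module-level helpers below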
10 | """ 11 | super(DNNRegressor, self).__init__() 12 | self.feature_layer = None 13 | if feature_columns is not None: 14 | # combines all the data as a dense tensor 15 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 16 | self.hidden_layers = [] 17 | for hidden_unit in hidden_units: 18 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu')) 19 | self.prediction_layer = tf.keras.layers.Dense(1) 20 | 21 | def call(self, inputs, training=True): 22 | if self.feature_layer is not None: 23 | x = self.feature_layer(inputs) 24 | else: 25 | x = tf.keras.layers.Flatten()(inputs) 26 | for hidden_layer in self.hidden_layers: 27 | x = hidden_layer(x) 28 | return self.prediction_layer(x) 29 | 30 | def optimizer(learning_rate=0.001): 31 | """Default optimizer name. Used in model.compile.""" 32 | return tf.keras.optimizers.Adagrad(lr=learning_rate) 33 | 34 | def loss(labels, output): 35 | """Default loss function. Used in model.compile.""" 36 | return tf.keras.losses.MSE(labels, output) 37 | 38 | def prepare_prediction_column(prediction): 39 | """Return the prediction directly.""" 40 | return prediction[0] 41 | 42 | def eval_metrics_fn(): 43 | return { 44 | "mse": lambda labels, predictions: tf.reduce_mean( 45 | tf.pow( 46 | tf.cast(predictions, tf.float64) - tf.cast(labels, tf.float64), 2) 47 | ) 48 | } 49 | -------------------------------------------------------------------------------- /sqlflow_models/gcn.py: -------------------------------------------------------------------------------- 1 | # Based on the code from: https://github.com/tkipf/keras-gcn 2 | import tensorflow as tf 3 | from tensorflow.keras import activations, initializers, constraints 4 | from tensorflow.keras import regularizers 5 | import tensorflow.keras.backend as K 6 | import scipy.sparse as sp 7 | import numpy as np 8 | import pickle, copy 9 | 10 | 11 | class GCN(tf.keras.Model): 12 | def __init__(self, nhid, nclass, epochs, train_ratio, eval_ratio, 13 | sparse_input=True, early_stopping=True, dropout=0.5, nlayer=2, feature_columns=None, 14 | id_col='id', feature_col='features', from_node_col='from_node_id', to_node_col='to_node_id'): 15 | """ 16 | Implementation of GCN in this paper: https://arxiv.org/pdf/1609.02907.pdf. The original tensorflow implementation 17 | is accessible here: https://github.com/tkipf/gcn, and one can find more information about GCN through: 18 | http://tkipf.github.io/graph-convolutional-networks/. 19 | :param nhid: Number of hidden units for GCN. 20 | type nhid: int. 21 | :param nclass: Number of classes in total which will be the output dimension. 22 | type nclass: int. 23 | :param epochs: Number of epochs for the model to be trained. 24 | type epochs: int. 25 | :param train_ratio: Percentage of data points to be used for training. 26 | type train_ratio: float. 27 | :param eval_ratio: Percentage of data points to be used for evaluating. 28 | type eval_ratio: float. 29 | :param early_stopping: Whether to use early stopping trick during the training phase. 30 | type early_stopping: bool. 31 | :param dropout: The rate for dropout. 32 | type dropout: float. 33 | :param nlayer: Number of GCNLayer to be used in the model. 34 | type nlayer: int. 35 | :param feature_columns: a list of tf.feature_column. (Not used in this model) 36 | type feature_columns: list. 37 | :param id_col: Name for the column in database to be used as the id of each node. 38 | type id_col: string. 
39 | :param feature_col: Name for the column in database to be used as the features of each node. 40 | type feature_col: string. 41 | :param from_node_col: Name for the column in database to be used as the from_node id of each edge. 42 | type from_node_col: string. 43 | :param to_node_col: Name for the column in database to be used as the to_node id of each edge. 44 | type to_node_col: string. 45 | """ 46 | super(GCN, self).__init__() 47 | 48 | assert dropout < 1 and dropout > 0, "Please make sure dropout rate is a float between 0 and 1." 49 | assert train_ratio < 1 and train_ratio > 0, "Please make sure train_ratio is a float between 0 and 1." 50 | assert eval_ratio < 1 and eval_ratio > 0, "Please make sure eval_ratio is a float between 0 and 1." 51 | self.gc_layers = list() 52 | self.gc_layers.append(GCNLayer(nhid, kernel_regularizer=tf.keras.regularizers.l2(5e-4), sparse_input=sparse_input)) 53 | for i in range(nlayer-1): 54 | self.gc_layers.append(GCNLayer(nhid, kernel_regularizer=tf.keras.regularizers.l2(5e-4))) 55 | self.gc_layers.append(GCNLayer(nclass)) 56 | self.keep_prob = 1 - dropout 57 | self.dropout = tf.keras.layers.Dropout(dropout) 58 | self.nshape = None 59 | self.train_ratio = train_ratio 60 | self.eval_ratio = eval_ratio 61 | self.nlayer = nlayer 62 | self.epochs = epochs 63 | self.early_stopping = early_stopping 64 | self.sparse_input = sparse_input 65 | self.id_col = id_col 66 | self.feature_col = feature_col 67 | self.from_node_col = from_node_col 68 | self.to_node_col = to_node_col 69 | # try to load the result file 70 | try: 71 | with open('./results.pkl', 'rb') as f: 72 | self.results = pickle.load(f) 73 | except (FileNotFoundError, IOError): 74 | self.results = None 75 | 76 | def call(self, data): 77 | x, adj = data 78 | assert self.nshape is not None, "Should calculate the shape of input by preprocessing the data with model.preprocess(data)." 79 | if self.sparse_input: 80 | x = GCN.sparse_dropout(x, self.keep_prob, self.nshape) 81 | else: 82 | x = self.dropout(x) 83 | for i in range(self.nlayer-1): 84 | x = tf.keras.activations.relu(self.gc_layers[i](x, adj)) 85 | x = self.dropout(x) 86 | x = self.gc_layers[-1](x, adj) 87 | 88 | return tf.keras.activations.softmax(x) 89 | 90 | def evaluate(self, data, y, sample_weight): 91 | """Function to evaluate the model.""" 92 | return self.test(sample_weight, return_loss=True) 93 | 94 | def predict(self, data): 95 | """Function to predict labels with the model.""" 96 | x, adj = data 97 | for i in range(self.nlayer-1): 98 | x = tf.keras.activations.relu(self.gc_layers[i](x, adj)) 99 | x = self.gc_layers[-1](x, adj) 100 | return tf.keras.activations.softmax(x) 101 | 102 | @staticmethod 103 | def sparse_dropout(x, keep_prob, noise_shape): 104 | """Dropout for sparse tensors.""" 105 | random_tensor = keep_prob 106 | random_tensor += tf.random.uniform(noise_shape) 107 | dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) 108 | pre_out = tf.sparse.retain(x, dropout_mask) 109 | return pre_out * (1./keep_prob) 110 | 111 | @staticmethod 112 | def encode_onehot(labels): 113 | classes = set(labels) 114 | classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)} 115 | labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32) 116 | return labels_onehot 117 | 118 | @staticmethod 119 | def normalize_adj(adjacency, symmetric=True): 120 | """ 121 | Function to normalize the adjacency matrix (get the laplacian matrix). 122 | :param adjacency: Adjacency matrix of the dataset. 
123 | type adjacency: Scipy COO_Matrix. 124 | :param symmetric: Boolean variable to determine whether to use symmetric laplacian. 125 | type symmetric: bool. 126 | """ 127 | adjacency += sp.eye(adjacency.shape[0]) 128 | if symmetric: 129 | """L=D^-0.5 * (A+I) * D^-0.5""" 130 | d = sp.diags(np.power(np.array(adjacency.sum(1)), -0.5).flatten(), 0) 131 | a_norm = adjacency.dot(d).transpose().dot(d).tocoo() 132 | else: 133 | """L=D^-1 * (A+I)""" 134 | d = sp.diags(np.power(np.array(adjacency.sum(1)), -1).flatten(), 0) 135 | a_norm = d.dot(adjacency).tocoo() 136 | 137 | return a_norm 138 | 139 | @staticmethod 140 | def normalize_feature(features, sparse_input): 141 | """Function to row-normalize the features input.""" 142 | rowsum = np.array(features.sum(1)) 143 | r_inv = np.power(rowsum, -1).flatten() 144 | r_inv[np.isinf(r_inv)] = 0. 145 | r_mat_inv = sp.diags(r_inv) 146 | features = r_mat_inv.dot(features) 147 | if sparse_input: 148 | return sp.csr_matrix(features).tocoo() 149 | else: 150 | return features 151 | 152 | def preprocess(self, ids, features, labels, edges): 153 | """Function to preprocess the node features and adjacency matrix.""" 154 | if len(features.shape) > 2: 155 | features = np.squeeze(features) 156 | if len(edges.shape) > 2: 157 | edges = np.squeeze(edges) 158 | # sort the data in the correct order 159 | idx = np.argsort(np.array(ids)) 160 | features = features[idx] 161 | labels = labels[idx] 162 | # preprocess 163 | features = GCN.normalize_feature(features, self.sparse_input) 164 | labels = GCN.encode_onehot(labels) 165 | adjacency = sp.coo_matrix((np.ones(len(edges)), 166 | (edges[:, 0], edges[:, 1])), 167 | shape=(features.shape[0], features.shape[0]), dtype="float32") 168 | 169 | adjacency = adjacency + adjacency.T.multiply(adjacency.T > adjacency) - adjacency.multiply(adjacency.T > adjacency) 170 | adjacency = GCN.normalize_adj(adjacency, symmetric=True) 171 | 172 | nf_shape = features.data.shape 173 | na_shape = adjacency.data.shape 174 | if self.sparse_input: 175 | features = tf.SparseTensor( 176 | indices=np.array(list(zip(features.row, features.col)), dtype=np.int64), 177 | values=tf.cast(features.data, tf.float32), 178 | dense_shape=features.shape) 179 | features = tf.sparse.reorder(features) 180 | adjacency = tf.SparseTensor( 181 | indices=np.array(list(zip(adjacency.row, adjacency.col)), dtype=np.int64), 182 | values=tf.cast(adjacency.data, tf.float32), 183 | dense_shape=adjacency.shape) 184 | adjacency = tf.sparse.reorder(adjacency) 185 | 186 | total_num = features.shape[0] 187 | train_num = round(total_num*self.train_ratio) 188 | eval_num = round(total_num*self.eval_ratio) 189 | train_index = np.arange(train_num) 190 | val_index = np.arange(train_num, train_num+eval_num) 191 | test_index = np.arange(train_num+eval_num, total_num) 192 | 193 | self.train_mask = np.zeros(total_num, dtype = np.bool) 194 | self.val_mask = np.zeros(total_num, dtype = np.bool) 195 | self.test_mask = np.zeros(total_num, dtype = np.bool) 196 | self.train_mask[train_index] = True 197 | self.val_mask[val_index] = True 198 | self.test_mask[test_index] = True 199 | 200 | print('Dataset has {} nodes, {} edges, {} features.'.format(features.shape[0], edges.shape[0], features.shape[1])) 201 | 202 | return features, labels, adjacency, nf_shape, na_shape 203 | 204 | def loss_func(self, model, x, y, train_mask, training=True): 205 | '''Customed loss function for the model.''' 206 | 207 | y_ = model(x, training=training) 208 | 209 | test_mask_logits = tf.gather_nd(y_, tf.where(train_mask)) 210 
| masked_labels = tf.gather_nd(y, tf.where(train_mask)) 211 | 212 | return loss(labels=masked_labels, output=test_mask_logits) 213 | 214 | def grad(self, model, inputs, targets, train_mask): 215 | '''Calculate the gradients of the parameters.''' 216 | with tf.GradientTape() as tape: 217 | loss_value = self.loss_func(model, inputs, targets, train_mask) 218 | 219 | return loss_value, tape.gradient(loss_value, model.trainable_variables) 220 | 221 | def test(self, mask, return_loss=False): 222 | '''Test the results on the model. Return accuracy''' 223 | logits = self.predict(data=[self.features, self.adjacency]) 224 | 225 | test_mask_logits = tf.gather_nd(logits, tf.where(mask)) 226 | masked_labels = tf.gather_nd(self.labels, tf.where(mask)) 227 | 228 | ll = tf.equal(tf.argmax(masked_labels, -1), tf.argmax(test_mask_logits, -1)) 229 | accuracy = tf.reduce_mean(tf.cast(ll, dtype=tf.float32)) 230 | 231 | if return_loss: 232 | loss_value = loss(labels=masked_labels, output=test_mask_logits) 233 | return [loss_value, accuracy] 234 | 235 | return accuracy 236 | 237 | def sqlflow_train_loop(self, x): 238 | """Customized training function.""" 239 | # load data 240 | ids, ids_check, features, labels, edges, edge_check = list(), dict(), list(), list(), list(), dict() 241 | from_node = 0 242 | for inputs, label in x: 243 | id = inputs[self.id_col].numpy().astype(np.int32) 244 | feature = inputs[self.feature_col].numpy().astype(np.float32) 245 | from_node = inputs[self.from_node_col].numpy().astype(np.int32) 246 | to_node = inputs[self.to_node_col].numpy().astype(np.int32) 247 | if int(id) not in ids_check: 248 | ids.append(int(id)) 249 | features.append(feature) 250 | labels.append(label.numpy()[0]) 251 | ids_check[int(id)] = 0 252 | if tuple([int(from_node), int(to_node)]) not in edge_check: 253 | edge_check[tuple([int(from_node), int(to_node)])] = 0 254 | edges.append([from_node, to_node]) 255 | features = np.stack(features) 256 | labels = np.stack(labels) 257 | edges = np.stack(edges) 258 | 259 | self.features, self.labels, self.adjacency, self.nshape, na_shape = self.preprocess(ids, features, labels, edges) 260 | # training the model 261 | wait = 0 262 | best_acc = -9999999 263 | PATIENCE = 10 264 | for epoch in range(self.epochs): 265 | # calculate the gradients and take the step 266 | loss_value, grads = self.grad(self, [self.features, self.adjacency], self.labels, self.train_mask) 267 | optimizer().apply_gradients(zip(grads, self.trainable_variables)) 268 | # Test on train and evaluate dataset 269 | train_acc = self.test(self.train_mask) 270 | val_acc = self.test(self.val_mask) 271 | print("Epoch {} loss={:6f} accuracy={:6f} val_acc={:6f}".format(epoch, loss_value, train_acc, val_acc)) 272 | # early stopping 273 | if epoch > 50 and self.early_stopping: 274 | if float(val_acc.numpy()) > best_acc: 275 | best_acc = float(val_acc.numpy()) 276 | wait = 0 277 | else: 278 | if wait >= PATIENCE: 279 | print('Epoch {}: early stopping'.format(epoch)) 280 | break 281 | wait += 1 282 | # evaluate the model 283 | result = self.evaluate(data=[self.features, self.adjacency], y=self.labels, sample_weight=self.val_mask) 284 | # get all the results 285 | predicted = self.predict([self.features, self.adjacency]) 286 | # store the results in a pickled file 287 | with open('./results.pkl', 'wb') as f: 288 | results = dict() 289 | for i in range(len(ids)): 290 | results[str(ids[i])] = predicted[i] 291 | results['evaluation'] = result 292 | pickle.dump(results, f) 293 | self.results = results 294 | 295 | def 
sqlflow_evaluate_loop(self, x, metric_names): 296 | """Customed evaluation, can only support calculating the accuracy.""" 297 | assert self.results is not None, "Please make sure to train the model first." 298 | eval_result = self.results['evaluation'] 299 | return eval_result 300 | 301 | def sqlflow_predict_one(self, sample): 302 | """Customed prediction, sample must be the node id.""" 303 | assert self.results is not None, "Please make sure to train the model first." 304 | prediction = self.results[str(int(sample))] 305 | return [prediction] 306 | 307 | def optimizer(): 308 | """Default optimizer name. Used in model.compile.""" 309 | return tf.keras.optimizers.Adam(lr=0.01) 310 | 311 | def loss(labels, output): 312 | """Default loss function for classification task.""" 313 | criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False) 314 | return criterion(y_true=labels, y_pred=output) 315 | 316 | # Graph Convolutional Layer 317 | class GCNLayer(tf.keras.layers.Layer): 318 | 319 | def __init__(self, units, use_bias=True, sparse_input=False, 320 | kernel_initializer='glorot_uniform', 321 | bias_initializer='zeros', 322 | kernel_regularizer=None, 323 | bias_regularizer=None, 324 | kernel_constraint=None, 325 | bias_constraint=None, 326 | **kwargs): 327 | """GCNLayer 328 | Graph Convolutional Networks Layer from paper: https://arxiv.org/pdf/1609.02907.pdf. This is used in the GCN model for 329 | classification task on graph-structured data. 330 | :param units: Number of hidden units for the layer. 331 | type units: int. 332 | :param use_bias: Boolean variable to determine whether to use bias. 333 | type use_bias: bool. 334 | :param sparse_input: Boolean variable to check if input tensor is sparse. 335 | type sparse_input: bool. 336 | :param kernel_initializer: Weight initializer for the GCN kernel. 337 | :param bias_initializer: Weight initializer for the bias. 338 | :param kernel_regularizer: Weight regularizer for the GCN kernel. 339 | :param bias_regularizer: Weight regularizer for the bias. 340 | :param kernel_constraint: Weight value constraint for the GCN kernel. 341 | :param bias_constraint: Weight value constraint for the bias. 
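        In call(), the layer computes output = adj @ (inputs @ kernel) (+ bias), i.e. the
        GCN propagation rule, where `adj` is the normalized (sparse) adjacency matrix.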
342 | :param kwargs: 343 | """ 344 | if 'input_shape' not in kwargs and 'input_dim' in kwargs: 345 | kwargs['input_shape'] = (kwargs.pop('input_dim'),) 346 | super(GCNLayer, self).__init__(**kwargs) 347 | self.units = units 348 | self.use_bias = use_bias 349 | self.sparse_input = sparse_input 350 | self.kernel_initializer = initializers.get(kernel_initializer) 351 | self.bias_initializer = initializers.get(bias_initializer) 352 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 353 | self.bias_regularizer = regularizers.get(bias_regularizer) 354 | self.kernel_constraint = constraints.get(kernel_constraint) 355 | self.bias_constraint = constraints.get(bias_constraint) 356 | 357 | def build(self, input_shape): 358 | self.kernel = self.add_weight(shape=(input_shape[-1], self.units), 359 | initializer=self.kernel_initializer, 360 | name='kernel', 361 | regularizer=self.kernel_regularizer, 362 | constraint=self.kernel_constraint, 363 | trainable=True) 364 | if self.use_bias: 365 | self.bias = self.add_weight(shape=(self.units,), 366 | initializer=self.bias_initializer, 367 | name='bias', 368 | regularizer=self.bias_regularizer, 369 | constraint=self.bias_constraint, 370 | trainable=True) 371 | self.built = True 372 | 373 | def call(self, inputs, adj, **kwargs): 374 | assert isinstance(adj, tf.SparseTensor), "Adjacency matrix should be a SparseTensor" 375 | if self.sparse_input: 376 | assert isinstance(inputs, tf.SparseTensor), "Input matrix should be a SparseTensor" 377 | support = tf.sparse.sparse_dense_matmul(inputs, self.kernel) 378 | else: 379 | support = tf.matmul(inputs, self.kernel) 380 | output = tf.sparse.sparse_dense_matmul(adj, support) 381 | if self.use_bias: 382 | output = output + self.bias 383 | else: 384 | output = output 385 | return output 386 | 387 | def get_config(self): 388 | config = {'units': self.units, 389 | 'use_bias': self.use_bias, 390 | 'sparse_input': self.sparse_input, 391 | 'kernel_initializer': initializers.serialize( 392 | self.kernel_initializer), 393 | 'bias_initializer': initializers.serialize( 394 | self.bias_initializer), 395 | 'kernel_regularizer': regularizers.serialize( 396 | self.kernel_regularizer), 397 | 'bias_regularizer': regularizers.serialize( 398 | self.bias_regularizer), 399 | 'kernel_constraint': constraints.serialize( 400 | self.kernel_constraint), 401 | 'bias_constraint': constraints.serialize(self.bias_constraint) 402 | } 403 | base_config = super(GCNLayer, self).get_config() 404 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /sqlflow_models/native_keras.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class RawDNNClassifier(tf.keras.Model): 4 | def __init__(self, hidden_units=[100,100], n_classes=3): 5 | super(RawDNNClassifier, self).__init__() 6 | self.feature_layer = None 7 | self.n_classes = n_classes 8 | self.hidden_layers = [] 9 | for hidden_unit in hidden_units: 10 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu')) 11 | if self.n_classes == 2: 12 | pred_act = 'sigmoid' 13 | n_out = 1 14 | else: 15 | pred_act = 'softmax' 16 | n_out = self.n_classes 17 | self.prediction_layer = tf.keras.layers.Dense(n_out, activation=pred_act) 18 | 19 | def call(self, inputs, training=True): 20 | if self.feature_layer is not None: 21 | x = self.feature_layer(inputs) 22 | else: 23 | x = tf.keras.layers.Flatten()(inputs) 24 | for hidden_layer 
in self.hidden_layers: 25 | x = hidden_layer(x) 26 | return self.prediction_layer(x) 27 | -------------------------------------------------------------------------------- /sqlflow_models/one_class_svm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The SQLFlow Authors. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | import pickle 16 | 17 | import numpy as np 18 | import tensorflow as tf 19 | from sklearn.svm import OneClassSVM as SklearnOneClassSVM 20 | 21 | MODEL_DIR = "model_save" 22 | MODEL_PATH = MODEL_DIR + "/one_class_svm_model" 23 | 24 | ENABLE_EAGER_EXECUTION = False 25 | 26 | try: 27 | tf.enable_eager_execution() 28 | ENABLE_EAGER_EXECUTION = True 29 | except Exception: 30 | try: 31 | tf.compat.v1.enable_eager_execution() 32 | ENABLE_EAGER_EXECUTION = True 33 | except Exception: 34 | ENABLE_EAGER_EXECUTION = False 35 | 36 | 37 | def dataset_reader(dataset): 38 | if ENABLE_EAGER_EXECUTION: 39 | for features in dataset: 40 | yield features 41 | else: 42 | iter = dataset.make_one_shot_iterator() 43 | one_element = iter.get_next() 44 | with tf.Session() as sess: 45 | try: 46 | while True: 47 | yield sess.run(one_element) 48 | except tf.errors.OutOfRangeError: 49 | pass 50 | 51 | 52 | class OneClassSVM(tf.keras.Model): 53 | def __init__(self, 54 | feature_columns=None, 55 | kernel='rbf', 56 | degree=3, 57 | gamma='scale', 58 | coef0=0.0, 59 | tol=0.001, 60 | nu=0.5, 61 | shrinking=True, 62 | cache_size=200, 63 | verbose=False, 64 | max_iter=-1): 65 | if os.path.exists(MODEL_PATH): 66 | with open(MODEL_PATH, "rb") as f: 67 | self.svm = pickle.load(f) 68 | else: 69 | self.svm = SklearnOneClassSVM(kernel=kernel, 70 | degree=degree, 71 | gamma=gamma, 72 | coef0=coef0, 73 | tol=tol, 74 | nu=nu, 75 | shrinking=shrinking, 76 | cache_size=cache_size, 77 | verbose=verbose, 78 | max_iter=max_iter) 79 | 80 | def concat_features(self, features): 81 | assert isinstance(features, dict) 82 | each_feature = [] 83 | for k, v in features.items(): 84 | if ENABLE_EAGER_EXECUTION: 85 | v = v.numpy() 86 | each_feature.append(v) 87 | return np.concatenate(each_feature, axis=1) 88 | 89 | def sqlflow_train_loop(self, dataset): 90 | X = [] 91 | for features in dataset_reader(dataset): 92 | X.append(self.concat_features(features)) 93 | X = np.concatenate(X) 94 | 95 | self.svm.fit(X) 96 | 97 | if not os.path.exists(MODEL_DIR): 98 | os.mkdir(MODEL_DIR) 99 | 100 | with open(MODEL_PATH, "wb") as f: 101 | pickle.dump(self.svm, f, protocol=2) 102 | 103 | def sqlflow_predict_one(self, features): 104 | features = self.concat_features(features) 105 | pred = self.svm.predict(features) 106 | score = self.svm.decision_function(features) 107 | return pred, score 108 | -------------------------------------------------------------------------------- /sqlflow_models/rnn_based_time_series.py: -------------------------------------------------------------------------------- 1 | import tensorflow as 
tf 2 | 3 | class RNNBasedTimeSeriesModel(tf.keras.Model): 4 | 5 | def __init__(self, 6 | feature_columns=None, 7 | stack_units=[500, 500], 8 | n_in=7, 9 | n_out=1, 10 | n_features=1, 11 | model_type='rnn'): 12 | """RNNBasedTimeSeriesModel 13 | :param feature_columns: All columns must be embedding of sequence column with same sequence_length. 14 | type feature_columns: list[tf.feature_column.numeric_column]. 15 | :param stack_units: Units for RNN layer. 16 | type stack_units: vector of ints. 17 | :param n_in: Size of time window. 18 | type n_in: int. 19 | :param n_out: Number of predicted labels. 20 | type n_out: int. 21 | :param n_features: number of features in every time window. 22 | type n_features: int. 23 | :param model_type: Specific RNN model to be used, which can be chose from: ('rnn', 'lstm' and 'gru'). 24 | type model_type: string. 25 | """ 26 | super(RNNBasedTimeSeriesModel, self).__init__(name='RNN_TS_Model') 27 | # Common 28 | self.feature_columns = feature_columns 29 | self.loss = loss 30 | self.n_out = n_out 31 | self.n_in = n_in 32 | self.n_features = n_features 33 | self.stack_units = stack_units 34 | self.models = {'rnn':tf.keras.layers.SimpleRNN, 'lstm':tf.keras.layers.LSTM, 'gru':tf.keras.layers.GRU} 35 | # combines all the data as a dense tensor 36 | self.feature_layer = None 37 | if feature_columns is not None: 38 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 39 | self.stack_layers = [] 40 | for unit in self.stack_units[:-1]: 41 | self.stack_layers.append(self.models[model_type.lower()](unit, input_shape=(self.n_in, self.n_features), return_sequences=True)) 42 | self.rnn = self.models[model_type.lower()](self.stack_units[-1], input_shape=(self.n_in, self.n_features)) 43 | self.dropout = tf.keras.layers.Dropout(0.2) 44 | self.prediction_layer = tf.keras.layers.Dense(self.n_out) 45 | 46 | def call(self, inputs): 47 | if self.feature_layer: 48 | x = self.feature_layer(inputs) 49 | else: 50 | x = inputs 51 | x = tf.reshape(x, (-1, self.n_in, self.n_features)) 52 | for i in range(len(self.stack_units) - 1): 53 | x = self.stack_layers[i](x) 54 | x = self.rnn(x) 55 | x = self.dropout(x) 56 | return self.prediction_layer(x) 57 | 58 | def optimizer(learning_rate=0.001): 59 | """Default optimizer name. Used in model.compile.""" 60 | return tf.keras.optimizers.Adam(lr=learning_rate) 61 | 62 | def prepare_prediction_column(prediction): 63 | """Return the prediction directly.""" 64 | return prediction 65 | 66 | def loss(labels, output): 67 | return tf.reduce_mean(tf.keras.losses.MSE(labels, output)) 68 | 69 | -------------------------------------------------------------------------------- /sqlflow_models/rnnclassifier.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | _loss = '' 4 | 5 | class StackedRNNClassifier(tf.keras.Model): 6 | def __init__(self, feature_columns=None, stack_units=[32], hidden_size=64, n_classes=2, model_type='rnn', bidirectional=False): 7 | """StackedRNNClassifier 8 | :param feature_columns: All columns must be embedding of sequence column with same sequence_length. 9 | :type feature_columns: list[tf.embedding_column]. 10 | :param stack_units: Units for RNN layer. 11 | :type stack_units: vector of ints. 12 | :param n_classes: Target number of classes. 13 | :type n_classes: int. 14 | :param model_type: Specific RNN model to be used, which can be chose from: ('rnn', 'lstm' and 'gru'). 15 | :type model_type: string. 
16 | :param bidirectional: Whether to use bidirectional or not. 17 | :type bidirectional: bool. 18 | """ 19 | global _loss 20 | super(StackedRNNClassifier, self).__init__() 21 | 22 | self.models = {'rnn':tf.keras.layers.SimpleRNN, 'lstm':tf.keras.layers.LSTM, 'gru':tf.keras.layers.GRU} 23 | self.bidirectionals = {True: tf.keras.layers.Bidirectional, False: lambda x: x} 24 | self.feature_layer = None 25 | if feature_columns is not None: 26 | self.feature_layer = tf.keras.experimental.SequenceFeatures(feature_columns) 27 | self.stack_rnn = [] 28 | self.stack_size = len(stack_units) 29 | self.stack_units = stack_units 30 | self.n_classes = n_classes 31 | if self.stack_size > 1: 32 | for i in range(self.stack_size - 1): 33 | self.stack_rnn.append( 34 | self.bidirectionals[bidirectional](self.models[model_type.lower()](self.stack_units[i], return_sequences=True)) 35 | ) 36 | self.rnn = self.bidirectionals[bidirectional](self.models[model_type.lower()](self.stack_units[-1])) 37 | self.hidden = tf.keras.layers.Dense(hidden_size, activation='relu') 38 | if self.n_classes == 2: 39 | # special setup for binary classification 40 | pred_act = 'sigmoid' 41 | _loss = 'binary_crossentropy' 42 | n_out = 1 43 | else: 44 | pred_act = 'softmax' 45 | _loss = 'categorical_crossentropy' 46 | n_out = self.n_classes 47 | self.pred = tf.keras.layers.Dense(n_out, activation=pred_act) 48 | 49 | def call(self, inputs): 50 | if self.feature_layer: 51 | x, seq_len = self.feature_layer(inputs) 52 | else: 53 | x, seq_len = inputs 54 | seq_mask = tf.sequence_mask(seq_len) 55 | if self.stack_size > 1: 56 | for i in range(self.stack_size - 1): 57 | x = self.stack_rnn[i](x, mask=seq_mask) 58 | x = self.rnn(x, mask=seq_mask) 59 | x = self.hidden(x) 60 | return self.pred(x) 61 | 62 | def optimizer(): 63 | """Default optimizer name. 
Used in model.compile.""" 64 | return 'adam' 65 | 66 | def loss(labels, output): 67 | global _loss 68 | if _loss == "binary_crossentropy": 69 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 70 | elif _loss == "categorical_crossentropy": 71 | return tf.reduce_mean(tf.keras.losses.categorical_crossentropy(labels, output)) 72 | 73 | def prepare_prediction_column(prediction): 74 | """Return the class label of highest probability.""" 75 | return prediction.argmax(axis=-1) 76 | 77 | def eval_metrics_fn(): 78 | return { 79 | "accuracy": lambda labels, predictions: tf.equal( 80 | tf.argmax(predictions, 1, output_type=tf.int32), 81 | tf.cast(tf.reshape(labels, [-1]), tf.int32), 82 | ) 83 | } 84 | -------------------------------------------------------------------------------- /sqlflow_models/score_card.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.python.data import make_one_shot_iterator 6 | from tensorflow.keras.losses import kld 7 | from tensorflow.keras.optimizers import SGD 8 | import numpy as np 9 | import pandas as pd 10 | import scipy.stats.stats as stats 11 | import sklearn 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.metrics import roc_auc_score, auc 15 | import pickle 16 | 17 | 18 | def optimizer(): 19 | return None 20 | 21 | 22 | def loss(): 23 | return None 24 | 25 | 26 | class ScoreCard(keras.Model): 27 | 28 | def __init__(self, feature_columns=None, pf_bin_size=5): 29 | super(ScoreCard, self).__init__(name='ScoreCard') 30 | 31 | self._target_score = 600 32 | self._factor = 20/np.log(2) 33 | self._offset = 600 - 20*np.log(20) / np.log(2) 34 | self._bins = dict() 35 | self._pf_bin_size = pf_bin_size 36 | 37 | def _pf_bin(self, y, x): 38 | # population frequency bucket 39 | bad_num = y.sum() 40 | good_num = y.count() - y.sum() 41 | d1 = pd.DataFrame({'x': x,'y': y,'bucket': pd.qcut(x, self._pf_bin_size, duplicates='drop')}) 42 | d2 = d1.groupby('bucket',as_index=True) 43 | d3 = pd.DataFrame(d2.x.min(),columns=['min_bin']) 44 | 45 | d3["min"] = d2.min().x 46 | d3["max"] = d2.max().x 47 | d3["badcostum"] = d2.sum().y 48 | d3["goodcostum"] = d2.count().y - d2.sum().y 49 | d3["total"] = d2.count().y 50 | d3["bad_rate"] = d2.sum().y/d2.count().y 51 | d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"]*good_num/bad_num) 52 | iv = ((d3["badcostum"]/bad_num-d3["goodcostum"]/good_num)*d3["woe"]) 53 | d3["iv"] = iv 54 | woe = list(d3["woe"].round(6)) 55 | cut = list(d3["max"].round(6)) 56 | cut.insert(0, float("-inf")) 57 | cut[-1] = float("inf") 58 | return d3, cut, woe, iv 59 | 60 | def _to_dataframe(self, dataset): 61 | x_df = pd.DataFrame() 62 | y_df = pd.DataFrame() 63 | for _, minibatch in enumerate(dataset): 64 | data, label = minibatch 65 | dx = {} 66 | dy = {} 67 | for name, value in data.items(): 68 | dx[name] = value.numpy()[0][0] 69 | x_df = x_df.append(dx, ignore_index=True) 70 | dy['label'] = label.numpy()[0] 71 | y_df = y_df.append(dy, ignore_index=True) 72 | return x_df, y_df 73 | 74 | def _replace_woe(self, x, cut, woe): 75 | return pd.cut(x, cut, labels=pd.Categorical(woe)) 76 | 77 | def _woe_encoder(self, x, y): 78 | x_train_dict = {} 79 | for col in x.columns: 80 | dfx, cut, woe, iv = self._pf_bin(y, x[col]) 81 | self._bins[col] = (dfx, cut, woe, iv) 82 | # replacing by the WOE encode 83 | x_train_dict[col] = 
self._replace_woe(x[col], cut, woe) 84 | return pd.DataFrame.from_dict(x_train_dict) 85 | 86 | def sqlflow_train_loop(self, dataset, epochs=1, verbose=0): 87 | x_df, y_df = self._to_dataframe(dataset) 88 | x = self._woe_encoder(x_df, y_df['label']) 89 | x.to_csv("/tmp/train_woe.csv") 90 | lr = LogisticRegression() 91 | 92 | x_train, x_test, y_train, y_test = train_test_split(x, y_df['label']) 93 | lr.fit(x_train, y_train) 94 | prob = lr.predict_proba(x_test)[:, 1] 95 | auc_score = roc_auc_score(y_test, prob) 96 | print("AUC: {}\n".format(auc_score)) 97 | 98 | # print the score card 99 | print("TARGET SCORE: %d" % self._target_score) 100 | coe = lr.coef_ 101 | for i, col_name in enumerate(x_df.columns): 102 | bin_cols = self._bins[col_name][0].index.to_list() 103 | for j, w in enumerate(self._bins[col_name][2]): 104 | print(col_name, bin_cols[j], 105 | round(coe[0][i] * w * self._factor + 106 | self._offset/self._pf_bin_size, 0)) 107 | -------------------------------------------------------------------------------- /sqlflow_models/simple_dnn_generator.py: -------------------------------------------------------------------------------- 1 | # This file is based on the AdaNet example 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import functools 7 | import adanet 8 | import tensorflow as tf 9 | 10 | _NUM_LAYERS_KEY = "num_layers" 11 | 12 | 13 | class _SimpleDNNBuilder(adanet.subnetwork.Builder): 14 | """Builds a DNN subnetwork for AdaNet.""" 15 | 16 | def __init__(self, feature_columns, optimizer, layer_size, num_layers, learn_mixture_weights, 17 | seed): 18 | """Initializes a `_DNNBuilder`. 19 | 20 | Args: 21 | optimizer: An `Optimizer` instance for training both the subnetwork and 22 | the mixture weights. 23 | layer_size: The number of nodes to output at each hidden layer. 24 | num_layers: The number of hidden layers. 25 | learn_mixture_weights: Whether to solve a learning problem to find the 26 | best mixture weights, or use their default value according to the 27 | mixture weight type. When `False`, the subnetworks will return a no_op 28 | for the mixture weight train op. 29 | seed: A random seed. 30 | 31 | Returns: 32 | An instance of `_SimpleDNNBuilder`. 
33 | """ 34 | 35 | self._optimizer = optimizer 36 | self._layer_size = layer_size 37 | self._num_layers = num_layers 38 | self._learn_mixture_weights = learn_mixture_weights 39 | self._feature_columns = feature_columns 40 | self._seed = seed 41 | 42 | def build_subnetwork(self, 43 | features, 44 | logits_dimension, 45 | training, 46 | iteration_step, 47 | summary, 48 | previous_ensemble=None): 49 | """See `adanet.subnetwork.Builder`.""" 50 | 51 | input_layer = tf.compat.v1.feature_column.input_layer(features, self._feature_columns) 52 | kernel_initializer = tf.compat.v1.glorot_uniform_initializer(seed=self._seed) 53 | last_layer = input_layer 54 | for _ in range(self._num_layers): 55 | last_layer = tf.compat.v1.layers.dense( 56 | last_layer, 57 | units=self._layer_size, 58 | activation=tf.nn.relu, 59 | kernel_initializer=kernel_initializer) 60 | logits = tf.compat.v1.layers.dense( 61 | last_layer, 62 | units=logits_dimension, 63 | kernel_initializer=kernel_initializer) 64 | 65 | persisted_tensors = {_NUM_LAYERS_KEY: tf.constant(self._num_layers)} 66 | return adanet.Subnetwork( 67 | last_layer=last_layer, 68 | logits=logits, 69 | complexity=self._measure_complexity(), 70 | persisted_tensors=persisted_tensors) 71 | 72 | def _measure_complexity(self): 73 | """Approximates Rademacher complexity as the square-root of the depth.""" 74 | return tf.sqrt(tf.cast(self._num_layers, tf.float32)) 75 | 76 | def build_subnetwork_train_op(self, subnetwork, loss, var_list, labels, 77 | iteration_step, summary, previous_ensemble): 78 | """See `adanet.subnetwork.Builder`.""" 79 | return self._optimizer.minimize(loss=loss, var_list=var_list) 80 | 81 | def build_mixture_weights_train_op(self, loss, var_list, logits, labels, 82 | iteration_step, summary): 83 | """See `adanet.subnetwork.Builder`.""" 84 | 85 | if not self._learn_mixture_weights: 86 | return tf.no_op() 87 | return self._optimizer.minimize(loss=loss, var_list=var_list) 88 | 89 | @property 90 | def name(self): 91 | """See `adanet.subnetwork.Builder`.""" 92 | 93 | if self._num_layers == 0: 94 | # A DNN with no hidden layers is a linear model. 95 | return "linear" 96 | return "{}_layer_dnn".format(self._num_layers) 97 | 98 | 99 | class SimpleDNNGenerator(adanet.subnetwork.Generator): 100 | """Generates a two DNN subnetworks at each iteration. 101 | 102 | The first DNN has an identical shape to the most recently added subnetwork 103 | in `previous_ensemble`. The second has the same shape plus one more dense 104 | layer on top. This is similar to the adaptive network presented in Figure 2 of 105 | [Cortes et al. ICML 2017](https://arxiv.org/abs/1607.01097), without the 106 | connections to hidden layers of networks from previous iterations. 107 | """ 108 | 109 | def __init__(self, optimizers, feature_columns, layer_size, learn_mixture_weights, seed): 110 | """Initializes a DNN `Generator`. 111 | 112 | Args: 113 | optimizers: A defaultdict of string for training both the subnetwork and 114 | the mixture weights. 115 | layer_size: Number of nodes in each hidden layer of the subnetwork 116 | candidates. Note that this parameter is ignored in a DNN with no hidden 117 | layers. 118 | learn_mixture_weights: Whether to solve a learning problem to find the 119 | best mixture weights, or use their default value according to the 120 | mixture weight type. When `False`, the subnetworks will return a no_op 121 | for the mixture weight train op. 122 | seed: A random seed. 123 | 124 | Returns: 125 | An instance of `Generator`. 
126 | """ 127 | 128 | self._seed = seed 129 | self._optimizers = optimizers 130 | self._dnn_builder_fn = functools.partial( 131 | _SimpleDNNBuilder, 132 | layer_size=layer_size, 133 | feature_columns=feature_columns, 134 | learn_mixture_weights=learn_mixture_weights) 135 | 136 | def generate_candidates(self, previous_ensemble, iteration_number, 137 | previous_ensemble_reports, all_reports): 138 | """See `adanet.subnetwork.Generator`.""" 139 | 140 | num_layers = 0 141 | seed = self._seed 142 | if previous_ensemble: 143 | num_layers = tf.get_static_value( 144 | previous_ensemble.weighted_subnetworks[ 145 | -1].subnetwork.persisted_tensors[_NUM_LAYERS_KEY]) 146 | if seed is not None: 147 | seed += iteration_number 148 | optimizer = self._optimizers[num_layers + 0] 149 | return [self._dnn_builder_fn(num_layers=num_layers, optimizer=optimizer, seed=seed), 150 | self._dnn_builder_fn(num_layers=num_layers + 1, optimizer=optimizer, seed=seed)] 151 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/tests/__init__.py -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import unittest 3 | import sys 4 | 5 | def train_input_fn(features, labels, batch_size=32): 6 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 7 | dataset = dataset.shuffle(1000).repeat().batch(batch_size) 8 | return dataset 9 | 10 | 11 | def eval_input_fn(features, labels, batch_size=32): 12 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 13 | dataset = dataset.batch(batch_size) 14 | return dataset 15 | 16 | class BaseTestCases: 17 | class BaseTest(unittest.TestCase): 18 | def setUp(self): 19 | self.model, self.features, self.label = None, {}, None 20 | 21 | def test_train_and_predict(self): 22 | self.setUp() 23 | model_pkg = sys.modules[self.model_class.__module__] 24 | self.model.compile(optimizer=model_pkg.optimizer(), 25 | loss=model_pkg.loss, 26 | metrics=["accuracy"]) 27 | self.history = self.model.fit(train_input_fn(self.features, self.label), 28 | epochs=10, 29 | steps_per_epoch=200, 30 | verbose=1) 31 | self.historyloss = self.history.history['loss'] 32 | loss_decline_rate = (self.historyloss[0] - self.historyloss[-1]) \ 33 | / self.historyloss[0] 34 | print('historyloss is {}, and the loss_decline_rate is {}'.\ 35 | format(self.historyloss, loss_decline_rate)) 36 | assert(loss_decline_rate > 0.3) 37 | 38 | class BaseEstimatorTest(BaseTest): 39 | def test_train_and_predict(self): 40 | self.setUp() 41 | input_fn = lambda: train_input_fn(self.features, self.label) 42 | train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1) 43 | eval_spec = tf.estimator.EvalSpec(input_fn=lambda: eval_input_fn(self.features, self.label)) 44 | baseline = tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec)[0] 45 | train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=2000) 46 | result = tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec)[0] 47 | loss_decline_rate = 1- result["loss"] / baseline["loss"] 48 | print('historyloss is {}, and the loss_decline_rate is {}'.\ 49 | format(baseline["loss"], loss_decline_rate)) 50 | assert(loss_decline_rate > 0.3) 51 | 
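The BaseTestCases.BaseTest harness above captures the contract the Keras-style models in this repository are expected to satisfy: the module defining the model exposes optimizer() and loss, the test compiles the model with them, fits it on train_input_fn, and asserts that the training loss falls by more than 30%. The sketch below replays that flow outside unittest using sqlflow_models.DNNClassifier on the iris data; the epoch and step counts are illustrative choices, not values required by the harness.

# Minimal sketch of the training contract exercised by tests/base.py.
# Assumes sqlflow_models and scikit-learn are importable; numbers are illustrative.
import sys

import tensorflow as tf
from sklearn.datasets import load_iris

import sqlflow_models

x, y = load_iris(return_X_y=True)
features = {'col_{}'.format(i): col for i, col in enumerate(x.T)}
feature_columns = [tf.feature_column.numeric_column(k) for k in features]

model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=3)
model_pkg = sys.modules[type(model).__module__]   # module providing optimizer() and loss

model.compile(optimizer=model_pkg.optimizer(), loss=model_pkg.loss, metrics=['accuracy'])

dataset = tf.data.Dataset.from_tensor_slices((features, y)).shuffle(1000).repeat().batch(32)
history = model.fit(dataset, epochs=10, steps_per_epoch=200, verbose=1)

losses = history.history['loss']
print('loss decline rate:', (losses[0] - losses[-1]) / losses[0])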
-------------------------------------------------------------------------------- /tests/test_arima_with_stl_decomposition.py: -------------------------------------------------------------------------------- 1 | from sqlflow_models import ARIMAWithSTLDecomposition 2 | import unittest 3 | import tensorflow as tf 4 | from datetime import datetime, timedelta 5 | import numpy as np 6 | 7 | class TestARIMAWithSTLDecompose(unittest.TestCase): 8 | def setUp(self): 9 | self.order = [7, 0, 2] 10 | self.period = [7, 30] 11 | self.date_format = '%Y-%m-%d' 12 | self.train_start = '2014-04-01' 13 | self.train_end = '2014-08-31' 14 | self.forecast_start = '2014-09-01' 15 | self.forecast_end = '2014-09-30' 16 | 17 | def str2datetime(self, date_str): 18 | if isinstance(date_str, bytes): 19 | date_str = date_str.decode('utf-8') 20 | return datetime.strptime(str(date_str), self.date_format) 21 | 22 | def datetime2str(self, date): 23 | return datetime.strftime(date, self.date_format) 24 | 25 | def create_dataset(self): 26 | def generator(): 27 | start_date = self.str2datetime(self.train_start) 28 | end_date = self.str2datetime(self.train_end) 29 | delta = timedelta(days=1) 30 | while start_date <= end_date: 31 | date_str = np.array(self.datetime2str(start_date)) 32 | label = np.random.random(size=[1]) * 1e8 33 | yield date_str, label 34 | start_date += delta 35 | 36 | def dict_mapper(date_str, label): 37 | return {'time': date_str}, label 38 | 39 | dataset = tf.data.Dataset.from_generator( 40 | generator, output_types=(tf.dtypes.string, tf.dtypes.float32) 41 | ) 42 | dataset = dataset.map(dict_mapper) 43 | return dataset 44 | 45 | def prediction_days(self): 46 | pred_start = self.str2datetime(self.forecast_start) 47 | pred_end = self.str2datetime(self.forecast_end) 48 | return (pred_end - pred_start).days + 1 49 | 50 | def test_main(self): 51 | model = ARIMAWithSTLDecomposition(order=[7, 0, 2], 52 | period=[7, 30], 53 | date_format=self.date_format, 54 | forecast_start=self.forecast_start, 55 | forecast_end=self.forecast_end) 56 | prediction = model.sqlflow_train_loop(self.create_dataset()) 57 | self.assertEqual(len(prediction), self.prediction_days()) 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/test_auto_estimator.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases, train_input_fn, eval_input_fn 3 | 4 | import sys 5 | import tensorflow as tf 6 | import unittest 7 | import numpy as np 8 | from sklearn.datasets import load_iris, load_boston 9 | 10 | class TestAutoClassifier(BaseTestCases.BaseEstimatorTest): 11 | def setUp(self): 12 | x, y = load_iris(return_X_y=True) 13 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 14 | self.features = {} 15 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 16 | self.features[feature_name] = feature_values 17 | self.label = y 18 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 19 | 20 | self.model_class = sqlflow_models.AutoClassifier 21 | self.model = sqlflow_models.AutoClassifier(feature_columns=feature_columns, n_classes=3) 22 | 23 | class TestAutoBinaryClassifier(BaseTestCases.BaseEstimatorTest): 24 | def setUp(self): 25 | x, y = load_iris(return_X_y=True) 26 | x = np.array([x[i] for i, v in enumerate(y) if v != 2]) 27 | y = np.array([y[i] for i, v in enumerate(y) if v != 2]) 
28 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 29 | self.features = {} 30 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 31 | self.features[feature_name] = feature_values 32 | self.label = y 33 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 34 | 35 | self.model_class = sqlflow_models.AutoClassifier 36 | self.model = sqlflow_models.AutoClassifier(feature_columns=feature_columns) 37 | 38 | class TestAutoRegressor(BaseTestCases.BaseEstimatorTest): 39 | def setUp(self): 40 | x, y = load_boston(return_X_y=True) 41 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 42 | self.features = {} 43 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 44 | self.features[feature_name] = feature_values 45 | self.label = y 46 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 47 | self.model_class = sqlflow_models.AutoRegressor 48 | self.model = sqlflow_models.AutoRegressor(feature_columns=feature_columns) 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | 53 | -------------------------------------------------------------------------------- /tests/test_deep_embedding_cluster.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.keras.losses import kld 2 | 3 | import sqlflow_models 4 | from tests.base import BaseTestCases, eval_input_fn 5 | 6 | import tensorflow as tf 7 | import unittest 8 | from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score 9 | from sklearn.utils.linear_assignment_ import linear_assignment 10 | import numpy as np 11 | from tensorflow.python import keras 12 | import sys 13 | 14 | 15 | def train_input_fn(features, batch_size=32): 16 | dataset = tf.data.Dataset.from_tensor_slices(dict(features)) 17 | dataset = dataset.shuffle(1000).repeat(1).batch(batch_size) 18 | return dataset 19 | 20 | ari = adjusted_rand_score 21 | nmi = normalized_mutual_info_score 22 | 23 | 24 | def acc(y_true, y_pred): 25 | """ 26 | Calculate clustering accuracy. 27 | Using the Hungarian algorithm to solve linear assignment problem. 28 | """ 29 | y_true = y_true.astype(np.int64) 30 | assert y_pred.size == y_true.size 31 | dims = max(y_pred.max(), y_true.max()) + 1 32 | w = np.zeros((dims, dims), dtype=np.int64) 33 | for i in range(y_pred.size): 34 | w[y_pred[i], y_true[i]] += 1 35 | 36 | ind = linear_assignment(w.max() - w) 37 | return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size 38 | 39 | 40 | def evaluate(x, y, model): 41 | metric = dict() 42 | q = model.predict(x) 43 | y_pred = q.argmax(1) 44 | metric['acc'] = np.round(acc(y, y_pred), 5) 45 | metric['nmi'] = np.round(nmi(y, y_pred), 5) 46 | metric['ari'] = np.round(ari(y, y_pred), 5) 47 | return metric 48 | 49 | 50 | class TestDeepEmbeddingCluster(BaseTestCases.BaseTest): 51 | def setUp(self): 52 | (train_data, train_labels), (test_data, test_labels) = keras.datasets.mnist.load_data() 53 | x = np.concatenate((train_data, test_data)) 54 | y = np.concatenate((train_labels, test_labels)) 55 | x = x.reshape((x.shape[0], -1)) 56 | x = np.divide(x, 255.) 
57 | # Sample 58 | x = x[:100] 59 | y = y[:100] 60 | # Generate Data 61 | feature_num = x.shape[1] 62 | feature_column_names = ['col_{}'.format(d) for d in range(feature_num)] 63 | 64 | self.features = {} 65 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 66 | self.features[feature_name] = feature_values 67 | 68 | self.label = y 69 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 70 | pretrain_dims = [500, 500, 2000, 10] 71 | # Init model 72 | self.model = sqlflow_models.DeepEmbeddingClusterModel(feature_columns=feature_columns, 73 | n_clusters=10, 74 | kmeans_init=20, 75 | run_pretrain=True, 76 | existed_pretrain_model=None, 77 | pretrain_dims=pretrain_dims, 78 | pretrain_activation_func='relu', 79 | pretrain_use_callbacks=True, 80 | pretrain_cbearlystop_patience=10, 81 | pretrain_cbearlystop_mindelta=0.0001, 82 | pretrain_cbreduce_patience=5, 83 | pretrain_cbreduce_factor=0.2, 84 | pretrain_epochs=20, 85 | pretrain_initializer='glorot_uniform', 86 | train_max_iters=500, 87 | update_interval=100, 88 | train_use_tol=True, 89 | tol=0.0001, 90 | loss=kld) 91 | self.model_class = sqlflow_models.DeepEmbeddingClusterModel 92 | 93 | def test_train_and_predict(self): 94 | self.setUp() 95 | model_pkg = sys.modules[self.model_class.__module__] 96 | self.model.compile(optimizer=model_pkg.optimizer(), 97 | loss=model_pkg.loss) 98 | self.model.sqlflow_train_loop(train_input_fn(self.features)) 99 | metric = evaluate(x=eval_input_fn(self.features, self.label), y=self.label, model=self.model) 100 | print(metric) 101 | assert (metric['acc'] > 0) 102 | 103 | 104 | if __name__ == '__main__': 105 | unittest.main() 106 | -------------------------------------------------------------------------------- /tests/test_dnnclassifier.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | import numpy as np 7 | from sklearn.datasets import load_iris 8 | 9 | class TestDNNClassifier(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | x, y = load_iris(return_X_y=True) 12 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 13 | self.features = {} 14 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 15 | self.features[feature_name] = feature_values 16 | self.label = y 17 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 18 | 19 | self.model_class = sqlflow_models.DNNClassifier 20 | self.model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=3) 21 | 22 | class TestDNNBinaryClassifier(BaseTestCases.BaseTest): 23 | def setUp(self): 24 | x, y = load_iris(return_X_y=True) 25 | x = np.array([x[i] for i, v in enumerate(y) if v != 2]) 26 | y = np.array([y[i] for i, v in enumerate(y) if v != 2]) 27 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 28 | self.features = {} 29 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 30 | self.features[feature_name] = feature_values 31 | self.label = y 32 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 33 | 34 | self.model_class = sqlflow_models.DNNClassifier 35 | self.model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=2) 36 | 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | 42 | 
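The acc helper in tests/test_deep_embedding_cluster.py above scores a clustering by building a cluster-versus-class confusion matrix and solving a linear assignment problem to find the best cluster-to-label mapping. It imports linear_assignment from sklearn.utils.linear_assignment_, a module that newer scikit-learn releases have removed; the sketch below computes the same quantity with scipy.optimize.linear_sum_assignment instead. This is an alternative sketch, not code from the repository.

# Clustering accuracy via the Hungarian algorithm, using SciPy in place of the
# removed sklearn.utils.linear_assignment_ module. Sketch only.
import numpy as np
from scipy.optimize import linear_sum_assignment


def clustering_accuracy(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.int64)
    y_pred = np.asarray(y_pred, dtype=np.int64)
    dims = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((dims, dims), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1                       # confusion matrix: cluster vs. class
    row_ind, col_ind = linear_sum_assignment(w.max() - w)  # maximize matched counts
    return w[row_ind, col_ind].sum() / y_pred.size


# Clusters {0, 1} happen to map onto classes {1, 0}, so accuracy is 1.0:
print(clustering_accuracy([1, 1, 0, 0], [0, 0, 1, 1]))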
-------------------------------------------------------------------------------- /tests/test_dnnclassifier_functional_api_example.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | 7 | from sklearn.datasets import load_iris 8 | 9 | 10 | def train_input_fn(features, labels, batch_size=32): 11 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 12 | dataset = dataset.shuffle(1000).repeat().batch(batch_size) 13 | return dataset 14 | 15 | 16 | def eval_input_fn(features, labels, batch_size=32): 17 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 18 | dataset = dataset.batch(batch_size) 19 | return dataset 20 | 21 | class TestDNNClassifier(BaseTestCases.BaseTest): 22 | def setUp(self): 23 | x, y = load_iris(return_X_y=True) 24 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 25 | self.features = {} 26 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 27 | self.features[feature_name] = feature_values 28 | self.label = y 29 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 30 | fieldmetas = { 31 | "col_0": {"feature_name": "col_0", "shape": [1], "dtype": tf.float32}, 32 | "col_1": {"feature_name": "col_1", "shape": [1], "dtype": tf.float32}, 33 | "col_2": {"feature_name": "col_2", "shape": [1], "dtype": tf.float32}, 34 | "col_3": {"feature_name": "col_3", "shape": [1], "dtype": tf.float32}, 35 | } 36 | self.model = sqlflow_models.dnnclassifier_functional_model(feature_columns=feature_columns, field_metas=fieldmetas, n_classes=3) 37 | self.model_class = sqlflow_models.dnnclassifier_functional_model 38 | 39 | 40 | if __name__ == '__main__': 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /tests/test_dnnregressor.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | from sklearn.datasets import load_boston 7 | 8 | 9 | class TestDNNRegressor(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | x, y = load_boston(return_X_y=True) 12 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 13 | self.features = {} 14 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 15 | self.features[feature_name] = feature_values 16 | self.label = y 17 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 18 | self.model_class = sqlflow_models.DNNRegressor 19 | self.model = sqlflow_models.DNNRegressor(feature_columns=feature_columns) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | 25 | -------------------------------------------------------------------------------- /tests/test_gcn.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import unittest 7 | import random 8 | 9 | 10 | def build_karate_club_graph(): 11 | # All 78 edges are stored in two numpy arrays. One for source endpoints 12 | # while the other for destination endpoints. 
13 | # Credit to: https://docs.dgl.ai/tutorials/basics/1_first.html 14 | src = np.array([1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 10, 10, 15 | 10, 11, 12, 12, 13, 13, 13, 13, 16, 16, 17, 17, 19, 19, 21, 21, 16 | 25, 25, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32, 17 | 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 18 | 33, 33, 33, 33, 33, 33, 33, 33, 33, 33]) 19 | dst = np.array([0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4, 20 | 5, 0, 0, 3, 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1, 23, 24, 2, 23, 21 | 24, 2, 23, 26, 1, 8, 0, 24, 25, 28, 2, 8, 14, 15, 18, 20, 22, 23, 22 | 29, 30, 31, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 23 | 31, 32]) 24 | u = np.concatenate([src, dst]) 25 | v = np.concatenate([dst, src]) 26 | u = np.expand_dims(u, axis=1) 27 | v = np.expand_dims(v, axis=1) 28 | return np.concatenate([u,v], 1) 29 | 30 | def acc(y, label): 31 | '''Function to calculate the accuracy.''' 32 | ll = tf.equal(tf.argmax(label, -1), tf.argmax(y, -1)) 33 | accuarcy = tf.reduce_mean(tf.cast(ll, dtype=tf.float32)) 34 | return accuarcy 35 | 36 | def evaluate(x, y, model): 37 | '''Function to evaluate the performance of model.''' 38 | metric = dict() 39 | y_pred = model.predict(x) 40 | metric['acc'] = np.round(acc(y, y_pred), 5) 41 | return metric 42 | 43 | class TestGCN(BaseTestCases.BaseTest): 44 | def setUp(self): 45 | feature = [[0,1,2]+random.sample(range(3, 20), 8), 46 | [0,1,2]+random.sample(range(18, 40),8), 47 | [0,1,2]+random.sample(range(38, 60),8), 48 | [0,1,2]+random.sample(range(58, 80),8)] 49 | label = ['Shotokan', 'Gōjū-ryū', 'Wadō-ryū', 'Shitō-ryū'] 50 | nodes = np.array(list(range(34))) 51 | edges = build_karate_club_graph() 52 | features, labels = list(), list() 53 | for i in range(34): 54 | idx = random.randint(0,3) 55 | features.append(np.eye(81)[feature[idx]].sum(0)) 56 | labels.append(label[idx]) 57 | self.inputs = [dict() for i in range(len(edges)*2)] 58 | self.labels = list() 59 | for i in range(len(edges)): 60 | self.inputs[i]['id'] = tf.convert_to_tensor(edges[i][0]) 61 | self.inputs[i]['features'] = tf.convert_to_tensor(features[edges[i][0]]) 62 | self.inputs[i]['from_node_id'] = tf.convert_to_tensor(edges[i][0]) 63 | self.inputs[i]['to_node_id'] = tf.convert_to_tensor(edges[i][1]) 64 | self.labels.append(tf.convert_to_tensor([labels[edges[i][0]]])) 65 | for i in range(len(edges)): 66 | self.inputs[i+len(edges)]['id'] = tf.convert_to_tensor(edges[i][1]) 67 | self.inputs[i+len(edges)]['features'] = tf.convert_to_tensor(features[edges[i][1]]) 68 | self.inputs[i+len(edges)]['from_node_id'] = tf.convert_to_tensor(edges[i][0]) 69 | self.inputs[i+len(edges)]['to_node_id'] = tf.convert_to_tensor(edges[i][1]) 70 | self.labels.append(tf.convert_to_tensor([labels[edges[i][1]]])) 71 | self.model = sqlflow_models.GCN(nhid=16, nclass=4, epochs=20, train_ratio=0.2, eval_ratio=0.15) 72 | self.model_class = sqlflow_models.GCN 73 | 74 | def test_train_and_predict(self): 75 | self.setUp() 76 | self.model.compile(optimizer=optimizer(), 77 | loss='categorical_crossentropy') 78 | self.model.sqlflow_train_loop(zip(self.inputs, self.labels)) 79 | metric = evaluate([self.model.features, self.model.adjacency], self.model.labels, self.model) 80 | assert (metric['acc'] > 0) 81 | 82 | def optimizer(): 83 | return tf.keras.optimizers.Adam(lr=0.01) 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | 88 | 89 | -------------------------------------------------------------------------------- /tests/test_one_class_svm.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2020 The SQLFlow Authors. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | import shutil 16 | import tempfile 17 | import unittest 18 | 19 | import numpy as np 20 | import tensorflow as tf 21 | from sqlflow_models import OneClassSVM 22 | from sqlflow_models.one_class_svm import dataset_reader 23 | 24 | 25 | class TestOneClassSVM(unittest.TestCase): 26 | def setUp(self): 27 | self.tmp_dir = tempfile.mkdtemp() 28 | self.old_cwd = os.getcwd() 29 | os.chdir(self.tmp_dir) 30 | 31 | def tearDown(self): 32 | os.chdir(self.old_cwd) 33 | shutil.rmtree(self.tmp_dir) 34 | 35 | def create_dataset(self): 36 | def generator(): 37 | for _ in range(10): 38 | x1 = np.random.random(size=[1, 1]) 39 | x2 = np.random.random(size=[1, 1]) 40 | yield x1, x2 41 | 42 | def dict_mapper(x1, x2): 43 | return {"x1": x1, "x2": x2} 44 | 45 | dataset = tf.data.Dataset.from_generator( 46 | generator, output_types=(tf.dtypes.float32, tf.dtypes.float32)) 47 | return dataset.map(dict_mapper) 48 | 49 | def test_main(self): 50 | svm = OneClassSVM() 51 | train_dataset = self.create_dataset() 52 | svm.sqlflow_train_loop(train_dataset) 53 | 54 | predict_dataset = self.create_dataset() 55 | for features in dataset_reader(predict_dataset): 56 | pred = svm.sqlflow_predict_one(features)[0] 57 | pred = np.array(pred) 58 | self.assertEqual(pred.shape, (1,)) 59 | self.assertTrue(pred[0] == 1 or pred[0] == -1) 60 | 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import unittest 7 | 8 | 9 | class TestStackedRNNClassifier(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 12 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 13 | fea = tf.feature_column.sequence_categorical_column_with_identity( 14 | key="c1", 15 | num_buckets=800 16 | ) 17 | 18 | emb = tf.feature_column.embedding_column( 19 | fea, 20 | dimension=32) 21 | feature_columns = [emb] 22 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='rnn') 23 | self.model_class = sqlflow_models.StackedRNNClassifier 24 | 25 | class TestStackedBiRNNClassifier(BaseTestCases.BaseTest): 26 | def setUp(self): 27 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 28 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 29 | fea = tf.feature_column.sequence_categorical_column_with_identity( 30 | key="c1", 31 | num_buckets=800 32 | ) 33 | 34 | emb = tf.feature_column.embedding_column( 35 | fea, 36 | dimension=32) 37 | 
feature_columns = [emb] 38 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='rnn', bidirectional=True) 39 | self.model_class = sqlflow_models.StackedRNNClassifier 40 | 41 | class TestStackedLSTMClassifier(BaseTestCases.BaseTest): 42 | def setUp(self): 43 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 44 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 45 | fea = tf.feature_column.sequence_categorical_column_with_identity( 46 | key="c1", 47 | num_buckets=800 48 | ) 49 | 50 | emb = tf.feature_column.embedding_column( 51 | fea, 52 | dimension=32) 53 | feature_columns = [emb] 54 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='lstm') 55 | self.model_class = sqlflow_models.StackedRNNClassifier 56 | 57 | class TestStackedBiLSTMClassifier(BaseTestCases.BaseTest): 58 | def setUp(self): 59 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 60 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 61 | fea = tf.feature_column.sequence_categorical_column_with_identity( 62 | key="c1", 63 | num_buckets=800 64 | ) 65 | 66 | emb = tf.feature_column.embedding_column( 67 | fea, 68 | dimension=32) 69 | feature_columns = [emb] 70 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='lstm', bidirectional=True) 71 | self.model_class = sqlflow_models.StackedRNNClassifier 72 | 73 | class TestStackedGRUClassifier(BaseTestCases.BaseTest): 74 | def setUp(self): 75 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 76 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 77 | fea = tf.feature_column.sequence_categorical_column_with_identity( 78 | key="c1", 79 | num_buckets=800 80 | ) 81 | 82 | emb = tf.feature_column.embedding_column( 83 | fea, 84 | dimension=32) 85 | feature_columns = [emb] 86 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='gru') 87 | self.model_class = sqlflow_models.StackedRNNClassifier 88 | 89 | class TestStackedBiGRUClassifier(BaseTestCases.BaseTest): 90 | def setUp(self): 91 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 92 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 93 | fea = tf.feature_column.sequence_categorical_column_with_identity( 94 | key="c1", 95 | num_buckets=800 96 | ) 97 | 98 | emb = tf.feature_column.embedding_column( 99 | fea, 100 | dimension=32) 101 | feature_columns = [emb] 102 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='gru', bidirectional=True) 103 | self.model_class = sqlflow_models.StackedRNNClassifier 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | 108 | 109 | -------------------------------------------------------------------------------- /tests/test_rnnts.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | np.random.seed(22) 7 | import unittest 8 | 9 | 10 | class TestRNNBasedTimeSeriesModel(BaseTestCases.BaseTest): 11 | def setUp(self): 12 | # We use sin data plus perturbation to simulate time series data 13 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 
14 | x = np.array(time_series_data).reshape(8, 7) 15 | y = np.array(np.arange(8).reshape(8, 1)) 16 | self.features = {"col1": x} 17 | self.label = y 18 | self.n_in = 7 19 | self.n_out = 1 20 | # time_window=n_in, num_features=n_out 21 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 22 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 23 | feature_columns=feature_columns, 24 | stack_units=[50, 50], 25 | n_in=self.n_in, 26 | n_out=self.n_out, 27 | model_type='rnn') 28 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 29 | 30 | class TestLSTMBasedTimeSeriesModel(BaseTestCases.BaseTest): 31 | def setUp(self): 32 | # We use sin data plus perturbation to simulate time series data 33 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 34 | x = np.array(time_series_data).reshape(8, 7) 35 | y = np.array(np.arange(8).reshape(8, 1)) 36 | self.features = {"col1": x} 37 | self.label = y 38 | self.n_in = 7 39 | self.n_out = 1 40 | # time_window=n_in, num_features=n_out 41 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 42 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 43 | feature_columns=feature_columns, 44 | stack_units=[50, 50], 45 | n_in=self.n_in, 46 | n_out=self.n_out, 47 | model_type='lstm') 48 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 49 | 50 | class TestGRUBasedTimeSeriesModel(BaseTestCases.BaseTest): 51 | def setUp(self): 52 | # We use sin data plus perturbation to simulate time series data 53 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 54 | x = np.array(time_series_data).reshape(8, 7) 55 | y = np.array(np.arange(8).reshape(8, 1)) 56 | self.features = {"col1": x} 57 | self.label = y 58 | self.n_in = 7 59 | self.n_out = 1 60 | # time_window=n_in, num_features=n_out 61 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 62 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 63 | feature_columns=feature_columns, 64 | stack_units=[50, 50], 65 | n_in=self.n_in, 66 | n_out=self.n_out, 67 | model_type='gru') 68 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | 74 | -------------------------------------------------------------------------------- /tests/test_score_card.py: -------------------------------------------------------------------------------- 1 | from sqlflow_models import ScoreCard 2 | import unittest 3 | import tensorflow as tf 4 | from datetime import datetime, timedelta 5 | import numpy as np 6 | 7 | 8 | class TestScoreCard(unittest.TestCase): 9 | def create_dataset(self): 10 | samples = 20 11 | f = [np.random.randint(20, size=1) for i in range(samples)] 12 | label = [np.random.randint(2, size=1) for i in range(samples)] 13 | 14 | def generator(): 15 | for i, item in enumerate(f): 16 | yield [f[i]], label[i] 17 | 18 | def dict_mapper(feature, label): 19 | return {'f1': feature}, label 20 | 21 | dataset = tf.data.Dataset.from_generator( 22 | generator, output_types=(tf.dtypes.float32, tf.dtypes.float32) 23 | ) 24 | dataset = dataset.map(dict_mapper) 25 | return dataset 26 | 27 | def test_train(self): 28 | dataset = self.create_dataset() 29 | m = ScoreCard(pf_bin_size=2) 30 | m.sqlflow_train_loop(dataset) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | 
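tests/test_score_card.py only checks that sqlflow_train_loop runs; the score points it ultimately prints come from the arithmetic in sqlflow_models/score_card.py: with a target score of 600, factor = 20 / ln(2) (the conventional points-to-double-the-odds reading of that constant) and offset = 600 - factor * ln(20), each bin contributes round(coef * WOE * factor + offset / pf_bin_size). The snippet below merely replays that arithmetic with a made-up coefficient and WOE, so the resulting number is illustrative only.

# Replaying the per-bin score-point arithmetic from ScoreCard.sqlflow_train_loop.
# The coefficient and WOE values are hypothetical.
import numpy as np

target_score = 600
pdo = 20                          # conventional "points to double the odds" reading of the 20
factor = pdo / np.log(2)
offset = target_score - factor * np.log(20)

pf_bin_size = 5                   # number of population-frequency bins
coef = 0.8                        # hypothetical logistic-regression coefficient
woe = -0.35                       # hypothetical weight of evidence for one bin

points = round(coef * woe * factor + offset / pf_bin_size, 0)
print(points)                     # this bin's contribution to the final score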
-------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | 3 | 4 | def test_answer(): 5 | assert sqlflow_models.__version__ == sqlflow_models._version.__version__ 6 | --------------------------------------------------------------------------------
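For reference, GCNLayer.call in sqlflow_models/gcn.py (shown near the top of this listing) implements the usual graph-convolution propagation: project the node features with the kernel, aggregate the result through the sparse adjacency matrix, and add the optional bias. The NumPy sketch below walks through that arithmetic densely on a hypothetical 3-node graph; the symmetric normalization of the adjacency is a common preprocessing choice assumed here, since the layer simply consumes whatever adjacency tensor it is handed.

# Dense walk-through of GCNLayer.call: support = X @ W, output = A_hat @ support + b.
# The graph, weights, and the normalization of A_hat are hypothetical.
import numpy as np

X = np.array([[1.0, 0.0],            # node features, shape (3 nodes, 2 features)
              [0.0, 1.0],
              [1.0, 1.0]])
A = np.array([[1, 1, 0],             # adjacency with self-loops
              [1, 1, 1],
              [0, 1, 1]], dtype=float)
d_inv_sqrt = np.diag(1.0 / np.sqrt(A.sum(axis=1)))
A_hat = d_inv_sqrt @ A @ d_inv_sqrt  # assumed symmetric normalization, not taken from gcn.py

W = np.array([[0.5, -0.2, 0.1],      # kernel, shape (2 features, 3 units)
              [0.3,  0.4, 0.0]])
b = np.zeros(3)

support = X @ W                      # tf.matmul(inputs, self.kernel)
output = A_hat @ support + b         # tf.sparse.sparse_dense_matmul(adj, support) + bias
print(output.shape)                  # (3, 3): one 3-unit output row per node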