├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── base_image └── Dockerfile ├── doc ├── contribute_models.md └── customized+model.md ├── runnables ├── Dockerfile ├── bin │ ├── __init__.py │ ├── binning_calculator.py │ └── psi_calculator.py ├── binning.py ├── extract_ts_features.py ├── psi.py ├── requirements.txt ├── run_io │ ├── __init__.py │ └── db_adapter.py ├── time_series_processing │ ├── __init__.py │ └── ts_feature_extractor.py └── two_dim_binning.py ├── scripts ├── data │ └── iris.recordio ├── elasticdl_travis_test_job.sh └── test_elasticdl_submit.sh ├── setup.cfg ├── setup.py ├── sqlflow_models ├── Dockerfile ├── __init__.py ├── _version.py ├── arima_with_stl_decomposition.py ├── auto_estimator.py ├── custom_model_example.py ├── deep_embedding_cluster.py ├── dnnclassifier.py ├── dnnclassifier_functional_api_example.py ├── dnnregressor.py ├── gcn.py ├── native_keras.py ├── one_class_svm.py ├── rnn_based_time_series.py ├── rnnclassifier.py ├── score_card.py └── simple_dnn_generator.py └── tests ├── __init__.py ├── base.py ├── test_arima_with_stl_decomposition.py ├── test_auto_estimator.py ├── test_deep_embedding_cluster.py ├── test_dnnclassifier.py ├── test_dnnclassifier_functional_api_example.py ├── test_dnnregressor.py ├── test_gcn.py ├── test_one_class_svm.py ├── test_rnn.py ├── test_rnnts.py ├── test_score_card.py └── test_version.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | build/ 3 | dist/ 4 | 5 | .eggs/ 6 | *.egg-info/ 7 | .pytest_cache 8 | __pycache__/ 9 | 10 | .idea/ 11 | 12 | *.swp 13 | *.vim 14 | *.pyc 15 | *.log 16 | 17 | .DS_Store 18 | 19 | .vscode -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | cache: pip 3 | branches: 4 | only: 5 | - master 6 | - develop 7 | - "/^v\\d+\\.\\d+(\\.\\d+)?(-\\S*)?$/" 8 | language: python 9 | python: 10 | - 3.6 11 | - 3.7 12 | service: 13 | - docker 14 | install: 15 | - python -m pip install --upgrade pip 16 | - python -m pip install --upgrade setuptools>=41.0.0 17 | - python setup.py install 18 | script: 19 | - python setup.py -q test 20 | 21 | jobs: 22 | include: 23 | - stage: ElasticDLTest 24 | script: 25 | - cd base_image && docker build -t sqlflow/modelzoo_base . && cd .. 26 | - cd sqlflow_models && docker build -t sqlflow/sqlflow_models . && cd .. 27 | - bash scripts/elasticdl_travis_test_job.sh 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | 3 | setup: ## Setup virtual environment for local development 4 | python3 -m venv venv 5 | source venv/bin/activate && \ 6 | pip install -U pip && \ 7 | $(MAKE) install-requirements 8 | 9 | install-requirements: 10 | pip install -U -e . 11 | 12 | test: ## Run tests 13 | python3 setup.py test 14 | 15 | clean: ## Clean up temporary folders 16 | rm -rf build dist .eggs *.egg-info .pytest_cache sqlflow/proto 17 | 18 | help: 19 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 20 | 21 | .PHONY: help 22 | .DEFAULT_GOAL := help 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQLFlow Models 2 | 3 | [![Build Status](https://travis-ci.com/sql-machine-learning/models.svg?branch=develop)](https://travis-ci.org/sql-machine-learning) [![PyPI Package](https://img.shields.io/pypi/v/sqlflow_models.svg)](https://pypi.python.org/pypi/sqlflow_models) 4 | 5 | Premade Models for [SQLFlow](https://github.com/sql-machine-learning/sqlflow). 6 | 7 | ## Installation 8 | 9 | This package is available on PyPI as `sqlflow_models`. So you can install it by running the following command: 10 | 11 | ```bash 12 | pip install sqlflow_models 13 | ``` 14 | 15 | ## Development 16 | 17 | ## Prerequisite 18 | ### Python 3 19 | `brew install python` 20 | 21 | ### Setup Environment 22 | `make setup` 23 | 24 | ### Test 25 | `make test` 26 | -------------------------------------------------------------------------------- /base_image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # install PAI python support 4 | RUN pip install pypai 5 | 6 | # install go needed by installing ElasticDL 7 | ENV GOPATH /root/go 8 | ENV PATH /usr/local/go/bin:$GOPATH/bin:$PATH 9 | RUN curl --silent https://dl.google.com/go/go1.13.4.linux-amd64.tar.gz | tar -C /usr/local -xzf - 10 | 11 | # install ElasticDL to manage ElasticDL jobs 12 | RUN git clone https://github.com/sql-machine-learning/elasticdl.git && \ 13 | cd elasticdl && \ 14 | git checkout 62b255a918df5b6594c888b19aebbcc74bbce6e4 && \ 15 | pip install -r elasticdl/requirements.txt && \ 16 | python setup.py install && \ 17 | cd .. && rm -rf elasticdl 18 | -------------------------------------------------------------------------------- /doc/contribute_models.md: -------------------------------------------------------------------------------- 1 | # How to Contribute SQLFLow Models 2 | 3 | This guide will introduce how to contribute to SQLFlow models. 
You can find the design doc [Define SQLFlow Models](/doc/customized+model.md); feel free to check it out. 4 | 5 | ## Develop an SQLFlow Model 6 | 7 | 1. Open the [SQLFlow models repo](https://github.com/sql-machine-learning/models) in your web browser and fork the official repo to your account. 8 | 9 | 1. Clone the forked repo to your host: 10 | 11 | ``` bash 12 | > git clone https://github.com/<Your Github ID>/models.git 13 | ``` 14 | 15 | 1. Set up your local Python environment with `make setup && source venv/bin/activate`. If you are using [PyCharm](https://www.jetbrains.com/pycharm/), you can simply run `make setup` and then import the `models` folder as a new project. 16 | 17 | 1. Add a new model definition Python script under the folder [sqlflow_models](/sqlflow_models). For example, add a new Python script `mydnnclassifier.py`: 18 | 19 | ``` text 20 | `-sqlflow_models 21 | |- dnnclassifier.py 22 | `- mydnnclassifier.py 23 | ``` 24 | 25 | 1. You can choose whatever name you like for your model. Your model definition should be a [Keras subclassed model](https://keras.io/models/about-keras-models/#model-subclassing): 26 | 27 | ``` python 28 | import tensorflow as tf 29 | 30 | class MyDNNClassifier(tf.keras.Model): 31 | def __init__(self, feature_columns, hidden_units=[10,10], n_classes=2): 32 | ... 33 | ... 34 | ``` 35 | 36 | 1. Import `MyDNNClassifier` in [sqlflow_models/\_\_init__.py](/sqlflow_models/__init__.py): 37 | 38 | ``` python 39 | ... 40 | from .mydnnclassifier import MyDNNClassifier 41 | ``` 42 | 43 | 1. You can test your `MyDNNClassifier` by adding a new Python unit test script `tests/test_mydnnclassifier.py` and running the test as `python tests/test_mydnnclassifier.py`: 44 | 45 | ``` python 46 | from sqlflow_models import MyDNNClassifier 47 | from tests.base import BaseTestCases 48 | 49 | import tensorflow as tf 50 | import unittest 51 | 52 | 53 | class TestMyDNNClassifier(BaseTestCases.BaseTest): 54 | def setUp(self): 55 | self.features = {...} 56 | self.label = [...] 57 | feature_columns = [...] 58 | self.model = MyDNNClassifier(feature_columns=feature_columns) 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | ``` 63 | 64 | ## Test and Debug Your Model With SQLFlow 65 | 66 | If you have developed a new model, please run an integration test with the SQLFlow gRPC server to make sure it works well with SQLFlow. 67 | 68 | 1. Launch an SQLFlow all-in-one Docker container: 69 | 70 | ``` bash 71 | cd ./models 72 | > docker run --rm -it -v $PWD:/models -e PYTHONPATH=/models -p 8888:8888 sqlflow/sqlflow 73 | ``` 74 | 75 | 1. Open a web browser and go to `localhost:8888` to access the Jupyter Notebook. Use your custom model by modifying the `TRAIN` clause of the SQLFlow extended SQL statement: `TRAIN sqlflow_models.MyDNNClassifier`: 76 | 77 | ``` sql 78 | SELECT * from iris.train 79 | TRAIN sqlflow_models.MyDNNClassifier 80 | WITH n_classes = 3, hidden_units = [10, 20] 81 | COLUMN sepal_length, sepal_width, petal_length, petal_width 82 | LABEL class 83 | INTO sqlflow_models.my_dnn_model; 84 | ``` 85 | 86 | 1. When you need to update the model and test again, just modify the model Python file on your host and then run the SQL statement in the notebook one more time. 87 | 88 | ## Publish your model in the SQLFlow all-in-one Docker image 89 | 90 | If you have already tested your code, please create a pull request and invite other developers to review it. Once one of the developers **approves** your pull request, you can merge it into the develop branch. 
91 | Travis CI builds the SQLFlow all-in-one Docker image with the latest model code every night and pushes it to Docker Hub with the tag `sqlflow/sqlflow:nightly`, so you can find your latest models in it the next day. 92 | -------------------------------------------------------------------------------- /doc/customized+model.md: -------------------------------------------------------------------------------- 1 | # Design Doc: Define Models for SQLFlow 2 | 3 | SQLFlow enables SQL programs to call deep learning models defined in Python. This document is about how to define models for SQLFlow. 4 | 5 | ## Keras v.s. Estimator 6 | 7 | Many deep learning practitioners define models using the Keras API or as an Estimator-derived class. 8 | We prefer [Keras](https://keras.io/) over [Estimator](https://www.tensorflow.org/guide/estimators) for several reasons: 9 | 10 | 1. [TensorFlow Dev Summit 2019](https://www.youtube.com/watch?v=k5c-vg4rjBw) announced that TensorFlow 2.x will closely integrate with Keras. 11 | 12 | 2. We found more documentation about Keras than about Estimator. 13 | 14 | 3. We found more models defined using Keras than using Estimator. 15 | 16 | ## Keras APIs 17 | 18 | Keras provides three approaches to define models. 19 | 20 | ### 1. Subclassing `tf.keras.Model` 21 | 22 | ```python 23 | class DNNClassifier(tf.keras.Model): 24 | def __init__(self, feature_columns, hidden_units, n_classes): 25 | super(DNNClassifier, self).__init__() 26 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 27 | self.hidden_layers = [] 28 | for hidden_unit in hidden_units: 29 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit)) 30 | self.prediction_layer = tf.keras.layers.Dense(n_classes, activation='softmax') 31 | 32 | def call(self, inputs): 33 | x = self.feature_layer(inputs) 34 | for hidden_layer in self.hidden_layers: 35 | x = hidden_layer(x) 36 | return self.prediction_layer(x) 37 | 38 | model = DNNClassifier(feature_columns, hidden_units, n_classes) 39 | ``` 40 | 41 | Please be aware that `tf.keras.Model` has methods `save_weights` and `load_weights`, which save/load model parameters but not the topology, as explained in [this guidance](https://stackoverflow.com/questions/51806852/cant-save-custom-subclassed-model) and [this example list](https://stackoverflow.com/questions/52826134/keras-model-subclassing-examples). 42 | 43 | ### 2. Functional API 44 | 45 | ```python 46 | x = tf.feature_column.input_layer(shape=(5,)) 47 | for n in hidden_units: 48 | x = tf.keras.layers.Dense(n, activation='relu')(x) 49 | pred = tf.keras.layers.Dense(n_classes, activation='softmax')(x) 50 | model = tf.keras.models.Model(inputs=feature_columns, outputs=pred) 51 | ``` 52 | 53 | Please be aware that the functional API doesn't work with the feature column API, as reported [here](https://github.com/tensorflow/tensorflow/issues/27416). However, the approach of deriving classes from `keras.Model` does work with the feature column API. 54 | 55 | ### 3. `keras.Sequential` 56 | 57 | ```python 58 | model = tf.keras.Sequential() 59 | model.add(tf.keras.layers.DenseFeatures(feature_columns)) 60 | for n in hidden_units: 61 | model.add(tf.keras.layers.Dense(n, activation='relu')) 62 | model.add(tf.keras.layers.Dense(n_classes, activation='softmax')) 63 | ``` 64 | 65 | Please be aware that `tf.keras.Sequential()` only covers a small variety of models. It doesn't cover many well-known models, including ResNet, Transformer, and WideAndDeep. 
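
To make the save/load difference concrete, here is a minimal sketch (not part of this repo) that assumes the `DNNClassifier` subclass from approach 1 plus a couple of hypothetical numeric feature columns with random toy data. It shows that `save_weights`/`load_weights` persist parameters only, so the class has to be re-instantiated with the same arguments before the weights can be restored:

```python
import numpy as np
import tensorflow as tf

# Hypothetical feature columns and toy data, only to exercise the API.
feature_columns = [tf.feature_column.numeric_column(name)
                   for name in ['sepal_length', 'sepal_width']]
features = {'sepal_length': np.random.rand(8).astype('float32'),
            'sepal_width': np.random.rand(8).astype('float32')}
labels = np.random.randint(0, 3, size=8)
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(4)

model = DNNClassifier(feature_columns, hidden_units=[10, 10], n_classes=3)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(dataset, epochs=1, verbose=0)

# Only the parameters go to disk (TensorFlow checkpoint format); the topology
# lives in the Python class, so restoring needs a fresh instance first.
model.save_weights('./dnn_classifier_ckpt')
restored = DNNClassifier(feature_columns, hidden_units=[10, 10], n_classes=3)
restored.load_weights('./dnn_classifier_ckpt')
```

By contrast, functional and `Sequential` models can be serialized together with their topology via `model.save`.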
66 | 67 | ### The Choice 68 | 69 | We chose the approach of subclassing `tf.keras.Model` according to the following table. 70 | 71 | | Keras APIs | Work with feature column API | Save/load models | Model coverage | 72 | | ------------------ | ---------------------------- | -------------------------- | -------------- | 73 | | `tf.keras.Model` | ☑️ | weights-only, no topology | High | 74 | | Functional API | ❌ | ☑️ | High | 75 | | Sequential Model | ☑️ | ☑️ | Low | 76 | 77 | 78 | ## An Example 79 | 80 | Here is an example `DNNClassifier` with multiple hidden layers, written as a Python class derived from `tf.keras.Model`. To run it, please use TensorFlow 2.0 alpha or a newer version. 81 | 82 | ```python 83 | class DNNClassifier(tf.keras.Model): 84 | def __init__(self, feature_columns, hidden_units, n_classes): 85 | """DNNClassifier 86 | :param feature_columns: feature columns. 87 | :type feature_columns: list[tf.feature_column]. 88 | :param hidden_units: list of hidden units per layer. 89 | :type hidden_units: list[int]. 90 | :param n_classes: number of classes. 91 | :type n_classes: int. 92 | """ 93 | super(DNNClassifier, self).__init__() 94 | 95 | # combines all the input features into a dense tensor 96 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 97 | self.hidden_layers = [] 98 | for hidden_unit in hidden_units: 99 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit)) 100 | self.prediction_layer = tf.keras.layers.Dense(n_classes, activation='softmax') 101 | 102 | def call(self, inputs): 103 | x = self.feature_layer(inputs) 104 | for hidden_layer in self.hidden_layers: 105 | x = hidden_layer(x) 106 | return self.prediction_layer(x) 107 | 108 | def default_optimizer(self): 109 | """Default optimizer name. Used in model.compile.""" 110 | return 'adam' 111 | 112 | def default_loss(self): 113 | """Default loss function. Used in model.compile.""" 114 | return 'categorical_crossentropy' 115 | 116 | def default_training_epochs(self): 117 | """Default training epochs. Used in model.fit.""" 118 | return 5 119 | 120 | def prepare_prediction_column(self, prediction): 121 | """Return the class label of highest probability.""" 122 | return prediction.argmax(axis=-1) 123 | ``` 124 | 125 | ## Further Reading 126 | 127 | We read the following Keras source code files: [model.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/models.py), [network.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/engine/network.py), and [training.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/engine/training.py). 128 | -------------------------------------------------------------------------------- /runnables/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM sqlflow/sqlflow:step 2 | 3 | RUN apt-get clean && apt-get update && \ 4 | apt-get -qq install libmysqld-dev libmysqlclient-dev 5 | 6 | ADD ./requirements.txt / 7 | RUN pip3 install --no-cache-dir -r /requirements.txt && rm -rf /requirements.txt 8 | 9 | ADD . 
/opt/sqlflow/run 10 | ENV PYTHONPATH "${PYTHONPATH}:/opt/sqlflow/run" 11 | -------------------------------------------------------------------------------- /runnables/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/bin/__init__.py -------------------------------------------------------------------------------- /runnables/bin/binning_calculator.py: -------------------------------------------------------------------------------- 1 | import mars.dataframe as md 2 | import mars.tensor as mt 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | class BinningMethod(object): 8 | BUCKET = "bucket" 9 | QUANTILE = "quantile" 10 | LOG_BUCKET = "log_bucket" 11 | 12 | 13 | def binning( 14 | in_md, 15 | col_name, 16 | bin_method, 17 | bins, 18 | boundaries): 19 | if boundaries: 20 | bin_o, bins = md.cut(in_md[col_name], bins=boundaries, labels=False, retbins=True) 21 | bins_np = bins.to_numpy() 22 | else: 23 | if bin_method.lower() == BinningMethod.BUCKET.lower(): 24 | bin_o, bins = md.cut(in_md[col_name], bins=bins, labels=False, retbins=True) 25 | bins_np = bins.to_numpy() 26 | elif bin_method.lower() == BinningMethod.LOG_BUCKET.lower(): 27 | bin_o, bins = md.cut(mt.log(in_md[col_name]), bins=bins, labels=False, retbins=True) 28 | bins_np = np.exp(bins.to_numpy()) 29 | else: 30 | raise ValueError("Unsupport binning method: {}".format(bin_method)) 31 | 32 | return bin_o, bins_np 33 | 34 | 35 | def cumsum(arr, reverse): 36 | if type(arr) == np.ndarray: 37 | sum_arr = arr 38 | elif type(arr) == pd.DataFrame: 39 | sum_arr = arr.to_numpy() 40 | else: 41 | raise ValueError("Invalid input type: {}".format(type(arr))) 42 | 43 | for i in range(np.ndim(arr)): 44 | sum_arr = np.flip(np.cumsum(np.flip(sum_arr, i), i), i) if reverse else np.cumsum(sum_arr, i) 45 | 46 | if type(arr) == np.ndarray: 47 | return sum_arr 48 | elif type(arr) == pd.DataFrame: 49 | return pd.DataFrame(sum_arr) 50 | else: 51 | raise ValueError("Invalid input type: {}".format(type(arr))) 52 | 53 | 54 | def calc_binning_stats( 55 | in_md, 56 | sel_cols, 57 | bin_methods, 58 | bin_nums, 59 | cols_bin_boundaries, 60 | reverse_cumsum): 61 | cols_bin_stats = [] 62 | for i in range(len(sel_cols)): 63 | sel_col = sel_cols[i] 64 | bin_o, bins = binning(in_md, sel_col, bin_methods[i], bin_nums[i], cols_bin_boundaries.get(sel_col, None)) 65 | bin_num = len(bins) - 1 66 | bin_prob_df = bin_o.value_counts(normalize=True).to_pandas().to_frame() 67 | bin_prob_df = bin_prob_df.reindex(range(bin_num), fill_value=0) 68 | bin_cumsum_prob_df = cumsum(bin_prob_df, reverse_cumsum) 69 | 70 | cols_bin_stats.append( 71 | { 72 | "name": sel_col, 73 | "bin_boundaries": ','.join(bins.astype(str)), 74 | "bin_prob": ','.join(bin_prob_df[bin_prob_df.columns[0]].to_numpy().astype(str)), 75 | "bin_cumsum_prob": ','.join(bin_cumsum_prob_df[bin_cumsum_prob_df.columns[0]].to_numpy().astype(str)) 76 | } 77 | ) 78 | 79 | return pd.DataFrame(cols_bin_stats) 80 | 81 | 82 | def calc_basic_stats( 83 | in_md, 84 | sel_cols): 85 | stats_data = [ 86 | { 87 | "name": sel_col, 88 | "min": mt.min(in_md[sel_col]).to_numpy(), 89 | "max": mt.max(in_md[sel_col]).to_numpy(), 90 | "mean": mt.mean(in_md[sel_col]).to_numpy(), 91 | "median": mt.median(in_md[sel_col]).to_numpy(), 92 | "std": mt.std(in_md[sel_col]).to_numpy(), 93 | } for sel_col in sel_cols 94 | ] 95 | 96 | return pd.DataFrame(stats_data) 97 | 98 | 99 | def 
calc_stats( 100 | in_md, 101 | sel_cols, 102 | bin_methods, 103 | bin_nums, 104 | cols_bin_boundaries, 105 | reverse_cumsum): 106 | basic_stats_df = calc_basic_stats(in_md, sel_cols) 107 | cols_bin_stats_df = calc_binning_stats(in_md, sel_cols, bin_methods, bin_nums, cols_bin_boundaries, reverse_cumsum) 108 | 109 | stats_df = pd.merge(basic_stats_df, cols_bin_stats_df, how='inner', on='name') 110 | 111 | return stats_df 112 | 113 | 114 | def calc_two_dim_binning_stats( 115 | in_md, 116 | sel_col_1, 117 | sel_col_2, 118 | bin_method_1, 119 | bin_method_2, 120 | bin_num_1, 121 | bin_num_2, 122 | bin_boundaries_1, 123 | bin_boundaries_2, 124 | reverse_cumsum): 125 | bin_o1, bins_1 = binning(in_md, sel_col_1, bin_method_1, bin_num_1, bin_boundaries_1) 126 | bin_o2, bins_2 = binning(in_md, sel_col_2, bin_method_2, bin_num_2, bin_boundaries_2) 127 | 128 | bin_num_1 = len(bins_1) - 1 129 | bin_num_2 = len(bins_2) - 1 130 | 131 | bin_o = bin_o1 * bin_num_2 + bin_o2 132 | bin_prob_df = bin_o.value_counts(normalize=True).to_pandas().to_frame() 133 | bin_prob_df = bin_prob_df.reindex(range(bin_num_1 * bin_num_2), fill_value=0) 134 | two_dim_bin_prob_np = bin_prob_df.to_numpy().reshape((bin_num_1, bin_num_2)) 135 | two_dim_bin_cumsum_prob_np = cumsum(two_dim_bin_prob_np, reverse_cumsum) 136 | 137 | return pd.DataFrame(two_dim_bin_prob_np), pd.DataFrame(two_dim_bin_cumsum_prob_np) 138 | 139 | 140 | def get_cols_bin_boundaries(stats_df): 141 | col_boundaries = {} 142 | for _, row in stats_df.iterrows(): 143 | col_name = row['name'] 144 | boundaries = [float(item) for item in row['bin_boundaries'].split(',')] 145 | col_boundaries[col_name] = boundaries 146 | 147 | return col_boundaries 148 | -------------------------------------------------------------------------------- /runnables/bin/psi_calculator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def calc_psi_per_bin( 6 | expected_prob, 7 | actual_prob): 8 | FALLBACK_VALUE = 0.001 9 | expected_prob = FALLBACK_VALUE if expected_prob == 0.0 else expected_prob 10 | actual_prob = FALLBACK_VALUE if actual_prob == 0.0 else actual_prob 11 | 12 | return (expected_prob - actual_prob) * np.log(expected_prob * 1.0 / actual_prob) 13 | 14 | 15 | def calc_psi( 16 | expected_bin_probs, 17 | actual_bin_probs): 18 | assert(len(expected_bin_probs) == len(actual_bin_probs)) 19 | 20 | result = 0.0 21 | for i in range(len(expected_bin_probs)): 22 | result += calc_psi_per_bin(expected_bin_probs[i], actual_bin_probs[i]) 23 | 24 | return result 25 | 26 | 27 | def get_cols_bin_probs( 28 | stats_df, 29 | bin_prob_column_name): 30 | col_bin_probs = {} 31 | for _, row in stats_df.iterrows(): 32 | col_name = row['name'] 33 | bin_probs = [float(item) for item in row[bin_prob_column_name].split(',')] 34 | col_bin_probs[col_name] = bin_probs 35 | 36 | return col_bin_probs 37 | -------------------------------------------------------------------------------- /runnables/binning.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mars.dataframe as md 3 | import os 4 | import pandas as pd 5 | from bin.binning_calculator import calc_stats, calc_two_dim_binning_stats, get_cols_bin_boundaries 6 | from run_io.db_adapter import convertDSNToRfc1738 7 | from sqlalchemy import create_engine 8 | 9 | 10 | def build_argument_parser(): 11 | parser = argparse.ArgumentParser(allow_abbrev=False) 12 | parser.add_argument("--dbname", type=str, 
required=True) 13 | parser.add_argument("--columns", type=str, required=True) 14 | parser.add_argument("--bin_method", type=str, required=False) 15 | parser.add_argument("--bin_num", type=str, required=False) 16 | parser.add_argument("--bin_input_table", type=str, required=False) 17 | parser.add_argument("--reverse_cumsum", type=bool, default=False) 18 | parser.add_argument("--two_dim_bin_cols", type=str, required=False) 19 | 20 | return parser 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = build_argument_parser() 25 | args, _ = parser.parse_known_args() 26 | columns = args.columns.split(',') 27 | bin_method_array = args.bin_method.split(',') if args.bin_method else None 28 | bin_num_array = [int(item) for item in args.bin_num.split(',')] if args.bin_num else None 29 | 30 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 31 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 32 | output_tables = output.split(',') 33 | datasource = os.getenv("SQLFLOW_DATASOURCE") 34 | 35 | assert len(output_tables) == 1, "The output tables shouldn't be null and can contain only one." 36 | 37 | url = convertDSNToRfc1738(datasource, args.dbname) 38 | engine = create_engine(url) 39 | input_md = md.read_sql( 40 | sql=select_input, 41 | con=engine) 42 | input_md.execute() 43 | 44 | cols_bin_boundaries = {} 45 | if args.bin_input_table: 46 | print("Get provided bin boundaries from table {}".format(args.bin_input_table)) 47 | bin_input_df = pd.read_sql_table( 48 | table_name=args.bin_input_table, 49 | con=engine) 50 | cols_bin_boundaries = get_cols_bin_boundaries(bin_input_df) 51 | 52 | if set(columns) > cols_bin_boundaries.keys(): 53 | raise ValueError("The provided bin boundaries contains keys: {}. But they cannot cover all the \ 54 | input columns: {}".format(cols_bin_boundaries.keys(), columns)) 55 | 56 | print("Ignore the bin_num and bin_method arguments") 57 | bin_num_array = [None] * len(columns) 58 | bin_method_array = [None] * len(columns) 59 | else: 60 | if len(bin_num_array) == 1: 61 | bin_num_array = bin_num_array * len(columns) 62 | else: 63 | assert(len(bin_num_array) == len(columns)) 64 | 65 | if len(bin_method_array) == 1: 66 | bin_method_array = bin_method_array * len(columns) 67 | else: 68 | assert(len(bin_method_array) == len(columns)) 69 | 70 | print("Calculate the statistics result for columns: {}".format(columns)) 71 | stats_df = calc_stats( 72 | input_md, 73 | columns, 74 | bin_method_array, 75 | bin_num_array, 76 | cols_bin_boundaries, 77 | args.reverse_cumsum) 78 | 79 | print("Persist the statistics result into the table {}".format(output_tables[0])) 80 | stats_df.to_sql( 81 | name=output_tables[0], 82 | con=engine, 83 | index=False 84 | ) 85 | -------------------------------------------------------------------------------- /runnables/extract_ts_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from run_io.db_adapter import convertDSNToRfc1738 5 | from sqlalchemy import create_engine 6 | from time_series_processing.ts_feature_extractor import add_features_extracted_from_ts_data, add_lag_columns 7 | 8 | 9 | def build_argument_parser(): 10 | parser = argparse.ArgumentParser(allow_abbrev=False) 11 | parser.add_argument("--dbname", type=str, required=True) 12 | parser.add_argument("--column_id", type=str, required=True) 13 | parser.add_argument("--column_time", type=str, required=True) 14 | parser.add_argument("--columns_value", type=str, required=True) 15 | 
parser.add_argument("--lag_num", type=int, default=1) 16 | parser.add_argument("--windows", type=str, required=True) 17 | parser.add_argument("--min_window", type=str, default=0) 18 | parser.add_argument("--extract_setting", type=str, default="minimal", choices=["minimal", "efficient", "comprehensive"]) 19 | 20 | return parser 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = build_argument_parser() 25 | args, _ = parser.parse_known_args() 26 | columns_value = args.columns_value.split(',') 27 | windows = [int(item) for item in args.windows.split(',')] 28 | 29 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 30 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 31 | datasource = os.getenv("SQLFLOW_DATASOURCE") 32 | 33 | url = convertDSNToRfc1738(datasource, args.dbname) 34 | engine = create_engine(url) 35 | input = pd.read_sql( 36 | sql=select_input, 37 | con=engine) 38 | 39 | df_with_lag_columns, lag_column_names = add_lag_columns(input, columns_value, args.lag_num) 40 | 41 | print("Start extracting the features from the time series data.") 42 | df_with_extracted_features = add_features_extracted_from_ts_data( 43 | df_with_lag_columns, 44 | column_id=args.column_id, 45 | column_time=args.column_time, 46 | columns_value=lag_column_names, 47 | windows=windows, 48 | min_window=args.min_window, 49 | extract_setting=args.extract_setting) 50 | print("Complete the feature extraction.") 51 | 52 | df_with_extracted_features = df_with_extracted_features.drop(columns=lag_column_names) 53 | 54 | df_with_extracted_features.to_sql( 55 | name=output, 56 | con=engine, 57 | index=False) 58 | print("Complete save the result data into table {}.".format(output)) 59 | -------------------------------------------------------------------------------- /runnables/psi.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | from bin.psi_calculator import calc_psi, get_cols_bin_probs 5 | from run_io.db_adapter import convertDSNToRfc1738 6 | from sqlalchemy import create_engine 7 | 8 | 9 | def build_argument_parser(): 10 | parser = argparse.ArgumentParser(allow_abbrev=False) 11 | parser.add_argument("--dbname", type=str, required=True) 12 | parser.add_argument("--refer_stats_table", type=str, required=True) 13 | parser.add_argument("--bin_prob_column", type=str, default="bin_prob") 14 | 15 | return parser 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = build_argument_parser() 20 | args, _ = parser.parse_known_args() 21 | 22 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 23 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 24 | datasource = os.getenv("SQLFLOW_DATASOURCE") 25 | 26 | url = convertDSNToRfc1738(datasource, args.dbname) 27 | engine = create_engine(url) 28 | 29 | input_df = pd.read_sql( 30 | sql=select_input, 31 | con=engine) 32 | refer_stats_df = pd.read_sql_table( 33 | table_name=args.refer_stats_table, 34 | con=engine) 35 | 36 | actual_cols_bin_probs = get_cols_bin_probs(input_df, args.bin_prob_column) 37 | expected_cols_bin_probs = get_cols_bin_probs(input_df, args.bin_prob_column) 38 | 39 | common_column_names = set.intersection( 40 | set(actual_cols_bin_probs.keys()), 41 | set(expected_cols_bin_probs.keys())) 42 | 43 | print("Calculate the PSI value for {} fields.".format(len(common_column_names))) 44 | cols_psi_data = [] 45 | for column_name in common_column_names: 46 | psi_value = calc_psi(actual_cols_bin_probs[column_name], expected_cols_bin_probs[column_name]) 47 | cols_psi_data.append( 48 | { 49 | 
"name": column_name, 50 | "psi": psi_value 51 | } 52 | ) 53 | cols_psi_df = pd.DataFrame(cols_psi_data) 54 | 55 | print("Persist the PSI result into the table {}".format(output)) 56 | cols_psi_df.to_sql( 57 | name=output, 58 | con=engine, 59 | index=False 60 | ) 61 | -------------------------------------------------------------------------------- /runnables/requirements.txt: -------------------------------------------------------------------------------- 1 | tsfresh==0.16.0 2 | sqlalchemy==1.3.19 3 | mysql==0.0.2 4 | pymars==0.5.1 5 | pandas>=1.0.0 -------------------------------------------------------------------------------- /runnables/run_io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/run_io/__init__.py -------------------------------------------------------------------------------- /runnables/run_io/db_adapter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parseMySQLDSN(dsn): 4 | # [username[:password]@][protocol[(address)]]/dbname[?param1=value1&...¶mN=valueN] 5 | pattern = "^(\w*):(\w*)@tcp\(([.a-zA-Z0-9\-]*):([0-9]*)\)/(\w*)(\?.*)?$" # noqa: W605, E501 6 | found_result = re.findall(pattern, dsn) 7 | user, passwd, host, port, database, config_str = found_result[0] 8 | config = {} 9 | if len(config_str) > 1: 10 | for c in config_str[1:].split("&"): 11 | k, v = c.split("=") 12 | config[k] = v 13 | return user, passwd, host, port, database, config 14 | 15 | # TODO(brightcoder01): Should we put this kind of common method 16 | # in sqlflow runtime? While writing the runnable code, users can 17 | # import the runtime library. 
18 | def convertDSNToRfc1738(driver_dsn, defaultDbName): 19 | driver, dsn = driver_dsn.split("://") 20 | user, passwd, host, port, database, config = parseMySQLDSN(dsn) 21 | 22 | if not database: 23 | database = defaultDbName 24 | 25 | # mysql://root:root@127.0.0.1:3306/dbname 26 | return "{}://{}:{}@{}:{}/{}".format(driver, user, passwd, host, port, database) 27 | -------------------------------------------------------------------------------- /runnables/time_series_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/runnables/time_series_processing/__init__.py -------------------------------------------------------------------------------- /runnables/time_series_processing/ts_feature_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from functools import reduce 3 | from tsfresh import extract_features 4 | from tsfresh.feature_extraction.settings import MinimalFCParameters, ComprehensiveFCParameters, EfficientFCParameters 5 | from tsfresh.utilities.dataframe_functions import roll_time_series 6 | 7 | 8 | EXTRACT_SETTING_NAME_TO_CLASS_DICT = { 9 | "minimal": MinimalFCParameters, 10 | "efficient": EfficientFCParameters, 11 | "comprehensive": ComprehensiveFCParameters 12 | } 13 | 14 | ROLLED_TS_ID_COLUMN_NAME = "id" 15 | ORIGIN_JOIN_ID_COLUMN_NAME = "join_id" 16 | ROLLED_TS_ID_FORMAT = "id={},timeshift={}" 17 | 18 | 19 | def _roll_ts_and_extract_features( 20 | input, 21 | column_id, 22 | column_time, 23 | columns_value, 24 | max_window, 25 | min_window, 26 | extract_setting): 27 | rolled_ts = roll_time_series( 28 | input, 29 | column_id=column_id, 30 | column_kind=None, 31 | column_sort=column_time, 32 | rolling_direction=1, 33 | max_timeshift=max_window, 34 | min_timeshift=min_window, 35 | n_jobs=0) 36 | 37 | rename_columns = { 38 | value_column: "{}_w_{}".format(value_column, max_window) 39 | for value_column in columns_value 40 | } 41 | rolled_ts = rolled_ts.rename(columns=rename_columns) 42 | rolled_ts = rolled_ts.drop(columns=[column_id]) 43 | 44 | extract_setting_clz = EXTRACT_SETTING_NAME_TO_CLASS_DICT.get(extract_setting, MinimalFCParameters) 45 | extracted_features = extract_features( 46 | rolled_ts, 47 | column_id=ROLLED_TS_ID_COLUMN_NAME, 48 | column_sort=column_time, 49 | n_jobs=0, 50 | default_fc_parameters=extract_setting_clz()) 51 | 52 | return extracted_features 53 | 54 | 55 | def add_lag_columns( 56 | input, 57 | columns_value, 58 | lag_num): 59 | lag_column_names = [] 60 | for column_value in columns_value: 61 | lag_column_name = "{}_lag_{}".format(column_value, lag_num) 62 | input[lag_column_name] = input[column_value].shift(lag_num) 63 | lag_column_names.append(lag_column_name) 64 | 65 | return input[lag_num:], lag_column_names 66 | 67 | 68 | def add_features_extracted_from_ts_data( 69 | input, 70 | column_id, 71 | column_time, 72 | columns_value, 73 | windows, 74 | min_window=0, 75 | extract_setting="minimal"): 76 | """Extract features from the time series data and append them to the 77 | original data. 78 | 79 | Build the rolled time series data with various window sizes, extract 80 | the features using TSFresh and then append the derived features to 81 | the original data. 82 | 83 | Args: 84 | input: A pandas DataFrame for the input data. 85 | column_id: The name of the id column to group by the time series data. 
86 | The input data can contain the time series for various entities. 87 | For example, the UV for different websites. 88 | column_time: The name of the time column. 89 | columns_value: Array. The names of the columns for the time series data. 90 | windows: Array of window sizes. The time series data will be rolled with 91 | each window size. 92 | min_window: The extract forecast windows smaller or equal than this will 93 | be throwed away. 94 | extract_setting: minimal | efficient | comprehensive. Control which features 95 | will be extracted. The order of feature numbers is: 96 | minimal < efficient < comprehensive 97 | 98 | Returns: 99 | A pandas DataFrame containing the original input data and extracted features. 100 | """ 101 | 102 | input_with_join_id = pd.DataFrame() 103 | input_with_join_id[ORIGIN_JOIN_ID_COLUMN_NAME] = input.apply( 104 | lambda row: ROLLED_TS_ID_FORMAT.format(row[column_id], row[column_time]), 105 | axis=1) 106 | 107 | input_with_join_id = pd.concat( 108 | [input, input_with_join_id], 109 | axis=1) 110 | 111 | input = input[[column_id, column_time] + columns_value] 112 | input.sort_values(by=[column_id, column_time]) 113 | 114 | extracted_features_multi_windows = [ 115 | _roll_ts_and_extract_features( 116 | input=input, 117 | column_id=column_id, 118 | column_time=column_time, 119 | columns_value=columns_value, 120 | max_window=window, 121 | min_window=min_window, 122 | extract_setting=extract_setting 123 | ) for window in windows 124 | ] 125 | 126 | extracted_features_multi_windows = reduce(lambda left, right: pd.merge( 127 | left=left, 128 | right=right, 129 | how="left", 130 | on=ROLLED_TS_ID_COLUMN_NAME 131 | ), extracted_features_multi_windows) 132 | 133 | original_data_with_extracted_features = pd.merge( 134 | input_with_join_id, 135 | extracted_features_multi_windows, 136 | how='inner', 137 | left_on=ORIGIN_JOIN_ID_COLUMN_NAME, 138 | right_on=ROLLED_TS_ID_COLUMN_NAME 139 | ) 140 | 141 | original_data_with_extracted_features.sort_values(by=[column_id, column_time]) 142 | original_data_with_extracted_features = original_data_with_extracted_features.drop(columns=[ORIGIN_JOIN_ID_COLUMN_NAME]) 143 | 144 | return original_data_with_extracted_features 145 | -------------------------------------------------------------------------------- /runnables/two_dim_binning.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mars.dataframe as md 3 | import os 4 | import pandas as pd 5 | from bin.binning_calculator import calc_stats, calc_two_dim_binning_stats, get_cols_bin_boundaries 6 | from run_io.db_adapter import convertDSNToRfc1738 7 | from sqlalchemy import create_engine 8 | 9 | 10 | def build_argument_parser(): 11 | parser = argparse.ArgumentParser(allow_abbrev=False) 12 | parser.add_argument("--dbname", type=str, required=True) 13 | parser.add_argument("--columns", type=str, required=True) 14 | parser.add_argument("--bin_method", type=str, required=False) 15 | parser.add_argument("--bin_num", type=str, required=False) 16 | parser.add_argument("--bin_input_table", type=str, required=False) 17 | parser.add_argument("--reverse_cumsum", type=bool, default=False) 18 | 19 | return parser 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = build_argument_parser() 24 | args, _ = parser.parse_known_args() 25 | columns = args.columns.split(',') 26 | bin_method_array = args.bin_method.split(',') if args.bin_method else None 27 | bin_num_array = [int(item) for item in args.bin_num.split(',')] if args.bin_num 
else None 28 | 29 | select_input = os.getenv("SQLFLOW_TO_RUN_SELECT") 30 | output = os.getenv("SQLFLOW_TO_RUN_INTO") 31 | output_tables = output.split(',') 32 | datasource = os.getenv("SQLFLOW_DATASOURCE") 33 | 34 | # Check arguments 35 | assert len(columns) == 2, "The column number should only be 2" 36 | assert len(output_tables) == 3, "The output table number should only be 3" 37 | 38 | url = convertDSNToRfc1738(datasource, args.dbname) 39 | engine = create_engine(url) 40 | input_md = md.read_sql( 41 | sql=select_input, 42 | con=engine) 43 | input_md.execute() 44 | 45 | cols_bin_boundaries = {} 46 | if args.bin_input_table: 47 | print("Get provided bin boundaries from table {}".format(args.bin_input_table)) 48 | bin_input_df = pd.read_sql_table( 49 | table_name=args.bin_input_table, 50 | con=engine) 51 | cols_bin_boundaries = get_cols_bin_boundaries(bin_input_df) 52 | 53 | if set(columns) > cols_bin_boundaries.keys(): 54 | raise ValueError("The provided bin boundaries contains keys: {}. But they cannot cover all the \ 55 | input columns: {}".format(cols_bin_boundaries.keys(), columns)) 56 | 57 | print("Ignore the bin_num and bin_method arguments") 58 | bin_num_array = [None] * len(columns) 59 | bin_method_array = [None] * len(columns) 60 | else: 61 | if len(bin_num_array) == 1: 62 | bin_num_array = bin_num_array * len(columns) 63 | else: 64 | assert(len(bin_num_array) == len(columns)) 65 | 66 | if len(bin_method_array) == 1: 67 | bin_method_array = bin_method_array * len(columns) 68 | else: 69 | assert(len(bin_method_array) == len(columns)) 70 | 71 | print("Calculate the statistics result for columns: {}".format(columns)) 72 | stats_df = calc_stats( 73 | input_md, 74 | columns, 75 | bin_method_array, 76 | bin_num_array, 77 | cols_bin_boundaries, 78 | args.reverse_cumsum) 79 | 80 | print("Persist the statistics result into the table {}".format(output_tables[0])) 81 | stats_df.to_sql( 82 | name=output_tables[0], 83 | con=engine, 84 | index=False 85 | ) 86 | 87 | print("Calculate two dimension binning result for columns: {}".format(columns)) 88 | bin_prob_df, bin_cumsum_prob_df = calc_two_dim_binning_stats( 89 | input_md, 90 | columns[0], 91 | columns[1], 92 | bin_method_array[0], 93 | bin_method_array[1], 94 | bin_num_array[0], 95 | bin_num_array[1], 96 | cols_bin_boundaries.get(columns[0], None), 97 | cols_bin_boundaries.get(columns[1], None), 98 | args.reverse_cumsum) 99 | 100 | print("Persist the binning probabilities into table {}".format(output_tables[1])) 101 | bin_prob_df.to_sql( 102 | name=output_tables[1], 103 | con=engine, 104 | index=False 105 | ) 106 | print("Persist the binning accumulated probabilities into table {}".format(output_tables[2])) 107 | bin_cumsum_prob_df.to_sql( 108 | name=output_tables[2], 109 | con=engine, 110 | index=False 111 | ) 112 | -------------------------------------------------------------------------------- /scripts/data/iris.recordio: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/scripts/data/iris.recordio -------------------------------------------------------------------------------- /scripts/elasticdl_travis_test_job.sh: -------------------------------------------------------------------------------- 1 | if [ "$SQLFLOW_TEST_DB_MAXCOMPUTE_AK" = "" ] || [ "$SQLFLOW_TEST_DB_MAXCOMPUTE_SK" == "" ]; then 2 | echo "skip maxcompute test because the env SQLFLOW_TEST_DB_MAXCOMPUTE_AK or SQLFLOW_TEST_DB_MAXCOMPUTE_SK is 
empty" 3 | exit 0 4 | fi 5 | 6 | curl -s https://raw.githubusercontent.com/sql-machine-learning/elasticdl/4a995fe7eaf91bc5a9d50181e9aaaa14d15c8a09/scripts/setup_k8s_env.sh | bash 7 | kubectl apply -f https://raw.githubusercontent.com/sql-machine-learning/elasticdl/develop/elasticdl/manifests/examples/elasticdl-rbac.yaml 8 | 9 | docker run --rm -it --net=host \ 10 | -v $HOME/.kube:/root/.kube \ 11 | -v /home/$USER/.minikube/:/home/$USER/.minikube/ \ 12 | -v /var/run/docker.sock:/var/run/docker.sock \ 13 | -v $PWD:/workspace \ 14 | -e ODPS_ACCESS_ID=$MAXCOMPUTE_AK \ 15 | -e ODPS_ACCESS_KEY=$MAXCOMPUTE_SK \ 16 | sqlflow/sqlflow_models bash /workspace/scripts/test_elasticdl_submit.sh 17 | 18 | docker run --rm -it --net=host \ 19 | -v $HOME/.kube:/root/.kube \ 20 | -v /home/$USER/.minikube/:/home/$USER/.minikube/ \ 21 | sqlflow/sqlflow_models \ 22 | bash -c "curl -s https://raw.githubusercontent.com/sql-machine-learning/elasticdl/62b255a918df5b6594c888b19aebbcc74bbce6e4/scripts/validate_job_status.py | python - odps 1 2" 23 | -------------------------------------------------------------------------------- /scripts/test_elasticdl_submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | elasticdl train --image_base=sqlflow/sqlflow_models \ 4 | --model_def=dnnclassifier.DNNClassifier \ 5 | --training_data=sqlflow_test_iris_train \ 6 | --data_reader_params='columns=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"];label_col="class"' \ 7 | --envs="ODPS_PROJECT_NAME=gomaxcompute_driver_w7u,ODPS_ACCESS_ID=$ODPS_ACCESS_ID,ODPS_ACCESS_KEY=$ODPS_ACCESS_KEY" \ 8 | --minibatch_size=32 \ 9 | --num_epochs=2 \ 10 | --model_zoo=/sqlflow_models \ 11 | --job_name=test-odps \ 12 | --num_minibatches_per_task=2 \ 13 | --image_pull_policy=Never \ 14 | --num_workers=2 \ 15 | --num_ps_pods=1 \ 16 | --master_resource_request="cpu=200m,memory=128Mi" \ 17 | --master_resource_limit="cpu=1,memory=2048Mi" \ 18 | --worker_resource_request="cpu=200m,memory=128Mi" \ 19 | --worker_resource_limit="cpu=1,memory=3072Mi" \ 20 | --ps_resource_request="cpu=200m,memory=128Mi" \ 21 | --ps_resource_limit="cpu=1,memory=2048Mi" \ 22 | --grads_to_wait=2 \ 23 | --output=model_output 24 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | rootdir=tests 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = 'sqlflow_models' 16 | DESCRIPTION = 'Premade Models for SQLFlow.' 17 | URL = 'https://github.com/sql-machine-learning/models' 18 | EMAIL = 'yzhdoudou@gmail.com' 19 | AUTHOR = 'Yang Yang' 20 | REQUIRES_PYTHON = '>=3.6.0' 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 
24 | REQUIRED = [ 25 | 'protobuf==3.7.1', 26 | 'tensorflow==2.0.1', 27 | 'scikit-learn==0.21.0', 28 | 'numpy==1.16.2', 29 | 'pandas>=0.25.1', 30 | 'adanet==0.8.0', 31 | "tensorflow-datasets==3.0.0", 32 | "statsmodels==0.11.1", 33 | "scipy==1.4.1", 34 | "tensorflow-metadata<0.23.0", 35 | ] 36 | 37 | SETUP_REQUIRED = [ 38 | 'pytest-runner' 39 | ] 40 | TEST_REQUIRED = [ 41 | 'pytest', 42 | ] 43 | 44 | # What packages are optional? 45 | EXTRAS = { 46 | } 47 | 48 | # The rest you shouldn't have to touch too much :) 49 | # ------------------------------------------------ 50 | # Except, perhaps the License and Trove Classifiers! 51 | # If you do change the License, remember to change the Trove Classifier for that! 52 | 53 | here = os.path.abspath(os.path.dirname(__file__)) 54 | 55 | # Import the README and use it as the long-description. 56 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 57 | try: 58 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 59 | long_description = '\n' + f.read() 60 | except FileNotFoundError: 61 | long_description = DESCRIPTION 62 | 63 | # Load the package's __version__.py module as a dictionary. 64 | about = {} 65 | if not VERSION: 66 | with open(os.path.join(here, NAME, '_version.py')) as f: 67 | exec(f.read(), about) 68 | else: 69 | about['__version__'] = VERSION 70 | 71 | 72 | class UploadCommand(Command): 73 | """Support setup.py upload.""" 74 | 75 | description = 'Build and publish the package.' 76 | user_options = [] 77 | 78 | @staticmethod 79 | def status(s): 80 | """Prints things in bold.""" 81 | print('\033[1m{0}\033[0m'.format(s)) 82 | 83 | def initialize_options(self): 84 | pass 85 | 86 | def finalize_options(self): 87 | pass 88 | 89 | def run(self): 90 | try: 91 | self.status('Removing previous builds…') 92 | rmtree(os.path.join(here, 'dist')) 93 | except OSError: 94 | pass 95 | 96 | self.status('Building Source and Wheel (universal) distribution…') 97 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 98 | 99 | self.status('Uploading the package to PyPI via Twine…') 100 | os.system('twine upload dist/*') 101 | 102 | self.status('Pushing git tags…') 103 | os.system('git tag v{0}'.format(about['__version__'])) 104 | os.system('git push --tags') 105 | 106 | sys.exit() 107 | 108 | 109 | # Where the magic happens: 110 | setup( 111 | name=NAME, 112 | version=about['__version__'], 113 | description=DESCRIPTION, 114 | long_description=long_description, 115 | long_description_content_type='text/markdown', 116 | author=AUTHOR, 117 | author_email=EMAIL, 118 | python_requires=REQUIRES_PYTHON, 119 | url=URL, 120 | packages=find_packages(exclude=('tests',)), 121 | # If your package is a single module, use this instead of 'packages': 122 | # py_modules=['mypackage'], 123 | 124 | # entry_points={ 125 | # 'console_scripts': ['mycli=mymodule:cli'], 126 | # }, 127 | install_requires=REQUIRED, 128 | setup_requires=SETUP_REQUIRED, 129 | tests_require=TEST_REQUIRED, 130 | extras_require=EXTRAS, 131 | license='Apache License 2.0', 132 | classifiers=[ 133 | # Trove classifiers 134 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 135 | 'License :: OSI Approved :: Apache Software License', 136 | 'Programming Language :: Python', 137 | 'Programming Language :: Python :: 3', 138 | 'Programming Language :: Python :: 3.6', 139 | 'Programming Language :: Python :: Implementation :: CPython', 140 | 'Programming Language :: Python :: Implementation :: PyPy' 141 | ], 142 | # $ 
setup.py publish support.
143 |     cmdclass={
144 |         'upload': UploadCommand,
145 |     },
146 |     zip_safe=False,
147 | )
148 |
--------------------------------------------------------------------------------
/sqlflow_models/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM sqlflow/modelzoo_base
2 |
3 | RUN pip install tensorflow==2.0.0 scikit-learn==0.21.0 numpy==1.16.2 pandas==0.25.1
4 | ADD *.py /sqlflow_models/
5 |
--------------------------------------------------------------------------------
/sqlflow_models/__init__.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | from ._version import __version__
3 | from .dnnclassifier import DNNClassifier
4 | from .dnnregressor import DNNRegressor
5 | from .rnnclassifier import StackedRNNClassifier
6 | from .deep_embedding_cluster import DeepEmbeddingClusterModel
7 | from .dnnclassifier_functional_api_example import dnnclassifier_functional_model
8 | from .rnn_based_time_series import RNNBasedTimeSeriesModel
9 | from .auto_estimator import AutoClassifier, AutoRegressor
10 | from .score_card import ScoreCard
11 | from .native_keras import RawDNNClassifier
12 | from .custom_model_example import CustomClassifier
13 | from .gcn import GCN
14 | from .one_class_svm import OneClassSVM
15 | try:
16 |     # NOTE: statsmodels has a version conflict on PAI
17 |     from .arima_with_stl_decomposition import ARIMAWithSTLDecomposition
18 | except:
19 |     print("model ARIMAWithSTLDecomposition is not imported")
20 |     traceback.print_exc()
21 |
--------------------------------------------------------------------------------
/sqlflow_models/_version.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 |
3 | __version__ = '.'.join(map(str, VERSION))
4 |
--------------------------------------------------------------------------------
/sqlflow_models/arima_with_stl_decomposition.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import six
3 | from statsmodels.tsa.arima_model import ARIMA
4 | from statsmodels.tsa.seasonal import STL
5 | from datetime import datetime
6 | import tensorflow as tf
7 | import pandas as pd
8 |
9 | class ARIMAWithSTLDecomposition(tf.keras.Model):
10 |     def __init__(self,
11 |                  order,
12 |                  period,
13 |                  date_format,
14 |                  forecast_start,
15 |                  forecast_end,
16 |                  **kwargs):
17 |         super(ARIMAWithSTLDecomposition, self).__init__()
18 |
19 |         self.order = order
20 |         if not isinstance(period, (list, tuple)):
21 |             period = [period]
22 |         self.period = period
23 |         self.date_format = date_format
24 |         self.forecast_start = self._str2date(forecast_start)
25 |         self.forecast_end = self._str2date(forecast_end)
26 |         self.seasonal = []
27 |         self.kwargs = kwargs
28 |
29 |     def _str2date(self, date_str):
30 |         if isinstance(date_str, bytes):
31 |             date_str = date_str.decode('utf-8')
32 |         return datetime.strptime(str(date_str), self.date_format)
33 |
34 |     def _read_all_data(self, dataset):
35 |         data = None
36 |         for batch_idx, items in enumerate(dataset):
37 |             if data is None:
38 |                 data = [[] for _ in six.moves.range(len(items))]
39 |
40 |             for i, item in enumerate(items):
41 |                 if isinstance(item, dict):
42 |                     assert len(item) == 1
43 |                     dict_values = list(item.values())
44 |                     item = dict_values[0]
45 |
46 |                 if isinstance(item, tf.Tensor):
47 |                     item = item.numpy()
48 |
49 |                 item = np.reshape(item, [-1]).tolist()
50 |                 data[i].extend(item)
51 |
52 |         dates, values = data
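        # At this point `data` holds one flat Python list per dataset component; the
        # dataset is expected to yield (date, value) pairs, so `dates` are the raw date
        # strings and `values` the raw series values, e.g. [b'20200101', b'20200102', ...]
        # and [1.2, 3.4, ...] (illustrative values only).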
53 | sorted_dates_index = sorted(range(len(dates)), key=lambda k: dates[k]) 54 | dates = np.array([self._str2date(dates[i]) for i in sorted_dates_index]) 55 | values = np.array([values[i] for i in sorted_dates_index]).astype('float32') 56 | 57 | return dates, values 58 | 59 | def _stl_decompose(self, values): 60 | left_values = values 61 | self.seasonal = [] 62 | for p in self.period: 63 | stl_model = STL(left_values, period=p).fit() 64 | seasonal = np.array(stl_model.seasonal) 65 | self.seasonal.append(seasonal) 66 | left_values -= seasonal 67 | 68 | return left_values 69 | 70 | def _addup_seasonal(self, dates, values): 71 | time_interval = dates[1] - dates[0] 72 | start_interval = self.forecast_start - dates[0] 73 | start_index = int(start_interval.total_seconds() / time_interval.total_seconds()) 74 | 75 | length = len(values) 76 | 77 | for p, seasonal in six.moves.zip(self.period, self.seasonal): 78 | if length % p == 0: 79 | offset = length 80 | else: 81 | offset = (int(length / p) + 1) * p 82 | 83 | idx = start_index - offset 84 | values += seasonal[idx:idx+length] 85 | 86 | return values 87 | 88 | def _normalize(self, values): 89 | min_value = np.min(values) 90 | max_value = np.max(values) 91 | values = (values - min_value) / (max_value - min_value) 92 | return values, min_value, max_value 93 | 94 | def print_prediction_result(self, prediction, interval): 95 | t_strs = [] 96 | for i, p in enumerate(prediction): 97 | t = self.forecast_start + i * interval 98 | t_str = datetime.strftime(t, self.date_format) 99 | t_strs.append(t_str) 100 | 101 | df = pd.DataFrame(data={'time': t_strs, 'prediction': prediction}) 102 | with pd.option_context('display.max_columns', None): 103 | print(df) 104 | 105 | def sqlflow_train_loop(self, dataset): 106 | dates, values = self._read_all_data(dataset) 107 | 108 | left_values = self._stl_decompose(values) 109 | left_values, min_value, max_value = self._normalize(left_values) 110 | 111 | model = ARIMA(left_values, order=self.order, dates=dates).fit(disp=-1) 112 | 113 | prediction = model.predict(start=self.forecast_start, end=self.forecast_end, typ='levels') 114 | 115 | prediction = prediction * (max_value - min_value) + min_value 116 | prediction = self._addup_seasonal(dates, prediction) 117 | self.print_prediction_result(prediction, interval=dates[1] - dates[0]) 118 | return prediction 119 | 120 | def loss(*args, **kwargs): 121 | return None 122 | 123 | def optimizer(*args, **kwargs): 124 | return None 125 | -------------------------------------------------------------------------------- /sqlflow_models/auto_estimator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | from collections import defaultdict 3 | 4 | import absl 5 | import logging 6 | import tensorflow as tf 7 | import warnings 8 | 9 | absl.logging.set_verbosity(absl.logging.ERROR) 10 | tf.get_logger().setLevel(logging.ERROR) 11 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 12 | warnings.warn = lambda *args, **kargs:None 13 | import adanet 14 | 15 | from tensorflow import keras 16 | from tensorflow_estimator.python.estimator.canned import optimizers 17 | from .simple_dnn_generator import SimpleDNNGenerator 18 | 19 | 20 | LEARN_MIXTURE_WEIGHTS=True 21 | RANDOM_SEED = 42 22 | 23 | class AutoClassifier(adanet.Estimator): 24 | def __init__(self, feature_columns, layer_size=50, optimizer='Adagrad', linear_optimizer='Ftrl', 25 | model_dir=None, n_classes=2, 
activation_fn=tf.nn.relu, complexity_penalty=0.01, 26 | search_every_n_steps=1000, max_iterations=10, config=None): 27 | """AutoClassifier 28 | :param feature_columns: Feature columns. 29 | :type feature_columns: list[tf.feature_column]. 30 | :param layer_size: Number of hidden_units in each layers. 31 | :type layer_size: int. 32 | :param n_classes: Number of label classes. Defaults to 2, namely binary classification. 33 | :type n_classes: int. 34 | :param optimizer: Optimizer for the the neural multi-layer parts of the generated network. 35 | :type optimizer: str. 36 | :param linear_optimizer: Optimizer for the linear part of the generated network. 37 | :type linear_optimizer: str. 38 | :param model_dir: Directory to save or restore model checkpoints. 39 | :type model_dir: str. 40 | :param activation_fn: Activation function. 41 | :type activation_fn: function. 42 | :param complexity_penalty: Regularization of the complexity of the network. 43 | :type complexity_penalty: float. 44 | :param search_every_n_steps: Search new architecture every n steps. 45 | :type search_every_n_steps: int. 46 | :param max_iterations: Max times of architecture searching. 47 | :type max_iterations: int. 48 | :param config: Estimator configuration. 49 | :type config: dict. 50 | """ 51 | if n_classes == 2: 52 | head = tf.estimator.BinaryClassHead() 53 | else: 54 | head = tf.estimator.MultiClassHead(n_classes=n_classes) 55 | 56 | opts= defaultdict(lambda: optimizers.get_optimizer_instance(optimizer, 0.001)) 57 | opts[0] = optimizers.get_optimizer_instance(linear_optimizer, 0.1) 58 | # Define the generator, which defines the search space of subnetworks 59 | # to train as candidates to add to the final AdaNet model. 60 | subnetwork_generator = SimpleDNNGenerator( 61 | feature_columns=feature_columns, 62 | layer_size=layer_size, 63 | optimizers=opts, 64 | learn_mixture_weights=LEARN_MIXTURE_WEIGHTS, 65 | seed=RANDOM_SEED) 66 | super(AutoClassifier, self).__init__(head=head, 67 | model_dir=model_dir, 68 | adanet_lambda=complexity_penalty, 69 | subnetwork_generator=subnetwork_generator, 70 | max_iteration_steps=search_every_n_steps, 71 | max_iterations=max_iterations) 72 | 73 | class AutoRegressor(adanet.Estimator): 74 | def __init__(self, feature_columns, layer_size=50, optimizer='Adagrad', linear_optimizer='Ftrl', 75 | model_dir=None, activation_fn=tf.nn.relu, complexity_penalty=0.01, 76 | search_every_n_steps=1000, max_iterations=10, config=None): 77 | """AutoRegressor 78 | :param feature_columns: Feature columns. 79 | :type feature_columns: list[tf.feature_column]. 80 | :param layer_size: Number of hidden_units in each layers. 81 | :type layer_size: int. 82 | :param optimizer: Optimizer for the the neural multi-layer parts of the generated network. 83 | :type optimizer: str. 84 | :param linear_optimizer: Optimizer for the linear part of the generated network. 85 | :type linear_optimizer: str. 86 | :param model_dir: Directory to save or restore model checkpoints. 87 | :type model_dir: str. 88 | :param activation_fn: Activation function. 89 | :type activation_fn: function. 90 | :param complexity_penalty: Regularization of the complexity of the network. 91 | :type complexity_penalty: float. 92 | :param search_every_n_steps: Search new architecture every n steps. 93 | :type search_every_n_steps: int. 94 | :param max_iterations: Max times of architecture searching. 95 | :type max_iterations: int. 96 | :param config: Estimator configuration. 97 | :type config: dict. 
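        Example (a minimal sketch outside of SQLFlow; the feature column name 'x'
        and the hyper-parameter values are illustrative only):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x')]
            model = AutoRegressor(feature_columns=fc, layer_size=32, max_iterations=3)
            # AutoRegressor is an adanet.Estimator, so it is trained and evaluated
            # through the usual tf.estimator API, e.g. model.train(input_fn=...).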
98 | """ 99 | head = tf.estimator.RegressionHead() 100 | 101 | opts= defaultdict(lambda: optimizers.get_optimizer_instance(optimizer, 0.001)) 102 | opts[0] = optimizers.get_optimizer_instance(linear_optimizer, 0.1) 103 | # Define the generator, which defines the search space of subnetworks 104 | # to train as candidates to add to the final AdaNet model. 105 | subnetwork_generator = SimpleDNNGenerator( 106 | feature_columns=feature_columns, 107 | layer_size=layer_size, 108 | optimizers=opts, 109 | learn_mixture_weights=LEARN_MIXTURE_WEIGHTS, 110 | seed=RANDOM_SEED) 111 | super(AutoRegressor, self).__init__(head=head, 112 | model_dir=model_dir, 113 | adanet_lambda=complexity_penalty, 114 | subnetwork_generator=subnetwork_generator, 115 | max_iteration_steps=search_every_n_steps, 116 | max_iterations=max_iterations) 117 | -------------------------------------------------------------------------------- /sqlflow_models/custom_model_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import random 3 | import numpy as np 4 | 5 | class CustomClassifier(tf.keras.Model): 6 | def __init__(self, feature_columns=None): 7 | """The model init function. You can define any model parameter in the function's argument list. 8 | You can also add custom training routines together with a Keras 9 | model (see deep_embedding_cluster.py), or define a model with out Keras layers 10 | (e.g. use sklearn or numpy only). 11 | """ 12 | pass 13 | 14 | def sqlflow_train_loop(self, x): 15 | """The custom model traininig loop, input x is a tf.dataset object that generates training data. 16 | """ 17 | pass 18 | 19 | def sqlflow_predict_one(self, sample): 20 | """Run prediction with one sample and return the prediction result. The result must be a 21 | list of numpy array. SQLFlow determine the output type by: 22 | - if the array have only one element, the model must be regression model. 23 | - if the array have multiple elements: 24 | - if the sum of all the elements are close to 1, it is likely to be a classification model. 25 | - else the model is a regression model with multiple outputs. 26 | """ 27 | pos = random.random() 28 | neg = 1 - pos 29 | array = np.array([pos, neg]) 30 | return [array] 31 | 32 | def sqlflow_evaluate_loop(self, x, metric_names): 33 | """Run evaluation on the validation dataset and return a list of metrics. 34 | NOTE: the first result metric is always loss. If no loss is defined, add 0. 
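        For example, with metric_names == ["Accuracy"] the returned list could look like
        [0.0, 0.83]: a leading loss entry (0 when no loss is defined) followed by one
        value per requested metric (the numbers are illustrative only).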
35 | """ 36 | metric_len = len(metric_names) 37 | result = [] 38 | for i in range(metric_len+1): 39 | result.append(random.random()) 40 | return result 41 | -------------------------------------------------------------------------------- /sqlflow_models/deep_embedding_cluster.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/env python 2 | # -*- coding:utf-8 _*- 3 | 4 | """ 5 | __author__ : chenxiang 6 | __email__ : alfredchenxiang@didichuxing.com 7 | __file_name__ : deep_embedding_cluster.py 8 | __create_time__ : 2019/09/03 9 | """ 10 | from datetime import datetime 11 | import tensorflow as tf 12 | from tensorflow import keras 13 | from tensorflow.python.data import make_one_shot_iterator 14 | from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau 15 | from tensorflow.keras.layers import Dense, Layer, DenseFeatures, InputSpec 16 | from tensorflow.keras import backend 17 | import numpy as np 18 | from sklearn.cluster import KMeans 19 | from tensorflow.keras.losses import kld 20 | from tensorflow.keras.optimizers import SGD 21 | import tensorflow_datasets as tfds 22 | import pandas as pd 23 | 24 | _train_lr = 0.01 25 | _default_loss = kld 26 | 27 | class DeepEmbeddingClusterModel(keras.Model): 28 | 29 | def __init__(self, 30 | feature_columns, 31 | n_clusters=10, 32 | kmeans_init=20, 33 | run_pretrain=True, 34 | existed_pretrain_model=None, 35 | pretrain_dims=[100, 100, 10], 36 | pretrain_activation_func='relu', 37 | pretrain_use_callbacks=False, 38 | pretrain_cbearlystop_patience=30, 39 | pretrain_cbearlystop_mindelta=0.0001, 40 | pretrain_cbreduce_patience=10, 41 | pretrain_cbreduce_factor=0.1, 42 | pretrain_epochs=30, 43 | pretrain_initializer='glorot_uniform', 44 | pretrain_lr=1, 45 | train_lr=0.01, 46 | train_max_iters=8000, 47 | update_interval=100, 48 | train_use_tol=True, 49 | tol=0.0001, 50 | loss=kld): 51 | 52 | """ 53 | Implement cluster model mostly based on DEC. 54 | :param feature_columns: a list of tf.feature_column 55 | :param n_clusters: Number of clusters. 56 | :param kmeans_init: Number of running K-Means to get best choice of centroids. 57 | :param run_pretrain: Run pre-train process or not. 58 | :param existed_pretrain_model: Path of existed pre-train model. Not used now. 59 | :param pretrain_dims: Dims of layers which is used for build autoencoder. 60 | :param pretrain_activation_func: Active function of autoencoder layers. 61 | :param pretrain_use_callbacks: Use callbacks when pre-train or not. 62 | :param pretrain_cbearlystop_patience: Patience value of EarlyStopping when use callbacks. 63 | :param pretrain_cbearlystop_mindelta: Min_delta value of EarlyStopping when use callbacks. 64 | :param pretrain_cbreduce_patience: Patience value of ReduceLROnPlateau when use callbacks. 65 | :param pretrain_cbreduce_factor: Factor value of ReduceLROnPlateau when use callbacks. 66 | :param pretrain_epochs: Number of epochs when pre-train. 67 | :param pretrain_initializer: Initialize function for autoencoder layers. 68 | :param pretrain_lr: learning rate to train the auto encoder. 69 | :param train_lr: learning rate to train the cluster network. 70 | :param train_max_iters: Number of iterations when train. 71 | :param update_interval: Interval between updating target distribution. 72 | :param train_use_tol: Use tolerance during clusteringlayer or not. 73 | :param tol: Tolerance of earlystopping when train during clusteringlayer. 74 | :param loss: Default 'kld' when init. 
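        Example (a minimal sketch; the column name 'x1' and the parameter values are
        illustrative only):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x1')]
            model = DeepEmbeddingClusterModel(feature_columns=fc, n_clusters=3,
                                              pretrain_epochs=5, train_max_iters=500)
            # SQLFlow drives training through model.sqlflow_train_loop(dataset), where
            # `dataset` is a tf.data.Dataset yielding dicts keyed by feature name.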
75 | """ 76 | global _train_lr 77 | global _default_loss 78 | super(DeepEmbeddingClusterModel, self).__init__(name='DECModel') 79 | 80 | # Common 81 | self._feature_columns = feature_columns 82 | self._feature_columns_dims = len(self._feature_columns) 83 | self._n_clusters = n_clusters 84 | _default_loss = loss 85 | self._train_max_iters = train_max_iters 86 | self._update_interval = update_interval 87 | self._current_interval = 0 88 | self._train_use_tol = train_use_tol 89 | self._tol = tol 90 | 91 | # Pre-train 92 | self._run_pretrain = run_pretrain 93 | self._existed_pretrain_model = existed_pretrain_model 94 | self._pretrain_activation_func = pretrain_activation_func 95 | self._pretrain_dims = pretrain_dims 96 | self._pretrain_epochs = pretrain_epochs 97 | self._pretrain_initializer = pretrain_initializer 98 | self._pretrain_lr = pretrain_lr 99 | self._pretrain_optimizer = SGD(lr=self._pretrain_lr, momentum=0.9) 100 | 101 | # Pre-train-callbacks 102 | self._pretrain_use_callbacks = pretrain_use_callbacks 103 | self._pretrain_cbearlystop_patience = pretrain_cbearlystop_patience 104 | self._pretrain_cbearlystop_mindelta = pretrain_cbearlystop_mindelta 105 | self._pretrain_cbreduce_patience = pretrain_cbreduce_patience 106 | self._pretrain_cbreduce_factor = pretrain_cbreduce_factor 107 | 108 | # K-Means 109 | self._kmeans_init = kmeans_init 110 | 111 | # Cluster 112 | _train_lr = train_lr 113 | self._cluster_optimizer = SGD(lr=_train_lr, momentum=0.9) 114 | 115 | # Build model 116 | self._n_stacks = len(self._pretrain_dims) 117 | self.input_layer = DenseFeatures(feature_columns) 118 | 119 | # Layers - encoder 120 | self.encoder_layers = [] 121 | for i in range(self._n_stacks): 122 | self.encoder_layers.append(Dense(units=self._pretrain_dims[i], 123 | activation=self._pretrain_activation_func, 124 | name='encoder_%d' % i)) 125 | 126 | self.clustering_layer = ClusteringLayer(name='clustering', n_clusters=self._n_clusters) 127 | 128 | @staticmethod 129 | def target_distribution(q): 130 | """ 131 | Calculate auxiliary softer target distributions by raising q to the second power and 132 | then normalizing by frequency. 133 | :param q: Original distributions. 134 | :return: Auxiliary softer target distributions 135 | """ 136 | weight = q ** 2 / q.sum(0) 137 | return (weight.T / weight.sum(1)).T 138 | 139 | def pre_train(self, x): 140 | """ 141 | Used for preparing encoder part by loading ready-to-go model or training one. 
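        The whole training dataset is materialized into memory here (self.input_x and
        self.input_y) so that keras.Model.fit() can be used to train the autoencoder.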
142 | :param x: 143 | :return: 144 | """ 145 | print('{} Start pre_train.'.format(datetime.now())) 146 | 147 | print('{} Start preparing training dataset to save into memory.'.format(datetime.now())) 148 | # Concatenate input feature to meet requirement of keras.Model.fit() 149 | def _concate_generate(dataset_element): 150 | concate_y = tf.stack([dataset_element[feature.key] for feature in self._feature_columns], axis=1) 151 | return (dataset_element, concate_y) 152 | 153 | y = x.cache().map(map_func=_concate_generate) 154 | y.prefetch(1) 155 | 156 | self.input_x = dict() 157 | self.input_y = None 158 | for np_sample in tfds.as_numpy(y): 159 | sample_dict = np_sample[0] 160 | label = np_sample[1] 161 | if self.input_y is None: 162 | self.input_y = label 163 | else: 164 | self.input_y = np.concatenate([self.input_y, label]) 165 | if len(self.input_x) == 0: 166 | self.input_x = sample_dict 167 | else: 168 | for k in self.input_x: 169 | self.input_x[k] = np.concatenate([self.input_x[k], sample_dict[k]]) 170 | print('{} Done preparing training dataset.'.format(datetime.now())) 171 | 172 | # Layers - decoder 173 | self.decoder_layers = [] 174 | for i in range(self._n_stacks - 2, -1, -1): 175 | self.decoder_layers.append(Dense(units=self._pretrain_dims[i], 176 | activation=self._pretrain_activation_func, 177 | kernel_initializer=self._pretrain_initializer, 178 | name='decoder_%d' % (i + 1))) 179 | 180 | self.decoder_layers.append(Dense(units=self._feature_columns_dims, 181 | kernel_initializer=self._pretrain_initializer, 182 | name='decoder_0')) 183 | # Pretrain - autoencoder, encoder 184 | # autoencoder 185 | self._autoencoder = keras.Sequential(layers=[self.input_layer] + self.encoder_layers + self.decoder_layers, 186 | name='autoencoder') 187 | self._autoencoder.compile(optimizer=self._pretrain_optimizer, loss='mse') 188 | # encoder 189 | self._encoder = keras.Sequential(layers=[self.input_layer] + self.encoder_layers, name='encoder') 190 | self._encoder.compile(optimizer=self._pretrain_optimizer, loss='mse') 191 | 192 | # pretrain_callbacks 193 | print('{} Training auto-encoder.'.format(datetime.now())) 194 | if self._pretrain_use_callbacks: 195 | callbacks = [ 196 | EarlyStopping(monitor='loss', 197 | patience=self._pretrain_cbearlystop_patience, min_delta=self._pretrain_cbearlystop_mindelta), 198 | ReduceLROnPlateau(monitor='loss', 199 | factor=self._pretrain_cbreduce_factor, patience=self._pretrain_cbreduce_patience) 200 | ] 201 | self._autoencoder.fit(self.input_x, self.input_y, 202 | epochs=self._pretrain_epochs, callbacks=callbacks, verbose=1) 203 | else: 204 | self._autoencoder.fit(self.input_x, self.input_y, 205 | epochs=self._pretrain_epochs, verbose=1) 206 | # encoded_input 207 | # type : numpy.ndarray shape : (num_of_all_records,num_of_cluster) (70000,10) if mnist 208 | print('{} Calculating encoded_input.'.format(datetime.now())) 209 | self.encoded_input = self._encoder.predict(x) 210 | 211 | del self._autoencoder 212 | del self._encoder 213 | del self.decoder_layers 214 | print('{} Done pre-train.'.format(datetime.now())) 215 | 216 | def call(self, inputs, training=None, mask=None): 217 | x = self.input_layer(inputs) 218 | for encoder_layer in self.encoder_layers: 219 | x = encoder_layer(x) 220 | return self.clustering_layer(x) 221 | 222 | def init_centroids(self): 223 | """ 224 | Training K-means `_kmeans_init` times on the output of encoder to get best initial centroids. 
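        The fitted centroids (self.kmeans.cluster_centers_, of shape
        [n_clusters, encoder_output_dim]) are later used by sqlflow_train_loop() to
        initialize the weights of the clustering layer.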
225 | :return: 226 | """ 227 | self.kmeans = KMeans(n_clusters=self._n_clusters, n_init=self._kmeans_init) 228 | self.y_pred_last = self.kmeans.fit_predict(self.encoded_input) 229 | print('{} Done init centroids by k-means.'.format(datetime.now())) 230 | 231 | def sqlflow_train_loop(self, x, epochs=1, verbose=0): 232 | """ Parameter `epochs` and `verbose` will not be used in this function. """ 233 | # There is a bug which will cause build failed when using `DenseFeatures` with `keras.Model` 234 | # https://github.com/tensorflow/tensorflow/issues/28111 235 | # Using 'predict' to solve this problem here. 236 | # Preparation 237 | for features in x.take(1): 238 | self.predict(x=features) 239 | 240 | # Get train.batch_size from sqlflow 241 | for feature_name, feature_series in features.items(): 242 | self._train_batch_size = feature_series.shape[0] 243 | break 244 | 245 | # Pre-train autoencoder to prepare weights of encoder layers. 246 | self.pre_train(x) 247 | 248 | # Initialize centroids for clustering. 249 | self.init_centroids() 250 | 251 | # Setting cluster layer. 252 | self.get_layer(name='clustering').set_weights([self.kmeans.cluster_centers_]) 253 | 254 | # Train 255 | # flatten y to shape (num_samples, flattened_features) 256 | record_num = self.input_y.shape[0] 257 | feature_dims = self.input_y.shape[1:] 258 | feature_dim_total = 1 259 | for d in feature_dims: 260 | feature_dim_total = feature_dim_total * d 261 | y_reshaped = self.input_y.reshape([record_num, feature_dim_total]) 262 | print('{} Done preparing training dataset.'.format(datetime.now())) 263 | 264 | index_array = np.arange(record_num) 265 | index, loss, p = 0, 0., None 266 | 267 | for ite in range(self._train_max_iters): 268 | if ite % self._update_interval == 0: 269 | q = self.predict(self.input_x) # numpy.ndarray shape(record_num,n_clusters) 270 | p = self.target_distribution(q) # update the auxiliary target distribution p 271 | 272 | if self._train_use_tol: 273 | y_pred = q.argmax(1) 274 | # delta_percentage means the percentage of changed predictions in this train stage. 
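                # For example, if 38 of 1000 samples changed their cluster assignment
                # since the last check, delta_percentage == 0.038; training stops early
                # once this value drops below `tol` (the numbers are illustrative only).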
275 | delta_percentage = np.sum(y_pred != self.y_pred_last).astype(np.float32) / y_pred.shape[0] 276 | print('{} Updating at iter: {} -> delta_percentage: {}.'.format(datetime.now(), ite, delta_percentage)) 277 | self.y_pred_last = np.copy(y_pred) 278 | if ite > 0 and delta_percentage < self._tol: 279 | print('Early stopping since delta_table {} has reached tol {}'.format(delta_percentage, self._tol)) 280 | break 281 | idx = index_array[index * self._train_batch_size: min((index + 1) * self._train_batch_size, record_num)] 282 | 283 | loss = self.train_on_batch(x=list(y_reshaped[idx].T), y=p[idx]) 284 | if ite % 100 == 0: 285 | print('{} Training at iter:{} -> loss:{}.'.format(datetime.now(), ite, loss)) 286 | index = index + 1 if (index + 1) * self._train_batch_size <= record_num else 0 # Update index 287 | 288 | def display_model_info(self, verbose=0): 289 | if verbose >= 0: 290 | print('Summary : ') 291 | print(self.summary()) 292 | if verbose >= 1: 293 | print('Layer\'s Shape : ') 294 | for layer in self.encoder_layers: 295 | print(layer.name + ' : ') 296 | for i in layer.get_weights(): 297 | print(i.shape) 298 | print(self.clustering_layer.name + ' : ') 299 | for i in self.clustering_layer.get_weights(): 300 | print(i.shape) 301 | if verbose >= 2: 302 | print('Layer\'s Info : ') 303 | for layer in self.encoder_layers: 304 | print(layer.name + ' : ') 305 | print(layer.get_weights()) 306 | # Cluster 307 | print(self.clustering_layer.name + ' : ') 308 | print(self.clustering_layer.get_weights()) 309 | 310 | 311 | def optimizer(): 312 | global _train_lr 313 | return SGD(lr=_train_lr, momentum=0.9) 314 | 315 | def loss(labels, output): 316 | global _default_loss 317 | return _default_loss(labels, output) 318 | 319 | def prepare_prediction_column(prediction): 320 | """ Return the cluster label of the highest probability. """ 321 | return prediction.argmax(axis=-1) 322 | 323 | class ClusteringLayer(Layer): 324 | def __init__(self, n_clusters, alpha=1.0, **kwargs): 325 | """ 326 | Using clustering layer to refine the cluster centroids by learning from current high confidence assignment 327 | using auxiliary target distribution. 328 | 329 | :param n_clusters: Number of clusters. 330 | :param weights: Initial cluster centroids. 331 | :param alpha: Degrees of freedom parameters in Student's t-distribution. Default to 1.0 for all experiments. 
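        The layer outputs the soft assignment
        q_ij = (1 + ||z_i - mu_j||**2 / alpha) ** (-(alpha + 1) / 2), normalized over
        the clusters j, which is what call() computes below.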
332 |         :param kwargs:
333 |         """
334 |         self.n_clusters = n_clusters
335 |         self.alpha = alpha
336 |         self.input_spec = InputSpec(ndim=2)
337 |         super(ClusteringLayer, self).__init__(**kwargs)
338 |
339 |     def build(self, input_shape):
340 |         input_dim = input_shape[1]
341 |         self.input_spec = InputSpec(dtype=backend.floatx(), shape=(None, input_dim))
342 |         shape = tf.TensorShape(dims=(self.n_clusters, input_dim))
343 |         self.kernel = self.add_weight(name='kernel', shape=shape, initializer='glorot_uniform', trainable=True)
344 |         super(ClusteringLayer, self).build(shape)
345 |
346 |     def call(self, inputs, **kwargs):
347 |         q = 1.0 / (1.0 + (backend.sum(backend.square(backend.expand_dims(inputs, axis=1) - self.kernel),
348 |                                       axis=2) / self.alpha))
349 |         q **= (self.alpha + 1.0) / 2.0
350 |         q = backend.transpose(backend.transpose(q) / backend.sum(q, axis=1))
351 |         return q
352 |
353 |     def compute_output_shape(self, input_shape):
354 |         assert input_shape and len(input_shape) == 2
355 |         return input_shape[0], self.n_clusters
356 |
357 |     def get_config(self):
358 |         config = {'n_clusters': self.n_clusters}
359 |         base_config = super(ClusteringLayer, self).get_config()
360 |         return dict(list(base_config.items()) + list(config.items()))
361 |
--------------------------------------------------------------------------------
/sqlflow_models/dnnclassifier.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | class DNNClassifier(tf.keras.Model):
4 |     def __init__(self, feature_columns=None, hidden_units=[100,100], n_classes=3):
5 |         """DNNClassifier
6 |         :param feature_columns: feature columns.
7 |         :type feature_columns: list[tf.feature_column].
8 |         :param hidden_units: list of hidden units per layer.
9 |         :type hidden_units: list[int].
10 |         :param n_classes: number of label classes. Defaults to 3.
11 |         :type n_classes: int.
12 |         """
13 |         global _loss
14 |         super(DNNClassifier, self).__init__()
15 |         self.feature_layer = None
16 |         self.n_classes = n_classes
17 |         if feature_columns is not None:
18 |             # combines all the data as a dense tensor
19 |             self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
20 |         self.hidden_layers = []
21 |         for hidden_unit in hidden_units:
22 |             self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu'))
23 |         if self.n_classes == 2:
24 |             # special setup for binary classification
25 |             pred_act = 'sigmoid'
26 |             _loss = 'binary_crossentropy'
27 |             n_out = 1
28 |         else:
29 |             pred_act = 'softmax'
30 |             _loss = 'categorical_crossentropy'
31 |             n_out = self.n_classes
32 |         self.prediction_layer = tf.keras.layers.Dense(n_out, activation=pred_act)
33 |
34 |     def call(self, inputs, training=True):
35 |         if self.feature_layer is not None:
36 |             x = self.feature_layer(inputs)
37 |         else:
38 |             x = tf.keras.layers.Flatten()(inputs)
39 |         for hidden_layer in self.hidden_layers:
40 |             x = hidden_layer(x)
41 |         return self.prediction_layer(x)
42 |
43 | def optimizer(learning_rate=0.001):
44 |     """Default optimizer name. Used in model.compile."""
45 |     return tf.keras.optimizers.Adagrad(lr=learning_rate)
46 |
47 | def loss(labels, output):
48 |     """Default loss function.
Used in model.compile.""" 49 | global _loss 50 | if _loss == "binary_crossentropy": 51 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 52 | elif _loss == "categorical_crossentropy": 53 | return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(labels, output)) 54 | 55 | def prepare_prediction_column(prediction): 56 | """Return the class label of highest probability.""" 57 | return prediction.argmax(axis=-1) 58 | 59 | def eval_metrics_fn(): 60 | return { 61 | "accuracy": lambda labels, predictions: tf.equal( 62 | tf.argmax(predictions, 1, output_type=tf.int32), 63 | tf.cast(tf.reshape(labels, [-1]), tf.int32), 64 | ) 65 | } 66 | -------------------------------------------------------------------------------- /sqlflow_models/dnnclassifier_functional_api_example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | global _loss 4 | 5 | def dnnclassifier_functional_model(feature_columns, field_metas, n_classes=2, learning_rate=0.001): 6 | feature_layer_inputs = dict() 7 | for fmkey in field_metas: 8 | fm = field_metas[fmkey] 9 | feature_layer_inputs[fm["feature_name"]] = tf.keras.Input(shape=(fm["shape"]), name=fm["feature_name"], dtype=fm["dtype"]) 10 | feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 11 | feature_layer_outputs = feature_layer(feature_layer_inputs) 12 | global _loss 13 | if n_classes == 2: 14 | # special setup for binary classification 15 | pred_act = 'sigmoid' 16 | _loss = 'binary_crossentropy' 17 | else: 18 | pred_act = 'softmax' 19 | _loss = 'categorical_crossentropy' 20 | x = tf.keras.layers.Dense(128, activation='relu')(feature_layer_outputs) 21 | x = tf.keras.layers.Dense(64, activation='relu')(x) 22 | pred = tf.keras.layers.Dense(n_classes, activation=pred_act)(x) 23 | return tf.keras.Model(inputs=[v for v in feature_layer_inputs.values()], outputs=pred) 24 | 25 | def loss(labels, output): 26 | global _loss 27 | if _loss == "binary_crossentropy": 28 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 29 | elif _loss == "categorical_crossentropy": 30 | return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(labels, output)) 31 | 32 | def epochs(): 33 | return 1 34 | 35 | def optimizer(lr=0.1): 36 | return tf.keras.optimizers.Adagrad(lr=lr) 37 | 38 | def prepare_prediction_column(self, prediction): 39 | """Return the class label of highest probability.""" 40 | return prediction.argmax(axis=-1) 41 | -------------------------------------------------------------------------------- /sqlflow_models/dnnregressor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class DNNRegressor(tf.keras.Model): 4 | def __init__(self, feature_columns=None, hidden_units=[100,100]): 5 | """DNNRegressor 6 | :param feature_columns: feature columns. 7 | :type feature_columns: list[tf.feature_column]. 8 | :param hidden_units: number of hidden units. 9 | :type hidden_units: list[int]. 
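        Example (a minimal sketch outside of SQLFlow; the column name 'x' is hypothetical):

            import tensorflow as tf
            fc = [tf.feature_column.numeric_column('x')]
            model = DNNRegressor(feature_columns=fc, hidden_units=[64, 32])
            model.compile(optimizer=optimizer(), loss=loss)  # module-level helpers below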
10 | """ 11 | super(DNNRegressor, self).__init__() 12 | self.feature_layer = None 13 | if feature_columns is not None: 14 | # combines all the data as a dense tensor 15 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 16 | self.hidden_layers = [] 17 | for hidden_unit in hidden_units: 18 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu')) 19 | self.prediction_layer = tf.keras.layers.Dense(1) 20 | 21 | def call(self, inputs, training=True): 22 | if self.feature_layer is not None: 23 | x = self.feature_layer(inputs) 24 | else: 25 | x = tf.keras.layers.Flatten()(inputs) 26 | for hidden_layer in self.hidden_layers: 27 | x = hidden_layer(x) 28 | return self.prediction_layer(x) 29 | 30 | def optimizer(learning_rate=0.001): 31 | """Default optimizer name. Used in model.compile.""" 32 | return tf.keras.optimizers.Adagrad(lr=learning_rate) 33 | 34 | def loss(labels, output): 35 | """Default loss function. Used in model.compile.""" 36 | return tf.keras.losses.MSE(labels, output) 37 | 38 | def prepare_prediction_column(prediction): 39 | """Return the prediction directly.""" 40 | return prediction[0] 41 | 42 | def eval_metrics_fn(): 43 | return { 44 | "mse": lambda labels, predictions: tf.reduce_mean( 45 | tf.pow( 46 | tf.cast(predictions, tf.float64) - tf.cast(labels, tf.float64), 2) 47 | ) 48 | } 49 | -------------------------------------------------------------------------------- /sqlflow_models/gcn.py: -------------------------------------------------------------------------------- 1 | # Based on the code from: https://github.com/tkipf/keras-gcn 2 | import tensorflow as tf 3 | from tensorflow.keras import activations, initializers, constraints 4 | from tensorflow.keras import regularizers 5 | import tensorflow.keras.backend as K 6 | import scipy.sparse as sp 7 | import numpy as np 8 | import pickle, copy 9 | 10 | 11 | class GCN(tf.keras.Model): 12 | def __init__(self, nhid, nclass, epochs, train_ratio, eval_ratio, 13 | sparse_input=True, early_stopping=True, dropout=0.5, nlayer=2, feature_columns=None, 14 | id_col='id', feature_col='features', from_node_col='from_node_id', to_node_col='to_node_id'): 15 | """ 16 | Implementation of GCN in this paper: https://arxiv.org/pdf/1609.02907.pdf. The original tensorflow implementation 17 | is accessible here: https://github.com/tkipf/gcn, and one can find more information about GCN through: 18 | http://tkipf.github.io/graph-convolutional-networks/. 19 | :param nhid: Number of hidden units for GCN. 20 | type nhid: int. 21 | :param nclass: Number of classes in total which will be the output dimension. 22 | type nclass: int. 23 | :param epochs: Number of epochs for the model to be trained. 24 | type epochs: int. 25 | :param train_ratio: Percentage of data points to be used for training. 26 | type train_ratio: float. 27 | :param eval_ratio: Percentage of data points to be used for evaluating. 28 | type eval_ratio: float. 29 | :param early_stopping: Whether to use early stopping trick during the training phase. 30 | type early_stopping: bool. 31 | :param dropout: The rate for dropout. 32 | type dropout: float. 33 | :param nlayer: Number of GCNLayer to be used in the model. 34 | type nlayer: int. 35 | :param feature_columns: a list of tf.feature_column. (Not used in this model) 36 | type feature_columns: list. 37 | :param id_col: Name for the column in database to be used as the id of each node. 38 | type id_col: string. 
39 | :param feature_col: Name for the column in database to be used as the features of each node. 40 | type feature_col: string. 41 | :param from_node_col: Name for the column in database to be used as the from_node id of each edge. 42 | type from_node_col: string. 43 | :param to_node_col: Name for the column in database to be used as the to_node id of each edge. 44 | type to_node_col: string. 45 | """ 46 | super(GCN, self).__init__() 47 | 48 | assert dropout < 1 and dropout > 0, "Please make sure dropout rate is a float between 0 and 1." 49 | assert train_ratio < 1 and train_ratio > 0, "Please make sure train_ratio is a float between 0 and 1." 50 | assert eval_ratio < 1 and eval_ratio > 0, "Please make sure eval_ratio is a float between 0 and 1." 51 | self.gc_layers = list() 52 | self.gc_layers.append(GCNLayer(nhid, kernel_regularizer=tf.keras.regularizers.l2(5e-4), sparse_input=sparse_input)) 53 | for i in range(nlayer-1): 54 | self.gc_layers.append(GCNLayer(nhid, kernel_regularizer=tf.keras.regularizers.l2(5e-4))) 55 | self.gc_layers.append(GCNLayer(nclass)) 56 | self.keep_prob = 1 - dropout 57 | self.dropout = tf.keras.layers.Dropout(dropout) 58 | self.nshape = None 59 | self.train_ratio = train_ratio 60 | self.eval_ratio = eval_ratio 61 | self.nlayer = nlayer 62 | self.epochs = epochs 63 | self.early_stopping = early_stopping 64 | self.sparse_input = sparse_input 65 | self.id_col = id_col 66 | self.feature_col = feature_col 67 | self.from_node_col = from_node_col 68 | self.to_node_col = to_node_col 69 | # try to load the result file 70 | try: 71 | with open('./results.pkl', 'rb') as f: 72 | self.results = pickle.load(f) 73 | except (FileNotFoundError, IOError): 74 | self.results = None 75 | 76 | def call(self, data): 77 | x, adj = data 78 | assert self.nshape is not None, "Should calculate the shape of input by preprocessing the data with model.preprocess(data)." 79 | if self.sparse_input: 80 | x = GCN.sparse_dropout(x, self.keep_prob, self.nshape) 81 | else: 82 | x = self.dropout(x) 83 | for i in range(self.nlayer-1): 84 | x = tf.keras.activations.relu(self.gc_layers[i](x, adj)) 85 | x = self.dropout(x) 86 | x = self.gc_layers[-1](x, adj) 87 | 88 | return tf.keras.activations.softmax(x) 89 | 90 | def evaluate(self, data, y, sample_weight): 91 | """Function to evaluate the model.""" 92 | return self.test(sample_weight, return_loss=True) 93 | 94 | def predict(self, data): 95 | """Function to predict labels with the model.""" 96 | x, adj = data 97 | for i in range(self.nlayer-1): 98 | x = tf.keras.activations.relu(self.gc_layers[i](x, adj)) 99 | x = self.gc_layers[-1](x, adj) 100 | return tf.keras.activations.softmax(x) 101 | 102 | @staticmethod 103 | def sparse_dropout(x, keep_prob, noise_shape): 104 | """Dropout for sparse tensors.""" 105 | random_tensor = keep_prob 106 | random_tensor += tf.random.uniform(noise_shape) 107 | dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) 108 | pre_out = tf.sparse.retain(x, dropout_mask) 109 | return pre_out * (1./keep_prob) 110 | 111 | @staticmethod 112 | def encode_onehot(labels): 113 | classes = set(labels) 114 | classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)} 115 | labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32) 116 | return labels_onehot 117 | 118 | @staticmethod 119 | def normalize_adj(adjacency, symmetric=True): 120 | """ 121 | Function to normalize the adjacency matrix (get the laplacian matrix). 122 | :param adjacency: Adjacency matrix of the dataset. 
123 | type adjacency: Scipy COO_Matrix. 124 | :param symmetric: Boolean variable to determine whether to use symmetric laplacian. 125 | type symmetric: bool. 126 | """ 127 | adjacency += sp.eye(adjacency.shape[0]) 128 | if symmetric: 129 | """L=D^-0.5 * (A+I) * D^-0.5""" 130 | d = sp.diags(np.power(np.array(adjacency.sum(1)), -0.5).flatten(), 0) 131 | a_norm = adjacency.dot(d).transpose().dot(d).tocoo() 132 | else: 133 | """L=D^-1 * (A+I)""" 134 | d = sp.diags(np.power(np.array(adjacency.sum(1)), -1).flatten(), 0) 135 | a_norm = d.dot(adjacency).tocoo() 136 | 137 | return a_norm 138 | 139 | @staticmethod 140 | def normalize_feature(features, sparse_input): 141 | """Function to row-normalize the features input.""" 142 | rowsum = np.array(features.sum(1)) 143 | r_inv = np.power(rowsum, -1).flatten() 144 | r_inv[np.isinf(r_inv)] = 0. 145 | r_mat_inv = sp.diags(r_inv) 146 | features = r_mat_inv.dot(features) 147 | if sparse_input: 148 | return sp.csr_matrix(features).tocoo() 149 | else: 150 | return features 151 | 152 | def preprocess(self, ids, features, labels, edges): 153 | """Function to preprocess the node features and adjacency matrix.""" 154 | if len(features.shape) > 2: 155 | features = np.squeeze(features) 156 | if len(edges.shape) > 2: 157 | edges = np.squeeze(edges) 158 | # sort the data in the correct order 159 | idx = np.argsort(np.array(ids)) 160 | features = features[idx] 161 | labels = labels[idx] 162 | # preprocess 163 | features = GCN.normalize_feature(features, self.sparse_input) 164 | labels = GCN.encode_onehot(labels) 165 | adjacency = sp.coo_matrix((np.ones(len(edges)), 166 | (edges[:, 0], edges[:, 1])), 167 | shape=(features.shape[0], features.shape[0]), dtype="float32") 168 | 169 | adjacency = adjacency + adjacency.T.multiply(adjacency.T > adjacency) - adjacency.multiply(adjacency.T > adjacency) 170 | adjacency = GCN.normalize_adj(adjacency, symmetric=True) 171 | 172 | nf_shape = features.data.shape 173 | na_shape = adjacency.data.shape 174 | if self.sparse_input: 175 | features = tf.SparseTensor( 176 | indices=np.array(list(zip(features.row, features.col)), dtype=np.int64), 177 | values=tf.cast(features.data, tf.float32), 178 | dense_shape=features.shape) 179 | features = tf.sparse.reorder(features) 180 | adjacency = tf.SparseTensor( 181 | indices=np.array(list(zip(adjacency.row, adjacency.col)), dtype=np.int64), 182 | values=tf.cast(adjacency.data, tf.float32), 183 | dense_shape=adjacency.shape) 184 | adjacency = tf.sparse.reorder(adjacency) 185 | 186 | total_num = features.shape[0] 187 | train_num = round(total_num*self.train_ratio) 188 | eval_num = round(total_num*self.eval_ratio) 189 | train_index = np.arange(train_num) 190 | val_index = np.arange(train_num, train_num+eval_num) 191 | test_index = np.arange(train_num+eval_num, total_num) 192 | 193 | self.train_mask = np.zeros(total_num, dtype = np.bool) 194 | self.val_mask = np.zeros(total_num, dtype = np.bool) 195 | self.test_mask = np.zeros(total_num, dtype = np.bool) 196 | self.train_mask[train_index] = True 197 | self.val_mask[val_index] = True 198 | self.test_mask[test_index] = True 199 | 200 | print('Dataset has {} nodes, {} edges, {} features.'.format(features.shape[0], edges.shape[0], features.shape[1])) 201 | 202 | return features, labels, adjacency, nf_shape, na_shape 203 | 204 | def loss_func(self, model, x, y, train_mask, training=True): 205 | '''Customed loss function for the model.''' 206 | 207 | y_ = model(x, training=training) 208 | 209 | test_mask_logits = tf.gather_nd(y_, tf.where(train_mask)) 210 
| masked_labels = tf.gather_nd(y, tf.where(train_mask)) 211 | 212 | return loss(labels=masked_labels, output=test_mask_logits) 213 | 214 | def grad(self, model, inputs, targets, train_mask): 215 | '''Calculate the gradients of the parameters.''' 216 | with tf.GradientTape() as tape: 217 | loss_value = self.loss_func(model, inputs, targets, train_mask) 218 | 219 | return loss_value, tape.gradient(loss_value, model.trainable_variables) 220 | 221 | def test(self, mask, return_loss=False): 222 | '''Test the results on the model. Return accuracy''' 223 | logits = self.predict(data=[self.features, self.adjacency]) 224 | 225 | test_mask_logits = tf.gather_nd(logits, tf.where(mask)) 226 | masked_labels = tf.gather_nd(self.labels, tf.where(mask)) 227 | 228 | ll = tf.equal(tf.argmax(masked_labels, -1), tf.argmax(test_mask_logits, -1)) 229 | accuracy = tf.reduce_mean(tf.cast(ll, dtype=tf.float32)) 230 | 231 | if return_loss: 232 | loss_value = loss(labels=masked_labels, output=test_mask_logits) 233 | return [loss_value, accuracy] 234 | 235 | return accuracy 236 | 237 | def sqlflow_train_loop(self, x): 238 | """Customized training function.""" 239 | # load data 240 | ids, ids_check, features, labels, edges, edge_check = list(), dict(), list(), list(), list(), dict() 241 | from_node = 0 242 | for inputs, label in x: 243 | id = inputs[self.id_col].numpy().astype(np.int32) 244 | feature = inputs[self.feature_col].numpy().astype(np.float32) 245 | from_node = inputs[self.from_node_col].numpy().astype(np.int32) 246 | to_node = inputs[self.to_node_col].numpy().astype(np.int32) 247 | if int(id) not in ids_check: 248 | ids.append(int(id)) 249 | features.append(feature) 250 | labels.append(label.numpy()[0]) 251 | ids_check[int(id)] = 0 252 | if tuple([int(from_node), int(to_node)]) not in edge_check: 253 | edge_check[tuple([int(from_node), int(to_node)])] = 0 254 | edges.append([from_node, to_node]) 255 | features = np.stack(features) 256 | labels = np.stack(labels) 257 | edges = np.stack(edges) 258 | 259 | self.features, self.labels, self.adjacency, self.nshape, na_shape = self.preprocess(ids, features, labels, edges) 260 | # training the model 261 | wait = 0 262 | best_acc = -9999999 263 | PATIENCE = 10 264 | for epoch in range(self.epochs): 265 | # calculate the gradients and take the step 266 | loss_value, grads = self.grad(self, [self.features, self.adjacency], self.labels, self.train_mask) 267 | optimizer().apply_gradients(zip(grads, self.trainable_variables)) 268 | # Test on train and evaluate dataset 269 | train_acc = self.test(self.train_mask) 270 | val_acc = self.test(self.val_mask) 271 | print("Epoch {} loss={:6f} accuracy={:6f} val_acc={:6f}".format(epoch, loss_value, train_acc, val_acc)) 272 | # early stopping 273 | if epoch > 50 and self.early_stopping: 274 | if float(val_acc.numpy()) > best_acc: 275 | best_acc = float(val_acc.numpy()) 276 | wait = 0 277 | else: 278 | if wait >= PATIENCE: 279 | print('Epoch {}: early stopping'.format(epoch)) 280 | break 281 | wait += 1 282 | # evaluate the model 283 | result = self.evaluate(data=[self.features, self.adjacency], y=self.labels, sample_weight=self.val_mask) 284 | # get all the results 285 | predicted = self.predict([self.features, self.adjacency]) 286 | # store the results in a pickled file 287 | with open('./results.pkl', 'wb') as f: 288 | results = dict() 289 | for i in range(len(ids)): 290 | results[str(ids[i])] = predicted[i] 291 | results['evaluation'] = result 292 | pickle.dump(results, f) 293 | self.results = results 294 | 295 | def 
sqlflow_evaluate_loop(self, x, metric_names): 296 | """Customed evaluation, can only support calculating the accuracy.""" 297 | assert self.results is not None, "Please make sure to train the model first." 298 | eval_result = self.results['evaluation'] 299 | return eval_result 300 | 301 | def sqlflow_predict_one(self, sample): 302 | """Customed prediction, sample must be the node id.""" 303 | assert self.results is not None, "Please make sure to train the model first." 304 | prediction = self.results[str(int(sample))] 305 | return [prediction] 306 | 307 | def optimizer(): 308 | """Default optimizer name. Used in model.compile.""" 309 | return tf.keras.optimizers.Adam(lr=0.01) 310 | 311 | def loss(labels, output): 312 | """Default loss function for classification task.""" 313 | criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False) 314 | return criterion(y_true=labels, y_pred=output) 315 | 316 | # Graph Convolutional Layer 317 | class GCNLayer(tf.keras.layers.Layer): 318 | 319 | def __init__(self, units, use_bias=True, sparse_input=False, 320 | kernel_initializer='glorot_uniform', 321 | bias_initializer='zeros', 322 | kernel_regularizer=None, 323 | bias_regularizer=None, 324 | kernel_constraint=None, 325 | bias_constraint=None, 326 | **kwargs): 327 | """GCNLayer 328 | Graph Convolutional Networks Layer from paper: https://arxiv.org/pdf/1609.02907.pdf. This is used in the GCN model for 329 | classification task on graph-structured data. 330 | :param units: Number of hidden units for the layer. 331 | type units: int. 332 | :param use_bias: Boolean variable to determine whether to use bias. 333 | type use_bias: bool. 334 | :param sparse_input: Boolean variable to check if input tensor is sparse. 335 | type sparse_input: bool. 336 | :param kernel_initializer: Weight initializer for the GCN kernel. 337 | :param bias_initializer: Weight initializer for the bias. 338 | :param kernel_regularizer: Weight regularizer for the GCN kernel. 339 | :param bias_regularizer: Weight regularizer for the bias. 340 | :param kernel_constraint: Weight value constraint for the GCN kernel. 341 | :param bias_constraint: Weight value constraint for the bias. 
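        In call(), the layer computes output = adj @ (inputs @ kernel) (+ bias), i.e. the
        GCN propagation rule, where `adj` is the normalized (sparse) adjacency matrix.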
342 | :param kwargs: 343 | """ 344 | if 'input_shape' not in kwargs and 'input_dim' in kwargs: 345 | kwargs['input_shape'] = (kwargs.pop('input_dim'),) 346 | super(GCNLayer, self).__init__(**kwargs) 347 | self.units = units 348 | self.use_bias = use_bias 349 | self.sparse_input = sparse_input 350 | self.kernel_initializer = initializers.get(kernel_initializer) 351 | self.bias_initializer = initializers.get(bias_initializer) 352 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 353 | self.bias_regularizer = regularizers.get(bias_regularizer) 354 | self.kernel_constraint = constraints.get(kernel_constraint) 355 | self.bias_constraint = constraints.get(bias_constraint) 356 | 357 | def build(self, input_shape): 358 | self.kernel = self.add_weight(shape=(input_shape[-1], self.units), 359 | initializer=self.kernel_initializer, 360 | name='kernel', 361 | regularizer=self.kernel_regularizer, 362 | constraint=self.kernel_constraint, 363 | trainable=True) 364 | if self.use_bias: 365 | self.bias = self.add_weight(shape=(self.units,), 366 | initializer=self.bias_initializer, 367 | name='bias', 368 | regularizer=self.bias_regularizer, 369 | constraint=self.bias_constraint, 370 | trainable=True) 371 | self.built = True 372 | 373 | def call(self, inputs, adj, **kwargs): 374 | assert isinstance(adj, tf.SparseTensor), "Adjacency matrix should be a SparseTensor" 375 | if self.sparse_input: 376 | assert isinstance(inputs, tf.SparseTensor), "Input matrix should be a SparseTensor" 377 | support = tf.sparse.sparse_dense_matmul(inputs, self.kernel) 378 | else: 379 | support = tf.matmul(inputs, self.kernel) 380 | output = tf.sparse.sparse_dense_matmul(adj, support) 381 | if self.use_bias: 382 | output = output + self.bias 383 | else: 384 | output = output 385 | return output 386 | 387 | def get_config(self): 388 | config = {'units': self.units, 389 | 'use_bias': self.use_bias, 390 | 'sparse_input': self.sparse_input, 391 | 'kernel_initializer': initializers.serialize( 392 | self.kernel_initializer), 393 | 'bias_initializer': initializers.serialize( 394 | self.bias_initializer), 395 | 'kernel_regularizer': regularizers.serialize( 396 | self.kernel_regularizer), 397 | 'bias_regularizer': regularizers.serialize( 398 | self.bias_regularizer), 399 | 'kernel_constraint': constraints.serialize( 400 | self.kernel_constraint), 401 | 'bias_constraint': constraints.serialize(self.bias_constraint) 402 | } 403 | base_config = super(GCNLayer, self).get_config() 404 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /sqlflow_models/native_keras.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class RawDNNClassifier(tf.keras.Model): 4 | def __init__(self, hidden_units=[100,100], n_classes=3): 5 | super(RawDNNClassifier, self).__init__() 6 | self.feature_layer = None 7 | self.n_classes = n_classes 8 | self.hidden_layers = [] 9 | for hidden_unit in hidden_units: 10 | self.hidden_layers.append(tf.keras.layers.Dense(hidden_unit, activation='relu')) 11 | if self.n_classes == 2: 12 | pred_act = 'sigmoid' 13 | n_out = 1 14 | else: 15 | pred_act = 'softmax' 16 | n_out = self.n_classes 17 | self.prediction_layer = tf.keras.layers.Dense(n_out, activation=pred_act) 18 | 19 | def call(self, inputs, training=True): 20 | if self.feature_layer is not None: 21 | x = self.feature_layer(inputs) 22 | else: 23 | x = tf.keras.layers.Flatten()(inputs) 24 | for hidden_layer 
in self.hidden_layers: 25 | x = hidden_layer(x) 26 | return self.prediction_layer(x) 27 | -------------------------------------------------------------------------------- /sqlflow_models/one_class_svm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The SQLFlow Authors. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | import pickle 16 | 17 | import numpy as np 18 | import tensorflow as tf 19 | from sklearn.svm import OneClassSVM as SklearnOneClassSVM 20 | 21 | MODEL_DIR = "model_save" 22 | MODEL_PATH = MODEL_DIR + "/one_class_svm_model" 23 | 24 | ENABLE_EAGER_EXECUTION = False 25 | 26 | try: 27 | tf.enable_eager_execution() 28 | ENABLE_EAGER_EXECUTION = True 29 | except Exception: 30 | try: 31 | tf.compat.v1.enable_eager_execution() 32 | ENABLE_EAGER_EXECUTION = True 33 | except Exception: 34 | ENABLE_EAGER_EXECUTION = False 35 | 36 | 37 | def dataset_reader(dataset): 38 | if ENABLE_EAGER_EXECUTION: 39 | for features in dataset: 40 | yield features 41 | else: 42 | iter = dataset.make_one_shot_iterator() 43 | one_element = iter.get_next() 44 | with tf.Session() as sess: 45 | try: 46 | while True: 47 | yield sess.run(one_element) 48 | except tf.errors.OutOfRangeError: 49 | pass 50 | 51 | 52 | class OneClassSVM(tf.keras.Model): 53 | def __init__(self, 54 | feature_columns=None, 55 | kernel='rbf', 56 | degree=3, 57 | gamma='scale', 58 | coef0=0.0, 59 | tol=0.001, 60 | nu=0.5, 61 | shrinking=True, 62 | cache_size=200, 63 | verbose=False, 64 | max_iter=-1): 65 | if os.path.exists(MODEL_PATH): 66 | with open(MODEL_PATH, "rb") as f: 67 | self.svm = pickle.load(f) 68 | else: 69 | self.svm = SklearnOneClassSVM(kernel=kernel, 70 | degree=degree, 71 | gamma=gamma, 72 | coef0=coef0, 73 | tol=tol, 74 | nu=nu, 75 | shrinking=shrinking, 76 | cache_size=cache_size, 77 | verbose=verbose, 78 | max_iter=max_iter) 79 | 80 | def concat_features(self, features): 81 | assert isinstance(features, dict) 82 | each_feature = [] 83 | for k, v in features.items(): 84 | if ENABLE_EAGER_EXECUTION: 85 | v = v.numpy() 86 | each_feature.append(v) 87 | return np.concatenate(each_feature, axis=1) 88 | 89 | def sqlflow_train_loop(self, dataset): 90 | X = [] 91 | for features in dataset_reader(dataset): 92 | X.append(self.concat_features(features)) 93 | X = np.concatenate(X) 94 | 95 | self.svm.fit(X) 96 | 97 | if not os.path.exists(MODEL_DIR): 98 | os.mkdir(MODEL_DIR) 99 | 100 | with open(MODEL_PATH, "wb") as f: 101 | pickle.dump(self.svm, f, protocol=2) 102 | 103 | def sqlflow_predict_one(self, features): 104 | features = self.concat_features(features) 105 | pred = self.svm.predict(features) 106 | score = self.svm.decision_function(features) 107 | return pred, score 108 | -------------------------------------------------------------------------------- /sqlflow_models/rnn_based_time_series.py: -------------------------------------------------------------------------------- 1 | import tensorflow as 
tf 2 | 3 | class RNNBasedTimeSeriesModel(tf.keras.Model): 4 | 5 | def __init__(self, 6 | feature_columns=None, 7 | stack_units=[500, 500], 8 | n_in=7, 9 | n_out=1, 10 | n_features=1, 11 | model_type='rnn'): 12 | """RNNBasedTimeSeriesModel 13 | :param feature_columns: All columns must be embedding of sequence column with same sequence_length. 14 | type feature_columns: list[tf.feature_column.numeric_column]. 15 | :param stack_units: Units for RNN layer. 16 | type stack_units: vector of ints. 17 | :param n_in: Size of time window. 18 | type n_in: int. 19 | :param n_out: Number of predicted labels. 20 | type n_out: int. 21 | :param n_features: number of features in every time window. 22 | type n_features: int. 23 | :param model_type: Specific RNN model to be used, which can be chose from: ('rnn', 'lstm' and 'gru'). 24 | type model_type: string. 25 | """ 26 | super(RNNBasedTimeSeriesModel, self).__init__(name='RNN_TS_Model') 27 | # Common 28 | self.feature_columns = feature_columns 29 | self.loss = loss 30 | self.n_out = n_out 31 | self.n_in = n_in 32 | self.n_features = n_features 33 | self.stack_units = stack_units 34 | self.models = {'rnn':tf.keras.layers.SimpleRNN, 'lstm':tf.keras.layers.LSTM, 'gru':tf.keras.layers.GRU} 35 | # combines all the data as a dense tensor 36 | self.feature_layer = None 37 | if feature_columns is not None: 38 | self.feature_layer = tf.keras.layers.DenseFeatures(feature_columns) 39 | self.stack_layers = [] 40 | for unit in self.stack_units[:-1]: 41 | self.stack_layers.append(self.models[model_type.lower()](unit, input_shape=(self.n_in, self.n_features), return_sequences=True)) 42 | self.rnn = self.models[model_type.lower()](self.stack_units[-1], input_shape=(self.n_in, self.n_features)) 43 | self.dropout = tf.keras.layers.Dropout(0.2) 44 | self.prediction_layer = tf.keras.layers.Dense(self.n_out) 45 | 46 | def call(self, inputs): 47 | if self.feature_layer: 48 | x = self.feature_layer(inputs) 49 | else: 50 | x = inputs 51 | x = tf.reshape(x, (-1, self.n_in, self.n_features)) 52 | for i in range(len(self.stack_units) - 1): 53 | x = self.stack_layers[i](x) 54 | x = self.rnn(x) 55 | x = self.dropout(x) 56 | return self.prediction_layer(x) 57 | 58 | def optimizer(learning_rate=0.001): 59 | """Default optimizer name. Used in model.compile.""" 60 | return tf.keras.optimizers.Adam(lr=learning_rate) 61 | 62 | def prepare_prediction_column(prediction): 63 | """Return the prediction directly.""" 64 | return prediction 65 | 66 | def loss(labels, output): 67 | return tf.reduce_mean(tf.keras.losses.MSE(labels, output)) 68 | 69 | -------------------------------------------------------------------------------- /sqlflow_models/rnnclassifier.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | _loss = '' 4 | 5 | class StackedRNNClassifier(tf.keras.Model): 6 | def __init__(self, feature_columns=None, stack_units=[32], hidden_size=64, n_classes=2, model_type='rnn', bidirectional=False): 7 | """StackedRNNClassifier 8 | :param feature_columns: All columns must be embedding of sequence column with same sequence_length. 9 | :type feature_columns: list[tf.embedding_column]. 10 | :param stack_units: Units for RNN layer. 11 | :type stack_units: vector of ints. 12 | :param n_classes: Target number of classes. 13 | :type n_classes: int. 14 | :param model_type: Specific RNN model to be used, which can be chose from: ('rnn', 'lstm' and 'gru'). 15 | :type model_type: string. 
16 | :param bidirectional: Whether to use bidirectional or not. 17 | :type bidirectional: bool. 18 | """ 19 | global _loss 20 | super(StackedRNNClassifier, self).__init__() 21 | 22 | self.models = {'rnn':tf.keras.layers.SimpleRNN, 'lstm':tf.keras.layers.LSTM, 'gru':tf.keras.layers.GRU} 23 | self.bidirectionals = {True: tf.keras.layers.Bidirectional, False: lambda x: x} 24 | self.feature_layer = None 25 | if feature_columns is not None: 26 | self.feature_layer = tf.keras.experimental.SequenceFeatures(feature_columns) 27 | self.stack_rnn = [] 28 | self.stack_size = len(stack_units) 29 | self.stack_units = stack_units 30 | self.n_classes = n_classes 31 | if self.stack_size > 1: 32 | for i in range(self.stack_size - 1): 33 | self.stack_rnn.append( 34 | self.bidirectionals[bidirectional](self.models[model_type.lower()](self.stack_units[i], return_sequences=True)) 35 | ) 36 | self.rnn = self.bidirectionals[bidirectional](self.models[model_type.lower()](self.stack_units[-1])) 37 | self.hidden = tf.keras.layers.Dense(hidden_size, activation='relu') 38 | if self.n_classes == 2: 39 | # special setup for binary classification 40 | pred_act = 'sigmoid' 41 | _loss = 'binary_crossentropy' 42 | n_out = 1 43 | else: 44 | pred_act = 'softmax' 45 | _loss = 'categorical_crossentropy' 46 | n_out = self.n_classes 47 | self.pred = tf.keras.layers.Dense(n_out, activation=pred_act) 48 | 49 | def call(self, inputs): 50 | if self.feature_layer: 51 | x, seq_len = self.feature_layer(inputs) 52 | else: 53 | x, seq_len = inputs 54 | seq_mask = tf.sequence_mask(seq_len) 55 | if self.stack_size > 1: 56 | for i in range(self.stack_size - 1): 57 | x = self.stack_rnn[i](x, mask=seq_mask) 58 | x = self.rnn(x, mask=seq_mask) 59 | x = self.hidden(x) 60 | return self.pred(x) 61 | 62 | def optimizer(): 63 | """Default optimizer name. 
Used in model.compile.""" 64 | return 'adam' 65 | 66 | def loss(labels, output): 67 | global _loss 68 | if _loss == "binary_crossentropy": 69 | return tf.reduce_mean(tf.keras.losses.binary_crossentropy(labels, output)) 70 | elif _loss == "categorical_crossentropy": 71 | return tf.reduce_mean(tf.keras.losses.categorical_crossentropy(labels, output)) 72 | 73 | def prepare_prediction_column(prediction): 74 | """Return the class label of highest probability.""" 75 | return prediction.argmax(axis=-1) 76 | 77 | def eval_metrics_fn(): 78 | return { 79 | "accuracy": lambda labels, predictions: tf.equal( 80 | tf.argmax(predictions, 1, output_type=tf.int32), 81 | tf.cast(tf.reshape(labels, [-1]), tf.int32), 82 | ) 83 | } 84 | -------------------------------------------------------------------------------- /sqlflow_models/score_card.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.python.data import make_one_shot_iterator 6 | from tensorflow.keras.losses import kld 7 | from tensorflow.keras.optimizers import SGD 8 | import numpy as np 9 | import pandas as pd 10 | import scipy.stats.stats as stats 11 | import sklearn 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.metrics import roc_auc_score, auc 15 | import pickle 16 | 17 | 18 | def optimizer(): 19 | return None 20 | 21 | 22 | def loss(): 23 | return None 24 | 25 | 26 | class ScoreCard(keras.Model): 27 | 28 | def __init__(self, feature_columns=None, pf_bin_size=5): 29 | super(ScoreCard, self).__init__(name='ScoreCard') 30 | 31 | self._target_score = 600 32 | self._factor = 20/np.log(2) 33 | self._offset = 600 - 20*np.log(20) / np.log(2) 34 | self._bins = dict() 35 | self._pf_bin_size = pf_bin_size 36 | 37 | def _pf_bin(self, y, x): 38 | # population frequency bucket 39 | bad_num = y.sum() 40 | good_num = y.count() - y.sum() 41 | d1 = pd.DataFrame({'x': x,'y': y,'bucket': pd.qcut(x, self._pf_bin_size, duplicates='drop')}) 42 | d2 = d1.groupby('bucket',as_index=True) 43 | d3 = pd.DataFrame(d2.x.min(),columns=['min_bin']) 44 | 45 | d3["min"] = d2.min().x 46 | d3["max"] = d2.max().x 47 | d3["badcostum"] = d2.sum().y 48 | d3["goodcostum"] = d2.count().y - d2.sum().y 49 | d3["total"] = d2.count().y 50 | d3["bad_rate"] = d2.sum().y/d2.count().y 51 | d3["woe"] = np.log(d3["badcostum"]/d3["goodcostum"]*good_num/bad_num) 52 | iv = ((d3["badcostum"]/bad_num-d3["goodcostum"]/good_num)*d3["woe"]) 53 | d3["iv"] = iv 54 | woe = list(d3["woe"].round(6)) 55 | cut = list(d3["max"].round(6)) 56 | cut.insert(0, float("-inf")) 57 | cut[-1] = float("inf") 58 | return d3, cut, woe, iv 59 | 60 | def _to_dataframe(self, dataset): 61 | x_df = pd.DataFrame() 62 | y_df = pd.DataFrame() 63 | for _, minibatch in enumerate(dataset): 64 | data, label = minibatch 65 | dx = {} 66 | dy = {} 67 | for name, value in data.items(): 68 | dx[name] = value.numpy()[0][0] 69 | x_df = x_df.append(dx, ignore_index=True) 70 | dy['label'] = label.numpy()[0] 71 | y_df = y_df.append(dy, ignore_index=True) 72 | return x_df, y_df 73 | 74 | def _replace_woe(self, x, cut, woe): 75 | return pd.cut(x, cut, labels=pd.Categorical(woe)) 76 | 77 | def _woe_encoder(self, x, y): 78 | x_train_dict = {} 79 | for col in x.columns: 80 | dfx, cut, woe, iv = self._pf_bin(y, x[col]) 81 | self._bins[col] = (dfx, cut, woe, iv) 82 | # replacing by the WOE encode 83 | x_train_dict[col] = 
self._replace_woe(x[col], cut, woe) 84 | return pd.DataFrame.from_dict(x_train_dict) 85 | 86 | def sqlflow_train_loop(self, dataset, epochs=1, verbose=0): 87 | x_df, y_df = self._to_dataframe(dataset) 88 | x = self._woe_encoder(x_df, y_df['label']) 89 | x.to_csv("/tmp/train_woe.csv") 90 | lr = LogisticRegression() 91 | 92 | x_train, x_test, y_train, y_test = train_test_split(x, y_df['label']) 93 | lr.fit(x_train, y_train) 94 | prob = lr.predict_proba(x_test)[:, 1] 95 | auc_score = roc_auc_score(y_test, prob) 96 | print("AUC: {}\n".format(auc_score)) 97 | 98 | # print the score card 99 | print("TARGET SCORE: %d" % self._target_score) 100 | coe = lr.coef_ 101 | for i, col_name in enumerate(x_df.columns): 102 | bin_cols = self._bins[col_name][0].index.to_list() 103 | for j, w in enumerate(self._bins[col_name][2]): 104 | print(col_name, bin_cols[j], 105 | round(coe[0][i] * w * self._factor + 106 | self._offset/self._pf_bin_size, 0)) 107 | -------------------------------------------------------------------------------- /sqlflow_models/simple_dnn_generator.py: -------------------------------------------------------------------------------- 1 | # This file is based on the AdaNet example 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import functools 7 | import adanet 8 | import tensorflow as tf 9 | 10 | _NUM_LAYERS_KEY = "num_layers" 11 | 12 | 13 | class _SimpleDNNBuilder(adanet.subnetwork.Builder): 14 | """Builds a DNN subnetwork for AdaNet.""" 15 | 16 | def __init__(self, feature_columns, optimizer, layer_size, num_layers, learn_mixture_weights, 17 | seed): 18 | """Initializes a `_DNNBuilder`. 19 | 20 | Args: 21 | optimizer: An `Optimizer` instance for training both the subnetwork and 22 | the mixture weights. 23 | layer_size: The number of nodes to output at each hidden layer. 24 | num_layers: The number of hidden layers. 25 | learn_mixture_weights: Whether to solve a learning problem to find the 26 | best mixture weights, or use their default value according to the 27 | mixture weight type. When `False`, the subnetworks will return a no_op 28 | for the mixture weight train op. 29 | seed: A random seed. 30 | 31 | Returns: 32 | An instance of `_SimpleDNNBuilder`. 
33 | """ 34 | 35 | self._optimizer = optimizer 36 | self._layer_size = layer_size 37 | self._num_layers = num_layers 38 | self._learn_mixture_weights = learn_mixture_weights 39 | self._feature_columns = feature_columns 40 | self._seed = seed 41 | 42 | def build_subnetwork(self, 43 | features, 44 | logits_dimension, 45 | training, 46 | iteration_step, 47 | summary, 48 | previous_ensemble=None): 49 | """See `adanet.subnetwork.Builder`.""" 50 | 51 | input_layer = tf.compat.v1.feature_column.input_layer(features, self._feature_columns) 52 | kernel_initializer = tf.compat.v1.glorot_uniform_initializer(seed=self._seed) 53 | last_layer = input_layer 54 | for _ in range(self._num_layers): 55 | last_layer = tf.compat.v1.layers.dense( 56 | last_layer, 57 | units=self._layer_size, 58 | activation=tf.nn.relu, 59 | kernel_initializer=kernel_initializer) 60 | logits = tf.compat.v1.layers.dense( 61 | last_layer, 62 | units=logits_dimension, 63 | kernel_initializer=kernel_initializer) 64 | 65 | persisted_tensors = {_NUM_LAYERS_KEY: tf.constant(self._num_layers)} 66 | return adanet.Subnetwork( 67 | last_layer=last_layer, 68 | logits=logits, 69 | complexity=self._measure_complexity(), 70 | persisted_tensors=persisted_tensors) 71 | 72 | def _measure_complexity(self): 73 | """Approximates Rademacher complexity as the square-root of the depth.""" 74 | return tf.sqrt(tf.cast(self._num_layers, tf.float32)) 75 | 76 | def build_subnetwork_train_op(self, subnetwork, loss, var_list, labels, 77 | iteration_step, summary, previous_ensemble): 78 | """See `adanet.subnetwork.Builder`.""" 79 | return self._optimizer.minimize(loss=loss, var_list=var_list) 80 | 81 | def build_mixture_weights_train_op(self, loss, var_list, logits, labels, 82 | iteration_step, summary): 83 | """See `adanet.subnetwork.Builder`.""" 84 | 85 | if not self._learn_mixture_weights: 86 | return tf.no_op() 87 | return self._optimizer.minimize(loss=loss, var_list=var_list) 88 | 89 | @property 90 | def name(self): 91 | """See `adanet.subnetwork.Builder`.""" 92 | 93 | if self._num_layers == 0: 94 | # A DNN with no hidden layers is a linear model. 95 | return "linear" 96 | return "{}_layer_dnn".format(self._num_layers) 97 | 98 | 99 | class SimpleDNNGenerator(adanet.subnetwork.Generator): 100 | """Generates a two DNN subnetworks at each iteration. 101 | 102 | The first DNN has an identical shape to the most recently added subnetwork 103 | in `previous_ensemble`. The second has the same shape plus one more dense 104 | layer on top. This is similar to the adaptive network presented in Figure 2 of 105 | [Cortes et al. ICML 2017](https://arxiv.org/abs/1607.01097), without the 106 | connections to hidden layers of networks from previous iterations. 107 | """ 108 | 109 | def __init__(self, optimizers, feature_columns, layer_size, learn_mixture_weights, seed): 110 | """Initializes a DNN `Generator`. 111 | 112 | Args: 113 | optimizers: A defaultdict of string for training both the subnetwork and 114 | the mixture weights. 115 | layer_size: Number of nodes in each hidden layer of the subnetwork 116 | candidates. Note that this parameter is ignored in a DNN with no hidden 117 | layers. 118 | learn_mixture_weights: Whether to solve a learning problem to find the 119 | best mixture weights, or use their default value according to the 120 | mixture weight type. When `False`, the subnetworks will return a no_op 121 | for the mixture weight train op. 122 | seed: A random seed. 123 | 124 | Returns: 125 | An instance of `Generator`. 
126 | """ 127 | 128 | self._seed = seed 129 | self._optimizers = optimizers 130 | self._dnn_builder_fn = functools.partial( 131 | _SimpleDNNBuilder, 132 | layer_size=layer_size, 133 | feature_columns=feature_columns, 134 | learn_mixture_weights=learn_mixture_weights) 135 | 136 | def generate_candidates(self, previous_ensemble, iteration_number, 137 | previous_ensemble_reports, all_reports): 138 | """See `adanet.subnetwork.Generator`.""" 139 | 140 | num_layers = 0 141 | seed = self._seed 142 | if previous_ensemble: 143 | num_layers = tf.get_static_value( 144 | previous_ensemble.weighted_subnetworks[ 145 | -1].subnetwork.persisted_tensors[_NUM_LAYERS_KEY]) 146 | if seed is not None: 147 | seed += iteration_number 148 | optimizer = self._optimizers[num_layers + 0] 149 | return [self._dnn_builder_fn(num_layers=num_layers, optimizer=optimizer, seed=seed), 150 | self._dnn_builder_fn(num_layers=num_layers + 1, optimizer=optimizer, seed=seed)] 151 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sql-machine-learning/models/5dc6421f562ea447e501fa355a48a6ee89856a1d/tests/__init__.py -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import unittest 3 | import sys 4 | 5 | def train_input_fn(features, labels, batch_size=32): 6 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 7 | dataset = dataset.shuffle(1000).repeat().batch(batch_size) 8 | return dataset 9 | 10 | 11 | def eval_input_fn(features, labels, batch_size=32): 12 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 13 | dataset = dataset.batch(batch_size) 14 | return dataset 15 | 16 | class BaseTestCases: 17 | class BaseTest(unittest.TestCase): 18 | def setUp(self): 19 | self.model, self.features, self.label = None, {}, None 20 | 21 | def test_train_and_predict(self): 22 | self.setUp() 23 | model_pkg = sys.modules[self.model_class.__module__] 24 | self.model.compile(optimizer=model_pkg.optimizer(), 25 | loss=model_pkg.loss, 26 | metrics=["accuracy"]) 27 | self.history = self.model.fit(train_input_fn(self.features, self.label), 28 | epochs=10, 29 | steps_per_epoch=200, 30 | verbose=1) 31 | self.historyloss = self.history.history['loss'] 32 | loss_decline_rate = (self.historyloss[0] - self.historyloss[-1]) \ 33 | / self.historyloss[0] 34 | print('historyloss is {}, and the loss_decline_rate is {}'.\ 35 | format(self.historyloss, loss_decline_rate)) 36 | assert(loss_decline_rate > 0.3) 37 | 38 | class BaseEstimatorTest(BaseTest): 39 | def test_train_and_predict(self): 40 | self.setUp() 41 | input_fn = lambda: train_input_fn(self.features, self.label) 42 | train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1) 43 | eval_spec = tf.estimator.EvalSpec(input_fn=lambda: eval_input_fn(self.features, self.label)) 44 | baseline = tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec)[0] 45 | train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=2000) 46 | result = tf.estimator.train_and_evaluate(self.model, train_spec, eval_spec)[0] 47 | loss_decline_rate = 1- result["loss"] / baseline["loss"] 48 | print('historyloss is {}, and the loss_decline_rate is {}'.\ 49 | format(baseline["loss"], loss_decline_rate)) 50 | assert(loss_decline_rate > 0.3) 51 | 
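The BaseTestCases.BaseTest harness above captures the contract the Keras-style models in this repository are expected to satisfy: the module defining the model exposes optimizer() and loss, the test compiles the model with them, fits it on train_input_fn, and asserts that the training loss falls by more than 30%. The sketch below replays that flow outside unittest using sqlflow_models.DNNClassifier on the iris data; the epoch and step counts are illustrative choices, not values required by the harness.

# Minimal sketch of the training contract exercised by tests/base.py.
# Assumes sqlflow_models and scikit-learn are importable; numbers are illustrative.
import sys

import tensorflow as tf
from sklearn.datasets import load_iris

import sqlflow_models

x, y = load_iris(return_X_y=True)
features = {'col_{}'.format(i): col for i, col in enumerate(x.T)}
feature_columns = [tf.feature_column.numeric_column(k) for k in features]

model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=3)
model_pkg = sys.modules[type(model).__module__]   # module providing optimizer() and loss

model.compile(optimizer=model_pkg.optimizer(), loss=model_pkg.loss, metrics=['accuracy'])

dataset = tf.data.Dataset.from_tensor_slices((features, y)).shuffle(1000).repeat().batch(32)
history = model.fit(dataset, epochs=10, steps_per_epoch=200, verbose=1)

losses = history.history['loss']
print('loss decline rate:', (losses[0] - losses[-1]) / losses[0])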
-------------------------------------------------------------------------------- /tests/test_arima_with_stl_decomposition.py: -------------------------------------------------------------------------------- 1 | from sqlflow_models import ARIMAWithSTLDecomposition 2 | import unittest 3 | import tensorflow as tf 4 | from datetime import datetime, timedelta 5 | import numpy as np 6 | 7 | class TestARIMAWithSTLDecompose(unittest.TestCase): 8 | def setUp(self): 9 | self.order = [7, 0, 2] 10 | self.period = [7, 30] 11 | self.date_format = '%Y-%m-%d' 12 | self.train_start = '2014-04-01' 13 | self.train_end = '2014-08-31' 14 | self.forecast_start = '2014-09-01' 15 | self.forecast_end = '2014-09-30' 16 | 17 | def str2datetime(self, date_str): 18 | if isinstance(date_str, bytes): 19 | date_str = date_str.decode('utf-8') 20 | return datetime.strptime(str(date_str), self.date_format) 21 | 22 | def datetime2str(self, date): 23 | return datetime.strftime(date, self.date_format) 24 | 25 | def create_dataset(self): 26 | def generator(): 27 | start_date = self.str2datetime(self.train_start) 28 | end_date = self.str2datetime(self.train_end) 29 | delta = timedelta(days=1) 30 | while start_date <= end_date: 31 | date_str = np.array(self.datetime2str(start_date)) 32 | label = np.random.random(size=[1]) * 1e8 33 | yield date_str, label 34 | start_date += delta 35 | 36 | def dict_mapper(date_str, label): 37 | return {'time': date_str}, label 38 | 39 | dataset = tf.data.Dataset.from_generator( 40 | generator, output_types=(tf.dtypes.string, tf.dtypes.float32) 41 | ) 42 | dataset = dataset.map(dict_mapper) 43 | return dataset 44 | 45 | def prediction_days(self): 46 | pred_start = self.str2datetime(self.forecast_start) 47 | pred_end = self.str2datetime(self.forecast_end) 48 | return (pred_end - pred_start).days + 1 49 | 50 | def test_main(self): 51 | model = ARIMAWithSTLDecomposition(order=[7, 0, 2], 52 | period=[7, 30], 53 | date_format=self.date_format, 54 | forecast_start=self.forecast_start, 55 | forecast_end=self.forecast_end) 56 | prediction = model.sqlflow_train_loop(self.create_dataset()) 57 | self.assertEqual(len(prediction), self.prediction_days()) 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/test_auto_estimator.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases, train_input_fn, eval_input_fn 3 | 4 | import sys 5 | import tensorflow as tf 6 | import unittest 7 | import numpy as np 8 | from sklearn.datasets import load_iris, load_boston 9 | 10 | class TestAutoClassifier(BaseTestCases.BaseEstimatorTest): 11 | def setUp(self): 12 | x, y = load_iris(return_X_y=True) 13 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 14 | self.features = {} 15 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 16 | self.features[feature_name] = feature_values 17 | self.label = y 18 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 19 | 20 | self.model_class = sqlflow_models.AutoClassifier 21 | self.model = sqlflow_models.AutoClassifier(feature_columns=feature_columns, n_classes=3) 22 | 23 | class TestAutoBinaryClassifier(BaseTestCases.BaseEstimatorTest): 24 | def setUp(self): 25 | x, y = load_iris(return_X_y=True) 26 | x = np.array([x[i] for i, v in enumerate(y) if v != 2]) 27 | y = np.array([y[i] for i, v in enumerate(y) if v != 2]) 
28 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 29 | self.features = {} 30 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 31 | self.features[feature_name] = feature_values 32 | self.label = y 33 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 34 | 35 | self.model_class = sqlflow_models.AutoClassifier 36 | self.model = sqlflow_models.AutoClassifier(feature_columns=feature_columns) 37 | 38 | class TestAutoRegressor(BaseTestCases.BaseEstimatorTest): 39 | def setUp(self): 40 | x, y = load_boston(return_X_y=True) 41 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 42 | self.features = {} 43 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 44 | self.features[feature_name] = feature_values 45 | self.label = y 46 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 47 | self.model_class = sqlflow_models.AutoRegressor 48 | self.model = sqlflow_models.AutoRegressor(feature_columns=feature_columns) 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | 53 | -------------------------------------------------------------------------------- /tests/test_deep_embedding_cluster.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.keras.losses import kld 2 | 3 | import sqlflow_models 4 | from tests.base import BaseTestCases, eval_input_fn 5 | 6 | import tensorflow as tf 7 | import unittest 8 | from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score 9 | from sklearn.utils.linear_assignment_ import linear_assignment 10 | import numpy as np 11 | from tensorflow.python import keras 12 | import sys 13 | 14 | 15 | def train_input_fn(features, batch_size=32): 16 | dataset = tf.data.Dataset.from_tensor_slices(dict(features)) 17 | dataset = dataset.shuffle(1000).repeat(1).batch(batch_size) 18 | return dataset 19 | 20 | ari = adjusted_rand_score 21 | nmi = normalized_mutual_info_score 22 | 23 | 24 | def acc(y_true, y_pred): 25 | """ 26 | Calculate clustering accuracy. 27 | Using the Hungarian algorithm to solve linear assignment problem. 28 | """ 29 | y_true = y_true.astype(np.int64) 30 | assert y_pred.size == y_true.size 31 | dims = max(y_pred.max(), y_true.max()) + 1 32 | w = np.zeros((dims, dims), dtype=np.int64) 33 | for i in range(y_pred.size): 34 | w[y_pred[i], y_true[i]] += 1 35 | 36 | ind = linear_assignment(w.max() - w) 37 | return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size 38 | 39 | 40 | def evaluate(x, y, model): 41 | metric = dict() 42 | q = model.predict(x) 43 | y_pred = q.argmax(1) 44 | metric['acc'] = np.round(acc(y, y_pred), 5) 45 | metric['nmi'] = np.round(nmi(y, y_pred), 5) 46 | metric['ari'] = np.round(ari(y, y_pred), 5) 47 | return metric 48 | 49 | 50 | class TestDeepEmbeddingCluster(BaseTestCases.BaseTest): 51 | def setUp(self): 52 | (train_data, train_labels), (test_data, test_labels) = keras.datasets.mnist.load_data() 53 | x = np.concatenate((train_data, test_data)) 54 | y = np.concatenate((train_labels, test_labels)) 55 | x = x.reshape((x.shape[0], -1)) 56 | x = np.divide(x, 255.) 
57 | # Sample 58 | x = x[:100] 59 | y = y[:100] 60 | # Generate Data 61 | feature_num = x.shape[1] 62 | feature_column_names = ['col_{}'.format(d) for d in range(feature_num)] 63 | 64 | self.features = {} 65 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 66 | self.features[feature_name] = feature_values 67 | 68 | self.label = y 69 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 70 | pretrain_dims = [500, 500, 2000, 10] 71 | # Init model 72 | self.model = sqlflow_models.DeepEmbeddingClusterModel(feature_columns=feature_columns, 73 | n_clusters=10, 74 | kmeans_init=20, 75 | run_pretrain=True, 76 | existed_pretrain_model=None, 77 | pretrain_dims=pretrain_dims, 78 | pretrain_activation_func='relu', 79 | pretrain_use_callbacks=True, 80 | pretrain_cbearlystop_patience=10, 81 | pretrain_cbearlystop_mindelta=0.0001, 82 | pretrain_cbreduce_patience=5, 83 | pretrain_cbreduce_factor=0.2, 84 | pretrain_epochs=20, 85 | pretrain_initializer='glorot_uniform', 86 | train_max_iters=500, 87 | update_interval=100, 88 | train_use_tol=True, 89 | tol=0.0001, 90 | loss=kld) 91 | self.model_class = sqlflow_models.DeepEmbeddingClusterModel 92 | 93 | def test_train_and_predict(self): 94 | self.setUp() 95 | model_pkg = sys.modules[self.model_class.__module__] 96 | self.model.compile(optimizer=model_pkg.optimizer(), 97 | loss=model_pkg.loss) 98 | self.model.sqlflow_train_loop(train_input_fn(self.features)) 99 | metric = evaluate(x=eval_input_fn(self.features, self.label), y=self.label, model=self.model) 100 | print(metric) 101 | assert (metric['acc'] > 0) 102 | 103 | 104 | if __name__ == '__main__': 105 | unittest.main() 106 | -------------------------------------------------------------------------------- /tests/test_dnnclassifier.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | import numpy as np 7 | from sklearn.datasets import load_iris 8 | 9 | class TestDNNClassifier(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | x, y = load_iris(return_X_y=True) 12 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 13 | self.features = {} 14 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 15 | self.features[feature_name] = feature_values 16 | self.label = y 17 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 18 | 19 | self.model_class = sqlflow_models.DNNClassifier 20 | self.model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=3) 21 | 22 | class TestDNNBinaryClassifier(BaseTestCases.BaseTest): 23 | def setUp(self): 24 | x, y = load_iris(return_X_y=True) 25 | x = np.array([x[i] for i, v in enumerate(y) if v != 2]) 26 | y = np.array([y[i] for i, v in enumerate(y) if v != 2]) 27 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 28 | self.features = {} 29 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 30 | self.features[feature_name] = feature_values 31 | self.label = y 32 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 33 | 34 | self.model_class = sqlflow_models.DNNClassifier 35 | self.model = sqlflow_models.DNNClassifier(feature_columns=feature_columns, n_classes=2) 36 | 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | 42 | 
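The acc helper in tests/test_deep_embedding_cluster.py above scores a clustering by building a cluster-versus-class confusion matrix and solving a linear assignment problem to find the best cluster-to-label mapping. It imports linear_assignment from sklearn.utils.linear_assignment_, a module that newer scikit-learn releases have removed; the sketch below computes the same quantity with scipy.optimize.linear_sum_assignment instead. This is an alternative sketch, not code from the repository.

# Clustering accuracy via the Hungarian algorithm, using SciPy in place of the
# removed sklearn.utils.linear_assignment_ module. Sketch only.
import numpy as np
from scipy.optimize import linear_sum_assignment


def clustering_accuracy(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.int64)
    y_pred = np.asarray(y_pred, dtype=np.int64)
    dims = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((dims, dims), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1                       # confusion matrix: cluster vs. class
    row_ind, col_ind = linear_sum_assignment(w.max() - w)  # maximize matched counts
    return w[row_ind, col_ind].sum() / y_pred.size


# Clusters {0, 1} happen to map onto classes {1, 0}, so accuracy is 1.0:
print(clustering_accuracy([1, 1, 0, 0], [0, 0, 1, 1]))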
-------------------------------------------------------------------------------- /tests/test_dnnclassifier_functional_api_example.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | 7 | from sklearn.datasets import load_iris 8 | 9 | 10 | def train_input_fn(features, labels, batch_size=32): 11 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 12 | dataset = dataset.shuffle(1000).repeat().batch(batch_size) 13 | return dataset 14 | 15 | 16 | def eval_input_fn(features, labels, batch_size=32): 17 | dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) 18 | dataset = dataset.batch(batch_size) 19 | return dataset 20 | 21 | class TestDNNClassifier(BaseTestCases.BaseTest): 22 | def setUp(self): 23 | x, y = load_iris(return_X_y=True) 24 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 25 | self.features = {} 26 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 27 | self.features[feature_name] = feature_values 28 | self.label = y 29 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 30 | fieldmetas = { 31 | "col_0": {"feature_name": "col_0", "shape": [1], "dtype": tf.float32}, 32 | "col_1": {"feature_name": "col_1", "shape": [1], "dtype": tf.float32}, 33 | "col_2": {"feature_name": "col_2", "shape": [1], "dtype": tf.float32}, 34 | "col_3": {"feature_name": "col_3", "shape": [1], "dtype": tf.float32}, 35 | } 36 | self.model = sqlflow_models.dnnclassifier_functional_model(feature_columns=feature_columns, field_metas=fieldmetas, n_classes=3) 37 | self.model_class = sqlflow_models.dnnclassifier_functional_model 38 | 39 | 40 | if __name__ == '__main__': 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /tests/test_dnnregressor.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import unittest 6 | from sklearn.datasets import load_boston 7 | 8 | 9 | class TestDNNRegressor(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | x, y = load_boston(return_X_y=True) 12 | feature_column_names = ['col_{}'.format(d) for d in range(x.shape[1])] 13 | self.features = {} 14 | for feature_name, feature_values in zip(feature_column_names, list(x.T)): 15 | self.features[feature_name] = feature_values 16 | self.label = y 17 | feature_columns = [tf.feature_column.numeric_column(key) for key in self.features] 18 | self.model_class = sqlflow_models.DNNRegressor 19 | self.model = sqlflow_models.DNNRegressor(feature_columns=feature_columns) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | 25 | -------------------------------------------------------------------------------- /tests/test_gcn.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import unittest 7 | import random 8 | 9 | 10 | def build_karate_club_graph(): 11 | # All 78 edges are stored in two numpy arrays. One for source endpoints 12 | # while the other for destination endpoints. 
13 | # Credit to: https://docs.dgl.ai/tutorials/basics/1_first.html 14 | src = np.array([1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 10, 10, 15 | 10, 11, 12, 12, 13, 13, 13, 13, 16, 16, 17, 17, 19, 19, 21, 21, 16 | 25, 25, 27, 27, 27, 28, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32, 17 | 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 18 | 33, 33, 33, 33, 33, 33, 33, 33, 33, 33]) 19 | dst = np.array([0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4, 20 | 5, 0, 0, 3, 0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1, 23, 24, 2, 23, 21 | 24, 2, 23, 26, 1, 8, 0, 24, 25, 28, 2, 8, 14, 15, 18, 20, 22, 23, 22 | 29, 30, 31, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 23 | 31, 32]) 24 | u = np.concatenate([src, dst]) 25 | v = np.concatenate([dst, src]) 26 | u = np.expand_dims(u, axis=1) 27 | v = np.expand_dims(v, axis=1) 28 | return np.concatenate([u,v], 1) 29 | 30 | def acc(y, label): 31 | '''Function to calculate the accuracy.''' 32 | ll = tf.equal(tf.argmax(label, -1), tf.argmax(y, -1)) 33 | accuarcy = tf.reduce_mean(tf.cast(ll, dtype=tf.float32)) 34 | return accuarcy 35 | 36 | def evaluate(x, y, model): 37 | '''Function to evaluate the performance of model.''' 38 | metric = dict() 39 | y_pred = model.predict(x) 40 | metric['acc'] = np.round(acc(y, y_pred), 5) 41 | return metric 42 | 43 | class TestGCN(BaseTestCases.BaseTest): 44 | def setUp(self): 45 | feature = [[0,1,2]+random.sample(range(3, 20), 8), 46 | [0,1,2]+random.sample(range(18, 40),8), 47 | [0,1,2]+random.sample(range(38, 60),8), 48 | [0,1,2]+random.sample(range(58, 80),8)] 49 | label = ['Shotokan', 'Gōjū-ryū', 'Wadō-ryū', 'Shitō-ryū'] 50 | nodes = np.array(list(range(34))) 51 | edges = build_karate_club_graph() 52 | features, labels = list(), list() 53 | for i in range(34): 54 | idx = random.randint(0,3) 55 | features.append(np.eye(81)[feature[idx]].sum(0)) 56 | labels.append(label[idx]) 57 | self.inputs = [dict() for i in range(len(edges)*2)] 58 | self.labels = list() 59 | for i in range(len(edges)): 60 | self.inputs[i]['id'] = tf.convert_to_tensor(edges[i][0]) 61 | self.inputs[i]['features'] = tf.convert_to_tensor(features[edges[i][0]]) 62 | self.inputs[i]['from_node_id'] = tf.convert_to_tensor(edges[i][0]) 63 | self.inputs[i]['to_node_id'] = tf.convert_to_tensor(edges[i][1]) 64 | self.labels.append(tf.convert_to_tensor([labels[edges[i][0]]])) 65 | for i in range(len(edges)): 66 | self.inputs[i+len(edges)]['id'] = tf.convert_to_tensor(edges[i][1]) 67 | self.inputs[i+len(edges)]['features'] = tf.convert_to_tensor(features[edges[i][1]]) 68 | self.inputs[i+len(edges)]['from_node_id'] = tf.convert_to_tensor(edges[i][0]) 69 | self.inputs[i+len(edges)]['to_node_id'] = tf.convert_to_tensor(edges[i][1]) 70 | self.labels.append(tf.convert_to_tensor([labels[edges[i][1]]])) 71 | self.model = sqlflow_models.GCN(nhid=16, nclass=4, epochs=20, train_ratio=0.2, eval_ratio=0.15) 72 | self.model_class = sqlflow_models.GCN 73 | 74 | def test_train_and_predict(self): 75 | self.setUp() 76 | self.model.compile(optimizer=optimizer(), 77 | loss='categorical_crossentropy') 78 | self.model.sqlflow_train_loop(zip(self.inputs, self.labels)) 79 | metric = evaluate([self.model.features, self.model.adjacency], self.model.labels, self.model) 80 | assert (metric['acc'] > 0) 81 | 82 | def optimizer(): 83 | return tf.keras.optimizers.Adam(lr=0.01) 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | 88 | 89 | -------------------------------------------------------------------------------- /tests/test_one_class_svm.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2020 The SQLFlow Authors. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import os 15 | import shutil 16 | import tempfile 17 | import unittest 18 | 19 | import numpy as np 20 | import tensorflow as tf 21 | from sqlflow_models import OneClassSVM 22 | from sqlflow_models.one_class_svm import dataset_reader 23 | 24 | 25 | class TestOneClassSVM(unittest.TestCase): 26 | def setUp(self): 27 | self.tmp_dir = tempfile.mkdtemp() 28 | self.old_cwd = os.getcwd() 29 | os.chdir(self.tmp_dir) 30 | 31 | def tearDown(self): 32 | os.chdir(self.old_cwd) 33 | shutil.rmtree(self.tmp_dir) 34 | 35 | def create_dataset(self): 36 | def generator(): 37 | for _ in range(10): 38 | x1 = np.random.random(size=[1, 1]) 39 | x2 = np.random.random(size=[1, 1]) 40 | yield x1, x2 41 | 42 | def dict_mapper(x1, x2): 43 | return {"x1": x1, "x2": x2} 44 | 45 | dataset = tf.data.Dataset.from_generator( 46 | generator, output_types=(tf.dtypes.float32, tf.dtypes.float32)) 47 | return dataset.map(dict_mapper) 48 | 49 | def test_main(self): 50 | svm = OneClassSVM() 51 | train_dataset = self.create_dataset() 52 | svm.sqlflow_train_loop(train_dataset) 53 | 54 | predict_dataset = self.create_dataset() 55 | for features in dataset_reader(predict_dataset): 56 | pred = svm.sqlflow_predict_one(features)[0] 57 | pred = np.array(pred) 58 | self.assertEqual(pred.shape, (1,)) 59 | self.assertTrue(pred[0] == 1 or pred[0] == -1) 60 | 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import unittest 7 | 8 | 9 | class TestStackedRNNClassifier(BaseTestCases.BaseTest): 10 | def setUp(self): 11 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 12 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 13 | fea = tf.feature_column.sequence_categorical_column_with_identity( 14 | key="c1", 15 | num_buckets=800 16 | ) 17 | 18 | emb = tf.feature_column.embedding_column( 19 | fea, 20 | dimension=32) 21 | feature_columns = [emb] 22 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='rnn') 23 | self.model_class = sqlflow_models.StackedRNNClassifier 24 | 25 | class TestStackedBiRNNClassifier(BaseTestCases.BaseTest): 26 | def setUp(self): 27 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 28 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 29 | fea = tf.feature_column.sequence_categorical_column_with_identity( 30 | key="c1", 31 | num_buckets=800 32 | ) 33 | 34 | emb = tf.feature_column.embedding_column( 35 | fea, 36 | dimension=32) 37 | 
feature_columns = [emb] 38 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='rnn', bidirectional=True) 39 | self.model_class = sqlflow_models.StackedRNNClassifier 40 | 41 | class TestStackedLSTMClassifier(BaseTestCases.BaseTest): 42 | def setUp(self): 43 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 44 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 45 | fea = tf.feature_column.sequence_categorical_column_with_identity( 46 | key="c1", 47 | num_buckets=800 48 | ) 49 | 50 | emb = tf.feature_column.embedding_column( 51 | fea, 52 | dimension=32) 53 | feature_columns = [emb] 54 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='lstm') 55 | self.model_class = sqlflow_models.StackedRNNClassifier 56 | 57 | class TestStackedBiLSTMClassifier(BaseTestCases.BaseTest): 58 | def setUp(self): 59 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 60 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 61 | fea = tf.feature_column.sequence_categorical_column_with_identity( 62 | key="c1", 63 | num_buckets=800 64 | ) 65 | 66 | emb = tf.feature_column.embedding_column( 67 | fea, 68 | dimension=32) 69 | feature_columns = [emb] 70 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='lstm', bidirectional=True) 71 | self.model_class = sqlflow_models.StackedRNNClassifier 72 | 73 | class TestStackedGRUClassifier(BaseTestCases.BaseTest): 74 | def setUp(self): 75 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 76 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 77 | fea = tf.feature_column.sequence_categorical_column_with_identity( 78 | key="c1", 79 | num_buckets=800 80 | ) 81 | 82 | emb = tf.feature_column.embedding_column( 83 | fea, 84 | dimension=32) 85 | feature_columns = [emb] 86 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='gru') 87 | self.model_class = sqlflow_models.StackedRNNClassifier 88 | 89 | class TestStackedBiGRUClassifier(BaseTestCases.BaseTest): 90 | def setUp(self): 91 | self.features = {"c1": np.array([int(x) for x in range(800)]).reshape(100, 8)} 92 | self.label = [0 for _ in range(50)] + [1 for _ in range(50)] 93 | fea = tf.feature_column.sequence_categorical_column_with_identity( 94 | key="c1", 95 | num_buckets=800 96 | ) 97 | 98 | emb = tf.feature_column.embedding_column( 99 | fea, 100 | dimension=32) 101 | feature_columns = [emb] 102 | self.model = sqlflow_models.StackedRNNClassifier(feature_columns=feature_columns, stack_units=[64, 32], model_type='gru', bidirectional=True) 103 | self.model_class = sqlflow_models.StackedRNNClassifier 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | 108 | 109 | -------------------------------------------------------------------------------- /tests/test_rnnts.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | from tests.base import BaseTestCases 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | np.random.seed(22) 7 | import unittest 8 | 9 | 10 | class TestRNNBasedTimeSeriesModel(BaseTestCases.BaseTest): 11 | def setUp(self): 12 | # We use sin data plus perturbation to simulate time series data 13 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 
14 | x = np.array(time_series_data).reshape(8, 7) 15 | y = np.array(np.arange(8).reshape(8, 1)) 16 | self.features = {"col1": x} 17 | self.label = y 18 | self.n_in = 7 19 | self.n_out = 1 20 | # time_window=n_in, num_features=n_out 21 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 22 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 23 | feature_columns=feature_columns, 24 | stack_units=[50, 50], 25 | n_in=self.n_in, 26 | n_out=self.n_out, 27 | model_type='rnn') 28 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 29 | 30 | class TestLSTMBasedTimeSeriesModel(BaseTestCases.BaseTest): 31 | def setUp(self): 32 | # We use sin data plus perturbation to simulate time series data 33 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 34 | x = np.array(time_series_data).reshape(8, 7) 35 | y = np.array(np.arange(8).reshape(8, 1)) 36 | self.features = {"col1": x} 37 | self.label = y 38 | self.n_in = 7 39 | self.n_out = 1 40 | # time_window=n_in, num_features=n_out 41 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 42 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 43 | feature_columns=feature_columns, 44 | stack_units=[50, 50], 45 | n_in=self.n_in, 46 | n_out=self.n_out, 47 | model_type='lstm') 48 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 49 | 50 | class TestGRUBasedTimeSeriesModel(BaseTestCases.BaseTest): 51 | def setUp(self): 52 | # We use sin data plus perturbation to simulate time series data 53 | time_series_data = np.sin(np.arange(56)) + np.random.normal(0, 0.01, 56) 54 | x = np.array(time_series_data).reshape(8, 7) 55 | y = np.array(np.arange(8).reshape(8, 1)) 56 | self.features = {"col1": x} 57 | self.label = y 58 | self.n_in = 7 59 | self.n_out = 1 60 | # time_window=n_in, num_features=n_out 61 | feature_columns = [tf.feature_column.numeric_column(key, shape=(self.n_in, self.n_out)) for key in self.features] 62 | self.model = sqlflow_models.RNNBasedTimeSeriesModel( 63 | feature_columns=feature_columns, 64 | stack_units=[50, 50], 65 | n_in=self.n_in, 66 | n_out=self.n_out, 67 | model_type='gru') 68 | self.model_class = sqlflow_models.RNNBasedTimeSeriesModel 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | 74 | -------------------------------------------------------------------------------- /tests/test_score_card.py: -------------------------------------------------------------------------------- 1 | from sqlflow_models import ScoreCard 2 | import unittest 3 | import tensorflow as tf 4 | from datetime import datetime, timedelta 5 | import numpy as np 6 | 7 | 8 | class TestScoreCard(unittest.TestCase): 9 | def create_dataset(self): 10 | samples = 20 11 | f = [np.random.randint(20, size=1) for i in range(samples)] 12 | label = [np.random.randint(2, size=1) for i in range(samples)] 13 | 14 | def generator(): 15 | for i, item in enumerate(f): 16 | yield [f[i]], label[i] 17 | 18 | def dict_mapper(feature, label): 19 | return {'f1': feature}, label 20 | 21 | dataset = tf.data.Dataset.from_generator( 22 | generator, output_types=(tf.dtypes.float32, tf.dtypes.float32) 23 | ) 24 | dataset = dataset.map(dict_mapper) 25 | return dataset 26 | 27 | def test_train(self): 28 | dataset = self.create_dataset() 29 | m = ScoreCard(pf_bin_size=2) 30 | m.sqlflow_train_loop(dataset) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | 
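tests/test_score_card.py only checks that sqlflow_train_loop runs; the score points it ultimately prints come from the arithmetic in sqlflow_models/score_card.py: with a target score of 600, factor = 20 / ln(2) (the conventional points-to-double-the-odds reading of that constant) and offset = 600 - factor * ln(20), each bin contributes round(coef * WOE * factor + offset / pf_bin_size). The snippet below merely replays that arithmetic with a made-up coefficient and WOE, so the resulting number is illustrative only.

# Replaying the per-bin score-point arithmetic from ScoreCard.sqlflow_train_loop.
# The coefficient and WOE values are hypothetical.
import numpy as np

target_score = 600
pdo = 20                          # conventional "points to double the odds" reading of the 20
factor = pdo / np.log(2)
offset = target_score - factor * np.log(20)

pf_bin_size = 5                   # number of population-frequency bins
coef = 0.8                        # hypothetical logistic-regression coefficient
woe = -0.35                       # hypothetical weight of evidence for one bin

points = round(coef * woe * factor + offset / pf_bin_size, 0)
print(points)                     # this bin's contribution to the final score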
-------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | import sqlflow_models 2 | 3 | 4 | def test_answer(): 5 | assert sqlflow_models.__version__ == sqlflow_models._version.__version__ 6 | --------------------------------------------------------------------------------
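For reference, GCNLayer.call in sqlflow_models/gcn.py (shown near the top of this listing) implements the usual graph-convolution propagation: project the node features with the kernel, aggregate the result through the sparse adjacency matrix, and add the optional bias. The NumPy sketch below walks through that arithmetic densely on a hypothetical 3-node graph; the symmetric normalization of the adjacency is a common preprocessing choice assumed here, since the layer simply consumes whatever adjacency tensor it is handed.

# Dense walk-through of GCNLayer.call: support = X @ W, output = A_hat @ support + b.
# The graph, weights, and the normalization of A_hat are hypothetical.
import numpy as np

X = np.array([[1.0, 0.0],            # node features, shape (3 nodes, 2 features)
              [0.0, 1.0],
              [1.0, 1.0]])
A = np.array([[1, 1, 0],             # adjacency with self-loops
              [1, 1, 1],
              [0, 1, 1]], dtype=float)
d_inv_sqrt = np.diag(1.0 / np.sqrt(A.sum(axis=1)))
A_hat = d_inv_sqrt @ A @ d_inv_sqrt  # assumed symmetric normalization, not taken from gcn.py

W = np.array([[0.5, -0.2, 0.1],      # kernel, shape (2 features, 3 units)
              [0.3,  0.4, 0.0]])
b = np.zeros(3)

support = X @ W                      # tf.matmul(inputs, self.kernel)
output = A_hat @ support + b         # tf.sparse.sparse_dense_matmul(adj, support) + bias
print(output.shape)                  # (3, 3): one 3-unit output row per node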