├── .github
    └── workflows
    │   └── codeql-analysis.yml
├── Deep_Auto_ViML_Timeseries.ipynb
├── LICENSE
├── README.md
├── code-of-conduct.md
├── contributing.md
├── deep_1.jpg
├── deep_2.jpg
├── deep_3.jpg
├── deep_4.jpg
├── deep_5.jpg
├── deep_6.jpg
├── deep_7.jpg
├── deep_8.jpg
├── deep_autoviml
    ├── __init__.py
    ├── __version__.py
    ├── data_load
    │   ├── __pycache__
    │   │   ├── classify_features.cpython-38.pyc
    │   │   └── extract.cpython-38.pyc
    │   ├── classify_features.py
    │   └── extract.py
    ├── deep_autoviml.py
    ├── modeling
    │   ├── __pycache__
    │   │   ├── create_model.cpython-38.pyc
    │   │   ├── one_cycle.cpython-38.pyc
    │   │   ├── predict_model.cpython-38.pyc
    │   │   ├── train_custom_model.cpython-38.pyc
    │   │   ├── train_image_model.cpython-38.pyc
    │   │   ├── train_model.cpython-38.pyc
    │   │   └── train_text_model.cpython-38.pyc
    │   ├── create_model.py
    │   ├── one_cycle.py
    │   ├── predict_model.py
    │   ├── train_custom_model.py
    │   ├── train_image_model.py
    │   ├── train_model.py
    │   └── train_text_model.py
    ├── models
    │   ├── __pycache__
    │   │   ├── basic.cpython-38.pyc
    │   │   ├── big_deep.cpython-38.pyc
    │   │   ├── cnn1.cpython-38.pyc
    │   │   ├── cnn2.cpython-38.pyc
    │   │   ├── deep.cpython-38.pyc
    │   │   ├── deep_and_wide.cpython-38.pyc
    │   │   ├── deep_nn.cpython-38.pyc
    │   │   ├── dnn.cpython-38.pyc
    │   │   ├── dnn_drop.cpython-38.pyc
    │   │   ├── giant_deep.cpython-38.pyc
    │   │   ├── reg_dnn.cpython-38.pyc
    │   │   ├── simple_dnn.cpython-38.pyc
    │   │   └── tf_hub_lookup.cpython-38.pyc
    │   ├── basic.py
    │   ├── cnn1.py
    │   ├── cnn2.py
    │   ├── deep_and_wide.py
    │   ├── dnn.py
    │   ├── dnn_drop.py
    │   ├── giant_deep.py
    │   ├── gru1.py
    │   ├── lstm1.py
    │   ├── reg_dnn.py
    │   ├── rnn1.py
    │   └── tf_hub_lookup.py
    ├── preprocessing
    │   ├── __pycache__
    │   │   ├── preprocessing.cpython-38.pyc
    │   │   ├── preprocessing_images.cpython-38.pyc
    │   │   ├── preprocessing_nlp.cpython-38.pyc
    │   │   ├── preprocessing_tabular.cpython-38.pyc
    │   │   └── preprocessing_text.cpython-38.pyc
    │   ├── preprocessing.py
    │   ├── preprocessing_images.py
    │   ├── preprocessing_nlp.py
    │   ├── preprocessing_tabular.py
    │   └── preprocessing_text.py
    └── utilities
    │   ├── __pycache__
    │       └── utilities.cpython-38.pyc
    │   └── utilities.py
├── examples
    ├── Deep_AutoViML_Text_Test.ipynb
    ├── Deep_Auto_ViML_Demo.ipynb
    ├── Deep_Auto_ViML_Image_Classification_Demo.ipynb
    ├── Deep_Auto_ViML_NLP_Demo.ipynb
    ├── Deep_Auto_ViML_Titanic.ipynb
    ├── NLP_small.csv
    └── boston.csv
├── logo.jpg
├── requirements.txt
├── setup.cfg
└── setup.py


/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '45 7 * * 0'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
37 |         # Learn more:
38 |         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
39 | 
40 |     steps:
41 |     - name: Checkout repository
42 |       uses: actions/checkout@v2
43 | 
44 |     # Initializes the CodeQL tools for scanning.
45 |     - name: Initialize CodeQL
46 |       uses: github/codeql-action/init@v1
47 |       with:
48 |         languages: ${{ matrix.language }}
49 |         # If you wish to specify custom queries, you can do so here or in a config file.
50 |         # By default, queries listed here will override any specified in a config file.
51 |         # Prefix the list here with "+" to use these queries and those in the config file.
52 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 | 
54 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
55 |     # If this step fails, then you should remove it and run the build manually (see below)
56 |     - name: Autobuild
57 |       uses: github/codeql-action/autobuild@v1
58 | 
59 |     # ℹ️ Command-line programs to run using the OS shell.
60 |     # 📚 https://git.io/JvXDl
61 | 
62 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 |     #    and modify them (or add more) to build your code if your project
64 |     #    uses a compiled language
65 | 
66 |     #- run: |
67 |     #   make bootstrap
68 |     #   make release
69 | 
70 |     - name: Perform CodeQL Analysis
71 |       uses: github/codeql-action/analyze@v1
72 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # deep_autoviml
  2 | ## Build keras pipelines and models in a single line of code!
  3 | ![banner](logo.jpg)
  4 | [![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/)
  5 | [![ForTheBadge built-with-love](http://ForTheBadge.com/images/badges/built-with-love.svg)](https://github.com/AutoViML)
  6 | [![standard-readme compliant](https://img.shields.io/badge/standard--readme-OK-green.svg?style=flat-square)](https://github.com/RichardLitt/standard-readme)
  7 | [![Python Versions](https://img.shields.io/pypi/pyversions/autoviml.svg?logo=python&logoColor=white)](https://pypi.org/project/autoviml)
  8 | [![Build Status](https://travis-ci.org/joemccann/dillinger.svg?branch=master)](https://github.com/AutoViML)
  9 | ## Table of Contents
 10 | <ul>
 11 | <li><a href="#motivation">Motivation</a></li>
 12 | <li><a href="#features">How it works</a></li>
 13 | <li><a href="#technology">Technology</a></li>
 14 | <li><a href="#install">Install</a></li>
 15 | <li><a href="#usage">Usage</a></li>
 16 | <li><a href="#api">API</a></li>
 17 | <li><a href="#image">Image Classification</a></li>
 18 | <li><a href="#nlp">NLP Tasks</a></li>
 19 | <li><a href="#tips">Tips for using deep_autoviml</a></li>
 20 | <li><a href="#maintainers">Maintainers</a></li>
 21 | <li><a href="#contributing">Contributing</a></li>
 22 | <li><a href="#license">License</a></li>
 23 | </ul>
 24 | 
 25 | 
 26 | ## Update (May 2024): Now upgraded to tensorflow 2.12 - latest version of tensroflow!
 27 | You can now use the latest version of tensorflow 2.8 and above to test your deep learning models thanks to deep_autoviml. Enjoy the upgrade!
 28 | 
 29 | ## Update (Jan 2022): Now with mlflow!
 30 | You can now add `mlflow` experiment tracking to all your deep_autoviml runs. [mlflow](https://mlflow.org/) is a popular python library for experiment tracking and MLOps in general. See more details below under `mlflow`.
 31 | 
 32 | ## Motivation
 33 | ✨ deep_autoviml is a powerful new deep learning library with a very simple design goal:  ✨
 34 | ```Make it easy for novices and experts to experiment and build tensorflow.keras preprocessing pipelines and models in fewest steps.```
 35 | But just because we make it easy, does not mean you should trust everything that it does or treat it like a black box. You must still use your own judgement and intutition to make sure the results are accurate and explainable, not to mention that the model conforms to <a href='https://ai.google/principles/'>Responsbile AI principles</a>.
 36 | 
 37 | ### Watch YouTube Video for Demo of Deep_AutoViML
 38 | [![YouTube Demo](deep_6.jpg)](https://www.youtube.com/watch?v=IcpwNNNXsWE)
 39 | 
 40 | ### What is Deep AutoViML?
 41 | Deep AutoViML is the next version of AutoViML, a popular automl library that was developed using pandas, scikit-learn and xgboost+catboost. Deep AutoViML takes the best features of AutoViML and uses the latest generation of tensorflow and keras libraries to build a fast model and data pipeline for MLOps use cases.
 42 | 
 43 | deep autoviml is primarily meant for sophisticated data engineers, data scientists and ML engineers to quickly prototype and build tensorflow 2.4.1+ models and pipelines for any data set, any size using a single line of code. It can build models for structured data, NLP and image datasets. It can also handle time series data sets in the future. 
 44 | 1. You can either choose deep_autoviml to automatically buid a custom Tensorflow model 
 45 | 1. Instead, you can "bring your own model" ("BYOM" option) model to attach keras data pipelines to your model.
 46 | 1. Additionally, you can choose any Tensorflow Hub model (TFHub) to custom train on your data. Just look for instructions below in <a href="#tips">"Tips for using deep_autoviml"</a> section.
 47 | 1. There are 4 ways to build your model quickly or slowly depending on your needs:
 48 | - fast: a quick model that uses only dense layers (deep layers)
 49 | - fast1: a deep and wide model that uses both deep and wide layers. This is slightly slower than `fast` model.
 50 | - fast2: a deep and cross model that crosses some variables (hence deep and cross). This is about the same speed as 'fast1` model.
 51 | - auto: This uses `Optuna` or `Storm-Tuner` to perform combinations of dense layers and select best architecture. This will take the longest time.
 52 | 
 53 | ![why_deep](deep_2.jpg)
 54 | ## Features
 55 | These are the main features that distinguish deep_autoviml from other libraries:
 56 | - It uses keras preprocessing layers which are more intuitive, and are included inside your model to simplify deployment
 57 | - The pipeline is available to you to use as inputs in your own functional model (if you so wish - you must specify that option in the input - see below for "pipeline")
 58 | - It can import any csv, txt or gzip file or file patterns (that fit multiple files) and it can scale to any data set of any size due to tf.data.Dataset's superior data pipelining features (such as cache, prefetch, batch, etc.)
 59 | - It uses an amazing new tuner called [STORM tuner](https://github.com/ben-arnao/StoRM) that quickly searches for the best hyperparameters for your keras model in fewer than 25 trials
 60 | - If you want to fine tune your model even further, you can fiddle with a wide variety of model options or keras options using **kwargs like dictionaries
 61 | - You can import your own custom Sequential model and watch it transform it into a functional model with additional preprocessing and output layers and train the model with your data 
 62 | - You can save the model on your local machine or copy it to any cloud provider's storage bucket and serve it from there using tensorflow Serving (TF.Serving)
 63 | - Since your model contains preprocessing layers built-in, you just need to provide your Tensorflow serving model with raw data to test and get back predictions in the same format as your training labels.
 64 | ![how_it_works](deep_1.jpg)
 65 | 
 66 | ## Technology
 67 | deep_autoviml uses the latest in tensorflow (2.4.1+) td.data.Datasets and tf.keras preprocessing technologies: the Keras preprocessing layers enable you to encapsulate feature engineering and preprocessing into the model itself. This makes the process for training and predictions the same: just feed input data (in the form of files or dataframes) and the model will take care of all preprocessing before predictions. 
 68 | 
 69 | To perform its preprocessing on the model itself, deep_autoviml uses [tensorflow](https://www.tensorflow.org/) (TF 2.4.1+ and later versions) and [tf.keras](https://www.tensorflow.org/api_docs/python/tf/keras) experimental preprocessing layers: these layers are part of your saved model. They become part of the model's computational graph that can be optimized and executed on any device including GPU's and TPU's. By packaging everything as a single unit, we save the effort in reimplementing the preprocessing logic on the production server. The new model can take raw tabular data with numeric and categorical variables or strings text directly without any preprocessing. This avoids missing or incorrect configuration for the preprocesing_layer during production.
 70 | 
 71 | In addition, to select the best hyper parameters for the model, it uses a new open source library:
 72 | - [storm-tuner](https://github.com/ben-arnao/StoRM) - storm-tuner is an amazing new library that enables us to quickly fine tune our keras sequential models with hyperparameters and find a performant model within a few trials.
 73 | ![how_deep](deep_4.jpg)
 74 | 
 75 | ## Install
 76 | deep_autoviml requires [tensorflow](https://www.tensorflow.org/api_docs/python/tf) v2.4.1+ and [storm-tuner](https://github.com/ben-arnao/StoRM)  to run. Don't worry! We will install these libraries when you install deep_autoviml.
 77 | 
 78 | ```
 79 | pip install deep_autoviml
 80 | ```
 81 | 
 82 | For your own conda environment...
 83 | 
 84 | ```
 85 | conda create -n <your_env_name> python=3.7 anaconda
 86 | conda activate <your_env_name> # ON WINDOWS: `source activate <your_env_name>`
 87 | pip install deep_autoviml
 88 | or
 89 | pip install git+https://github.com/AutoViML/deep_autoviml.git
 90 | ```
 91 | 
 92 | ## Usage
 93 | ![deep_usage](deep_5.jpg)
 94 | deep_autoviml can be invoked with a simple import and run statement:
 95 | 
 96 | ```
 97 | from deep_autoviml import deep_autoviml as deepauto
 98 | ```
 99 | 
100 | Load a data set (any .csv or .gzip or .gz or .txt file) into deep_autoviml and it will split it into Train and Validation  datasets inside. You only need to provide a target variable, a project_name to store files in your local machine and leave the rest to defaults:
101 | 
102 | ```
103 | model, cat_vocab_dict = deepauto.fit(train, target, keras_model_type="auto",
104 |             project_name="deep_autoviml", keras_options={}, model_options={}, 
105 |             save_model_flag=True, use_my_model='', model_use_case='', verbose=0,
106 |             use_mlflow=False, mlflow_exp_name='autoviml', mlflow_run_name='first_run')
107 | ```
108 | 
109 | Once deep_autoviml writes your saved model and cat_vocab_dict files to disk in the project_name directory, you can load it from anywhere (including cloud) for predictions like this using the model and cat_vocab_dict generated above:
110 | 
111 | There are two kinds of predictions: This is the usual (typical) format.
112 | ```
113 | predictions = deepauto.predict(model, project_name, test_dataset=test,
114 |             keras_model_type=keras_model_type, cat_vocab_dict=cat_vocab_dict)
115 | ```
116 | 
117 | In case you are performing image classification, then you need to use `deepauto.predict_images()` for making predictions. See the Image section below for more details.
118 | 
119 | ## API
120 | **Arguments**
121 | 
122 | deep_autoviml requires only a single line of code to get started. You can however, fine tune the model we build using multiple options using dictionaries named "model_options" and "keras_options". These two dictionaries act like python **kwargs to enable you to fine tune hyperparameters for building our tf.keras model. Instructions on how to use them are provided below.
123 | 
124 | ![how_deep](deep_3.jpg)
125 | 
126 | - `train`: could be a datapath+filename or a pandas dataframe. Deep Auto_ViML even handles gz or gzip files. You must specify the full path and file name for it find and load it.
127 | - `target`: name of the target variable in the data set.
128 | - `keras_model_type`: default is "auto" ## But always try "fast", then "fast1", and "fast2", finally "auto". If you want to run NLP, use "BERT" and if you want to do image classification, set it to "image". In most structured data sets, keras_model_type is a quick way for you to select some fantastic model architectures that have been successful in the past. For example:
129 | <b>fast</b>: a quick model that applies deep layers for all variables. 
130 | <b>fast1</b>: a deep and wide model that sends the same variables to both a deep and wide layer simultaneously. 
131 | <b>fast2</b>: a deep and cross model that crosses some variables to build a deep and cross layer simultaneously. 
132 | <b>auto</b>: This will build multiple dense layers in sequence that will then use Storm-Tuner to fine tune the hyper parameters for your model. 
133 | - `project_name`: must be a string. Name of the folder where we will save your keras saved model and logs for tensorboard
134 | - `model_options`: must be a dictionary. For example: {'max_trials':5} sets the number of trials to run Storm-Tuner to search for the best hyper parameters for your keras model.
135 | - `keras_options`: must be a dictionary. You can use it for changing any keras model option you want such as "epochs", "kernel_initializer", "activation", "loss", "metrics", etc.
136 | - `model_use_case`: must be a string. You can use it for telling deep_autoviml what kind of use case you will use such as "time series", "seq2seq", modeling etc. This option is currently not used but you should watch this space for more model announcements.
137 | - `save_model_flag`: must be True or False. The model will be saved in keras model format.
138 | - `use_my_model`: This is where "bring your own model" (BYOM) option comes into play. This BYOM model must be a keras Sequential model with NO input layers and output layers! You can define it and send it as input here. We will add input and preprocessing layers to it automatically. Your custom defined model must contain only hidden layers (Dense, Conv1D, Conv2D, etc.), and dropouts, activations, etc. The default for this argument is "" (empty string) which means we will build your model. If you provide your custom model object here, we will use it instead.
139 | - `verbose`: must be 0, 1 or 2. Can also be True or False. You can see more and more outputs as you increase the verbose level. If you want to see a chart of your model, use verbose = 2. But you must have graphviz and pydot installed in your machine to see the model plot.
140 | -`use_mlflow`: default = False. Use for MLflow lifecycle tracking. You can set it to True. MLflow is an open source python library useed to manage ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry.
141 | Once the model training (via `fit` method) is done, you need to run MLflow locally from your working directory. Run below command on command line. This will start MLflow UI on port 5000 (http://localhost:5000/) and user can manage and visualize the end-to-end machine learning lifecycle.<br>
142 | `$ mlflow ui`
143 | -`mlflow_exp_name`:  Default value is 'autoviml'. MLflow experiment name. You can change this to any string you want.
144 | -`mlflow_run_name`: Default value is'first_run'. Each run under an experiment can have a unique run name. You can change this.
145 | 
146 | ## Image
147 | ![image_deep](deep_7.jpg)
148 | Leaf Images referred to here are from Kaggle and are copyright of Kaggle. They are shown for illustrative purposes.
149 | [Kaggle Leaf Image Classification](https://www.kaggle.com/c/leaf-classification)
150 | 
151 | deep_autoviml can do image classification. All you need to do is to organize your image_dir folder under train, validation and test sub folders. Train folder for example, can contain images for each label as a sub-folder. All you need to provide is the name of the image directory for example "leaf_classification" and deep_autoviml will automatically read the images and assign them correct labels and the correct dataset (train, test, etc.)
152 | 
153 | `image_dir` = `"leaf_classification"`
154 | You also need to provide the height and width of each image as well as the number of channels for each image.
155 | ```
156 | img_height = 224
157 | img_width = 224
158 | img_channels = 3
159 | ```
160 | You then need to set the keras model type argument as "image". 
161 | 
162 | `keras_model_type` =  `"image"` 
163 | 
164 | You also need to send in the above arguments as model options as follows:
165 | `model_options = {'image_directory': image_dir, 'image_height': img_height, 'image_width':img_width, 'image_channels':img_channels }`
166 | 
167 | You can then call deep_autoviml for training the model as usual with these inputs:
168 | ```model, dicti = deepauto.fit(trainfile, target, keras_model_type=keras_model_type,  project_name='leaf_classification', save_model_flag=False, model_options=model_options, keras_options=keras_options, use_my_model='', verbose=0)```
169 | 
170 | To make predictions, you need to provide the dictionary ("dicti") from above and the trained model. You also need to provide where the test images are stored as follows.
171 | `test_image_dir = 'leaf_classification/test'`
172 | `predictions = deepauto.predict_images(test_image_dir, model, dicti)`
173 | 
174 | ## NLP
175 | ![NLP_deep](deep_8.jpg)
176 | deep_autoviml can also do NLP text classification. There are two ways to do NLP:
177 | <li>1. Using folders and sub-folders</li>
178 | All you need to do is to organize your text_dir folder under train, validation and test sub folders. Train folder for example, can contain Text files for each label as a sub-folder. All you have to do is:
179 | 
180 | `keras_model_type` as `"BERT"` or `keras_model_type` as `"USE"` or  and it will use either BERT or Universal Sentence Encoder to preprocess and transform your text into embeddings to feed to a model.
181 | <li>2. Using CSV file</li>
182 | Just provide a CSV file with column names and text. If you have multiple text columns, it will handle all of them automatically. If you want to mix numeric and text columns, you can do so in the same CSV file. deep_autoviml will automatically detect which columns are text (NLP) and which columns are numeric and do preprocessing automatically. You can specify whether to use:
183 | 
184 | `keras_model_type` as `"BERT"` or `keras_model_type` as `"USE"` or  and it will use either BERT or Universal Sentence Encoder as specified on your text columns. If you want to use neither of them, you can just specify:
185 | 
186 | `keras_model_type` as `"auto"` and deep_autoviml will automatically choose the best embedding for your model.
187 | 
188 | 
189 | ## Tips
190 | You can use the following arguments in your input to make deep_autoviml work best for you:
191 | - `model_options = {"model_use_case":'pipeline'}`: If you only want keras preprocessing layers (i.e. keras pipeline) then set the model_use_case input to "pipeline" and Deep Auto_ViML will not build a model but just return the keras input and preprocessing layers. You can use these inputs and output layers to any sequential model you choose and build your own custom model.
192 | - `model_options = {'max_trials':5}`: Always start with a small number of max_trials in model_options dictionary or a dataframe. Start with 5 trials and increase it by 20 each time to see if performance improves. Stop when performance of the model doesn't improve any more. This takes time.
193 | - `model_options = {'cat_feat_cross_flag':True}`: default is False but change it to True and see if adding feature crosses with your categorical features helps improve the model. However, do not do this for a large data set! This will explode the number of features in your model. Be careful!
194 | - `model_options = {'nlp_char_limit':20}`: If you want to run NLP Text preprocessing on any column, set this character limit low and deep_autoviml will then detect that column as an NLP column automatically. The default is 30 chars.
195 | - `keras_options = {"patience":30}`: If you want to reduce Early Stopping, then increase the patience to 30 or higher. Your model will train longer but you might get better performance.
196 | - `use_my_model = my_sequential_model`: If you want to bring your own custom model for training, then define a Keras Sequential model (you can name it anything but for example purposes, we have named it my_sequential_model) but don't include inputs or output layers! Just define your hidden layers! Deep Auto_ViML will automatically add inputs and output layers to your model and train it. It will also save your model after training. You can use this model for predictions.
197 | - `keras_model_type = "image"`: If you want to build a model for image classification, then you can use this option. But you must add the following additional options in model_options dictionary: `model_options = {"image_height":__, "image_width": __, "image_channels": __, "image_directory": __}`. 
198 | - `model_options = {"tf_hub_model": "URL"}`: If you want to use a pre-trained Tensorflow Hub model such as [BERT](https://tfhub.dev/google/collections/bert/1) or a [feature extractor](https://tfhub.dev/google/imagenet/mobilenet_v3_small_100_224/feature_vector/5) for image classification, then you can use its TF Hub model URL by providing it in model_options dictionary as follows: `model_options = {"tf_hub_model": "URL of TF hub model"}`
199 | - `keras_model_type = "BERT"` or `keras_model_type = "USE"`: If you want to use a default [BERT](https://tfhub.dev/google/collections/bert/1) model or a Universal Sentence Encoder model, just set this option to either "BERT" or "USE" and we will load a default small pre-trained model from TF Hub, train it on your dataset and give you back a pipeline with BERT/USE in it! If you want to use some other BERT model than the one we have chosen, please go to Tensorflow Hub and find your model's URL and set `model_options = {"tf_hub_model": "URL of TF hub model"}` and we will train whatever BERT model you have chosen with your data.
200 | 
201 | ## Maintainers
202 | 
203 | * [@AutoViML](https://github.com/AutoViML)
204 | 
205 | ## Contributing
206 | 
207 | See [the contributing file](contributing.md)!
208 | 
209 | PRs accepted.
210 | 
211 | ## License
212 | 
213 | Apache License 2.0 © 2020 Ram Seshadri
214 | 
215 | ## DISCLAIMER
216 | This project is not an official Google project. It is not supported by Google and Google specifically disclaims all warranties as to its quality, merchantability, or fitness for a particular purpose.
217 | 


--------------------------------------------------------------------------------
/code-of-conduct.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of
 9 | experience, education, socio-economic status, nationality, personal appearance,
10 | race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | *   Using welcoming and inclusive language
18 | *   Being respectful of differing viewpoints and experiences
19 | *   Gracefully accepting constructive criticism
20 | *   Focusing on what is best for the community
21 | *   Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | *   The use of sexualized language or imagery and unwelcome sexual attention or
26 |     advances
27 | *   Trolling, insulting/derogatory comments, and personal or political attacks
28 | *   Public or private harassment
29 | *   Publishing others' private information, such as a physical or electronic
30 |     address, without explicit permission
31 | *   Other conduct which could reasonably be considered inappropriate in a
32 |     professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or reject
41 | comments, commits, code, wiki edits, issues, and other contributions that are
42 | not aligned to this Code of Conduct, or to ban temporarily or permanently any
43 | contributor for other behaviors that they deem inappropriate, threatening,
44 | offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when the Project
56 | Steward has a reasonable belief that an individual's behavior may have a
57 | negative impact on the project or its community.
58 | 
59 | ## Conflict Resolution
60 | 
61 | We do not believe that all conflict is bad; healthy debate and disagreement
62 | often yield positive results. However, it is never okay to be disrespectful or
63 | to engage in behavior that violates the project’s code of conduct.
64 | 
65 | If you see someone violating the code of conduct, you are encouraged to address
66 | the behavior directly with those involved. Many issues can be resolved quickly
67 | and easily, and this gives people more control over the outcome of their
68 | dispute. If you are unable to resolve the matter for any reason, or if the
69 | behavior is threatening or harassing, report it. We are dedicated to providing
70 | an environment where participants feel welcome and safe.
71 | 
72 | Reports should be directed to *[PROJECT STEWARD NAME(s) AND EMAIL(s)]*, the
73 | Project Steward(s) for *[PROJECT NAME]*. It is the Project Steward’s duty to
74 | receive and address reported violations of the code of conduct. They will then
75 | work with a committee consisting of representatives from the Open Source
76 | Programs Office and the Google Open Source Strategy team. If for any reason you
77 | are uncomfortable reaching out to the Project Steward, please email
78 | opensource@google.com.
79 | 
80 | We will investigate every complaint, but you may not receive a direct response.
81 | We will use our discretion in determining when and how to follow up on reported
82 | incidents, which may range from not taking action to permanent expulsion from
83 | the project and project-sponsored spaces. We will notify the accused of the
84 | report and provide them an opportunity to discuss it before any action is taken.
85 | The identity of the reporter will be omitted from the details of the report
86 | supplied to the accused. In potentially harmful situations, such as ongoing
87 | harassment or threats to anyone's safety, we may take action without notice.
88 | 
89 | ## Attribution
90 | 
91 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4,
92 | available at
93 | https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
94 | 


--------------------------------------------------------------------------------
/contributing.md:
--------------------------------------------------------------------------------
 1 | # How to Contribute
 2 | 
 3 | We'd love to accept your patches and contributions to this project. There are
 4 | just a few small guidelines you need to follow.
 5 | 
 6 | ## Contributor License Agreement
 7 | 
 8 | Contributions to this project must be accompanied by a Contributor License
 9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 | 
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 | 
18 | ## Code Reviews
19 | 
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 | 
25 | ## Community Guidelines
26 | 
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google/conduct/).
29 | 


--------------------------------------------------------------------------------
/deep_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_1.jpg


--------------------------------------------------------------------------------
/deep_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_2.jpg


--------------------------------------------------------------------------------
/deep_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_3.jpg


--------------------------------------------------------------------------------
/deep_4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_4.jpg


--------------------------------------------------------------------------------
/deep_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_5.jpg


--------------------------------------------------------------------------------
/deep_6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_6.jpg


--------------------------------------------------------------------------------
/deep_7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_7.jpg


--------------------------------------------------------------------------------
/deep_8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_8.jpg


--------------------------------------------------------------------------------
/deep_autoviml/__init__.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | # -*- coding: utf-8 -*-
17 | ################################################################################
18 | #     deep_auto_viml - build and test multiple Tensorflow 2.0 models and pipelines
19 | #     Python v3.6+ tensorflow v2.4.1+
20 | #     Created by Ram Seshadri
21 | #     Licensed under Apache License v2
22 | ################################################################################
23 | # Version
24 | from .__version__ import __version__
25 | __all__ = ['data_load', 'models', 'modeling', 'preprocessing', 'utilities']
26 | import pdb
27 | 
28 | from .deep_autoviml import fit
29 | from deep_autoviml.modeling.predict_model import load_test_data, predict, predict_images, predict_text
30 | from deep_autoviml.utilities.utilities import print_one_row_from_tf_dataset, print_one_row_from_tf_label
31 | from deep_autoviml.utilities.utilities import print_classification_metrics, print_regression_model_stats
32 | from deep_autoviml.utilities.utilities import print_classification_model_stats, plot_history, plot_classification_results
33 | ################################################################################
34 | if __name__ == "__main__":
35 |     module_type = 'Running'
36 | else:
37 |     module_type = 'Imported'
38 | version_number = __version__
39 | print("""
40 | %s deep_auto_viml. version=%s
41 | from deep_autoviml import deep_autoviml as deepauto
42 | -------------------
43 | model, cat_vocab_dict = deepauto.fit(train, target, keras_model_type="fast",
44 | 		project_name="deep_autoviml", keras_options=keras_options,  
45 | 		model_options=model_options, save_model_flag=True, use_my_model='',
46 | 		model_use_case='', verbose=0)
47 | 
48 | predictions = deepauto.predict(model, project_name="deep_autoviml", test_dataset=test,
49 |                                  keras_model_type=keras_model_type, 
50 |                                  cat_vocab_dict=cat_vocab_dict)
51 |                                 """ %(module_type, version_number))
52 | ################################################################################
53 | 


--------------------------------------------------------------------------------
/deep_autoviml/__version__.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | # -*- coding: utf-8 -*-
17 | """Specifies the version of the deep_autoviml package."""
18 | 
19 | __title__ = "deep_autoviml"
20 | __author__ = "Ram Seshadri"
21 | __description__ = "deep_autoviml - build and test multiple Tensorflow 2.0 models and pipelines"
22 | __url__ = "https://github.com/Auto_ViML/deep_autoviml.git"
23 | __version__ = "0.0.85"
24 | __license__ = "Apache License 2.0"
25 | __copyright__ = "2020-21 Google"
26 | 


--------------------------------------------------------------------------------
/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/one_cycle.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/one_cycle.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/train_image_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/train_image_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/__pycache__/train_text_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/modeling/__pycache__/train_text_model.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/modeling/one_cycle.py:
--------------------------------------------------------------------------------
  1 | ############################################################################################
  2 | #Copyright 2021 Google LLC
  3 | 
  4 | #Licensed under the Apache License, Version 2.0 (the "License");
  5 | #you may not use this file except in compliance with the License.
  6 | #You may obtain a copy of the License at
  7 | #
  8 | #    https://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | #Unless required by applicable law or agreed to in writing, software
 11 | #distributed under the License is distributed on an "AS IS" BASIS,
 12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | #See the License for the specific language governing permissions and
 14 | #limitations under the License.
 15 | ############################################################################################
 16 | import tensorflow as tf
 17 | import numpy as np
 18 | import matplotlib.pyplot as plt
 19 | import logging
 20 | 
 21 | logging.getLogger('tensorflow').setLevel(logging.ERROR)
 22 | 
 23 | from tensorflow.keras.callbacks import Callback
 24 | #########################################################################################################
 25 | ######   One Cycle is a Super-Convergence technique developed by Leslie Smith: https://arxiv.org/abs/1708.07120
 26 | ######                Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates
 27 | ######   This particular implementation is by Andrich van Wyk • September 02, 2019
 28 | ######   Used with permission: https://www.avanwyk.com/tensorflow-2-super-convergence-with-the-1cycle-policy/
 29 | #########################################################################################################
 30 | class CosineAnnealer:
 31 |     
 32 |     def __init__(self, start, end, steps):
 33 |         self.start = start
 34 |         self.end = end
 35 |         self.steps = steps
 36 |         self.n = 0
 37 |         
 38 |     def step(self):
 39 |         self.n += 1
 40 |         cos = np.cos(np.pi * (self.n / self.steps)) + 1
 41 |         return self.end + (self.start - self.end) / 2. * cos
 42 | 
 43 | 
 44 | class OneCycleScheduler(Callback):
 45 |     """ 
 46 |     #########################################################################################################
 47 |     ######   One Cycle is a Super-Convergence technique developed by Leslie Smith: https://arxiv.org/abs/1708.07120
 48 |     ######                Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates
 49 |     ######   This particular implementation is by Andrich van Wyk • September 02, 2019
 50 |     ######   Credit: https://www.avanwyk.com/tensorflow-2-super-convergence-with-the-1cycle-policy/
 51 |     #########################################################################################################
 52 |     Callback that schedules the learning rate on a 1cycle policy as per Leslie Smith's paper(https://arxiv.org/pdf/1803.09820.pdf).
 53 |     If the model supports a momentum parameter, it will also be adapted by the schedule.
 54 |     The implementation adopts additional improvements as per the fastai library: https://docs.fast.ai/callbacks.one_cycle.html, where
 55 |     only two phases are used and the adaptation is done using cosine annealing.
 56 |     """
 57 | 
 58 |     def __init__(self, lr_max, steps, mom_min=0.85, mom_max=0.95, phase_1_pct=0.3, div_factor=25.):
 59 |         super(OneCycleScheduler, self).__init__()
 60 |         lr_min = lr_max / div_factor
 61 |         final_lr = lr_max / (div_factor * 1e4)
 62 |         phase_1_steps = steps * phase_1_pct
 63 |         phase_2_steps = steps - phase_1_steps
 64 |         
 65 |         self.phase_1_steps = phase_1_steps
 66 |         self.phase_2_steps = phase_2_steps
 67 |         self.phase = 0
 68 |         self.step = 0
 69 |         
 70 |         self.phases = [[CosineAnnealer(lr_min, lr_max, phase_1_steps), CosineAnnealer(mom_max, mom_min, phase_1_steps)], 
 71 |                  [CosineAnnealer(lr_max, final_lr, phase_2_steps), CosineAnnealer(mom_min, mom_max, phase_2_steps)]]
 72 |         
 73 |         self.lrs = []
 74 |         self.moms = []
 75 | 
 76 |     def on_train_begin(self, logs=None):
 77 |         self.phase = 0
 78 |         self.step = 0
 79 | 
 80 |         self.set_lr(self.lr_schedule().start)
 81 |         self.set_momentum(self.mom_schedule().start)
 82 |         
 83 |     def on_train_batch_begin(self, batch, logs=None):
 84 |         self.lrs.append(self.get_lr())
 85 |         self.moms.append(self.get_momentum())
 86 | 
 87 |     def on_train_batch_end(self, batch, logs=None):
 88 |         self.step += 1
 89 |         if self.step >= self.phase_1_steps:
 90 |             self.phase = 1
 91 |         self.set_lr(self.lr_schedule().step())
 92 |         self.set_momentum(self.mom_schedule().step())
 93 |         
 94 |     def get_lr(self):
 95 |         try:
 96 |             return tf.keras.backend.get_value(self.model.optimizer.lr)
 97 |         except AttributeError:
 98 |             return None
 99 |         
100 |     def get_momentum(self):
101 |         try:
102 |             return tf.keras.backend.get_value(self.model.optimizer.momentum)
103 |         except AttributeError:
104 |             return None
105 |         
106 |     def set_lr(self, lr):
107 |         try:
108 |             if lr < 0:
109 |                 lr = 0.1
110 |                 self.phase = 0
111 |                 self.step = 0
112 | 
113 |                 self.set_lr(self.lr_schedule().start)
114 |                 self.set_momentum(self.mom_schedule().start)
115 |                 tf.keras.backend.clear_session()
116 | 
117 |             tf.keras.backend.set_value(self.model.optimizer.lr, lr)
118 |         except AttributeError:
119 |             pass # ignore
120 |         
121 |     def set_momentum(self, mom):
122 |         try:
123 |             tf.keras.backend.set_value(self.model.optimizer.momentum, mom)
124 |         except AttributeError:
125 |             pass # ignore
126 | 
127 |     def lr_schedule(self):
128 |         return self.phases[self.phase][0]
129 |     
130 |     def mom_schedule(self):
131 |         return self.phases[self.phase][1]
132 |     
133 |     def plot(self):
134 |         ax = plt.subplot(1, 2, 1)
135 |         ax.plot(self.lrs)
136 |         ax.set_title('Learning Rate')
137 |         ax = plt.subplot(1, 2, 2)
138 |         ax.plot(self.moms)
139 |         ax.set_title('Momentum')


--------------------------------------------------------------------------------
/deep_autoviml/modeling/train_image_model.py:
--------------------------------------------------------------------------------
  1 | ############################################################################################
  2 | #Copyright 2021 Google LLC
  3 | 
  4 | #Licensed under the Apache License, Version 2.0 (the "License");
  5 | #you may not use this file except in compliance with the License.
  6 | #You may obtain a copy of the License at
  7 | #
  8 | #    https://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | #Unless required by applicable law or agreed to in writing, software
 11 | #distributed under the License is distributed on an "AS IS" BASIS,
 12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | #See the License for the specific language governing permissions and
 14 | #limitations under the License.
 15 | ############################################################################################
 16 | import pandas as pd
 17 | import numpy as np
 18 | pd.set_option('display.max_columns',500)
 19 | import matplotlib.pyplot as plt
 20 | import tempfile
 21 | import pdb
 22 | import copy
 23 | import warnings
 24 | warnings.filterwarnings(action='ignore')
 25 | import functools
 26 | # Make numpy values easier to read.
 27 | np.set_printoptions(precision=3, suppress=True)
 28 | ############################################################################################
 29 | # TensorFlow ≥2.4 is required
 30 | import tensorflow as tf
 31 | import os
 32 | def set_seed(seed=31415):
 33 |     np.random.seed(seed)
 34 |     tf.random.set_seed(seed)
 35 |     os.environ['PYTHONHASHSEED'] = str(seed)
 36 |     os.environ['TF_DETERMINISTIC_OPS'] = '1'
 37 | from tensorflow.keras import layers
 38 | from tensorflow import keras
 39 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup
 40 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 41 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization, Hashing
 42 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GaussianNoise
 43 | 
 44 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 45 | from tensorflow.keras import layers
 46 | from tensorflow.keras import optimizers
 47 | from tensorflow.keras.models import Model, load_model
 48 | from tensorflow.keras import callbacks
 49 | from tensorflow.keras import backend as K
 50 | from tensorflow.keras import utils
 51 | from tensorflow.keras.layers import BatchNormalization
 52 | from tensorflow.keras.optimizers import SGD
 53 | from tensorflow.keras import regularizers
 54 | #####################################################################################
 55 | # Utils
 56 | from deep_autoviml.utilities.utilities import print_one_row_from_tf_dataset, print_one_row_from_tf_label
 57 | from deep_autoviml.utilities.utilities import print_classification_metrics, print_regression_model_stats
 58 | from deep_autoviml.utilities.utilities import print_classification_model_stats, plot_history, plot_classification_results
 59 | from deep_autoviml.utilities.utilities import plot_one_history_metric
 60 | from deep_autoviml.utilities.utilities import check_if_GPU_exists
 61 | from deep_autoviml.utilities.utilities import save_valid_predictions, predict_plot_images
 62 | 
 63 | from deep_autoviml.data_load.extract import find_batch_size
 64 | from deep_autoviml.modeling.create_model import check_keras_options
 65 | from deep_autoviml.modeling.one_cycle import OneCycleScheduler
 66 | #####################################################################################
 67 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 68 | from IPython.core.display import Image, display
 69 | import pickle
 70 | ##### Suppress all TF2 and TF1.x warnings ###################
 71 | tf2logger = tf.get_logger()
 72 | tf2logger.warning('Silencing TF2.x warnings')
 73 | tf2logger.root.removeHandler(tf2logger.root.handlers)
 74 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 75 | ############################################################################################
 76 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 77 | from tensorflow.keras import Model, Sequential
 78 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 79 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 80 | ############################################################################################
 81 | #### probably the most handy function of all!
 82 | def left_subtract(l1,l2):
 83 |     lst = []
 84 |     for i in l1:
 85 |         if i not in l2:
 86 |             lst.append(i)
 87 |     return lst
 88 | ##############################################################################################
 89 | import time
 90 | import os
 91 | from sklearn.metrics import balanced_accuracy_score, classification_report
 92 | from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
 93 | from collections import defaultdict
 94 | from tensorflow.keras import callbacks
 95 | #############################################################################################
 96 | def train_image_model(deep_model, train_ds, valid_ds, cat_vocab_dict, 
 97 |                       keras_options, model_options, project_name, save_model_flag):
 98 |     epochs = check_keras_options(keras_options, "epochs", 20)
 99 |     save_model_path = model_options['save_model_path']
100 |     tensorboard_logpath = os.path.join(save_model_path,"mylogs")
101 |     print('Tensorboard log directory can be found at: %s' %tensorboard_logpath)
102 |     cp = keras.callbacks.ModelCheckpoint(project_name, save_best_only=True,
103 |                                          save_weights_only=True, save_format='tf')
104 |     es = keras.callbacks.EarlyStopping(monitor=val_monitor, min_delta=0.00001, patience=patience,
105 |                         verbose=1, mode=val_mode, baseline=None, restore_best_weights=True)
106 | 
107 |     tb = keras.callbacks.TensorBoard(log_dir=tensorboard_logpath,
108 |                          histogram_freq=0,
109 |                          write_graph=True,
110 |                          write_images=True,
111 |                          update_freq='epoch',
112 |                          profile_batch=2,
113 |                          embeddings_freq=1
114 |                          )
115 |     callbacks_list = [cp, es, tb]
116 |     print('Training image model. This will take time...')
117 |     history = deep_model.fit(train_ds, epochs=epochs, validation_data=valid_ds,
118 |                 callbacks=callbacks_list)
119 |     result = deep_model.evaluate(valid_ds)
120 |     print('    Model accuracy in Image validation data: %s' %result[1])
121 |     #plot_history(history, "accuracy", 1)
122 |     fig = plt.figure(figsize=(8,6))
123 |     ax1 = plt.subplot(1, 1, 1)
124 |     ax1.set_title('Model Training vs Validation Loss')
125 |     plot_one_history_metric(history, "accuracy", ax1)
126 |     classes = cat_vocab_dict["image_classes"]
127 |     predict_plot_images(deep_model, valid_ds, classes)
128 |     cat_vocab_dict['project_name'] = project_name
129 |     if save_model_flag:
130 |         print('\nSaving model in %s now...this will take time...' %save_model_path)
131 |         if not os.path.exists(save_model_path):
132 |             os.makedirs(save_model_path)
133 |         deep_model.save(save_model_path)
134 |         cat_vocab_dict['saved_model_path'] = save_model_path
135 |         print('     deep_autoviml image_model saved in %s directory' %save_model_path)
136 |     else:
137 |         print('\nModel not being saved since save_model_flag set to False...')
138 |     return deep_model, cat_vocab_dict
139 | 


--------------------------------------------------------------------------------
/deep_autoviml/modeling/train_model.py:
--------------------------------------------------------------------------------
  1 | ############################################################################################
  2 | #Copyright 2021 Google LLC
  3 | 
  4 | #Licensed under the Apache License, Version 2.0 (the "License");
  5 | #you may not use this file except in compliance with the License.
  6 | #You may obtain a copy of the License at
  7 | #
  8 | #    https://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | #Unless required by applicable law or agreed to in writing, software
 11 | #distributed under the License is distributed on an "AS IS" BASIS,
 12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | #See the License for the specific language governing permissions and
 14 | #limitations under the License.
 15 | ############################################################################################
 16 | import pandas as pd
 17 | import numpy as np
 18 | pd.set_option('display.max_columns',500)
 19 | import matplotlib.pyplot as plt
 20 | import tempfile
 21 | import pdb
 22 | import copy
 23 | import warnings
 24 | warnings.filterwarnings(action='ignore')
 25 | import functools
 26 | # Make numpy values easier to read.
 27 | np.set_printoptions(precision=3, suppress=True)
 28 | ############################################################################################
 29 | # TensorFlow ≥2.4 is required
 30 | import tensorflow as tf
 31 | np.random.seed(42)
 32 | tf.random.set_seed(42)
 33 | from tensorflow.keras import layers
 34 | from tensorflow import keras
 35 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup
 36 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 37 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization, Hashing
 38 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
 39 | 
 40 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 41 | from tensorflow.keras import layers
 42 | from tensorflow.keras import optimizers
 43 | from tensorflow.keras.models import Model, load_model
 44 | from tensorflow.keras import callbacks
 45 | from tensorflow.keras import backend as K
 46 | from tensorflow.keras import utils
 47 | from tensorflow.keras.layers import BatchNormalization
 48 | from tensorflow.keras.optimizers import SGD
 49 | from tensorflow.keras import regularizers
 50 | #####################################################################################
 51 | # Utils
 52 | from deep_autoviml.utilities.utilities import print_one_row_from_tf_dataset, print_one_row_from_tf_label
 53 | from deep_autoviml.utilities.utilities import print_classification_metrics, print_regression_model_stats
 54 | from deep_autoviml.utilities.utilities import plot_regression_residuals
 55 | from deep_autoviml.utilities.utilities import print_classification_model_stats, plot_history, plot_classification_results
 56 | from deep_autoviml.utilities.utilities import save_valid_predictions, print_classification_header
 57 | from deep_autoviml.utilities.utilities import get_callbacks, get_chosen_callback
 58 | from deep_autoviml.utilities.utilities import save_model_artifacts
 59 | from deep_autoviml.modeling.create_model import check_keras_options
 60 | 
 61 | #####################################################################################
 62 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 63 | from IPython.core.display import Image, display
 64 | import pickle
 65 | #############################################################################################
 66 | ##### Suppress all TF2 and TF1.x warnings ###################
 67 | tf2logger = tf.get_logger()
 68 | tf2logger.warning('Silencing TF2.x warnings')
 69 | tf2logger.root.removeHandler(tf2logger.root.handlers)
 70 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 71 | ############################################################################################
 72 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 73 | from tensorflow.keras import Model, Sequential
 74 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 75 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 76 | ############################################################################################
 77 | #### probably the most handy function of all!
 78 | def left_subtract(l1,l2):
 79 |     lst = []
 80 |     for i in l1:
 81 |         if i not in l2:
 82 |             lst.append(i)
 83 |     return lst
 84 | ##############################################################################################
 85 | import time
 86 | import os
 87 | from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, roc_auc_score
 88 | import math
 89 | #########################################################################################
 90 | ### Split raw_train_set into train and valid data sets first
 91 | ### This is a better way to split a dataset into train and test ####
 92 | ### It does not assume a pre-defined size for the data set.
 93 | def is_valid(x, y):
 94 |     return x % 5 == 0
 95 | def is_test(x, y):
 96 |     return x % 2 == 0
 97 | def is_train(x, y):
 98 |     return not is_test(x, y)
 99 | ##################################################################################
100 | def train_model(deep_model, full_ds, target, keras_model_type, keras_options,
101 |                  model_options, var_df, cat_vocab_dict, project_name="", save_model_flag=True, 
102 |                  verbose=0 ):
103 |     """
104 |     Given a keras model and a tf.data.dataset that is batched, this function will 
105 |     train a keras model. It will first split the batched_data into train_ds and  
106 |     valid_ds (80/20). Then it will select the right parameters based on model type and 
107 |     train the model and evaluate it on valid_ds. It will return a keras model fully 
108 |     trained on the full batched_data finally and train history.
109 |     """
110 |     ####  just use modeltype for printing that's all ###
111 |     start_time = time.time()
112 |     ### check the defaults for the following!
113 |     save_model_path = model_options['save_model_path']
114 |     save_weights_only = check_keras_options(keras_options, "save_weights_only", False)
115 |     data_size = check_keras_options(keras_options, 'data_size', 10000)
116 |     batch_size = check_keras_options(keras_options, 'batchsize', 64)
117 |     num_classes = model_options["num_classes"]
118 |     num_labels = model_options["num_labels"]
119 |     modeltype = model_options["modeltype"]
120 |     patience = check_keras_options(keras_options, "patience", 10)
121 |     optimizer = keras_options['optimizer']
122 |     class_weights = check_keras_options(keras_options, "class_weight", {})
123 |     if not isinstance(model_options["label_encode_flag"], str):
124 |         if not model_options["label_encode_flag"]:
125 |             print('    removing class weights since label_encode_flag is set to False which means classes can be anything.')
126 |             class_weights = {}
127 |     print('    class_weights: %s' %class_weights)
128 |     cols_len = len([item for sublist in list(var_df.values()) for item in sublist])
129 |     print('    original datasize = %s, initial batchsize = %s' %(data_size, batch_size))
130 |     NUMBER_OF_EPOCHS = check_keras_options(keras_options, "epochs", 100)
131 |     if keras_options['lr_scheduler'] in ['expo', 'ExponentialDecay', 'exponentialdecay']:
132 |         print('    chosen ExponentialDecay learning rate scheduler')
133 |         expo_steps = (NUMBER_OF_EPOCHS*data_size)//batch_size
134 |         learning_rate = keras.optimizers.schedules.ExponentialDecay(0.01, expo_steps, 0.1)
135 |     else:
136 |         learning_rate = check_keras_options(keras_options, "learning_rate", 5e-1)
137 |     steps = max(10, (data_size//(2*batch_size)))
138 |     print('    recommended steps per epoch = %d' %steps)
139 |     onecycle_steps = math.ceil(data_size / batch_size) * NUMBER_OF_EPOCHS
140 |     print('    recommended OneCycle steps = %d' %onecycle_steps)
141 |     STEPS_PER_EPOCH = check_keras_options(keras_options, "steps_per_epoch", 
142 |                         steps)
143 |     #### These can be standard for every keras option that you use layers ######
144 |     kernel_initializer = check_keras_options(keras_options, 'kernel_initializer', 'lecun_normal')
145 |     activation='selu'
146 |     print('    default initializer = %s, default activation = %s' %(kernel_initializer, activation))
147 |     default_optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
148 |     use_bias = check_keras_options(keras_options, 'use_bias', True)
149 |     val_monitor = keras_options['monitor']
150 |     val_mode = keras_options['mode']
151 |     patience = keras_options["patience"]
152 | 
153 |     if keras_options['lr_scheduler'] in ['',"onecycle", "onecycle2"]:
154 |         #### you need to double the amount of patience for onecycle scheduler ##
155 |         print('    Recommended: Increase patience for "onecycle" scheduler')
156 |         patience = patience * 1.0
157 |     callbacks_dict, tb_logpath = get_callbacks(val_mode, val_monitor, patience, learning_rate, 
158 |                             save_weights_only, onecycle_steps, save_model_path)
159 | 
160 |     early_stopping = check_keras_options(keras_options, "early_stopping", False)
161 |     if keras_options['lr_scheduler'] in ['expo', 'ExponentialDecay', 'exponentialdecay']:
162 |         if early_stopping:
163 |             callbacks_list = [callbacks_dict['early_stop'], callbacks_dict['print']]
164 |         else:
165 |             callbacks_list = [callbacks_dict['print']]
166 |     else:
167 |         chosen_callback = get_chosen_callback(callbacks_dict, keras_options)
168 |         if not keras_options['lr_scheduler']:
169 |             print('    chosen keras LR scheduler = default')
170 |         else:
171 |             print('    chosen keras LR scheduler = %s' %keras_options['lr_scheduler'])
172 |         if keras_options['early_stopping']:
173 |             callbacks_list = [chosen_callback,  callbacks_dict['tensor_board'], callbacks_dict['early_stop']]
174 |         else:
175 |             callbacks_list = [chosen_callback, callbacks_dict['tensor_board'], callbacks_dict['print']]
176 | 
177 |     print('    val mode = %s, val monitor = %s, patience = %s' %(val_mode, val_monitor, patience))
178 |     print('    number of epochs = %d, steps per epoch = %d' %(NUMBER_OF_EPOCHS, STEPS_PER_EPOCH))
179 |     ############## Split train into train and validation datasets here ###############
180 |     ##################################################################################
181 |     recover = lambda x,y: y
182 |     print('\nSplitting train into 80+20 percent: train and validation data')
183 |     valid_ds1 = full_ds.enumerate().filter(is_valid).map(recover)
184 |     train_ds = full_ds.enumerate().filter(is_train).map(recover)
185 |     heldout_ds1 = valid_ds1
186 |     ##################################################################################
187 |     valid_ds = heldout_ds1.enumerate().filter(is_test).map(recover)
188 |     heldout_ds = heldout_ds1.enumerate().filter(is_test).map(recover)
189 |     print('    Splitting validation 20 into 10+10 percent: valid and heldout data')
190 |     ##################################################################################
191 |     ###   V E R Y    I M P O R T A N T  S T E P   B E F O R E   M O D E L   F I T  ###    
192 |     ##################################################################################
193 |     shuffle_size = int(data_size)
194 |     #shuffle_size = 100000
195 |     print('    shuffle size = %d' %shuffle_size)
196 |     train_ds = train_ds.cache().prefetch(batch_size).shuffle(shuffle_size, 
197 |                             reshuffle_each_iteration=False, seed=42)#.repeat()
198 |     valid_ds = valid_ds.prefetch(batch_size)#.repeat()
199 | 
200 |     print('Model training with best hyperparameters for %d epochs' %NUMBER_OF_EPOCHS)
201 |     for each_callback in callbacks_list:
202 |         print('    Callback added: %s' %str(each_callback).split(".")[-1])
203 |     
204 |     ############################    M O D E L     T R A I N I N G   ##################
205 |     np.random.seed(42)
206 |     tf.random.set_seed(42)
207 |     history = deep_model.fit(train_ds, validation_data=valid_ds, class_weight=class_weights,
208 |                     epochs=NUMBER_OF_EPOCHS, #steps_per_epoch=STEPS_PER_EPOCH, 
209 |                     callbacks=callbacks_list, #validation_steps=STEPS_PER_EPOCH,
210 |                    shuffle=False)
211 | 
212 |     print('    Model training completed. Following metrics available: %s' %history.history.keys())
213 |     try:
214 |         ##### this is where it stopped - you have toi subtract patience from it
215 |         stopped_epoch = max(5,int(pd.DataFrame(history.history).shape[0] - patience))
216 |     except:
217 |         stopped_epoch = 100
218 | 
219 |     print('Time taken to train model (in mins) = %0.0f' %((time.time()-start_time)/60))
220 | 
221 |     #### train the model on full train data set now ###
222 |     start_time = time.time()
223 |     print('    Stopped epoch = %s' %stopped_epoch)
224 | 
225 |     ##################################################################################
226 |     #######        S A V E the model here using save_model_name      #################
227 |     ##################################################################################
228 |     
229 |     save_model_artifacts(deep_model, cat_vocab_dict, var_df, save_model_path, 
230 |                         save_model_flag, model_options)
231 |     print() ### just create an extra line after saving that is all 
232 |     
233 |     #################################################################################
234 |     ########     P R E D I C T   O N   H E L D   O U T  D A T A   H E R E      ######
235 |     #################################################################################
236 |     try:
237 |         if num_labels <= 1:
238 |             y_test = np.concatenate(list(heldout_ds.map(lambda x,y: y).as_numpy_iterator()))
239 |             print('    Single-Label: Heldout data shape: %s' %(y_test.shape,))
240 |         else:
241 |             iters = int(data_size/batch_size) + 1
242 |             for inum, each_target in enumerate(target):
243 |                 add_ls = []
244 |                 for feats, labs in heldout_ds.take(iters): 
245 |                     add_ls.append(list(labs[each_target].numpy()))
246 |                 flat_list = [item for sublist in add_ls for item in sublist]
247 |                 if inum == 0:
248 |                     each_array = np.array(flat_list)
249 |                 else:
250 |                     each_array = np.c_[each_array, np.array(flat_list)]
251 |             y_test = copy.deepcopy(each_array)
252 |             print('    Multi-Label: Heldout data shape: %s' %(y_test.shape,))
253 |         scores = []
254 |         ls = []
255 |         if verbose >= 1:
256 |             try:
257 |                 print_one_row_from_tf_label(heldout_ds)
258 |             except:
259 |                 print('could not print samples from heldout ds labels')
260 |         ###########################################################################
261 |     except:
262 |         print('Model erroring on heldout_ds predictions. Returning with model and artifacts dictionary.')
263 |         return deep_model, cat_vocab_dict
264 |         
265 |     y_probas = deep_model.predict(heldout_ds)
266 |     
267 |     if isinstance(target, str):
268 |         if modeltype != 'Regression':
269 |             y_test_preds = y_probas.argmax(axis=1)
270 |         else:
271 |             if y_test.dtype == 'int':
272 |                 y_test_preds = y_probas.round().astype(int)
273 |             else:
274 |                 y_test_preds = y_probas.ravel()
275 |     else:
276 |         if modeltype != 'Regression':
277 |             #### This is for multi-label binary or multi-class problems ##
278 |             for each_t in range(len(target)):
279 |                 if each_t == 0:
280 |                     y_test_preds = y_probas[each_t].argmax(axis=1).astype(int)
281 |                 else:
282 |                     y_test_preds = np.c_[y_test_preds, y_probas[each_t].argmax(axis=1).astype(int)]
283 |         else:
284 |             ### This is for Multi-Label Regression ###
285 |             for each_t in range(len(target)):
286 |                 if each_t == 0:
287 |                     y_test_preds = y_probas[each_t].mean(axis=1)
288 |                 else:
289 |                     y_test_preds = np.c_[y_test_preds, y_probas[each_t].mean(axis=1)]
290 |                 if y_test.dtype == 'int':
291 |                     y_test_preds = y_test_preds.round().astype(int)
292 | 
293 |     print('\nHeld out predictions shape:%s' %(y_test_preds.shape,))
294 |     if verbose >= 1:
295 |         if modeltype != 'Regression':
296 |             print('    Sample predictions: %s' %y_test_preds[:10])
297 |         else:
298 |             if num_labels == 1:
299 |                 print('    Sample predictions: %s' %y_test_preds.ravel()[:10])
300 |             else:
301 |                 print('    Sample predictions:\n%s' %y_test_preds[:10])
302 | 
303 |     #################################################################################
304 |     ########     P L O T T I N G   V A L I D A T I O N   R E S U L T S         ######
305 |     #################################################################################
306 |     ###  Plot the epochs and loss metrics here #####################    
307 |     try:
308 |         #print('    Additionally, Tensorboard logs can be found here: %s' %tb_logpath)
309 |         if modeltype == 'Regression':
310 |             plot_history(history, val_monitor[4:], target)
311 |         elif modeltype == 'Classification':
312 |             plot_history(history, val_monitor[4:], target)
313 |         else:
314 |             plot_history(history, val_monitor[4:], target)
315 |     except:
316 |         print('    Plot history is erroring. Tensorboard logs can be found here: %s' %tb_logpath)
317 | 
318 |     print('\n###########################################################')
319 |     print('         Held-out test data set Results:')
320 |     num_labels = cat_vocab_dict['num_labels']
321 |     num_classes = cat_vocab_dict['num_classes']
322 |     if num_labels <= 1:
323 |         #### This is for Single-Label Problems only ################################
324 |         if modeltype == 'Regression':
325 |             print_regression_model_stats(y_test, y_test_preds,target,plot_name=project_name)
326 |             ### plot the regression results here #########
327 |             plot_regression_residuals(y_test, y_test_preds, target, project_name, num_labels)
328 |         else:
329 |             print_classification_header(num_classes, num_labels, target)
330 |             labels = cat_vocab_dict['original_classes']
331 |             if cat_vocab_dict['target_transformed']:
332 |                 target_names = cat_vocab_dict['transformed_classes']
333 |                 target_le = cat_vocab_dict['target_le']
334 |                 y_pred = y_probas.argmax(axis=1)
335 |                 y_test_trans = target_le.inverse_transform(y_test)
336 |                 y_pred_trans = target_le.inverse_transform(y_pred)
337 |                 plot_classification_results(y_test_trans, y_pred_trans, labels, labels, target)
338 |             else:
339 |                 y_pred = y_probas.argmax(axis=1)
340 |                 plot_classification_results(y_test, y_pred, labels, labels, target)
341 |             print_classification_metrics(y_test, y_probas, proba_flag=True)
342 |     else:
343 |         if modeltype == 'Regression':
344 |             #### This is for Multi-Label Regression ################################
345 |             print_regression_model_stats(y_test, y_test_preds,target,plot_name=project_name)
346 |             ### plot the regression results here #########
347 |             plot_regression_residuals(y_test, y_test_preds, target, project_name, num_labels)
348 |         else:
349 |             #### This is for Multi-Label Classification ################################
350 |             try:
351 |                 targets = cat_vocab_dict["target_variables"]
352 |                 for i, each_target in enumerate(targets):
353 |                     print_classification_header(num_classes, num_labels, each_target)
354 |                     labels = cat_vocab_dict[each_target+'_original_classes']
355 |                     if cat_vocab_dict['target_transformed']:
356 |                         ###### Use a nice classification matrix printing module here #########
357 |                         target_names = cat_vocab_dict[each_target+'_transformed_classes']
358 |                         target_le = cat_vocab_dict['target_le'][i]
359 |                         y_pred = y_probas[i].argmax(axis=1)
360 |                         y_test_trans = target_le.inverse_transform(y_test[:,i])
361 |                         y_pred_trans = target_le.inverse_transform(y_pred)
362 |                         labels = np.unique(y_test_trans) ### sometimes there is less classes
363 |                         plot_classification_results(y_test_trans, y_pred_trans, labels, labels, each_target)
364 |                     else:
365 |                         y_pred = y_probas[i].argmax(axis=1)
366 |                         labels = np.unique(y_test[:,i]) ### sometimes there are fewer classes ##
367 |                         plot_classification_results(y_test[:,i], y_pred, labels, labels, each_target)
368 |                     print_classification_metrics(y_test[:,i], y_probas[i], proba_flag=True)
369 |                     #### This prints additional metrics #############
370 |                     print(classification_report(y_test[:,i],y_test_preds[:,i]))
371 |                     print(confusion_matrix(y_test[:,i], y_test_preds[:,i]))
372 |             except:
373 |                 print_classification_metrics(y_test, y_test_preds, False)
374 |                 print(classification_report(y_test, y_test_preds ))
375 | 
376 |     ##################################################################################
377 |     ###   V E R Y    I M P O R T A N T  S T E P   B E F O R E   M O D E L   F I T  ###    
378 |     ##################################################################################
379 |     print('\nTraining on full train dataset for %d epochs. This will take time...' %stopped_epoch)
380 |     full_ds = full_ds.cache().shuffle(shuffle_size).prefetch(batch_size) #.repeat()
381 |     #heldout_ds = heldout_ds.shuffle(shuffle_size).prefetch(batch_size)
382 |     deep_model.fit(full_ds, epochs=stopped_epoch, #steps_per_epoch=STEPS_PER_EPOCH,
383 |                  class_weight=class_weights, verbose=0)
384 | 
385 |     print('    completed. Time taken (in mins) = %0.0f' %((time.time()-start_time)/100))
386 | 
387 |     return deep_model, cat_vocab_dict
388 | ######################################################################################
389 | 


--------------------------------------------------------------------------------
/deep_autoviml/modeling/train_text_model.py:
--------------------------------------------------------------------------------
  1 | ############################################################################################
  2 | #Copyright 2021 Google LLC
  3 | 
  4 | #Licensed under the Apache License, Version 2.0 (the "License");
  5 | #you may not use this file except in compliance with the License.
  6 | #You may obtain a copy of the License at
  7 | #
  8 | #    https://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | #Unless required by applicable law or agreed to in writing, software
 11 | #distributed under the License is distributed on an "AS IS" BASIS,
 12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | #See the License for the specific language governing permissions and
 14 | #limitations under the License.
 15 | ############################################################################################
 16 | import pandas as pd
 17 | import numpy as np
 18 | pd.set_option('display.max_columns',500)
 19 | import matplotlib.pyplot as plt
 20 | import tempfile
 21 | import pdb
 22 | import copy
 23 | import warnings
 24 | warnings.filterwarnings(action='ignore')
 25 | import functools
 26 | # Make numpy values easier to read.
 27 | np.set_printoptions(precision=3, suppress=True)
 28 | ############################################################################################
 29 | # TensorFlow ≥2.4 is required
 30 | import tensorflow as tf
 31 | import os
 32 | def set_seed(seed=31415):
 33 |     np.random.seed(seed)
 34 |     tf.random.set_seed(seed)
 35 |     os.environ['PYTHONHASHSEED'] = str(seed)
 36 |     os.environ['TF_DETERMINISTIC_OPS'] = '1'
 37 | from tensorflow.keras import layers
 38 | from tensorflow import keras
 39 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup
 40 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 41 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization, Hashing
 42 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GaussianNoise
 43 | 
 44 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 45 | from tensorflow.keras import layers
 46 | from tensorflow.keras import optimizers
 47 | from tensorflow.keras.models import Model, load_model
 48 | from tensorflow.keras import callbacks
 49 | from tensorflow.keras import backend as K
 50 | from tensorflow.keras import utils
 51 | from tensorflow.keras.layers import BatchNormalization
 52 | from tensorflow.keras.optimizers import SGD
 53 | from tensorflow.keras import regularizers
 54 | #####################################################################################
 55 | # Utils
 56 | from deep_autoviml.utilities.utilities import print_one_row_from_tf_dataset, print_one_row_from_tf_label
 57 | from deep_autoviml.utilities.utilities import print_classification_metrics, print_regression_model_stats
 58 | from deep_autoviml.utilities.utilities import print_classification_model_stats, plot_history, plot_classification_results
 59 | from deep_autoviml.utilities.utilities import plot_one_history_metric
 60 | from deep_autoviml.utilities.utilities import check_if_GPU_exists
 61 | from deep_autoviml.utilities.utilities import save_valid_predictions, predict_plot_images
 62 | 
 63 | from deep_autoviml.data_load.extract import find_batch_size
 64 | from deep_autoviml.modeling.create_model import check_keras_options
 65 | from deep_autoviml.modeling.one_cycle import OneCycleScheduler
 66 | #####################################################################################
 67 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 68 | from IPython.core.display import Image, display
 69 | import pickle
 70 | #############################################################################################
 71 | ##### Suppress all TF2 and TF1.x warnings ###################
 72 | try:
 73 |     tf.logging.set_verbosity(tf.logging.ERROR)
 74 | except:
 75 |     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 76 | ############################################################################################
 77 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 78 | from tensorflow.keras import Model, Sequential
 79 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 80 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 81 | ############################################################################################
 82 | #### probably the most handy function of all!
 83 | def left_subtract(l1,l2):
 84 |     lst = []
 85 |     for i in l1:
 86 |         if i not in l2:
 87 |             lst.append(i)
 88 |     return lst
 89 | ##############################################################################################
 90 | import time
 91 | import os
 92 | from sklearn.metrics import balanced_accuracy_score, classification_report
 93 | from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
 94 | from collections import defaultdict
 95 | from tensorflow.keras import callbacks
 96 | #############################################################################################
 97 | def train_text_model(deep_model, train_ds, valid_ds, cat_vocab_dict,
 98 |                       keras_options, model_options, project_name, save_model_flag):
 99 |     epochs = check_keras_options(keras_options, "epochs", 20)
100 |     save_model_path = model_options['save_model_path']
101 |     tensorboard_logpath = os.path.join(save_model_path,"mylogs")
102 |     print('Tensorboard log directory can be found at: %s' %tensorboard_logpath)
103 |     cp = keras.callbacks.ModelCheckpoint(project_name, save_best_only=True,
104 |                                          save_weights_only=True, save_format='tf')
105 |     ### sometimes a model falters and restore_best_weights gives len() not found error. So avoid True option!
106 |     val_mode = "max"
107 |     val_monitor = "val_accuracy"
108 |     patience = check_keras_options(keras_options, "patience", 10)
109 | 
110 |     es = keras.callbacks.EarlyStopping(monitor=val_monitor, min_delta=0.00001, patience=patience,
111 |                         verbose=1, mode=val_mode, baseline=None, restore_best_weights=True)
112 | 
113 |     tb = keras.callbacks.TensorBoard(log_dir=tensorboard_logpath,
114 |                          histogram_freq=0,
115 |                          write_graph=True,
116 |                          write_images=True,
117 |                          update_freq='epoch',
118 |                          profile_batch=2,
119 |                          embeddings_freq=1
120 |                          )
121 |     callbacks_list = [cp, es, tb]
122 |     print('Training text model. This will take time...')
123 |     history = deep_model.fit(train_ds, epochs=epochs, validation_data=valid_ds,
124 |                 callbacks=callbacks_list)
125 |     result = deep_model.evaluate(valid_ds)
126 |     print('    Model accuracy in text validation data: %s' %result[1])
127 |     #plot_history(history, "accuracy", 1)
128 |     fig = plt.figure(figsize=(8,6))
129 |     ax1 = plt.subplot(1, 1, 1)
130 |     ax1.set_title('Model Training vs Validation Loss')
131 |     plot_one_history_metric(history, "accuracy", ax1)
132 |     classes = cat_vocab_dict["text_classes"]
133 |     loss, accuracy = deep_model.evaluate(valid_ds)
134 |     print("Loss: ", loss)
135 |     print("Accuracy: ", accuracy)
136 |     cat_vocab_dict['project_name'] = project_name
137 |     if save_model_flag:
138 |         print('\nSaving model. This will take time...' )
139 |         if not os.path.exists(save_model_path):
140 |             os.makedirs(save_model_path)
141 |         deep_model.save(save_model_path)
142 |         cat_vocab_dict['saved_model_path'] = save_model_path
143 |         print('     deep_autoviml text saved in %s directory' %save_model_path)
144 |     else:
145 |         print('\nModel not being saved since save_model_flag set to False...')
146 |     return deep_model, cat_vocab_dict
147 | #################################################################################
148 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/basic.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/basic.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/big_deep.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/big_deep.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/cnn1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/cnn1.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/cnn2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/cnn2.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/deep.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/deep.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/deep_and_wide.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/deep_and_wide.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/deep_nn.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/deep_nn.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/dnn.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/dnn.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/dnn_drop.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/dnn_drop.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/giant_deep.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/giant_deep.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/reg_dnn.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/reg_dnn.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/simple_dnn.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/simple_dnn.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/__pycache__/tf_hub_lookup.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/models/__pycache__/tf_hub_lookup.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/models/basic.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization, Activation
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | from functools import partial
39 | 
40 | RegDense = partial(Dense, kernel_initializer="he_normal", kernel_regularizer=keras.regularizers.l2(0.01))
41 | 
42 | model = Sequential([
43 |     BatchNormalization(),
44 |     Activation("elu"),
45 |     RegDense(100),
46 |     BatchNormalization(),
47 |     Activation("elu"),
48 |     RegDense(100),
49 |     Activation("elu"),
50 |     RegDense(100),
51 | ]);
52 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/cnn1.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | model = tf.keras.Sequential()
39 | model.add(Reshape((-1, 1)))  ### you need to make input as 3-D for CNN models
40 | #model.add(Conv1D(100, 32, name='conv1', padding="same", activation="relu", strides=2,  data_format='channels_first'))
41 | model.add(Conv1D(100, 32, name='conv1', padding="same", activation="relu", strides=2,  data_format='channels_last'))
42 | model.add(MaxPooling1D(pool_size=5))
43 | model.add(Dropout(0.5))
44 | model.add(Reshape((-1, 1)))  ### you need to make input as 3-D for CNN models
45 | #model.add(Conv1D(64, 16, name='conv2', padding="same", activation="relu", strides=2,  data_format='channels_first'))
46 | model.add(Conv1D(64, 16, name='conv2', padding="same", activation="relu", strides=2,  data_format='channels_last'))
47 | model.add(GlobalAveragePooling1D())
48 | model.add(Dropout(0.5))
49 | model.add(layers.Flatten())
50 | model.add(layers.Dense(32, activation="relu"))
51 | model.add(layers.Dropout(0.25))
52 | 
53 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/cnn2.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | model = tf.keras.Sequential([
40 |     layers.Reshape((-1, 1)),  ### you need to make input as 3-D for CNN models
41 |     layers.Conv1D(100, 64, padding="same", activation="relu", strides=3),
42 |     layers.GlobalMaxPooling1D(),
43 |     layers.Dropout(0.5),
44 |     layers.Reshape((-1, 1)), ### you need to make input as 3-D for CNN models
45 |     layers.Conv1D(64, 32, padding="same", activation="relu", strides=3),
46 |     layers.GlobalMaxPooling1D(),
47 |     layers.Dropout(0.2),
48 |     layers.Flatten(),
49 |     layers.Dense(32, activation="relu"),
50 |     layers.Dropout(0.25),
51 |     ])
52 | 
53 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/deep_and_wide.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | model = models.Sequential([
40 |                         BatchNormalization(),
41 | 						Dropout(0.5),
42 |                         layers.Dense(128, activation='relu', kernel_initializer='he_normal'),
43 | 						BatchNormalization(),
44 | 						Dropout(0.5),
45 |                         layers.Dense(64, activation='relu', kernel_initializer='he_normal'),
46 | 						BatchNormalization(),
47 | 						Dropout(0.2),
48 |                         ])


--------------------------------------------------------------------------------
/deep_autoviml/models/dnn.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | model = Sequential([
40 |     BatchNormalization(),
41 |     Activation("elu"),
42 |     Dense(200),
43 |     BatchNormalization(),
44 |     Activation("elu"),
45 |     Dense(200),
46 |     Activation("elu"),
47 |     Dense(200),
48 | ]);
49 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/dnn_drop.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | model = models.Sequential([
40 |                         BatchNormalization(),
41 | 						Dropout(0.5),
42 |                         layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
43 | 						BatchNormalization(),
44 | 						Dropout(0.5),
45 |                         layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
46 | 						BatchNormalization(),
47 | 						Dropout(0.2),
48 |                         layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
49 | 						BatchNormalization(),
50 | 						Dropout(0.2),
51 |                         ])


--------------------------------------------------------------------------------
/deep_autoviml/models/giant_deep.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | model = models.Sequential([
40 |     layers.BatchNormalization(),
41 |     layers.Dropout(0.5),
42 |     layers.Dense(300, activation='relu', use_bias=True,
43 |                  kernel_initializer='he_normal'),
44 |     layers.BatchNormalization(),
45 |     layers.Dropout(0.50),
46 |     layers.Dense(200, activation='relu', use_bias=True,
47 |                  kernel_initializer='he_normal'),
48 |     layers.BatchNormalization(),
49 |     layers.Dropout(0.25),
50 |     layers.Dense(100, activation='relu', use_bias=True,
51 |                  kernel_initializer='he_normal')
52 |         ])
53 | 
54 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/gru1.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GRU, LeakyReLU
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | def make_gru(model_options):
40 |     '''
41 |     Author: Adarsh C
42 |     Date created: 30/01/2022
43 |     Date last modified: 30/01/2022
44 |     contact: chekodu.adarsh@gmail.com
45 |     Inputs:
46 |     model_options: contains important model hyper parameters
47 |     '''
48 |     model = tf.keras.Sequential()
49 |     model.add(GRU(128, input_shape= (model_options['window_length'], len(model_options['features'])), return_sequences=True))
50 |     model.add(LeakyReLU(alpha=0.5))
51 |     model.add(GRU(128, return_sequences=True))
52 |     model.add(LeakyReLU(alpha=0.5)) 
53 |     model.add(Dropout(0.3)) 
54 |     model.add(GRU(64, return_sequences=False))
55 |     model.add(Dropout(0.3)) 
56 |     model.add(Dense(1))
57 |     return model


--------------------------------------------------------------------------------
/deep_autoviml/models/lstm1.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, LSTM, LeakyReLU
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | def make_lstm(model_options):
40 |     '''
41 |     Author: Adarsh C
42 |     Date created: 30/01/2022
43 |     Date last modified: 30/01/2022
44 |     contact: chekodu.adarsh@gmail.com
45 |     Inputs:
46 |     model_options: contains important model hyper parameters
47 |     '''
48 | 
49 |     # Source:   https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
50 |     # Source_Author: https://github.com/srivatsan88
51 |     
52 |     model = tf.keras.Sequential()
53 |     model.add(LSTM(128, input_shape= (model_options['window_length'], len(model_options['features'])), return_sequences=True))
54 |     model.add(LeakyReLU(alpha=0.5))
55 |     model.add(LSTM(128, return_sequences=True))
56 |     model.add(LeakyReLU(alpha=0.5)) 
57 |     model.add(Dropout(0.3)) 
58 |     model.add(LSTM(64, return_sequences=False))
59 |     model.add(Dropout(0.3)) 
60 |     model.add(Dense(1))
61 |     return model


--------------------------------------------------------------------------------
/deep_autoviml/models/reg_dnn.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | from functools import partial
39 | 
40 | RegDense = partial(Dense, kernel_initializer="he_normal", kernel_regularizer=keras.regularizers.l2(0.01))
41 | 
42 | model = Sequential([
43 |     BatchNormalization(),
44 |     Activation("elu"),
45 |     RegDense(200),
46 |     BatchNormalization(),
47 |     Activation("elu"),
48 |     RegDense(200),
49 |     Activation("elu"),
50 |     RegDense(200),
51 | ]);
52 | 


--------------------------------------------------------------------------------
/deep_autoviml/models/rnn1.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import tensorflow as tf
17 | from tensorflow import keras
18 | #### Make sure it is Tensorflow 2.4 or greater!
19 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
20 | from tensorflow.keras import layers
21 | from tensorflow.keras import optimizers
22 | from tensorflow.keras import models
23 | from tensorflow.keras import callbacks
24 | from tensorflow.keras import backend as K
25 | from tensorflow.keras import utils
26 | from tensorflow.keras import layers
27 | from tensorflow.keras.layers import BatchNormalization
28 | from tensorflow.keras.optimizers import SGD
29 | from tensorflow.keras import regularizers
30 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
31 | from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
32 | from tensorflow.keras import Model, Sequential
33 | from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, SimpleRNN, LeakyReLU
34 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
35 | from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
36 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
37 | ############################################################################################
38 | 
39 | 
40 | def make_rnn(model_options):
41 |     '''
42 |     Author: Adarsh C
43 |     Date created: 30/01/2022
44 |     Date last modified: 30/01/2022
45 |     contact: chekodu.adarsh@gmail.com
46 |     Inputs:
47 |     model_options: contains important model hyper parameters
48 |     '''
49 |     model = tf.keras.Sequential()
50 |     model.add(SimpleRNN(128, input_shape= (model_options['window_length'], len(model_options['features'])), return_sequences=True))
51 |     model.add(LeakyReLU(alpha=0.5))
52 |     model.add(SimpleRNN(128, return_sequences=True))
53 |     model.add(LeakyReLU(alpha=0.5)) 
54 |     model.add(Dropout(0.3)) 
55 |     model.add(SimpleRNN(64, return_sequences=False))
56 |     model.add(Dropout(0.3)) 
57 |     model.add(Dense(1))
58 |     return model


--------------------------------------------------------------------------------
/deep_autoviml/models/tf_hub_lookup.py:
--------------------------------------------------------------------------------
  1 | map_name_to_handle = {
  2 |     'bert_en_uncased_L-12_H-768_A-12':
  3 |         'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
  4 |     'bert_en_cased_L-12_H-768_A-12':
  5 |         'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
  6 |     'bert_multi_cased_L-12_H-768_A-12':
  7 |         'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
  8 |     'small_bert/bert_en_uncased_L-2_H-128_A-2':
  9 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2',
 10 |     'small_bert/bert_en_uncased_L-2_H-256_A-4':
 11 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/2',
 12 |     'small_bert/bert_en_uncased_L-2_H-512_A-8':
 13 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/2',
 14 |     'small_bert/bert_en_uncased_L-2_H-768_A-12':
 15 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/2',
 16 |     'small_bert/bert_en_uncased_L-4_H-128_A-2':
 17 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2',
 18 |     'small_bert/bert_en_uncased_L-4_H-256_A-4':
 19 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/2',
 20 |     'small_bert/bert_en_uncased_L-4_H-512_A-8':
 21 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2',
 22 |     'small_bert/bert_en_uncased_L-4_H-768_A-12':
 23 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/2',
 24 |     'small_bert/bert_en_uncased_L-6_H-128_A-2':
 25 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/2',
 26 |     'small_bert/bert_en_uncased_L-6_H-256_A-4':
 27 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/2',
 28 |     'small_bert/bert_en_uncased_L-6_H-512_A-8':
 29 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/2',
 30 |     'small_bert/bert_en_uncased_L-6_H-768_A-12':
 31 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/2',
 32 |     'small_bert/bert_en_uncased_L-8_H-128_A-2':
 33 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/2',
 34 |     'small_bert/bert_en_uncased_L-8_H-256_A-4':
 35 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/2',
 36 |     'small_bert/bert_en_uncased_L-8_H-512_A-8':
 37 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/2',
 38 |     'small_bert/bert_en_uncased_L-8_H-768_A-12':
 39 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/2',
 40 |     'small_bert/bert_en_uncased_L-10_H-128_A-2':
 41 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/2',
 42 |     'small_bert/bert_en_uncased_L-10_H-256_A-4':
 43 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/2',
 44 |     'small_bert/bert_en_uncased_L-10_H-512_A-8':
 45 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/2',
 46 |     'small_bert/bert_en_uncased_L-10_H-768_A-12':
 47 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/2',
 48 |     'small_bert/bert_en_uncased_L-12_H-128_A-2':
 49 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/2',
 50 |     'small_bert/bert_en_uncased_L-12_H-256_A-4':
 51 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/2',
 52 |     'small_bert/bert_en_uncased_L-12_H-512_A-8':
 53 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2',
 54 |     'small_bert/bert_en_uncased_L-12_H-768_A-12':
 55 |         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/2',
 56 |     'albert_en_base':
 57 |         'https://tfhub.dev/tensorflow/albert_en_base/2',
 58 |     'electra_small':
 59 |         'https://tfhub.dev/google/electra_small/2',
 60 |     'electra_base':
 61 |         'https://tfhub.dev/google/electra_base/2',
 62 |     'experts_pubmed':
 63 |         'https://tfhub.dev/google/experts/bert/pubmed/2',
 64 |     'experts_wiki_books':
 65 |         'https://tfhub.dev/google/experts/bert/wiki_books/2',
 66 |     'talking-heads_base':
 67 |         'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/2',
 68 | }
 69 | 
 70 | map_hub_to_name = dict([(v,k) for (k,v) in map_name_to_handle.items()])
 71 | 
 72 | map_name_to_preprocess = {
 73 |     'bert_en_uncased_L-12_H-768_A-12':
 74 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 75 |     'bert_en_cased_L-12_H-768_A-12':
 76 |         'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
 77 |     'small_bert/bert_en_uncased_L-2_H-128_A-2':
 78 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 79 |     'small_bert/bert_en_uncased_L-2_H-256_A-4':
 80 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 81 |     'small_bert/bert_en_uncased_L-2_H-512_A-8':
 82 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 83 |     'small_bert/bert_en_uncased_L-2_H-768_A-12':
 84 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 85 |     'small_bert/bert_en_uncased_L-4_H-128_A-2':
 86 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 87 |     'small_bert/bert_en_uncased_L-4_H-256_A-4':
 88 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 89 |     'small_bert/bert_en_uncased_L-4_H-512_A-8':
 90 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 91 |     'small_bert/bert_en_uncased_L-4_H-768_A-12':
 92 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 93 |     'small_bert/bert_en_uncased_L-6_H-128_A-2':
 94 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 95 |     'small_bert/bert_en_uncased_L-6_H-256_A-4':
 96 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 97 |     'small_bert/bert_en_uncased_L-6_H-512_A-8':
 98 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
 99 |     'small_bert/bert_en_uncased_L-6_H-768_A-12':
100 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
101 |     'small_bert/bert_en_uncased_L-8_H-128_A-2':
102 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
103 |     'small_bert/bert_en_uncased_L-8_H-256_A-4':
104 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
105 |     'small_bert/bert_en_uncased_L-8_H-512_A-8':
106 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
107 |     'small_bert/bert_en_uncased_L-8_H-768_A-12':
108 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
109 |     'small_bert/bert_en_uncased_L-10_H-128_A-2':
110 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
111 |     'small_bert/bert_en_uncased_L-10_H-256_A-4':
112 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
113 |     'small_bert/bert_en_uncased_L-10_H-512_A-8':
114 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
115 |     'small_bert/bert_en_uncased_L-10_H-768_A-12':
116 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
117 |     'small_bert/bert_en_uncased_L-12_H-128_A-2':
118 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
119 |     'small_bert/bert_en_uncased_L-12_H-256_A-4':
120 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
121 |     'small_bert/bert_en_uncased_L-12_H-512_A-8':
122 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
123 |     'small_bert/bert_en_uncased_L-12_H-768_A-12':
124 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
125 |     'bert_multi_cased_L-12_H-768_A-12':
126 |         'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
127 |     'albert_en_base':
128 |         'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
129 |     'electra_small':
130 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
131 |     'electra_base':
132 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
133 |     'experts_pubmed':
134 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
135 |     'experts_wiki_books':
136 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
137 |     'talking-heads_base':
138 |         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
139 | }
140 | 


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/preprocessing.py:
--------------------------------------------------------------------------------
  1 | #Copyright 2021 Google LLC
  2 | 
  3 | #Licensed under the Apache License, Version 2.0 (the "License");
  4 | #you may not use this file except in compliance with the License.
  5 | #You may obtain a copy of the License at
  6 | #
  7 | #    https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | #Unless required by applicable law or agreed to in writing, software
 10 | #distributed under the License is distributed on an "AS IS" BASIS,
 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | #See the License for the specific language governing permissions and
 13 | #limitations under the License.
 14 | ############################################################################################
 15 | import pandas as pd
 16 | import numpy as np
 17 | import matplotlib.pyplot as plt
 18 | import tempfile
 19 | import pdb
 20 | import copy
 21 | import warnings
 22 | warnings.filterwarnings(action='ignore')
 23 | import functools
 24 | # Make numpy values easier to read.
 25 | np.set_printoptions(precision=3, suppress=True)
 26 | from collections import defaultdict
 27 | import os
 28 | ############################################################################################
 29 | # data pipelines and feature engg here
 30 | from deep_autoviml.preprocessing.preprocessing_tabular import preprocessing_tabular
 31 | from deep_autoviml.preprocessing.preprocessing_nlp import preprocessing_nlp, aggregate_nlp_dictionaries
 32 | from deep_autoviml.preprocessing.preprocessing_tabular import encode_auto_inputs
 33 | from deep_autoviml.preprocessing.preprocessing_tabular import create_fast_inputs
 34 | from deep_autoviml.preprocessing.preprocessing_tabular import encode_all_inputs, create_all_inputs
 35 | from deep_autoviml.data_load.classify_features import find_remove_duplicates
 36 | from deep_autoviml.preprocessing.preprocessing_tabular import encode_nlp_inputs, create_nlp_inputs
 37 | 
 38 | 
 39 | # Utils
 40 | #from deep_autoviml.utilities.utilities import get_model_defaults
 41 | from deep_autoviml.modeling.create_model import get_model_defaults
 42 | from deep_autoviml.utilities.utilities import get_hidden_layers
 43 | from deep_autoviml.utilities.utilities import check_model_options
 44 | 
 45 | ############################################################################################
 46 | # TensorFlow ≥2.4 is required
 47 | import tensorflow as tf
 48 | np.random.seed(42)
 49 | tf.random.set_seed(42)
 50 | from tensorflow.keras import layers
 51 | from tensorflow import keras
 52 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup
 53 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 54 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 55 | 
 56 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 57 | from tensorflow.keras import layers
 58 | from tensorflow.keras import optimizers
 59 | from tensorflow.keras.models import Model, load_model
 60 | from tensorflow.keras import callbacks
 61 | from tensorflow.keras import backend as K
 62 | from tensorflow.keras import utils
 63 | from tensorflow.keras.layers import BatchNormalization
 64 | from tensorflow.keras.optimizers import SGD
 65 | from tensorflow.keras import regularizers
 66 | from tensorflow.keras.layers import Dense, LSTM, GRU, Input, concatenate, Embedding
 67 | from tensorflow.keras.layers import Reshape, Activation, Flatten
 68 | import tensorflow_hub as hub
 69 | 
 70 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 71 | from IPython.core.display import Image, display
 72 | import pickle
 73 | 
 74 | ##### Suppress all TF2 and TF1.x warnings ###################
 75 | ##### Suppress all TF2 and TF1.x warnings ###################
 76 | tf2logger = tf.get_logger()
 77 | tf2logger.warning('Silencing TF2.x warnings')
 78 | tf2logger.root.removeHandler(tf2logger.root.handlers)
 79 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 80 | ############################################################################################
 81 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 82 | from tensorflow.keras import Model, Sequential
 83 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 84 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 85 | ############################################################################################
 86 | #### probably the most handy function of all!
 87 | def left_subtract(l1,l2):
 88 |     lst = []
 89 |     for i in l1:
 90 |         if i not in l2:
 91 |             lst.append(i)
 92 |     return lst
 93 | #############################################################################################
 94 | def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type,
 95 |                            keras_options, model_options, verbose=0):
 96 |     """
 97 |     Remember this is the most valuable part of this entire library!
 98 |     This is one humongous preprocessing step to build everything needed for preprocessing into a keras model!
 99 |     But it will break in some cases since we cannot handle every known dataset!
100 |     It will be good enough for most instances to create a fast keras pipeline + baseline model.
101 |     You can always fine tune it.
102 |     You can always create your own model and feed it once you have successfully created preprocessing pipeline.
103 |     """
104 |     num_classes = model_options["num_classes"]
105 |     num_labels = model_options["num_labels"]
106 |     modeltype = model_options["modeltype"]
107 |     embedding_size = model_options["embedding_size"]
108 |     train_data_is_file = model_options['train_data_is_file']
109 |     cat_feat_cross_flag = check_model_options(model_options,"cat_feat_cross_flag", False)
110 |     targets = cat_vocab_dict["target_variables"]
111 |     preds = cat_vocab_dict["predictors_in_train"]
112 |     ############  This is where you get all the classified features ########
113 |     cats = var_df['categorical_vars']  ### these are low cardinality vars - you can one-hot encode them ##
114 |     high_string_vars = var_df['discrete_string_vars']  ## discrete_string_vars are high cardinality vars ## embed them!
115 |     bools = var_df['bools']
116 |     int_cats = var_df['int_cats'] + var_df['int_bools']
117 |     ints = var_df['int_vars']
118 |     floats = var_df['continuous_vars']
119 |     nlps = var_df['nlp_vars']
120 |     lats = var_df['lat_vars']
121 |     lons = var_df['lon_vars']
122 |     floats = left_subtract(floats, lats+lons)
123 |     ####  You must exclude NLP vars from this since they have their own preprocesing
124 |     NON_NLP_VARS = left_subtract(preds, nlps)
125 |     FEATURE_NAMES = bools + cats + high_string_vars + int_cats + ints + floats
126 |     NUMERIC_FEATURE_NAMES = int_cats + ints
127 |     ########  Check if booleans have to be treated as strings or as floats here ##
128 |     if train_data_is_file:
129 |         FLOATS = floats
130 |         CATEGORICAL_FEATURE_NAMES = cats + high_string_vars +bools
131 |     else:
132 |         FLOATS = floats + bools
133 |         CATEGORICAL_FEATURE_NAMES = cats + high_string_vars
134 |     #####################################################################
135 | 
136 |     vocab_dict = defaultdict(list)
137 |     cats_copy = copy.deepcopy(CATEGORICAL_FEATURE_NAMES+NUMERIC_FEATURE_NAMES)
138 |     if len(cats_copy) > 0:
139 |         for each_name in cats_copy:
140 |             vocab_dict[each_name] = cat_vocab_dict[each_name]['vocab']
141 | 
142 |     bools_copy = copy.deepcopy(bools)
143 |     if len(bools_copy) > 0:
144 |         for each_name in bools_copy:
145 |             vocab_dict[each_name] = ['True','False','missing']
146 | 
147 | 
148 |     floats_copy = copy.deepcopy(FLOATS)
149 |     if len(floats_copy) > 0:
150 |         for each_float in floats_copy:
151 |             vocab_dict[each_float] = cat_vocab_dict[each_float]['vocab_min_var']
152 |     ##### set the defaults for the LSTM or GRU model here #########################
153 |     batch_size = 32
154 |     # Convolution
155 |     kernel_size = 3
156 |     filters = 128
157 |     pool_size = 4
158 | 
159 |     # LSTM
160 |     lstm_output_size = 96
161 |     gru_units = 96
162 | 
163 |     # Training
164 |     drop_out = 0.2
165 |     if modeltype == 'Regression':
166 |         class_size = 1
167 |     else:
168 |         if num_classes == 2:
169 |             class_size = 1
170 |         else:
171 |             class_size = num_classes
172 |     ###### Now calculate some layer sizes #####
173 |     data_size = cat_vocab_dict["DS_LEN"]
174 |     data_dim = data_size*len(FEATURE_NAMES)
175 |     dense_layer1, dense_layer2, dense_layer3 = get_hidden_layers(data_dim)
176 |     #################################################################################
177 |     ###########     F E A T U R E    P R E P R O C E S S I N G   H E R E      #######
178 |     #################################################################################
179 |     nlps = var_df['nlp_vars']
180 |     keras_options, model_options, num_predicts, output_activation = get_model_defaults(keras_options,
181 |                                     model_options, targets)
182 |     ##################  NLP Text Features are Proprocessed Here  ################
183 |     nlp_inputs = []
184 |     nlp_names = []
185 |     embedding = []
186 |     ##################  All other Features are Proprocessed Here  ################
187 |     ### make sure you include mixed_nlp and combined_nlp in this list since you want it separated 
188 |     fast_models = ['fast','deep_and_wide','deep_wide','wide_deep', "mixed_nlp","combined_nlp",
189 |                     'wide_and_deep','deep wide', 'wide deep', 'fast1',
190 |                     'deep_and_cross', 'deep_cross', 'deep cross', 'fast2',"text"]
191 |     ##############################################################################
192 |     meta_outputs = []
193 |     print('Preprocessing non-NLP layers for %s Keras model...' %keras_model_type)
194 |     
195 |     if not keras_model_type.lower() in fast_models:
196 |         ############################################################################################
197 |         ############  I N  "A U T O"  M O D E L S  we use Lat and Lon with NLP right here  #########
198 |         ############################################################################################
199 |         if len(lats+lons) > 0:
200 |             print('    Now combine all numeric and non-numeric vars into a Deep only model...')
201 |             meta_outputs, meta_inputs, meta_names = preprocessing_tabular(train_ds, var_df,
202 |                                                         cat_feat_cross_flag, model_options,
203 |                                                         cat_vocab_dict, keras_model_type, verbose)
204 |             print('    All Non-NLP feature preprocessing completed.')
205 |             ### this is the order in which columns have been trained ###
206 |             if len(nlps) > 0:
207 |                 print('Starting NLP string column layer preprocessing...')
208 |                 nlp_inputs = create_nlp_inputs(nlps)
209 |                 max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options)
210 |                 nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict)
211 |                 ### we call nlp_outputs as embedding in this section of the program ####
212 |                 print('NLP Preprocessing completed.')
213 |                 #merged = [meta_outputs, nlp_encoded]
214 |                 merged = layers.concatenate([nlp_encoded, meta_outputs])
215 |                 print('    combined categorical+numeric with nlp outputs successfully for %s model...' %keras_model_type)
216 |                 nlp_inputs = list(nlp_inputs.values())
217 |             else:
218 |                 merged = meta_outputs
219 |             final_training_order = nlp_names + meta_names
220 |             ### find their dtypes - remember to use element_spec[0] for train data sets!
221 |             ds_types = dict([(col_name, train_ds.element_spec[0][col_name].dtype) for col_name in final_training_order ])
222 |             col_type_tuples = [(name,ds_types[name]) for name in final_training_order]
223 |             if verbose >= 2:
224 |                 print('Inferred column names, layers and types (double-check for duplicates and correctness!): \n%s' %col_type_tuples)
225 |             print('    %s model loaded and compiled successfully...' %keras_model_type)
226 |         else:
227 |             ############################################################################################
228 |             #### In "auto" vs. "mixed_nlp", the NLP processings are different. Numeric process is same.
229 |             ####  Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model #
230 |             ############################################################################################
231 |             print('    Now combine all numeric+cat+NLP vars into a Deep and Wide model')
232 |             ##  Since we are processing NLPs separately we need to remove them from inputs ###
233 |             if len(NON_NLP_VARS) == 0:
234 |                 print('    There are zero non-NLP variables in this dataset. No non-NLP preprocesing needed...')
235 |                 meta_inputs = []
236 |             else:
237 |                 FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps)
238 |                 dropout_rate = 0.1
239 |                 hidden_units = [dense_layer2, dense_layer3]
240 |                 inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS)
241 |                 #all_inputs = dict(zip(meta_names,meta_inputs))
242 |                 #### In auto models we want "wide" to be short. Hence use_embedding to be True.
243 |                 wide = encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict,
244 |                                 hidden_units, use_embedding=True)
245 |                 wide = layers.BatchNormalization()(wide)
246 |                 deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict,
247 |                                 use_embedding=True)
248 |                 deep = layers.BatchNormalization()(deep)
249 |                 meta_inputs = list(inputs.values()) ### convert input layers to a list
250 |                 #### If there are NLP vars in dataset, you must combine the nlp_outputs ##
251 |                 print('    All Non-NLP feature preprocessing completed.')
252 |                 if len(nlps) > 0:
253 |                     print('Starting NLP string column layer preprocessing...')
254 |                     nlp_inputs = create_nlp_inputs(nlps)
255 |                     max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options)
256 |                     nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict)
257 |                     ### we call nlp_outputs as embedding in this section of the program ####
258 |                     print('NLP preprocessing completed.')
259 |                     merged = [wide, deep, nlp_encoded]
260 |                     print('    Combined wide, deep and nlp outputs successfully')
261 |                     nlp_inputs = list(nlp_inputs.values())
262 |                 else:
263 |                     merged = [wide, deep]
264 |                     print('    %s combined wide and deep  successfully...' %keras_model_type)
265 |         ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set
266 |         if not isinstance(merged, list):
267 |             print('Shape of output from all preprocessing layers before model training = %s' %(merged.shape,))
268 |         return nlp_inputs, meta_inputs, merged, embedding
269 |     elif keras_model_type.lower() in ['mixed_nlp', 'combined_nlp']:
270 |         ### this is similar to auto models but uses TFHub models for NLP preprocessing #####
271 |         if len(NON_NLP_VARS) == 0:
272 |             print('    Non-NLP vars is zero in this dataset. No tabular preprocesing needed...')
273 |             meta_inputs = []
274 |         else:
275 |             ############################################################################################
276 |             #### In "auto" vs. "mixed_nlp", the NLP processings are different. Numeric process is same.
277 |             ############################################################################################
278 |             print('    Now combine all numeric and non-numeric vars into a Deep and Wide model...')
279 |             ####  Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model #
280 |             FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps)
281 |             dropout_rate = 0.5
282 |             hidden_units = [dense_layer2, dense_layer3]
283 |             inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS)
284 |             #all_inputs = dict(zip(meta_names,meta_inputs))
285 |             wide = encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict,
286 |                             hidden_units, use_embedding=False)
287 |             wide = layers.BatchNormalization()(wide)
288 |             deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict,
289 |                             use_embedding=True)
290 |             deep = layers.BatchNormalization()(deep)
291 |             meta_inputs = list(inputs.values()) ### convert input layers to a list
292 |             print('    All Non-NLP feature preprocessing completed.')
293 |             #### If there are NLP vars in dataset, you use TFHub models in this case ##
294 |             if len(nlps) > 0:
295 |                 print('Starting NLP string column layer preprocessing...')
296 |                 nlp_inputs, embedding, nlp_names = mixed_preprocessing_nlp(train_ds, model_options,
297 |                                                         var_df, cat_vocab_dict,
298 |                                                         keras_model_type, verbose)
299 |                 ### we call nlp_outputs as embedding in this section of the program ####
300 |                 print('    NLP Preprocessing completed.')
301 |             else:
302 |                 print('There are no NLP variables in this dataset for preprocessing...')
303 |                 embedding = []
304 |             if isinstance(embedding, list):
305 |                 ### This means embedding is an empty list with nothing in it ###
306 |                 meta_outputs = layers.concatenate([wide, deep])
307 |                 print('    Combined wide, deep layers  successfully.')
308 |             else:
309 |                 meta_outputs = layers.concatenate([wide, deep, embedding])
310 |                 print('    Combined wide, deep and NLP (with TFHub) successfully.')
311 |     else:
312 |         meta_inputs = []
313 |     ##### You need to send in the ouput from embedding layer to this sequence of layers ####
314 |     
315 |     nlp_outputs = []
316 |     if not isinstance(embedding, list):
317 |         if keras_model_type.lower() in ['bert','text', 'use',"nnlm"]:
318 |             ###### This is where you define the NLP Embedded Layers ########
319 |             #x = layers.Dense(64, activation='relu')(embedding)
320 |             #x = layers.Dense(32, activation='relu')(x)
321 |             #nlp_outputs = layers.Dropout(0.2)(x)
322 |             #nlp_outputs = layers.Dropout(0.2)(embedding)
323 |             if isinstance(meta_outputs, list):
324 |                 #### if there are no other variables then leave it as embedding output
325 |                 nlp_outputs = embedding
326 |             else:
327 |                 #### If there are other variables, then convert this embedding to an output
328 |                 nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(embedding)
329 |         elif keras_model_type.lower() in ['lstm']:
330 |             x = layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(embedding)
331 |             x = layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
332 |             x = layers.Dense(64, activation='relu')(x)
333 |             x = layers.Dense(32, activation='relu')(x)
334 |             x = layers.Dropout(0.2)(x)
335 |             nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(x)
336 |             # = layers.Bidirectional(layers.LSTM(dense_layer1, dropout=0.3, recurrent_dropout=0.3,
337 |             #                                    return_sequences=False, batch_size=batch_size,
338 |             #                                    kernel_regularizer=regularizers.l2(0.01)))(x)
339 | 
340 |         elif keras_model_type.lower() in ['cnn1']:
341 |             # Conv1D + global max pooling
342 |             x = Conv1D(dense_layer1, 14, name='cnn_dense1', padding="same",
343 |                                     activation="relu", strides=3)(embedding)
344 |             x = GlobalMaxPooling1D()(x)
345 |             nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(x)
346 |         elif keras_model_type.lower() in fast_models:
347 |             # We add a vanilla hidden layer that's all
348 |             #nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(embedding)
349 |             nlp_outputs = embedding
350 |         elif keras_model_type.lower() in ['gru','cnn2']:
351 |             #### Use this only for Binary-Class classification problems ########
352 |             ####  LSTM with 1D convnet with maxpooling ########
353 |             x = Conv1D(filters,
354 |                              kernel_size,
355 |                              padding='valid',
356 |                              activation='relu',
357 |                              strides=1)(embedding)
358 |             x = MaxPooling1D(pool_size=pool_size)(x)
359 |             x = GRU(units=gru_units,  dropout=drop_out, recurrent_dropout=drop_out)(x)
360 |             if modeltype == 'Regression':
361 |                 #nlp_outputs = Dense(class_size, activation='linear')(x)
362 |                 x = Dense(128, activation='relu')(x)
363 |             else:
364 |                 #nlp_outputs = Dense(class_size, activation='sigmoid')(x)
365 |                 x = Dense(128, activation='relu')(x)
366 |             nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(x)
367 |         elif keras_model_type.lower() in ['cnn']:
368 |             #### Use this only for future Multi-Class classification problems #########
369 |             ####  CNN Model: create a 1D convnet with global maxpooling ########
370 |             x = Conv1D(128, kernel_size, activation='relu')(embedding)
371 |             x = MaxPooling1D(kernel_size)(x)
372 |             x = Conv1D(128, kernel_size, activation='relu')(x)
373 |             x = MaxPooling1D(kernel_size)(x)
374 |             x = Conv1D(128, kernel_size, activation='relu')(x)
375 |             x = GlobalMaxPooling1D()(x)
376 |             x = Dense(128, activation='relu')(x)
377 |             #nlp_outputs = Dense(class_size, activation='softmax')(x)
378 |             nlp_outputs = layers.Dense(num_predicts, activation=output_activation)(x)
379 | 
380 |     #### This is only for all "fast" and "auto" with latitude and longitude columns ##
381 |     if isinstance(meta_outputs, list):
382 |         ### if meta_outputs is a list, it means there is no int, float or cat variable in this data set
383 |         print('There is no numeric or cat or int variables in this data set.')
384 |         if isinstance(nlp_outputs, list):
385 |             ### if NLP_outputs is a list, it means there is no NLP variable in the data set
386 |             print('There is no NLP variable in this data set. Returning')
387 |             consolidated_outputs = meta_outputs
388 |         else:
389 |             print('Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,))
390 |             consolidated_outputs = nlp_outputs
391 |     else:
392 |         print('Shape of non-NLP encoded variables just before model training = %s' %(meta_outputs.shape,))
393 |         if isinstance(nlp_outputs, list):
394 |             ### if NLP_outputs is a list, it means there is no NLP variable in the data set
395 |             print('    There is no NLP variable in this data set. Continuing...')
396 |             #x = layers.concatenate([meta_outputs])
397 |             consolidated_outputs = meta_outputs
398 |         else:
399 |             ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set
400 |             print('    Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,))
401 |             consolidated_outputs = layers.concatenate([nlp_outputs, meta_outputs])
402 |             print('Shape of output from all preprocessing layers before model training = %s' %(consolidated_outputs.shape,))
403 |     return nlp_inputs, meta_inputs, consolidated_outputs, nlp_outputs
404 | ##########################################################################################
405 | def mixed_preprocessing_nlp(train_ds, model_options,
406 |                         var_df, cat_vocab_dict,
407 |                         keras_model_type, verbose=0):
408 |     """
409 |     This is only for mixed NLP preprocessing of tabular and nlp datasets
410 |     """
411 |     nlp_inputs = []
412 |     all_nlp_encoded = []
413 |     all_nlp_embeddings = []
414 |     nlp_col_names = []
415 |     nlp_columns = var_df['nlp_vars']
416 |     nlp_columns =  list(set(nlp_columns))
417 |     
418 |     if len(nlp_columns) == 1:
419 |         nlp_column = nlp_columns[0]
420 |     elif keras_model_type.lower() == 'combined_nlp':
421 |         nlp_column = 'combined_nlp_text' ### this is when there are multiple nlp columns ##
422 |     else:
423 |         ### This is to keep nlp columns separate ###
424 |         nlp_column = ''
425 | 
426 |     #### Now perform NLP preproprocessing for each nlp_column ######
427 |     ######### This is where we load Swivel model and process each nlp column ###
428 |     try:
429 |         bert_model_name = "Swivel-20"
430 |         if os.name == 'nt':
431 |             tfhub_path = os.path.join(keras_model_type, 'tf_cache')
432 |             os.environ['TFHUB_CACHE_DIR'] = tfhub_path
433 |             tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
434 |         else:
435 |             tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
436 |         hub_layer = hub.KerasLayer(tfhub_handle_encoder,
437 |                        input_shape=[],
438 |                        dtype=tf.string, 
439 |                        trainable=False, name="Swivel20_encoder")
440 |         print(f'    {bert_model_name} selected from: {tfhub_handle_encoder}')
441 |         ### this is for mixed nlp models. You use Swivel to embed NLP columns fast ####
442 |         if len(nlp_columns) > 1:
443 |             copy_nlp_columns = copy.deepcopy(nlp_columns)
444 |             for each_nlp in copy_nlp_columns:
445 |                 nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=each_nlp)
446 |                 nlp_inputs.append(nlp_input)
447 |                 x = hub_layer(nlp_input)
448 |                 all_nlp_encoded.append(x)
449 |                 nlp_col_names.append(each_nlp)
450 |         else:
451 |             nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column)
452 |             x = hub_layer(nlp_input)
453 |             ###  Now we combine all inputs and outputs in one place here ###########
454 |             nlp_inputs.append(nlp_input)
455 |             all_nlp_encoded.append(x)
456 |             nlp_col_names.append(nlp_column)
457 |     except:
458 |         print('    Error: Skipping %s for keras layer preprocessing...' %nlp_column)
459 |     ### we gather all outputs above into a single list here called all_features!
460 |     if len(all_nlp_encoded) == 0:
461 |         print('There are no NLP string variables in this dataset to preprocess!')
462 |     elif len(all_nlp_encoded) == 1:
463 |         all_nlp_embeddings = all_nlp_encoded[0]
464 |     else:
465 |         all_nlp_embeddings = layers.concatenate(all_nlp_encoded)
466 | 
467 |     return nlp_inputs, all_nlp_embeddings, nlp_col_names
468 | #################################################################################
469 | 


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/preprocessing_images.py:
--------------------------------------------------------------------------------
  1 | #Copyright 2021 Google LLC
  2 | 
  3 | #Licensed under the Apache License, Version 2.0 (the "License");
  4 | #you may not use this file except in compliance with the License.
  5 | #You may obtain a copy of the License at
  6 | #
  7 | #    https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | #Unless required by applicable law or agreed to in writing, software
 10 | #distributed under the License is distributed on an "AS IS" BASIS,
 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | #See the License for the specific language governing permissions and
 13 | #limitations under the License.
 14 | ############################################################################################
 15 | import pandas as pd
 16 | import numpy as np
 17 | import matplotlib.pyplot as plt
 18 | import tempfile
 19 | import pdb
 20 | import copy
 21 | import warnings
 22 | warnings.filterwarnings(action='ignore')
 23 | import functools
 24 | from itertools import combinations
 25 | from collections import defaultdict
 26 | 
 27 | # Make numpy values easier to read.
 28 | np.set_printoptions(precision=3, suppress=True)
 29 | ############################################################################################
 30 | # data pipelines and feature engg here
 31 | 
 32 | # pre-defined TF2 Keras models and your own models here
 33 | from deep_autoviml.data_load.classify_features import check_model_options
 34 | 
 35 | # Utils
 36 | 
 37 | ############################################################################################
 38 | # TensorFlow ≥2.4 is required
 39 | import tensorflow as tf
 40 | np.random.seed(42)
 41 | tf.random.set_seed(42)
 42 | from tensorflow.keras import layers
 43 | from tensorflow import keras
 44 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup, Hashing
 45 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 46 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization
 47 | from tensorflow.keras.layers import Embedding, Flatten
 48 | 
 49 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 50 | from tensorflow.keras import layers
 51 | from tensorflow.keras import optimizers
 52 | from tensorflow.keras.models import Model, load_model
 53 | from tensorflow.keras import callbacks
 54 | from tensorflow.keras import backend as K
 55 | from tensorflow.keras import utils
 56 | from tensorflow.keras.layers import BatchNormalization
 57 | from tensorflow.keras.optimizers import SGD
 58 | from tensorflow.keras import regularizers
 59 | import tensorflow_hub as hub
 60 | 
 61 | 
 62 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 63 | from IPython.core.display import Image, display
 64 | import pickle
 65 | #############################################################################################
 66 | ##### Suppress all TF2 and TF1.x warnings ###################
 67 | try:
 68 |     tf.logging.set_verbosity(tf.logging.ERROR)
 69 | except:
 70 |     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 71 | ############################################################################################
 72 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 73 | from tensorflow.keras import Model, Sequential
 74 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 75 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 76 | ############################################################################################
 77 | def preprocessing_images(train_ds, model_options):
 78 |     """
 79 |     This produces a preprocessing layer for an incoming tf.data.Dataset. It can be images only.
 80 |     You need to just send in a tf.data.DataSet from the training folder and a model_options dictionary.
 81 |     It will return a full-model-ready layer that you can add to your Keras Functional model as image layer!
 82 |     ###########   Motivation and suggestions for coding for Image processing came from this blog #########
 83 |     Greatly indebted to Srivatsan for his Github and notebooks: https://github.com/srivatsan88/YouTubeLI
 84 |     ####################################################################################################
 85 |     """
 86 |     try:
 87 |       #######    L O A D     F E A T U R E    E X T R A C T O R   ################
 88 |       url = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
 89 |       feature_extractor = check_model_options(model_options, "tf_hub_model", url)
 90 |       img_height = model_options["image_height"]
 91 |       img_width = model_options["image_width"]
 92 |       image_channels = model_options["image_channels"]
 93 |       num_predicts = model_options["num_predicts"]
 94 |       try:
 95 |           feature_extractor_layer = hub.KerasLayer(feature_extractor, input_shape=(
 96 |                                   img_height,img_width,image_channels))
 97 |       except:
 98 |         print('Loading model from Tensorflow Hub failed. Check the URL and try again...')
 99 |         return
100 |       feature_extractor_layer.trainable = False
101 |       normalization_layer = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)
102 |       tf.random.set_seed(111)
103 |       model = tf.keras.Sequential([
104 |                 normalization_layer,
105 |                 feature_extractor_layer,
106 |                 tf.keras.layers.Dropout(0.3),
107 |                 tf.keras.layers.Dense(num_predicts,activation='softmax')
108 |               ])
109 |       model.compile(
110 |                 optimizer='adam',
111 |                 loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
112 |                 metrics=['accuracy'])
113 |     except:
114 |         print('    Error: Failed image preprocessing layer. Returning...')
115 |         return
116 |     return model
117 | 


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/preprocessing_nlp.py:
--------------------------------------------------------------------------------
  1 | #Copyright 2021 Google LLC
  2 | 
  3 | #Licensed under the Apache License, Version 2.0 (the "License");
  4 | #you may not use this file except in compliance with the License.
  5 | #You may obtain a copy of the License at
  6 | #
  7 | #    https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | #Unless required by applicable law or agreed to in writing, software
 10 | #distributed under the License is distributed on an "AS IS" BASIS,
 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | #See the License for the specific language governing permissions and
 13 | #limitations under the License.
 14 | ############################################################################################
 15 | import pandas as pd
 16 | import numpy as np
 17 | import matplotlib.pyplot as plt
 18 | import tempfile
 19 | import pdb
 20 | import copy
 21 | import warnings
 22 | warnings.filterwarnings(action='ignore')
 23 | import functools
 24 | from itertools import combinations
 25 | from collections import defaultdict
 26 | 
 27 | # Make numpy values easier to read.
 28 | np.set_printoptions(precision=3, suppress=True)
 29 | ############################################################################################
 30 | # data pipelines and feature engg here
 31 | from deep_autoviml.data_load.classify_features import check_model_options
 32 | from deep_autoviml.data_load.classify_features import find_remove_duplicates
 33 | 
 34 | # pre-defined TF2 Keras models and your own models here
 35 | from deep_autoviml.models.tf_hub_lookup import map_hub_to_name, map_name_to_handle
 36 | from deep_autoviml.models.tf_hub_lookup import map_name_to_preprocess
 37 | 
 38 | # Utils
 39 | 
 40 | ############################################################################################
 41 | # TensorFlow ≥2.4 is required
 42 | import tensorflow as tf
 43 | np.random.seed(42)
 44 | tf.random.set_seed(42)
 45 | from tensorflow.keras import layers
 46 | from tensorflow import keras
 47 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup, Hashing
 48 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 49 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization
 50 | from tensorflow.keras.layers import Embedding, Flatten
 51 | 
 52 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 53 | from tensorflow.keras import layers
 54 | from tensorflow.keras import optimizers
 55 | from tensorflow.keras.models import Model, load_model
 56 | from tensorflow.keras import callbacks
 57 | from tensorflow.keras import backend as K
 58 | from tensorflow.keras import utils
 59 | from tensorflow.keras.layers import BatchNormalization
 60 | from tensorflow.keras.optimizers import SGD
 61 | from tensorflow.keras import regularizers
 62 | import tensorflow_hub as hub
 63 | 
 64 | 
 65 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 66 | from IPython.core.display import Image, display
 67 | import pickle
 68 | #############################################################################################
 69 | ##### Suppress all TF2 and TF1.x warnings ###################
 70 | tf2logger = tf.get_logger()
 71 | tf2logger.warning('Silencing TF2.x warnings')
 72 | tf2logger.root.removeHandler(tf2logger.root.handlers)
 73 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 74 | ############################################################################################
 75 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 76 | from tensorflow.keras import Model, Sequential
 77 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 78 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 79 | ############################################################################################
 80 | #### probably the most handy function of all!
 81 | def left_subtract(l1,l2):
 82 |     lst = []
 83 |     for i in l1:
 84 |         if i not in l2:
 85 |             lst.append(i)
 86 |     return lst
 87 | ##############################################################################################
 88 | import os
 89 | ###########################################################################################
 90 | # We remove punctuations and HTMLs from tweets. This is done in a function,
 91 | # so that it can be passed as a parameter to the TextVectorization object.
 92 | import re
 93 | import string
 94 | def custom_standardization(input_data):
 95 |     lowercase = tf.strings.lower(input_data)
 96 |     stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
 97 |     return tf.strings.regex_replace(
 98 |         stripped_html, "[%s]" % re.escape(string.punctuation), ""
 99 |     )
100 | ##############################################################################################
101 | def closest(lst, K):
102 |     """
103 |     Find a number in list lst that is closest to the value K.
104 |     """
105 |     return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))]
106 | ##############################################################################################
107 | def preprocessing_nlp(train_ds, model_options, var_df, cat_vocab_dict, keras_model_type, verbose=0):
108 |     """
109 |     This produces a preprocessing layer for an incoming NLP column using TextVectorization from keras.
110 |     You need to just send in a tf.data.DataSet from the training portion of your dataset and an nlp_column name.
111 |     It will return a full-model-ready layer that you can add to your Keras Functional model as an NLP_layer!
112 |     max_tokens_zip is a dictionary of each NLP column name and its max_tokens as defined by train data.
113 |     """
114 |     nlp_inputs = []
115 |     all_nlp_encoded = []
116 |     all_nlp_embeddings = []
117 |     nlp_col_names = []
118 |     nlp_columns = var_df['nlp_vars']
119 |     nlp_columns =  list(set(nlp_columns))
120 |     fast_models = ['fast']
121 | 
122 |     fast_models1 = ['deep_and_wide','deep_wide','wide_deep',
123 |                     'wide_and_deep','deep wide', 'wide deep', 'fast1',
124 |                     'deep_and_cross', 'deep_cross', 'deep cross', 'fast2']
125 |     
126 |     max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(
127 |                             nlp_columns, cat_vocab_dict, model_options, verbose)
128 | 
129 |     if len(nlp_columns) == 1:
130 |         nlp_column = nlp_columns[0]
131 |     else:
132 |         nlp_column = 'combined_nlp_text' ### this is when there are multiple nlp columns ##
133 | 
134 |     ### Find the best sizes for various dimensions here ###########
135 |     seq_lengths = list(seq_tokens_zip.values())
136 |     maximum_sequence_length = max(seq_lengths)
137 |     ## ideally you should create an unduplicated list of vocabs here and find its size
138 |     ### the vocab_train_small holds the entire vocab of train_small data set!
139 |     max_vocab_size = len(vocab_train_small) + 10
140 |     best_embedding_size = max(list(embed_tokens_zip.values()))
141 |     print('Max vocab size = %s' %max_vocab_size)
142 | 
143 |     ###### Let us set up the defauls for embedding size and max tokens to process each column
144 |     NLP_VARS  = copy.deepcopy(nlp_columns)
145 |     max_features = max_vocab_size ## this is the size of vocab of the whole corpus
146 |     embedding_dim = best_embedding_size ## this is the vector size
147 |     sequence_length = maximum_sequence_length ## this is the length of each sentence consisting of words
148 | 
149 |     #### Now perform NLP preproprocessing for each nlp_column ######
150 |     tf_hub_model = model_options["tf_hub_model"]
151 |     tf_hub = False
152 |     if not tf_hub_model == "":
153 |         print('Using Tensorflow Hub model: %s given as input' %tf_hub_model)
154 |         tf_hub = True
155 |     ##### This is where we use different pre-trained models to create word and sentence embeddings ##
156 |     if keras_model_type.lower() in ['bert']:
157 |         print('Loading %s model this will take time...' %keras_model_type)
158 |         if os.name == 'nt':
159 |             tfhub_path = os.path.join(keras_model_type, 'tf_cache')
160 |             os.environ['TFHUB_CACHE_DIR'] = tfhub_path
161 |         if tf_hub:
162 |             tfhub_handle_encoder = model_options['tf_hub_model']
163 |             try:
164 |                 bert_model_name = map_hub_to_name[tfhub_handle_encoder]
165 |                 tfhub_handle_preprocess = map_name_to_preprocess[bert_model_name]
166 |             except:
167 |                 bert_model_name = 'BERT_given_by_user_input'
168 |                 tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
169 |         else:
170 |             bert_model_name = "BERT Uncased Small"
171 |             tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
172 |             tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2'
173 |         preprocessor = hub.KerasLayer(tfhub_handle_preprocess, name='BERT_preprocessing')
174 |         encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
175 |         print(f'    {bert_model_name} selected: {tfhub_handle_encoder}')
176 |         print(f'    Preprocessor auto-selected: {tfhub_handle_preprocess}')
177 |     elif keras_model_type.lower() in ["use"]:
178 |         print('Loading %s model this will take time...' %keras_model_type)
179 |         if os.name == 'nt':
180 |             tfhub_path = os.path.join(keras_model_type, 'tf_cache')
181 |             os.environ['TFHUB_CACHE_DIR'] = tfhub_path
182 |         if tf_hub:
183 |             bert_model_name = "USE given"
184 |             tfhub_handle_encoder = model_options['tf_hub_model']
185 |         else:
186 |             bert_model_name = "Universal Sentence Encoder4"
187 |             tfhub_handle_encoder = "https://tfhub.dev/google/universal-sentence-encoder/4"
188 |         encoder = hub.KerasLayer(tfhub_handle_encoder, 
189 |                        input_shape=[],
190 |                        dtype=tf.string, 
191 |                         trainable=True, name='USE4_encoder')
192 |         print(f'    {bert_model_name} selected: {tfhub_handle_encoder}')
193 |     elif keras_model_type.lower() in fast_models1:
194 |         bert_model_name = "fast NNLM 50 with Normalization"
195 |         if os.name == 'nt':
196 |             tfhub_path = os.path.join(keras_model_type, 'tf_cache')
197 |             os.environ['TFHUB_CACHE_DIR'] = tfhub_path
198 |             tfhub_handle_encoder = 'https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2'
199 |         else:
200 |             tfhub_handle_encoder = 'https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2'
201 |         hub_layer = hub.KerasLayer(tfhub_handle_encoder,
202 |                        input_shape=[],
203 |                        dtype=tf.string, 
204 |                        trainable=False, name="NNLM50_encoder")
205 |         print(f'    {bert_model_name} selected from: {tfhub_handle_encoder}')
206 |     elif keras_model_type.lower() in ["nlp"]:
207 |         bert_model_name = "Swivel-20"
208 |         if os.name == 'nt':
209 |             tfhub_path = os.path.join(keras_model_type, 'tf_cache')
210 |             os.environ['TFHUB_CACHE_DIR'] = tfhub_path
211 |             tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
212 |         else:
213 |             tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
214 |         hub_layer = hub.KerasLayer(tfhub_handle_encoder,
215 |                        input_shape=[],
216 |                        dtype=tf.string, 
217 |                        trainable=False, name="Swivel20_encoder")
218 |         print(f'    {bert_model_name} selected from: {tfhub_handle_encoder}')
219 |     elif keras_model_type.lower() in fast_models:
220 |         #### For fast models you just use Vectorization and Embedding that's all #######
221 |         # Use the text vectorization layer to normalize, split, and map strings to
222 |         # integers. Note that the layer uses the custom standardization defined above.
223 |         # Set maximum_sequence length as all samples are not of the same length.
224 |         ### if you used custom_standardization function, you cannot load the saved model!! be careful!
225 |         bert_model_name = 'Text Vectorization'
226 |         vectorize_layer = TextVectorization(
227 |             standardize='lower_and_strip_punctuation',
228 |             max_tokens=max_features,
229 |             output_mode="int",
230 |             split="whitespace",
231 |             ngrams=None,
232 |             output_sequence_length=sequence_length,
233 |             pad_to_max_tokens=True,
234 |             vocabulary=vocab_train_small,
235 |             )
236 |         print(f'    {bert_model_name} selected along with Embedding layer')
237 |     else:
238 |         ####  This is for auto model option. You can ignore their models in tfhub in that case
239 |         #### If they give the default NLP or text as input, then we would use a default model.
240 |         bert_model_name = 'Swivel_20_model'
241 |         #bert_model_name = "Auto NNLM 50 with Normalization"
242 |         tfhub_handle_encoder = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
243 |         #if os.name == 'nt':
244 |         #    tfhub_path = os.path.join(keras_model_type, 'tf_cache')
245 |         #    os.environ['TFHUB_CACHE_DIR'] = tfhub_path
246 |         #    tfhub_handle_encoder = 'https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2'
247 |         #else:
248 |         #    tfhub_handle_encoder = 'https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2'
249 |         hub_layer = hub.KerasLayer(tfhub_handle_encoder, output_shape=[20],
250 |                        input_shape=[],
251 |                        dtype=tf.string, trainable=False, name="Swivel_encoder")
252 |         #hub_layer = hub.KerasLayer(tfhub_handle_encoder,
253 |         #               input_shape=[],
254 |         #               dtype=tf.string, trainable=False, name="NNLM50_encoder")
255 |         print(f'    {bert_model_name} selected from: {tfhub_handle_encoder}')
256 |     
257 |     #### Next, we add an NLP layer to map those vocab indices into a space of dimensionality
258 |     #### Vocabulary size defines how many unique words you think might be in that training data
259 |     ### Sequence length defines how we should convert each word into a sequence of integers of fixed length
260 | 
261 |     #### Now let us process all NLP columns by using embeddings from Keras ####
262 |     ###### A string input for each string column ###############################
263 |     ##### Now we handle multiple choices in embedding and model building ###
264 |     try:
265 |         if keras_model_type.lower() in ['bert']:
266 |             nlp_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name=nlp_column)
267 |             ### You need to do some special pre-processing if it is a BERT model
268 |             x = encoder(preprocessor(nlp_input))['pooled_output']
269 |         elif keras_model_type.lower() in ["use"]:
270 |             nlp_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name=nlp_column)
271 |             ### You need to do some special pre-processing if it is a BERT model
272 |             x = encoder(nlp_input)
273 |         elif keras_model_type.lower() in fast_models:
274 |             nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column)
275 |             x = vectorize_layer(nlp_input)
276 |             x = layers.Embedding(max_features+1, embedding_dim, input_length=sequence_length, name=nlp_column+'_embedding')(x)
277 |             x = Flatten()(x)
278 |         elif keras_model_type.lower() in ["nlp"]:
279 |             ### this is for NLP models. You use Swivel to embed NLP columns fast ####
280 |             for each_nlp in nlp_columns:
281 |                 nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=each_nlp)
282 |                 nlp_inputs.append(nlp_input)
283 |         elif keras_model_type.lower() in fast_models1:
284 |             ### this is for AUTO models. You use NNLM or NLNM to embed NLP columns fast ####
285 |             nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column)
286 |             x = hub_layer(nlp_input)
287 |         else:
288 |             ### this is for AUTO models. You use Swivel to embed NLP columns fast ####
289 |             nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column)
290 |             x = hub_layer(nlp_input)
291 |         ###  Now we combine all inputs and outputs in one place here ###########
292 |         nlp_inputs.append(nlp_input)
293 |         all_nlp_encoded.append(x)
294 |         nlp_col_names.append(nlp_column)
295 |     except:
296 |         print('    Error: Skipping %s for keras layer preprocessing...' %nlp_column)
297 | ### we gather all outputs above into a single list here called all_features!
298 |     if len(all_nlp_encoded) == 0:
299 |         print('There are no NLP string variables in this dataset to preprocess!')
300 |     elif len(all_nlp_encoded) == 1:
301 |         all_nlp_embeddings = all_nlp_encoded[0]
302 |     else:
303 |         all_nlp_embeddings = layers.concatenate(all_nlp_encoded)
304 | 
305 |     return nlp_inputs, all_nlp_embeddings, nlp_col_names
306 | ###############################################################################################
307 | def one_hot_encode_categorical_target(features, labels, categories):
308 |     """Returns a one-hot encoded tensor representing categorical values."""
309 |     # The entire encoding can fit on one line:
310 |     labels = tf.cast(tf.equal(categories, tf.reshape(labels, [-1, 1])), tf.int32)
311 |     return (features, labels)
312 | ##############################################################################################
313 | def convert_classifier_targets(labels):
314 |     """
315 |     This handy function converts target labels that are binary or multi-class (whether integer or string) into integers.
316 |     This is similar to a label encoder in scikit-learn but works on tensorflow tf.data.Datasets.
317 |     Just send in a tf.data.Dataset and it will split it into features and labels and then turn them into correct labels.
318 |     It returns the converted labels and a dictionary which you can use to convert it back to original labels. Neat!
319 |     """
320 |     _, converted_labels = tf.unique(labels)
321 |     return converted_labels
322 | #########################################################################################
323 | def compare_two_datasets_with_idcol(train_ds, validate_ds, idcol,verbose=0):
324 |     ls_test = list(validate_ds.as_numpy_iterator())
325 |     ls_train = list(train_ds.as_numpy_iterator())
326 |     if verbose >= 1:
327 |         print('    Size of dataset 1 = %d' %(len(ls_train)))
328 |         print('    Size of dataset 2 = %d' %(len(ls_test)))
329 |     ts_list = [ls_test[x][0][idcol] for x in range(len(ls_test)) ]
330 |     tra_list = [ls_train[x][0][idcol] for x in range(len(ls_train)) ]
331 |     print('Alert! %d rows in common between dataset 1 and 2' %(len(tra_list) - len(left_subtract(tra_list, ts_list))))
332 | ##########################################################################################
333 | def process_continuous_data(data):
334 |     # Normalize data
335 |     max_data = tf.reduce_max(data)
336 |     min_data = tf.reduce_max(data)
337 |     data = (tf.cast(data, tf.float32) - min_data)/(max_data - min_data)
338 |     return tf.reshape(data, [-1, 1])
339 | ##########################################################################################
340 | # Process continuous features.
341 | def preprocess(features, labels):
342 |     for feature in floats:
343 |         features[feature] = process_continuous_data(features[feature])
344 |     return features, labels
345 | ##########################################################################################
346 | def encode_NLP_column(train_ds, nlp_column, nlp_input, vocab_size, sequence_length):
347 |     text_ds = train_ds.map(lambda x,y: x[nlp_column])
348 |     vectorize_layer = TextVectorization(
349 |         #standardize=custom_standardization,
350 |         standardize = 'lower_and_strip_punctuation',
351 |         max_tokens=vocab_size,
352 |         output_mode='int',
353 |         output_sequence_length=sequence_length)
354 |     # Tensorflow uses the word "adapt" to mean "fit" when learning vocabulary from a data set
355 |     # You must call adapt first on a training data set and let it learn from that data set
356 |     vectorize_layer.adapt(text_ds)
357 | 
358 |     ###### This is where you put NLP embedding layer into your data ####
359 |     nlp_vectorized = vectorize_layer(nlp_input)
360 |     ### Sometimes the get_vocabulary() errors due to special chars in utf-8. Hence avoid it.
361 |     #print(f"    {nlp_column} vocab size = {vocab_size}, sequence_length={sequence_length}")
362 |     return nlp_vectorized
363 | ################################################################################################
364 | def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options, verbose=0):
365 |     """
366 |     This function aggregates all the dictionaries you need for nlp processing.
367 |     Just send in a list of nlp variables and a small data sample and it will compute all
368 |     the seq lengths, embedding_dims and vocabs for each nlp variable in the input list.
369 |     """
370 |     lst = [8, 16, 24, 32, 48, 64, 96, 128, 256]
371 |     #### max_tokens_zip calculate the max number of unique words in a vocabulary ####
372 |     max_tokens_zip = defaultdict(int)
373 |     #### seq_tokens_zip calculates the max sequence length in a vocabulary ####
374 |     seq_tokens_zip = defaultdict(int)
375 |     #### embed_tokens_zip calculates the embedding dimension for each nlp_column ####
376 |     embed_tokens_zip = defaultdict(int)
377 |     #### This carries the 
378 |     nlps_copy = copy.deepcopy(nlp_columns)
379 |     seq_lengths = []
380 |     vocab_train_small = []
381 |     if len(nlps_copy) > 0:
382 |         vocab_train_small = []
383 |         for each_name in nlps_copy:
384 |             if verbose >= 2:
385 |                 print('Creating aggregate_nlp_dictionaries for nlp column = %s' %each_name)
386 |             max_tokens_zip[each_name] = cat_vocab_dict[each_name]['size_of_vocab']
387 |             print('    size of vocabulary = %s' %max_tokens_zip[each_name])
388 |             seq_tokens_zip[each_name] = cat_vocab_dict[each_name]['seq_length']
389 |             seq_lengths.append(seq_tokens_zip[each_name])
390 |             if verbose >= 2:
391 |                 print('    sequence length = %s' %seq_tokens_zip[each_name])
392 |             vocab_size = cat_vocab_dict[each_name]['size_of_vocab']
393 |             vocab_train_small += cat_vocab_dict[each_name]['vocab']
394 |             vocab_train_small = np.unique(vocab_train_small).tolist()
395 |             best_embedding_size = closest(lst, vocab_size//50000)
396 |             if verbose >= 2:
397 |                 print('    recommended embedding_size = %s' %best_embedding_size)
398 |             input_embedding_size = check_model_options(model_options, "embedding_size", best_embedding_size)
399 |             if input_embedding_size != best_embedding_size:
400 |                 if verbose >= 2:
401 |                     print('    input embedding size given as %d. Overriding recommended embedding_size...' %input_embedding_size)
402 |                 best_embedding_size = input_embedding_size
403 |             embed_tokens_zip[each_name] = best_embedding_size
404 |     return max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small
405 | ##################################################################################################


--------------------------------------------------------------------------------
/deep_autoviml/preprocessing/preprocessing_text.py:
--------------------------------------------------------------------------------
  1 | #Copyright 2021 Google LLC
  2 | 
  3 | #Licensed under the Apache License, Version 2.0 (the "License");
  4 | #you may not use this file except in compliance with the License.
  5 | #You may obtain a copy of the License at
  6 | #
  7 | #    https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | #Unless required by applicable law or agreed to in writing, software
 10 | #distributed under the License is distributed on an "AS IS" BASIS,
 11 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | #See the License for the specific language governing permissions and
 13 | #limitations under the License.
 14 | ############################################################################################
 15 | import pandas as pd
 16 | import numpy as np
 17 | import matplotlib.pyplot as plt
 18 | import tempfile
 19 | import pdb
 20 | import copy
 21 | import warnings
 22 | warnings.filterwarnings(action='ignore')
 23 | import functools
 24 | from itertools import combinations
 25 | from collections import defaultdict
 26 | 
 27 | # Make numpy values easier to read.
 28 | np.set_printoptions(precision=3, suppress=True)
 29 | ############################################################################################
 30 | # data pipelines and feature engg here
 31 | 
 32 | # pre-defined TF2 Keras models and your own models here
 33 | from deep_autoviml.data_load.classify_features import check_model_options
 34 | 
 35 | # Utils
 36 | 
 37 | ############################################################################################
 38 | # TensorFlow ≥2.4 is required
 39 | import tensorflow as tf
 40 | np.random.seed(42)
 41 | tf.random.set_seed(42)
 42 | from tensorflow.keras import layers
 43 | from tensorflow import keras
 44 | from tensorflow.keras.layers.experimental.preprocessing import Normalization, StringLookup, Hashing
 45 | from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup, CategoryEncoding
 46 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, Discretization
 47 | from tensorflow.keras.layers import Embedding, Flatten
 48 | 
 49 | from tensorflow.keras.optimizers import SGD, Adam, RMSprop
 50 | from tensorflow.keras import layers
 51 | from tensorflow.keras import optimizers
 52 | from tensorflow.keras.models import Model, load_model
 53 | from tensorflow.keras import callbacks
 54 | from tensorflow.keras import backend as K
 55 | from tensorflow.keras import utils
 56 | from tensorflow.keras.layers import BatchNormalization
 57 | from tensorflow.keras.optimizers import SGD
 58 | from tensorflow.keras import regularizers
 59 | import tensorflow_hub as hub
 60 | 
 61 | 
 62 | from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
 63 | from IPython.core.display import Image, display
 64 | import pickle
 65 | #############################################################################################
 66 | ##### Suppress all TF2 and TF1.x warnings ###################
 67 | try:
 68 |     tf.logging.set_verbosity(tf.logging.ERROR)
 69 | except:
 70 |     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
 71 | ############################################################################################
 72 | from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
 73 | from tensorflow.keras import Model, Sequential
 74 | from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D
 75 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 76 | ############################################################################################
 77 | def preprocessing_text(train_ds, keras_model_type, model_options):
 78 |     """
 79 |     ####################################################################################################
 80 |     This produces a preprocessing layer for an incoming NLP column using TextVectorization from keras.
 81 |     You need to just send in a tf.data.DataSet from training folder and it will automatically apply NLP.
 82 |     It will return a full-model-ready layer that you can add to your Keras Functional model as an NLP_layer!
 83 |     max_tokens_zip is a dictionary of each NLP column name and its max_tokens as defined by train data.
 84 |     ###########   Motivation and suggestions for coding for Image processing came from this blog #########
 85 |     Greatly indebted to Srivatsan for his Github and notebooks: https://github.com/srivatsan88/YouTubeLI
 86 |     ####################################################################################################
 87 |     """
 88 |     num_predicts = model_options["num_classes"]
 89 |     try:
 90 |         if keras_model_type.lower() in ["text"]:
 91 |             #######    L O A D     F E A T U R E    E X T R A C T O R   ################
 92 |             url = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
 93 |             tf_hub_model = check_model_options(model_options, "tf_hub_model", url)
 94 |             feature_extractor_layer = hub.KerasLayer(tf_hub_model, output_shape=[20],
 95 |                                  input_shape=[], dtype=tf.string, trainable=False)
 96 |             units = 16
 97 |             print('Using Swivel-20D model from TensorFlow Hub')
 98 |         else:
 99 |             tf_hub_model = "https://tfhub.dev/google/nnlm-en-dim50/2"
100 |             feature_extractor_layer = hub.KerasLayer(tf_hub_model, output_shape=[50],
101 |                                  input_shape=[], dtype=tf.string, trainable=True)
102 |             units = 32
103 |             print('    Using NNLM-50D model from TensorFlow Hub')
104 |         tf.random.set_seed(111)
105 |         model = tf.keras.Sequential([
106 |                   feature_extractor_layer,
107 |                   tf.keras.layers.Dense(units, activation='relu'),
108 |                   tf.keras.layers.Dense(num_predicts,activation='sigmoid')
109 |                 ])
110 |         model.compile(
111 |                   optimizer='adam',
112 |                   loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
113 |                   metrics=['accuracy'])
114 |     except:
115 |         print('    Error: Failed NLP preprocessing layer. Returning...')
116 |         return
117 |     return model
118 | 


--------------------------------------------------------------------------------
/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc


--------------------------------------------------------------------------------
/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AutoViML/deep_autoviml/7881393617ba38830cf15a33ac8e5776f1d97fdd/logo.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | ipython
 2 | jupyter
 3 | tensorflow>=2.8.0,<=2.12.1
 4 | pandas>=1.1.3, <2.0
 5 | numpy>=1.24
 6 | matplotlib>3.7.4
 7 | scikit-learn>=0.24,<=1.2.2
 8 | regex
 9 | storm-tuner>=0.0.8
10 | emoji
11 | xlrd
12 | tensorflow_hub>=0.12.0
13 | tensorflow-text>=2.8.0,<=2.12.1
14 | tensorboard>=2.8.0,<=2.12.3
15 | optuna
16 | statsmodels
17 | seaborn
18 | scikit-image
19 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | tag_build = 
3 | tag_date = 0
4 | 
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | ############################################################################################
 2 | #Copyright 2021 Google LLC
 3 | 
 4 | #Licensed under the Apache License, Version 2.0 (the "License");
 5 | #you may not use this file except in compliance with the License.
 6 | #You may obtain a copy of the License at
 7 | #
 8 | #    https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 | ############################################################################################
16 | import setuptools
17 | 
18 | with open("README.md", "r", encoding="utf-8") as fh:
19 |     long_description = fh.read()
20 | 
21 | setuptools.setup(
22 |     name="deep_autoviml",
23 |     version="0.0.85",
24 |     author="Ram Seshadri",
25 |     # author_email="author@example.com",
26 |     description="Automatically Build Deep Learning Models and Pipelines fast!",
27 |     long_description=long_description,
28 |     long_description_content_type="text/markdown",
29 |     license='Apache License 2.0',
30 |     url="https://github.com/AutoViML/deep_autoviml",
31 |     packages = [
32 |         "deep_autoviml",
33 |         "deep_autoviml.data_load",
34 |         "deep_autoviml.modeling",
35 |         "deep_autoviml.models",
36 |         "deep_autoviml.preprocessing",
37 |         "deep_autoviml.utilities",
38 |     ],
39 |     include_package_data=True,
40 |     install_requires=[
41 |         "ipython",
42 |         "jupyter",
43 |         "tensorflow>=2.8.0,<=2.12.1",
44 |         "matplotlib>3.7.4",
45 |         "numpy>=1.24",
46 |         "pandas>=1.1.3, <2.0",
47 |         "scikit-learn>=0.24,<=1.2.2",
48 |         "regex",
49 |         "emoji",
50 |         "storm-tuner>=0.0.8",
51 |         "optuna",
52 |         "tensorflow_hub~=0.12.0",
53 |         "tensorflow-text>=2.8.0,<=2.12.1",
54 |         "tensorboard>=2.8.0,<=2.12.3",
55 |         "xlrd"
56 |     ],
57 |     classifiers=[
58 |         "Programming Language :: Python :: 3",
59 |         "Operating System :: OS Independent",
60 |     ],
61 | )
62 | 


--------------------------------------------------------------------------------