├── .gitignore ├── .travis.yml ├── README.md ├── autokeras_pretrained ├── __init__.py ├── base.py ├── bert │ ├── __init__.py │ ├── modeling.py │ ├── optimization.py │ ├── tokenization.py │ └── utils.py ├── constant.py ├── face_detector.py ├── object_detector.py ├── text_classifier.py ├── utils.py ├── voice_generator │ ├── __init__.py │ ├── deepvoice3_pytorch │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── conv.py │ │ ├── deepvoice3.py │ │ ├── frontend.py │ │ ├── model.py │ │ ├── modules.py │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── cleaners.py │ │ │ ├── cmudict.py │ │ │ ├── numbers.py │ │ │ ├── symbols.py │ │ │ └── text.py │ │ └── version.py │ └── voice_generator.py └── voice_recognizer.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── common.py ├── pretrained ├── __init__.py ├── test_face_detector.py ├── test_object_detection.py ├── test_sentiment_analysis.py ├── test_topic_classifier.py ├── test_voice_generator.py └── test_voice_recognizer.py └── resources ├── images_test ├── Black_white_images │ ├── Aaron_Eckhart_0001.jpg │ ├── Aaron_Peirsol_0001.jpg │ ├── Aaron_Peirsol_0002.jpg │ ├── Aaron_Peirsol_0003.jpg │ ├── Aaron_Peirsol_0004.jpg │ ├── Aaron_Sorkin_0001.jpg │ ├── Aaron_Sorkin_0002.jpg │ ├── Abdel_Nasser_Assidi_0001.jpg │ ├── Abdel_Nasser_Assidi_0002.jpg │ ├── Abel_Pacheco_0001.jpg │ ├── Abel_Pacheco_0002.jpg │ ├── Abel_Pacheco_0003.jpg │ ├── Abel_Pacheco_0004.jpg │ ├── Abel_Pacheco_0005.jpg │ └── Abel_Pacheco_0006.jpg ├── Color_images │ ├── Aaron_Eckhart_0001.jpg │ ├── Aaron_Peirsol_0001.jpg │ ├── Aaron_Peirsol_0002.jpg │ ├── Aaron_Peirsol_0003.jpg │ ├── Aaron_Peirsol_0004.jpg │ ├── Aaron_Sorkin_0001.jpg │ ├── Aaron_Sorkin_0002.jpg │ ├── Abdel_Nasser_Assidi_0001.jpg │ ├── Abdel_Nasser_Assidi_0002.jpg │ ├── Abel_Pacheco_0001.jpg │ ├── Abel_Pacheco_0002.jpg │ ├── Abel_Pacheco_0003.jpg │ ├── Abel_Pacheco_0004.jpg │ ├── Abel_Pacheco_0005.jpg │ └── Abel_Pacheco_0006.jpg ├── face_detector.jpg ├── images_name.csv └── od.JPG └── temp └── .gitkeep /.gitignore: -------------------------------------------------------------------------------- 1 | # vim swp files 2 | *.swp 3 | # caffe/pytorch model files 4 | *.pth 5 | 6 | # Mkdocs 7 | /docs/ 8 | /mkdocs/docs/temp 9 | 10 | .DS_Store 11 | .idea 12 | .pytest_cache 13 | /experiments 14 | 15 | # resource temp folder 16 | tests/resources/temp/* 17 | !tests/resources/temp/.gitkeep 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | .static_storage/ 74 | .media/ 75 | local_settings.py 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | 124 | examples/text_cnn/glove_embedding/ 125 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | install: 5 | - pip install -r requirements.txt --quiet 6 | - pip install pytest 7 | - pip install pytest-cov 8 | - pip install coverage 9 | - pip install codacy-coverage 10 | script: 11 | - pytest tests --cov=autokeras_pretrained --cov-report xml:coverage.xml 12 | after_script: 13 | - python-codacy-coverage -r coverage.xml 14 | deploy: 15 | - provider: pypi 16 | user: jhfjhfj1 17 | password: 18 | secure: B36Cg4YbMhIrHyi1LAxilS2fek905PAyU+0anw8BTWgjOF0oM7hLRb64LcgpHrp6E0VawIscE7KGfCaiJR+14S3gqO2blaiJ0Os+ovpbzSu8sAnBVVKfT8Lv24o7prxeq2G8UROXPDST2ZEhy4rugCFjNeMv65WSezpG08TJasmvUAySidQU7rvzoIgoEiKkc7329bzaHpWVQDyuXs/slQSufTK23WBA+OM3cclLrQE02wVH0q/+BtjvkN+44XOVM7Q/gmIrVndj8KGVTIFPYjmKuIiyxjcSGv67oQhXNRH/SSOJyMb93elu/o4y7zexG8eZ2BVr0+Q2pkyBo8JAmfuuWLnz0gONwfU4xwMlLy2m/muGDclMr9eC1i0KWOG0E7afbktFXMVG9EBWr19OPApqGvtfU4997sPUcx7hDFLAwLyznR1hOFIKbeSdtFSUUfGoCnzrCcobsJkD1QKGLoPwr0/gp5o+HJC0bLyMSo45ETsH1m5UGFgFO9Xk4KiGQndL0SpSq7VfZxnMVJttRObqq/8/4wwgFBp8p/bXPpkoS5NK5QajdQgTZ2u8O5SABDBStMq2rdsLdqX4/1tyBcG+u8cKiXjJOfn3chVAwJUNTgddlOa6aGZFM1h3qB8WHgIYJjWxygAKfnH14XRDPAbeEvGdXfwZhJqLfFo6rQs= 19 | on: 20 | tags: true 21 | repo: jhfjhfj1/autokeras-pretrained 22 | env: 23 | global: 24 | secure: 0ZmkjLmGZnZSjZplHjm/1x2izv+OC+/S+/jUWaSNGZvHBtGRwsf1EHi65RzkCD9V8YCN42HE3SFh+mEN5nVaYPzxMvhnzrXVzce8oAU8o0qmCTh1K3d74KTzHtJdQGxCZi6KjAhkIZdQclR8FkTnU629BbDGMJ+MCm1imcawyI4ooXFpkCPsa0l+U9A8gZ47/FdKK7lP3Idw/dvIAcU6Cx1+hhkpmyWLqPiig2vB6vYFPYbrRvw2YCkvhhQoADT9pREK9vfYBf/1F9LOt00MAPXcHT/mjok7ziEiqQIrns2sa8CumEX9nUhaTq4TliMxPFDCvKxtMq4+cO5/slr6xD/Nh5hURUjaKytRJG44FmonqBOqKB7Zo3pgRF5/gY6YGUapuh+C2suTQmsJxwXCsG64sFk9SGmpqOImbfvivxxEhcXl7TGQNn7UNH1fxwrklkSj1B84BBnTIDwgAAFQS+4JlHSGo5KAZZwXCzPh0j+6KR2TdPLsrhqm1JIYRDqI8Jq4EkZICSGIjKO52XYq1Hrl+9uudoy6+rvQXlEQz/Pj/jj9DwATt1keqjk8p9Xinnw7X+h0nD+Q2Wtfn6LCzH+E3rDrlpdBRtWhnI37aZHiLwovmqDiMwxY76SsEm5XDsFKABYXxvoiXT5IVHqVd7tEveZR1l+N8mN53B5Q8Z0= 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autokeras-pretrained 2 | 3 | [![Build 
Status](https://travis-ci.org/jhfjhfj1/autokeras-pretrained.svg?branch=master)](https://travis-ci.org/jhfjhfj1/autokeras-pretrained) 4 | [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/f6e2bbcea21a486fb3b5d9af80368e58)](https://www.codacy.com/app/jhfjhfj1/autokeras-pretrained?utm_source=github.com&utm_medium=referral&utm_content=datamllab/autokeras-pretrained&utm_campaign=Badge_Coverage) 5 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f6e2bbcea21a486fb3b5d9af80368e58)](https://www.codacy.com/app/jhfjhfj1/autokeras-pretrained?utm_source=github.com&utm_medium=referral&utm_content=datamllab/autokeras-pretrained&utm_campaign=Badge_Grade) 6 | 7 | Pretrained models in Auto-Keras. 8 | No custom training data needed. 9 | 10 | -------------------------------------------------------------------------------- /autokeras_pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.object_detector import ObjectDetector 2 | from autokeras_pretrained.face_detector import FaceDetector 3 | from autokeras_pretrained.voice_generator.voice_generator import VoiceGenerator 4 | from autokeras_pretrained.voice_recognizer import VoiceRecognizer -------------------------------------------------------------------------------- /autokeras_pretrained/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from autokeras_pretrained.utils import temp_path_generator, ensure_dir, download_file_from_google_drive, get_device 6 | 7 | 8 | class Pretrained(ABC): 9 | """The base class for all pretrained task.""" 10 | 11 | def __init__(self, verbose=True, model_path=None): 12 | """Initialize the instance.""" 13 | self.verbose = verbose 14 | self.model = None 15 | self.device = get_device() 16 | self.model_path = model_path if model_path is not None else temp_path_generator() 17 | ensure_dir(self.model_path) 18 | self.local_paths = [os.path.join(self.model_path, x.local_name) for x in self._google_drive_files] 19 | for path, x in zip(self.local_paths, self._google_drive_files): 20 | if not os.path.exists(path): 21 | download_file_from_google_drive(file_id=x.google_drive_id, 22 | dest_path=path, 23 | verbose=True) 24 | 25 | @property 26 | @abstractmethod 27 | def _google_drive_files(self): 28 | pass 29 | 30 | @abstractmethod 31 | def predict(self, input_data, **kwargs): 32 | """Return predict results for the given image 33 | Returns: 34 | A numpy.ndarray containing the results. 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/bert/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/bert/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import copy 24 | import json 25 | import math 26 | import logging 27 | import tarfile 28 | import tempfile 29 | import shutil 30 | 31 | import torch 32 | from torch import nn 33 | from torch.nn import CrossEntropyLoss 34 | 35 | from autokeras_pretrained.constant import Constant 36 | from autokeras_pretrained.bert.utils import cached_path 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | PRETRAINED_MODEL_ARCHIVE_MAP = { 41 | 'bert-base-uncased': Constant.PRETRAINED_MODEL_BERT_BASE_UNCASED, 42 | 'bert-base-cased': Constant.PRETRAINED_MODEL_BERT_BASE_CASED 43 | } 44 | 45 | CONFIG_NAME = 'bert_config.json' 46 | WEIGHTS_NAME = 'pytorch_model.bin' 47 | 48 | 49 | def gelu(x): 50 | """Implementation of the gelu activation function. 51 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 52 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 53 | """ 54 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 55 | 56 | 57 | def swish(x): 58 | return x * torch.sigmoid(x) 59 | 60 | 61 | ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} 62 | 63 | 64 | class BertConfig(object): 65 | """Configuration class to store the configuration of a `BertModel`. 66 | """ 67 | def __init__(self, 68 | vocab_size_or_config_json_file, 69 | hidden_size=768, 70 | num_hidden_layers=12, 71 | num_attention_heads=12, 72 | intermediate_size=3072, 73 | hidden_act="gelu", 74 | hidden_dropout_prob=0.1, 75 | attention_probs_dropout_prob=0.1, 76 | max_position_embeddings=512, 77 | type_vocab_size=2, 78 | initializer_range=0.02): 79 | """Constructs BertConfig. 80 | 81 | Args: 82 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 83 | hidden_size: Size of the encoder layers and the pooler layer. 84 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 85 | num_attention_heads: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 88 | layer in the Transformer encoder. 89 | hidden_act: The non-linear activation function (function or string) in the 90 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 91 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 92 | layers in the embeddings, encoder, and pooler. 93 | attention_probs_dropout_prob: The dropout ratio for the attention 94 | probabilities. 95 | max_position_embeddings: The maximum sequence length that this model might 96 | ever be used with. Typically set this to something large just in case 97 | (e.g., 512 or 1024 or 2048). 98 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 99 | `BertModel`. 100 | initializer_range: The sttdev of the truncated_normal_initializer for 101 | initializing all weight matrices. 
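Example usage (an illustrative sketch in the style of the examples elsewhere in this file; 30522 is the vocabulary size of the standard uncased BERT checkpoints and is shown only as a plausible value):

```python
config = BertConfig(vocab_size_or_config_json_file=30522,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)
# the same settings can also be loaded from a JSON config file
config = BertConfig.from_json_file("bert_config.json")
```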
102 | """ 103 | if isinstance(vocab_size_or_config_json_file, str): 104 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 105 | json_config = json.loads(reader.read()) 106 | for key, value in json_config.items(): 107 | self.__dict__[key] = value 108 | elif isinstance(vocab_size_or_config_json_file, int): 109 | self.vocab_size = vocab_size_or_config_json_file 110 | self.hidden_size = hidden_size 111 | self.num_hidden_layers = num_hidden_layers 112 | self.num_attention_heads = num_attention_heads 113 | self.hidden_act = hidden_act 114 | self.intermediate_size = intermediate_size 115 | self.hidden_dropout_prob = hidden_dropout_prob 116 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 117 | self.max_position_embeddings = max_position_embeddings 118 | self.type_vocab_size = type_vocab_size 119 | self.initializer_range = initializer_range 120 | else: 121 | raise ValueError("First argument must be either a vocabulary size (int)" 122 | "or the path to a pretrained model config file (str)") 123 | 124 | @classmethod 125 | def from_dict(cls, json_object): 126 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 127 | config = BertConfig(vocab_size_or_config_json_file=-1) 128 | for key, value in json_object.items(): 129 | config.__dict__[key] = value 130 | return config 131 | 132 | @classmethod 133 | def from_json_file(cls, json_file): 134 | """Constructs a `BertConfig` from a json file of parameters.""" 135 | with open(json_file, "r", encoding='utf-8') as reader: 136 | text = reader.read() 137 | return cls.from_dict(json.loads(text)) 138 | 139 | def __repr__(self): 140 | return str(self.to_json_string()) 141 | 142 | def to_dict(self): 143 | """Serializes this instance to a Python dictionary.""" 144 | output = copy.deepcopy(self.__dict__) 145 | return output 146 | 147 | def to_json_string(self): 148 | """Serializes this instance to a JSON string.""" 149 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 150 | 151 | 152 | try: 153 | from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm 154 | except ImportError: 155 | print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") 156 | 157 | class BertLayerNorm(nn.Module): 158 | def __init__(self, hidden_size, eps=1e-12): 159 | """Construct a layernorm module in the TF style (epsilon inside the square root). 160 | """ 161 | super(BertLayerNorm, self).__init__() 162 | self.weight = nn.Parameter(torch.ones(hidden_size)) 163 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 164 | self.variance_epsilon = eps 165 | 166 | def forward(self, x): 167 | u = x.mean(-1, keepdim=True) 168 | s = (x - u).pow(2).mean(-1, keepdim=True) 169 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 170 | return self.weight * x + self.bias 171 | 172 | 173 | class BertEmbeddings(nn.Module): 174 | """Construct the embeddings from word, position and token_type embeddings. 
175 | """ 176 | def __init__(self, config): 177 | super(BertEmbeddings, self).__init__() 178 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) 179 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) 180 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 181 | 182 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 183 | # any TensorFlow checkpoint file 184 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 185 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 186 | 187 | def forward(self, input_ids, token_type_ids=None): 188 | seq_length = input_ids.size(1) 189 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) 190 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 191 | if token_type_ids is None: 192 | token_type_ids = torch.zeros_like(input_ids) 193 | 194 | words_embeddings = self.word_embeddings(input_ids) 195 | position_embeddings = self.position_embeddings(position_ids) 196 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 197 | 198 | embeddings = words_embeddings + position_embeddings + token_type_embeddings 199 | embeddings = self.LayerNorm(embeddings) 200 | embeddings = self.dropout(embeddings) 201 | return embeddings 202 | 203 | 204 | class BertSelfAttention(nn.Module): 205 | def __init__(self, config): 206 | super(BertSelfAttention, self).__init__() 207 | if config.hidden_size % config.num_attention_heads != 0: 208 | raise ValueError( 209 | "The hidden size (%d) is not a multiple of the number of attention " 210 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)) 211 | self.num_attention_heads = config.num_attention_heads 212 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 213 | self.all_head_size = self.num_attention_heads * self.attention_head_size 214 | 215 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 216 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 217 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 218 | 219 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 220 | 221 | def transpose_for_scores(self, x): 222 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 223 | x = x.view(*new_x_shape) 224 | return x.permute(0, 2, 1, 3) 225 | 226 | def forward(self, hidden_states, attention_mask): 227 | mixed_query_layer = self.query(hidden_states) 228 | mixed_key_layer = self.key(hidden_states) 229 | mixed_value_layer = self.value(hidden_states) 230 | 231 | query_layer = self.transpose_for_scores(mixed_query_layer) 232 | key_layer = self.transpose_for_scores(mixed_key_layer) 233 | value_layer = self.transpose_for_scores(mixed_value_layer) 234 | 235 | # Take the dot product between "query" and "key" to get the raw attention scores. 236 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 237 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 238 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 239 | attention_scores = attention_scores + attention_mask 240 | 241 | # Normalize the attention scores to probabilities. 
242 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 243 | 244 | # This is actually dropping out entire tokens to attend to, which might 245 | # seem a bit unusual, but is taken from the original Transformer paper. 246 | attention_probs = self.dropout(attention_probs) 247 | 248 | context_layer = torch.matmul(attention_probs, value_layer) 249 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 250 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 251 | context_layer = context_layer.view(*new_context_layer_shape) 252 | return context_layer 253 | 254 | 255 | class BertSelfOutput(nn.Module): 256 | def __init__(self, config): 257 | super(BertSelfOutput, self).__init__() 258 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 259 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 260 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 261 | 262 | def forward(self, hidden_states, input_tensor): 263 | hidden_states = self.dense(hidden_states) 264 | hidden_states = self.dropout(hidden_states) 265 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 266 | return hidden_states 267 | 268 | 269 | class BertAttention(nn.Module): 270 | def __init__(self, config): 271 | super(BertAttention, self).__init__() 272 | self.self = BertSelfAttention(config) 273 | self.output = BertSelfOutput(config) 274 | 275 | def forward(self, input_tensor, attention_mask): 276 | self_output = self.self(input_tensor, attention_mask) 277 | attention_output = self.output(self_output, input_tensor) 278 | return attention_output 279 | 280 | 281 | class BertIntermediate(nn.Module): 282 | def __init__(self, config): 283 | super(BertIntermediate, self).__init__() 284 | self.dense = nn.Linear(config.hidden_size, config.intermediate_size) 285 | self.intermediate_act_fn = ACT2FN[config.hidden_act] \ 286 | if isinstance(config.hidden_act, str) else config.hidden_act 287 | 288 | def forward(self, hidden_states): 289 | hidden_states = self.dense(hidden_states) 290 | hidden_states = self.intermediate_act_fn(hidden_states) 291 | return hidden_states 292 | 293 | 294 | class BertOutput(nn.Module): 295 | def __init__(self, config): 296 | super(BertOutput, self).__init__() 297 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 298 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 299 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 300 | 301 | def forward(self, hidden_states, input_tensor): 302 | hidden_states = self.dense(hidden_states) 303 | hidden_states = self.dropout(hidden_states) 304 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 305 | return hidden_states 306 | 307 | 308 | class BertLayer(nn.Module): 309 | def __init__(self, config): 310 | super(BertLayer, self).__init__() 311 | self.attention = BertAttention(config) 312 | self.intermediate = BertIntermediate(config) 313 | self.output = BertOutput(config) 314 | 315 | def forward(self, hidden_states, attention_mask): 316 | attention_output = self.attention(hidden_states, attention_mask) 317 | intermediate_output = self.intermediate(attention_output) 318 | layer_output = self.output(intermediate_output, attention_output) 319 | return layer_output 320 | 321 | 322 | class BertEncoder(nn.Module): 323 | def __init__(self, config): 324 | super(BertEncoder, self).__init__() 325 | layer = BertLayer(config) 326 | self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) 327 | 328 | def forward(self, hidden_states, 
attention_mask, output_all_encoded_layers=True): 329 | all_encoder_layers = [] 330 | for layer_module in self.layer: 331 | hidden_states = layer_module(hidden_states, attention_mask) 332 | if output_all_encoded_layers: 333 | all_encoder_layers.append(hidden_states) 334 | if not output_all_encoded_layers: 335 | all_encoder_layers.append(hidden_states) 336 | return all_encoder_layers 337 | 338 | 339 | class BertPooler(nn.Module): 340 | def __init__(self, config): 341 | super(BertPooler, self).__init__() 342 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 343 | self.activation = nn.Tanh() 344 | 345 | def forward(self, hidden_states): 346 | # We "pool" the model by simply taking the hidden state corresponding 347 | # to the first token. 348 | first_token_tensor = hidden_states[:, 0] 349 | pooled_output = self.dense(first_token_tensor) 350 | pooled_output = self.activation(pooled_output) 351 | return pooled_output 352 | 353 | 354 | class PreTrainedBertModel(nn.Module): 355 | """ An abstract class to handle weights initialization and 356 | a simple interface for dowloading and loading pretrained models. 357 | """ 358 | def __init__(self, config, *inputs, **kwargs): 359 | super(PreTrainedBertModel, self).__init__() 360 | if not isinstance(config, BertConfig): 361 | raise ValueError( 362 | "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " 363 | "To create a model from a Google pretrained model use " 364 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( 365 | self.__class__.__name__, self.__class__.__name__ 366 | )) 367 | self.config = config 368 | 369 | def init_bert_weights(self, module): 370 | """ Initialize the weights. 371 | """ 372 | if isinstance(module, (nn.Linear, nn.Embedding)): 373 | # Slightly different from the TF version which uses truncated_normal for initialization 374 | # cf https://github.com/pytorch/pytorch/pull/5617 375 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 376 | elif isinstance(module, BertLayerNorm): 377 | module.bias.data.normal_(mean=0.0, std=self.config.initializer_range) 378 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 379 | if isinstance(module, nn.Linear) and module.bias is not None: 380 | module.bias.data.zero_() 381 | 382 | @classmethod 383 | def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): 384 | """ 385 | Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. 386 | Download and cache the pre-trained model file if needed. 387 | 388 | Params: 389 | pretrained_model_name: either: 390 | - a str with the name of a pre-trained model to load selected in the list of: 391 | . `bert-base-uncased` 392 | . `bert-large-uncased` 393 | . `bert-base-cased` 394 | . `bert-base-multilingual` 395 | . `bert-base-chinese` 396 | - a path or url to a pretrained model archive containing: 397 | . `bert_config.json` a configuration file for the model 398 | . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance 399 | cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
400 | state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models 401 | *inputs, **kwargs: additional input for the specific Bert class 402 | (ex: num_labels for BertForSequenceClassification) 403 | """ 404 | try: 405 | if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: 406 | archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] 407 | else: 408 | raise KeyError 409 | except KeyError: 410 | logger.error(str(pretrained_model_name) + " model is not available/supported.") 411 | 412 | # redirect to the cache, if necessary 413 | try: 414 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) 415 | except FileNotFoundError: 416 | logger.error( 417 | "Model name '{}' was not found in model name list ({}). " 418 | "We assumed '{}' was a path or url but couldn't find any file " 419 | "associated to this path or url.".format( 420 | pretrained_model_name, 421 | ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), 422 | archive_file)) 423 | return None 424 | if resolved_archive_file == archive_file: 425 | logger.info("loading archive file {}".format(archive_file)) 426 | else: 427 | logger.info("loading archive file {} from cache at {}".format( 428 | archive_file, resolved_archive_file)) 429 | tempdir = None 430 | if os.path.isdir(resolved_archive_file): 431 | serialization_dir = resolved_archive_file 432 | else: 433 | # Extract archive to temp dir 434 | tempdir = tempfile.mkdtemp() 435 | logger.info("extracting archive file {} to temp dir {}".format( 436 | resolved_archive_file, tempdir)) 437 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 438 | archive.extractall(tempdir) 439 | serialization_dir = tempdir 440 | # Load config 441 | config_file = os.path.join(serialization_dir, CONFIG_NAME) 442 | config = BertConfig.from_json_file(config_file) 443 | logger.info("Model config {}".format(config)) 444 | # Instantiate model. 
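# (The loading code below also renames legacy LayerNorm state-dict keys
# 'gamma'/'beta' to 'weight'/'bias' before copying the weights into the model.)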
445 | model = cls(config, *inputs, **kwargs) 446 | if state_dict is None: 447 | weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) 448 | state_dict = torch.load(weights_path) 449 | 450 | old_keys = [] 451 | new_keys = [] 452 | for key in state_dict.keys(): 453 | new_key = None 454 | if 'gamma' in key: 455 | new_key = key.replace('gamma', 'weight') 456 | if 'beta' in key: 457 | new_key = key.replace('beta', 'bias') 458 | if new_key: 459 | old_keys.append(key) 460 | new_keys.append(new_key) 461 | for old_key, new_key in zip(old_keys, new_keys): 462 | state_dict[new_key] = state_dict.pop(old_key) 463 | 464 | missing_keys = [] 465 | unexpected_keys = [] 466 | error_msgs = [] 467 | # copy state_dict so _load_from_state_dict can modify it 468 | metadata = getattr(state_dict, '_metadata', None) 469 | state_dict = state_dict.copy() 470 | if metadata is not None: 471 | state_dict._metadata = metadata 472 | 473 | def load(module, prefix=''): 474 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 475 | module._load_from_state_dict( 476 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) 477 | for name, child in module._modules.items(): 478 | if child is not None: 479 | load(child, prefix + name + '.') 480 | load(model, prefix='' if hasattr(model, 'bert') else 'bert.') 481 | if len(missing_keys) > 0: 482 | logger.info("Weights of {} not initialized from pretrained model: {}".format( 483 | model.__class__.__name__, missing_keys)) 484 | if len(unexpected_keys) > 0: 485 | logger.info("Weights from pretrained model not used in {}: {}".format( 486 | model.__class__.__name__, unexpected_keys)) 487 | if tempdir: 488 | # Clean up temp dir 489 | shutil.rmtree(tempdir) 490 | return model 491 | 492 | 493 | class BertModel(PreTrainedBertModel): 494 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 495 | 496 | Params: 497 | config: a BertConfig class instance with the configuration to build a new model 498 | 499 | Inputs: 500 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 501 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 502 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 503 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 504 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 505 | a `sentence B` token (see BERT paper for more details). 506 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 507 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 508 | input sequence length in the current batch. It's the mask that we typically use for attention when 509 | a batch has varying length sentences. 510 | `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. 511 | 512 | Outputs: Tuple of (encoded_layers, pooled_output) 513 | `encoded_layers`: controled by `output_all_encoded_layers` argument: 514 | - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end 515 | of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each 516 | encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], 517 | - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding 518 | to the last attention block of shape [batch_size, sequence_length, hidden_size], 519 | `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a 520 | classifier pretrained on top of the hidden state associated to the first character of the 521 | input (`CLF`) to train on the Next-Sentence task (see BERT's paper). 522 | 523 | Example usage: 524 | ```python 525 | # Already been converted into WordPiece token ids 526 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 527 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 528 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 529 | 530 | config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 531 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 532 | 533 | model = modeling.BertModel(config=config) 534 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 535 | ``` 536 | """ 537 | def __init__(self, config): 538 | super(BertModel, self).__init__(config) 539 | self.embeddings = BertEmbeddings(config) 540 | self.encoder = BertEncoder(config) 541 | self.pooler = BertPooler(config) 542 | self.apply(self.init_bert_weights) 543 | 544 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): 545 | if attention_mask is None: 546 | attention_mask = torch.ones_like(input_ids) 547 | if token_type_ids is None: 548 | token_type_ids = torch.zeros_like(input_ids) 549 | 550 | # We create a 3D attention mask from a 2D tensor mask. 551 | # Sizes are [batch_size, 1, 1, to_seq_length] 552 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 553 | # this attention mask is more simple than the triangular masking of causal attention 554 | # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 555 | extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 556 | 557 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 558 | # masked positions, this operation will create a tensor which is 0.0 for 559 | # positions we want to attend and -10000.0 for masked positions. 560 | # Since we are adding it to the raw scores before the softmax, this is 561 | # effectively the same as removing these entirely. 562 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 563 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 564 | 565 | embedding_output = self.embeddings(input_ids, token_type_ids) 566 | encoded_layers = self.encoder(embedding_output, 567 | extended_attention_mask, 568 | output_all_encoded_layers=output_all_encoded_layers) 569 | sequence_output = encoded_layers[-1] 570 | pooled_output = self.pooler(sequence_output) 571 | if not output_all_encoded_layers: 572 | encoded_layers = encoded_layers[-1] 573 | return encoded_layers, pooled_output 574 | 575 | 576 | class BertForSequenceClassification(PreTrainedBertModel): 577 | """BERT model for classification. 578 | This module is composed of the BERT model with a linear layer on top of 579 | the pooled output. 580 | 581 | Params: 582 | `config`: a BertConfig class instance with the configuration to build a new model. 
583 | `num_labels`: the number of classes for the classifier. Default = 2. 584 | 585 | Inputs: 586 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 587 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 588 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 589 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 590 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 591 | a `sentence B` token (see BERT paper for more details). 592 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 593 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 594 | input sequence length in the current batch. It's the mask that we typically use for attention when 595 | a batch has varying length sentences. 596 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 597 | with indices selected in [0, ..., num_labels]. 598 | 599 | Outputs: 600 | if `labels` is not `None`: 601 | Outputs the CrossEntropy classification loss of the output with the labels. 602 | if `labels` is `None`: 603 | Outputs the classification logits of shape [batch_size, num_labels]. 604 | 605 | Example usage: 606 | ```python 607 | # Already been converted into WordPiece token ids 608 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 609 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 610 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 611 | 612 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 613 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 614 | 615 | num_labels = 2 616 | 617 | model = BertForSequenceClassification(config, num_labels) 618 | logits = model(input_ids, token_type_ids, input_mask) 619 | ``` 620 | """ 621 | def __init__(self, config, num_labels=2): 622 | super(BertForSequenceClassification, self).__init__(config) 623 | self.num_labels = num_labels 624 | self.bert = BertModel(config) 625 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 626 | self.classifier = nn.Linear(config.hidden_size, num_labels) 627 | self.apply(self.init_bert_weights) 628 | 629 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 630 | _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 631 | pooled_output = self.dropout(pooled_output) 632 | logits = self.classifier(pooled_output) 633 | 634 | if labels is not None: 635 | loss_fct = CrossEntropyLoss() 636 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 637 | return loss 638 | else: 639 | return logits 640 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import torch 18 | from torch.optim import Optimizer 19 | from torch.optim.optimizer import required 20 | from torch.nn.utils import clip_grad_norm_ 21 | 22 | 23 | def warmup_linear(x, warmup=0.002): 24 | if x < warmup: 25 | return x/warmup 26 | return 1.0 - x 27 | 28 | 29 | def get_lr_scheduled(group, state): 30 | if group['t_total'] != -1: 31 | schedule_fct = SCHEDULES[group['schedule']] 32 | lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup']) 33 | else: 34 | lr_scheduled = group['lr'] 35 | return lr_scheduled 36 | 37 | 38 | SCHEDULES = { 39 | 'warmup_linear':warmup_linear, 40 | } 41 | 42 | 43 | class BertAdam(Optimizer): 44 | """Implements BERT version of Adam algorithm with weight decay fix. 45 | Params: 46 | lr: learning rate 47 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 48 | t_total: total number of training steps for the learning 49 | rate schedule, -1 means constant learning rate. Default: -1 50 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 51 | b1: Adams b1. Default: 0.9 52 | b2: Adams b2. Default: 0.999 53 | e: Adams epsilon. Default: 1e-6 54 | weight_decay: Weight decay. Default: 0.01 55 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 56 | """ 57 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 58 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 59 | max_grad_norm=1.0): 60 | if lr is not required and lr < 0.0: 61 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 62 | if schedule not in SCHEDULES: 63 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 64 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 65 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 66 | if not 0.0 <= b1 < 1.0: 67 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 68 | if not 0.0 <= b2 < 1.0: 69 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 70 | if not e >= 0.0: 71 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 72 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 73 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 74 | max_grad_norm=max_grad_norm) 75 | super(BertAdam, self).__init__(params, defaults) 76 | 77 | def step(self, closure=None): 78 | """Performs a single optimization step. 79 | Arguments: 80 | closure (callable, optional): A closure that reevaluates the model 81 | and returns the loss. 
82 | """ 83 | loss = None 84 | if closure is not None: 85 | loss = closure() 86 | 87 | for group in self.param_groups: 88 | for p in group['params']: 89 | if p.grad is None: 90 | continue 91 | grad = p.grad.data 92 | if grad.is_sparse: 93 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 94 | 95 | state = self.state[p] 96 | 97 | # State initialization 98 | if len(state) == 0: 99 | state['step'] = 0 100 | # Exponential moving average of gradient values 101 | state['next_m'] = torch.zeros_like(p.data) 102 | # Exponential moving average of squared gradient values 103 | state['next_v'] = torch.zeros_like(p.data) 104 | 105 | next_m, next_v = state['next_m'], state['next_v'] 106 | beta1, beta2 = group['b1'], group['b2'] 107 | 108 | # Add grad clipping 109 | if group['max_grad_norm'] > 0: 110 | clip_grad_norm_(p, group['max_grad_norm']) 111 | 112 | # Decay the first and second moment running average coefficient 113 | # In-place operations to update the averages at the same time 114 | next_m.mul_(beta1).add_(1 - beta1, grad) 115 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 116 | update = next_m / (next_v.sqrt() + group['e']) 117 | 118 | # Just adding the square of the weights to the loss function is *not* 119 | # the correct way of using L2 regularization/weight decay with Adam, 120 | # since that will interact with the m and v parameters in strange ways. 121 | # 122 | # Instead we want to decay the weights in a manner that doesn't interact 123 | # with the m/v parameters. This is equivalent to adding the square 124 | # of the weights to the loss with plain (non-momentum) SGD. 125 | if group['weight_decay'] > 0.0: 126 | update += group['weight_decay'] * p.data 127 | 128 | update_with_lr = get_lr_scheduled(group, state) * update 129 | p.data.add_(-update_with_lr) 130 | 131 | state['step'] += 1 132 | 133 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 134 | # No bias correction 135 | # bias_correction1 = 1 - beta1 ** state['step'] 136 | # bias_correction2 = 1 - beta2 ** state['step'] 137 | 138 | return loss 139 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import os 24 | import logging 25 | 26 | from autokeras_pretrained.constant import Constant 27 | from autokeras_pretrained.bert.utils import cached_path 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 32 | 'bert-base-uncased': Constant.PRETRAINED_VOCAB_BERT_BASE_UNCASED, 33 | 'bert-base-cased': Constant.PRETRAINED_VOCAB_BERT_BASE_UNCASED 34 | } 35 | 36 | VOCAB_NAME = 'vocab.txt' 37 | 38 | 39 | def load_vocab(vocab_file): 40 | """Loads a vocabulary file into a dictionary.""" 41 | vocab = collections.OrderedDict() 42 | index = 0 43 | with open(vocab_file, "r", encoding="utf-8") as reader: 44 | while True: 45 | token = reader.readline() 46 | if not token: 47 | break 48 | token = token.strip() 49 | vocab[token] = index 50 | index += 1 51 | return vocab 52 | 53 | 54 | def whitespace_tokenize(text): 55 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 56 | text = text.strip() 57 | if not text: 58 | return [] 59 | tokens = text.split() 60 | return tokens 61 | 62 | 63 | class BertTokenizer(object): 64 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 65 | 66 | def __init__(self, vocab_file, do_lower_case=True): 67 | if not os.path.isfile(vocab_file): 68 | raise ValueError( 69 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 70 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 71 | self.vocab = load_vocab(vocab_file) 72 | self.ids_to_tokens = collections.OrderedDict( 73 | [(ids, tok) for tok, ids in self.vocab.items()]) 74 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 75 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 76 | 77 | def tokenize(self, text): 78 | split_tokens = [] 79 | for token in self.basic_tokenizer.tokenize(text): 80 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 81 | split_tokens.append(sub_token) 82 | return split_tokens 83 | 84 | def convert_tokens_to_ids(self, tokens): 85 | """Converts a sequence of tokens into ids using the vocab.""" 86 | ids = [] 87 | for token in tokens: 88 | ids.append(self.vocab[token]) 89 | return ids 90 | 91 | @classmethod 92 | def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): 93 | """ 94 | Instantiate a PreTrainedBertModel from a pre-trained model file. 95 | Download and cache the pre-trained model file if needed. 96 | """ 97 | try: 98 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: 99 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] 100 | else: 101 | raise KeyError 102 | except KeyError: 103 | logger.error(str(pretrained_model_name) + " tokenizer is not available/supported.") 104 | 105 | # redirect to the cache, if necessary 106 | try: 107 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 108 | except FileNotFoundError: 109 | logger.error( 110 | "Model name '{}' was not found in model name list ({}). 
" 111 | "We assumed '{}' was a path or url but couldn't find any file " 112 | "associated to this path or url.".format( 113 | pretrained_model_name, 114 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 115 | vocab_file)) 116 | return None 117 | if resolved_vocab_file == vocab_file: 118 | logger.info("loading vocabulary file {}".format(vocab_file)) 119 | else: 120 | logger.info("loading vocabulary file {} from cache at {}".format( 121 | vocab_file, resolved_vocab_file)) 122 | # Instantiate tokenizer. 123 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 124 | return tokenizer 125 | 126 | 127 | class BasicTokenizer(object): 128 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 129 | 130 | def __init__(self, do_lower_case=True): 131 | """Constructs a BasicTokenizer. 132 | 133 | Args: 134 | do_lower_case: Whether to lower case the input. 135 | """ 136 | self.do_lower_case = do_lower_case 137 | 138 | def tokenize(self, text): 139 | """Tokenizes a piece of text.""" 140 | text = self._clean_text(text) 141 | # This was added on November 1st, 2018 for the multilingual and Chinese 142 | # models. This is also applied to the English models now, but it doesn't 143 | # matter since the English models were not trained on any Chinese data 144 | # and generally don't have any Chinese data in them (there are Chinese 145 | # characters in the vocabulary because Wikipedia does have some Chinese 146 | # words in the English Wikipedia.). 147 | text = self._tokenize_chinese_chars(text) 148 | orig_tokens = whitespace_tokenize(text) 149 | split_tokens = [] 150 | for token in orig_tokens: 151 | if self.do_lower_case: 152 | token = token.lower() 153 | token = self._run_strip_accents(token) 154 | split_tokens.extend(self._run_split_on_punc(token)) 155 | 156 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 157 | return output_tokens 158 | 159 | @staticmethod 160 | def _run_strip_accents(text): 161 | """Strips accents from a piece of text.""" 162 | text = unicodedata.normalize("NFD", text) 163 | output = [] 164 | for char in text: 165 | cat = unicodedata.category(char) 166 | if cat == "Mn": 167 | continue 168 | output.append(char) 169 | return "".join(output) 170 | 171 | @staticmethod 172 | def _run_split_on_punc(text): 173 | """Splits punctuation on a piece of text.""" 174 | chars = list(text) 175 | i = 0 176 | start_new_word = True 177 | output = [] 178 | while i < len(chars): 179 | char = chars[i] 180 | if _is_punctuation(char): 181 | output.append([char]) 182 | start_new_word = True 183 | else: 184 | if start_new_word: 185 | output.append([]) 186 | start_new_word = False 187 | output[-1].append(char) 188 | i += 1 189 | return ["".join(x) for x in output] 190 | 191 | @classmethod 192 | def _tokenize_chinese_chars(cls, text): 193 | """Adds whitespace around any CJK character.""" 194 | output = [] 195 | for char in text: 196 | cp = ord(char) 197 | if cls._is_chinese_char(cp): 198 | output.append(" ") 199 | output.append(char) 200 | output.append(" ") 201 | else: 202 | output.append(char) 203 | return "".join(output) 204 | 205 | @staticmethod 206 | def _is_chinese_char(cp): 207 | """Checks whether CP is the codepoint of a CJK character.""" 208 | # This defines a "chinese character" as anything in the CJK Unicode block: 209 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 210 | # 211 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 212 | # despite its name. 
The modern Korean Hangul alphabet is a different block, 213 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 214 | # space-separated words, so they are not treated specially and handled 215 | # like the all of the other languages. 216 | chinese_character_ranges = [ 217 | (0x4E00, 0x9FFF), 218 | (0x3400, 0x4DBF), 219 | (0xF900, 0xFAFF), 220 | (0x20000, 0x2A6DF), 221 | (0x2A700, 0x2B73F), 222 | (0x2B740, 0x2B81F), 223 | (0x2B820, 0x2CEAF), 224 | (0x2F800, 0x2FA1F)] 225 | for start, end in chinese_character_ranges: 226 | if start <= cp <= end: 227 | return True 228 | return False 229 | 230 | @staticmethod 231 | def _clean_text(text): 232 | """Performs invalid character removal and whitespace cleanup on text.""" 233 | output = [] 234 | for char in text: 235 | cp = ord(char) 236 | if cp == 0 or cp == 0xfffd or _is_control(char): 237 | continue 238 | if _is_whitespace(char): 239 | output.append(" ") 240 | else: 241 | output.append(char) 242 | return "".join(output) 243 | 244 | 245 | class WordpieceTokenizer(object): 246 | """Runs WordPiece tokenization.""" 247 | 248 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 249 | self.vocab = vocab 250 | self.unk_token = unk_token 251 | self.max_input_chars_per_word = max_input_chars_per_word 252 | 253 | def tokenize(self, text): 254 | """Tokenizes a piece of text into its word pieces. 255 | 256 | This uses a greedy longest-match-first algorithm to perform tokenization 257 | using the given vocabulary. 258 | 259 | For example: 260 | input = "unaffable" 261 | output = ["un", "##aff", "##able"] 262 | 263 | Args: 264 | text: A single token or whitespace separated tokens. This should have 265 | already been passed through `BasicTokenizer. 266 | 267 | Returns: 268 | A list of wordpiece tokens. 269 | """ 270 | 271 | output_tokens = [] 272 | for token in whitespace_tokenize(text): 273 | chars = list(token) 274 | if len(chars) > self.max_input_chars_per_word: 275 | output_tokens.append(self.unk_token) 276 | continue 277 | 278 | is_bad = False 279 | start = 0 280 | sub_tokens = [] 281 | while start < len(chars): 282 | end = len(chars) 283 | cur_substr = None 284 | while start < end: 285 | substr = "".join(chars[start:end]) 286 | if start > 0: 287 | substr = "##" + substr 288 | if substr in self.vocab: 289 | cur_substr = substr 290 | break 291 | end -= 1 292 | if cur_substr is None: 293 | is_bad = True 294 | break 295 | sub_tokens.append(cur_substr) 296 | start = end 297 | 298 | if is_bad: 299 | output_tokens.append(self.unk_token) 300 | else: 301 | output_tokens.extend(sub_tokens) 302 | return output_tokens 303 | 304 | 305 | def _is_whitespace(char): 306 | """Checks whether `chars` is a whitespace character.""" 307 | # \t, \n, and \r are technically contorl characters but we treat them 308 | # as whitespace since they are generally considered as such. 309 | if char == " " or char == "\t" or char == "\n" or char == "\r": 310 | return True 311 | cat = unicodedata.category(char) 312 | if cat == "Zs": 313 | return True 314 | return False 315 | 316 | 317 | def _is_control(char): 318 | """Checks whether `chars` is a control character.""" 319 | # These are technically control characters but we count them as whitespace 320 | # characters. 
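# (\t, \n and \r are already reported as whitespace by _is_whitespace above,
# so they are explicitly excluded here.)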
321 | if char == "\t" or char == "\n" or char == "\r": 322 | return False 323 | cat = unicodedata.category(char) 324 | if cat.startswith("C"): 325 | return True 326 | return False 327 | 328 | 329 | def _is_punctuation(char): 330 | """Checks whether `chars` is a punctuation character.""" 331 | cp = ord(char) 332 | # We treat all non-letter/number ASCII as punctuation. 333 | # Characters such as "^", "$", and "`" are not in the Unicode 334 | # Punctuation class but we treat them as punctuation anyways, for 335 | # consistency. 336 | punctuation_ranges = [(33, 47), (58, 64), (91, 96), (123, 126)] 337 | for start, end in punctuation_ranges: 338 | if start <= cp <= end: 339 | return True 340 | cat = unicodedata.category(char) 341 | if cat.startswith("P"): 342 | return True 343 | return False 344 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Original work Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Modified work Copyright 2019 The AutoKeras team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | import torch 20 | 21 | from pathlib import Path 22 | 23 | from autokeras_pretrained.utils import download_file_from_google_drive 24 | 25 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 26 | Path.home() / '.pytorch_pretrained_bert')) 27 | 28 | 29 | class InputFeatures(object): 30 | """A single set of features of data.""" 31 | 32 | def __init__(self, input_ids, input_mask, segment_ids): 33 | self.input_ids = input_ids 34 | self.input_mask = input_mask 35 | self.segment_ids = segment_ids 36 | 37 | 38 | def convert_examples_to_features(examples, tokenizer, max_seq_length): 39 | """ Convert text examples to BERT specific input format. 40 | 41 | Tokenize the input text and convert into features. 42 | 43 | Args: 44 | examples: Text data. 45 | tokenizer: Tokenizer to process the text into tokens. 46 | max_seq_length: The maximum length of the text sequence supported. 47 | 48 | Returns: 49 | all_input_ids: ndarray containing the ids for each token. 50 | all_input_masks: ndarray containing 1's or 0's based on if the tokens are real or padded. 51 | all_segment_ids: ndarray containing all 0's since it is a classification task. 
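Example (an illustrative sketch, assuming a loaded BertTokenizer whose
vocabulary contains both words, and max_seq_length=8): the single example
"good movie" is tokenized, framed as ["[CLS]", "good", "movie", "[SEP]"],
converted to ids and zero-padded to length 8, so the function returns three
torch.LongTensor objects of shape [1, 8] (ids, mask and segment ids respectively).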
52 | """ 53 | features = [] 54 | for (_, example) in enumerate(examples): 55 | tokens_a = tokenizer.tokenize(example) 56 | 57 | if len(tokens_a) > max_seq_length - 2: 58 | tokens_a = tokens_a[:(max_seq_length - 2)] 59 | 60 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 61 | segment_ids = [0] * len(tokens) 62 | 63 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 64 | 65 | input_mask = [1] * len(input_ids) 66 | 67 | padding = [0] * (max_seq_length - len(input_ids)) 68 | input_ids += padding 69 | input_mask += padding 70 | segment_ids += padding 71 | 72 | if len(input_ids) != max_seq_length or \ 73 | len(input_mask) != max_seq_length or \ 74 | len(segment_ids) != max_seq_length: 75 | raise AssertionError() 76 | 77 | features.append(InputFeatures(input_ids=input_ids, 78 | input_mask=input_mask, 79 | segment_ids=segment_ids)) 80 | 81 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 82 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 83 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) 84 | 85 | return all_input_ids, all_input_mask, all_segment_ids 86 | 87 | 88 | def cached_path(file_info, cache_dir=None): 89 | if cache_dir is None: 90 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 91 | 92 | os.makedirs(cache_dir, exist_ok=True) 93 | file_path = os.path.join(cache_dir, file_info.local_name) 94 | 95 | if not os.path.exists(file_path): 96 | download_file_from_google_drive(file_id=file_info.google_drive_id, 97 | dest_path=file_path, 98 | verbose=True) 99 | return file_path 100 | -------------------------------------------------------------------------------- /autokeras_pretrained/constant.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | GoogleDriveFile = namedtuple('GoogleDriveFile', ['google_drive_id', 'local_name']) 4 | 5 | 6 | class Constant: 7 | 8 | # Text Classifier 9 | 10 | BERT_TRAINER_EPOCHS = 4 11 | BERT_TRAINER_BATCH_SIZE = 32 12 | 13 | # text preprocessor 14 | 15 | EMBEDDING_DIM = 100 16 | MAX_SEQUENCE_LENGTH = 400 17 | MAX_NB_WORDS = 5000 18 | EXTRACT_PATH = "glove/" 19 | STORE_PATH = '' 20 | 21 | # Download file name 22 | 23 | FILE_PATH = "glove.zip" 24 | PRE_TRAIN_FILE_LINK = "http://nlp.stanford.edu/data/glove.6B.zip" 25 | PRE_TRAIN_FILE_NAME = "glove.6B.100d.txt" 26 | 27 | PRE_TRAIN_DETECTION_FILE_LINK = "https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth" 28 | 29 | VOICE_GENERATOR_MODELS = [ 30 | GoogleDriveFile(google_drive_id='1E-B92LZz4dgg8DU81D6pyhOzM9yvvBTj', local_name='vg.pth')] 31 | VOICE_RECONGINIZER_MODELS = [ 32 | GoogleDriveFile(google_drive_id='1RQQB-Yd-aqb6scWtnu1K4nlSTxTyaKjI', local_name='vr.pth')] 33 | FACE_DETECTOR_MODELS = [ 34 | GoogleDriveFile(google_drive_id='1QJWKpAHRrAjrYPl6hQNDaoyBjoa_LRgz', local_name='pnet.pt'), 35 | GoogleDriveFile(google_drive_id='10aCiR393E6TLkp9KPPl4JhZamYqUVBO1', local_name='rnet.pt'), 36 | GoogleDriveFile(google_drive_id='1RRBtPlzw46peS-A8pyYGsPRHHFIUrSVV', local_name='onet.pt')] 37 | OBJECT_DETECTOR_MODELS = [ 38 | GoogleDriveFile(google_drive_id='1QGG1trfj-z5_2OGNoSarUB4wx81cG-sa', local_name='oo.pth')] 39 | SENTIMENT_ANALYSIS_MODELS = [ 40 | GoogleDriveFile(google_drive_id='1flRlQjfIa2toQ6HNmInhqrh4NuxGh8pT', local_name='sa.pth')] 41 | TOPIC_CLASSIFIER_MODELS = [ 42 | GoogleDriveFile(google_drive_id='1U7C3xPid1ZvBKpkfW9KikrmNui0yJqnk', local_name='tc.pth')] 43 | PRETRAINED_VOCAB_BERT_BASE_UNCASED = \ 44 | 
GoogleDriveFile(google_drive_id='1hlPkUSPeT5ZQBYZ1Z734BbnHIvpx2ZLj', local_name='vbbu.txt') 45 | PRETRAINED_VOCAB_BERT_BASE_CASED = \ 46 | GoogleDriveFile(google_drive_id='1FLytUhOIF0mTfA4A9MtE3aQ1kJr96oTR', local_name='vbbc.txt') 47 | PRETRAINED_MODEL_BERT_BASE_UNCASED = \ 48 | GoogleDriveFile(google_drive_id='1rp1rVBoQwqgvg-JE8JwLL-adgLE07oTG', local_name='mbbu.pth') 49 | PRETRAINED_MODEL_BERT_BASE_CASED = \ 50 | GoogleDriveFile(google_drive_id='1YKoGj-e4zoyTabt5dYpgEPe-PAmjOTDV', local_name='mbbc.pth') 51 | 52 | VOICE_RECONGINIZER_LABELS = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 53 | VOICE_RECONGINIZER_AUDIO_CONF = {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 54 | 'window': 'hamming', 'noise_dir': None, 'noise_prob': 0.4, 55 | 'noise_levels': (0.0, 0.5)} 56 | 57 | # Image Resize 58 | 59 | MAX_IMAGE_SIZE = 128 * 128 60 | 61 | # SYS Constant 62 | 63 | SYS_LINUX = 'linux' 64 | SYS_WINDOWS = 'windows' 65 | SYS_GOOGLE_COLAB = 'goog_colab' 66 | 67 | # Google drive downloader 68 | CHUNK_SIZE = 32768 69 | DOWNLOAD_URL = "https://docs.google.com/uc?export=download" 70 | -------------------------------------------------------------------------------- /autokeras_pretrained/face_detector.py: -------------------------------------------------------------------------------- 1 | # This is DFace's implementation of MTCNN modified for AutoKeras 2 | # Link to DFace: https://github.com/kuaikuaikim/DFace 3 | import os 4 | 5 | import cv2 6 | import matplotlib.patches as patches 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torchvision.transforms as transforms 12 | from torch.autograd.variable import Variable 13 | 14 | from autokeras_pretrained.constant import Constant 15 | from autokeras_pretrained.base import Pretrained 16 | 17 | 18 | def weights_init(m): 19 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform_(m.weight.data) 21 | nn.init.constant_(m.bias, 0.1) 22 | 23 | 24 | class PNet(nn.Module): 25 | 26 | def __init__(self): 27 | super(PNet, self).__init__() 28 | 29 | self.pre_layer = nn.Sequential( 30 | nn.Conv2d(3, 10, kernel_size=3, stride=1), 31 | nn.PReLU(), 32 | nn.MaxPool2d(kernel_size=2, stride=2), 33 | nn.Conv2d(10, 16, kernel_size=3, stride=1), 34 | nn.PReLU(), 35 | nn.Conv2d(16, 32, kernel_size=3, stride=1), 36 | nn.PReLU() 37 | ) 38 | self.conv4_1 = nn.Conv2d(32, 1, kernel_size=1, stride=1) 39 | self.conv4_2 = nn.Conv2d(32, 4, kernel_size=1, stride=1) 40 | self.conv4_3 = nn.Conv2d(32, 10, kernel_size=1, stride=1) 41 | 42 | self.apply(weights_init) 43 | 44 | def forward(self, x): 45 | x = self.pre_layer(x) 46 | label = torch.sigmoid(self.conv4_1(x)) 47 | offset = self.conv4_2(x) 48 | return label, offset 49 | 50 | 51 | class RNet(nn.Module): 52 | 53 | def __init__(self): 54 | super(RNet, self).__init__() 55 | 56 | self.pre_layer = nn.Sequential( 57 | nn.Conv2d(3, 28, kernel_size=3, stride=1), 58 | nn.PReLU(), 59 | nn.MaxPool2d(kernel_size=3, stride=2), 60 | nn.Conv2d(28, 48, kernel_size=3, stride=1), 61 | nn.PReLU(), 62 | nn.MaxPool2d(kernel_size=3, stride=2), 63 | nn.Conv2d(48, 64, kernel_size=2, stride=1), 64 | nn.PReLU() 65 | 66 | ) 67 | self.conv4 = nn.Linear(64 * 2 * 2, 128) 68 | self.prelu4 = nn.PReLU() 69 | self.conv5_1 = nn.Linear(128, 1) 70 | self.conv5_2 = nn.Linear(128, 4) 71 | self.conv5_3 = nn.Linear(128, 10) 72 | self.apply(weights_init) 73 | 74 | def forward(self, x): 75 | x = self.pre_layer(x) 76 | x = x.view(x.size(0), -1) 77 | x = self.conv4(x) 78 | x = 
self.prelu4(x) 79 | det = torch.sigmoid(self.conv5_1(x)) 80 | box = self.conv5_2(x) 81 | return det, box 82 | 83 | 84 | class ONet(nn.Module): 85 | 86 | def __init__(self): 87 | super(ONet, self).__init__() 88 | 89 | self.pre_layer = nn.Sequential( 90 | nn.Conv2d(3, 32, kernel_size=3, stride=1), 91 | nn.PReLU(), 92 | nn.MaxPool2d(kernel_size=3, stride=2), 93 | nn.Conv2d(32, 64, kernel_size=3, stride=1), 94 | nn.PReLU(), 95 | nn.MaxPool2d(kernel_size=3, stride=2), 96 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 97 | nn.PReLU(), 98 | nn.MaxPool2d(kernel_size=2, stride=2), 99 | nn.Conv2d(64, 128, kernel_size=2, stride=1), 100 | nn.PReLU() 101 | ) 102 | self.conv5 = nn.Linear(128 * 2 * 2, 256) 103 | self.prelu5 = nn.PReLU() 104 | self.conv6_1 = nn.Linear(256, 1) 105 | self.conv6_2 = nn.Linear(256, 4) 106 | self.conv6_3 = nn.Linear(256, 10) 107 | self.apply(weights_init) 108 | 109 | def forward(self, x): 110 | x = self.pre_layer(x) 111 | x = x.view(x.size(0), -1) 112 | x = self.conv5(x) 113 | x = self.prelu5(x) 114 | det = torch.sigmoid(self.conv6_1(x)) 115 | box = self.conv6_2(x) 116 | landmark = self.conv6_3(x) 117 | return det, box, landmark 118 | 119 | 120 | def get_square_bbox(bbox): 121 | square_bbox = bbox.copy() 122 | 123 | h = bbox[:, 3] - bbox[:, 1] + 1 124 | w = bbox[:, 2] - bbox[:, 0] + 1 125 | l = np.maximum(h, w) 126 | square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - l * 0.5 127 | square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - l * 0.5 128 | 129 | square_bbox[:, 2] = square_bbox[:, 0] + l - 1 130 | square_bbox[:, 3] = square_bbox[:, 1] + l - 1 131 | return square_bbox 132 | 133 | 134 | def generate_bounding_box(map_, reg, scale, threshold): 135 | stride = 2 136 | cellsize = 12 137 | 138 | t_index = np.where(map_ > threshold) 139 | 140 | if t_index[0].size == 0: 141 | return np.array([]) 142 | 143 | dx1, dy1, dx2, dy2 = [reg[0, t_index[0], t_index[1], i] for i in range(4)] 144 | reg = np.array([dx1, dy1, dx2, dy2]) 145 | 146 | score = map_[t_index[0], t_index[1], 0] 147 | boundingbox = np.vstack([np.round((stride * t_index[1]) / scale), 148 | np.round((stride * t_index[0]) / scale), 149 | np.round((stride * t_index[1] + cellsize) / scale), 150 | np.round((stride * t_index[0] + cellsize) / scale), 151 | score, 152 | reg 153 | ]) 154 | 155 | return boundingbox.T 156 | 157 | 158 | def resize_image(img, scale): 159 | height, width, _ = img.shape 160 | new_height = int(height * scale) 161 | new_width = int(width * scale) 162 | new_dim = (new_width, new_height) 163 | img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR) 164 | return img_resized 165 | 166 | 167 | def pad(bboxes, w, h): 168 | tmpw = (bboxes[:, 2] - bboxes[:, 0] + 1).astype(np.int32) 169 | tmph = (bboxes[:, 3] - bboxes[:, 1] + 1).astype(np.int32) 170 | numbox = bboxes.shape[0] 171 | 172 | dx = np.zeros((numbox,)) 173 | dy = np.zeros((numbox,)) 174 | edx, edy = tmpw.copy() - 1, tmph.copy() - 1 175 | 176 | x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] 177 | 178 | tmp_index = np.where(ex > w - 1) 179 | edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index] 180 | ex[tmp_index] = w - 1 181 | 182 | tmp_index = np.where(ey > h - 1) 183 | edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index] 184 | ey[tmp_index] = h - 1 185 | 186 | tmp_index = np.where(x < 0) 187 | dx[tmp_index] = 0 - x[tmp_index] 188 | x[tmp_index] = 0 189 | 190 | tmp_index = np.where(y < 0) 191 | dy[tmp_index] = 0 - y[tmp_index] 192 | y[tmp_index] = 0 193 | 194 | return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] 195 | 
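# Note: dy/edy and dx/edx index into the zero-padded crop while y/ey and x/ex
# index into the original image; both pairs are clipped above so that boxes
# partially outside the frame can still be copied without an index error.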
return_list = [item.astype(np.int32) for item in return_list] 196 | 197 | return return_list 198 | 199 | 200 | def nms(dets, thresh, mode="Union"): 201 | x1 = dets[:, 0] 202 | y1 = dets[:, 1] 203 | x2 = dets[:, 2] 204 | y2 = dets[:, 3] 205 | scores = dets[:, 4] 206 | 207 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 208 | order = scores.argsort()[::-1] 209 | 210 | keep = [] 211 | while order.size > 0: 212 | i = order[0] 213 | keep.append(i) 214 | xx1 = np.maximum(x1[i], x1[order[1:]]) 215 | yy1 = np.maximum(y1[i], y1[order[1:]]) 216 | xx2 = np.minimum(x2[i], x2[order[1:]]) 217 | yy2 = np.minimum(y2[i], y2[order[1:]]) 218 | 219 | w = np.maximum(0.0, xx2 - xx1 + 1) 220 | h = np.maximum(0.0, yy2 - yy1 + 1) 221 | inter = w * h 222 | if mode == "Union": 223 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 224 | elif mode == "Minimum": 225 | ovr = inter / np.minimum(areas[i], areas[order[1:]]) 226 | 227 | inds = np.where(ovr <= thresh)[0] 228 | order = order[inds + 1] 229 | 230 | return keep 231 | 232 | 233 | def convert_image_to_tensor(image): 234 | transform = transforms.ToTensor() 235 | return transform(image) 236 | 237 | 238 | def convert_chw_tensor_to_hwc_numpy(tensor): 239 | if isinstance(tensor, Variable): 240 | return np.transpose(tensor.data.numpy(), (0, 2, 3, 1)) 241 | elif isinstance(tensor, torch.FloatTensor): 242 | return np.transpose(tensor.numpy(), (0, 2, 3, 1)) 243 | else: 244 | raise Exception("covert b*c*h*w tensor to b*h*w*c numpy error.This tensor must have 4 dimension.") 245 | 246 | 247 | def vis_face(im_array, dets, output_file_path, landmarks=None): 248 | fig, ax = plt.subplots(1) 249 | ax.imshow(im_array) 250 | 251 | for i in range(dets.shape[0]): 252 | bbox = dets[i, :4] 253 | 254 | rect = plt.Rectangle((bbox[0], bbox[1]), 255 | bbox[2] - bbox[0], 256 | bbox[3] - bbox[1], fill=False, 257 | edgecolor='yellow', linewidth=0.9) 258 | ax.add_patch(rect) 259 | 260 | if landmarks is not None: 261 | for i in range(landmarks.shape[0]): 262 | landmarks_one = landmarks[i, :] 263 | landmarks_one = landmarks_one.reshape((5, 2)) 264 | for j in range(5): 265 | cir1 = patches.Circle(xy=(landmarks_one[j, 0], landmarks_one[j, 1]), radius=2, alpha=0.4, color="red") 266 | ax.add_patch(cir1) 267 | plt.axis('off') 268 | fig.savefig(output_file_path, bbox_inches='tight', pad_inches=0) 269 | 270 | 271 | class FaceDetector(Pretrained): 272 | """A class to predict faces using the MTCNN pre-trained model. 273 | """ 274 | 275 | def __init__(self, **kwargs): 276 | super().__init__(**kwargs) 277 | pnet, rnet, onet = (torch.load(path, map_location=lambda storage, loc: storage) for path in self.local_paths) 278 | 279 | self.pnet_detector = PNet() 280 | self.pnet_detector.load_state_dict(pnet) 281 | self.pnet_detector = self.pnet_detector.to(self.device) 282 | self.pnet_detector.eval() 283 | 284 | self.rnet_detector = RNet() 285 | self.rnet_detector.load_state_dict(rnet) 286 | self.rnet_detector = self.rnet_detector.to(self.device) 287 | self.rnet_detector.eval() 288 | 289 | self.onet_detector = ONet() 290 | self.onet_detector.load_state_dict(onet) 291 | self.onet_detector = self.onet_detector.to(self.device) 292 | self.onet_detector.eval() 293 | 294 | self.min_face_size = 24 295 | self.stride = 2 296 | self.threshold = [0.6, 0.7, 0.7] 297 | self.scale_factor = 0.709 298 | 299 | @property 300 | def _google_drive_files(self): 301 | return Constant.FACE_DETECTOR_MODELS 302 | 303 | def predict(self, img_path, output_file_path=None): 304 | """Predicts faces in an image. 
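        A minimal usage sketch (the file names are placeholders; assumes the default
        constructor, which fetches the pre-trained PNet/RNet/ONet weights on first use):

            detector = FaceDetector()
            bboxes, landmarks = detector.predict('photo.jpg', output_file_path='out.jpg')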
305 | 306 | Args: 307 | img_path: A string. The path to the image on which the prediction is to be done. 308 | output_file_path: A string. The path where the output image is to be saved after the prediction. `None` by default. 309 | 310 | Returns: 311 | A tuple containing numpy arrays of bounding boxes and landmarks. Bounding boxes are of shape `(n, 5)` and 312 | landmarks are of shape `(n, 10)` where `n` is the number of faces predicted. Each bounding box is of length 313 | 5 and the corresponding rectangle is defined by the first four values. Each bounding box has five landmarks 314 | represented by 10 coordinates. 315 | """ 316 | if not os.path.exists(img_path): 317 | raise ValueError('Image does not exist') 318 | img = cv2.imread(img_path) 319 | img_bg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 320 | bounding_boxes, landmarks = self.detect_face(img) 321 | if output_file_path is not None: 322 | vis_face(img_bg, bounding_boxes, output_file_path, landmarks) 323 | return bounding_boxes, landmarks 324 | 325 | def detect_pnet(self, im): 326 | h, w, c = im.shape 327 | net_size = 12 328 | 329 | current_scale = float(net_size) / self.min_face_size 330 | im_resized = resize_image(im, current_scale) 331 | current_height, current_width, _ = im_resized.shape 332 | 333 | all_boxes = list() 334 | while min(current_height, current_width) > net_size: 335 | feed_imgs = [] 336 | image_tensor = convert_image_to_tensor(im_resized) 337 | feed_imgs.append(image_tensor) 338 | feed_imgs = torch.stack(feed_imgs) 339 | feed_imgs = Variable(feed_imgs) 340 | 341 | feed_imgs = feed_imgs.to(self.device) 342 | 343 | cls_map, reg = self.pnet_detector(feed_imgs) 344 | 345 | cls_map_np = convert_chw_tensor_to_hwc_numpy(cls_map.cpu()) 346 | reg_np = convert_chw_tensor_to_hwc_numpy(reg.cpu()) 347 | 348 | boxes = generate_bounding_box(cls_map_np[0, :, :], reg_np, current_scale, self.threshold[0]) 349 | 350 | current_scale *= self.scale_factor 351 | im_resized = resize_image(im, current_scale) 352 | current_height, current_width, _ = im_resized.shape 353 | 354 | if boxes.size == 0: 355 | continue 356 | keep = nms(boxes[:, :5], 0.5, 'Union') 357 | boxes = boxes[keep] 358 | all_boxes.append(boxes) 359 | 360 | if len(all_boxes) == 0: 361 | return None, None 362 | 363 | all_boxes = np.vstack(all_boxes) 364 | 365 | keep = nms(all_boxes[:, 0:5], 0.7, 'Union') 366 | all_boxes = all_boxes[keep] 367 | 368 | bw = all_boxes[:, 2] - all_boxes[:, 0] + 1 369 | bh = all_boxes[:, 3] - all_boxes[:, 1] + 1 370 | 371 | boxes = np.vstack([all_boxes[:, 0], 372 | all_boxes[:, 1], 373 | all_boxes[:, 2], 374 | all_boxes[:, 3], 375 | all_boxes[:, 4] 376 | ]) 377 | 378 | boxes = boxes.T 379 | 380 | align_topx = all_boxes[:, 0] + all_boxes[:, 5] * bw 381 | align_topy = all_boxes[:, 1] + all_boxes[:, 6] * bh 382 | align_bottomx = all_boxes[:, 2] + all_boxes[:, 7] * bw 383 | align_bottomy = all_boxes[:, 3] + all_boxes[:, 8] * bh 384 | 385 | boxes_align = np.vstack([align_topx, 386 | align_topy, 387 | align_bottomx, 388 | align_bottomy, 389 | all_boxes[:, 4] 390 | ]) 391 | boxes_align = boxes_align.T 392 | 393 | return boxes, boxes_align 394 | 395 | def detect_rnet(self, im, dets): 396 | h, w, c = im.shape 397 | 398 | if dets is None: 399 | return None, None 400 | 401 | dets = get_square_bbox(dets) 402 | dets[:, 0:4] = np.round(dets[:, 0:4]) 403 | 404 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(dets, w, h) 405 | num_boxes = dets.shape[0] 406 | 407 | cropped_ims_tensors = [] 408 | for i in range(num_boxes): 409 | tmp = np.zeros((tmph[i], tmpw[i], 
3), dtype=np.uint8) 410 | tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :] 411 | crop_im = cv2.resize(tmp, (24, 24)) 412 | crop_im_tensor = convert_image_to_tensor(crop_im) 413 | cropped_ims_tensors.append(crop_im_tensor) 414 | feed_imgs = Variable(torch.stack(cropped_ims_tensors)) 415 | 416 | feed_imgs = feed_imgs.to(self.device) 417 | 418 | cls_map, reg = self.rnet_detector(feed_imgs) 419 | 420 | cls_map = cls_map.cpu().data.numpy() 421 | reg = reg.cpu().data.numpy() 422 | 423 | keep_inds = np.where(cls_map > self.threshold[1])[0] 424 | 425 | if len(keep_inds) > 0: 426 | boxes = dets[keep_inds] 427 | cls = cls_map[keep_inds] 428 | reg = reg[keep_inds] 429 | else: 430 | return None, None 431 | 432 | keep = nms(boxes, 0.7) 433 | 434 | if len(keep) == 0: 435 | return None, None 436 | 437 | keep_cls = cls[keep] 438 | keep_boxes = boxes[keep] 439 | keep_reg = reg[keep] 440 | 441 | bw = keep_boxes[:, 2] - keep_boxes[:, 0] + 1 442 | bh = keep_boxes[:, 3] - keep_boxes[:, 1] + 1 443 | 444 | boxes = np.vstack([keep_boxes[:, 0], 445 | keep_boxes[:, 1], 446 | keep_boxes[:, 2], 447 | keep_boxes[:, 3], 448 | keep_cls[:, 0] 449 | ]) 450 | 451 | align_topx = keep_boxes[:, 0] + keep_reg[:, 0] * bw 452 | align_topy = keep_boxes[:, 1] + keep_reg[:, 1] * bh 453 | align_bottomx = keep_boxes[:, 2] + keep_reg[:, 2] * bw 454 | align_bottomy = keep_boxes[:, 3] + keep_reg[:, 3] * bh 455 | 456 | boxes_align = np.vstack([align_topx, 457 | align_topy, 458 | align_bottomx, 459 | align_bottomy, 460 | keep_cls[:, 0] 461 | ]) 462 | 463 | boxes = boxes.T 464 | boxes_align = boxes_align.T 465 | 466 | return boxes, boxes_align 467 | 468 | def detect_onet(self, im, dets): 469 | h, w, _ = im.shape 470 | 471 | if dets is None: 472 | return None, None 473 | 474 | dets = get_square_bbox(dets) 475 | dets[:, 0:4] = np.round(dets[:, 0:4]) 476 | 477 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(dets, w, h) 478 | num_boxes = dets.shape[0] 479 | 480 | cropped_ims_tensors = [] 481 | for i in range(num_boxes): 482 | tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) 483 | tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :] 484 | crop_im = cv2.resize(tmp, (48, 48)) 485 | crop_im_tensor = convert_image_to_tensor(crop_im) 486 | cropped_ims_tensors.append(crop_im_tensor) 487 | feed_imgs = Variable(torch.stack(cropped_ims_tensors)) 488 | 489 | feed_imgs = feed_imgs.to(self.device) 490 | 491 | cls_map, reg, landmark = self.onet_detector(feed_imgs) 492 | 493 | cls_map = cls_map.cpu().data.numpy() 494 | reg = reg.cpu().data.numpy() 495 | landmark = landmark.cpu().data.numpy() 496 | 497 | keep_inds = np.where(cls_map > self.threshold[2])[0] 498 | 499 | if len(keep_inds) > 0: 500 | boxes = dets[keep_inds] 501 | cls = cls_map[keep_inds] 502 | reg = reg[keep_inds] 503 | landmark = landmark[keep_inds] 504 | else: 505 | return None, None 506 | 507 | keep = nms(boxes, 0.7, mode="Minimum") 508 | 509 | if len(keep) == 0: 510 | return None, None 511 | 512 | keep_cls = cls[keep] 513 | keep_boxes = boxes[keep] 514 | keep_reg = reg[keep] 515 | keep_landmark = landmark[keep] 516 | 517 | bw = keep_boxes[:, 2] - keep_boxes[:, 0] + 1 518 | bh = keep_boxes[:, 3] - keep_boxes[:, 1] + 1 519 | 520 | align_topx = keep_boxes[:, 0] + keep_reg[:, 0] * bw 521 | align_topy = keep_boxes[:, 1] + keep_reg[:, 1] * bh 522 | align_bottomx = keep_boxes[:, 2] + keep_reg[:, 2] * bw 523 | align_bottomy = keep_boxes[:, 3] + keep_reg[:, 3] * bh 524 | 525 | align_landmark_topx = keep_boxes[:, 0] 526 | 
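# Note: the ten landmark outputs are offsets relative to the box's top-left
# corner, expressed as fractions of the box width/height, so they are rescaled
# with bw/bh below before being returned.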
align_landmark_topy = keep_boxes[:, 1] 527 | 528 | boxes_align = np.vstack([align_topx, 529 | align_topy, 530 | align_bottomx, 531 | align_bottomy, 532 | keep_cls[:, 0] 533 | ]) 534 | 535 | boxes_align = boxes_align.T 536 | 537 | landmark = np.vstack([ 538 | align_landmark_topx + keep_landmark[:, 0] * bw, 539 | align_landmark_topy + keep_landmark[:, 1] * bh, 540 | align_landmark_topx + keep_landmark[:, 2] * bw, 541 | align_landmark_topy + keep_landmark[:, 3] * bh, 542 | align_landmark_topx + keep_landmark[:, 4] * bw, 543 | align_landmark_topy + keep_landmark[:, 5] * bh, 544 | align_landmark_topx + keep_landmark[:, 6] * bw, 545 | align_landmark_topy + keep_landmark[:, 7] * bh, 546 | align_landmark_topx + keep_landmark[:, 8] * bw, 547 | align_landmark_topy + keep_landmark[:, 9] * bh, 548 | ]) 549 | 550 | landmark_align = landmark.T 551 | 552 | return boxes_align, landmark_align 553 | 554 | def detect_face(self, img): 555 | boxes_align = np.array([]) 556 | landmark_align = np.array([]) 557 | 558 | if self.pnet_detector: 559 | _, boxes_align = self.detect_pnet(img) 560 | if boxes_align is None: 561 | return np.array([]), np.array([]) 562 | 563 | if self.rnet_detector: 564 | boxes, boxes_align = self.detect_rnet(img, boxes_align) 565 | if boxes_align is None: 566 | return np.array([]), np.array([]) 567 | 568 | if self.onet_detector: 569 | boxes_align, landmark_align = self.detect_onet(img, boxes_align) 570 | if boxes_align is None: 571 | return np.array([]), np.array([]) 572 | 573 | return boxes_align, landmark_align 574 | -------------------------------------------------------------------------------- /autokeras_pretrained/object_detector.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------- 2 | # The idea of the classes and functions in this file is largely borrowed from 3 | # https://github.com/amdegroot/ssd.pytorch 4 | # A huge thank you to the authors: Max deGroot and Ellis Brown 5 | # modified by: Wuyang Chen, Haifeng Jin 6 | # ---------------------------------- 7 | 8 | import os 9 | from itertools import product as product 10 | from math import sqrt as sqrt 11 | 12 | import cv2 13 | import numpy as np 14 | import torch 15 | from matplotlib import pyplot as plt 16 | from torch import nn as nn 17 | from torch.autograd import Variable, Function 18 | from torch.nn import functional 19 | from torch.nn import init as init 20 | 21 | from autokeras_pretrained.utils import get_device 22 | from autokeras_pretrained.constant import Constant 23 | from autokeras_pretrained.base import Pretrained 24 | 25 | """VOC Dataset Classes 26 | 27 | Original author: Francisco Massa 28 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 29 | 30 | Updated by: Ellis Brown, Max deGroot 31 | """ 32 | 33 | # gets home dir cross platform 34 | HOME = os.path.expanduser("~") 35 | 36 | # for making bounding boxes pretty 37 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 38 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 39 | 40 | MEANS = (104, 117, 123) 41 | 42 | # SSD300 CONFIGS 43 | VOC = { 44 | 'num_classes': 21, 45 | 'lr_steps': (80000, 100000, 120000), 46 | 'max_iter': 120000, 47 | 'feature_maps': [38, 19, 10, 5, 3, 1], 48 | 'min_dim': 300, 49 | 'steps': [8, 16, 32, 64, 100, 300], 50 | 'min_sizes': [30, 60, 111, 162, 213, 264], 51 | 'max_sizes': [60, 111, 162, 213, 264, 315], 52 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 53 | 'variance': [0.1, 0.2], 54 | 'clip': True, 55 
| 'name': 'VOC', 56 | } 57 | 58 | COCO = { 59 | 'num_classes': 91, 60 | 'lr_steps': (280000, 360000, 400000), 61 | 'max_iter': 400000, 62 | 'feature_maps': [38, 19, 10, 5, 3, 1], 63 | 'min_dim': 300, 64 | 'steps': [8, 16, 32, 64, 100, 300], 65 | 'min_sizes': [21, 45, 99, 153, 207, 261], 66 | 'max_sizes': [45, 99, 153, 207, 261, 315], 67 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 68 | 'variance': [0.1, 0.2], 69 | 'clip': True, 70 | 'name': 'COCO', 71 | } 72 | 73 | VOC_CLASSES = ( # always index 0 74 | 'aeroplane', 'bicycle', 'bird', 'boat', 75 | 'bottle', 'bus', 'car', 'cat', 'chair', 76 | 'cow', 'diningtable', 'dog', 'horse', 77 | 'motorbike', 'person', 'pottedplant', 78 | 'sheep', 'sofa', 'train', 'tvmonitor') 79 | 80 | # note: if you used our download scripts, this should be right 81 | VOC_ROOT = os.path.join(HOME, "object_detection/data/VOCdevkit/") 82 | 83 | 84 | class SSD(nn.Module): 85 | """Single Shot Multibox Architecture 86 | The network is composed of a base VGG network followed by the 87 | added multibox conv layers. Each multibox layer branches into 88 | 1) conv2d for class conf scores 89 | 2) conv2d for localization predictions 90 | 3) associated priorbox layer to produce default bounding 91 | boxes specific to the layer's feature map size. 92 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 93 | 94 | Args: 95 | phase: (string) Can be "test" or "train" 96 | size: input image size 97 | base: VGG16 layers for input, size of either 300 or 500 98 | extras: extra layers that feed to multibox loc and conf layers 99 | head: "multibox head" consists of loc and conf conv layers 100 | """ 101 | 102 | def __init__(self, phase, size, base, extras, head, num_classes, device): 103 | super(SSD, self).__init__() 104 | self.phase = phase 105 | self.num_classes = num_classes 106 | self.cfg = (COCO, VOC)[num_classes == 21] 107 | self.priorbox = PriorBox(self.cfg) 108 | self.device = device 109 | self.priors = Variable(self.priorbox.forward(), volatile=True).to(self.device) 110 | self.size = size 111 | 112 | # SSD network 113 | self.vgg = nn.ModuleList(base) 114 | # Layer learns to scale the l2 normalized features from conv4_3 115 | self.L2Norm = L2Norm(512, 20) 116 | self.extras = nn.ModuleList(extras) 117 | 118 | self.loc = nn.ModuleList(head[0]) 119 | self.conf = nn.ModuleList(head[1]) 120 | 121 | if phase == 'test': 122 | self.softmax = nn.Softmax(dim=-1) 123 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 124 | 125 | def forward(self, x): 126 | """Applies network layers and ops on input image(s) x. 127 | 128 | Args: 129 | x: input image or batch of images. Shape: [batch,3,300,300]. 130 | 131 | Return: 132 | Depending on phase: 133 | test: 134 | Variable(tensor) of output class label predictions, 135 | confidence score, and corresponding location predictions for 136 | each object detected. 
Shape: [batch,topk,7] 137 | 138 | train: 139 | list of concat outputs from: 140 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 141 | 2: localization layers, Shape: [batch,num_priors*4] 142 | 3: priorbox layers, Shape: [2,num_priors*4] 143 | """ 144 | sources = list() 145 | loc = list() 146 | conf = list() 147 | 148 | # apply vgg up to conv4_3 relu 149 | for k in range(23): 150 | x = self.vgg[k](x) 151 | 152 | s = self.L2Norm(x) 153 | sources.append(s) 154 | 155 | # apply vgg up to fc7 156 | for k in range(23, len(self.vgg)): 157 | x = self.vgg[k](x) 158 | sources.append(x) 159 | 160 | # apply extra layers and cache source layer outputs 161 | for k, v in enumerate(self.extras): 162 | x = functional.relu(v(x), inplace=True) 163 | if k % 2 == 1: 164 | sources.append(x) 165 | 166 | # apply multibox head to source layers 167 | for (x, l, c) in zip(sources, self.loc, self.conf): 168 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 169 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 170 | 171 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 172 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 173 | if self.phase == "test": 174 | output = self.detect( 175 | loc.view(loc.size(0), -1, 4).to(self.device), # loc preds 176 | self.softmax(conf.view(conf.size(0), -1, 177 | self.num_classes)).to(self.device), # conf preds 178 | self.priors.type(type(x.data)).to(self.device) # default boxes 179 | ) 180 | else: 181 | output = ( 182 | loc.view(loc.size(0), -1, 4), 183 | conf.view(conf.size(0), -1, self.num_classes), 184 | self.priors 185 | ) 186 | return output 187 | 188 | def load_weights(self, base_file): 189 | _, ext = os.path.splitext(base_file) 190 | if ext == '.pkl' or '.pth': 191 | print('Loading weights into state dict...') 192 | self.load_state_dict(torch.load(base_file, 193 | map_location=lambda storage, loc: storage)) 194 | print('Finished!') 195 | else: 196 | print('Sorry only .pth and .pkl files supported.') 197 | 198 | 199 | # This function is derived from torchvision VGG make_layers() 200 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 201 | def vgg(cfg, i, batch_norm=False): 202 | layers = [] 203 | in_channels = i 204 | for v in cfg: 205 | if v == 'M': 206 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 207 | elif v == 'C': 208 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 209 | else: 210 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 211 | if batch_norm: 212 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 213 | else: 214 | layers += [conv2d, nn.ReLU(inplace=True)] 215 | in_channels = v 216 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 217 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 218 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 219 | layers += [pool5, conv6, 220 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 221 | return layers 222 | 223 | 224 | def add_extras(cfg, i): 225 | # Extra layers added to VGG for feature scaling 226 | layers = [] 227 | in_channels = i 228 | flag = False 229 | for k, v in enumerate(cfg): 230 | if in_channels != 'S': 231 | if v == 'S': 232 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 233 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 234 | else: 235 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 236 | flag = not flag 237 | in_channels = v 238 | return layers 239 | 240 | 241 | def multi_box(vgg_result, extra_layers, cfg, num_classes): 242 | loc_layers = [] 243 | 
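# Note: for each selected source feature map the multibox head adds one conv
# predicting 4 box offsets per prior (loc) and one conv predicting num_classes
# scores per prior (conf); cfg is the prior count per location, e.g.
# [4, 6, 6, 6, 4, 4] for SSD300.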
conf_layers = [] 244 | vgg_source = [21, -2] 245 | for k, v in enumerate(vgg_source): 246 | loc_layers += [nn.Conv2d(vgg_result[v].out_channels, 247 | cfg[k] * 4, kernel_size=3, padding=1)] 248 | conf_layers += [nn.Conv2d(vgg_result[v].out_channels, 249 | cfg[k] * num_classes, kernel_size=3, padding=1)] 250 | for k, v in enumerate(extra_layers[1::2], 2): 251 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 252 | * 4, kernel_size=3, padding=1)] 253 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 254 | * num_classes, kernel_size=3, padding=1)] 255 | return vgg_result, extra_layers, (loc_layers, conf_layers) 256 | 257 | 258 | class L2Norm(nn.Module): 259 | def __init__(self, n_channels, scale): 260 | super(L2Norm, self).__init__() 261 | self.n_channels = n_channels 262 | self.gamma = scale or None 263 | self.eps = 1e-10 264 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 265 | self.reset_parameters() 266 | 267 | def reset_parameters(self): 268 | init.constant(self.weight, self.gamma) 269 | 270 | def forward(self, x): 271 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 272 | # x /= norm 273 | x = torch.div(x, norm) 274 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 275 | return out 276 | 277 | 278 | class PriorBox(object): 279 | """Compute priorbox coordinates in center-offset form for each source 280 | feature map. 281 | """ 282 | 283 | def __init__(self, cfg): 284 | super(PriorBox, self).__init__() 285 | self.image_size = cfg['min_dim'] 286 | # number of priors for feature map location (either 4 or 6) 287 | self.num_priors = len(cfg['aspect_ratios']) 288 | self.variance = cfg['variance'] or [0.1] 289 | self.feature_maps = cfg['feature_maps'] 290 | self.min_sizes = cfg['min_sizes'] 291 | self.max_sizes = cfg['max_sizes'] 292 | self.steps = cfg['steps'] 293 | self.aspect_ratios = cfg['aspect_ratios'] 294 | self.clip = cfg['clip'] 295 | self.version = cfg['name'] 296 | for v in self.variance: 297 | if v <= 0: 298 | raise ValueError('Variances must be greater than 0') 299 | 300 | def forward(self): 301 | mean = [] 302 | for k, f in enumerate(self.feature_maps): 303 | for i, j in product(range(f), repeat=2): 304 | f_k = self.image_size / self.steps[k] 305 | # unit center x,y 306 | cx = (j + 0.5) / f_k 307 | cy = (i + 0.5) / f_k 308 | 309 | # aspect_ratio: 1 310 | # rel size: min_size 311 | s_k = self.min_sizes[k] / self.image_size 312 | mean += [cx, cy, s_k, s_k] 313 | 314 | # aspect_ratio: 1 315 | # rel size: sqrt(s_k * s_(k+1)) 316 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 317 | mean += [cx, cy, s_k_prime, s_k_prime] 318 | 319 | # rest of aspect ratios 320 | for ar in self.aspect_ratios[k]: 321 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 322 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 323 | # back to torch land 324 | output = torch.Tensor(mean).view(-1, 4) 325 | if self.clip: 326 | output.clamp_(max=1, min=0) 327 | return output 328 | 329 | 330 | class Detect(Function): 331 | """At test time, Detect is the final layer of SSD. Decode location preds, 332 | apply non-maximum suppression to location predictions based on conf 333 | scores and threshold to a top_k number of output predictions for both 334 | confidence score and locations. 
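    As an illustrative sketch of the shapes involved: for one SSD300 image this
    receives loc preds of shape (1, 8732, 4), softmaxed class scores of shape
    (1, 8732, 21) and priors of shape (8732, 4), and returns a tensor of shape
    (1, num_classes, top_k, 5) whose last dimension is
    (score, xmin, ymin, xmax, ymax) in relative image coordinates
    (8732 being the usual SSD300 prior count).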
335 | """ 336 | 337 | @staticmethod 338 | def backward(ctx, *grad_outputs): 339 | pass 340 | 341 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 342 | self.num_classes = num_classes 343 | self.background_label = bkg_label 344 | self.top_k = top_k 345 | # Parameters used in nms. 346 | self.nms_thresh = nms_thresh 347 | if nms_thresh <= 0: 348 | raise ValueError('nms_threshold must be non negative.') 349 | self.conf_thresh = conf_thresh 350 | self.variance = VOC['variance'] 351 | 352 | def forward(self, loc_data, conf_data, prior_data): 353 | """ 354 | Args: 355 | loc_data: (tensor) Loc preds from loc layers 356 | Shape: [batch,num_priors*4] 357 | conf_data: (tensor) Shape: Conf preds from conf layers 358 | Shape: [batch*num_priors,num_classes] 359 | prior_data: (tensor) Prior boxes and variances from priorbox layers 360 | Shape: [1,num_priors,4] 361 | """ 362 | num = loc_data.size(0) # batch size 363 | num_priors = prior_data.size(0) 364 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 365 | conf_preds = conf_data.view(num, num_priors, 366 | self.num_classes).transpose(2, 1) 367 | 368 | # Decode predictions into bboxes. 369 | for i in range(num): 370 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 371 | # For each class, perform nms 372 | conf_scores = conf_preds[i].clone() 373 | 374 | for cl in range(1, self.num_classes): 375 | c_mask = conf_scores[cl].gt(self.conf_thresh) 376 | scores = conf_scores[cl][c_mask] 377 | # if scores.dim() == 0: 378 | if scores.size(0) == 0: 379 | continue 380 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 381 | boxes = decoded_boxes[l_mask].view(-1, 4) 382 | # idx of highest scoring and non-overlapping boxes per class 383 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 384 | output[i, cl, :count] = \ 385 | torch.cat((scores[ids[:count]].unsqueeze(1), 386 | boxes[ids[:count]]), 1) 387 | flt = output.contiguous().view(num, -1, 5) 388 | _, idx = flt[:, :, 0].sort(1, descending=True) 389 | _, rank = idx.sort(1) 390 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 391 | return output 392 | 393 | 394 | # Adapted from https://github.com/Hakuyume/chainer-ssd 395 | def decode(loc, priors, variances): 396 | """Decode locations from predictions using priors to undo 397 | the encoding we did for offset regression at train time. 398 | Args: 399 | loc (tensor): location predictions for loc layers, 400 | Shape: [num_priors,4] 401 | priors (tensor): Prior boxes in center-offset form. 402 | Shape: [num_priors,4]. 403 | variances: (list[float]) Variances of priorboxes 404 | Return: 405 | decoded bounding box predictions 406 | """ 407 | 408 | boxes = torch.cat(( 409 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 410 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 411 | boxes[:, :2] -= boxes[:, 2:] / 2 412 | boxes[:, 2:] += boxes[:, :2] 413 | return boxes 414 | 415 | 416 | # Original author: Francisco Massa: 417 | # https://github.com/fmassa/object-detection.torch 418 | # Ported to PyTorch by Max deGroot (02/01/2017) 419 | def nms(boxes, scores, overlap=0.5, top_k=200): 420 | """Apply non-maximum suppression at test time to avoid detecting too many 421 | overlapping bounding boxes for a given object. 422 | Args: 423 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 424 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 425 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 
426 | top_k: (int) The Maximum number of box preds to consider. 427 | Return: 428 | The indices of the kept boxes with respect to num_priors. 429 | """ 430 | 431 | keep = scores.new(scores.size(0)).zero_().long() 432 | if boxes.numel() == 0: 433 | return keep 434 | x1 = boxes[:, 0] 435 | y1 = boxes[:, 1] 436 | x2 = boxes[:, 2] 437 | y2 = boxes[:, 3] 438 | area = torch.mul(x2 - x1, y2 - y1) 439 | _, idx = scores.sort(0) # sort in ascending order 440 | # I = I[v >= 0.01] 441 | idx = idx[-top_k:] # indices of the top-k largest vals 442 | xx1 = boxes.new() 443 | yy1 = boxes.new() 444 | xx2 = boxes.new() 445 | yy2 = boxes.new() 446 | w = boxes.new() 447 | h = boxes.new() 448 | 449 | # keep = torch.Tensor() 450 | count = 0 451 | while idx.numel() > 0: 452 | i = idx[-1] # index of current largest val 453 | # keep.append(i) 454 | keep[count] = i 455 | count += 1 456 | if idx.size(0) == 1: 457 | break 458 | idx = idx[:-1] # remove kept element from view 459 | # load bboxes of next highest vals 460 | torch.index_select(x1, 0, idx, out=xx1) 461 | torch.index_select(y1, 0, idx, out=yy1) 462 | torch.index_select(x2, 0, idx, out=xx2) 463 | torch.index_select(y2, 0, idx, out=yy2) 464 | # store element-wise max with next highest score 465 | xx1 = torch.clamp(xx1, min=x1[i]) 466 | yy1 = torch.clamp(yy1, min=y1[i]) 467 | xx2 = torch.clamp(xx2, max=x2[i]) 468 | yy2 = torch.clamp(yy2, max=y2[i]) 469 | w.resize_as_(xx2) 470 | h.resize_as_(yy2) 471 | w = xx2 - xx1 472 | h = yy2 - yy1 473 | # check sizes of xx1 and xx2.. after each iteration 474 | w = torch.clamp(w, min=0.0) 475 | h = torch.clamp(h, min=0.0) 476 | inter = w * h 477 | # iou = i / (area(a) + area(b) - i) 478 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 479 | union = (rem_areas - inter) + area[i] 480 | iou = inter / union # store result in iou 481 | # keep only elements with an iou <= overlap 482 | idx = idx[iou.le(overlap)] 483 | return keep, count 484 | 485 | 486 | class ObjectDetector(Pretrained): 487 | 488 | def __init__(self): 489 | super(ObjectDetector, self).__init__() 490 | self.model = None 491 | self.device = get_device() 492 | # load net 493 | num_classes = len(VOC_CLASSES) + 1 # +1 for background 494 | self.model = self._build_ssd('test', 300, num_classes) # initialize SSD 495 | if self.device.startswith("cuda"): 496 | self.model.load_state_dict(torch.load(self.local_paths[0])) 497 | else: 498 | self.model.load_state_dict(torch.load(self.local_paths[0], map_location=lambda storage, loc: storage)) 499 | self.model.eval() 500 | print('Finished loading model!') 501 | 502 | self.model = self.model.to(self.device) 503 | 504 | @property 505 | def _google_drive_files(self): 506 | return Constant.OBJECT_DETECTOR_MODELS 507 | 508 | def _build_ssd(self, phase, size=300, num_classes=21): 509 | if phase != "test" and phase != "train": 510 | print("ERROR: Phase: " + phase + " not recognized") 511 | return 512 | if size != 300: 513 | print("ERROR: You specified size " + repr(size) + ". 
However, " + 514 | "currently only SSD300 (size=300) is supported!") 515 | return 516 | base = { 517 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 518 | 512, 512, 512], 519 | '512': [], 520 | } 521 | extras = { 522 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 523 | '512': [], 524 | } 525 | mbox = { 526 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 527 | '512': [], 528 | } 529 | 530 | base_, extras_, head_ = multi_box(vgg(base[str(size)], 3), 531 | add_extras(extras[str(size)], 1024), 532 | mbox[str(size)], num_classes) 533 | return SSD(phase, size, base_, extras_, head_, num_classes, self.device) 534 | 535 | def predict(self, input_data, output_file_path=None): 536 | """ 537 | 538 | Returns: 539 | List of dictionaries. Each dictionary is like 540 | {"left": int, "top": int, "width": int, "height": int: "category": str, "confidence": float} 541 | """ 542 | from matplotlib.ticker import NullLocator 543 | 544 | dataset_mean = (104, 117, 123) 545 | 546 | image = cv2.imread(input_data, cv2.IMREAD_COLOR) 547 | height, width, _ = image.shape 548 | rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 549 | x = base_transform(rgb_image, 300, dataset_mean) 550 | x = x.astype(np.float32) 551 | x = torch.from_numpy(x).permute(2, 0, 1) 552 | xx = Variable(x.unsqueeze(0)) # wrap tensor in Variable 553 | # if self.device.startswith("cuda"): 554 | xx = xx.to(self.device) 555 | y = self.model(xx) 556 | 557 | # (batch, num_classes, top_k, 5), 5 means (confidence, ) 558 | detections = y.data 559 | results = [] 560 | # scale each detection back up to the image 561 | scale = torch.Tensor(rgb_image.shape[1::-1]).repeat(2) 562 | for i in range(detections.size(1)): 563 | j = 0 564 | while detections[0, i, j, 0] >= 0.6: 565 | score = detections[0, i, j, 0].item() 566 | label_name = VOC_CLASSES[i - 1] 567 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 568 | # result = ((pt[0], pt[1]), (pt[2] - pt[0] + 1, pt[3] - pt[1] + 1), label_name, score) 569 | result = { 570 | "left": max(int(np.round(pt[0])), 0), 571 | "top": max(int(np.round(pt[1])), 0), 572 | "width": min(int(np.round(pt[2] - pt[0] + 1)), width), 573 | "height": min(int(np.round(pt[3] - pt[1] + 1)), height), 574 | "category": label_name, 575 | "confidence": score 576 | } 577 | results.append(result) 578 | j += 1 579 | 580 | if output_file_path is not None: 581 | # plt.figure(figsize=(10,10)) 582 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 583 | plt.imshow(rgb_image) # plot the image for matplotlib 584 | current_axis = plt.gca() 585 | current_axis.set_axis_off() 586 | current_axis.xaxis.set_major_locator(NullLocator()) 587 | current_axis.yaxis.set_major_locator(NullLocator()) 588 | 589 | # scale each detection back up to the image 590 | for i in range(detections.size(1)): 591 | j = 0 592 | while detections[0, i, j, 0] >= 0.6: 593 | score = detections[0, i, j, 0] 594 | label_name = VOC_CLASSES[i - 1] 595 | display_txt = '%s: %.2f' % (label_name, score) 596 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 597 | coords = (pt[0], pt[1]), pt[2] - pt[0] + 1, pt[3] - pt[1] + 1 598 | color = colors[i] 599 | current_axis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) 600 | current_axis.text(pt[0], pt[1], display_txt, bbox={'facecolor': color, 'alpha': 0.5}) 601 | j += 1 602 | plt.axis('off') 603 | plt.tight_layout() 604 | save_name = input_data.split('/')[-1] 605 | save_name = save_name.split('.') 606 | save_name = '.'.join(save_name[:-1]) + 
"_prediction." + save_name[-1] 607 | plt.savefig(os.path.join(output_file_path, save_name), bbox_inches='tight', pad_inches=0) 608 | plt.clf() 609 | 610 | return results 611 | 612 | 613 | def base_transform(image, size, mean): 614 | x = cv2.resize(image, (size, size)).astype(np.float32) 615 | x -= mean 616 | x = x.astype(np.float32) 617 | return x 618 | -------------------------------------------------------------------------------- /autokeras_pretrained/text_classifier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Original work Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Modified work Copyright 2019 The AutoKeras team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import numpy as np 19 | import torch 20 | from abc import ABC 21 | 22 | from autokeras_pretrained.constant import Constant 23 | from autokeras_pretrained.base import Pretrained 24 | from autokeras_pretrained.bert.modeling import BertForSequenceClassification 25 | from autokeras_pretrained.bert.utils import convert_examples_to_features 26 | from autokeras_pretrained.bert.tokenization import BertTokenizer 27 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler 28 | 29 | 30 | class TextClassifier(Pretrained, ABC): 31 | """A pre-trained TextClassifier class based on Google AI's BERT model. 32 | 33 | Attributes: 34 | model: Type of BERT model to be used for the classification task. E.g:- Uncased, Cased, etc. 35 | The current pre-trained models are using 'bert-base-uncased'. 36 | tokenizer: Tokenizer used with BERT model. 37 | """ 38 | 39 | def __init__(self, num_classes=None, **kwargs): 40 | super().__init__(**kwargs) 41 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 42 | 43 | model_state_dict = torch.load(self.local_paths[0], map_location=lambda storage, loc: storage) 44 | self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 45 | state_dict=model_state_dict, 46 | num_labels=num_classes) 47 | self.model.to(self.device) 48 | 49 | def y_predict(self, x_predict): 50 | """ Predict the labels for the provided input data. 51 | 52 | Args: 53 | x_predict: ndarray containing the data inputs. 54 | 55 | Returns: 56 | ndarray containing the predicted labels/outputs for x_predict. 
57 | """ 58 | all_input_ids, all_input_mask, all_segment_ids = convert_examples_to_features([x_predict], 59 | self.tokenizer, 60 | max_seq_length=128) 61 | 62 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) 63 | 64 | eval_sampler = SequentialSampler(eval_data) 65 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1) 66 | 67 | self.model.eval() 68 | for input_ids, input_mask, segment_ids in eval_dataloader: 69 | input_ids = input_ids.to(self.device) 70 | input_mask = input_mask.to(self.device) 71 | segment_ids = segment_ids.to(self.device) 72 | 73 | with torch.no_grad(): 74 | logits = self.model(input_ids, segment_ids, input_mask) 75 | 76 | logits = logits.detach().cpu().numpy() 77 | 78 | for logit in logits: 79 | exp = np.exp(logit) 80 | exp = exp / np.sum(exp) 81 | y_pred = exp 82 | 83 | return y_pred 84 | 85 | 86 | class SentimentAnalysis(TextClassifier): 87 | """A SentimentAnalysis class inherited from TextClassifier. 88 | 89 | The model is trained on the IMDb dataset. The link for the dataset is given below. 90 | http://ai.stanford.edu/~amaas/data/sentiment/ 91 | """ 92 | 93 | def __init__(self, **kwargs): 94 | super().__init__(num_classes=2, **kwargs) 95 | 96 | @property 97 | def _google_drive_files(self): 98 | return Constant.SENTIMENT_ANALYSIS_MODELS 99 | 100 | def predict(self, x_predict, **kwargs): 101 | y_pred = self.y_predict(x_predict) 102 | return round(y_pred[1], 2) 103 | 104 | 105 | class TopicClassifier(TextClassifier): 106 | """A pre-trained TopicClassifier class inherited from TextClassifier. 107 | 108 | The model is trained on the AG News dataset. The link for the dataset is given below. 109 | https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html 110 | """ 111 | 112 | def __init__(self, **kwargs): 113 | super().__init__(num_classes=4, **kwargs) 114 | 115 | @property 116 | def _google_drive_files(self): 117 | return Constant.TOPIC_CLASSIFIER_MODELS 118 | 119 | def predict(self, x_predict, **kwargs): 120 | y_pred = self.y_predict(x_predict) 121 | class_id = np.argmax(y_pred) 122 | if class_id == 0: 123 | return "Business" 124 | elif class_id == 1: 125 | return "Sci/Tech" 126 | elif class_id == 2: 127 | return "World" 128 | else: 129 | return "Sports" 130 | -------------------------------------------------------------------------------- /autokeras_pretrained/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import logging 4 | import os 5 | import pickle 6 | import random 7 | import string 8 | import sys 9 | import tempfile 10 | import zipfile 11 | from os import makedirs 12 | from os.path import dirname 13 | from os.path import exists 14 | from sys import stdout 15 | 16 | import imageio 17 | import numpy as np 18 | import requests 19 | import torch 20 | from scipy.ndimage import zoom 21 | 22 | from autokeras_pretrained.constant import Constant 23 | 24 | 25 | class NoImprovementError(Exception): 26 | def __init__(self, message): 27 | self.message = message 28 | 29 | 30 | def ensure_dir(directory): 31 | """Create directory if it does not exist.""" 32 | if not os.path.exists(directory): 33 | os.makedirs(directory) 34 | 35 | 36 | def ensure_file_dir(path): 37 | """Create path if it does not exist.""" 38 | ensure_dir(os.path.dirname(path)) 39 | 40 | 41 | def has_file(path): 42 | """Check if the given path exists.""" 43 | return os.path.exists(path) 44 | 45 | 46 | def pickle_from_file(path): 47 | """Load the pickle file from the provided 
path and returns the object.""" 48 | return pickle.load(open(path, 'rb')) 49 | 50 | 51 | def pickle_to_file(obj, path): 52 | """Save the pickle file to the specified path.""" 53 | pickle.dump(obj, open(path, 'wb')) 54 | 55 | 56 | def temp_path_generator(): 57 | sys_temp = tempfile.gettempdir() 58 | path = os.path.join(sys_temp, 'autokeras') 59 | return path 60 | 61 | 62 | def rand_temp_folder_generator(): 63 | """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" 64 | chars = string.ascii_uppercase + string.digits 65 | size = 6 66 | random_suffix = ''.join(random.choice(chars) for _ in range(size)) 67 | sys_temp = temp_path_generator() 68 | path = sys_temp + '_' + random_suffix 69 | ensure_dir(path) 70 | return path 71 | 72 | 73 | def download_file(file_link, file_path): 74 | """Download the file specified in `file_link` and saves it in `file_path`.""" 75 | if not os.path.exists(file_path): 76 | with open(file_path, "wb") as f: 77 | print("\nDownloading %s" % file_path) 78 | response = requests.get(file_link, stream=True) 79 | total_length = response.headers.get('content-length') 80 | 81 | if total_length is None: # no content length header 82 | f.write(response.content) 83 | else: 84 | dl = 0 85 | total_length = int(total_length) 86 | for data in response.iter_content(chunk_size=4096): 87 | dl += len(data) 88 | f.write(data) 89 | done = int(50 * dl / total_length) 90 | sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) 91 | sys.stdout.flush() 92 | 93 | 94 | def download_file_with_extract(file_link, file_path, extract_path): 95 | """Download the file specified in `file_link`, save to `file_path` and extract to the directory `extract_path`.""" 96 | if not os.path.exists(extract_path): 97 | download_file(file_link, file_path) 98 | zip_ref = zipfile.ZipFile(file_path, 'r') 99 | print("extracting downloaded file...") 100 | zip_ref.extractall(extract_path) 101 | os.remove(file_path) 102 | print("extracted and removed downloaded zip file") 103 | print("file already extracted in the path %s" % extract_path) 104 | 105 | 106 | def assert_search_space(search_space): 107 | grid = search_space 108 | value_list = [] 109 | if Constant.LENGTH_DIM not in list(grid.keys()): 110 | print('No length dimension found in search Space. Using default values') 111 | grid[Constant.LENGTH_DIM] = Constant.DEFAULT_LENGTH_SEARCH 112 | elif not isinstance(grid[Constant.LENGTH_DIM][0], int): 113 | print('Converting String to integers. Next time please make sure to enter integer values for Length Dimension') 114 | grid[Constant.LENGTH_DIM] = list(map(int, grid[Constant.LENGTH_DIM])) 115 | 116 | if Constant.WIDTH_DIM not in list(grid.keys()): 117 | print('No width dimension found in search Space. Using default values') 118 | grid[Constant.WIDTH_DIM] = Constant.DEFAULT_WIDTH_SEARCH 119 | elif not isinstance(grid[Constant.WIDTH_DIM][0], int): 120 | print('Converting String to integers. 
Next time please make sure to enter integer values for Width Dimension') 121 | grid[Constant.WIDTH_DIM] = list(map(int, grid[Constant.WIDTH_DIM])) 122 | 123 | grid_key_list = list(grid.keys()) 124 | grid_key_list.sort() 125 | for key in grid_key_list: 126 | value_list.append(grid[key]) 127 | 128 | dimension = list(itertools.product(*value_list)) 129 | # print(dimension) 130 | return grid, dimension 131 | 132 | 133 | def verbose_print(new_father_id, new_graph, new_model_id): 134 | """Print information about the operation performed on father model to obtain current model and father's id.""" 135 | cell_size = [24, 49] 136 | logging.info('New Model Id - ' + str(new_model_id)) 137 | header = ['Father Model ID', 'Added Operation'] 138 | line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(header)) 139 | logging.info('\n' + '+' + '-' * len(line) + '+') 140 | logging.info('|' + line + '|') 141 | logging.info('+' + '-' * len(line) + '+') 142 | for i in range(len(new_graph.operation_history)): 143 | if i == len(new_graph.operation_history) // 2: 144 | r = [str(new_father_id), ' '.join(str(item) for item in new_graph.operation_history[i])] 145 | else: 146 | r = [' ', ' '.join(str(item) for item in new_graph.operation_history[i])] 147 | line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(r)) 148 | logging.info('|' + line + '|') 149 | logging.info('+' + '-' * len(line) + '+') 150 | 151 | 152 | def validate_xy(x_train, y_train): 153 | """Validate `x_train`'s type and the shape of `x_train`, `y_train`.""" 154 | try: 155 | x_train = x_train.astype('float64') 156 | except ValueError: 157 | raise ValueError('x_train should only contain numerical data.') 158 | 159 | if len(x_train.shape) < 2: 160 | raise ValueError('x_train should at least has 2 dimensions.') 161 | 162 | if x_train.shape[0] != y_train.shape[0]: 163 | raise ValueError('x_train and y_train should have the same number of instances.') 164 | 165 | 166 | def read_csv_file(csv_file_path): 167 | """Read the csv file and returns two separate list containing file names and their labels. 168 | 169 | Args: 170 | csv_file_path: Path to the CSV file. 171 | 172 | Returns: 173 | file_names: List containing files names. 174 | file_label: List containing their respective labels. 175 | """ 176 | file_names = [] 177 | file_labels = [] 178 | with open(csv_file_path, 'r') as files_path: 179 | path_list = csv.DictReader(files_path) 180 | fieldnames = path_list.fieldnames 181 | for path in path_list: 182 | file_names.append(path[fieldnames[0]]) 183 | file_labels.append(path[fieldnames[1]]) 184 | return file_names, file_labels 185 | 186 | 187 | def read_tsv_file(input_file, quotechar=None): 188 | """Reads a tab separated value (tsv) file and return two lists containing file names and labels.""" 189 | with open(input_file, "r", encoding='utf-8') as f: 190 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 191 | x, y = [], [] 192 | for line in reader: 193 | x.append(line[0]) 194 | y.append(int(line[1])) 195 | return x, y 196 | 197 | 198 | def read_image(img_path): 199 | """Read the image contained in the provided path `image_path`.""" 200 | img = imageio.imread(uri=img_path) 201 | return img 202 | 203 | 204 | def compute_image_resize_params(data): 205 | """Compute median dimension of all images in data. 206 | 207 | It used to resize the images later. Number of channels do not change from the original data. 208 | 209 | Args: 210 | data: 1-D, 2-D or 3-D images. The Images are expected to have channel last configuration. 
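            As a worked example, three images of shapes (100, 200, 3), (120, 180, 3)
            and (300, 400, 3) have a median shape of (120, 200, 3); because 120 * 200
            exceeds Constant.MAX_IMAGE_SIZE (128 * 128 = 16384), both spatial
            dimensions are scaled by sqrt(16384 / 24000), giving roughly (99, 165, 3).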
211 | 212 | Returns: 213 | median shape. 214 | """ 215 | if data is None or len(data.shape) == 0: 216 | return [] 217 | 218 | if len(data.shape) == len(data[0].shape) + 1 and np.prod(data[0].shape[:-1]) <= Constant.MAX_IMAGE_SIZE: 219 | return data[0].shape 220 | 221 | data_shapes = [] 222 | for x in data: 223 | data_shapes.append(x.shape) 224 | 225 | median_shape = np.median(np.array(data_shapes), axis=0) 226 | median_size = np.prod(median_shape[:-1]) 227 | 228 | if median_size > Constant.MAX_IMAGE_SIZE: 229 | reduction_factor = np.power(Constant.MAX_IMAGE_SIZE / median_size, 1 / (len(median_shape) - 1)) 230 | median_shape[:-1] = median_shape[:-1] * reduction_factor 231 | 232 | return median_shape.astype(int) 233 | 234 | 235 | def resize_image_data(data, resize_shape): 236 | """Resize images to given dimension. 237 | 238 | Args: 239 | data: 1-D, 2-D or 3-D images. The Images are expected to have channel last configuration. 240 | resize_shape: Image resize dimension. 241 | 242 | Returns: 243 | data: Reshaped data. 244 | """ 245 | if data is None or len(resize_shape) == 0: 246 | return data 247 | 248 | if len(data.shape) > 1 and np.array_equal(data[0].shape, resize_shape): 249 | return data 250 | 251 | output_data = [] 252 | for im in data: 253 | output_data.append(zoom(input=im, zoom=np.divide(resize_shape, im.shape))) 254 | 255 | return np.array(output_data) 256 | 257 | 258 | def get_system(): 259 | """Get the current system environment. If the current system is not supported, raise an exception. 260 | 261 | Returns: 262 | A string to represent the current OS name. 263 | "posix" stands for Linux, Mac or Solaris architecture. 264 | "nt" stands for Windows system. 265 | """ 266 | if 'google.colab' in sys.modules: 267 | return Constant.SYS_GOOGLE_COLAB 268 | if os.name == 'posix': 269 | return Constant.SYS_LINUX 270 | if os.name == 'nt': 271 | return Constant.SYS_WINDOWS 272 | 273 | raise EnvironmentError('Unsupported environment') 274 | 275 | 276 | def download_file_from_google_drive(file_id, dest_path, verbose=False): 277 | """ 278 | Downloads a shared file from google drive into a given folder. 279 | Optionally unzips it. 280 | 281 | Refact from: 282 | https://github.com/ndrplz/google-drive-downloader/blob/master/google_drive_downloader/google_drive_downloader.py 283 | 284 | Args: 285 | verbose: 286 | file_id: str 287 | the file identifier. 288 | You can obtain it from the sharable link. 289 | dest_path: str 290 | the destination where to save the downloaded file. 291 | Must be a path (for example: './downloaded_file.txt') 292 | """ 293 | 294 | destination_directory = dirname(dest_path) 295 | if len(destination_directory) > 0 and not exists(destination_directory): 296 | makedirs(destination_directory) 297 | 298 | session = requests.Session() 299 | 300 | if verbose: 301 | print('Downloading file with Google ID {} into {}... 
'.format(file_id, dest_path), end='') 302 | stdout.flush() 303 | 304 | response = session.get(Constant.DOWNLOAD_URL, params={'id': file_id}, stream=True) 305 | 306 | token = get_confirm_token(response) 307 | if token: 308 | params = {'id': file_id, 'confirm': token} 309 | response = session.get(Constant.DOWNLOAD_URL, params=params, stream=True) 310 | 311 | save_response_content(response, dest_path) 312 | if verbose: 313 | print('Download completed.') 314 | 315 | 316 | def get_confirm_token(response): 317 | for key, value in response.cookies.items(): 318 | if key.startswith('download_warning'): 319 | return value 320 | return None 321 | 322 | 323 | def save_response_content(response, destination): 324 | with open(destination, "wb") as f: 325 | for chunk in response.iter_content(Constant.CHUNK_SIZE): 326 | if chunk: # filter out keep-alive new chunks 327 | f.write(chunk) 328 | 329 | 330 | def get_device(): 331 | """ If CUDA is available, use CUDA device, else use CPU device. 332 | Returns: string device name 333 | """ 334 | return 'cuda' if torch.cuda.is_available() else 'cpu' 335 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/deepvoice3_pytorch/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/builder.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.model import MultiSpeakerTTSModel, AttentionSeq2Seq 2 | 3 | 4 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, n_speakers=1, speaker_embed_dim=16, 5 | padding_idx=0, dropout=(1 - 0.95), kernel_size=5, encoder_channels=128, decoder_channels=256, 6 | converter_channels=256, query_position_rate=1.0, key_position_rate=1.29, use_memory_mask=False, 7 | trainable_positional_encodings=False, force_monotonic_attention=True, 8 | use_decoder_state_for_postnet_input=True, max_positions=512, embedding_weight_std=0.1, 9 | freeze_embedding=False, window_ahead=3, window_backward=1): 10 | """Build deepvoice3 11 | """ 12 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 13 | 14 | # Seq2seq 15 | h = encoder_channels # hidden dim (channels) 16 | k = kernel_size # kernel size 17 | encoder = Encoder(n_vocab, embed_dim, n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 18 | padding_idx=padding_idx, embedding_weight_std=embedding_weight_std, 19 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 20 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 21 | (h, k, 1), (h, k, 3)], dropout=dropout) 22 | 23 | h = decoder_channels 24 | decoder = Decoder(embed_dim, n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, in_dim=mel_dim, r=r, 25 | max_positions=max_positions, preattention=[(h, k, 
1), (h, k, 3)], 26 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 27 | (h, k, 1)], attention=[True, False, False, False, True], dropout=dropout, 28 | use_memory_mask=use_memory_mask, force_monotonic_attention=force_monotonic_attention, 29 | query_position_rate=query_position_rate, key_position_rate=key_position_rate, 30 | window_ahead=window_ahead, window_backward=window_backward) 31 | 32 | seq2seq = AttentionSeq2Seq(encoder, decoder) 33 | 34 | # Post net 35 | in_dim = h // r 36 | 37 | h = converter_channels 38 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, in_dim=in_dim, out_dim=linear_dim, 39 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], dropout=dropout) 40 | 41 | # Seq2seq + post net 42 | model = MultiSpeakerTTSModel(seq2seq, converter, mel_dim=mel_dim, linear_dim=linear_dim, n_speakers=n_speakers, 43 | speaker_embed_dim=speaker_embed_dim, 44 | trainable_positional_encodings=trainable_positional_encodings, 45 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 46 | freeze_embedding=freeze_embedding) 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class Conv1d(nn.Conv1d): 7 | """Extended nn.Conv1d for incremental dilated convolutions 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.clear_buffer() 13 | self._linearized_weight = None 14 | 15 | def incremental_forward(self, input_data): 16 | 17 | # reshape weight 18 | weight = self._get_linearized_weight() 19 | kw = self.kernel_size[0] 20 | dilation = self.dilation[0] 21 | 22 | bsz = input_data.size(0) # conv_input: bsz x len x dim 23 | if kw > 1: 24 | input_data = input_data.data 25 | if self.input_buffer is None: 26 | self.input_buffer = input_data.new(bsz, kw + (kw - 1) * (dilation - 1), input_data.size(2)) 27 | self.input_buffer.zero_() 28 | else: 29 | # shift buffer 30 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 31 | # append next input 32 | self.input_buffer[:, -1, :] = input_data[:, -1, :] 33 | input_data = self.input_buffer 34 | if dilation > 1: 35 | input_data = input_data[:, 0::dilation, :].contiguous() 36 | input_data = F.linear(input_data.view(bsz, -1), weight, self.bias) 37 | return input_data.view(bsz, 1, -1) 38 | 39 | def clear_buffer(self): 40 | self.input_buffer = None 41 | 42 | def _get_linearized_weight(self): 43 | if self._linearized_weight is None: 44 | kw = self.kernel_size[0] 45 | # nn.Conv1d 46 | weight = self.weight.transpose(1, 2).contiguous() 47 | 48 | if weight.size() != (self.out_channels, kw, self.in_channels): 49 | raise AssertionError() 50 | self._linearized_weight = weight.view(self.out_channels, -1) 51 | return self._linearized_weight 52 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import math 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .modules import conv1d, conv_transpose1d, embedding, linear 9 | from .modules import SinusoidalEncoding, Conv1dGLU 10 | 11 | 12 | 
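# For illustration: the `convolutions` and `preattention` arguments used by the
# modules below are lists of (out_channels, kernel_size, dilation) tuples, as
# assembled in builder.deepvoice3(). A minimal standalone Encoder, using
# placeholder sizes rather than the real hyperparameters, might look like:
#
#     encoder = Encoder(n_vocab=149, embed_dim=128, n_speakers=1,
#                       speaker_embed_dim=16,
#                       convolutions=[(256, 3, 1), (256, 3, 3)])
#     token_ids = torch.zeros(2, 20).long()   # dummy batch of token ids
#     keys, values = encoder(token_ids)       # each of shape (2, 20, 128)
#
# The numbers above are only examples; the real vocabulary size comes from
# frontend.n_vocab and the real convolution stacks from builder.deepvoice3().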
class Encoder(nn.Module): 13 | def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim, padding_idx=None, embedding_weight_std=0.1, 14 | convolutions=((64, 5, .1),) * 7, dropout=0.1, apply_grad_scaling=False): 15 | super(Encoder, self).__init__() 16 | self.dropout = dropout 17 | self.num_attention_layers = None 18 | self.apply_grad_scaling = apply_grad_scaling 19 | 20 | # Text input embeddings 21 | self.embed_tokens = embedding( 22 | n_vocab, embed_dim, padding_idx, embedding_weight_std) 23 | 24 | self.n_speakers = n_speakers 25 | 26 | # Non causual convolution blocks 27 | in_channels = embed_dim 28 | self.convolutions = nn.ModuleList() 29 | std_mul = 1.0 30 | for (out_channels, kernel_size, dilation) in convolutions: 31 | if in_channels != out_channels: 32 | # Conv1d + ReLU 33 | self.convolutions.append( 34 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 35 | dilation=1, std_mul=std_mul)) 36 | self.convolutions.append(nn.ReLU(inplace=True)) 37 | in_channels = out_channels 38 | std_mul = 2.0 39 | self.convolutions.append( 40 | Conv1dGLU(n_speakers, speaker_embed_dim, 41 | in_channels, out_channels, kernel_size, causal=False, 42 | dilation=dilation, dropout=dropout, std_mul=std_mul, 43 | residual=True)) 44 | in_channels = out_channels 45 | std_mul = 4.0 46 | # Last 1x1 convolution 47 | self.convolutions.append(conv1d(in_channels, embed_dim, kernel_size=1, 48 | padding=0, dilation=1, std_mul=std_mul, 49 | dropout=dropout)) 50 | 51 | def forward(self, text_sequences, text_positions=None, lengths=None, 52 | speaker_embed=None): 53 | if self.n_speakers != 1 and speaker_embed is None: 54 | raise AssertionError("Expected \033[1;31m\033[m to be 1 or \033[1:31m[m to be not None, but was not") 55 | # embed text_sequences 56 | x = self.embed_tokens(text_sequences.long()) 57 | x = F.dropout(x, p=self.dropout, training=self.training) 58 | 59 | # expand speaker embedding for all time steps 60 | speaker_embed_btc = None 61 | 62 | input_embedding = x 63 | 64 | # B x T x C -> B x C x T 65 | x = x.transpose(1, 2) 66 | 67 | # 1D conv blocks 68 | for f in self.convolutions: 69 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 70 | 71 | # Back to B x T x C 72 | keys = x.transpose(1, 2) 73 | 74 | # scale gradients (this only affects backward, not forward) 75 | # add output to input embedding for attention 76 | values = (keys + input_embedding) * math.sqrt(0.5) 77 | 78 | return keys, values 79 | 80 | 81 | class AttentionLayer(nn.Module): 82 | def __init__(self, conv_channels, embed_dim, dropout=0.1, window_ahead=3, window_backward=1): 83 | super(AttentionLayer, self).__init__() 84 | self.query_projection = linear(conv_channels, embed_dim) 85 | self.key_projection = None 86 | self.value_projection = None 87 | self.out_projection = linear(embed_dim, conv_channels) 88 | self.dropout = dropout 89 | self.window_ahead = window_ahead 90 | self.window_backward = window_backward 91 | 92 | def forward(self, query, encoder_out, mask=None, last_attended=None): 93 | keys, values = encoder_out 94 | residual = query 95 | 96 | # attention 97 | x = self.query_projection(query) 98 | x = torch.bmm(x, keys) 99 | 100 | mask_value = -float("inf") 101 | 102 | if last_attended is not None: 103 | backward = last_attended - self.window_backward 104 | if backward > 0: 105 | x[:, :, :backward] = mask_value 106 | ahead = last_attended + self.window_ahead 107 | if ahead < x.size(-1): 108 | x[:, :, ahead:] = mask_value 109 | 110 | # softmax over last dim 111 | # (B, tgt_len, src_len) 112 | sz = 
x.size() 113 | x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1) 114 | x = x.view(sz) 115 | attn_scores = x 116 | 117 | x = F.dropout(x, p=self.dropout, training=self.training) 118 | 119 | x = torch.bmm(x, values) 120 | 121 | # scale attention output 122 | s = values.size(1) 123 | x = x * (s * math.sqrt(1.0 / s)) 124 | 125 | # project back 126 | x = self.out_projection(x) 127 | x = (x + residual) * math.sqrt(0.5) 128 | return x, attn_scores 129 | 130 | 131 | class Decoder(nn.Module): 132 | def __init__(self, embed_dim, n_speakers, speaker_embed_dim, in_dim=80, r=5, max_positions=512, 133 | preattention=((128, 5, 1),) * 4, convolutions=((128, 5, 1),) * 4, attention=True, dropout=0.1, 134 | use_memory_mask=False, force_monotonic_attention=False, query_position_rate=1.0, 135 | key_position_rate=1.29, window_ahead=3, window_backward=1): 136 | super(Decoder, self).__init__() 137 | self.dropout = dropout 138 | self.in_dim = in_dim 139 | self.r = r 140 | self.query_position_rate = query_position_rate 141 | self.key_position_rate = key_position_rate 142 | 143 | # Position encodings for query (decoder states) and keys (encoder states) 144 | self.embed_query_positions = SinusoidalEncoding( 145 | max_positions, convolutions[0][0]) 146 | self.embed_keys_positions = SinusoidalEncoding( 147 | max_positions, embed_dim) 148 | # Used for compute multiplier for positional encodings 149 | self.speaker_proj1, self.speaker_proj2 = None, None 150 | 151 | # Prenet: causal convolution blocks 152 | self.preattention = nn.ModuleList() 153 | in_channels = in_dim * r 154 | std_mul = 1.0 155 | for out_channels, kernel_size, dilation in preattention: 156 | if in_channels != out_channels: 157 | # Conv1d + ReLU 158 | self.preattention.append( 159 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 160 | dilation=1, std_mul=std_mul)) 161 | self.preattention.append(nn.ReLU(inplace=True)) 162 | in_channels = out_channels 163 | std_mul = 2.0 164 | self.preattention.append( 165 | Conv1dGLU(n_speakers, speaker_embed_dim, 166 | in_channels, out_channels, kernel_size, causal=True, 167 | dilation=dilation, dropout=dropout, std_mul=std_mul, 168 | residual=True)) 169 | in_channels = out_channels 170 | std_mul = 4.0 171 | 172 | # Causal convolution blocks + attention layers 173 | self.convolutions = nn.ModuleList() 174 | self.attention = nn.ModuleList() 175 | 176 | for i, (out_channels, kernel_size, dilation) in enumerate(convolutions): 177 | if in_channels != out_channels: 178 | raise AssertionError("Expected \033[1;31m\033[m to be equal to \033[1:31m[m, but was not") 179 | self.convolutions.append( 180 | Conv1dGLU(n_speakers, speaker_embed_dim, 181 | in_channels, out_channels, kernel_size, causal=True, 182 | dilation=dilation, dropout=dropout, std_mul=std_mul, 183 | residual=False)) 184 | self.attention.append( 185 | AttentionLayer(out_channels, embed_dim, dropout=dropout, window_ahead=window_ahead, 186 | window_backward=window_backward) 187 | if attention[i] else None) 188 | in_channels = out_channels 189 | std_mul = 4.0 190 | # Last 1x1 convolution 191 | self.last_conv = conv1d(in_channels, in_dim * r, kernel_size=1, 192 | padding=0, dilation=1, std_mul=std_mul, 193 | dropout=dropout) 194 | 195 | # Mel-spectrogram (before sigmoid) -> Done binary flag 196 | self.fc = linear(in_dim * r, 1) 197 | 198 | self.max_decoder_steps = 200 199 | self.min_decoder_steps = 10 200 | self.use_memory_mask = use_memory_mask 201 | self.force_monotonic_attention = [force_monotonic_attention] * len(convolutions) 202 | 203 | def 
forward(self, encoder_out, inputs=None,
204 |                 text_positions=None, frame_positions=None,
205 |                 speaker_embed=None, lengths=None):
206 |         if inputs is None:
207 |             if text_positions is None:
208 |                 raise AssertionError("Expected text_positions to be not None, but it was None")
209 |             self.start_fresh_sequence()
210 |             outputs = self.incremental_forward(encoder_out, text_positions)
211 |             return outputs
212 | 
213 |         # Grouping multiple frames if necessary
214 | 
215 |     def incremental_forward(self, encoder_out, text_positions, initial_input=None, test_inputs=None):
216 |         keys, values = encoder_out
217 |         b = keys.size(0)
218 | 
219 |         # position encodings
220 |         w = self.key_position_rate
221 |         text_pos_embed = self.embed_keys_positions(text_positions, w)
222 |         keys = keys + text_pos_embed
223 | 
224 |         # transpose only once to speed up attention layers
225 |         keys = keys.transpose(1, 2).contiguous()
226 | 
227 |         decoder_states = []
228 |         outputs = []
229 |         alignments = []
230 |         dones = []
231 |         # initially set to zeros
232 |         last_attended = [None] * len(self.attention)
233 |         for idx, v in enumerate(self.force_monotonic_attention):
234 |             last_attended[idx] = 0 if v else None
235 | 
236 |         num_attention_layers = sum([layer is not None for layer in self.attention])
237 |         t = 0
238 |         if initial_input is None:
239 |             initial_input = keys.data.new(b, 1, self.in_dim * self.r).zero_()
240 |         current_input = initial_input
241 |         while True:
242 |             # frame pos start with 1.
243 |             frame_pos = keys.data.new(b, 1).fill_(t + 1).long()
244 |             w = self.query_position_rate
245 |             frame_pos_embed = self.embed_query_positions(frame_pos, w)
246 | 
247 |             if t > 0:
248 |                 current_input = outputs[-1]
249 |             output_tensor = current_input
250 |             output_tensor = F.dropout(output_tensor, p=self.dropout, training=self.training)
251 | 
252 |             # Prenet
253 |             for f in self.preattention:
254 |                 if isinstance(f, Conv1dGLU):
255 |                     output_tensor = f.incremental_forward(output_tensor)
256 |                 else:
257 |                     try:
258 |                         output_tensor = f.incremental_forward(output_tensor)
259 |                     except AttributeError:
260 |                         output_tensor = f(output_tensor)
261 | 
262 |             # Causal convolutions + Multi-hop attentions
263 |             ave_alignment = None
264 |             for idx, (f, attention) in enumerate(zip(self.convolutions,
265 |                                                      self.attention)):
266 |                 residual = output_tensor
267 |                 if isinstance(f, Conv1dGLU):
268 |                     output_tensor = f.incremental_forward(output_tensor)
269 | 
270 |                 if attention is not None:
271 | 
272 |                     if isinstance(f, Conv1dGLU) is False:
273 |                         raise AssertionError()
274 |                     output_tensor = output_tensor + frame_pos_embed
275 |                     output_tensor, alignment = attention(output_tensor, (keys, values),
276 |                                                          last_attended=last_attended[idx])
277 |                     if self.force_monotonic_attention[idx]:
278 |                         last_attended[idx] = alignment.max(-1)[1].view(-1).data[0]
279 |                     if ave_alignment is None:
280 |                         ave_alignment = alignment
281 |                     else:
282 |                         ave_alignment = ave_alignment + alignment  # accumulate per-layer alignments; averaged below
283 | 
284 |                 # residual
285 |                 if isinstance(f, Conv1dGLU):
286 |                     output_tensor = (output_tensor + residual) * math.sqrt(0.5)
287 | 
288 |             decoder_state = output_tensor
289 |             output_tensor = self.last_conv.incremental_forward(output_tensor)
290 |             ave_alignment = ave_alignment.div_(num_attention_layers)
291 | 
292 |             # Output & done flag predictions
293 |             output = F.sigmoid(output_tensor)
294 |             done = F.sigmoid(self.fc(output_tensor))
295 | 
296 |             decoder_states += [decoder_state]
297 |             outputs += [output]
298 |             alignments += [ave_alignment]
299 |             dones += [done]
300 | 
301 |             t += 1
302 |             if test_inputs is None:
303 |                 if (done > 
0.5).all() and t > self.min_decoder_steps: 304 | break 305 | 306 | # Remove 1-element time axis 307 | alignments = list(map(lambda x: x.squeeze(1), alignments)) 308 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states)) 309 | outputs = list(map(lambda x: x.squeeze(1), outputs)) 310 | 311 | # Combine outputs for all time steps 312 | alignments = torch.stack(alignments).transpose(0, 1) 313 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous() 314 | outputs = torch.stack(outputs).transpose(0, 1).contiguous() 315 | 316 | return outputs, alignments, dones, decoder_states 317 | 318 | def start_fresh_sequence(self): 319 | _clear_modules(self.preattention) 320 | _clear_modules(self.convolutions) 321 | self.last_conv.clear_buffer() 322 | 323 | 324 | def _clear_modules(modules): 325 | for m in modules: 326 | try: 327 | m.clear_buffer() 328 | except AttributeError: 329 | pass 330 | 331 | 332 | class Converter(nn.Module): 333 | def __init__(self, n_speakers, speaker_embed_dim, in_dim, out_dim, convolutions=((256, 5, 1),) * 4, dropout=0.1): 334 | super(Converter, self).__init__() 335 | self.dropout = dropout 336 | self.in_dim = in_dim 337 | self.out_dim = out_dim 338 | self.n_speakers = n_speakers 339 | 340 | # Non causual convolution blocks 341 | in_channels = convolutions[0][0] 342 | # Idea from nyanko 343 | self.convolutions = nn.ModuleList([ 344 | conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1, 345 | std_mul=1.0), 346 | conv_transpose1d(in_channels, in_channels, kernel_size=2, 347 | padding=0, stride=2, std_mul=1.0), 348 | Conv1dGLU(n_speakers, speaker_embed_dim, 349 | in_channels, in_channels, kernel_size=3, causal=False, 350 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 351 | Conv1dGLU(n_speakers, speaker_embed_dim, 352 | in_channels, in_channels, kernel_size=3, causal=False, 353 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 354 | conv_transpose1d(in_channels, in_channels, kernel_size=2, 355 | padding=0, stride=2, std_mul=4.0), 356 | Conv1dGLU(n_speakers, speaker_embed_dim, 357 | in_channels, in_channels, kernel_size=3, causal=False, 358 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 359 | Conv1dGLU(n_speakers, speaker_embed_dim, 360 | in_channels, in_channels, kernel_size=3, causal=False, 361 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 362 | ]) 363 | 364 | std_mul = 4.0 365 | for (out_channels, kernel_size, dilation) in convolutions: 366 | if in_channels != out_channels: 367 | self.convolutions.append( 368 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 369 | dilation=1, std_mul=std_mul)) 370 | self.convolutions.append(nn.ReLU(inplace=True)) 371 | in_channels = out_channels 372 | std_mul = 2.0 373 | self.convolutions.append( 374 | Conv1dGLU(n_speakers, speaker_embed_dim, 375 | in_channels, out_channels, kernel_size, causal=False, 376 | dilation=dilation, dropout=dropout, std_mul=std_mul, 377 | residual=True)) 378 | in_channels = out_channels 379 | std_mul = 4.0 380 | # Last 1x1 convolution 381 | self.convolutions.append(conv1d(in_channels, out_dim, kernel_size=1, 382 | padding=0, dilation=1, std_mul=std_mul, 383 | dropout=dropout)) 384 | 385 | def forward(self, x, speaker_embed=None): 386 | if self.n_speakers != 1 and speaker_embed == None: 387 | raise AssertionError("Expected \033[1;31m\033[m to be 1 or \033[1:31m[m to be not None, but was not") 388 | speaker_embed_btc = None 389 | # Generic case: B x T x C -> B x C x T 390 | x = x.transpose(1, 2) 391 | for f in 
self.convolutions: 392 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 393 | # Back to B x T x C 394 | x = x.transpose(1, 2) 395 | 396 | return F.sigmoid(x) 397 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/frontend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import nltk 4 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.symbols import symbols 5 | 6 | 7 | N_VOCAB = len(symbols) 8 | n_vocab = N_VOCAB 9 | 10 | try: 11 | _ARPHABET = nltk.corpus.cmudict.dict() 12 | except BaseException: 13 | nltk.download("cmudict") 14 | _ARPHABET = nltk.corpus.cmudict.dict() 15 | 16 | 17 | def _maybe_get_arpabet(word, pro): 18 | try: 19 | phonemes = _ARPHABET[word][0] 20 | phonemes = " ".join(phonemes) 21 | except KeyError: 22 | return word 23 | 24 | return '{%s}' % phonemes if ord(os.urandom(1)) < pro else word 25 | 26 | 27 | def mix_pronunciation(text, pro): 28 | text = ' '.join(_maybe_get_arpabet(word, pro) for word in text.split(' ')) 29 | return text 30 | 31 | 32 | def text_to_sequence(text, p=0.0): 33 | pro = p 34 | if pro >= 0: 35 | text = mix_pronunciation(text, pro) 36 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.text import text_to_sequence 37 | text = text_to_sequence(text, ["english_cleaners"]) 38 | return text 39 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from torch import nn 4 | 5 | 6 | class MultiSpeakerTTSModel(nn.Module): 7 | """Attention seq2seq model + post processing network 8 | """ 9 | 10 | def __init__(self, seq2seq, postnet, mel_dim=80, linear_dim=513, n_speakers=1, speaker_embed_dim=16, 11 | trainable_positional_encodings=False, use_decoder_state_for_postnet_input=False, 12 | freeze_embedding=False): 13 | super(MultiSpeakerTTSModel, self).__init__() 14 | self.seq2seq = seq2seq 15 | self.postnet = postnet # referred as "Converter" in DeepVoice3 16 | self.mel_dim = mel_dim 17 | self.linear_dim = linear_dim 18 | self.trainable_positional_encodings = trainable_positional_encodings 19 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input 20 | self.freeze_embedding = freeze_embedding 21 | 22 | self.n_speakers = n_speakers 23 | self.speaker_embed_dim = speaker_embed_dim 24 | 25 | def make_generation_fast_(self): 26 | 27 | def remove_weight_norm(m): 28 | try: 29 | nn.utils.remove_weight_norm(m) 30 | except ValueError: # this module didn't have weight norm 31 | return 32 | self.apply(remove_weight_norm) 33 | 34 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None, 35 | text_positions=None, frame_positions=None, input_lengths=None): 36 | b = text_sequences.size(0) 37 | 38 | speaker_embed = None 39 | 40 | # Apply seq2seq 41 | # (B, T//r, mel_dim*r) 42 | mel_outputs, alignments, done, decoder_states = self.seq2seq( 43 | text_sequences, mel_targets, speaker_embed, 44 | text_positions, frame_positions, input_lengths) 45 | 46 | # Reshape 47 | # (B, T, mel_dim) 48 | mel_outputs = mel_outputs.view(b, -1, self.mel_dim) 49 | 50 | # Prepare postnet inputs 51 | postnet_inputs = decoder_states.view(b, mel_outputs.size(1), -1) 52 | 53 | # (B, T, linear_dim) 54 | # Convert coarse mel-spectrogram (or decoder hidden states) 
to 55 | # high resolution spectrogram 56 | linear_outputs = self.postnet(postnet_inputs, speaker_embed) 57 | 58 | if linear_outputs.size(-1) != self.linear_dim: 59 | raise AssertionError() 60 | return mel_outputs, linear_outputs, alignments, done 61 | 62 | 63 | class AttentionSeq2Seq(nn.Module): 64 | """Encoder + Decoder with attention 65 | """ 66 | 67 | def __init__(self, encoder, decoder): 68 | super(AttentionSeq2Seq, self).__init__() 69 | self.encoder = encoder 70 | self.decoder = decoder 71 | if isinstance(self.decoder.attention, nn.ModuleList): 72 | self.encoder.num_attention_layers = sum( 73 | [layer is not None for layer in decoder.attention]) 74 | 75 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None, 76 | text_positions=None, frame_positions=None, input_lengths=None): 77 | # (B, T, text_embed_dim) 78 | encoder_outputs = self.encoder( 79 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed) 80 | 81 | # Mel: (B, T//r, mel_dim*r) 82 | # Alignments: (N, B, T_target, T_input) 83 | # Done: (B, T//r, 1) 84 | mel_outputs, alignments, done, decoder_states = self.decoder( 85 | encoder_outputs, mel_targets, 86 | text_positions=text_positions, frame_positions=frame_positions, 87 | speaker_embed=speaker_embed, lengths=input_lengths) 88 | 89 | return mel_outputs, alignments, done, decoder_states 90 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import math 6 | import numpy as np 7 | from torch.nn import functional as F 8 | 9 | 10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0): 11 | """Init the sinusoid position encoding table """ 12 | 13 | # keep dim 0 for padding token position encoding zero vector 14 | position_enc = np.array([ 15 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)] 16 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 17 | 18 | position_enc = torch.from_numpy(position_enc).float() 19 | 20 | return position_enc 21 | 22 | 23 | def sinusoidal_encode(x, w): 24 | y = w * x 25 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone()) 26 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone()) 27 | return y 28 | 29 | 30 | class SinusoidalEncoding(nn.Embedding): 31 | 32 | def __init__(self, num_embeddings, embedding_dim, 33 | *args, **kwargs): 34 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim, 35 | padding_idx=0, 36 | *args, **kwargs) 37 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim, position_rate=1.0) 38 | 39 | def forward(self, x, w=1.0): 40 | isscaler = np.isscalar(w) 41 | if self.padding_idx is None: 42 | raise AssertionError() 43 | 44 | if isscaler or w.size(0) == 1: 45 | weight = sinusoidal_encode(self.weight, w) 46 | return F.embedding( 47 | x, weight, self.padding_idx, self.max_norm, 48 | self.norm_type, self.scale_grad_by_freq, self.sparse) 49 | 50 | 51 | def linear(in_features, out_features, dropout=0): 52 | """Weight-normalized Linear layer (input: N x T x C)""" 53 | m = nn.Linear(in_features, out_features) 54 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) 55 | m.bias.data.zero_() 56 | return nn.utils.weight_norm(m) 57 | 58 | 59 | def embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 60 | m = nn.Embedding(num_embeddings, 
embedding_dim, padding_idx=padding_idx) 61 | m.weight.data.normal_(0, std) 62 | return m 63 | 64 | def m_modification(m, in_channels, dropout, std_mul): 65 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 66 | m.weight.data.normal_(mean=0, std=std) 67 | m.bias.data.zero_() 68 | return m 69 | 70 | def conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 71 | from .conv import Conv1d 72 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs) 73 | m = m_modification(m,in_channels,dropout,std_mul) 74 | return nn.utils.weight_norm(m) 75 | 76 | 77 | def conv_transpose1d(in_channels, out_channels, kernel_size, dropout=0, 78 | std_mul=1.0, **kwargs): 79 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs) 80 | m = m_modification(m,in_channels,dropout,std_mul) 81 | return nn.utils.weight_norm(m) 82 | 83 | 84 | class Conv1dGLU(nn.Module): 85 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding 86 | """ 87 | 88 | def __init__(self, n_speakers, speaker_embed_dim, 89 | in_channels, out_channels, kernel_size, 90 | dropout, padding=None, dilation=1, causal=False, residual=False, 91 | *args, **kwargs): 92 | super(Conv1dGLU, self).__init__() 93 | self.dropout = dropout 94 | self.residual = residual 95 | if padding is None: 96 | # no future time stamps available 97 | if causal: 98 | padding = (kernel_size - 1) * dilation 99 | else: 100 | padding = (kernel_size - 1) // 2 * dilation 101 | self.causal = causal 102 | 103 | self.conv = conv1d(in_channels, 2 * out_channels, kernel_size, 104 | dropout=dropout, padding=padding, dilation=dilation, 105 | *args, **kwargs) 106 | if n_speakers > 1: 107 | self.speaker_proj = linear(speaker_embed_dim, out_channels) 108 | else: 109 | self.speaker_proj = None 110 | 111 | def forward(self, x, speaker_embed=None): 112 | return self._forward(x, False) 113 | 114 | def incremental_forward(self, x): 115 | return self._forward(x, True) 116 | 117 | def _forward(self, x, is_incremental): 118 | residual = x 119 | x = F.dropout(x, p=self.dropout, training=self.training) 120 | if is_incremental: 121 | splitdim = -1 122 | x = self.conv.incremental_forward(x, ) 123 | else: 124 | splitdim = 1 125 | x = self.conv(x) 126 | # remove future time steps 127 | x = x[:, :, :residual.size(-1)] if self.causal else x 128 | 129 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 130 | x = a * F.sigmoid(b) 131 | return (x + residual) * math.sqrt(0.5) if self.residual else x 132 | 133 | def clear_buffer(self): 134 | self.conv.clear_buffer() 135 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. 
"english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | 15 | from unidecode import unidecode 16 | 17 | from .numbers import normalize_numbers 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def add_punctuation(text): 68 | if text[-1] not in '!,.:;?': 69 | text = text + '.' # without this decoder is confused when to output EOS 70 | return text 71 | 72 | 73 | def english_cleaners(text): 74 | """Pipeline for English text, including number and abbreviation expansion.""" 75 | text = convert_to_ascii(text) 76 | text = add_punctuation(text) 77 | text = lowercase(text) 78 | text = expand_numbers(text) 79 | text = expand_abbreviations(text) 80 | text = collapse_whitespace(text) 81 | return text 82 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/cmudict.py: -------------------------------------------------------------------------------- 1 | valid_symbols = [ 2 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 3 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 4 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 5 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 6 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 7 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 8 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 9 | ] 10 | 11 | _valid_symbol_set = set(valid_symbols) 12 | 13 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | _inflect = inflect.engine() 7 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 8 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 9 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 10 | 
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 13 | 14 | 15 | def _remove_commas(m): 16 | return m.group(1).replace(',', '') 17 | 18 | 19 | def _expand_decimal_point(m): 20 | return m.group(1).replace('.', ' point ') 21 | 22 | 23 | def _expand_dollars(m): 24 | match = m.group(1) 25 | parts = match.split('.') 26 | if len(parts) > 2: 27 | return match + ' dollars' # Unexpected format 28 | dollars = int(parts[0]) if parts[0] else 0 29 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 30 | if dollars and cents: 31 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 32 | cent_unit = 'cent' if cents == 1 else 'cents' 33 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 34 | elif dollars: 35 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 36 | return '%s %s' % (dollars, dollar_unit) 37 | elif cents: 38 | cent_unit = 'cent' if cents == 1 else 'cents' 39 | return '%s %s' % (cents, cent_unit) 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if 10000 > num > 1000: 49 | if num % 100 == 0: 50 | return _inflect.number_to_words(num // 100) + ' hundred' 51 | else: 52 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 53 | else: 54 | return _inflect.number_to_words(num, andword='') 55 | 56 | 57 | def normalize_numbers(text): 58 | text = re.sub(_comma_number_re, _remove_commas, text) 59 | text = re.sub(_pounds_re, r'\1 pounds', text) 60 | text = re.sub(_dollars_re, _expand_dollars, text) 61 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 62 | text = re.sub(_ordinal_re, _expand_ordinal, text) 63 | text = re.sub(_number_re, _expand_number, text) 64 | return text 65 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text import cleaners 4 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.symbols import symbols 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | """ 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def _clean_text(text, cleaner_names): 45 | for name in cleaner_names: 46 | cleaner = getattr(cleaners, name) 47 | text = cleaner(text) 48 | return text 49 | 50 | 51 | def _symbols_to_sequence(symbols): 52 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 53 | 54 | 55 | def _arpabet_to_sequence(text): 56 | return _symbols_to_sequence(['@' + s for s in text.split()]) 57 | 58 | 59 | def _should_keep_symbol(s): 60 | return s in _symbol_to_id and s is not '_' and s is not '~' 61 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.6+7a10ac6' 2 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/voice_generator.py: -------------------------------------------------------------------------------- 1 | import lws 2 | 3 | 4 | import librosa 5 | import numpy as np 6 | import torch 7 | from scipy import signal 8 | 9 | from autokeras_pretrained.constant import Constant 10 | from autokeras_pretrained.base import Pretrained 11 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch import frontend, builder 12 | 13 | 14 | # NOTE: If you want full control for model architecture. please take a look 15 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 
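# For example, end-to-end synthesis with the pretrained checkpoint looks roughly
# like this (a sketch, assuming the default checkpoint download succeeds and an
# output path is supplied):
#
#     from autokeras_pretrained.voice_generator.voice_generator import VoiceGenerator
#
#     generator = VoiceGenerator()   # downloads and loads the checkpoint
#     generator.predict('The quick brown fox jumps over the lazy dog.',
#                       path='generated.wav')   # writes a 22050 Hz mono wav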
16 | 17 | # Default hyperparameters: 18 | 19 | class Hparams: 20 | name = "deepvoice3" 21 | 22 | # Text: 23 | # [en jp] 24 | frontend = 'en' 25 | 26 | # Replace words to its pronunciation with fixed probability. 27 | # e.g. 'hello' to 'HH AH0 L OW1' 28 | # [en jp] 29 | # en: Word -> pronunciation using CMUDict 30 | # jp: Word -> pronounciation usnig MeCab 31 | # [0 ~ 1.0]: 0 means no replacement happens. 32 | replace_pronunciation_prob = 0.5 33 | 34 | # Convenient model builder 35 | # Definitions can be found at deepvoice3_pytorch/builder.py 36 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654 37 | builder = "deepvoice3" 38 | 39 | # Must be configured depends on the dataset and model you use 40 | n_speakers = 1 41 | speaker_embed_dim = 16 42 | 43 | # Audio: 44 | num_mels = 80 45 | fmin = 125 46 | fmax = 7600 47 | fft_size = 1024 48 | hop_size = 256 49 | sample_rate = 22050 50 | preemphasis = 0.97 51 | min_level_db = -100 52 | ref_level_db = 20 53 | # whether to rescale waveform or not. 54 | # Let x is an input waveform rescaled waveform y is given by: 55 | # y = x / np.abs(x).max() * rescaling_max 56 | rescaling = False 57 | rescaling_max = 0.999 58 | # mel-spectrogram is normalized to [0 1] for each utterance and clipping may 59 | # happen depends on min_level_db and ref_level_db causing clipping noise. 60 | # If False assertion is added to ensure no clipping happens. 61 | allow_clipping_in_normalization = True 62 | 63 | # Model: 64 | downsample_step = 4 # must be 4 when builder="nyanko" 65 | outputs_per_step = 1 # must be 1 when builder="nyanko" 66 | embedding_weight_std = 0.1 67 | speaker_embedding_weight_std = 0.01 68 | padding_idx = 0 69 | # Maximum number of input text length 70 | # try setting larger value if you want to give very long text input 71 | max_positions = 512 72 | dropout = 1 - 0.95 73 | kernel_size = 3 74 | text_embed_dim = 128 75 | encoder_channels = 256 76 | decoder_channels = 256 77 | # Note: large converter channels requires significant computational cost 78 | converter_channels = 256 79 | query_position_rate = 1.0 80 | # can be computed by `compute_timestamp_ratio.py`. 81 | key_position_rate = 1.385 # 2.37 for jsut 82 | key_projection = False 83 | value_projection = False 84 | use_memory_mask = True 85 | trainable_positional_encodings = False 86 | freeze_embedding = False 87 | # If True use decoder's internal representation for postnet inputs 88 | # otherwise use mel-spectrogram. 89 | use_decoder_state_for_postnet_input = True 90 | 91 | # Data loader 92 | pin_memory = True 93 | num_workers = 2 # Set it to 1 when in Windows (MemoryError THAllocator.c 0x5) 94 | 95 | # Loss 96 | masked_loss_weight = 0.5 # (1-w)*loss + w * masked_loss 97 | priority_freq = 3000 # heuristic: priotrize [0 ~ priotiry_freq] for linear loss 98 | priority_freq_weight = 0.0 # (1-w)*linear_loss + w*priority_linear_loss 99 | # https://arxiv.org/pdf/1710.08969.pdf 100 | # Adding the divergence to the loss stabilizes training expecially for 101 | # very deep (> 10 layers) networks. 102 | # Binary div loss seems has approx 10x scale compared to L1 loss so I choose 0.1. 
103 | binary_divergence_weight = 0.1 # set 0 to disable 104 | use_guided_attention = True 105 | guided_attention_sigma = 0.2 106 | 107 | # Training: 108 | batch_size = 16 109 | adam_beta1 = 0.5 110 | adam_beta2 = 0.9 111 | adam_eps = 1e-6 112 | amsgrad = False 113 | initial_learning_rate = 5e-4 # 0.001 114 | lr_schedule = "noam_learning_rate_decay" 115 | lr_schedule_kwargs = {} 116 | nepochs = 2000 117 | weight_decay = 0.0 118 | clip_thresh = 0.1 119 | 120 | # Save 121 | checkpoint_interval = 10000 122 | eval_interval = 10000 123 | save_optimizer_state = True 124 | 125 | # Eval: 126 | # this can be list for multple layers of attention 127 | # e.g. [True False False False True] 128 | force_monotonic_attention = True 129 | # Attention constraint for incremental decoding 130 | window_ahead = 3 131 | # 0 tends to prevent word repretetion but sometime causes skip words 132 | window_backward = 1 133 | power = 1.4 # Power to raise magnitudes to prior to phase retrieval 134 | 135 | # GC: 136 | # Forced garbage collection probability 137 | # Use only when MemoryError continues in Windows (Disabled by default) 138 | # gc_probability = 0.001 139 | 140 | # json_meta mode only 141 | # 0: "use all" 142 | # 1: "ignore only unmatched_alignment" 143 | # 2: "fully ignore recognition" 144 | ignore_recognition_level = 2 145 | # when dealing with non-dedicated speech dataset(e.g. movie excerpts) setting min_text above 15 is desirable. 146 | # Can be adjusted by dataset. 147 | min_text = 20 148 | # if true data without phoneme alignment file(.lab) will be ignored 149 | process_only_htk_aligned = False 150 | 151 | 152 | fs = Hparams.sample_rate 153 | global_step = 0 154 | global_epoch = 0 155 | 156 | 157 | def build_model(): 158 | model = getattr(builder, Hparams.builder)( 159 | n_speakers=Hparams.n_speakers, 160 | speaker_embed_dim=Hparams.speaker_embed_dim, 161 | n_vocab=frontend.n_vocab, 162 | embed_dim=Hparams.text_embed_dim, 163 | mel_dim=Hparams.num_mels, 164 | linear_dim=Hparams.fft_size // 2 + 1, 165 | r=Hparams.outputs_per_step, 166 | padding_idx=Hparams.padding_idx, 167 | dropout=Hparams.dropout, 168 | kernel_size=Hparams.kernel_size, 169 | encoder_channels=Hparams.encoder_channels, 170 | decoder_channels=Hparams.decoder_channels, 171 | converter_channels=Hparams.converter_channels, 172 | use_memory_mask=Hparams.use_memory_mask, 173 | trainable_positional_encodings=Hparams.trainable_positional_encodings, 174 | force_monotonic_attention=Hparams.force_monotonic_attention, 175 | use_decoder_state_for_postnet_input=Hparams.use_decoder_state_for_postnet_input, 176 | max_positions=Hparams.max_positions, 177 | freeze_embedding=Hparams.freeze_embedding, 178 | window_ahead=Hparams.window_ahead, 179 | window_backward=Hparams.window_backward 180 | ) 181 | return model 182 | 183 | 184 | def inv_preemphasis(x, coef=Hparams.preemphasis): 185 | """Inverse operation of pre-emphasis 186 | 187 | Args: 188 | x (1d-array): Input signal. 189 | coef (float): Pre-emphasis coefficient. 190 | 191 | Returns: 192 | array: Output filtered signal. 
193 | 
194 |     See also:
195 |         :func:`preemphasis`
196 |     """
197 |     b = np.array([1.], x.dtype)
198 |     a = np.array([1., -coef], x.dtype)
199 |     return signal.lfilter(b, a, x)
200 | 
201 | 
202 | def inv_spectrogram(spectrogram):
203 |     """Convert a normalized linear spectrogram back to a waveform using LWS phase reconstruction."""
204 |     S = _db_to_amp(_denormalize(spectrogram) + Hparams.ref_level_db)  # Convert back to linear
205 |     processor = _lws_processor()
206 |     D = processor.run_lws(S.astype(np.float64).T ** Hparams.power)
207 |     y = processor.istft(D).astype(np.float32)
208 |     return inv_preemphasis(y)
209 | 
210 | 
211 | def _lws_processor():
212 |     return lws.lws(Hparams.fft_size, Hparams.hop_size, mode="speech")
213 | 
214 | 
215 | _mel_basis = None
216 | 
217 | 
218 | def _db_to_amp(x):
219 |     return np.power(10.0, x * 0.05)
220 | 
221 | 
222 | def _denormalize(S):
223 |     return (np.clip(S, 0, 1) * -Hparams.min_level_db) + Hparams.min_level_db
224 | 
225 | 
226 | class VoiceGenerator(Pretrained):
227 |     def __init__(self, **kwargs):
228 |         super().__init__(**kwargs)
229 |         self.sample_rate = 0
230 |         self.hop_length = 0
231 |         self.sample_rate = Hparams.sample_rate
232 |         self.hop_length = Hparams.hop_size
233 | 
234 |         self.model = self.load_checkpoint()
235 |         self.model.to(self.device)
236 | 
237 |     @property
238 |     def _google_drive_files(self):
239 |         return Constant.VOICE_GENERATOR_MODELS
240 | 
241 |     def load_checkpoint(self):
242 |         global global_step
243 |         global global_epoch
244 | 
245 |         model = build_model()
246 |         print("Load checkpoint from: {}".format(self.local_paths[0]))
247 |         if self.device.startswith("cuda"):
248 |             checkpoint = torch.load(self.local_paths[0])
249 |         else:
250 |             checkpoint = torch.load(self.local_paths[0], map_location=lambda storage, loc: storage)
251 |         model.load_state_dict(checkpoint["state_dict"])
252 |         global_step = checkpoint["global_step"]
253 |         global_epoch = checkpoint["global_epoch"]
254 | 
255 |         return model
256 | 
257 |     def predict(self, text, path=None):
258 |         waveform, alignment, spectrogram, _ = self.tts(text)
259 |         if path is None:
260 |             raise AssertionError('Please provide the output file path.')
261 |         librosa.output.write_wav(path, waveform, self.sample_rate)
262 | 
263 |     def tts(self, text, p=0, speaker_id=None, fast=True):
264 |         """Convert text to speech waveform given a deepvoice3 model.
265 | 
266 |         Args:
267 |             speaker_id (int, optional): Speaker index for multi-speaker models; None selects the default single speaker.
268 |             fast (bool): If True, call make_generation_fast_() (removes weight normalization) before decoding.
269 |             text (str) : Input text to be synthesized
270 |             p (float) : Replace a word with its pronunciation if p > 0. Default is 0. 
271 | """ 272 | self.model.eval() 273 | if fast: 274 | self.model.make_generation_fast_() 275 | 276 | sequence = np.array(frontend.text_to_sequence(text, p=p)) 277 | sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(self.device) 278 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(self.device) 279 | speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(self.device) 280 | 281 | # Greedy decoding 282 | with torch.no_grad(): 283 | mel_outputs, linear_outputs, alignments, _ = self.model( 284 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 285 | 286 | linear_output = linear_outputs[0].cpu().data.numpy() 287 | spectrogram = _denormalize(linear_output) 288 | alignment = alignments[0].cpu().data.numpy() 289 | mel = mel_outputs[0].cpu().data.numpy() 290 | mel = _denormalize(mel) 291 | 292 | # Predicted audio signal 293 | waveform = inv_spectrogram(linear_output.T) 294 | 295 | return waveform, alignment, spectrogram, mel 296 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_recognizer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | from autokeras_pretrained.base import Pretrained 9 | from autokeras_pretrained.constant import Constant 10 | 11 | supported_rnns = { 12 | 'lstm': nn.LSTM, 13 | 'rnn': nn.RNN, 14 | 'gru': nn.GRU 15 | } 16 | 17 | 18 | class Decoder(object): 19 | """ 20 | Basic decoder class from which all other decoders inherit. Implements several 21 | helper functions. Subclasses should implement the decode() method. 22 | 23 | Arguments: 24 | labels (string): mapping from integers to characters. 25 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 26 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 27 | """ 28 | 29 | def __init__(self, labels, blank_index=0): 30 | # e.g. 
labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 31 | self.labels = labels 32 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) 33 | self.blank_index = blank_index 34 | space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space 35 | if ' ' in labels: 36 | space_index = labels.index(' ') 37 | self.space_index = space_index 38 | 39 | def decode(self, probs): 40 | """ 41 | Given a matrix of character probabilities, returns the decoder's 42 | best guess of the transcription 43 | 44 | Arguments: 45 | probs: Tensor of character probabilities, where probs[c,t] 46 | is the probability of character c at time t 47 | sizes(optional): Size of each sequence in the mini-batch 48 | Returns: 49 | string: sequence of the model's best guess for the transcription 50 | """ 51 | raise NotImplementedError 52 | 53 | 54 | class GreedyDecoder(Decoder): 55 | def __init__(self, labels, blank_index=0): 56 | super(GreedyDecoder, self).__init__(labels, blank_index) 57 | 58 | def convert_to_strings(self, sequences, return_offsets=True): 59 | """Given a list of numeric sequences, returns the corresponding strings""" 60 | strings = [] 61 | offsets = [] 62 | for sequence in sequences: 63 | seq_len = len(sequence) 64 | string, string_offsets = self.process_string(sequence, seq_len) 65 | strings.append([string]) # We only return one path 66 | if return_offsets: 67 | offsets.append([string_offsets]) 68 | return strings, offsets 69 | 70 | def process_string(self, sequence, size): 71 | string = '' 72 | offsets = [] 73 | for i in range(size): 74 | char = self.int_to_char[sequence[i].item()] 75 | if char == self.int_to_char[self.blank_index]: 76 | continue 77 | # if this char is a repetition and remove_repetitions=true, then skip 78 | if i != 0 and char == self.int_to_char[sequence[i - 1].item()]: 79 | continue 80 | if char == self.labels[self.space_index]: 81 | string += ' ' 82 | offsets.append(i) 83 | else: 84 | string = string + char 85 | offsets.append(i) 86 | return string, torch.IntTensor(offsets) 87 | 88 | def decode(self, probs): 89 | """ 90 | Returns the argmax decoding given the probability matrix. Removes 91 | repeated elements in the sequence, as well as blanks. 92 | 93 | Arguments: 94 | probs: Tensor of character probabilities from the network. Expected shape of seq_length x batch x output_dim 95 | sizes(optional): Size of each sequence in the mini-batch 96 | Returns: 97 | strings: sequences of the model's best guess for the transcription on inputs 98 | offsets: time step per character predicted 99 | """ 100 | _, max_probs = torch.max(probs.transpose(0, 1), 2) 101 | strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), 102 | return_offsets=True) 103 | return strings, offsets 104 | 105 | 106 | class SequenceWise(nn.Module): 107 | def __init__(self, module): 108 | """ 109 | Collapses input of dim T*N*H to (T*N)*H, and applies to a module. 110 | Allows handling of variable sequence lengths and minibatch sizes. 111 | :param module: Module to apply input to. 
112 | """ 113 | super(SequenceWise, self).__init__() 114 | self.module = module 115 | 116 | def forward(self, x): 117 | t, n = x.size(0), x.size(1) 118 | x = x.view(t * n, -1) 119 | x = self.module(x) 120 | x = x.view(t, n, -1) 121 | return x 122 | 123 | 124 | class InferenceBatchSoftmax(nn.Module): 125 | def __init__(self): 126 | super(InferenceBatchSoftmax, self).__init__() 127 | 128 | @staticmethod 129 | def forward(input_): 130 | return F.softmax(input_, dim=-1) 131 | 132 | 133 | class BatchRNN(nn.Module): 134 | def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True): 135 | super(BatchRNN, self).__init__() 136 | self.input_size = input_size 137 | self.hidden_size = hidden_size 138 | self.bidirectional = bidirectional 139 | self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None 140 | self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, 141 | bidirectional=bidirectional, bias=False) 142 | self.num_directions = 2 if bidirectional else 1 143 | 144 | def flatten_parameters(self): 145 | self.rnn.flatten_parameters() 146 | 147 | def forward(self, x): 148 | if self.batch_norm is not None: 149 | x = self.batch_norm(x) 150 | x, _ = self.rnn(x) 151 | if self.bidirectional: 152 | x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum 153 | return x 154 | 155 | 156 | class DeepSpeech(nn.Module): 157 | def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, 158 | bidirectional=True): 159 | super(DeepSpeech, self).__init__() 160 | 161 | # model metadata needed for serialization/deserialization 162 | self._version = '0.0.1' 163 | self._hidden_size = rnn_hidden_size 164 | self._hidden_layers = nb_layers 165 | self._rnn_type = rnn_type 166 | self._labels = labels 167 | self._bidirectional = bidirectional 168 | 169 | num_classes = len(self._labels) 170 | 171 | self.conv = nn.Sequential( 172 | nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(0, 10)), 173 | nn.BatchNorm2d(32), 174 | nn.Hardtanh(0, 20, inplace=True), 175 | nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(0, 10)), 176 | nn.BatchNorm2d(32), 177 | nn.Hardtanh(0, 20, inplace=True) 178 | ) 179 | # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 180 | # rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 181 | # rnn_input_size = int(math.floor(rnn_input_size - 41) / 2 + 1) 182 | # rnn_input_size = int(math.floor(rnn_input_size - 21) / 2 + 1) 183 | # rnn_input_size *= 32 184 | rnn_input_size = 672 185 | 186 | rnns = [] 187 | rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 188 | bidirectional=bidirectional, batch_norm=False) 189 | rnns.append(('0', rnn)) 190 | for x in range(nb_layers - 1): 191 | rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 192 | bidirectional=bidirectional) 193 | rnns.append(('%d' % (x + 1), rnn)) 194 | self.rnns = nn.Sequential(OrderedDict(rnns)) 195 | fully_connected = nn.Sequential( 196 | nn.BatchNorm1d(rnn_hidden_size), 197 | nn.Linear(rnn_hidden_size, num_classes, bias=False) 198 | ) 199 | self.fc = nn.Sequential( 200 | SequenceWise(fully_connected), 201 | ) 202 | self.inference_softmax = InferenceBatchSoftmax() 203 | 204 | def forward(self, x): 205 | x = self.conv(x) 206 | 207 | sizes = x.size() 208 | x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension 209 | x 
= x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH 210 | 211 | x = self.rnns(x) 212 | 213 | x = self.fc(x) 214 | x = x.transpose(0, 1) 215 | # identity in training mode, softmax in eval mode 216 | x = self.inference_softmax(x) 217 | return x 218 | 219 | @classmethod 220 | def load_model(cls, path, cuda=False): 221 | package = torch.load(path, map_location=lambda storage, loc: storage) 222 | model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], 223 | labels=package['labels'], rnn_type=supported_rnns[package['rnn_type']], 224 | bidirectional=package.get('bidirectional', True)) 225 | # the blacklist parameters are params that were previous erroneously saved by the model 226 | # care should be taken in future versions that if batch_norm on the first rnn is required 227 | # that it be named something else 228 | blacklist = ['rnns.0.batch_norm.module.weight', 'rnns.0.batch_norm.module.bias', 229 | 'rnns.0.batch_norm.module.running_mean', 'rnns.0.batch_norm.module.running_var'] 230 | model.load_state_dict(package['state_dict']) 231 | for x in model.rnns: 232 | x.flatten_parameters() 233 | if cuda: 234 | model = torch.nn.DataParallel(model).cuda() 235 | return model 236 | 237 | 238 | class VoiceRecognizer(Pretrained): 239 | def __init__(self, **kwargs): 240 | super().__init__(**kwargs) 241 | 242 | self.model = self.load_checkpoint() 243 | labels = Constant.VOICE_RECONGINIZER_LABELS 244 | self.decoder = GreedyDecoder(labels, blank_index=labels.index('_')) 245 | 246 | @property 247 | def _google_drive_files(self): 248 | return Constant.VOICE_RECONGINIZER_MODELS 249 | 250 | def load_checkpoint(self): 251 | model = DeepSpeech.load_model(self.local_paths[0], cuda=(self.device == 'cuda')) 252 | model.eval() 253 | return model 254 | 255 | def predict(self, audio_data, audio_path=None): 256 | if audio_data is None: 257 | raise TypeError("audio_data cannot be None") 258 | audio_data = audio_data.view(1, 1, audio_data.size(0), audio_data.size(1)) 259 | with torch.no_grad(): 260 | out = self.model(Variable(audio_data)) 261 | out = out.transpose(0, 1) # TxNxH 262 | decoded_output, _ = self.decoder.decode(out.data) 263 | return decoded_output[0][0] 264 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 
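# '.' tells pip to install this package itself from the repository root; the pinned third-party dependencies are declared in install_requires in setup.py below.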
2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='autokeras-pretrained', 6 | packages=find_packages(exclude=('tests',)), 7 | install_requires=['scipy==1.2.0', 8 | 'torch==1.0.1.post2', 9 | 'torchvision==0.2.1', 10 | 'numpy==1.16.1', 11 | 'scikit-image==0.14.2', 12 | 'imageio==2.5.0', 13 | 'requests==2.21.0', 14 | 'librosa==0.6.2', 15 | 'numba', 16 | 'inflect', 17 | 'unidecode', 18 | 'nltk==3.3', 19 | 'lws==1.2', 20 | 'opencv-python==4.0.0.21', 21 | 'boto3'], 22 | version='0.0.3', 23 | description='Pretrained models for Auto-Keras', 24 | author='DATA Lab at Texas A&M University', 25 | author_email='jhfjhfj1@gmail.com', 26 | url='http://autokeras.com', 27 | download_url='https://github.com/jhfjhfj1/autokeras-pretrained/archive/0.0.3.tar.gz', 28 | keywords=['autokeras', 'keras'], 29 | classifiers=[] 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | TEST_TEMP_AUTO_KERAS_DIR = 'tests/resources/temp/autokeras' 5 | TEST_TEMP_DIR = 'tests/resources/temp' 6 | 7 | 8 | def clean_dir(path): 9 | for f in os.listdir(path): 10 | full_path = os.path.join(path, f) 11 | if f != '.gitkeep': 12 | if os.path.isfile(full_path): 13 | os.remove(full_path) 14 | else: 15 | os.rmdir(full_path) 16 | # def mock_nvidia_smi_output(*arg, **kwargs): 17 | # return \ 18 | # ' Free : 1 MiB \n' \ 19 | # ' Free : 11176 MiB \n' \ 20 | # ' Free : 1 MiB \n' \ 21 | # ' Free : 1 MiB' 22 | -------------------------------------------------------------------------------- /tests/pretrained/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/pretrained/__init__.py -------------------------------------------------------------------------------- /tests/pretrained/test_face_detector.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.face_detector import FaceDetector 2 | import os 3 | 4 | from tests.common import TEST_TEMP_DIR, clean_dir 5 | 6 | 7 | def test_face_detector(): 8 | img_file, out_file = 'tests/resources/images_test/face_detector.jpg', os.path.join(TEST_TEMP_DIR, 'output.jpg') 9 | if os.path.exists(out_file): 10 | os.remove(out_file) 11 | face_detection = FaceDetector() 12 | bboxs1, landmarks1 = face_detection.predict(img_file, out_file) 13 | assert os.path.exists(out_file) 14 | bboxs2, landmarks2 = face_detection.predict(img_file) 15 | assert bboxs1.shape == bboxs2.shape == (11, 5) and landmarks1.shape == landmarks2.shape == (11, 10) 16 | clean_dir(TEST_TEMP_DIR) 17 | -------------------------------------------------------------------------------- /tests/pretrained/test_object_detection.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.object_detector import ObjectDetector 2 | from tests.common import TEST_TEMP_DIR, clean_dir 3 | 4 | 5 | def test_object_detection(): 6 | 
detector = ObjectDetector() 7 | img_path = 'tests/resources/images_test/od.JPG' 8 | result = detector.predict(img_path, TEST_TEMP_DIR) 9 | assert isinstance(result, list) 10 | clean_dir(TEST_TEMP_DIR) 11 | -------------------------------------------------------------------------------- /tests/pretrained/test_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.text_classifier import SentimentAnalysis 2 | 3 | 4 | def test_sentiment_analysis(): 5 | sentiment_analyzer = SentimentAnalysis() 6 | 7 | positive_polarity = sentiment_analyzer.predict("The model is working really well.") 8 | if positive_polarity <= 0.5: 9 | raise AssertionError() 10 | 11 | negative_polarity = sentiment_analyzer.predict("The university intake has reduced drastically this year.") 12 | if negative_polarity >= 0.5: 13 | raise AssertionError() 14 | -------------------------------------------------------------------------------- /tests/pretrained/test_topic_classifier.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.text_classifier import TopicClassifier 2 | 3 | 4 | def test_topic_classifier(): 5 | topic_classifier = TopicClassifier() 6 | 7 | topic_name = topic_classifier.predict( 8 | "Risk mitigation is the pursuit of opportunities where the potential upside is far greater than the potential " 9 | "downside", ) 10 | 11 | if topic_name != "Business": 12 | raise AssertionError() 13 | 14 | topic_name = topic_classifier.predict( 15 | "With a tap on the screen the app will recognise your face and bring up the filter menu", ) 16 | 17 | if topic_name != "Sci/Tech": 18 | raise AssertionError() 19 | 20 | topic_name = topic_classifier.predict( 21 | "Anthony received a loud ovation when he was shown on the overhead videoboard in the first quarter", ) 22 | 23 | if topic_name != "Sports": 24 | raise AssertionError() 25 | 26 | topic_name = topic_classifier.predict("The soviet union was created about five years after Russian Revolution.", ) 27 | 28 | if topic_name != "World": 29 | raise AssertionError() 30 | -------------------------------------------------------------------------------- /tests/pretrained/test_voice_generator.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained import VoiceGenerator 2 | from tests.common import TEST_TEMP_DIR, clean_dir 3 | import os 4 | 5 | 6 | def test_voice_generator(): 7 | voice_generator = VoiceGenerator() 8 | clean_dir(TEST_TEMP_DIR) 9 | texts = [ 10 | "Generative adversarial network or variational auto-encoder.", 11 | "The tuition of the coming semster is 6300 dollars.", 12 | "The tuition of the coming semster is 6350 dollars.", 13 | "Turn left on {HH AW1 S S T AH0 N} Street.", 14 | "This is expensive, it costs me $300.2", 15 | "This is expensive, it costs me $300", 16 | "This is cheap, it only costs me $.2", 17 | "Today he won the 1st prize of the competition", 18 | "The approximation of pi is 3.14", 19 | ] 20 | 21 | for idx, text in enumerate(texts): 22 | save_name = "test_" + str(idx) + ".wav" 23 | save_name = os.path.join(TEST_TEMP_DIR, save_name) 24 | voice_generator.predict(text, path=save_name) 25 | clean_dir(TEST_TEMP_DIR) 26 | -------------------------------------------------------------------------------- /tests/pretrained/test_voice_recognizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from autokeras_pretrained 
import VoiceRecognizer 4 | 5 | 6 | def test_voice_generator(): 7 | spect2 = torch.rand(161, 131) 8 | voice_recognizer = VoiceRecognizer() 9 | print(voice_recognizer.predict(audio_data=spect2)) 10 | 11 | 12 | def test_voice_generator_none_type_error(): 13 | voice_recognizer = VoiceRecognizer() 14 | try: 15 | print(voice_recognizer.predict(audio_data=None)) 16 | except TypeError: 17 | pass 18 | -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Eckhart_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Eckhart_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Sorkin_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Sorkin_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Sorkin_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Sorkin_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0001.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0005.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0006.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Eckhart_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Eckhart_0001.jpg -------------------------------------------------------------------------------- 
/tests/resources/images_test/Color_images/Aaron_Peirsol_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Sorkin_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Sorkin_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Sorkin_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Sorkin_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0001.jpg 
-------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0005.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0006.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/face_detector.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/face_detector.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/images_name.csv: -------------------------------------------------------------------------------- 1 | File Name,Label 2 | Aaron_Peirsol_0001.jpg,0 3 | Aaron_Peirsol_0002.jpg,0 4 | Aaron_Peirsol_0003.jpg,0 5 | Aaron_Peirsol_0004.jpg,0 6 | Aaron_Sorkin_0001.jpg,1 7 | -------------------------------------------------------------------------------- /tests/resources/images_test/od.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/od.JPG -------------------------------------------------------------------------------- /tests/resources/temp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/temp/.gitkeep --------------------------------------------------------------------------------
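The test modules above are the quickest reference for how each pretrained wrapper is meant to be called. As a closing illustration, the following sketch (not part of the repository; it only assumes the public calls exercised in tests/pretrained/test_voice_recognizer.py and the GreedyDecoder class defined in autokeras_pretrained/voice_recognizer.py) shows how greedy CTC decoding collapses repeated characters and drops the '_' blank, and how a spectrogram tensor is fed to VoiceRecognizer. The four-character label string and the hand-built argmax path are invented for the example.

import torch

from autokeras_pretrained import VoiceRecognizer
from autokeras_pretrained.voice_recognizer import GreedyDecoder

# 1) Greedy CTC decoding in isolation: '_' is the blank, ' ' the word separator.
labels = "_AB "
decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

# Build a (seq_length, batch, num_labels) probability tensor whose argmax path
# is A A _ B B _ _ <space> A; collapsing repeats and removing blanks gives "AB A".
path = [1, 1, 0, 2, 2, 0, 0, 3, 1]
probs = torch.zeros(len(path), 1, len(labels))
for t, c in enumerate(path):
    probs[t, 0, c] = 1.0

strings, offsets = decoder.decode(probs)
print(strings[0][0])   # -> "AB A"; offsets[0][0] holds the frame index of each emitted character

# 2) End-to-end recognition, mirroring tests/pretrained/test_voice_recognizer.py.
#    Constructing the wrapper loads the pretrained DeepSpeech checkpoint through
#    the Pretrained base class (downloading it if it is not already cached).
recognizer = VoiceRecognizer()
spectrogram = torch.rand(161, 131)   # (frequency bins, time frames), as in the bundled test
print(recognizer.predict(audio_data=spectrogram))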